use new resolve_source_to_x functions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-12-02 17:15:43 +01:00
parent 72e76512bd
commit d1e439de92
2 changed files with 88 additions and 84 deletions

View File

@ -2,6 +2,7 @@ import importlib
import json
import logging
import re
import tempfile
import time
import warnings
from enum import Enum
@ -9,7 +10,7 @@ from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional, Type
import typer
from docling_core.utils.file import resolve_file_source
from docling_core.utils.file import resolve_source_to_path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@ -256,9 +257,10 @@ def convert(
if from_formats is None:
from_formats = [e for e in InputFormat]
with tempfile.TemporaryDirectory() as tempdir:
input_doc_paths: List[Path] = []
for src in input_sources:
source = resolve_file_source(source=src)
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
if not source.exists():
err_console.print(
f"[red]Error: The input file {source} does not exist.[/red]"
@ -302,7 +304,9 @@ def convert(
ocr_options=ocr_options,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
pipeline_options.table_structure_options.do_cell_matching = (
True # do_cell_matching
)
pipeline_options.table_structure_options.mode = table_mode
if artifacts_path is not None:

View File

@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
)
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_file_source
from docling_core.utils.file import resolve_source_to_stream
from pydantic import BaseModel
from typing_extensions import deprecated
@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator:
obj = resolve_file_source(item) if isinstance(item, str) else item
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
format = self._guess_format(obj)
if format not in format_options.keys():
_log.info(