Use the new resolve_source_to_path and resolve_source_to_stream functions (replacing resolve_file_source)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-12-02 17:15:43 +01:00
parent 72e76512bd
commit d1e439de92
2 changed files with 88 additions and 84 deletions

View File

@ -2,6 +2,7 @@ import importlib
import json import json
import logging import logging
import re import re
import tempfile
import time import time
import warnings import warnings
from enum import Enum from enum import Enum
@ -9,7 +10,7 @@ from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional, Type from typing import Annotated, Dict, Iterable, List, Optional, Type
import typer import typer
from docling_core.utils.file import resolve_file_source from docling_core.utils.file import resolve_source_to_path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@ -256,9 +257,10 @@ def convert(
if from_formats is None: if from_formats is None:
from_formats = [e for e in InputFormat] from_formats = [e for e in InputFormat]
with tempfile.TemporaryDirectory() as tempdir:
input_doc_paths: List[Path] = [] input_doc_paths: List[Path] = []
for src in input_sources: for src in input_sources:
source = resolve_file_source(source=src) source = resolve_source_to_path(source=src, workdir=Path(tempdir))
if not source.exists(): if not source.exists():
err_console.print( err_console.print(
f"[red]Error: The input file {source} does not exist.[/red]" f"[red]Error: The input file {source} does not exist.[/red]"
@ -302,7 +304,9 @@ def convert(
ocr_options=ocr_options, ocr_options=ocr_options,
do_table_structure=True, do_table_structure=True,
) )
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching pipeline_options.table_structure_options.do_cell_matching = (
True # do_cell_matching
)
pipeline_options.table_structure_options.mode = table_mode pipeline_options.table_structure_options.mode = table_mode
if artifacts_path is not None: if artifacts_path is not None:

View File

@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
) )
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_file_source from docling_core.utils.file import resolve_source_to_stream
from pydantic import BaseModel from pydantic import BaseModel
from typing_extensions import deprecated from typing_extensions import deprecated
@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
self, format_options: Dict[InputFormat, "FormatOption"] self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]: ) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator: for item in self.path_or_stream_iterator:
obj = resolve_file_source(item) if isinstance(item, str) else item obj = resolve_source_to_stream(item) if isinstance(item, str) else item
format = self._guess_format(obj) format = self._guess_format(obj)
if format not in format_options.keys(): if format not in format_options.keys():
_log.info( _log.info(