mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 23:12:20 +00:00
fix: use new resolve_source_to_x functions to avoid tempfile leftovers (#490)
use new resolve_source_to_x functions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
72e76512bd
commit
a762c8394e
@ -2,6 +2,7 @@ import importlib
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
@ -9,7 +10,7 @@ from pathlib import Path
|
|||||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_source_to_path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
@ -256,9 +257,10 @@ def convert(
|
|||||||
if from_formats is None:
|
if from_formats is None:
|
||||||
from_formats = [e for e in InputFormat]
|
from_formats = [e for e in InputFormat]
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tempdir:
|
||||||
input_doc_paths: List[Path] = []
|
input_doc_paths: List[Path] = []
|
||||||
for src in input_sources:
|
for src in input_sources:
|
||||||
source = resolve_file_source(source=src)
|
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
||||||
if not source.exists():
|
if not source.exists():
|
||||||
err_console.print(
|
err_console.print(
|
||||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||||
@ -302,7 +304,9 @@ def convert(
|
|||||||
ocr_options=ocr_options,
|
ocr_options=ocr_options,
|
||||||
do_table_structure=True,
|
do_table_structure=True,
|
||||||
)
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = (
|
||||||
|
True # do_cell_matching
|
||||||
|
)
|
||||||
pipeline_options.table_structure_options.mode = table_mode
|
pipeline_options.table_structure_options.mode = table_mode
|
||||||
|
|
||||||
if artifacts_path is not None:
|
if artifacts_path is not None:
|
||||||
|
@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
|||||||
)
|
)
|
||||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_source_to_stream
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||||
) -> Iterable[InputDocument]:
|
) -> Iterable[InputDocument]:
|
||||||
for item in self.path_or_stream_iterator:
|
for item in self.path_or_stream_iterator:
|
||||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
||||||
format = self._guess_format(obj)
|
format = self._guess_format(obj)
|
||||||
if format not in format_options.keys():
|
if format not in format_options.keys():
|
||||||
_log.info(
|
_log.info(
|
||||||
|
Loading…
Reference in New Issue
Block a user