mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
CLI and error handling fixes
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
a66c4ee8eb
commit
ba9eaf1bd7
@ -393,7 +393,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if contains_lists is None:
|
if contains_lists is None:
|
||||||
return cell.text
|
return cell.text
|
||||||
else:
|
else:
|
||||||
_log.warn(
|
_log.debug(
|
||||||
"should extract the content correctly for table-cells with lists ..."
|
"should extract the content correctly for table-cells with lists ..."
|
||||||
)
|
)
|
||||||
return cell.text
|
return cell.text
|
||||||
|
@ -157,9 +157,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
new_list = None
|
new_list = None
|
||||||
|
|
||||||
if is_a_list:
|
if is_a_list:
|
||||||
_log.info("LIST DETECTED!")
|
_log.debug("LIST DETECTED!")
|
||||||
else:
|
else:
|
||||||
_log.info("No List")
|
_log.debug("No List")
|
||||||
|
|
||||||
# for e in p.iter():
|
# for e in p.iter():
|
||||||
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
|
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
|
||||||
|
@ -138,7 +138,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
try:
|
try:
|
||||||
self.handle_tables(element, docx_obj, doc)
|
self.handle_tables(element, docx_obj, doc)
|
||||||
except Exception:
|
except Exception:
|
||||||
_log.error("could not parse a table, broken docx table")
|
_log.debug("could not parse a table, broken docx table")
|
||||||
|
|
||||||
elif found_drawing or found_pict:
|
elif found_drawing or found_pict:
|
||||||
self.handle_pictures(element, docx_obj, doc)
|
self.handle_pictures(element, docx_obj, doc)
|
||||||
@ -146,7 +146,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
elif tag_name in ["p"]:
|
elif tag_name in ["p"]:
|
||||||
self.handle_text_elements(element, docx_obj, doc)
|
self.handle_text_elements(element, docx_obj, doc)
|
||||||
else:
|
else:
|
||||||
_log.warn(f"Ignoring element in DOCX with tag: {tag_name}")
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def str_to_int(self, s, default=0):
|
def str_to_int(self, s, default=0):
|
||||||
|
@ -5,7 +5,7 @@ import time
|
|||||||
import warnings
|
import warnings
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated, Iterable, List, Optional
|
from typing import Annotated, Dict, Iterable, List, Optional
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_file_source
|
||||||
@ -26,7 +26,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||||
@ -152,6 +152,14 @@ def convert(
|
|||||||
ocr_engine: Annotated[
|
ocr_engine: Annotated[
|
||||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||||
] = OcrEngine.EASYOCR,
|
] = OcrEngine.EASYOCR,
|
||||||
|
abort_on_error: Annotated[
|
||||||
|
bool,
|
||||||
|
typer.Option(
|
||||||
|
...,
|
||||||
|
"--abort-on-error/--no-abort-on-error",
|
||||||
|
help="If enabled, the conversion will abort when encountering an error.",
|
||||||
|
),
|
||||||
|
] = False,
|
||||||
output: Annotated[
|
output: Annotated[
|
||||||
Path, typer.Option(..., help="Output directory where results are saved.")
|
Path, typer.Option(..., help="Output directory where results are saved.")
|
||||||
] = Path("."),
|
] = Path("."),
|
||||||
@ -211,18 +219,22 @@ def convert(
|
|||||||
)
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
format_options: Dict[InputFormat, FormatOption] = {
|
||||||
format_options={
|
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
backend=DoclingParseDocumentBackend, # pdf_backend
|
backend=DoclingParseDocumentBackend, # pdf_backend
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
doc_converter = DocumentConverter(
|
||||||
|
allowed_formats=from_formats,
|
||||||
|
format_options=format_options,
|
||||||
)
|
)
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert_all(input_doc_paths)
|
conv_results = doc_converter.convert_all(
|
||||||
|
input_doc_paths, raises_on_error=abort_on_error
|
||||||
|
)
|
||||||
|
|
||||||
output.mkdir(parents=True, exist_ok=True)
|
output.mkdir(parents=True, exist_ok=True)
|
||||||
export_documents(
|
export_documents(
|
||||||
|
@ -160,7 +160,7 @@ class InputDocument(BaseModel):
|
|||||||
) -> None:
|
) -> None:
|
||||||
if backend is None:
|
if backend is None:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"No backend configuration provided for file {self.file} with format {self.format}. "
|
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
||||||
f"Please check your format configuration on DocumentConverter."
|
f"Please check your format configuration on DocumentConverter."
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -472,8 +472,8 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||||
format = self._guess_format(obj)
|
format = self._guess_format(obj)
|
||||||
if format not in format_options.keys():
|
if format not in format_options.keys():
|
||||||
_log.debug(
|
_log.info(
|
||||||
f"Skipping input document {obj.name} because its format is not in the whitelist."
|
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
|
@ -111,6 +111,14 @@ class DocumentConverter:
|
|||||||
_log.debug(f"Requested format {f} will use default options.")
|
_log.debug(f"Requested format {f} will use default options.")
|
||||||
self.format_to_options[f] = _format_to_default_options[f]
|
self.format_to_options[f] = _format_to_default_options[f]
|
||||||
|
|
||||||
|
remove_keys = []
|
||||||
|
for f in self.format_to_options.keys():
|
||||||
|
if f not in self.allowed_formats:
|
||||||
|
remove_keys.append(f)
|
||||||
|
|
||||||
|
for f in remove_keys:
|
||||||
|
self.format_to_options.pop(f)
|
||||||
|
|
||||||
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
||||||
|
|
||||||
@validate_call(config=ConfigDict(strict=True))
|
@validate_call(config=ConfigDict(strict=True))
|
||||||
@ -176,7 +184,7 @@ class DocumentConverter:
|
|||||||
|
|
||||||
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
||||||
for item in map(
|
for item in map(
|
||||||
partial(self.process_document, raises_on_error=raises_on_error),
|
partial(self._process_document, raises_on_error=raises_on_error),
|
||||||
input_batch,
|
input_batch,
|
||||||
):
|
):
|
||||||
if item is not None:
|
if item is not None:
|
||||||
@ -205,20 +213,20 @@ class DocumentConverter:
|
|||||||
)
|
)
|
||||||
return self.initialized_pipelines[pipeline_class]
|
return self.initialized_pipelines[pipeline_class]
|
||||||
|
|
||||||
def process_document(
|
def _process_document(
|
||||||
self, in_doc: InputDocument, raises_on_error: bool
|
self, in_doc: InputDocument, raises_on_error: bool
|
||||||
) -> Optional[ConversionResult]:
|
) -> Optional[ConversionResult]:
|
||||||
assert self.allowed_formats is not None
|
assert self.allowed_formats is not None
|
||||||
|
assert in_doc.format in self.allowed_formats
|
||||||
|
|
||||||
if in_doc.format not in self.allowed_formats:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
start_doc_time = time.time()
|
start_doc_time = time.time()
|
||||||
|
|
||||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||||
|
|
||||||
end_doc_time = time.time() - start_doc_time
|
end_doc_time = time.time() - start_doc_time
|
||||||
_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
|
_log.info(
|
||||||
|
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
|
||||||
|
)
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
@ -228,12 +236,21 @@ class DocumentConverter:
|
|||||||
if in_doc.valid:
|
if in_doc.valid:
|
||||||
pipeline = self._get_pipeline(in_doc)
|
pipeline = self._get_pipeline(in_doc)
|
||||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
||||||
|
if raises_on_error:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"No pipeline could be initialized for {in_doc.file}."
|
||||||
|
)
|
||||||
|
else:
|
||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(input=in_doc)
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||||
|
|
||||||
|
else:
|
||||||
|
if raises_on_error:
|
||||||
|
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# invalid doc or not of desired format
|
# invalid doc or not of desired format
|
||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(input=in_doc)
|
||||||
|
@ -34,12 +34,6 @@ class BasePipeline(ABC):
|
|||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(input=in_doc)
|
||||||
|
|
||||||
_log.info(f"Processing document {in_doc.file.name}")
|
_log.info(f"Processing document {in_doc.file.name}")
|
||||||
|
|
||||||
if not in_doc.valid:
|
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
|
||||||
return conv_res
|
|
||||||
|
|
||||||
# TODO: propagate option for raises_on_error?
|
|
||||||
try:
|
try:
|
||||||
# These steps are building and assembling the structure of the
|
# These steps are building and assembling the structure of the
|
||||||
# output DoclingDocument
|
# output DoclingDocument
|
||||||
@ -155,7 +149,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
end_pb_time = time.time() - start_pb_time
|
end_pb_time = time.time() - start_pb_time
|
||||||
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
|
Loading…
Reference in New Issue
Block a user