From 203cf19b1b3174ca6e4a6495861d4cf6a277bca3 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 8 Oct 2024 16:38:42 +0200 Subject: [PATCH] Lots of improvements Signed-off-by: Christoph Auer --- docling/datamodel/document.py | 68 +++++++++++++------- docling/document_converter.py | 84 +++++++++++++++++++------ docling/pipeline/base_model_pipeline.py | 2 +- examples/batch_convert.py | 9 +-- examples/custom_convert.py | 16 +++-- examples/export_figures.py | 13 ++-- examples/export_multimodal.py | 13 ++-- examples/export_tables.py | 4 +- examples/minimal.py | 4 +- examples/run_with_formats.py | 33 +++++++--- 10 files changed, 169 insertions(+), 77 deletions(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 0802602f..7e5897c1 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -98,21 +98,18 @@ class InputDocument(BaseModel): def __init__( self, path_or_stream: Union[BytesIO, Path], + format: InputFormat, + backend: AbstractDocumentBackend, filename: Optional[str] = None, limits: Optional[DocumentLimits] = None, - backend: Optional[Type[AbstractDocumentBackend]] = None, - format: Optional[InputFormat] = None, ): super().__init__() self.limits = limits or DocumentLimits() + self.format = format try: if isinstance(path_or_stream, Path): - mime = filetype.guess_mime(str(path_or_stream)) - if mime is None: - if path_or_stream.suffix == ".html": - mime = "text/html" self.file = path_or_stream self.filesize = path_or_stream.stat().st_size @@ -121,11 +118,9 @@ class InputDocument(BaseModel): else: self.document_hash = create_file_hash(path_or_stream) - self._init_doc(backend, mime, path_or_stream) + self._init_doc(backend, path_or_stream) elif isinstance(path_or_stream, BytesIO): - mime = filetype.guess_mime(path_or_stream.read(8192)) - self.file = PurePath(filename) self.filesize = path_or_stream.getbuffer().nbytes @@ -134,7 +129,7 @@ class InputDocument(BaseModel): else: self.document_hash = 
create_file_hash(path_or_stream) - self._init_doc(backend, mime, path_or_stream) + self._init_doc(backend, path_or_stream) # For paginated backends, check if the maximum page count is exceeded. if self.valid and self._backend.is_valid(): @@ -158,23 +153,19 @@ class InputDocument(BaseModel): def _init_doc( self, backend: AbstractDocumentBackend, - mime: str, path_or_stream: Union[BytesIO, Path], ) -> None: - self.format = MimeTypeToFormat.get(mime) - if self.format is not None: - backend = backend or _input_format_default_backends.get(self.format) + if backend is None: + backend = _input_format_default_backends.get(self.format) if backend is None: + self.valid = False raise RuntimeError( - f"Could not find suitable default backend for format: {self.format}" + f"Could not find suitable backend for file: {self.file}" ) - if self.format is None or self.format not in backend.supported_formats(): - # TODO decide if to raise exception here too. - self.valid = False - else: - self._backend = backend( - path_or_stream=path_or_stream, document_hash=self.document_hash - ) + + self._backend = backend( + path_or_stream=path_or_stream, document_hash=self.document_hash + ) @deprecated("Use `ConversionResult` instead.") @@ -478,17 +469,46 @@ class DocumentConversionInput(BaseModel): limits: Optional[DocumentLimits] = DocumentLimits() def docs( - self, backend: Optional[Type[AbstractDocumentBackend]] = None + self, format_options: Dict[InputFormat, "FormatOption"] ) -> Iterable[InputDocument]: for obj in self._path_or_stream_iterator: if isinstance(obj, Path): + + mime = filetype.guess_mime(str(obj)) + if mime is None: + if obj.suffix == ".html": + mime = "text/html" + + format = MimeTypeToFormat.get(mime) + if format not in format_options.keys(): + continue + else: + backend = format_options.get(format).backend + yield InputDocument( - path_or_stream=obj, limits=self.limits, backend=backend + path_or_stream=obj, + format=format, + limits=self.limits, + backend=backend, ) elif 
isinstance(obj, DocumentStream): + mime = filetype.guess_mime(obj.stream.read(8192)) + obj.stream.seek(0) + + if mime is None: + if PurePath(obj.filename).suffix == ".html": + mime = "text/html" + + format = MimeTypeToFormat.get(mime) + if format not in format_options.keys(): + continue + else: + backend = format_options.get(format).backend + yield InputDocument( path_or_stream=obj.stream, + format=format, filename=obj.filename, limits=self.limits, backend=backend, diff --git a/docling/document_converter.py b/docling/document_converter.py index 8d71a880..9a2e5143 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -8,6 +8,7 @@ import requests from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ( ConversionResult, @@ -27,7 +28,7 @@ _log = logging.getLogger(__name__) class FormatOption(BaseModel): pipeline_cls: Type[BaseModelPipeline] pipeline_options: Optional[PipelineOptions] = None - backend: Optional[Type[AbstractDocumentBackend]] + backend: Optional[Type[AbstractDocumentBackend]] = None model_config = ConfigDict(arbitrary_types_allowed=True) @@ -47,11 +48,29 @@ class FormatOption(BaseModel): ) +class PdfFormatOption(FormatOption): + def __init__( + self, + pipeline_cls: Optional[Type[BaseModelPipeline]] = None, + pipeline_options: Optional[PipelineOptions] = None, + backend: Optional[Type[AbstractDocumentBackend]] = None, + ): + if pipeline_cls is None: + pipeline_cls = StandardPdfModelPipeline + if backend is None: + backend = DoclingParseDocumentBackend + super().__init__( + pipeline_cls=pipeline_cls, + pipeline_options=pipeline_options, + backend=backend, + ) + + _format_to_default_options = { InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline), 
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline), InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline), - InputFormat.IMAGE: None, + InputFormat.IMAGE: FormatOption(pipeline_cls=StandardPdfModelPipeline), InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline), } @@ -61,11 +80,26 @@ class DocumentConverter: def __init__( self, - formats: List[InputFormat] = [e for e in InputFormat], - format_options: Dict[InputFormat, FormatOption] = _format_to_default_options, + formats: Optional[List[InputFormat]] = None, + format_options: Optional[Dict[InputFormat, FormatOption]] = None, ): self.formats = formats self.format_to_options = format_options + + if self.formats is None: + if self.format_to_options is not None: + self.formats = self.format_to_options.keys() + else: + self.formats = [e for e in InputFormat] # all formats + + if self.format_to_options is None: + self.format_to_options = _format_to_default_options + + for f in self.formats: + if f not in self.format_to_options.keys(): + _log.info(f"Requested format {f} will use default options.") + self.format_to_options[f] = _format_to_default_options[f] + self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = ( {} ) @@ -73,7 +107,8 @@ class DocumentConverter: def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]: for input_batch in chunkify( - input.docs(), settings.perf.doc_batch_size # pass format_options + input.docs(self.format_to_options), + settings.perf.doc_batch_size, # pass format_options ): _log.info(f"Going to convert document batch...") # parallel processing only within input_batch @@ -83,7 +118,9 @@ class DocumentConverter: # yield from pool.map(self.process_document, input_batch) # Note: PDF backends are not thread-safe, thread pool usage was disabled. 
- yield from map(self.process_document, input_batch) + for item in map(self.process_document, input_batch): + if item is not None: + yield item def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult: """Convert a single document. @@ -136,31 +173,40 @@ class DocumentConverter: return conv_res def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]: - pipeline_class = None fopt = self.format_to_options.get(doc.format) + if fopt is None: - return None + raise RuntimeError(f"Could not get pipeline for document {doc.file}") else: pipeline_class = fopt.pipeline_cls + pipeline_options = fopt.pipeline_options - if pipeline_class not in self.initialized_pipelines: + # TODO the cache is keyed by pipeline class only; differing options for the same class replace the cached instance. + if ( + pipeline_class not in self.initialized_pipelines + or self.initialized_pipelines[pipeline_class].pipeline_options + != pipeline_options + ): self.initialized_pipelines[pipeline_class] = pipeline_class( - pipeline_options=pipeline_class.get_default_options() + pipeline_options=pipeline_options ) return self.initialized_pipelines[pipeline_class] - def process_document(self, in_doc: InputDocument) -> ConversionResult: + def process_document(self, in_doc: InputDocument) -> Optional[ConversionResult]: - start_doc_time = time.time() + if in_doc.format not in self.formats: + return None + else: + start_doc_time = time.time() - conv_res = self._execute_pipeline(in_doc) + conv_res = self._execute_pipeline(in_doc) - end_doc_time = time.time() - start_doc_time - _log.info(f"Finished converting document in {end_doc_time:.2f} seconds.") + end_doc_time = time.time() - start_doc_time + _log.info(f"Finished converting document in {end_doc_time:.2f} seconds.") - return conv_res + return conv_res - def _execute_pipeline(self, in_doc: InputDocument) -> ConversionResult: - if in_doc.valid and in_doc.format in self.formats: + def _execute_pipeline(self, in_doc: InputDocument) -> Optional[ConversionResult]: + if in_doc.valid: pipeline = self._get_pipeline(in_doc) if pipeline 
is None: # Can't find a default pipeline. Should this raise? conv_res = ConversionResult(input=in_doc) @@ -168,7 +214,9 @@ class DocumentConverter: return conv_res conv_res = pipeline.execute(in_doc) - else: # invalid doc or not of desired format + + else: + # invalid doc or not of desired format conv_res = ConversionResult(input=in_doc) conv_res.status = ConversionStatus.FAILURE # TODO add error log why it failed. diff --git a/docling/pipeline/base_model_pipeline.py b/docling/pipeline/base_model_pipeline.py index 87d251c1..3a77a70d 100644 --- a/docling/pipeline/base_model_pipeline.py +++ b/docling/pipeline/base_model_pipeline.py @@ -109,7 +109,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name. f"Encountered an error during conversion of document {in_doc.document_hash}:\n" f"{trace}" ) - raise e + raise e # TODO Debug, should not be here. finally: # Always unload the PDF backend, even in case of failure if in_doc._backend: diff --git a/examples/batch_convert.py b/examples/batch_convert.py index 382d930d..67788e83 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -6,8 +6,9 @@ from typing import Iterable import yaml -from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions +from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -110,11 +111,7 @@ def main(): # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] # input = DocumentConversionInput.from_streams(docs) - doc_converter = PdfDocumentConverter( - pipeline_options=PdfPipelineOptions(), - pdf_backend=DocumentConversionInput.DEFAULT_BACKEND, - pipeline_cls=StandardModelPipeline, - ) + doc_converter = DocumentConverter() input = DocumentConversionInput.from_paths(input_doc_paths) diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 
ad19480d..9606e0c0 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -4,10 +4,11 @@ import time from pathlib import Path from typing import Iterable -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend -from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions +from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.pdf_document_converter import PdfDocumentConverter +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.document_converter import DocumentConverter, FormatOption +from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline _log = logging.getLogger(__name__) @@ -101,9 +102,12 @@ def main(): pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - doc_converter = PdfDocumentConverter( - pipeline_options=pipeline_options, - pdf_backend=DoclingParseDocumentBackend, + doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: FormatOption( + pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options + ) + } ) # Docling Parse with OCR diff --git a/examples/export_figures.py b/examples/export_figures.py index f2285d50..0851aa6b 100644 --- a/examples/export_figures.py +++ b/examples/export_figures.py @@ -5,11 +5,12 @@ from pathlib import Path from docling.datamodel.base_models import ( ConversionStatus, FigureElement, - PdfPipelineOptions, + InputFormat, Table, ) from docling.datamodel.document import DocumentConversionInput -from docling.pdf_document_converter import PdfDocumentConverter +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.document_converter import DocumentConverter, PdfFormatOption _log = logging.getLogger(__name__) @@ -28,12 +29,16 @@ def main(): # Important: For operating with page images, we must keep 
them, otherwise the DocumentConverter # will destroy them for cleaning up memory. - # This is done by setting PipelineOptions.images_scale, which also defines the scale of images. + # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images. # scale=1 correspond of a standard 72 DPI image pipeline_options = PdfPipelineOptions() pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE - doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options) + doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } + ) start_time = time.time() diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py index 2ac33466..c8dd3cc1 100644 --- a/examples/export_multimodal.py +++ b/examples/export_multimodal.py @@ -5,9 +5,10 @@ from pathlib import Path import pandas as pd -from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions +from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import DocumentConversionInput -from docling.pdf_document_converter import PdfDocumentConverter +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.document_converter import DocumentConverter, PdfFormatOption from docling.utils.export import generate_multimodal_pages _log = logging.getLogger(__name__) @@ -27,12 +28,16 @@ def main(): # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. - # This is done by setting PipelineOptions.images_scale, which also defines the scale of images. + # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images. 
# scale=1 correspond of a standard 72 DPI image pipeline_options = PdfPipelineOptions() pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE - doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options) + doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } + ) start_time = time.time() diff --git a/examples/export_tables.py b/examples/export_tables.py index 90329324..827d254e 100644 --- a/examples/export_tables.py +++ b/examples/export_tables.py @@ -6,7 +6,7 @@ import pandas as pd from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import DocumentConversionInput -from docling.pdf_document_converter import PdfDocumentConverter +from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -21,7 +21,7 @@ def main(): input_files = DocumentConversionInput.from_paths(input_doc_paths) - doc_converter = PdfDocumentConverter() + doc_converter = DocumentConverter() start_time = time.time() diff --git a/examples/minimal.py b/examples/minimal.py index 4102f7e2..837db718 100644 --- a/examples/minimal.py +++ b/examples/minimal.py @@ -1,6 +1,6 @@ -from docling.pdf_document_converter import PdfDocumentConverter +from docling.document_converter import DocumentConverter source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL -converter = PdfDocumentConverter() +converter = DocumentConverter() doc = converter.convert_single(source) print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]" diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index de6478dd..cdf1c670 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -1,8 +1,13 @@ import logging from pathlib import Path +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.msword_backend import MsWordDocumentBackend +from docling.datamodel.base_models import 
InputFormat from docling.datamodel.document import DocumentConversionInput -from docling.document_converter import DocumentConverter +from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption +from docling.pipeline.simple_model_pipeline import SimpleModelPipeline +from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline _log = logging.getLogger(__name__) @@ -13,6 +18,7 @@ input_paths = [ Path("tests/data/word_sample.docx"), Path("tests/data/lorem_ipsum.docx"), Path("tests/data/powerpoint_sample.pptx"), + Path("tests/data/powerpoint_sample.pptx"), Path("tests/data/2206.01062.pdf"), ] input = DocumentConversionInput.from_paths(input_paths) @@ -21,22 +27,29 @@ input = DocumentConversionInput.from_paths(input_paths) doc_converter = DocumentConverter() # to customize use: -# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. -# formats=[InputFormat.PDF, InputFormat.DOCX], +# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. +# formats=[ +# InputFormat.PDF, +# InputFormat.DOCX, +# ], # whitelist formats, other files are ignored. # format_options={ -# InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend), -# InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend) -# } +# InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend), +# InputFormat.DOCX: FormatOption( +# pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend +# ), +# # InputFormat.IMAGE: PdfFormatOption(), +# }, # ) conv_results = doc_converter.convert(input) for res in conv_results: - print("") + out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md" print( - f"Document {res.input.file.name} converted with status {res.status}. Content:" + f"Document {res.input.file.name} converted with status {res.status}." 
+ f"\nSaved markdown output to: {str(out_path)}" ) - print(res.experimental.export_to_markdown()) + # print(res.experimental.export_to_markdown()) # Export Docling document format to markdown (experimental): - with (Path("./scratch") / f"{res.input.file.name}.experimental.md").open("w") as fp: + with out_path.open("w") as fp: fp.write(res.experimental.export_to_markdown())