From 304d16029a5d5e69ef3e1ea6bb0831f02c40efab Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 11 Oct 2024 10:21:31 +0200 Subject: [PATCH] More renaming, design enrichment interface Signed-off-by: Christoph Auer --- README.md | 3 ++- docling/cli/main.py | 8 +++--- docling/datamodel/document.py | 13 +++++----- docling/document_converter.py | 25 +++++++++++++------ docling/models/abstract_model.py | 12 ++++++++- docling/pipeline/base_model_pipeline.py | 12 ++++----- docling/pipeline/simple_model_pipeline.py | 20 ++++++++++----- .../pipeline/standard_pdf_model_pipeline.py | 6 ++++- examples/batch_convert.py | 10 ++++---- examples/custom_convert.py | 8 +++--- examples/export_figures.py | 2 +- examples/export_multimodal.py | 2 +- examples/export_tables.py | 2 +- examples/run_with_formats.py | 2 +- tests/test_e2e_ocr_conversion.py | 4 +-- tests/test_interfaces.py | 4 +-- tests/verify_utils.py | 4 +-- 17 files changed, 85 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 96fa50ee..df93472b 100644 --- a/README.md +++ b/README.md @@ -270,11 +270,12 @@ conv_input = DocumentConversionInput.from_paths( ### Convert from binary PDF streams You can convert PDFs from a binary stream instead of from the filesystem as follows: + ```python buf = BytesIO(your_binary_stream) docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] conv_input = DocumentConversionInput.from_streams(docs) -results = doc_converter.convert(conv_input) +results = doc_converter.convert_batch(conv_input) ``` ### Limit resource usage diff --git a/docling/cli/main.py b/docling/cli/main.py index 2387fc35..b925e796 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -94,21 +94,21 @@ def export_documents( fname = output_dir / f"{doc_filename}.txt" with fname.open("w") as fp: _log.info(f"writing Text output to {fname}") - fp.write(conv_res.render_as_text_v1()) + fp.write(conv_res.render_as_text()) # Export Markdown format: if export_md: fname = output_dir / f"{doc_filename}.md" with fname.open("w") as fp: _log.info(f"writing Markdown output to {fname}") - fp.write(conv_res.render_as_markdown_v1()) + fp.write(conv_res.render_as_markdown()) # Export Document Tags format: if export_doctags: fname = output_dir / f"{doc_filename}.doctags" with fname.open("w") as fp: _log.info(f"writing Doc Tags output to {fname}") - fp.write(conv_res.render_as_doctags_v1()) + fp.write(conv_res.render_as_doctags()) else: _log.warning(f"Document {conv_res.input.file} failed to convert.") @@ -236,7 +236,7 @@ def convert( start_time = time.time() - conv_results = doc_converter.convert(input) + conv_results = doc_converter.convert_batch(input) output.mkdir(parents=True, exist_ok=True) export_documents( diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index c819526c..b7c020f2 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -351,11 +351,11 @@ class ConvertedDocument(BaseModel): return ds_doc @deprecated("Use output.export_to_dict() instead.") - def render_as_dict_v1(self): + def render_as_dict(self): return self.legacy_output.model_dump(by_alias=True, exclude_none=True) @deprecated("Use output.export_to_markdown() instead.") - def render_as_markdown_v1( + def render_as_markdown( self, delim: str = "\n\n", main_text_start: int = 0, @@ -381,7 +381,7 @@ class ConvertedDocument(BaseModel): ) @deprecated("Use output.export_to_text() instead.") - def render_as_text_v1( + def render_as_text( self, delim: str = "\n\n", main_text_start: int = 0, @@ -402,7 +402,7 @@ class ConvertedDocument(BaseModel): ) @deprecated("Use output.export_to_document_tokens() instead.") - def render_as_doctags_v1( + def render_as_doctags( self, delim: str = "\n\n", main_text_start: int = 0, @@ -501,11 +501,12 @@ class DocumentConversionInput(BaseModel): mime = filetype.guess_mime(str(obj)) elif isinstance(obj, DocumentStream): mime = filetype.guess_mime(obj.stream.read(8192)) - else: - 1 == 1 # alert!! if mime is None: + # TODO improve this. + if obj.suffix == ".html": mime = "text/html" + format = MimeTypeToFormat.get(mime) return format diff --git a/docling/document_converter.py b/docling/document_converter.py index 8a1d1464..dc919883 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -14,6 +14,7 @@ from pydantic import ( field_validator, model_validator, ) +from typing_extensions import deprecated from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend @@ -28,7 +29,7 @@ from docling.datamodel.document import ( ) from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.settings import settings -from docling.pipeline.base_model_pipeline import BaseModelPipeline +from docling.pipeline.base_model_pipeline import AbstractModelPipeline from docling.pipeline.simple_model_pipeline import SimpleModelPipeline from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline from docling.utils.utils import chunkify @@ -37,7 +38,7 @@ _log = logging.getLogger(__name__) class FormatOption(BaseModel): - pipeline_cls: Type[BaseModelPipeline] + pipeline_cls: Type[AbstractModelPipeline] pipeline_options: Optional[PipelineOptions] = None backend: Type[AbstractDocumentBackend] @@ -114,11 +115,17 @@ class DocumentConverter: _log.info(f"Requested format {f} will use default options.") self.format_to_options[f] = _format_to_default_options[f] - self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = ( - {} - ) + self.initialized_pipelines: Dict[ + Type[AbstractModelPipeline], AbstractModelPipeline + ] = {} + @deprecated("Use convert_batch instead.") def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]: + yield from self.convert_batch(input=input) + + def convert_batch( + self, input: DocumentConversionInput, raise_on_error: bool = False + ) -> Iterable[ConversionResult]: for input_batch in chunkify( input.docs(self.format_to_options), @@ -136,7 +143,9 @@ class DocumentConverter: if item is not None: yield item - def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult: + def convert_single( + self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False + ) -> ConversionResult: """Convert a single document. Args: @@ -177,7 +186,7 @@ class DocumentConverter: f"Unexpected file path type encountered: {type(source)}" ) conv_inp = DocumentConversionInput.from_paths(paths=[local_path]) - conv_res_iter = self.convert(conv_inp) + conv_res_iter = self.convert_batch(conv_inp) conv_res: ConversionResult = next(conv_res_iter) if conv_res.status not in { ConversionStatus.SUCCESS, @@ -186,7 +195,7 @@ class DocumentConverter: raise RuntimeError(f"Conversion failed with status: {conv_res.status}") return conv_res - def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]: + def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]: fopt = self.format_to_options.get(doc.format) if fopt is None: diff --git a/docling/models/abstract_model.py b/docling/models/abstract_model.py index ba5dc62c..d028bad9 100644 --- a/docling/models/abstract_model.py +++ b/docling/models/abstract_model.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod -from typing import Iterable +from typing import Any, Iterable + +from docling_core.types.experimental import DoclingDocument, NodeItem from docling.datamodel.base_models import Page @@ -8,3 +10,11 @@ class AbstractPageModel(ABC): @abstractmethod def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: pass + + +class AbstractEnrichmentModel(ABC): + @abstractmethod + def __call__( + self, doc: DoclingDocument, elements: Iterable[NodeItem] + ) -> Iterable[Any]: + pass diff --git a/docling/pipeline/base_model_pipeline.py b/docling/pipeline/base_model_pipeline.py index ce76ba7b..aa4384b2 100644 --- a/docling/pipeline/base_model_pipeline.py +++ b/docling/pipeline/base_model_pipeline.py @@ -21,7 +21,7 @@ from docling.utils.utils import chunkify _log = logging.getLogger(__name__) -class BaseModelPipeline(ABC): +class AbstractModelPipeline(ABC): def __init__(self, pipeline_options: PipelineOptions): self.pipeline_options = pipeline_options self.model_pipe: List[Callable] = [] @@ -31,7 +31,7 @@ class BaseModelPipeline(ABC): pass @abstractmethod - def assemble_document( + def _assemble_document( self, in_doc: InputDocument, conv_res: ConversionResult ) -> ConversionResult: pass @@ -47,9 +47,9 @@ class BaseModelPipeline(ABC): pass -class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name. +class PaginatedModelPipeline(AbstractModelPipeline): # TODO this is a bad name. - def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]: for model in self.model_pipe: page_batch = model(page_batch) @@ -83,7 +83,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name. ) # 2. Run pipeline stages - pipeline_pages = self.apply_on_pages(init_pages) + pipeline_pages = self._apply_on_pages(init_pages) for p in pipeline_pages: # Must exhaust! pass @@ -91,7 +91,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name. end_pb_time = time.time() - start_pb_time _log.info(f"Finished converting page batch time={end_pb_time:.3f}") - conv_res = self.assemble_document(in_doc, conv_res) + conv_res = self._assemble_document(in_doc, conv_res) status = ConversionStatus.SUCCESS for page in conv_res.pages: diff --git a/docling/pipeline/simple_model_pipeline.py b/docling/pipeline/simple_model_pipeline.py index cff41c0f..ceef4d06 100644 --- a/docling/pipeline/simple_model_pipeline.py +++ b/docling/pipeline/simple_model_pipeline.py @@ -1,4 +1,7 @@ import logging +from typing import Iterable + +from docling_core.types.experimental import NodeItem from docling.backend.abstract_backend import ( AbstractDocumentBackend, @@ -7,19 +10,19 @@ from docling.backend.abstract_backend import ( from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions -from docling.pipeline.base_model_pipeline import BaseModelPipeline +from docling.pipeline.base_model_pipeline import AbstractModelPipeline _log = logging.getLogger(__name__) -class SimpleModelPipeline(BaseModelPipeline): +class SimpleModelPipeline(AbstractModelPipeline): """SimpleModelPipeline. This class is used at the moment for formats / backends which produce straight DoclingDocument output. """ - def __init__(self, pipeline_options: PdfPipelineOptions): + def __init__(self, pipeline_options: PipelineOptions): super().__init__(pipeline_options) def execute(self, in_doc: InputDocument) -> ConversionResult: @@ -45,16 +48,21 @@ class SimpleModelPipeline(BaseModelPipeline): # a DoclingDocument straight. conv_res.output = in_doc._backend.convert() - # Do other stuff with conv_res.experimental - conv_res = self.assemble_document(in_doc, conv_res) + conv_res = self._assemble_document(in_doc, conv_res) conv_res.status = ConversionStatus.SUCCESS return conv_res - def assemble_document( + # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]: + # for model in self.model_pipe: + # element_batch = model(element_batch) + # + # yield from element_batch + + def _assemble_document( self, in_doc: InputDocument, conv_res: ConversionResult ) -> ConversionResult: return conv_res diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py index dc19e1f7..dba1f3dc 100644 --- a/docling/pipeline/standard_pdf_model_pipeline.py +++ b/docling/pipeline/standard_pdf_model_pipeline.py @@ -83,6 +83,10 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): PageAssembleModel(config={"images_scale": pipeline_options.images_scale}), ] + self.enrichment_pipe = [ + # Other models working on `NodeItem` elements in the DoclingDocument + ] + @staticmethod def download_models_hf( local_dir: Optional[Path] = None, force: bool = False @@ -104,7 +108,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): return page - def assemble_document( + def _assemble_document( self, in_doc: InputDocument, conv_res: ConversionResult ) -> ConversionResult: all_elements = [] diff --git a/examples/batch_convert.py b/examples/batch_convert.py index 6f04ef03..ca4988f3 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -36,25 +36,25 @@ def export_documents( with (output_dir / f"{doc_filename}.legacy.json").open( "w", encoding="utf-8" ) as fp: - fp.write(json.dumps(conv_res.render_as_dict_v1())) + fp.write(json.dumps(conv_res.render_as_dict())) # Export Text format: with (output_dir / f"{doc_filename}.legacy.txt").open( "w", encoding="utf-8" ) as fp: - fp.write(conv_res.render_as_text_v1()) + fp.write(conv_res.render_as_text()) # Export Markdown format: with (output_dir / f"{doc_filename}.legacy.md").open( "w", encoding="utf-8" ) as fp: - fp.write(conv_res.render_as_markdown_v1()) + fp.write(conv_res.render_as_markdown()) # Export Document Tags format: with (output_dir / f"{doc_filename}.legacy.doctags.txt").open( "w", encoding="utf-8" ) as fp: - fp.write(conv_res.render_as_doctags_v1()) + fp.write(conv_res.render_as_doctags()) if USE_V2: # Export Docling document format to JSON (experimental): @@ -129,7 +129,7 @@ def main(): start_time = time.time() - conv_results = doc_converter.convert(input) + conv_results = doc_converter.convert_batch(input) success_count, partial_success_count, failure_count = export_documents( conv_results, output_dir=Path("./scratch") ) diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 1ebd936e..0805837b 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -39,17 +39,17 @@ def export_documents( # Export Text format: with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp: - fp.write(conv_res.render_as_text_v1()) + fp.write(conv_res.render_as_text()) # Export Markdown format: with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp: - fp.write(conv_res.render_as_markdown_v1()) + fp.write(conv_res.render_as_markdown()) # Export Document Tags format: with (output_dir / f"{doc_filename}.doctags").open( "w", encoding="utf-8" ) as fp: - fp.write(conv_res.render_as_doctags_v1()) + fp.write(conv_res.render_as_doctags()) else: _log.info(f"Document {conv_res.input.file} failed to convert.") @@ -157,7 +157,7 @@ def main(): start_time = time.time() - conv_results = doc_converter.convert(input) + conv_results = doc_converter.convert_batch(input) success_count, failure_count = export_documents( conv_results, output_dir=Path("./scratch") ) diff --git a/examples/export_figures.py b/examples/export_figures.py index 0851aa6b..23f1bd20 100644 --- a/examples/export_figures.py +++ b/examples/export_figures.py @@ -42,7 +42,7 @@ def main(): start_time = time.time() - conv_results = doc_converter.convert(input_files) + conv_results = doc_converter.convert_batch(input_files) success_count = 0 failure_count = 0 diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py index c8dd3cc1..11dd3f41 100644 --- a/examples/export_multimodal.py +++ b/examples/export_multimodal.py @@ -41,7 +41,7 @@ def main(): start_time = time.time() - converted_docs = doc_converter.convert(input_files) + converted_docs = doc_converter.convert_batch(input_files) success_count = 0 failure_count = 0 diff --git a/examples/export_tables.py b/examples/export_tables.py index 126aa502..720e8c67 100644 --- a/examples/export_tables.py +++ b/examples/export_tables.py @@ -25,7 +25,7 @@ def main(): start_time = time.time() - conv_results = doc_converter.convert(input_files) + conv_results = doc_converter.convert_batch(input_files) success_count = 0 failure_count = 0 diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index aa915578..37d49e1c 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -50,7 +50,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal }, ) -conv_results = doc_converter.convert(input) +conv_results = doc_converter.convert_batch(input) for res in conv_results: out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md" diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index 1f7b619d..c0c0a497 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -39,11 +39,11 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str): doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt") with open(doctags_fn, "w") as fd: - fd.write(doc_result.render_as_doctags_v1()) + fd.write(doc_result.render_as_doctags()) md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md") with open(md_fn, "w") as fd: - fd.write(doc_result.render_as_markdown_v1()) + fd.write(doc_result.render_as_markdown()) def get_pdf_paths(): diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index d3c33d99..80f5ea4e 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -54,7 +54,7 @@ def test_batch_path(converter: DocumentConverter): conv_input = DocumentConversionInput.from_paths([pdf_path]) - results = converter.convert(conv_input) + results = converter.convert_batch(conv_input) for doc_result in results: verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) @@ -69,7 +69,7 @@ def test_batch_bytes(converter: DocumentConverter): docs = [DocumentStream(name=pdf_path.name, stream=buf)] conv_input = DocumentConversionInput.from_streams(docs) - results = converter.convert(conv_input) + results = converter.convert_batch(conv_input) for doc_result in results: verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 6bfa3460..7af6ab12 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -198,8 +198,8 @@ def verify_conversion_result_v1( doc_pred_pages: List[Page] = doc_result.pages doc_pred: DsDocument = doc_result.legacy_output - doc_pred_md = doc_result.render_as_markdown_v1() - doc_pred_dt = doc_result.render_as_doctags_v1() + doc_pred_md = doc_result.render_as_markdown() + doc_pred_dt = doc_result.render_as_doctags() engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name