From 304d16029a5d5e69ef3e1ea6bb0831f02c40efab Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 11 Oct 2024 10:21:31 +0200
Subject: [PATCH] More renaming, design enrichment interface

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 README.md                                     |  3 ++-
 docling/cli/main.py                           |  8 +++---
 docling/datamodel/document.py                 | 13 +++++-----
 docling/document_converter.py                 | 25 +++++++++++++------
 docling/models/abstract_model.py              | 12 ++++++++-
 docling/pipeline/base_model_pipeline.py       | 12 ++++-----
 docling/pipeline/simple_model_pipeline.py     | 20 ++++++++++-----
 .../pipeline/standard_pdf_model_pipeline.py   |  6 ++++-
 examples/batch_convert.py                     | 10 ++++----
 examples/custom_convert.py                    |  8 +++---
 examples/export_figures.py                    |  2 +-
 examples/export_multimodal.py                 |  2 +-
 examples/export_tables.py                     |  2 +-
 examples/run_with_formats.py                  |  2 +-
 tests/test_e2e_ocr_conversion.py              |  4 +--
 tests/test_interfaces.py                      |  4 +--
 tests/verify_utils.py                         |  4 +--
 17 files changed, 85 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index 96fa50ee..df93472b 100644
--- a/README.md
+++ b/README.md
@@ -270,11 +270,12 @@ conv_input = DocumentConversionInput.from_paths(
 ### Convert from binary PDF streams
 
 You can convert PDFs from a binary stream instead of from the filesystem as follows:
+
 ```python
 buf = BytesIO(your_binary_stream)
 docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
 conv_input = DocumentConversionInput.from_streams(docs)
-results = doc_converter.convert(conv_input)
+results = doc_converter.convert_batch(conv_input)
 ```
 ### Limit resource usage
 
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 2387fc35..b925e796 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -94,21 +94,21 @@ def export_documents(
                 fname = output_dir / f"{doc_filename}.txt"
                 with fname.open("w") as fp:
                     _log.info(f"writing Text output to {fname}")
-                    fp.write(conv_res.render_as_text_v1())
+                    fp.write(conv_res.render_as_text())
 
             # Export Markdown format:
             if export_md:
                 fname = output_dir / f"{doc_filename}.md"
                 with fname.open("w") as fp:
                     _log.info(f"writing Markdown output to {fname}")
-                    fp.write(conv_res.render_as_markdown_v1())
+                    fp.write(conv_res.render_as_markdown())
 
             # Export Document Tags format:
             if export_doctags:
                 fname = output_dir / f"{doc_filename}.doctags"
                 with fname.open("w") as fp:
                     _log.info(f"writing Doc Tags output to {fname}")
-                    fp.write(conv_res.render_as_doctags_v1())
+                    fp.write(conv_res.render_as_doctags())
 
         else:
             _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -236,7 +236,7 @@ def convert(
 
     start_time = time.time()
 
-    conv_results = doc_converter.convert(input)
+    conv_results = doc_converter.convert_batch(input)
 
     output.mkdir(parents=True, exist_ok=True)
     export_documents(
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index c819526c..b7c020f2 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -351,11 +351,11 @@ class ConvertedDocument(BaseModel):
         return ds_doc
 
     @deprecated("Use output.export_to_dict() instead.")
-    def render_as_dict_v1(self):
+    def render_as_dict(self):
         return self.legacy_output.model_dump(by_alias=True, exclude_none=True)
 
     @deprecated("Use output.export_to_markdown() instead.")
-    def render_as_markdown_v1(
+    def render_as_markdown(
         self,
         delim: str = "\n\n",
         main_text_start: int = 0,
@@ -381,7 +381,7 @@ class ConvertedDocument(BaseModel):
         )
 
     @deprecated("Use output.export_to_text() instead.")
-    def render_as_text_v1(
+    def render_as_text(
         self,
         delim: str = "\n\n",
         main_text_start: int = 0,
@@ -402,7 +402,7 @@ class ConvertedDocument(BaseModel):
         )
 
     @deprecated("Use output.export_to_document_tokens() instead.")
-    def render_as_doctags_v1(
+    def render_as_doctags(
         self,
         delim: str = "\n\n",
         main_text_start: int = 0,
@@ -501,11 +501,12 @@ class DocumentConversionInput(BaseModel):
             mime = filetype.guess_mime(str(obj))
         elif isinstance(obj, DocumentStream):
             mime = filetype.guess_mime(obj.stream.read(8192))
-        else:
-            1 == 1  # alert!!
         if mime is None:
+            # TODO: improve this fallback; only the ".html" suffix is handled so far.
+
             if obj.suffix == ".html":
                 mime = "text/html"
+
         format = MimeTypeToFormat.get(mime)
         return format
 
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 8a1d1464..dc919883 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -14,6 +14,7 @@ from pydantic import (
     field_validator,
     model_validator,
 )
+from typing_extensions import deprecated
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@@ -28,7 +29,7 @@ from docling.datamodel.document import (
 )
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
+from docling.pipeline.base_model_pipeline import AbstractModelPipeline
 from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
 from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
 from docling.utils.utils import chunkify
@@ -37,7 +38,7 @@ _log = logging.getLogger(__name__)
 
 
 class FormatOption(BaseModel):
-    pipeline_cls: Type[BaseModelPipeline]
+    pipeline_cls: Type[AbstractModelPipeline]
     pipeline_options: Optional[PipelineOptions] = None
     backend: Type[AbstractDocumentBackend]
 
@@ -114,11 +115,17 @@ class DocumentConverter:
                 _log.info(f"Requested format {f} will use default options.")
                 self.format_to_options[f] = _format_to_default_options[f]
 
-        self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
-            {}
-        )
+        self.initialized_pipelines: Dict[
+            Type[AbstractModelPipeline], AbstractModelPipeline
+        ] = {}
 
+    @deprecated("Use convert_batch instead.")
     def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
+        yield from self.convert_batch(input=input)
+
+    def convert_batch(
+        self, input: DocumentConversionInput, raise_on_error: bool = False
+    ) -> Iterable[ConversionResult]:
 
         for input_batch in chunkify(
             input.docs(self.format_to_options),
@@ -136,7 +143,9 @@ class DocumentConverter:
                 if item is not None:
                     yield item
 
-    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
+    def convert_single(
+        self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False
+    ) -> ConversionResult:
         """Convert a single document.
 
         Args:
@@ -177,7 +186,7 @@ class DocumentConverter:
                         f"Unexpected file path type encountered: {type(source)}"
                     )
             conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            conv_res_iter = self.convert(conv_inp)
+            conv_res_iter = self.convert_batch(conv_inp)
             conv_res: ConversionResult = next(conv_res_iter)
         if conv_res.status not in {
             ConversionStatus.SUCCESS,
@@ -186,7 +195,7 @@ class DocumentConverter:
             raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
         return conv_res
 
-    def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
+    def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
         fopt = self.format_to_options.get(doc.format)
 
         if fopt is None:
diff --git a/docling/models/abstract_model.py b/docling/models/abstract_model.py
index ba5dc62c..d028bad9 100644
--- a/docling/models/abstract_model.py
+++ b/docling/models/abstract_model.py
@@ -1,5 +1,7 @@
 from abc import ABC, abstractmethod
-from typing import Iterable
+from typing import Any, Iterable
+
+from docling_core.types.experimental import DoclingDocument, NodeItem
 
 from docling.datamodel.base_models import Page
 
@@ -8,3 +10,11 @@ class AbstractPageModel(ABC):
     @abstractmethod
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         pass
+
+
+class AbstractEnrichmentModel(ABC):
+    @abstractmethod
+    def __call__(
+        self, doc: DoclingDocument, elements: Iterable[NodeItem]
+    ) -> Iterable[Any]:
+        pass
diff --git a/docling/pipeline/base_model_pipeline.py b/docling/pipeline/base_model_pipeline.py
index ce76ba7b..aa4384b2 100644
--- a/docling/pipeline/base_model_pipeline.py
+++ b/docling/pipeline/base_model_pipeline.py
@@ -21,7 +21,7 @@ from docling.utils.utils import chunkify
 _log = logging.getLogger(__name__)
 
 
-class BaseModelPipeline(ABC):
+class AbstractModelPipeline(ABC):
     def __init__(self, pipeline_options: PipelineOptions):
         self.pipeline_options = pipeline_options
         self.model_pipe: List[Callable] = []
@@ -31,7 +31,7 @@ class BaseModelPipeline(ABC):
         pass
 
     @abstractmethod
-    def assemble_document(
+    def _assemble_document(
         self, in_doc: InputDocument, conv_res: ConversionResult
     ) -> ConversionResult:
         pass
@@ -47,9 +47,9 @@ class BaseModelPipeline(ABC):
         pass
 
 
-class PaginatedModelPipeline(BaseModelPipeline):  # TODO this is a bad name.
+class PaginatedModelPipeline(AbstractModelPipeline):  # TODO this is a bad name.
 
-    def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for model in self.model_pipe:
             page_batch = model(page_batch)
 
@@ -83,7 +83,7 @@ class PaginatedModelPipeline(BaseModelPipeline):  # TODO this is a bad name.
                 )
 
                 # 2. Run pipeline stages
-                pipeline_pages = self.apply_on_pages(init_pages)
+                pipeline_pages = self._apply_on_pages(init_pages)
 
                 for p in pipeline_pages:  # Must exhaust!
                     pass
@@ -91,7 +91,7 @@ class PaginatedModelPipeline(BaseModelPipeline):  # TODO this is a bad name.
                 end_pb_time = time.time() - start_pb_time
                 _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
 
-            conv_res = self.assemble_document(in_doc, conv_res)
+            conv_res = self._assemble_document(in_doc, conv_res)
 
             status = ConversionStatus.SUCCESS
             for page in conv_res.pages:
diff --git a/docling/pipeline/simple_model_pipeline.py b/docling/pipeline/simple_model_pipeline.py
index cff41c0f..ceef4d06 100644
--- a/docling/pipeline/simple_model_pipeline.py
+++ b/docling/pipeline/simple_model_pipeline.py
@@ -1,4 +1,7 @@
 import logging
+from typing import Iterable
+
+from docling_core.types.experimental import NodeItem
 
 from docling.backend.abstract_backend import (
     AbstractDocumentBackend,
@@ -7,19 +10,19 @@ from docling.backend.abstract_backend import (
 from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
+from docling.pipeline.base_model_pipeline import AbstractModelPipeline
 
 _log = logging.getLogger(__name__)
 
 
-class SimpleModelPipeline(BaseModelPipeline):
+class SimpleModelPipeline(AbstractModelPipeline):
     """SimpleModelPipeline.
 
     This class is used at the moment for formats / backends
     which produce straight DoclingDocument output.
     """
 
-    def __init__(self, pipeline_options: PdfPipelineOptions):
+    def __init__(self, pipeline_options: PipelineOptions):
         super().__init__(pipeline_options)
 
     def execute(self, in_doc: InputDocument) -> ConversionResult:
@@ -45,16 +48,21 @@ class SimpleModelPipeline(BaseModelPipeline):
         # a DoclingDocument straight.
 
         conv_res.output = in_doc._backend.convert()
-
         # Do other stuff with conv_res.experimental
 
-        conv_res = self.assemble_document(in_doc, conv_res)
+        conv_res = self._assemble_document(in_doc, conv_res)
 
         conv_res.status = ConversionStatus.SUCCESS
 
         return conv_res
 
-    def assemble_document(
+    # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
+    #    for model in self.model_pipe:
+    #        element_batch = model(element_batch)
+    #
+    #    yield from element_batch
+
+    def _assemble_document(
         self, in_doc: InputDocument, conv_res: ConversionResult
     ) -> ConversionResult:
         return conv_res
diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py
index dc19e1f7..dba1f3dc 100644
--- a/docling/pipeline/standard_pdf_model_pipeline.py
+++ b/docling/pipeline/standard_pdf_model_pipeline.py
@@ -83,6 +83,10 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
             PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
         ]
 
+        self.enrichment_pipe = [
+            # Other models working on `NodeItem` elements in the DoclingDocument
+        ]
+
     @staticmethod
     def download_models_hf(
         local_dir: Optional[Path] = None, force: bool = False
@@ -104,7 +108,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
 
         return page
 
-    def assemble_document(
+    def _assemble_document(
         self, in_doc: InputDocument, conv_res: ConversionResult
     ) -> ConversionResult:
         all_elements = []
diff --git a/examples/batch_convert.py b/examples/batch_convert.py
index 6f04ef03..ca4988f3 100644
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@@ -36,25 +36,25 @@ def export_documents(
                 with (output_dir / f"{doc_filename}.legacy.json").open(
                     "w", encoding="utf-8"
                 ) as fp:
-                    fp.write(json.dumps(conv_res.render_as_dict_v1()))
+                    fp.write(json.dumps(conv_res.render_as_dict()))
 
                 # Export Text format:
                 with (output_dir / f"{doc_filename}.legacy.txt").open(
                     "w", encoding="utf-8"
                 ) as fp:
-                    fp.write(conv_res.render_as_text_v1())
+                    fp.write(conv_res.render_as_text())
 
                 # Export Markdown format:
                 with (output_dir / f"{doc_filename}.legacy.md").open(
                     "w", encoding="utf-8"
                 ) as fp:
-                    fp.write(conv_res.render_as_markdown_v1())
+                    fp.write(conv_res.render_as_markdown())
 
                 # Export Document Tags format:
                 with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
                     "w", encoding="utf-8"
                 ) as fp:
-                    fp.write(conv_res.render_as_doctags_v1())
+                    fp.write(conv_res.render_as_doctags())
 
             if USE_V2:
                 # Export Docling document format to JSON (experimental):
@@ -129,7 +129,7 @@ def main():
 
     start_time = time.time()
 
-    conv_results = doc_converter.convert(input)
+    conv_results = doc_converter.convert_batch(input)
     success_count, partial_success_count, failure_count = export_documents(
         conv_results, output_dir=Path("./scratch")
     )
diff --git a/examples/custom_convert.py b/examples/custom_convert.py
index 1ebd936e..0805837b 100644
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@@ -39,17 +39,17 @@ def export_documents(
 
             # Export Text format:
             with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
-                fp.write(conv_res.render_as_text_v1())
+                fp.write(conv_res.render_as_text())
 
             # Export Markdown format:
             with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
-                fp.write(conv_res.render_as_markdown_v1())
+                fp.write(conv_res.render_as_markdown())
 
             # Export Document Tags format:
             with (output_dir / f"{doc_filename}.doctags").open(
                 "w", encoding="utf-8"
             ) as fp:
-                fp.write(conv_res.render_as_doctags_v1())
+                fp.write(conv_res.render_as_doctags())
 
         else:
             _log.info(f"Document {conv_res.input.file} failed to convert.")
@@ -157,7 +157,7 @@ def main():
 
     start_time = time.time()
 
-    conv_results = doc_converter.convert(input)
+    conv_results = doc_converter.convert_batch(input)
     success_count, failure_count = export_documents(
         conv_results, output_dir=Path("./scratch")
     )
diff --git a/examples/export_figures.py b/examples/export_figures.py
index 0851aa6b..23f1bd20 100644
--- a/examples/export_figures.py
+++ b/examples/export_figures.py
@@ -42,7 +42,7 @@ def main():
 
     start_time = time.time()
 
-    conv_results = doc_converter.convert(input_files)
+    conv_results = doc_converter.convert_batch(input_files)
 
     success_count = 0
     failure_count = 0
diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py
index c8dd3cc1..11dd3f41 100644
--- a/examples/export_multimodal.py
+++ b/examples/export_multimodal.py
@@ -41,7 +41,7 @@ def main():
 
     start_time = time.time()
 
-    converted_docs = doc_converter.convert(input_files)
+    converted_docs = doc_converter.convert_batch(input_files)
 
     success_count = 0
     failure_count = 0
diff --git a/examples/export_tables.py b/examples/export_tables.py
index 126aa502..720e8c67 100644
--- a/examples/export_tables.py
+++ b/examples/export_tables.py
@@ -25,7 +25,7 @@ def main():
 
     start_time = time.time()
 
-    conv_results = doc_converter.convert(input_files)
+    conv_results = doc_converter.convert_batch(input_files)
 
     success_count = 0
     failure_count = 0
diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py
index aa915578..37d49e1c 100644
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -50,7 +50,7 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
     },
 )
 
-conv_results = doc_converter.convert(input)
+conv_results = doc_converter.convert_batch(input)
 
 for res in conv_results:
     out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 1f7b619d..c0c0a497 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -39,11 +39,11 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
 
     doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
     with open(doctags_fn, "w") as fd:
-        fd.write(doc_result.render_as_doctags_v1())
+        fd.write(doc_result.render_as_doctags())
 
     md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
     with open(md_fn, "w") as fd:
-        fd.write(doc_result.render_as_markdown_v1())
+        fd.write(doc_result.render_as_markdown())
 
 
 def get_pdf_paths():
diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py
index d3c33d99..80f5ea4e 100644
--- a/tests/test_interfaces.py
+++ b/tests/test_interfaces.py
@@ -54,7 +54,7 @@ def test_batch_path(converter: DocumentConverter):
 
     conv_input = DocumentConversionInput.from_paths([pdf_path])
 
-    results = converter.convert(conv_input)
+    results = converter.convert_batch(conv_input)
     for doc_result in results:
         verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
         verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
@@ -69,7 +69,7 @@ def test_batch_bytes(converter: DocumentConverter):
     docs = [DocumentStream(name=pdf_path.name, stream=buf)]
     conv_input = DocumentConversionInput.from_streams(docs)
 
-    results = converter.convert(conv_input)
+    results = converter.convert_batch(conv_input)
     for doc_result in results:
         verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
         verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 6bfa3460..7af6ab12 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -198,8 +198,8 @@ def verify_conversion_result_v1(
 
     doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DsDocument = doc_result.legacy_output
-    doc_pred_md = doc_result.render_as_markdown_v1()
-    doc_pred_dt = doc_result.render_as_doctags_v1()
+    doc_pred_md = doc_result.render_as_markdown()
+    doc_pred_dt = doc_result.render_as_doctags()
 
     engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
     gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name