docs: enrich existing DoclingDocument (#1969)

add example for enriching an existing doclingdocument Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-07-23 18:45:00 +00:00 · 2025-07-22 16:20:15 +02:00 · 2025-07-22 16:20:15 +02:00 · 90a7cc4bdd
commit 90a7cc4bdd
parent a069b1175b
2 changed files with 133 additions and 0 deletions
--- a/docs/examples/enrich_doclingdocument.py
+++ b/docs/examples/enrich_doclingdocument.py
@ -0,0 +1,132 @@
+## Enrich DoclingDocument
+# This example shows how to run Docling enrichment models on documents that have already been converted
+# and stored as serialized DoclingDocument JSON files.
+
+### Load modules
+
+from pathlib import Path
+from typing import Iterable, Optional
+
+from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
+from rich.pretty import pprint
+
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.base_models import InputFormat, ItemAndImageEnrichmentElement
+from docling.datamodel.document import InputDocument
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
+from docling.models.document_picture_classifier import (
+    DocumentPictureClassifier,
+    DocumentPictureClassifierOptions,
+)
+from docling.utils.utils import chunkify
+
+### Define batch size used for processing
+
+# Number of prepared elements grouped into each batch handed to the model.
+BATCH_SIZE = 4
+
+### From DocItem to the model inputs
+# The following function is responsible for taking an item and applying the required pre-processing for the model.
+# In this case we generate a cropped image from the document backend.
+
+
+def prepare_element(
+    doc: DoclingDocument,
+    backend: PyPdfiumDocumentBackend,
+    model: BaseItemAndImageEnrichmentModel,
+    element: NodeItem,
+) -> Optional[ItemAndImageEnrichmentElement]:
+    """Build the model input (item plus cropped page image) for one element.
+
+    Args:
+        doc: Document that `element` belongs to.
+        backend: PDF backend used to load the page the element is located on.
+        model: Enrichment model; provides `is_processable()`,
+            `expansion_factor` and `images_scale`.
+        element: Candidate document node.
+
+    Returns:
+        The element paired with its cropped image, or `None` when the model
+        does not process this element or the element has no provenance
+        entry to crop from.
+    """
+    if not model.is_processable(doc=doc, element=element):
+        return None
+
+    # Narrows the type for static checkers; processable elements are
+    # expected to be DocItem instances.
+    assert isinstance(element, DocItem)
+
+    # Without a provenance entry there is no page or bounding box to crop,
+    # so skip the element instead of failing on `prov[0]`.
+    if not element.prov:
+        return None
+
+    # Only the first provenance entry is used for the crop.
+    element_prov = element.prov[0]
+
+    bbox = element_prov.bbox
+    width = bbox.r - bbox.l
+    height = bbox.t - bbox.b
+
+    # Grow the box on every side by `expansion_factor` times its
+    # width/height.
+    expanded_bbox = BoundingBox(
+        l=bbox.l - width * model.expansion_factor,
+        t=bbox.t + height * model.expansion_factor,
+        r=bbox.r + width * model.expansion_factor,
+        b=bbox.b - height * model.expansion_factor,
+        coord_origin=bbox.coord_origin,
+    )
+
+    # The provenance page number is shifted by one to obtain the index
+    # passed to `load_page()`.
+    page_ix = element_prov.page_no - 1
+    page_backend = backend.load_page(page_no=page_ix)
+    cropped_image = page_backend.get_page_image(
+        scale=model.images_scale, cropbox=expanded_bbox
+    )
+    return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
+
+
+### Iterate through the document
+# This block defines `enrich_document()`, which iterates through the document
+# and batches the selected document items to run them through the model.
+
+
+def enrich_document(
+    doc: DoclingDocument,
+    backend: PyPdfiumDocumentBackend,
+    model: BaseItemAndImageEnrichmentModel,
+) -> DoclingDocument:
+    """Run an enrichment model over every processable item of a document.
+
+    Items are prepared one by one with `prepare_element()`, grouped into
+    batches of `BATCH_SIZE`, and each batch is passed to the model.
+
+    Args:
+        doc: Document whose items are iterated.
+        backend: PDF backend used to crop the item images.
+        model: Enrichment model that is called on each batch.
+
+    Returns:
+        The same `doc` object that was passed in.
+    """
+
+    def _prepare_elements() -> Iterable[ItemAndImageEnrichmentElement]:
+        # Lazily yield the model inputs; items for which
+        # `prepare_element()` returns None are skipped.
+        for doc_element, _level in doc.iterate_items():
+            prepared_element = prepare_element(
+                doc=doc, backend=backend, model=model, element=doc_element
+            )
+            if prepared_element is not None:
+                yield prepared_element
+
+    for element_batch in chunkify(_prepare_elements(), BATCH_SIZE):
+        # The yielded values are not needed here, but the model's output
+        # must be consumed completely ("Must exhaust!").
+        # NOTE(review): presumably the model attaches its results to the
+        # items while it is iterated - confirm against the model code.
+        for _ in model(doc=doc, element_batch=element_batch):
+            pass
+
+    return doc
+
+
+### Open and process
+# The `main()` function initializes the document and model objects and calls `enrich_document()`.
+
+
+def main():
+    """Enrich a stored DoclingDocument with picture classifications.
+
+    Loads the serialized document and its source PDF from the test data
+    folder, runs `DocumentPictureClassifier` through `enrich_document()`,
+    and prints the reference and annotations of the first five pictures.
+    """
+    tests_data = Path(__file__).parent / "../../tests/data"
+    pdf_path = tests_data / "pdf/2206.01062.pdf"
+    json_path = tests_data / "groundtruth/docling_v2/2206.01062.json"
+
+    # Previously converted document, stored as JSON.
+    converted_doc = DoclingDocument.load_from_json(json_path)
+
+    # The original PDF is opened again so that images can be cropped from it.
+    pdf_input = InputDocument(
+        pdf_path,
+        format=InputFormat.PDF,
+        backend=PyPdfiumDocumentBackend,
+        filename=pdf_path.name,
+    )
+
+    classifier = DocumentPictureClassifier(
+        enabled=True,
+        artifacts_path=None,
+        options=DocumentPictureClassifierOptions(),
+        accelerator_options=AcceleratorOptions(),
+    )
+
+    enriched_doc = enrich_document(
+        doc=converted_doc, backend=pdf_input._backend, model=classifier
+    )
+
+    # Show the outcome for the first few pictures only.
+    for picture in enriched_doc.pictures[:5]:
+        print(picture.self_ref)
+        pprint(picture.annotations)
+
+# Run the example only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -107,6 +107,7 @@ nav:
    - ✨ Enrichment development:
      - "Figure enrichment": examples/develop_picture_enrichment.py
      - "Formula enrichment": examples/develop_formula_understanding.py
+      - "Enrich a DoclingDocument": examples/enrich_doclingdocument.py
    - 🗂️ More examples:
      - examples/rag_milvus.ipynb
      - examples/rag_weaviate.ipynb