From 90a7cc4bdda7272cd87d6f4ab3c0b7966f6e9c73 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Tue, 22 Jul 2025 16:20:15 +0200 Subject: [PATCH] docs: enrich existing DoclingDocument (#1969) add example for enriching an existing doclingdocument Signed-off-by: Michele Dolfi --- docs/examples/enrich_doclingdocument.py | 132 ++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 133 insertions(+) create mode 100644 docs/examples/enrich_doclingdocument.py diff --git a/docs/examples/enrich_doclingdocument.py b/docs/examples/enrich_doclingdocument.py new file mode 100644 index 00000000..3ccc3f10 --- /dev/null +++ b/docs/examples/enrich_doclingdocument.py @@ -0,0 +1,132 @@ +## Enrich DoclingDocument +# This example shows how to run Docling enrichment models on documents that have already been converted +# and stored as serialized DoclingDocument JSON files. + +### Load modules + +from pathlib import Path +from typing import Iterable, Optional + +from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem +from rich.pretty import pprint + +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.datamodel.accelerator_options import AcceleratorOptions +from docling.datamodel.base_models import InputFormat, ItemAndImageEnrichmentElement +from docling.datamodel.document import InputDocument +from docling.models.base_model import BaseItemAndImageEnrichmentModel +from docling.models.document_picture_classifier import ( + DocumentPictureClassifier, + DocumentPictureClassifierOptions, +) +from docling.utils.utils import chunkify + +### Define batch size used for processing + +BATCH_SIZE = 4 + +### From DocItem to the model inputs +# The following function is responsible for taking an item and applying the required pre-processing for the model. +# In this case we generate a cropped image from the document backend. 
+
+
+def prepare_element(
+    doc: DoclingDocument,
+    backend: PyPdfiumDocumentBackend,
+    model: BaseItemAndImageEnrichmentModel,
+    element: NodeItem,
+) -> Optional[ItemAndImageEnrichmentElement]:
+    """Turn one document node into the input of an item-and-image enrichment model.
+
+    Args:
+        doc: Document the element belongs to; forwarded to ``model.is_processable``.
+        backend: PDF backend used to load the page the element is located on.
+        model: Enrichment model; provides ``is_processable``, ``expansion_factor``
+            and ``images_scale``.
+        element: Node to prepare.
+
+    Returns:
+        The element paired with a cropped page image, or ``None`` when the model
+        does not process this element or the element has no provenance entry.
+    """
+    if not model.is_processable(doc=doc, element=element):
+        return None
+
+    assert isinstance(element, DocItem)
+    # Without provenance there is no page/bbox to crop from, so skip the item
+    # instead of failing with an IndexError.
+    if not element.prov:
+        return None
+    # Only the first provenance entry is used for cropping.
+    element_prov = element.prov[0]
+
+    bbox = element_prov.bbox
+    width = bbox.r - bbox.l
+    # NOTE(review): height is signed. With a top-left origin (t < b) it is negative,
+    # and the +/- below then still grow the box - confirm against BoundingBox conventions.
+    height = bbox.t - bbox.b
+
+    # Grow the box on every side by a fraction of its own size.
+    expanded_bbox = BoundingBox(
+        l=bbox.l - width * model.expansion_factor,
+        t=bbox.t + height * model.expansion_factor,
+        r=bbox.r + width * model.expansion_factor,
+        b=bbox.b - height * model.expansion_factor,
+        coord_origin=bbox.coord_origin,
+    )
+
+    # The backend is addressed with page_no - 1 (provenance pages are presumably 1-based).
+    page_ix = element_prov.page_no - 1
+    page_backend = backend.load_page(page_no=page_ix)
+    try:
+        cropped_image = page_backend.get_page_image(
+            scale=model.images_scale, cropbox=expanded_bbox
+        )
+    finally:
+        # A page backend is loaded per element; release it once the crop is rendered.
+        page_backend.unload()
+    return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
+
+
+### Iterate through the document
+# This block defines `enrich_document()`, which iterates through the document
+# and batches the selected document items for running through the model.
+
+
+def enrich_document(
+    doc: DoclingDocument,
+    backend: PyPdfiumDocumentBackend,
+    model: BaseItemAndImageEnrichmentModel,
+) -> DoclingDocument:
+    """Run ``model`` over every item of ``doc`` that it can process.
+
+    Items are prepared lazily with ``prepare_element()``, grouped into batches of
+    ``BATCH_SIZE`` and handed to the model batch by batch.
+
+    Args:
+        doc: Document to enrich.
+        backend: PDF backend of the source file, used to render the item crops.
+        model: Enrichment model to apply.
+
+    Returns:
+        The same ``doc`` object that was passed in. The values yielded by the model
+        are discarded here, so any enrichment is expected to be applied in place.
+    """
+
+    def _prepare_elements() -> Iterable[ItemAndImageEnrichmentElement]:
+        # Yield only the items for which prepare_element() produced a model input.
+        for doc_element, _level in doc.iterate_items():
+            prepared_element = prepare_element(
+                doc=doc, backend=backend, model=model, element=doc_element
+            )
+            if prepared_element is not None:
+                yield prepared_element
+
+    for element_batch in chunkify(_prepare_elements(), BATCH_SIZE):
+        # The model output must be consumed for the batch to be processed.
+        for _ in model(doc=doc, element_batch=element_batch):  # Must exhaust!
+            pass
+
+    return doc
+
+
+### Open and process
+# The `main()` function initializes the document and model objects and calls `enrich_document()`.
+
+
+def main():
+    """Enrich a stored DoclingDocument with picture classification and print a sample.
+
+    Loads the serialized document and its source PDF from the repository test data,
+    runs ``DocumentPictureClassifier`` through ``enrich_document()`` and prints the
+    reference and annotations of the first five pictures.
+
+    Raises:
+        RuntimeError: If the source PDF could not be opened by the backend.
+    """
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_pdf_path = data_folder / "pdf/2206.01062.pdf"
+
+    input_doc_path = data_folder / "groundtruth/docling_v2/2206.01062.json"
+
+    doc = DoclingDocument.load_from_json(input_doc_path)
+
+    in_pdf_doc = InputDocument(
+        input_pdf_path,
+        format=InputFormat.PDF,
+        backend=PyPdfiumDocumentBackend,
+        filename=input_pdf_path.name,
+    )
+    # Fail with a clear message instead of an obscure error on the backend attribute.
+    if not in_pdf_doc.valid:
+        raise RuntimeError(f"Could not open the source PDF: {input_pdf_path}")
+    backend = in_pdf_doc._backend
+
+    model = DocumentPictureClassifier(
+        enabled=True,
+        artifacts_path=None,
+        options=DocumentPictureClassifierOptions(),
+        accelerator_options=AcceleratorOptions(),
+    )
+
+    try:
+        doc = enrich_document(doc=doc, backend=backend, model=model)
+    finally:
+        # Release the PDF document held by the backend, even if enrichment fails.
+        backend.unload()
+
+    for pic in doc.pictures[:5]:
+        print(pic.self_ref)
+        pprint(pic.annotations)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mkdocs.yml b/mkdocs.yml
index 8f61e2a3..3e8b9449 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -107,6 +107,7 @@ nav:
     - ✨ Enrichment development:
       - "Figure enrichment": examples/develop_picture_enrichment.py
       - "Formula enrichment": examples/develop_formula_understanding.py
+      - "Enrich a DoclingDocument": examples/enrich_doclingdocument.py
     - 🗂️ More examples:
       - examples/rag_milvus.ipynb
       - examples/rag_weaviate.ipynb