mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-23 18:45:00 +00:00
docs: enrich existing DoclingDocument (#1969)
add example for enriching an existing doclingdocument Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
a069b1175b
commit
90a7cc4bdd
132
docs/examples/enrich_doclingdocument.py
vendored
Normal file
132
docs/examples/enrich_doclingdocument.py
vendored
Normal file
@ -0,0 +1,132 @@
|
||||
## Enrich DoclingDocument
|
||||
# This example shows how to run Docling enrichment models on documents which have already been converted
|
||||
# and stored as serialized DoclingDocument JSON files.
|
||||
|
||||
### Load modules
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional
|
||||
|
||||
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
||||
from rich.pretty import pprint
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||
from docling.datamodel.base_models import InputFormat, ItemAndImageEnrichmentElement
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
||||
from docling.models.document_picture_classifier import (
|
||||
DocumentPictureClassifier,
|
||||
DocumentPictureClassifierOptions,
|
||||
)
|
||||
from docling.utils.utils import chunkify
|
||||
|
||||
### Define batch size used for processing
|
||||
|
||||
# Chunk size passed to `chunkify()` in `enrich_document()`; presumably the
# number of prepared elements handed to the model per call.
BATCH_SIZE = 4
|
||||
|
||||
### From DocItem to the model inputs
|
||||
# The following function is responsible for taking an item and applying the required pre-processing for the model.
|
||||
# In this case we generate a cropped image from the document backend.
|
||||
|
||||
|
||||
def prepare_element(
    doc: DoclingDocument,
    backend: PyPdfiumDocumentBackend,
    model: BaseItemAndImageEnrichmentModel,
    element: NodeItem,
) -> Optional[ItemAndImageEnrichmentElement]:
    """Build the model input for a single document item.

    The bounding box of the item's first provenance entry is enlarged on
    every side by ``model.expansion_factor`` (relative to the box width and
    height) and that region is cropped from the page image rendered by
    ``backend`` at ``model.images_scale``.

    Args:
        doc: Document that contains ``element``.
        backend: PDF document backend used to load and render the page.
        model: Enrichment model; provides ``is_processable``,
            ``expansion_factor`` and ``images_scale``.
        element: Candidate item taken from ``doc``.

    Returns:
        The item paired with its cropped image, or ``None`` when the model
        does not process this item or the item has no provenance to locate
        it on a page.
    """
    if not model.is_processable(doc=doc, element=element):
        return None

    # Narrows the static type; items accepted by the model are expected to
    # be DocItem instances.
    assert isinstance(element, DocItem)

    # An item without provenance has no page or bounding box to crop from,
    # so skip it instead of failing with an IndexError on prov[0].
    if not element.prov:
        return None
    element_prov = element.prov[0]

    bbox = element_prov.bbox
    width = bbox.r - bbox.l
    height = bbox.t - bbox.b

    # Grow the box symmetrically; the coordinate origin is carried over
    # unchanged from the original box.
    expanded_bbox = BoundingBox(
        l=bbox.l - width * model.expansion_factor,
        t=bbox.t + height * model.expansion_factor,
        r=bbox.r + width * model.expansion_factor,
        b=bbox.b - height * model.expansion_factor,
        coord_origin=bbox.coord_origin,
    )

    # The provenance page number is shifted by one to obtain the index
    # passed to the backend (presumably 1-based -> 0-based).
    page_ix = element_prov.page_no - 1
    page_backend = backend.load_page(page_no=page_ix)
    cropped_image = page_backend.get_page_image(
        scale=model.images_scale, cropbox=expanded_bbox
    )
    return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
|
||||
|
||||
|
||||
### Iterate through the document
|
||||
# This block defines `enrich_document()`, which is responsible for iterating through the document
|
||||
# and batching the selected document items to run them through the model.
|
||||
|
||||
|
||||
def enrich_document(
    doc: DoclingDocument,
    backend: PyPdfiumDocumentBackend,
    model: BaseItemAndImageEnrichmentModel,
) -> DoclingDocument:
    """Run an enrichment model over every processable item of a document.

    Items are visited via ``doc.iterate_items()``, converted to model inputs
    with ``prepare_element()``, grouped into chunks of ``BATCH_SIZE`` and
    passed to ``model``.

    Args:
        doc: Document to enrich.
        backend: PDF document backend used to render the item crops.
        model: Enrichment model that is called on each batch.

    Returns:
        The same ``doc`` object that was passed in. The enrichment results
        are presumably attached to its items by the model.
    """

    def _prepare_elements(
        doc: DoclingDocument,
        backend: PyPdfiumDocumentBackend,
        model: BaseItemAndImageEnrichmentModel,
    ) -> Iterable[ItemAndImageEnrichmentElement]:
        # Lazily yield the prepared inputs, dropping items for which
        # prepare_element() returns None.
        for doc_element, _level in doc.iterate_items():
            prepared_element = prepare_element(
                doc=doc, backend=backend, model=model, element=doc_element
            )
            if prepared_element is not None:
                yield prepared_element

    for element_batch in chunkify(
        _prepare_elements(doc, backend, model),
        BATCH_SIZE,
    ):
        # The model call returns an iterable; it must be consumed fully,
        # even though the yielded values are not needed here.
        for _ in model(doc=doc, element_batch=element_batch):  # Must exhaust!
            pass

    return doc
|
||||
|
||||
|
||||
### Open and process
|
||||
# The `main()` function initializes the document and model objects and then calls `enrich_document()`.
|
||||
|
||||
|
||||
def main():
    """Enrich a stored DoclingDocument with picture classification.

    Loads the serialized document and its source PDF from the repository
    test data, runs ``DocumentPictureClassifier`` through
    ``enrich_document()`` and prints the reference and annotations of the
    first five pictures.
    """
    tests_data = Path(__file__).parent / "../../tests/data"
    pdf_path = tests_data / "pdf/2206.01062.pdf"
    json_path = tests_data / "groundtruth/docling_v2/2206.01062.json"

    doc = DoclingDocument.load_from_json(json_path)

    # The InputDocument is created only to obtain a PDF backend for
    # rendering page crops.
    source_pdf = InputDocument(
        pdf_path,
        format=InputFormat.PDF,
        backend=PyPdfiumDocumentBackend,
        filename=pdf_path.name,
    )
    pdf_backend = source_pdf._backend

    classifier = DocumentPictureClassifier(
        enabled=True,
        artifacts_path=None,
        options=DocumentPictureClassifierOptions(),
        accelerator_options=AcceleratorOptions(),
    )

    doc = enrich_document(doc=doc, backend=pdf_backend, model=classifier)

    # Show a small sample of the results.
    for picture in doc.pictures[:5]:
        print(picture.self_ref)
        pprint(picture.annotations)
|
||||
|
||||
|
||||
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
@ -107,6 +107,7 @@ nav:
|
||||
- ✨ Enrichment development:
|
||||
- "Figure enrichment": examples/develop_picture_enrichment.py
|
||||
- "Formula enrichment": examples/develop_formula_understanding.py
|
||||
- "Enrich a DoclingDocument": examples/enrich_doclingdocument.py
|
||||
- 🗂️ More examples:
|
||||
- examples/rag_milvus.ipynb
|
||||
- examples/rag_weaviate.ipynb
|
||||
|
Loading…
Reference in New Issue
Block a user