diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 473ef980..50838eb3 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -64,8 +64,6 @@ class PipelineOptions(BaseModel): True # This defautl will be set to False on a future version of docling ) - do_dummy_picture_classifer: bool = False - class PdfPipelineOptions(PipelineOptions): artifacts_path: Optional[Union[Path, str]] = None @@ -77,4 +75,7 @@ class PdfPipelineOptions(PipelineOptions): Field(EasyOcrOptions(), discriminator="kind") ) - images_scale: Optional[float] = None # if set, the scale for generated images + images_scale: float = 1.0 + generate_page_images: bool = False + generate_picture_images: bool = False + generate_table_images: bool = False diff --git a/docling/models/dummy_picture_enrichment.py b/docling/models/dummy_picture_enrichment.py deleted file mode 100644 index d227b31f..00000000 --- a/docling/models/dummy_picture_enrichment.py +++ /dev/null @@ -1,31 +0,0 @@ -from typing import Any, Iterable - -from docling_core.types.doc import DoclingDocument, NodeItem -from docling_core.types.doc.document import PictureClassificationData, PictureItem - -from docling.models.base_model import BaseEnrichmentModel - - -class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel): - - def __init__(self, enabled: bool): - self.enabled = enabled - - def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: - return self.enabled and isinstance(element, PictureItem) - - def __call__( - self, doc: DoclingDocument, element_batch: Iterable[NodeItem] - ) -> Iterable[Any]: - if not self.enabled: - return - - for element in element_batch: - assert isinstance(element, PictureItem) - element.data.classification = PictureClassificationData( - provenance="dummy_classifier-0.0.1", - predicted_class="dummy", - confidence=0.42, - ) - - yield element diff --git a/docling/pipeline/standard_pdf_pipeline.py 
b/docling/pipeline/standard_pdf_pipeline.py index 6d72884d..6182ec4e 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -2,6 +2,8 @@ import logging from pathlib import Path from typing import Optional +from docling_core.types.doc.document import DocItem, ImageRef, PictureItem, TableItem + from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import AssembledUnit, Page @@ -14,9 +16,6 @@ from docling.datamodel.pipeline_options import ( ) from docling.models.base_ocr_model import BaseOcrModel from docling.models.ds_glm_model import GlmModel, GlmOptions -from docling.models.dummy_picture_enrichment import ( - DummyPictureClassifierEnrichmentModel, -) from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions @@ -45,6 +44,12 @@ class StandardPdfPipeline(PaginatedPipeline): else: self.artifacts_path = Path(pipeline_options.artifacts_path) + keep_images = ( + self.pipeline_options.generate_page_images + or self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ) + self.glm_model = GlmModel(options=GlmOptions()) if (ocr_model := self.get_ocr_model()) is None: @@ -74,19 +79,11 @@ class StandardPdfPipeline(PaginatedPipeline): options=pipeline_options.table_structure_options, ), # Page assemble - PageAssembleModel( - options=PageAssembleOptions( - keep_images=pipeline_options.images_scale is not None - ) - ), + PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)), ] self.enrichment_pipe = [ # Other models working on `NodeItem` elements in the DoclingDocument - # TODO Question: should we use the enabled flag or simply not add the model in the list? 
- DummyPictureClassifierEnrichmentModel( - enabled=pipeline_options.do_dummy_picture_classifer - ) ] @staticmethod @@ -150,6 +147,45 @@ class StandardPdfPipeline(PaginatedPipeline): conv_res.document = self.glm_model(conv_res) + # Generate page images in the output + if self.pipeline_options.generate_page_images: + for page in conv_res.pages: + assert page.image is not None + page_ix = page.page_no - 1 + conv_res.document.pages[page_ix].image = ImageRef.from_pil( + page.image, dpi=int(72 * self.pipeline_options.images_scale) + ) + + # Generate images of the requested element types + if ( + self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ): + scale = self.pipeline_options.images_scale + for element, _level in conv_res.document.iterate_items(): + if not isinstance(element, DocItem) or len(element.prov) == 0: + continue + if ( + isinstance(element, PictureItem) + and self.pipeline_options.generate_picture_images + ) or ( + isinstance(element, TableItem) + and self.pipeline_options.generate_table_images + ): + page_ix = element.prov[0].page_no - 1 + crop_bbox = ( + element.prov[0] + .bbox.scaled(scale=scale) + .to_top_left_origin( + page_height=conv_res.pages[page_ix].size.height * scale + ) + ) + + cropped_im = conv_res.pages[page_ix].image.crop( + crop_bbox.as_tuple() + ) + element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale)) + return conv_res @classmethod diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index 67d58a6b..6eb09a13 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -58,7 +58,6 @@ def main(): pipeline_options.do_ocr = False pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - pipeline_options.do_dummy_picture_classifer = True doc_converter = DocumentConverter( format_options={ diff --git a/docs/examples/develop_picture_enrichment.py 
"""Example: developing a picture enrichment model for the PDF pipeline.

Defines a placeholder picture "classifier", plugs it into a subclass of
``StandardPdfPipeline`` through ``enrichment_pipe`` and runs the resulting
pipeline on a sample PDF.
"""

import logging
from pathlib import Path
from typing import Any, Iterable

from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc.document import PictureClassificationData, PictureItem

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
    """PDF pipeline options extended with a switch for the example classifier."""

    # Passed as ``enabled`` to ExamplePictureClassifierEnrichmentModel by
    # ExamplePictureClassifierPipeline. The spelling ("classifer") is kept
    # as-is because it is the public option name.
    do_picture_classifer: bool = True


class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
    """Placeholder enrichment model that labels every picture identically.

    It performs no real inference: each processed ``PictureItem`` receives the
    same hard-coded ``PictureClassificationData``.
    """

    def __init__(self, enabled: bool):
        """Remember whether the model should process anything at all.

        Args:
            enabled: When False, ``is_processable`` rejects every element and
                ``__call__`` yields nothing.
        """
        self.enabled = enabled

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Return True only if the model is enabled and *element* is a picture."""
        return self.enabled and isinstance(element, PictureItem)

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Attach the placeholder classification to each element of the batch.

        Args:
            doc: Document the elements belong to (unused by this example).
            element_batch: Elements to enrich; each must be a ``PictureItem``.

        Yields:
            Each element, after its ``data.classification`` has been set.
        """
        if not self.enabled:
            return

        for element in element_batch:
            assert isinstance(element, PictureItem)

            # Uncomment this to interactively visualize the image. It is left
            # disabled so a plain run of the example does not call ``show()``
            # once per picture; it also needs ``element.image`` to be set,
            # which the pipeline does when picture image generation is on.
            # element.image.pil_image.show()

            element.data.classification = PictureClassificationData(
                provenance="example_classifier-0.0.1",
                predicted_class="dummy",
                confidence=0.42,
            )

            yield element


class ExamplePictureClassifierPipeline(StandardPdfPipeline):
    """Standard PDF pipeline whose enrichment stage runs the example classifier."""

    def __init__(self, pipeline_options: ExamplePictureClassifierPipelineOptions):
        """Build the standard pipeline, then install the example enrichment model.

        Args:
            pipeline_options: Options for the pipeline; ``do_picture_classifer``
                decides whether the example model is enabled.
        """
        super().__init__(pipeline_options)
        # Annotation only (no assignment): narrows the attribute to the options
        # subclass for type checkers. It must name the options class, not the
        # pipeline class.
        self.pipeline_options: ExamplePictureClassifierPipelineOptions

        # Overwrites whatever enrichment models the parent constructor set up.
        self.enrichment_pipe = [
            ExamplePictureClassifierEnrichmentModel(
                enabled=pipeline_options.do_picture_classifer
            )
        ]

    @classmethod
    def get_default_options(cls) -> ExamplePictureClassifierPipelineOptions:
        """Return a fresh options object with all defaults."""
        return ExamplePictureClassifierPipelineOptions()


def main() -> None:
    """Convert the sample PDF with the example pipeline and print picture data."""
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")

    pipeline_options = ExamplePictureClassifierPipelineOptions()
    # Ask the pipeline to generate picture images, at twice the base scale.
    pipeline_options.images_scale = 2.0
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ExamplePictureClassifierPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = doc_converter.convert(input_doc_path)

    for element, _level in result.document.iterate_items():
        if isinstance(element, PictureItem):
            print(
                f"The model populated the `data` portion of picture {element.self_ref}:\n{element.data}"
            )


if __name__ == "__main__":
    main()