add options to generate images

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-15 17:09:54 +02:00
parent f49d7881d0
commit 1cb11be06f
5 changed files with 144 additions and 47 deletions

View File

@ -64,8 +64,6 @@ class PipelineOptions(BaseModel):
True # This default will be set to False on a future version of docling True # This default will be set to False on a future version of docling
) )
do_dummy_picture_classifer: bool = False
class PdfPipelineOptions(PipelineOptions): class PdfPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None artifacts_path: Optional[Union[Path, str]] = None
@ -77,4 +75,7 @@ class PdfPipelineOptions(PipelineOptions):
Field(EasyOcrOptions(), discriminator="kind") Field(EasyOcrOptions(), discriminator="kind")
) )
images_scale: Optional[float] = None # if set, the scale for generated images images_scale: float = 1.0
generate_page_images: bool = False
generate_picture_images: bool = False
generate_table_images: bool = False

View File

@ -1,31 +0,0 @@
from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc.document import PictureClassificationData, PictureItem
from docling.models.base_model import BaseEnrichmentModel
class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel):
    """Placeholder enrichment model that attaches a fixed classification to pictures."""

    def __init__(self, enabled: bool):
        # Single switch consulted by both `is_processable` and `__call__`.
        self.enabled = enabled

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Tell whether `element` should be handed to this model.

        Only picture items qualify, and only while the model is enabled.
        """
        is_picture = isinstance(element, PictureItem)
        return self.enabled and is_picture

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Attach the dummy classification to every element and yield it back.

        Nothing is yielded when the model is disabled.
        """
        if self.enabled:
            for picture in element_batch:
                assert isinstance(picture, PictureItem)
                dummy_result = PictureClassificationData(
                    provenance="dummy_classifier-0.0.1",
                    predicted_class="dummy",
                    confidence=0.42,
                )
                picture.data.classification = dummy_result
                yield picture

View File

@ -2,6 +2,8 @@ import logging
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from docling_core.types.doc.document import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page from docling.datamodel.base_models import AssembledUnit, Page
@ -14,9 +16,6 @@ from docling.datamodel.pipeline_options import (
) )
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.dummy_picture_enrichment import (
DummyPictureClassifierEnrichmentModel,
)
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
@ -45,6 +44,12 @@ class StandardPdfPipeline(PaginatedPipeline):
else: else:
self.artifacts_path = Path(pipeline_options.artifacts_path) self.artifacts_path = Path(pipeline_options.artifacts_path)
keep_images = (
self.pipeline_options.generate_page_images
or self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
)
self.glm_model = GlmModel(options=GlmOptions()) self.glm_model = GlmModel(options=GlmOptions())
if (ocr_model := self.get_ocr_model()) is None: if (ocr_model := self.get_ocr_model()) is None:
@ -74,19 +79,11 @@ class StandardPdfPipeline(PaginatedPipeline):
options=pipeline_options.table_structure_options, options=pipeline_options.table_structure_options,
), ),
# Page assemble # Page assemble
PageAssembleModel( PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
options=PageAssembleOptions(
keep_images=pipeline_options.images_scale is not None
)
),
] ]
self.enrichment_pipe = [ self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument # Other models working on `NodeItem` elements in the DoclingDocument
# TODO Question: should we use the enabled flag or simply not add the model in the list?
DummyPictureClassifierEnrichmentModel(
enabled=pipeline_options.do_dummy_picture_classifer
)
] ]
@staticmethod @staticmethod
@ -150,6 +147,45 @@ class StandardPdfPipeline(PaginatedPipeline):
conv_res.document = self.glm_model(conv_res) conv_res.document = self.glm_model(conv_res)
# Generate page images in the output
if self.pipeline_options.generate_page_images:
for page in conv_res.pages:
assert page.image is not None
page_ix = page.page_no - 1
conv_res.document.pages[page_ix].image = ImageRef.from_pil(
page.image, dpi=int(72 * self.pipeline_options.images_scale)
)
# Generate images of the requested element types
if (
self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
):
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, DocItem) or len(element.prov) == 0:
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
) or (
isinstance(element, TableItem)
and self.pipeline_options.generate_table_images
):
page_ix = element.prov[0].page_no - 1
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(
page_height=conv_res.pages[page_ix].size.height * scale
)
)
cropped_im = conv_res.pages[page_ix].image.crop(
crop_bbox.as_tuple()
)
element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
return conv_res return conv_res
@classmethod @classmethod

View File

@ -58,7 +58,6 @@ def main():
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.do_dummy_picture_classifer = True
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
format_options={ format_options={

View File

@ -0,0 +1,92 @@
import logging
from pathlib import Path
from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc.document import PictureClassificationData, PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
# Options for the example pipeline: the regular PDF pipeline options plus one
# switch controlling the example picture classifier.
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
    # Forwarded as `enabled` to ExamplePictureClassifierEnrichmentModel by the
    # example pipeline. The spelling ("classifer") is part of the public field name.
    do_picture_classifer: bool = True
class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
    """Example enrichment model that tags every picture with a fixed classification.

    It shows the shape of an enrichment model: `is_processable` selects the
    elements of interest and `__call__` enriches a batch of them.
    """

    def __init__(self, enabled: bool):
        # Single switch consulted by both `is_processable` and `__call__`.
        self.enabled = enabled

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Return a truthy value only for picture items while the model is enabled."""
        return self.enabled and isinstance(element, PictureItem)

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Set a placeholder classification on each picture and yield it back.

        Nothing is yielded when the model is disabled.
        """
        if not self.enabled:
            return

        for element in element_batch:
            assert isinstance(element, PictureItem)

            # uncomment this to interactively visualize the image
            # (needs `generate_picture_images=True`, otherwise `element.image`
            # is not populated)
            # element.image.pil_image.show()

            element.data.classification = PictureClassificationData(
                provenance="example_classifier-0.0.1",
                predicted_class="dummy",
                confidence=0.42,
            )

            yield element
class ExamplePictureClassifierPipeline(StandardPdfPipeline):
    """Standard PDF pipeline extended with the example picture classifier."""

    def __init__(self, pipeline_options: ExamplePictureClassifierPipelineOptions):
        """Build the standard pipeline, then install the example enrichment model.

        Args:
            pipeline_options: Options for the standard PDF pipeline plus the
                `do_picture_classifer` switch for the example model.
        """
        super().__init__(pipeline_options)
        # Narrow the attribute's declared type to the options class (not the
        # pipeline class) so static checkers know about `do_picture_classifer`.
        self.pipeline_options: ExamplePictureClassifierPipelineOptions

        # Replace the enrichment models with the example classifier only.
        self.enrichment_pipe = [
            ExamplePictureClassifierEnrichmentModel(
                enabled=pipeline_options.do_picture_classifer
            )
        ]

    @classmethod
    def get_default_options(cls) -> ExamplePictureClassifierPipelineOptions:
        """Return a default-constructed options object for this pipeline."""
        return ExamplePictureClassifierPipelineOptions()
def main():
    """Convert the sample PDF with the example pipeline and print each picture's data."""
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")

    # Ask the pipeline to produce picture crops, rendered at twice the base scale.
    pipeline_options = ExamplePictureClassifierPipelineOptions()
    pipeline_options.generate_picture_images = True
    pipeline_options.images_scale = 2.0

    # Route PDF inputs through the example pipeline with the options above.
    pdf_format_option = PdfFormatOption(
        pipeline_cls=ExamplePictureClassifierPipeline,
        pipeline_options=pipeline_options,
    )
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option}
    )
    result = doc_converter.convert(input_doc_path)

    # Report what the enrichment model attached to every picture item.
    for element, _level in result.document.iterate_items():
        if not isinstance(element, PictureItem):
            continue
        print(
            f"The model populated the `data` portion of picture {element.self_ref}:\n{element.data}"
        )
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()