mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
add options to generate images
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
f49d7881d0
commit
1cb11be06f
@ -64,8 +64,6 @@ class PipelineOptions(BaseModel):
|
||||
True # This default will be set to False in a future version of docling
|
||||
)
|
||||
|
||||
do_dummy_picture_classifer: bool = False
|
||||
|
||||
|
||||
class PdfPipelineOptions(PipelineOptions):
|
||||
artifacts_path: Optional[Union[Path, str]] = None
|
||||
@ -77,4 +75,7 @@ class PdfPipelineOptions(PipelineOptions):
|
||||
Field(EasyOcrOptions(), discriminator="kind")
|
||||
)
|
||||
|
||||
images_scale: Optional[float] = None # if set, the scale for generated images
|
||||
images_scale: float = 1.0
|
||||
generate_page_images: bool = False
|
||||
generate_picture_images: bool = False
|
||||
generate_table_images: bool = False
|
||||
|
@ -1,31 +0,0 @@
|
||||
from typing import Any, Iterable
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||
from docling_core.types.doc.document import PictureClassificationData, PictureItem
|
||||
|
||||
from docling.models.base_model import BaseEnrichmentModel
|
||||
|
||||
|
||||
class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel):
    """Placeholder enrichment model that stamps a fixed classification on pictures."""

    def __init__(self, enabled: bool):
        # Single switch consulted by both `is_processable` and `__call__`.
        self.enabled = enabled

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Tell whether `element` should be handled: enabled and a PictureItem."""
        return self.enabled and isinstance(element, PictureItem)

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Yield each batch element after attaching the dummy classification.

        Nothing is yielded when the model is disabled.
        """
        if self.enabled:
            for picture in element_batch:
                assert isinstance(picture, PictureItem)
                dummy_result = PictureClassificationData(
                    provenance="dummy_classifier-0.0.1",
                    predicted_class="dummy",
                    confidence=0.42,
                )
                picture.data.classification = dummy_result
                yield picture
|
@ -2,6 +2,8 @@ import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from docling_core.types.doc.document import DocItem, ImageRef, PictureItem, TableItem
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import AssembledUnit, Page
|
||||
@ -14,9 +16,6 @@ from docling.datamodel.pipeline_options import (
|
||||
)
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
from docling.models.dummy_picture_enrichment import (
|
||||
DummyPictureClassifierEnrichmentModel,
|
||||
)
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||
@ -45,6 +44,12 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
else:
|
||||
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
||||
|
||||
keep_images = (
|
||||
self.pipeline_options.generate_page_images
|
||||
or self.pipeline_options.generate_picture_images
|
||||
or self.pipeline_options.generate_table_images
|
||||
)
|
||||
|
||||
self.glm_model = GlmModel(options=GlmOptions())
|
||||
|
||||
if (ocr_model := self.get_ocr_model()) is None:
|
||||
@ -74,19 +79,11 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
options=pipeline_options.table_structure_options,
|
||||
),
|
||||
# Page assemble
|
||||
PageAssembleModel(
|
||||
options=PageAssembleOptions(
|
||||
keep_images=pipeline_options.images_scale is not None
|
||||
)
|
||||
),
|
||||
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
|
||||
]
|
||||
|
||||
self.enrichment_pipe = [
|
||||
# Other models working on `NodeItem` elements in the DoclingDocument
|
||||
# TODO Question: should we use the enabled flag or simply not add the model in the list?
|
||||
DummyPictureClassifierEnrichmentModel(
|
||||
enabled=pipeline_options.do_dummy_picture_classifer
|
||||
)
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
@ -150,6 +147,45 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
|
||||
conv_res.document = self.glm_model(conv_res)
|
||||
|
||||
# Generate page images in the output
|
||||
if self.pipeline_options.generate_page_images:
|
||||
for page in conv_res.pages:
|
||||
assert page.image is not None
|
||||
page_ix = page.page_no - 1
|
||||
conv_res.document.pages[page_ix].image = ImageRef.from_pil(
|
||||
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
||||
)
|
||||
|
||||
# Generate images of the requested element types
|
||||
if (
|
||||
self.pipeline_options.generate_picture_images
|
||||
or self.pipeline_options.generate_table_images
|
||||
):
|
||||
scale = self.pipeline_options.images_scale
|
||||
for element, _level in conv_res.document.iterate_items():
|
||||
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
||||
continue
|
||||
if (
|
||||
isinstance(element, PictureItem)
|
||||
and self.pipeline_options.generate_picture_images
|
||||
) or (
|
||||
isinstance(element, TableItem)
|
||||
and self.pipeline_options.generate_table_images
|
||||
):
|
||||
page_ix = element.prov[0].page_no - 1
|
||||
crop_bbox = (
|
||||
element.prov[0]
|
||||
.bbox.scaled(scale=scale)
|
||||
.to_top_left_origin(
|
||||
page_height=conv_res.pages[page_ix].size.height * scale
|
||||
)
|
||||
)
|
||||
|
||||
cropped_im = conv_res.pages[page_ix].image.crop(
|
||||
crop_bbox.as_tuple()
|
||||
)
|
||||
element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
|
||||
|
||||
return conv_res
|
||||
|
||||
@classmethod
|
||||
|
@ -58,7 +58,6 @@ def main():
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
pipeline_options.do_dummy_picture_classifer = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
|
92
docs/examples/develop_picture_enrichment.py
Normal file
92
docs/examples/develop_picture_enrichment.py
Normal file
@ -0,0 +1,92 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||
from docling_core.types.doc.document import PictureClassificationData, PictureItem
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.models.base_model import BaseEnrichmentModel
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
|
||||
|
||||
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
    """PDF pipeline options extended with a switch for the example classifier."""

    # Toggle for the example picture classifier; on by default.
    do_picture_classifer: bool = True
|
||||
|
||||
|
||||
class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
    """Example enrichment model that attaches a fixed classification to pictures.

    It shows the shape of an enrichment model: `is_processable` selects the
    elements to handle and `__call__` processes a batch of them. The
    classification written here is a hard-coded placeholder, not a prediction.
    """

    def __init__(self, enabled: bool):
        """Store the flag that switches the model on or off."""
        self.enabled = enabled

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Return True when the model is enabled and `element` is a PictureItem."""
        return self.enabled and isinstance(element, PictureItem)

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Annotate every picture in `element_batch` and yield it back.

        Yields nothing when the model is disabled.
        """
        if not self.enabled:
            return

        for element in element_batch:
            assert isinstance(element, PictureItem)

            # uncomment this to interactively visualize the image
            # (presumably requires `generate_picture_images` to be enabled so
            # that `element.image` is populated; verify against the pipeline)
            # element.image.pil_image.show()

            element.data.classification = PictureClassificationData(
                provenance="example_classifier-0.0.1",
                predicted_class="dummy",
                confidence=0.42,
            )

            yield element
|
||||
|
||||
|
||||
class ExamplePictureClassifierPipeline(StandardPdfPipeline):
    """StandardPdfPipeline whose enrichment pipe runs the example classifier."""

    def __init__(self, pipeline_options: ExamplePictureClassifierPipelineOptions):
        """Build the standard pipeline, then install the example enrichment model.

        Args:
            pipeline_options: Options for this pipeline; `do_picture_classifer`
                is forwarded as the enrichment model's `enabled` flag.
        """
        super().__init__(pipeline_options)
        # Declaration only (no assignment): narrows the attribute's type for
        # type checkers to this pipeline's options class.
        self.pipeline_options: ExamplePictureClassifierPipelineOptions

        # Replaces whatever enrichment pipe the parent constructor set up.
        self.enrichment_pipe = [
            ExamplePictureClassifierEnrichmentModel(
                enabled=pipeline_options.do_picture_classifer
            )
        ]

    @classmethod
    def get_default_options(cls) -> ExamplePictureClassifierPipelineOptions:
        """Return a default-constructed options object for this pipeline."""
        return ExamplePictureClassifierPipelineOptions()
|
||||
|
||||
|
||||
def main():
    """Convert a sample PDF with the example pipeline and print picture data."""
    logging.basicConfig(level=logging.INFO)

    source_pdf = Path("./tests/data/2206.01062.pdf")

    # Picture images are generated at 2x scale.
    options = ExamplePictureClassifierPipelineOptions()
    options.images_scale = 2.0
    options.generate_picture_images = True

    pdf_format_option = PdfFormatOption(
        pipeline_cls=ExamplePictureClassifierPipeline,
        pipeline_options=options,
    )
    converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option}
    )
    result = converter.convert(source_pdf)

    # Report only the picture items of the converted document.
    pictures = (
        item
        for item, _level in result.document.iterate_items()
        if isinstance(item, PictureItem)
    )
    for picture in pictures:
        print(
            f"The model populated the `data` portion of picture {picture.self_ref}:\n{picture.data}"
        )


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user