mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
add options to generate images
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
f49d7881d0
commit
1cb11be06f
@ -64,8 +64,6 @@ class PipelineOptions(BaseModel):
|
|||||||
True # This default will be set to False in a future version of docling
|
True # This default will be set to False in a future version of docling
|
||||||
)
|
)
|
||||||
|
|
||||||
do_dummy_picture_classifer: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
class PdfPipelineOptions(PipelineOptions):
|
class PdfPipelineOptions(PipelineOptions):
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
artifacts_path: Optional[Union[Path, str]] = None
|
||||||
@ -77,4 +75,7 @@ class PdfPipelineOptions(PipelineOptions):
|
|||||||
Field(EasyOcrOptions(), discriminator="kind")
|
Field(EasyOcrOptions(), discriminator="kind")
|
||||||
)
|
)
|
||||||
|
|
||||||
images_scale: Optional[float] = None # if set, the scale for generated images
|
images_scale: float = 1.0
|
||||||
|
generate_page_images: bool = False
|
||||||
|
generate_picture_images: bool = False
|
||||||
|
generate_table_images: bool = False
|
||||||
|
@ -1,31 +0,0 @@
|
|||||||
from typing import Any, Iterable
|
|
||||||
|
|
||||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
|
||||||
from docling_core.types.doc.document import PictureClassificationData, PictureItem
|
|
||||||
|
|
||||||
from docling.models.base_model import BaseEnrichmentModel
|
|
||||||
|
|
||||||
|
|
||||||
class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel):
    """Placeholder enrichment model that stamps a constant label on pictures.

    Every `PictureItem` it processes receives the same hard-coded
    `PictureClassificationData`; no real classification is performed.
    """

    def __init__(self, enabled: bool):
        """Remember whether the model is switched on.

        Args:
            enabled: When false, no element is processable and calling the
                model yields nothing.
        """
        self.enabled = enabled

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Tell whether `element` is a picture and the model is enabled."""
        return self.enabled and isinstance(element, PictureItem)

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Attach the dummy classification to each picture and yield it.

        Args:
            doc: The document the elements belong to (not used here).
            element_batch: Elements to enrich; each must be a `PictureItem`.

        Yields:
            Each element of the batch after its `data.classification` was set.
        """
        if self.enabled:
            for picture in element_batch:
                assert isinstance(picture, PictureItem)
                label = PictureClassificationData(
                    provenance="dummy_classifier-0.0.1",
                    predicted_class="dummy",
                    confidence=0.42,
                )
                picture.data.classification = label

                yield picture
|
|
@ -2,6 +2,8 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from docling_core.types.doc.document import DocItem, ImageRef, PictureItem, TableItem
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page
|
from docling.datamodel.base_models import AssembledUnit, Page
|
||||||
@ -14,9 +16,6 @@ from docling.datamodel.pipeline_options import (
|
|||||||
)
|
)
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||||
from docling.models.dummy_picture_enrichment import (
|
|
||||||
DummyPictureClassifierEnrichmentModel,
|
|
||||||
)
|
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||||
@ -45,6 +44,12 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
else:
|
else:
|
||||||
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
||||||
|
|
||||||
|
keep_images = (
|
||||||
|
self.pipeline_options.generate_page_images
|
||||||
|
or self.pipeline_options.generate_picture_images
|
||||||
|
or self.pipeline_options.generate_table_images
|
||||||
|
)
|
||||||
|
|
||||||
self.glm_model = GlmModel(options=GlmOptions())
|
self.glm_model = GlmModel(options=GlmOptions())
|
||||||
|
|
||||||
if (ocr_model := self.get_ocr_model()) is None:
|
if (ocr_model := self.get_ocr_model()) is None:
|
||||||
@ -74,19 +79,11 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
options=pipeline_options.table_structure_options,
|
options=pipeline_options.table_structure_options,
|
||||||
),
|
),
|
||||||
# Page assemble
|
# Page assemble
|
||||||
PageAssembleModel(
|
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
|
||||||
options=PageAssembleOptions(
|
|
||||||
keep_images=pipeline_options.images_scale is not None
|
|
||||||
)
|
|
||||||
),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
self.enrichment_pipe = [
|
self.enrichment_pipe = [
|
||||||
# Other models working on `NodeItem` elements in the DoclingDocument
|
# Other models working on `NodeItem` elements in the DoclingDocument
|
||||||
# TODO Question: should we use the enabled flag or simply not add the model in the list?
|
|
||||||
DummyPictureClassifierEnrichmentModel(
|
|
||||||
enabled=pipeline_options.do_dummy_picture_classifer
|
|
||||||
)
|
|
||||||
]
|
]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -150,6 +147,45 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
conv_res.document = self.glm_model(conv_res)
|
conv_res.document = self.glm_model(conv_res)
|
||||||
|
|
||||||
|
# Generate page images in the output
|
||||||
|
if self.pipeline_options.generate_page_images:
|
||||||
|
for page in conv_res.pages:
|
||||||
|
assert page.image is not None
|
||||||
|
page_ix = page.page_no - 1
|
||||||
|
conv_res.document.pages[page_ix].image = ImageRef.from_pil(
|
||||||
|
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate images of the requested element types
|
||||||
|
if (
|
||||||
|
self.pipeline_options.generate_picture_images
|
||||||
|
or self.pipeline_options.generate_table_images
|
||||||
|
):
|
||||||
|
scale = self.pipeline_options.images_scale
|
||||||
|
for element, _level in conv_res.document.iterate_items():
|
||||||
|
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
||||||
|
continue
|
||||||
|
if (
|
||||||
|
isinstance(element, PictureItem)
|
||||||
|
and self.pipeline_options.generate_picture_images
|
||||||
|
) or (
|
||||||
|
isinstance(element, TableItem)
|
||||||
|
and self.pipeline_options.generate_table_images
|
||||||
|
):
|
||||||
|
page_ix = element.prov[0].page_no - 1
|
||||||
|
crop_bbox = (
|
||||||
|
element.prov[0]
|
||||||
|
.bbox.scaled(scale=scale)
|
||||||
|
.to_top_left_origin(
|
||||||
|
page_height=conv_res.pages[page_ix].size.height * scale
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
cropped_im = conv_res.pages[page_ix].image.crop(
|
||||||
|
crop_bbox.as_tuple()
|
||||||
|
)
|
||||||
|
element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -58,7 +58,6 @@ def main():
|
|||||||
pipeline_options.do_ocr = False
|
pipeline_options.do_ocr = False
|
||||||
pipeline_options.do_table_structure = True
|
pipeline_options.do_table_structure = True
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
pipeline_options.do_dummy_picture_classifer = True
|
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
|
92
docs/examples/develop_picture_enrichment.py
Normal file
92
docs/examples/develop_picture_enrichment.py
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Iterable
|
||||||
|
|
||||||
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||||
|
from docling_core.types.doc.document import PictureClassificationData, PictureItem
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
from docling.models.base_model import BaseEnrichmentModel
|
||||||
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
|
|
||||||
|
|
||||||
|
# Options for the example pipeline: everything from `PdfPipelineOptions`
# plus a single switch for the example picture classifier.
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
    # Turns the example classifier on or off; enabled by default.
    # The spelling of the field name is part of the public interface.
    do_picture_classifer: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
    """Example enrichment model that attaches a fixed classification to pictures.

    It only demonstrates how an enrichment model is written: every processed
    `PictureItem` receives the same hard-coded `PictureClassificationData`.
    """

    def __init__(self, enabled: bool):
        """Store whether this model should process any element.

        Args:
            enabled: When false, `is_processable` rejects every element and
                `__call__` yields nothing.
        """
        self.enabled = enabled

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Return a truthy value only for `PictureItem`s while enabled."""
        return self.enabled and isinstance(element, PictureItem)

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Set the example classification on each picture and yield it.

        Args:
            doc: The document the elements belong to (not used here).
            element_batch: Elements to enrich; each must be a `PictureItem`.

        Yields:
            Each element of the batch after its `data.classification` was set.
        """
        if not self.enabled:
            return

        for element in element_batch:
            assert isinstance(element, PictureItem)

            # uncomment this to interactively visualize the image
            # (needs the picture image to be populated, e.g. by enabling
            # `generate_picture_images` in the pipeline options; otherwise
            # `element.image` may be None)
            # element.image.pil_image.show()

            element.data.classification = PictureClassificationData(
                provenance="example_classifier-0.0.1",
                predicted_class="dummy",
                confidence=0.42,
            )

            yield element
|
||||||
|
|
||||||
|
|
||||||
|
class ExamplePictureClassifierPipeline(StandardPdfPipeline):
    """Standard PDF pipeline whose enrichment stage runs the example classifier."""

    def __init__(self, pipeline_options: ExamplePictureClassifierPipelineOptions):
        """Build the standard pipeline, then replace its enrichment models.

        Args:
            pipeline_options: Options for the standard PDF pipeline plus the
                `do_picture_classifer` switch read below.
        """
        super().__init__(pipeline_options)
        # Narrow the attribute type for type checkers: it holds the options
        # object (not the pipeline class).
        self.pipeline_options: ExamplePictureClassifierPipelineOptions

        # Overrides the enrichment models set up by the parent constructor.
        self.enrichment_pipe = [
            ExamplePictureClassifierEnrichmentModel(
                enabled=pipeline_options.do_picture_classifer
            )
        ]

    @classmethod
    def get_default_options(cls) -> ExamplePictureClassifierPipelineOptions:
        """Return a default-constructed options object for this pipeline."""
        return ExamplePictureClassifierPipelineOptions()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Convert the sample PDF with the example pipeline and print picture data.

    Picture images are generated at scale 2.0, the document is converted with
    `ExamplePictureClassifierPipeline`, and the `data` of every picture in
    the resulting document is printed.
    """
    logging.basicConfig(level=logging.INFO)

    options = ExamplePictureClassifierPipelineOptions()
    options.images_scale = 2.0
    options.generate_picture_images = True

    pdf_option = PdfFormatOption(
        pipeline_cls=ExamplePictureClassifierPipeline,
        pipeline_options=options,
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    source_pdf = Path("./tests/data/2206.01062.pdf")
    conversion = converter.convert(source_pdf)

    for element, _depth in conversion.document.iterate_items():
        if not isinstance(element, PictureItem):
            continue
        print(
            f"The model populated the `data` portion of picture {element.self_ref}:\n{element.data}"
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
Loading…
Reference in New Issue
Block a user