Move logic into BaseTextImageEnrichmentModel

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-01-14 12:53:52 +01:00
parent 3611335d22
commit 12b6417f51
3 changed files with 35 additions and 31 deletions

View File

@ -7,6 +7,7 @@ from docling_core.types.doc import (
PictureDataType, PictureDataType,
Size, Size,
TableCell, TableCell,
TextItem,
) )
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
DocumentStream, DocumentStream,
@ -201,6 +202,13 @@ class AssembledUnit(BaseModel):
headers: List[PageElement] = [] headers: List[PageElement] = []
class TextImageEnrichmentElement(BaseModel):
    """Pairing of a document text item with an image.

    Bundles the two pieces of input that a text-and-image enrichment model
    receives for a single element.
    """

    # Permit field types that pydantic cannot validate natively
    # (presumably required for ``Image`` -- confirm against its import).
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # The text item the enrichment applies to.
    element: TextItem
    # The image handed to the model together with ``element``.
    image: Image
class Page(BaseModel): class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True) model_config = ConfigDict(arbitrary_types_allowed=True)

View File

@ -1,10 +1,10 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Generic, Iterable, Optional from typing import Any, Generic, Iterable, Optional
from docling_core.types.doc import DoclingDocument, NodeItem from docling_core.types.doc import DoclingDocument, NodeItem, TextItem
from typing_extensions import TypeVar from typing_extensions import TypeVar
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page, TextImageEnrichmentElement
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
@ -46,3 +46,22 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
if self.is_processable(doc=conv_res.document, element=element): if self.is_processable(doc=conv_res.document, element=element):
return element return element
return None return None
class BaseTextImageEnrichmentModel(GenericEnrichmentModel[TextImageEnrichmentElement]):
    """Base for enrichment models fed with a text item plus its cropped image.

    ``prepare_element`` turns a processable document element into a
    ``TextImageEnrichmentElement`` by cropping the element's bounding box out
    of the page it was found on.
    """

    # Scale forwarded to the page's ``get_image`` call when cropping.
    # Declared without a value here, so concrete subclasses must provide one.
    images_scale: float

    def prepare_element(
        self, conv_res: ConversionResult, element: NodeItem
    ) -> Optional[TextImageEnrichmentElement]:
        """Build the model input for ``element``, or ``None`` to skip it.

        Args:
            conv_res: Conversion result holding the document and its pages.
            element: Candidate document node.

        Returns:
            A ``TextImageEnrichmentElement`` with the element and its image
            crop, or ``None`` when the element is not processable or carries
            no provenance to crop from.
        """
        if not self.is_processable(doc=conv_res.document, element=element):
            return None

        # Invariant check that also narrows the type for static checkers.
        assert isinstance(element, TextItem)

        # Without provenance there is no page/bbox to crop from; skip the
        # element instead of failing with an IndexError on ``prov[0]``.
        if not element.prov:
            return None

        # Only the first provenance entry is used to locate the element.
        element_prov = element.prov[0]
        # ``page_no - 1`` is used as the index into ``conv_res.pages``.
        page_ix = element_prov.page_no - 1
        cropped_image = conv_res.pages[page_ix].get_image(
            scale=self.images_scale, cropbox=element_prov.bbox
        )
        return TextImageEnrichmentElement(element=element, image=cropped_image)

View File

@ -3,14 +3,11 @@ from pathlib import Path
from typing import Iterable from typing import Iterable
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
from PIL import Image as PILImage
from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat, TextImageEnrichmentElement
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel, GenericEnrichmentModel from docling.models.base_model import BaseTextImageEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@ -18,16 +15,9 @@ class ExampleFormulaUPipelineOptions(PdfPipelineOptions):
do_formula_understanding: bool = True do_formula_understanding: bool = True
class FormulaEnrichmentElement(BaseModel): # A new enrichment model using both the document element and its image as input
element: TextItem class ExampleFormulaUEnrichmentModel(BaseTextImageEnrichmentModel):
image: PILImage.Image images_scale = 2.6
model_config = ConfigDict(arbitrary_types_allowed=True)
class ExampleFormulaUEnrichmentModel(GenericEnrichmentModel[FormulaEnrichmentElement]):
images_scale: float = 2.6
def __init__(self, enabled: bool): def __init__(self, enabled: bool):
self.enabled = enabled self.enabled = enabled
@ -39,21 +29,8 @@ class ExampleFormulaUEnrichmentModel(GenericEnrichmentModel[FormulaEnrichmentEle
and element.label == DocItemLabel.FORMULA and element.label == DocItemLabel.FORMULA
) )
def prepare_element(
self, conv_res: ConversionResult, element: NodeItem
) -> FormulaEnrichmentElement:
if self.is_processable(doc=conv_res.document, element=element):
assert isinstance(element, TextItem)
element_prov = element.prov[0]
page_ix = element_prov.page_no - 1
cropped_image = conv_res.pages[page_ix].get_image(
scale=self.images_scale, cropbox=element_prov.bbox
)
return FormulaEnrichmentElement(element=element, image=cropped_image)
def __call__( def __call__(
self, doc: DoclingDocument, element_batch: Iterable[FormulaEnrichmentElement] self, doc: DoclingDocument, element_batch: Iterable[TextImageEnrichmentElement]
) -> Iterable[NodeItem]: ) -> Iterable[NodeItem]:
if not self.enabled: if not self.enabled:
return return