# Patch summary (leading text before the first file header is ignored by patch tools):
#   * docling/datamodel/pipeline_options.py: drops three blank lines that
#     separated the backwards-compatibility import statements.
#   * docling/models/base_model.py: imports Cluster and TextCell, changes the
#     return annotation of BaseLayoutModel.predict_on_page_image from
#     `list(Cluster)` to `list[Cluster]`, and adds the abstract method
#     BaseLayoutModel.postprocess_on_page_image.
#   * docling/models/vlm_models_inline/two_stage_vlm_model.py: types the
#     `layout_model` argument as BaseLayoutModel, calls the `*_on_page_image`
#     methods instead of `predict_on_page` / `postprocess_on_page`, and asserts
#     that `page_image` is not None before prediction.
# NOTE(review): line breaks and indentation were reconstructed from the hunk
# line counts and Python structure. Whitespace-only differences (the `pass`
# line pair in base_model.py, the blank-line pair before
# "# Define prompt structure") and the exact indentation depth of context
# lines could not be recovered -- verify against the repository before applying.
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index c6ec97eb..329d9de5 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -16,9 +16,7 @@ from docling.datamodel import asr_model_specs
 
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
-
 from docling.datamodel.asr_model_specs import WHISPER_TINY as whisper_tiny
-
 from docling.datamodel.layout_model_specs import (
     LayoutModelConfig,
     docling_layout_egret_large,
@@ -28,7 +26,6 @@ from docling.datamodel.layout_model_specs import (
     docling_layout_heron_101,
     docling_layout_v2,
 )
-
 from docling.datamodel.pipeline_options_asr_model import (
     InlineAsrOptions,
 )
diff --git a/docling/models/base_model.py b/docling/models/base_model.py
index dd019216..c8691e17 100644
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@@ -6,7 +6,12 @@ from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeIt
 from PIL import Image
 from typing_extensions import TypeVar
 
-from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
+from docling.datamodel.base_models import (
+    Cluster,
+    ItemAndImageEnrichmentElement,
+    Page,
+    TextCell,
+)
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import BaseOptions
 from docling.datamodel.settings import settings
@@ -29,10 +34,18 @@ class BasePageModel(ABC):
     ) -> Iterable[Page]:
         pass
 
+
 class BaseLayoutModel(BasePageModel):
     @abstractmethod
-    def predict_on_page_image(self, *, page_image: Image.Image) -> list(Cluster):
-        pass
+    def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
+        pass
+
+    @abstractmethod
+    def postprocess_on_page_image(
+        self, *, page: Page, clusters: list[Cluster]
+    ) -> tuple[Page, list[Cluster], list[TextCell]]:
+        pass
+
 
 class BaseVlmModel(BasePageModel):
     @abstractmethod
diff --git a/docling/models/vlm_models_inline/two_stage_vlm_model.py b/docling/models/vlm_models_inline/two_stage_vlm_model.py
index b31d46a9..131b874e 100644
--- a/docling/models/vlm_models_inline/two_stage_vlm_model.py
+++ b/docling/models/vlm_models_inline/two_stage_vlm_model.py
@@ -8,14 +8,14 @@ from typing import Any, Optional
 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
 )
-from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.base_models import Cluster, Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
     TransformersPromptStyle,
 )
-from docling.models.base_model import BasePageModel, BaseVlmModel
+from docling.models.base_model import BaseLayoutModel, BasePageModel, BaseVlmModel
 from docling.models.layout_model import LayoutModel
 from docling.models.utils.hf_model_download import (
     HuggingFaceModelDownloadMixin,
@@ -30,7 +30,7 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         *,
-        layout_model: LayoutModel,
+        layout_model: BaseLayoutModel,
         vlm_model: BaseVlmModel,
     ):
         self.layout_model = layout_model
@@ -51,13 +51,17 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                         scale=self.vlm_model.scale, max_size=self.vlm_model.max_size
                     )
 
-                    pred_clusters = self.layout_model.predict_on_page(page_image=page_image)
+                    assert page_image is not None
+
+                    pred_clusters = self.layout_model.predict_on_page_image(
+                        page_image=page_image
+                    )
                     page, processed_clusters, processed_cells = (
-                        self.layout_model.postprocess_on_page(
+                        self.layout_model.postprocess_on_page_image(
                             page=page, clusters=pred_clusters
                         )
                     )
-                    
+
                     # Define prompt structure
                     if callable(self.vlm_options.prompt):
                         user_prompt = self.vlm_options.prompt(page.parsed_page)