diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index a24df89d..c241da2d 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -226,7 +226,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
     params: Dict[str, Any] = {}
     timeout: float = 20
 
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
     provenance: str = ""
 
 
@@ -234,7 +234,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
     kind: ClassVar[Literal["vlm"]] = "vlm"
 
     repo_id: str
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
 
     # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
     generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
@@ -249,7 +249,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
 # phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
 granite_picture_description = PictureDescriptionVlmOptions(
     repo_id="ibm-granite/granite-vision-3.1-2b-preview",
-    prompt="What is shown in this image?",
+    prompt="What is shown in this image? Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image.",
 )
 
 
@@ -399,6 +399,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
     do_picture_description: bool = False  # True: run describe pictures in documents
+    do_table_description: bool = False  # True: run describe tables in documents
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py
index 44bb5e21..7c5e5ea9 100644
--- a/docling/models/picture_description_api_model.py
+++ b/docling/models/picture_description_api_model.py
@@ -24,6 +24,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionApiOptions,
@@ -31,6 +32,7 @@
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,
diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py
index 2f6e6479..6ecf8a2c 100644
--- a/docling/models/picture_description_base_model.py
+++ b/docling/models/picture_description_base_model.py
@@ -7,6 +7,7 @@ from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
     PictureItem,
+    TableItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
     PictureDescriptionData,
@@ -34,16 +35,23 @@ class PictureDescriptionBaseModel(
         *,
         enabled: bool,
         enable_remote_services: bool,
+        description_type: str,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionBaseOptions,
         accelerator_options: AcceleratorOptions,
     ):
         self.enabled = enabled
         self.options = options
+        self.description_type = description_type
         self.provenance = "not-implemented"
 
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
-        return self.enabled and isinstance(element, PictureItem)
+        # Each model instance handles one element type, chosen via description_type.
+        if self.description_type == "table":
+            return self.enabled and isinstance(element, TableItem)
+        elif self.description_type == "picture":
+            return self.enabled and isinstance(element, PictureItem)
+        return False
 
     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
         raise NotImplementedError
@@ -59,9 +67,9 @@
             return
 
         images: List[Image.Image] = []
-        elements: List[PictureItem] = []
+        elements: List[Union[PictureItem, TableItem]] = []
         for el in element_batch:
-            assert isinstance(el.item, PictureItem)
+            assert isinstance(el.item, (PictureItem, TableItem))
             describe_image = True
             # Don't describe the image if it's smaller than the threshold
             if len(el.item.prov) > 0:
diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py
index 679e80c2..b3e2b4ef 100644
--- a/docling/models/picture_description_vlm_model.py
+++ b/docling/models/picture_description_vlm_model.py
@@ -21,6 +21,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionVlmOptions,
@@ -28,6 +29,7 @@
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index fe93c6c5..b7742046 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -99,6 +99,16 @@ class StandardPdfPipeline(PaginatedPipeline):
             raise RuntimeError(
                 f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
             )
+
+        # Table description model (shares picture_description_options)
+        if (
+            table_description_model := self.get_table_description_model(
+                artifacts_path=artifacts_path
+            )
+        ) is None:
+            raise RuntimeError(
+                f"The specified table description kind is not supported: {pipeline_options.picture_description_options.kind}."
+            )
 
         self.enrichment_pipe = [
             # Code Formula Enrichment Model
@@ -121,12 +131,15 @@
             ),
             # Document Picture description
             picture_description_model,
+            # Document Table description
+            table_description_model,
         ]
 
         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
             or self.pipeline_options.do_picture_description
+            or self.pipeline_options.do_table_description
         ):
             self.keep_backend = True
 
@@ -165,10 +178,28 @@
         return factory.create_instance(
             options=self.pipeline_options.picture_description_options,
             enabled=self.pipeline_options.do_picture_description,
+            description_type="picture",
             enable_remote_services=self.pipeline_options.enable_remote_services,
             artifacts_path=artifacts_path,
             accelerator_options=self.pipeline_options.accelerator_options,
         )
 
+    def get_table_description_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[PictureDescriptionBaseModel]:
+        # Tables are described by the same model classes as pictures; only the
+        # description_type flag passed to the factory differs.
+        factory = get_picture_description_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.picture_description_options,
+            enabled=self.pipeline_options.do_table_description,
+            description_type="table",
+            enable_remote_services=self.pipeline_options.enable_remote_services,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         with TimeRecorder(conv_res, "page_init"):
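
To exercise the new flag end to end, here is a minimal sketch of how `do_table_description` would be enabled through the public converter API. It assumes the standard `DocumentConverter` / `PdfFormatOption` entry points and reuses `smolvlm_picture_description` for both pictures and tables, since this diff routes table descriptions through the shared `picture_description_options`; the input path is a placeholder.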
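```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    smolvlm_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
# Pictures and tables are handled by two model instances that share the same
# options object; internally only description_type differs.
pipeline_options.do_picture_description = True
pipeline_options.do_table_description = True  # new flag from this diff
pipeline_options.picture_description_options = smolvlm_picture_description

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
result = converter.convert("report.pdf")  # placeholder input
print(result.document.export_to_markdown())
```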