Table enrichments - Description and Indexing

Signed-off-by: Nikhil Khandelwal <nikhil.khandelwal3@ibm.com>
Authored by Shivani Kabu on 2025-05-13 13:49:10 +05:30; committed by Nikhil Khandelwal
parent 0d0fa6cbe3
commit f2c019cad7
5 changed files with 53 additions and 11 deletions

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py

@@ -226,7 +226,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
     params: Dict[str, Any] = {}
     timeout: float = 20
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
     provenance: str = ""
@@ -234,7 +234,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
     kind: ClassVar[Literal["vlm"]] = "vlm"
     repo_id: str
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
     # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
     generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
@@ -249,7 +249,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
 # phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
 granite_picture_description = PictureDescriptionVlmOptions(
     repo_id="ibm-granite/granite-vision-3.1-2b-preview",
-    prompt="What is shown in this image?",
+    prompt="What is shown in this image? Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image.",
 )
@@ -399,6 +399,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
     do_picture_description: bool = False  # True: run describe pictures in documents
+    do_table_description: bool = False  # True: run describe tables in documents
     force_backend_text: bool = (
        False  # (To be used with vlms, or other generative models)
    )
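
For context, a minimal usage sketch of the new flag next to the existing picture flag. The converter wiring follows docling's usual PdfFormatOption pattern; treat the exact import paths as assumptions, and "report.pdf" is a placeholder input.

# Hedged usage sketch: enabling the new table descriptions alongside
# picture descriptions. Import paths assume docling's module layout.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

opts = PdfPipelineOptions()
opts.do_picture_description = True  # pre-existing flag
opts.do_table_description = True    # flag introduced by this commit

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
)
result = converter.convert("report.pdf")  # placeholder input file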

diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py

@@ -24,6 +24,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionApiOptions,
@@ -31,6 +32,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,

diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py

@@ -7,6 +7,7 @@ from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
     PictureItem,
+    TableItem
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
     PictureDescriptionData,
@@ -34,15 +35,20 @@ class PictureDescriptionBaseModel(
         *,
         enabled: bool,
         enable_remote_services: bool,
+        description_type: str,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionBaseOptions,
         accelerator_options: AcceleratorOptions,
     ):
         self.enabled = enabled
         self.options = options
+        self.description_type = description_type
         self.provenance = "not-implemented"

     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
-        return self.enabled and isinstance(element, PictureItem)
+        if self.description_type == 'table':
+            return self.enabled and isinstance(element, TableItem)
+        elif self.description_type == 'picture':
+            return self.enabled and isinstance(element, PictureItem)

     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
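
To make the routing concrete, here is a standalone mimic of the new check (plain Python, independent of the docling classes). Note the committed version has no final fallback return, so an unrecognized description_type implicitly yields None; the mimic makes that fallback explicit.

# Standalone mimic of the new dispatch, showing how description_type
# routes each element to exactly one item kind. Names are illustrative.
class _Table: ...
class _Picture: ...

def is_processable(description_type: str, enabled: bool, element: object) -> bool:
    if description_type == 'table':
        return enabled and isinstance(element, _Table)
    elif description_type == 'picture':
        return enabled and isinstance(element, _Picture)
    return False  # unknown modes match nothing

assert is_processable('table', True, _Table())
assert not is_processable('table', True, _Picture())   # pictures skipped in table mode
assert not is_processable('picture', False, _Picture())  # disabled model matches nothing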
@@ -59,9 +65,9 @@ class PictureDescriptionBaseModel(
             return

         images: List[Image.Image] = []
-        elements: List[PictureItem] = []
+        elements: List[PictureItem | TableItem] = []
         for el in element_batch:
-            assert isinstance(el.item, PictureItem)
+            assert isinstance(el.item, PictureItem) or isinstance(el.item, TableItem)
             describe_image = True
             # Don't describe the image if it's smaller than the threshold
             if len(el.item.prov) > 0:
@@ -76,7 +82,6 @@ class PictureDescriptionBaseModel(
             if describe_image:
                 elements.append(el.item)
                 images.append(el.image)
-
         outputs = self._annotate_images(images)
         for item, output in zip(elements, outputs):
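
Since pictures and tables now flow through the same batching loop, a backend only has to satisfy the image-to-text contract of _annotate_images. A hypothetical stub (not one of docling's shipped models; import path assumed) to illustrate that contract:

# Hypothetical subclass: yields one caption string per input image,
# which is all the batching code above requires of a backend.
from typing import Iterable

from PIL import Image

from docling.models.picture_description_base_model import PictureDescriptionBaseModel


class EchoDescriptionModel(PictureDescriptionBaseModel):
    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        for img in images:
            yield f"Element rendered at {img.width}x{img.height} pixels."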

diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py

@@ -21,6 +21,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionVlmOptions,
@@ -28,6 +29,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,
@@ -57,9 +59,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation=(
-                    "flash_attention_2"
-                    if self.device.startswith("cuda")
-                    and accelerator_options.cuda_use_flash_attention2
+                    "flash_attention_2" if self.device.startswith("cuda") and accelerator_options.cuda_use_flash_attention2
                     else "eager"
                 ),
             ).to(self.device)

diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py

@@ -100,6 +100,18 @@ class StandardPdfPipeline(PaginatedPipeline):
                 f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
             )

+        # Table description model
+        if (
+            table_description_model := self.get_table_description_model(
+                artifacts_path=artifacts_path
+            )
+        ) is None:
+            raise RuntimeError(
+                f"The specified table description kind is not supported: {pipeline_options.picture_description_options.kind}."
+            )
+
         self.enrichment_pipe = [
             # Code Formula Enrichment Model
             CodeFormulaModel(
@@ -121,12 +133,17 @@
             ),
             # Document Picture description
             picture_description_model,
+            # Document Table description
+            table_description_model,
         ]

         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
             or self.pipeline_options.do_picture_description
+            or self.pipeline_options.do_table_description
         ):
             self.keep_backend = True
@@ -165,11 +182,28 @@
         return factory.create_instance(
             options=self.pipeline_options.picture_description_options,
             enabled=self.pipeline_options.do_picture_description,
+            description_type='picture',
             enable_remote_services=self.pipeline_options.enable_remote_services,
             artifacts_path=artifacts_path,
             accelerator_options=self.pipeline_options.accelerator_options,
         )

+    def get_table_description_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[PictureDescriptionBaseModel]:
+        factory = get_picture_description_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.picture_description_options,
+            enabled=self.pipeline_options.do_table_description,
+            description_type='table',
+            enable_remote_services=self.pipeline_options.enable_remote_services,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         with TimeRecorder(conv_res, "page_init"):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
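
Both factory calls above read the same picture_description_options, so pictures and tables currently share one prompt, model, and backend. A hedged sketch of steering that shared configuration toward tables (field names taken from the options classes in this diff; the prompt text is illustrative):

# One options object feeds both get_picture_description_model and
# get_table_description_model, so this prompt applies to both modes.
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)

opts = PdfPipelineOptions()
opts.do_table_description = True
opts.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    prompt="Summarize this table: its columns, rows, and notable values.",
)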