table enrichments - Description and Indexing

Signed-off-by: Nikhil Khandelwal <nikhil.khandelwal3@ibm.com>
Authored by Shivani Kabu on 2025-05-13 13:49:10 +05:30; committed by Nikhil Khandelwal
parent 0d0fa6cbe3
commit f2c019cad7
5 changed files with 53 additions and 11 deletions


@@ -226,7 +226,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
     params: Dict[str, Any] = {}
     timeout: float = 20
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
     provenance: str = ""
@@ -234,7 +234,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
     kind: ClassVar[Literal["vlm"]] = "vlm"
     repo_id: str
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
     # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
     generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
@@ -249,7 +249,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
 # phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
 granite_picture_description = PictureDescriptionVlmOptions(
     repo_id="ibm-granite/granite-vision-3.1-2b-preview",
-    prompt="What is shown in this image?",
+    prompt="What is shown in this image? Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image.",
 )
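The ontology-style wording is baked into the option defaults above. The same effect can be had per run, without touching the defaults, by supplying the prompt through the options object. A minimal sketch, with import paths assumed and the prompt text abbreviated:

```python
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)

# Abbreviated version of the prompt added in this commit; the full wording is in the diff.
graph_prompt = (
    "Describe this image in a few sentences. Extract semantic components from the image "
    "(objects, actions, colors, etc.) and generate a star-structure graph with the image "
    "as the central node."
)

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    prompt=graph_prompt,
)
```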
@@ -399,6 +399,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False # True: classify pictures in documents
     do_picture_description: bool = False # True: run describe pictures in documents
+    do_table_description: bool = False # True: run describe tables in documents
     force_backend_text: bool = (
         False # (To be used with vlms, or other generative models)
     )
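The new do_table_description flag sits alongside the existing enrichment toggles. A minimal sketch of enabling it for a conversion, assuming the standard DocumentConverter / PdfFormatOption wiring, which is not part of this diff:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.do_table_description = True  # flag introduced in this commit

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("sample.pdf")  # hypothetical input document
```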


@@ -24,6 +24,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionApiOptions,
@@ -31,6 +32,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,
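The API model only threads the new description_type keyword through to the base class. A sketch of direct construction (normally the pipeline factory does this); the import paths, the accelerator_options argument, and the endpoint URL are assumptions and not part of this diff:

```python
from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PictureDescriptionApiOptions,
)
from docling.models.picture_description_api_model import PictureDescriptionApiModel

model = PictureDescriptionApiModel(
    enabled=True,
    description_type="table",     # "picture" or "table", per the base-class dispatch
    enable_remote_services=True,  # the API model calls out to a remote endpoint
    artifacts_path=None,
    options=PictureDescriptionApiOptions(
        url="http://localhost:8000/v1/chat/completions",  # placeholder endpoint
    ),
    accelerator_options=AcceleratorOptions(),
)
```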


@@ -7,6 +7,7 @@ from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
     PictureItem,
+    TableItem
 )
 from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
     PictureDescriptionData,
@@ -34,17 +35,22 @@ class PictureDescriptionBaseModel(
         *,
         enabled: bool,
         enable_remote_services: bool,
+        description_type: str,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionBaseOptions,
         accelerator_options: AcceleratorOptions,
     ):
         self.enabled = enabled
         self.options = options
+        self.description_type = description_type
         self.provenance = "not-implemented"

     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
-        return self.enabled and isinstance(element, PictureItem)
+        if self.description_type == 'table':
+            return self.enabled and isinstance(element, TableItem)
+        elif self.description_type == 'picture':
+            return self.enabled and isinstance(element, PictureItem)

     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
         raise NotImplementedError
@@ -59,9 +65,9 @@ class PictureDescriptionBaseModel(
             return

         images: List[Image.Image] = []
-        elements: List[PictureItem] = []
+        elements: List[PictureItem | TableItem] = []
         for el in element_batch:
-            assert isinstance(el.item, PictureItem)
+            assert isinstance(el.item, PictureItem) or isinstance(el.item, TableItem)
             describe_image = True
             # Don't describe the image if it's smaller than the threshold
             if len(el.item.prov) > 0:
@@ -76,7 +82,6 @@ class PictureDescriptionBaseModel(
             if describe_image:
                 elements.append(el.item)
                 images.append(el.image)
-
         outputs = self._annotate_images(images)

         for item, output in zip(elements, outputs):
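As written, is_processable returns None (falsy) when description_type is neither 'table' nor 'picture'. A hypothetical standalone helper restating the dispatch with an explicit fallback:

```python
from docling_core.types.doc import NodeItem, PictureItem, TableItem


def is_element_processable(enabled: bool, description_type: str, element: NodeItem) -> bool:
    """Restates the method above; the final return makes the fallback explicit."""
    if description_type == "table":
        return enabled and isinstance(element, TableItem)
    if description_type == "picture":
        return enabled and isinstance(element, PictureItem)
    return False  # the method in the diff implicitly returns None here
```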


@@ -21,6 +21,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionVlmOptions,
@@ -28,6 +29,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,
@@ -57,9 +59,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation=(
-                    "flash_attention_2"
-                    if self.device.startswith("cuda")
-                    and accelerator_options.cuda_use_flash_attention2
+                    "flash_attention_2" if self.device.startswith("cuda") and accelerator_options.cuda_use_flash_attention2
                     else "eager"
                 ),
             ).to(self.device)
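The conditional expression above is only reflowed onto a single line; the selection logic is unchanged. A hypothetical helper restating it for readability:

```python
def pick_attn_implementation(device: str, cuda_use_flash_attention2: bool) -> str:
    """Mirror of the inline conditional: Flash Attention 2 only on CUDA with the flag set."""
    if device.startswith("cuda") and cuda_use_flash_attention2:
        return "flash_attention_2"
    return "eager"
```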


@@ -99,6 +99,18 @@ class StandardPdfPipeline(PaginatedPipeline):
             raise RuntimeError(
                 f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
             )
+
+        # Table description model
+        if (
+            table_description_model := self.get_table_description_model(
+                artifacts_path=artifacts_path
+            )
+        ) is None:
+            raise RuntimeError(
+                f"The specified table description kind is not supported: {pipeline_options.picture_description_options.kind}."
+            )

         self.enrichment_pipe = [
             # Code Formula Enrichment Model
@@ -121,12 +133,17 @@ class StandardPdfPipeline(PaginatedPipeline):
             ),
             # Document Picture description
             picture_description_model,
+            # Document Table description
+            table_description_model,
         ]

         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
             or self.pipeline_options.do_picture_description
+            or self.pipeline_options.do_table_description
         ):
             self.keep_backend = True
@@ -165,10 +182,27 @@ class StandardPdfPipeline(PaginatedPipeline):
         return factory.create_instance(
             options=self.pipeline_options.picture_description_options,
             enabled=self.pipeline_options.do_picture_description,
+            description_type = 'picture',
             enable_remote_services=self.pipeline_options.enable_remote_services,
             artifacts_path=artifacts_path,
             accelerator_options=self.pipeline_options.accelerator_options,
         )
+
+    def get_table_description_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[PictureDescriptionBaseModel]:
+        factory = get_picture_description_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.picture_description_options,
+            enabled=self.pipeline_options.do_table_description,
+            description_type = 'table',
+            enable_remote_services=self.pipeline_options.enable_remote_services,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )

     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         with TimeRecorder(conv_res, "page_init"):
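The new get_table_description_model mirrors get_picture_description_model: the two factory calls differ only in the enabled flag and the description_type, and both reuse picture_description_options, so the table model inherits the picture prompt unless the caller overrides it. A condensed restatement as a free function; the function name and import paths are assumptions:

```python
from pathlib import Path
from typing import Optional

from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.models.factories import get_picture_description_factory


def build_description_models(
    pipeline_options: PdfPipelineOptions, artifacts_path: Optional[Path] = None
):
    """Builds the picture and table description models the same way the pipeline does."""
    factory = get_picture_description_factory(
        allow_external_plugins=pipeline_options.allow_external_plugins
    )
    common = dict(
        options=pipeline_options.picture_description_options,  # shared by both models
        enable_remote_services=pipeline_options.enable_remote_services,
        artifacts_path=artifacts_path,
        accelerator_options=pipeline_options.accelerator_options,
    )
    picture_model = factory.create_instance(
        enabled=pipeline_options.do_picture_description,
        description_type="picture",
        **common,
    )
    table_model = factory.create_instance(
        enabled=pipeline_options.do_table_description,
        description_type="table",
        **common,
    )
    return picture_model, table_model
```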