Table enrichments - Description and Indexing

Signed-off-by: Nikhil Khandelwal <nikhil.khandelwal3@ibm.com>
Authored by Shivani Kabu on 2025-05-13 13:49:10 +05:30; committed by Nikhil Khandelwal
parent 0d0fa6cbe3
commit f2c019cad7
5 changed files with 53 additions and 11 deletions

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py

@@ -226,7 +226,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
     params: Dict[str, Any] = {}
     timeout: float = 20
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
     provenance: str = ""
@@ -234,7 +234,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
     kind: ClassVar[Literal["vlm"]] = "vlm"
     repo_id: str
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
     # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
     generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
@@ -249,7 +249,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
 # phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
 granite_picture_description = PictureDescriptionVlmOptions(
     repo_id="ibm-granite/granite-vision-3.1-2b-preview",
-    prompt="What is shown in this image?",
+    prompt="What is shown in this image? Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image.",
 )
@@ -399,6 +399,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
     do_picture_description: bool = False  # True: run describe pictures in documents
+    do_table_description: bool = False  # True: run describe tables in documents
     force_backend_text: bool = (
        False  # (To be used with vlms, or other generative models)
    )
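
For context, a minimal usage sketch of the new flag next to the existing picture flag. The converter wiring follows docling's usual PdfFormatOption pattern; treat the exact import paths as assumptions, and "report.pdf" is a placeholder input.

# Hedged usage sketch: enabling the new table descriptions alongside
# picture descriptions. Import paths assume docling's module layout.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

opts = PdfPipelineOptions()
opts.do_picture_description = True  # pre-existing flag
opts.do_table_description = True    # flag introduced by this commit

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
)
result = converter.convert("report.pdf")  # placeholder input file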

diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py

@@ -24,6 +24,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionApiOptions,
@@ -31,6 +32,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,

diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py

@@ -7,6 +7,7 @@ from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
     PictureItem,
+    TableItem
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
     PictureDescriptionData,
@@ -34,15 +35,20 @@ class PictureDescriptionBaseModel(
         *,
         enabled: bool,
         enable_remote_services: bool,
+        description_type: str,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionBaseOptions,
         accelerator_options: AcceleratorOptions,
     ):
         self.enabled = enabled
         self.options = options
+        self.description_type = description_type
         self.provenance = "not-implemented"

     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
-        return self.enabled and isinstance(element, PictureItem)
+        if self.description_type == 'table':
+            return self.enabled and isinstance(element, TableItem)
+        elif self.description_type == 'picture':
+            return self.enabled and isinstance(element, PictureItem)

     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
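
To make the routing concrete, here is a standalone mimic of the new check (plain Python, independent of the docling classes). Note the committed version has no final fallback return, so an unrecognized description_type implicitly yields None; the mimic makes that fallback explicit.

# Standalone mimic of the new dispatch, showing how description_type
# routes each element to exactly one item kind. Names are illustrative.
class _Table: ...
class _Picture: ...

def is_processable(description_type: str, enabled: bool, element: object) -> bool:
    if description_type == 'table':
        return enabled and isinstance(element, _Table)
    elif description_type == 'picture':
        return enabled and isinstance(element, _Picture)
    return False  # unknown modes match nothing

assert is_processable('table', True, _Table())
assert not is_processable('table', True, _Picture())   # pictures skipped in table mode
assert not is_processable('picture', False, _Picture())  # disabled model matches nothing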
@@ -59,9 +65,9 @@ class PictureDescriptionBaseModel(
             return

         images: List[Image.Image] = []
-        elements: List[PictureItem] = []
+        elements: List[PictureItem | TableItem] = []
         for el in element_batch:
-            assert isinstance(el.item, PictureItem)
+            assert isinstance(el.item, PictureItem) or isinstance(el.item, TableItem)
             describe_image = True
             # Don't describe the image if it's smaller than the threshold
             if len(el.item.prov) > 0:
@@ -76,7 +82,6 @@ class PictureDescriptionBaseModel(
             if describe_image:
                 elements.append(el.item)
                 images.append(el.image)
-
         outputs = self._annotate_images(images)
         for item, output in zip(elements, outputs):
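
Since pictures and tables now flow through the same batching loop, a backend only has to satisfy the image-to-text contract of _annotate_images. A hypothetical stub (not one of docling's shipped models; import path assumed) to illustrate that contract:

# Hypothetical subclass: yields one caption string per input image,
# which is all the batching code above requires of a backend.
from typing import Iterable

from PIL import Image

from docling.models.picture_description_base_model import PictureDescriptionBaseModel


class EchoDescriptionModel(PictureDescriptionBaseModel):
    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        for img in images:
            yield f"Element rendered at {img.width}x{img.height} pixels."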

diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py

@@ -21,6 +21,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionVlmOptions,
@@ -28,6 +29,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,
@@ -57,9 +59,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation=(
-                    "flash_attention_2"
-                    if self.device.startswith("cuda")
-                    and accelerator_options.cuda_use_flash_attention2
+                    "flash_attention_2" if self.device.startswith("cuda") and accelerator_options.cuda_use_flash_attention2
                     else "eager"
                 ),
             ).to(self.device)

diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py

@@ -100,6 +100,18 @@ class StandardPdfPipeline(PaginatedPipeline):
                 f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
             )

+        # Table description model
+        if (
+            table_description_model := self.get_table_description_model(
+                artifacts_path=artifacts_path
+            )
+        ) is None:
+            raise RuntimeError(
+                f"The specified table description kind is not supported: {pipeline_options.picture_description_options.kind}."
+            )
+
         self.enrichment_pipe = [
             # Code Formula Enrichment Model
             CodeFormulaModel(
@@ -121,12 +133,17 @@
             ),
             # Document Picture description
             picture_description_model,
+            # Document Table description
+            table_description_model,
         ]

         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
             or self.pipeline_options.do_picture_description
+            or self.pipeline_options.do_table_description
         ):
             self.keep_backend = True
@@ -165,11 +182,28 @@
         return factory.create_instance(
             options=self.pipeline_options.picture_description_options,
             enabled=self.pipeline_options.do_picture_description,
+            description_type='picture',
             enable_remote_services=self.pipeline_options.enable_remote_services,
             artifacts_path=artifacts_path,
             accelerator_options=self.pipeline_options.accelerator_options,
         )

+    def get_table_description_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[PictureDescriptionBaseModel]:
+        factory = get_picture_description_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.picture_description_options,
+            enabled=self.pipeline_options.do_table_description,
+            description_type='table',
+            enable_remote_services=self.pipeline_options.enable_remote_services,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         with TimeRecorder(conv_res, "page_init"):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
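
Both factory calls above read the same picture_description_options, so pictures and tables currently share one prompt, model, and backend. A hedged sketch of steering that shared configuration toward tables (field names taken from the options classes in this diff; the prompt text is illustrative):

# One options object feeds both get_picture_description_model and
# get_table_description_model, so this prompt applies to both modes.
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)

opts = PdfPipelineOptions()
opts.do_table_description = True
opts.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    prompt="Summarize this table: its columns, rows, and notable values.",
)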