mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
table enrichments - Description and Indexing
Signed-off-by: Nikhil Khandelwal <nikhil.khandelwal3@ibm.com>
This commit is contained in:
parent
0d0fa6cbe3
commit
f2c019cad7
@ -226,7 +226,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
||||
params: Dict[str, Any] = {}
|
||||
timeout: float = 20
|
||||
|
||||
prompt: str = "Describe this image in a few sentences."
|
||||
prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
|
||||
provenance: str = ""
|
||||
|
||||
|
||||
@ -234,7 +234,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
||||
kind: ClassVar[Literal["vlm"]] = "vlm"
|
||||
|
||||
repo_id: str
|
||||
prompt: str = "Describe this image in a few sentences."
|
||||
prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
|
||||
# Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
|
||||
generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
|
||||
|
||||
@ -249,7 +249,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
|
||||
# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
|
||||
# Preset options for the Granite Vision preview model.
# A call may pass each keyword only once, so a single ``prompt`` is kept: the
# extended one that also asks for the ontology-style concept map.
granite_picture_description = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    # Adjacent string literals are concatenated at compile time; the pieces
    # below form one prompt string.
    prompt=(
        "What is shown in this image? "
        "Extract semantic components from the image (objects, actions, colors, etc.) "
        "and generate a star-structure graph with the image as the central node. "
        "Link surrounding nodes with labeled relationships to form a unique "
        "ontology-style concept map and add this to the description of the image."
    ),
)
|
||||
|
||||
|
||||
@ -399,6 +399,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
|
||||
do_picture_classification: bool = False # True: classify pictures in documents
|
||||
do_picture_description: bool = False # True: describe pictures in documents
|
||||
do_table_description: bool = False # True: describe tables in documents
|
||||
force_backend_text: bool = (
|
||||
False # (To be used with vlms, or other generative models)
|
||||
)
|
||||
|
@ -24,6 +24,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
description_type: str,
|
||||
enable_remote_services: bool,
|
||||
artifacts_path: Optional[Union[Path, str]],
|
||||
options: PictureDescriptionApiOptions,
|
||||
@ -31,6 +32,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
||||
):
|
||||
super().__init__(
|
||||
enabled=enabled,
|
||||
description_type=description_type,
|
||||
enable_remote_services=enable_remote_services,
|
||||
artifacts_path=artifacts_path,
|
||||
options=options,
|
||||
|
@ -7,6 +7,7 @@ from docling_core.types.doc import (
|
||||
DoclingDocument,
|
||||
NodeItem,
|
||||
PictureItem,
|
||||
TableItem
|
||||
)
|
||||
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
|
||||
PictureDescriptionData,
|
||||
@ -34,16 +35,21 @@ class PictureDescriptionBaseModel(
|
||||
*,
|
||||
enabled: bool,
|
||||
enable_remote_services: bool,
|
||||
description_type: str,
|
||||
artifacts_path: Optional[Union[Path, str]],
|
||||
options: PictureDescriptionBaseOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
self.enabled = enabled
|
||||
self.options = options
|
||||
self.description_type = description_type
|
||||
self.provenance = "not-implemented"
|
||||
|
||||
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
    """Return whether this model should annotate ``element``.

    The accepted element type depends on ``self.description_type``:
    ``'table'`` accepts ``TableItem`` and ``'picture'`` accepts
    ``PictureItem``. Any other description type accepts nothing.

    Args:
        doc: The document that owns ``element`` (not inspected here).
        element: The candidate node.

    Returns:
        ``True`` only when the model is enabled and ``element`` is an
        instance of the item type selected by ``self.description_type``;
        ``False`` otherwise.
    """
    if not self.enabled:
        return False
    if self.description_type == "table":
        return isinstance(element, TableItem)
    if self.description_type == "picture":
        return isinstance(element, PictureItem)
    # Unknown description type: answer with an explicit bool instead of
    # falling off the end of the function and returning None.
    return False
|
||||
|
||||
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
    """Return description strings for ``images``.

    This base implementation does no work and always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError
|
||||
@ -59,9 +65,9 @@ class PictureDescriptionBaseModel(
|
||||
return
|
||||
|
||||
images: List[Image.Image] = []
|
||||
elements: List[PictureItem] = []
|
||||
elements: List[PictureItem | TableItem] = []
|
||||
for el in element_batch:
|
||||
assert isinstance(el.item, PictureItem)
|
||||
assert isinstance(el.item, PictureItem) or isinstance(el.item, TableItem)
|
||||
describe_image = True
|
||||
# Don't describe the image if it's smaller than the threshold
|
||||
if len(el.item.prov) > 0:
|
||||
@ -76,7 +82,6 @@ class PictureDescriptionBaseModel(
|
||||
if describe_image:
|
||||
elements.append(el.item)
|
||||
images.append(el.image)
|
||||
|
||||
outputs = self._annotate_images(images)
|
||||
|
||||
for item, output in zip(elements, outputs):
|
||||
|
@ -21,6 +21,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
description_type: str,
|
||||
enable_remote_services: bool,
|
||||
artifacts_path: Optional[Union[Path, str]],
|
||||
options: PictureDescriptionVlmOptions,
|
||||
@ -28,6 +29,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
||||
):
|
||||
super().__init__(
|
||||
enabled=enabled,
|
||||
description_type=description_type,
|
||||
enable_remote_services=enable_remote_services,
|
||||
artifacts_path=artifacts_path,
|
||||
options=options,
|
||||
@ -57,9 +59,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
||||
artifacts_path,
|
||||
torch_dtype=torch.bfloat16,
|
||||
_attn_implementation=(
|
||||
"flash_attention_2"
|
||||
if self.device.startswith("cuda")
|
||||
and accelerator_options.cuda_use_flash_attention2
|
||||
"flash_attention_2" if self.device.startswith("cuda") and accelerator_options.cuda_use_flash_attention2
|
||||
else "eager"
|
||||
),
|
||||
).to(self.device)
|
||||
|
@ -100,6 +100,18 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
|
||||
)
|
||||
|
||||
# Table description model
|
||||
if (
|
||||
table_description_model := self.get_table_description_model(
|
||||
artifacts_path=artifacts_path
|
||||
)
|
||||
) is None:
|
||||
raise RuntimeError(
|
||||
f"The specified table description kind is not supported: {pipeline_options.picture_description_options.kind}."
|
||||
)
|
||||
|
||||
|
||||
|
||||
self.enrichment_pipe = [
|
||||
# Code Formula Enrichment Model
|
||||
CodeFormulaModel(
|
||||
@ -121,12 +133,17 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
),
|
||||
# Document Picture description
|
||||
picture_description_model,
|
||||
# Document Table description
|
||||
table_description_model,
|
||||
|
||||
|
||||
]
|
||||
|
||||
if (
|
||||
self.pipeline_options.do_formula_enrichment
|
||||
or self.pipeline_options.do_code_enrichment
|
||||
or self.pipeline_options.do_picture_description
|
||||
or self.pipeline_options.do_table_description
|
||||
):
|
||||
self.keep_backend = True
|
||||
|
||||
@ -165,11 +182,28 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
return factory.create_instance(
|
||||
options=self.pipeline_options.picture_description_options,
|
||||
enabled=self.pipeline_options.do_picture_description,
|
||||
description_type = 'picture',
|
||||
enable_remote_services=self.pipeline_options.enable_remote_services,
|
||||
artifacts_path=artifacts_path,
|
||||
accelerator_options=self.pipeline_options.accelerator_options,
|
||||
)
|
||||
|
||||
def get_table_description_model(
    self, artifacts_path: Optional[Path] = None
) -> Optional[PictureDescriptionBaseModel]:
    """Create the description model used for tables.

    The instance comes from the picture-description factory and is
    configured with ``picture_description_options``; it is switched on by
    ``do_table_description`` and created with ``description_type='table'``.

    Args:
        artifacts_path: Forwarded unchanged to the factory.

    Returns:
        Whatever ``factory.create_instance`` returns for these options.
    """
    opts = self.pipeline_options
    factory = get_picture_description_factory(
        allow_external_plugins=opts.allow_external_plugins
    )
    return factory.create_instance(
        options=opts.picture_description_options,
        enabled=opts.do_table_description,
        description_type="table",
        enable_remote_services=opts.enable_remote_services,
        artifacts_path=artifacts_path,
        accelerator_options=opts.accelerator_options,
    )
|
||||
|
||||
|
||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
with TimeRecorder(conv_res, "page_init"):
|
||||
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
||||
|
Loading…
Reference in New Issue
Block a user