table enrichments - Description and Indexing
Signed-off-by: Nikhil Khandelwal <nikhil.khandelwal3@ibm.com>
commit f2c019cad7 (parent 0d0fa6cbe3)
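The commit adds a do_table_description flag to PdfPipelineOptions and threads a description_type argument through the picture-description models, so the same enrichment can describe tables as well as pictures. A minimal usage sketch follows; it is not part of the commit and assumes the standard docling converter API and an illustrative input path.

# Usage sketch (assumption, not part of this commit): enable the new table
# description together with picture description on the standard PDF pipeline.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.do_table_description = True  # flag introduced by this commit

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)
result = converter.convert("example.pdf")  # illustrative path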
@@ -226,7 +226,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
     params: Dict[str, Any] = {}
     timeout: float = 20
 
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
     provenance: str = ""
 
 
@@ -234,7 +234,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
     kind: ClassVar[Literal["vlm"]] = "vlm"
 
     repo_id: str
-    prompt: str = "Describe this image in a few sentences."
+    prompt: str = "Describe this image in a few sentences. Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image."
     # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
     generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
 
@@ -249,7 +249,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
 # phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
 granite_picture_description = PictureDescriptionVlmOptions(
     repo_id="ibm-granite/granite-vision-3.1-2b-preview",
-    prompt="What is shown in this image?",
+    prompt="What is shown in this image? Extract semantic components from the image (objects, actions, colors, etc.) and generate a star-structure graph with the image as the central node. Link surrounding nodes with labeled relationships to form a unique ontology-style concept map and add this to the description of the image.",
 )
 
 
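The three hunks above change the default prompts so the model is asked to append an ontology-style concept map to every description. Callers who prefer the original short description can still supply their own prompt through picture_description_options; a minimal sketch, assuming the SmolVLM repo id that docling ships as its default:

# Sketch (assumption, not part of this commit): override the new graph-style
# default prompt with a plain one-paragraph description.
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",  # assumed default; any supported VLM works
    prompt="Describe this image in a few sentences.",
)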
@@ -399,6 +399,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
     do_picture_description: bool = False  # True: run describe pictures in documents
+    do_table_description: bool = False  # True: run describe tables in documents
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
@@ -24,6 +24,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionApiOptions,
@@ -31,6 +32,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,
@@ -7,6 +7,7 @@ from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
     PictureItem,
+    TableItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
     PictureDescriptionData,
@@ -34,15 +35,20 @@ class PictureDescriptionBaseModel(
         *,
         enabled: bool,
         enable_remote_services: bool,
+        description_type: str,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionBaseOptions,
         accelerator_options: AcceleratorOptions,
     ):
         self.enabled = enabled
         self.options = options
+        self.description_type = description_type
         self.provenance = "not-implemented"
 
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
-        return self.enabled and isinstance(element, PictureItem)
+        if self.description_type == 'table':
+            return self.enabled and isinstance(element, TableItem)
+        elif self.description_type == 'picture':
+            return self.enabled and isinstance(element, PictureItem)
 
     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
@@ -59,9 +65,9 @@ class PictureDescriptionBaseModel(
             return
 
         images: List[Image.Image] = []
-        elements: List[PictureItem] = []
+        elements: List[PictureItem | TableItem] = []
         for el in element_batch:
-            assert isinstance(el.item, PictureItem)
+            assert isinstance(el.item, PictureItem) or isinstance(el.item, TableItem)
             describe_image = True
             # Don't describe the image if it's smaller than the threshold
             if len(el.item.prov) > 0:
@@ -76,7 +82,6 @@ class PictureDescriptionBaseModel(
             if describe_image:
                 elements.append(el.item)
                 images.append(el.image)
 
         outputs = self._annotate_images(images)
-
         for item, output in zip(elements, outputs):
@@ -21,6 +21,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     def __init__(
         self,
         enabled: bool,
+        description_type: str,
         enable_remote_services: bool,
         artifacts_path: Optional[Union[Path, str]],
         options: PictureDescriptionVlmOptions,
@@ -28,6 +29,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     ):
         super().__init__(
             enabled=enabled,
+            description_type=description_type,
             enable_remote_services=enable_remote_services,
             artifacts_path=artifacts_path,
             options=options,
@@ -57,9 +59,7 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation=(
-                    "flash_attention_2"
-                    if self.device.startswith("cuda")
-                    and accelerator_options.cuda_use_flash_attention2
+                    "flash_attention_2" if self.device.startswith("cuda") and accelerator_options.cuda_use_flash_attention2
                     else "eager"
                 ),
             ).to(self.device)
@@ -100,6 +100,18 @@ class StandardPdfPipeline(PaginatedPipeline):
                 f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
             )
 
+        # Table description model
+        if (
+            table_description_model := self.get_table_description_model(
+                artifacts_path=artifacts_path
+            )
+        ) is None:
+            raise RuntimeError(
+                f"The specified table description kind is not supported: {pipeline_options.picture_description_options.kind}."
+            )
+
+
+
         self.enrichment_pipe = [
             # Code Formula Enrichment Model
             CodeFormulaModel(
@@ -121,12 +133,17 @@ class StandardPdfPipeline(PaginatedPipeline):
             ),
             # Document Picture description
             picture_description_model,
+            # Document Table description
+            table_description_model,
+
+
         ]
 
         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
             or self.pipeline_options.do_picture_description
+            or self.pipeline_options.do_table_description
         ):
             self.keep_backend = True
 
@@ -165,11 +182,28 @@ class StandardPdfPipeline(PaginatedPipeline):
         return factory.create_instance(
             options=self.pipeline_options.picture_description_options,
             enabled=self.pipeline_options.do_picture_description,
+            description_type = 'picture',
             enable_remote_services=self.pipeline_options.enable_remote_services,
             artifacts_path=artifacts_path,
             accelerator_options=self.pipeline_options.accelerator_options,
         )
 
+    def get_table_description_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[PictureDescriptionBaseModel]:
+        factory = get_picture_description_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.picture_description_options,
+            enabled=self.pipeline_options.do_table_description,
+            description_type = 'table',
+            enable_remote_services=self.pipeline_options.enable_remote_services,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
+
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         with TimeRecorder(conv_res, "page_init"):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
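The enrichment attaches each generated text to its item as a PictureDescriptionData annotation, which is docling's existing mechanism for picture descriptions. A read-back sketch, assuming the result object from the earlier usage sketch and that TableItem exposes an annotations list the same way PictureItem does:

# Sketch (assumption, not part of this commit): read back generated descriptions.
from docling_core.types.doc.document import PictureDescriptionData

doc = result.document  # 'result' from the conversion sketch above

for picture in doc.pictures:
    for annotation in picture.annotations:
        if isinstance(annotation, PictureDescriptionData):
            print("picture:", annotation.text)

# Table descriptions are produced by the same base model; this assumes TableItem
# carries an annotations list like PictureItem, so guard with getattr.
for table in doc.tables:
    for annotation in getattr(table, "annotations", []):
        if isinstance(annotation, PictureDescriptionData):
            print("table:", annotation.text)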