feat: Add content_layer property to items to address body, furniture and other roles (#735)

* feat: Pass predicted page-headers and page-footers through to DoclingDocument furniture

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* chore: Update all test GT

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: update all test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: update all test cases again

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lock

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lock to final docling-core

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-02-10 12:07:49 +01:00
committed by GitHub
parent 3e26597995
commit cf78d5b7b9
43 changed files with 2082 additions and 198 deletions

View File

@@ -4,7 +4,12 @@ from pathlib import Path
from typing import List, Union
from deepsearch_glm.andromeda_nlp import nlp_model
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItemLabel,
DoclingDocument,
)
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import (
Figure,
@@ -71,12 +76,15 @@ class GlmModel:
)
main_text: List[Union[Ref, BaseText]] = []
page_headers: List[Union[Ref, BaseText]] = []
page_footers: List[Union[Ref, BaseText]] = []
tables: List[DsSchemaTable] = []
figures: List[Figure] = []
page_no_to_page = {p.page_no: p for p in conv_res.pages}
for element in conv_res.assembled.elements:
for element in conv_res.assembled.body:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
@@ -238,6 +246,53 @@ class GlmModel:
)
)
# We can append headers and footers at the end of the legacy doc,
# since the reading-order will re-sort them later.
for element in conv_res.assembled.headers:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)
if isinstance(element, TextElement):
tel = BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, len(element.text)],
)
],
)
if element.label == DocItemLabel.PAGE_HEADER:
index = len(page_headers)
ref_str = f"#/page-headers/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
page_headers.append(tel)
elif element.label == DocItemLabel.PAGE_FOOTER:
index = len(page_footers)
ref_str = f"#/page-footers/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
page_footers.append(tel)
page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in conv_res.pages
@@ -252,6 +307,8 @@ class GlmModel:
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
page_headers=page_headers,
page_footers=page_footers,
)
return ds_doc
@@ -264,6 +321,7 @@ class GlmModel:
glm_doc = self.model.apply_on_doc(ds_doc_dict)
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
1 == 1
# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):

View File

@@ -15,6 +15,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
def resolve_item(paths, obj):
@@ -311,6 +312,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
current_list = None
doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
current_list = None
doc.add_text(
label=DocItemLabel(name_label),
text=text,
prov=prov,
content_layer=ContentLayer.FURNITURE,
)
else:
current_list = None