mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
feat: Pass predicted page-headers and page-footers through to DoclingDocument furniture
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
1976584be1
commit
5c681ba352
@ -4,7 +4,12 @@ from pathlib import Path
|
|||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
|
||||||
from deepsearch_glm.andromeda_nlp import nlp_model
|
from deepsearch_glm.andromeda_nlp import nlp_model
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
|
from docling_core.types.doc import (
|
||||||
|
BoundingBox,
|
||||||
|
CoordOrigin,
|
||||||
|
DocItemLabel,
|
||||||
|
DoclingDocument,
|
||||||
|
)
|
||||||
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
||||||
from docling_core.types.legacy_doc.base import (
|
from docling_core.types.legacy_doc.base import (
|
||||||
Figure,
|
Figure,
|
||||||
@ -71,12 +76,15 @@ class GlmModel:
|
|||||||
)
|
)
|
||||||
|
|
||||||
main_text: List[Union[Ref, BaseText]] = []
|
main_text: List[Union[Ref, BaseText]] = []
|
||||||
|
page_headers: List[Union[Ref, BaseText]] = []
|
||||||
|
page_footers: List[Union[Ref, BaseText]] = []
|
||||||
|
|
||||||
tables: List[DsSchemaTable] = []
|
tables: List[DsSchemaTable] = []
|
||||||
figures: List[Figure] = []
|
figures: List[Figure] = []
|
||||||
|
|
||||||
page_no_to_page = {p.page_no: p for p in conv_res.pages}
|
page_no_to_page = {p.page_no: p for p in conv_res.pages}
|
||||||
|
|
||||||
for element in conv_res.assembled.elements:
|
for element in conv_res.assembled.body:
|
||||||
# Convert bboxes to lower-left origin.
|
# Convert bboxes to lower-left origin.
|
||||||
target_bbox = DsBoundingBox(
|
target_bbox = DsBoundingBox(
|
||||||
element.cluster.bbox.to_bottom_left_origin(
|
element.cluster.bbox.to_bottom_left_origin(
|
||||||
@ -238,6 +246,53 @@ class GlmModel:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# We can throw in headers and footers at the end of the legacy doc
|
||||||
|
# since the reading-order will re-sort it later.
|
||||||
|
for element in conv_res.assembled.headers:
|
||||||
|
# Convert bboxes to lower-left origin.
|
||||||
|
target_bbox = DsBoundingBox(
|
||||||
|
element.cluster.bbox.to_bottom_left_origin(
|
||||||
|
page_no_to_page[element.page_no].size.height
|
||||||
|
).as_tuple()
|
||||||
|
)
|
||||||
|
|
||||||
|
if isinstance(element, TextElement):
|
||||||
|
|
||||||
|
tel = BaseText(
|
||||||
|
text=element.text,
|
||||||
|
obj_type=layout_label_to_ds_type.get(element.label),
|
||||||
|
name=element.label,
|
||||||
|
prov=[
|
||||||
|
Prov(
|
||||||
|
bbox=target_bbox,
|
||||||
|
page=element.page_no + 1,
|
||||||
|
span=[0, len(element.text)],
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
if element.label == DocItemLabel.PAGE_HEADER:
|
||||||
|
index = len(page_headers)
|
||||||
|
ref_str = f"#/page-headers/{index}"
|
||||||
|
main_text.append(
|
||||||
|
Ref(
|
||||||
|
name=element.label,
|
||||||
|
obj_type=layout_label_to_ds_type.get(element.label),
|
||||||
|
ref=ref_str,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
page_headers.append(tel)
|
||||||
|
elif element.label == DocItemLabel.PAGE_FOOTER:
|
||||||
|
index = len(page_footers)
|
||||||
|
ref_str = f"#/page-footers/{index}"
|
||||||
|
main_text.append(
|
||||||
|
Ref(
|
||||||
|
name=element.label,
|
||||||
|
obj_type=layout_label_to_ds_type.get(element.label),
|
||||||
|
ref=ref_str,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
page_footers.append(tel)
|
||||||
|
|
||||||
page_dimensions = [
|
page_dimensions = [
|
||||||
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
||||||
for p in conv_res.pages
|
for p in conv_res.pages
|
||||||
@ -252,6 +307,8 @@ class GlmModel:
|
|||||||
tables=tables,
|
tables=tables,
|
||||||
figures=figures,
|
figures=figures,
|
||||||
page_dimensions=page_dimensions,
|
page_dimensions=page_dimensions,
|
||||||
|
page_headers=page_headers,
|
||||||
|
page_footers=page_footers,
|
||||||
)
|
)
|
||||||
|
|
||||||
return ds_doc
|
return ds_doc
|
||||||
@ -264,6 +321,7 @@ class GlmModel:
|
|||||||
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||||
|
|
||||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||||
|
1 == 1
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
||||||
|
@ -304,6 +304,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|||||||
current_list = None
|
current_list = None
|
||||||
|
|
||||||
doc.add_heading(text=text, prov=prov)
|
doc.add_heading(text=text, prov=prov)
|
||||||
|
elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
|
||||||
|
current_list = None
|
||||||
|
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel(name_label),
|
||||||
|
text=text,
|
||||||
|
prov=prov,
|
||||||
|
parent=doc.furniture,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
current_list = None
|
current_list = None
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user