mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: Implement new reading-order model (#916)
* Implement new reading-order model, replacing DS GLM model (WIP) Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update reading-order model branch Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update lockfile [skip ci] Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add captions, footnotes and merges [skip ci] Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updates for reading-order implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updates for reading-order implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests and lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes, update tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add normalization, update tests again Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests with code Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Push final lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * sanitize text Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Inlcude furniture, Update tests with furniture Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix content_layer assignment Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: Delete empty file docling/models/ds_glm_model.py Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
@@ -1,386 +0,0 @@
|
||||
import copy
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
from deepsearch_glm.andromeda_nlp import nlp_model
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
CoordOrigin,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
)
|
||||
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
||||
from docling_core.types.legacy_doc.base import (
|
||||
Figure,
|
||||
PageDimensions,
|
||||
PageReference,
|
||||
Prov,
|
||||
Ref,
|
||||
)
|
||||
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
||||
from docling_core.types.legacy_doc.base import TableCell
|
||||
from docling_core.types.legacy_doc.document import BaseText
|
||||
from docling_core.types.legacy_doc.document import (
|
||||
CCSDocumentDescription as DsDocumentDescription,
|
||||
)
|
||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
Cluster,
|
||||
ContainerElement,
|
||||
FigureElement,
|
||||
Table,
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.utils.glm_utils import to_docling_document
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
from docling.utils.utils import create_hash
|
||||
|
||||
|
||||
# Options object handed to GlmModel at construction time.
class GlmOptions(BaseModel):
    # pydantic reserves the `model_` prefix by default; clearing
    # `protected_namespaces` lets the `model_names` field below be declared.
    model_config = ConfigDict(protected_namespaces=())

    # Example value: "language;term;reference"
    model_names: str = ""
|
||||
|
||||
|
||||
class GlmModel:
|
||||
def __init__(self, options: GlmOptions):
    """Store ``options`` and instantiate the underlying ``nlp_model``.

    The NLP model is created with ``loglevel="error"`` and
    ``text_ordering=True``.
    """
    self.options = options
    self.model = nlp_model(loglevel="error", text_ordering=True)
|
||||
|
||||
def _to_legacy_document(self, conv_res) -> DsDocument:
    """Convert the assembled conversion result into a legacy ``DsDocument``.

    Body elements are appended to ``main_text`` in their assembled order:
    text and container elements as ``BaseText``, tables and figures as
    ``Ref`` entries pointing into the ``tables`` / ``figures`` lists. Page
    headers and footers follow as ``Ref`` entries pointing into
    ``page_headers`` / ``page_footers``. Cluster bounding boxes are converted
    to a bottom-left origin and page numbers are written as ``page_no + 1``.
    """
    title = ""
    desc: DsDocumentDescription = DsDocumentDescription(logs=[])

    page_hashes = [
        PageReference(
            hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
            page=p.page_no + 1,
            model="default",
        )
        for p in conv_res.pages
    ]

    file_info = DsFileInfoObject(
        filename=conv_res.input.file.name,
        document_hash=conv_res.input.document_hash,
        num_pages=conv_res.input.page_count,
        page_hashes=page_hashes,
    )

    main_text: List[Union[Ref, BaseText]] = []
    page_headers: List[Union[Ref, BaseText]] = []
    page_footers: List[Union[Ref, BaseText]] = []

    tables: List[DsSchemaTable] = []
    figures: List[Figure] = []

    pages_by_no = {p.page_no: p for p in conv_res.pages}

    def ds_type_of(item):
        # Legacy object type for the item's layout label (None if unmapped).
        return layout_label_to_ds_type.get(item.label)

    def legacy_bbox(item):
        # Convert bboxes to lower-left origin.
        height = pages_by_no[item.page_no].size.height
        return DsBoundingBox(
            item.cluster.bbox.to_bottom_left_origin(height).as_tuple()
        )

    def single_prov(item, box, span_end):
        # One-entry provenance list; the span always starts at 0.
        return [Prov(bbox=box, page=item.page_no + 1, span=[0, span_end])]

    def child_payload(item):
        # hack to channel child clusters through GLM
        return {
            "children": TypeAdapter(List[Cluster]).dump_python(
                item.cluster.children
            )
        }

    def cell_kind(cell):
        # Column header wins over row header, which wins over row section.
        if cell.column_header:
            return "col_header"
        if cell.row_header:
            return "row_header"
        if cell.row_section:
            return "row_section"
        return "body"

    def build_table_grid(table):
        n_rows = table.num_rows
        n_cols = table.num_cols

        # Initialise empty table data grid (only empty cells)
        grid = [
            [
                TableCell(
                    text="",
                    spans=[[i, j]],
                    obj_type="body",
                )
                for j in range(n_cols)
            ]
            for i in range(n_rows)
        ]

        # Overwrite cells in table data for which there is actual cell content.
        for cell in table.table_cells:
            # Offsets are clamped to the declared grid size.
            row_range = range(
                min(cell.start_row_offset_idx, n_rows),
                min(cell.end_row_offset_idx, n_rows),
            )
            col_range = range(
                min(cell.start_col_offset_idx, n_cols),
                min(cell.end_col_offset_idx, n_cols),
            )
            for i in row_range:
                for j in col_range:
                    celltype = cell_kind(cell)
                    spans = [[rspan, cspan] for rspan in row_range for cspan in col_range]
                    if cell.bbox is None:
                        bbox = None
                    else:
                        bbox = cell.bbox.to_bottom_left_origin(
                            pages_by_no[table.page_no].size.height
                        ).as_tuple()

                    grid[i][j] = TableCell(
                        text=cell.text,
                        bbox=bbox,
                        spans=spans,
                        obj_type=celltype,
                    )
        return grid

    for element in conv_res.assembled.body:
        target_bbox = legacy_bbox(element)

        if isinstance(element, TextElement):
            main_text.append(
                BaseText(
                    text=element.text,
                    obj_type=ds_type_of(element),
                    name=element.label,
                    prov=single_prov(element, target_bbox, len(element.text)),
                )
            )
        elif isinstance(element, Table):
            index = len(tables)
            ref_str = f"#/tables/{index}"
            main_text.append(
                Ref(
                    name=element.label,
                    obj_type=ds_type_of(element),
                    ref=ref_str,
                ),
            )

            table_data = build_table_grid(element)

            tables.append(
                DsSchemaTable(
                    num_cols=element.num_cols,
                    num_rows=element.num_rows,
                    obj_type=ds_type_of(element),
                    data=table_data,
                    prov=single_prov(element, target_bbox, 0),
                )
            )
        elif isinstance(element, FigureElement):
            index = len(figures)
            ref_str = f"#/figures/{index}"
            main_text.append(
                Ref(
                    name=element.label,
                    obj_type=ds_type_of(element),
                    ref=ref_str,
                ),
            )
            figures.append(
                Figure(
                    prov=single_prov(element, target_bbox, 0),
                    obj_type=ds_type_of(element),
                    payload=child_payload(element),
                )
            )
        elif isinstance(element, ContainerElement):
            main_text.append(
                BaseText(
                    text="",
                    payload=child_payload(element),
                    obj_type=ds_type_of(element),
                    name=element.label,
                    prov=single_prov(element, target_bbox, 0),
                )
            )

    # We can throw in headers and footers at the end of the legacy doc
    # since the reading-order will re-sort it later.
    for element in conv_res.assembled.headers:
        target_bbox = legacy_bbox(element)

        if not isinstance(element, TextElement):
            continue

        tel = BaseText(
            text=element.text,
            obj_type=ds_type_of(element),
            name=element.label,
            prov=single_prov(element, target_bbox, len(element.text)),
        )
        if element.label == DocItemLabel.PAGE_HEADER:
            index = len(page_headers)
            ref_str = f"#/page-headers/{index}"
            main_text.append(
                Ref(
                    name=element.label,
                    obj_type=ds_type_of(element),
                    ref=ref_str,
                ),
            )
            page_headers.append(tel)
        elif element.label == DocItemLabel.PAGE_FOOTER:
            index = len(page_footers)
            ref_str = f"#/page-footers/{index}"
            main_text.append(
                Ref(
                    name=element.label,
                    obj_type=ds_type_of(element),
                    ref=ref_str,
                ),
            )
            page_footers.append(tel)

    # Pages without a size are left out of the dimensions list.
    page_dimensions = [
        PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
        for p in conv_res.pages
        if p.size is not None
    ]

    ds_doc: DsDocument = DsDocument(
        name=title,
        description=desc,
        file_info=file_info,
        main_text=main_text,
        tables=tables,
        figures=figures,
        page_dimensions=page_dimensions,
        page_headers=page_headers,
        page_footers=page_footers,
    )

    return ds_doc
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
    """Run the GLM model over ``conv_res`` and return a ``DoclingDocument``.

    The assembled result is converted to a legacy document, dumped to a dict
    (by alias, without ``None`` values), passed to ``self.model.apply_on_doc``
    and converted back with ``to_docling_document``. The work is wrapped in a
    ``TimeRecorder`` keyed "glm" with document scope.
    """
    with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
        ds_doc = self._to_legacy_document(conv_res)
        ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)

        glm_doc = self.model.apply_on_doc(ds_doc_dict)

        docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental

    # DEBUG code:
    def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
        """Draw the legacy document's element boxes for one page.

        The annotated page image is shown when ``show`` is true, otherwise
        it is saved under ``settings.debug.debug_output_path``.
        """
        # NOTE(review): `prov.page` is written as `page_no + 1` by
        # `_to_legacy_document`, while `conv_res.pages` is indexed with
        # `page_no` directly here -- confirm which convention callers use.
        clusters_to_draw = []
        image = copy.deepcopy(conv_res.pages[page_no].image)
        for ix, elem in enumerate(ds_document.main_text):
            # Entries without a resolvable provenance are not drawn.
            prov = None
            if isinstance(elem, BaseText):
                prov = elem.prov[0]  # type: ignore
            elif isinstance(elem, Ref):
                _, arr, index = elem.ref.split("/")
                index = int(index)  # type: ignore
                if arr == "tables":
                    prov = ds_document.tables[index].prov[0]
                elif arr == "figures":
                    # The legacy document stores figures under `figures`.
                    prov = ds_document.figures[index].prov[0]

            if prov and prov.page == page_no:
                clusters_to_draw.append(
                    Cluster(
                        id=ix,
                        label=elem.name,
                        bbox=BoundingBox.from_tuple(
                            coord=prov.bbox,  # type: ignore
                            origin=CoordOrigin.BOTTOMLEFT,
                        ).to_top_left_origin(conv_res.pages[page_no].size.height),
                    )
                )

        draw = ImageDraw.Draw(image)
        for c in clusters_to_draw:
            x0, y0, x1, y1 = c.bbox.as_tuple()
            draw.rectangle([(x0, y0), (x1, y1)], outline="red")
            draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))

            # One random colour per cluster for its cell outlines.
            cell_color = (
                random.randint(30, 140),
                random.randint(30, 140),
                random.randint(30, 140),
            )
            for tc in c.cells:  # [:1]:
                x0, y0, x1, y1 = tc.bbox.as_tuple()
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)

        if show:
            image.show()
        else:
            out_path: Path = (
                Path(settings.debug.debug_output_path)
                / f"debug_{conv_res.input.file.stem}"
            )
            out_path.mkdir(parents=True, exist_ok=True)

            out_file = out_path / f"doc_page_{page_no:05}.png"
            image.save(str(out_file), format="png")

    # for item in ds_doc.page_dimensions:
    #     page_no = item.page
    #     draw_clusters_and_cells(ds_doc, page_no)

    return docling_doc
|
||||
@@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):
|
||||
|
||||
sanitized_text = "".join(lines)
|
||||
|
||||
# Text normalization
|
||||
sanitized_text = sanitized_text.replace("⁄", "/")
|
||||
sanitized_text = sanitized_text.replace("’", "'")
|
||||
sanitized_text = sanitized_text.replace("‘", "'")
|
||||
sanitized_text = sanitized_text.replace("“", '"')
|
||||
sanitized_text = sanitized_text.replace("”", '"')
|
||||
sanitized_text = sanitized_text.replace("•", "·")
|
||||
|
||||
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
||||
|
||||
def __call__(
|
||||
|
||||
389
docling/models/readingorder_model.py
Normal file
389
docling/models/readingorder_model.py
Normal file
@@ -0,0 +1,389 @@
|
||||
import copy
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
CoordOrigin,
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
NodeItem,
|
||||
ProvenanceItem,
|
||||
RefItem,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
from docling_core.types.legacy_doc.base import Ref
|
||||
from docling_core.types.legacy_doc.document import BaseText
|
||||
from docling_ibm_models.reading_order.reading_order_rb import (
|
||||
PageElement as ReadingOrderPageElement,
|
||||
)
|
||||
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
BasePageElement,
|
||||
Cluster,
|
||||
ContainerElement,
|
||||
FigureElement,
|
||||
Table,
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
|
||||
# Options object handed to ReadingOrderModel at construction time.
class ReadingOrderOptions(BaseModel):
    # pydantic reserves the `model_` prefix by default; clearing
    # `protected_namespaces` lets the `model_names` field below be declared.
    model_config = ConfigDict(protected_namespaces=())

    # Example value: "language;term;reference"
    model_names: str = ""
|
||||
|
||||
|
||||
class ReadingOrderModel:
|
||||
def __init__(self, options: ReadingOrderOptions):
    """Store ``options`` and create the ``ReadingOrderPredictor`` instance."""
    self.options = options
    self.ro_model = ReadingOrderPredictor()
|
||||
|
||||
def _assembled_to_readingorder_elements(
    self, conv_res: ConversionResult
) -> List[ReadingOrderPageElement]:
    """Wrap every assembled element in a ``ReadingOrderPageElement``.

    Each wrapper gets a running ``cid`` (its position in the returned list),
    a ``RefItem`` built from page number and cluster id, the element text
    (empty string when missing) and the cluster bbox converted to a
    bottom-left origin.
    """
    pages_by_no = {p.page_no: p for p in conv_res.pages}
    wrapped: List[ReadingOrderPageElement] = []

    for element in conv_res.assembled.elements:
        page = pages_by_no[element.page_no]
        box = element.cluster.bbox.to_bottom_left_origin(
            page.size.height  # type: ignore
        )

        ro_element = ReadingOrderPageElement(
            cid=len(wrapped),
            ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"),
            text=element.text or "",
            page_no=element.page_no,
            page_size=page.size,
            label=element.label,
            l=box.l,
            r=box.r,
            b=box.b,
            t=box.t,
            coord_origin=box.coord_origin,
        )
        wrapped.append(ro_element)

    return wrapped
|
||||
|
||||
def _add_child_elements(
    self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
):
    """Add the child clusters of ``element`` to ``doc`` under ``doc_item``.

    A child's text is the space-joined text of its non-blank cells, with
    ``"\\x02"`` replaced by ``"-"``. List items and section headers use the
    dedicated ``add_list_item`` / ``add_heading`` calls; any other label is
    added with ``add_text``.
    """
    doc_page_no = element.page_no + 1

    for child in element.cluster.children:
        child_bbox = child.bbox.to_bottom_left_origin(
            doc.pages[doc_page_no].size.height
        )

        fragments = []
        for cell in child.cells:
            if len(cell.text.strip()) > 0:
                fragments.append(cell.text.replace("\x02", "-").strip())
        child_text = " ".join(fragments)

        child_prov = ProvenanceItem(
            page_no=doc_page_no, charspan=(0, len(child_text)), bbox=child_bbox
        )

        child_label = child.label
        if child_label == DocItemLabel.LIST_ITEM:
            # TODO: Infer if this is a numbered or a bullet list item
            doc.add_list_item(parent=doc_item, text=child_text, prov=child_prov)
            continue
        if child_label == DocItemLabel.SECTION_HEADER:
            doc.add_heading(parent=doc_item, text=child_text, prov=child_prov)
            continue
        doc.add_text(
            parent=doc_item, label=child_label, text=child_text, prov=child_prov
        )
|
||||
|
||||
def _readingorder_elements_to_docling_doc(
    self,
    conv_res: ConversionResult,
    ro_elements: List[ReadingOrderPageElement],
    el_to_captions_mapping: Dict[int, List[int]],
    el_to_footnotes_mapping: Dict[int, List[int]],
    el_merges_mapping: Dict[int, List[int]],
) -> DoclingDocument:
    """Build the output ``DoclingDocument`` from reading-ordered elements.

    Args:
        conv_res: Conversion result providing pages, input metadata and the
            assembled elements.
        ro_elements: Elements in the order they should be emitted.
        el_to_captions_mapping: Element cid -> cids of its captions.
        el_to_footnotes_mapping: Element cid -> cids of its footnotes.
        el_merges_mapping: Element cid -> cids of elements merged into it.

    Returns:
        A document with one page per entry of ``conv_res.pages`` and one item
        per element. Elements that appear as a caption, footnote or merge
        target are not emitted on their own; they are attached to (or merged
        into) their owning element.
    """
    id_to_elem = {
        RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
        for elem in conv_res.assembled.elements
    }
    cid_to_rels = {rel.cid: rel for rel in ro_elements}

    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=conv_res.input.file.name,
        binary_hash=conv_res.input.document_hash,
    )
    doc_name = Path(origin.filename).stem
    out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    for page in conv_res.pages:
        page_no = page.page_no + 1
        size = page.size

        assert size is not None

        out_doc.add_page(page_no=page_no, size=size)

    def elem_for_cid(cid):
        # Resolve a reading-order cid back to the assembled page element.
        return id_to_elem[cid_to_rels[cid].ref.cref]

    def attach_captions_and_footnotes(cid, item, page_height):
        # Shared by code, table and picture items: captions first, then
        # footnotes, each added as a child text item and referenced from
        # the owning item.
        for caption_cid in el_to_captions_mapping.get(cid, []):
            new_cap_item = self._add_caption_or_footnote(
                elem_for_cid(caption_cid), out_doc, item, page_height
            )
            item.captions.append(new_cap_item.get_ref())

        for footnote_cid in el_to_footnotes_mapping.get(cid, []):
            new_footnote_item = self._add_caption_or_footnote(
                elem_for_cid(footnote_cid), out_doc, item, page_height
            )
            item.footnotes.append(new_footnote_item.get_ref())

    current_list = None
    # Every cid that is the target of a caption/footnote/merge relation is
    # emitted through its owner and skipped in the main loop.
    skippable_cids = {
        cid
        for mapping in (
            el_to_captions_mapping,
            el_to_footnotes_mapping,
            el_merges_mapping,
        )
        for lst in mapping.values()
        for cid in lst
    }

    page_no_to_pages = {p.page_no: p for p in conv_res.pages}

    for rel in ro_elements:
        if rel.cid in skippable_cids:
            continue
        element = id_to_elem[rel.ref.cref]

        page_height = page_no_to_pages[element.page_no].size.height  # type: ignore

        if isinstance(element, TextElement):
            if element.label == DocItemLabel.CODE:
                cap_text = element.text
                prov = ProvenanceItem(
                    page_no=element.page_no + 1,
                    charspan=(0, len(cap_text)),
                    bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
                )
                code_item = out_doc.add_code(text=cap_text, prov=prov)

                attach_captions_and_footnotes(rel.cid, code_item, page_height)
            else:
                new_item, current_list = self._handle_text_element(
                    element, out_doc, current_list, page_height
                )

                for merged_cid in el_merges_mapping.get(rel.cid, []):
                    self._merge_elements(
                        element, elem_for_cid(merged_cid), new_item, page_height
                    )

        elif isinstance(element, Table):
            tbl_data = TableData(
                num_rows=element.num_rows,
                num_cols=element.num_cols,
                table_cells=element.table_cells,
            )

            prov = ProvenanceItem(
                page_no=element.page_no + 1,
                charspan=(0, 0),
                bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
            )

            tbl = out_doc.add_table(
                data=tbl_data, prov=prov, label=element.cluster.label
            )

            attach_captions_and_footnotes(rel.cid, tbl, page_height)

            # TODO: Consider adding children of Table.

        elif isinstance(element, FigureElement):
            cap_text = ""
            prov = ProvenanceItem(
                page_no=element.page_no + 1,
                charspan=(0, len(cap_text)),
                bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
            )
            pic = out_doc.add_picture(prov=prov)

            attach_captions_and_footnotes(rel.cid, pic, page_height)

            self._add_child_elements(element, pic, out_doc)

        elif isinstance(element, ContainerElement):  # Form, KV region
            label = element.label
            group_label = GroupLabel.UNSPECIFIED
            if label == DocItemLabel.FORM:
                group_label = GroupLabel.FORM_AREA
            elif label == DocItemLabel.KEY_VALUE_REGION:
                group_label = GroupLabel.KEY_VALUE_AREA

            container_el = out_doc.add_group(label=group_label)

            self._add_child_elements(element, container_el, out_doc)

    return out_doc
|
||||
|
||||
def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
    """Add ``elem`` to ``out_doc`` as a text item below ``parent``.

    ``elem`` must be a ``TextElement``. Its label and text are reused as-is
    and its cluster bbox is converted to a bottom-left origin with
    ``page_height``. Returns the newly created item.
    """
    assert isinstance(elem, TextElement)

    content = elem.text
    bottom_left_bbox = elem.cluster.bbox.to_bottom_left_origin(page_height)
    provenance = ProvenanceItem(
        page_no=elem.page_no + 1,
        charspan=(0, len(content)),
        bbox=bottom_left_bbox,
    )
    return out_doc.add_text(
        label=elem.label, text=content, prov=provenance, parent=parent
    )
|
||||
|
||||
def _handle_text_element(self, element, out_doc, current_list, page_height):
    """Add one text element to ``out_doc`` and track the open list group.

    List items are appended to ``current_list``, creating a new list group
    when none is open. Any other label closes the list: section headers
    become headings, formulas are added with empty ``text`` and the source
    text in ``orig``, and everything else is added as plain text (page
    headers and footers on the furniture content layer).

    Returns:
        ``(new_item, current_list)`` where ``current_list`` is the list
        group still open after this element, or ``None``.
    """
    content = element.text
    provenance = ProvenanceItem(
        page_no=element.page_no + 1,
        charspan=(0, len(content)),
        bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
    )
    label = element.label

    if label == DocItemLabel.LIST_ITEM:
        if current_list is None:
            current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")

        # TODO: Infer if this is a numbered or a bullet list item
        list_item = out_doc.add_list_item(
            text=content, enumerated=False, prov=provenance, parent=current_list
        )
        return list_item, current_list

    # Every non-list label ends the running list group.
    if label == DocItemLabel.SECTION_HEADER:
        new_item = out_doc.add_heading(text=content, prov=provenance)
    elif label == DocItemLabel.FORMULA:
        new_item = out_doc.add_text(
            label=DocItemLabel.FORMULA, text="", orig=content, prov=provenance
        )
    else:
        is_furniture = element.label in [
            DocItemLabel.PAGE_HEADER,
            DocItemLabel.PAGE_FOOTER,
        ]
        content_layer = ContentLayer.FURNITURE if is_furniture else ContentLayer.BODY

        new_item = out_doc.add_text(
            label=element.label,
            text=content,
            prov=provenance,
            content_layer=content_layer,
        )
    return new_item, None
|
||||
|
||||
def _merge_elements(self, element, merged_elem, new_item, page_height):
    """Append ``merged_elem`` to the document item created for ``element``.

    The merged text is appended to ``new_item.text`` / ``new_item.orig``
    after a single space, and a provenance entry describing where the merged
    text came from is added to ``new_item.prov``.

    Args:
        element: Page element that ``new_item`` was created from.
        merged_elem: Page element whose text continues ``element``.
        new_item: Document item to extend in place.
        page_height: Page height used for the bbox origin conversion.

    Raises:
        AssertionError: If ``merged_elem`` is not of the same type as
            ``element`` or its label differs from ``new_item.label``.
    """
    assert isinstance(
        merged_elem, type(element)
    ), "Merged element must be of same type as element."
    assert (
        merged_elem.label == new_item.label
    ), "Labels of merged elements must match."

    # The charspan covers the appended text, i.e. it starts after the
    # existing text plus the joining space.
    span_start = len(new_item.text) + 1
    # The provenance must point at the merged chunk itself (its page and its
    # bbox), not repeat the location of the first element.
    # NOTE(review): `page_height` is supplied by the caller; if the merged
    # element sits on a page of a different height the converted bbox is
    # off -- confirm against the caller.
    prov = ProvenanceItem(
        page_no=merged_elem.page_no + 1,
        charspan=(span_start, span_start + len(merged_elem.text)),
        bbox=merged_elem.cluster.bbox.to_bottom_left_origin(page_height),
    )
    new_item.text += f" {merged_elem.text}"
    new_item.orig += f" {merged_elem.text}"  # TODO: This is incomplete, we don't have the `orig` field of the merged element.
    new_item.prov.append(prov)
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
    """Apply the reading-order predictor to ``conv_res``.

    The assembled elements are ordered by the predictor, caption, footnote
    and merge relations are predicted on the ordered list, and the result is
    turned into a ``DoclingDocument``. The work is wrapped in a
    ``TimeRecorder`` keyed "glm" with document scope.
    """
    with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
        predictor = self.ro_model

        # Apply reading order
        ordered = predictor.predict_reading_order(
            page_elements=self._assembled_to_readingorder_elements(conv_res)
        )

        # Relations are all predicted on the ordered element list.
        captions_by_cid = predictor.predict_to_captions(sorted_elements=ordered)
        footnotes_by_cid = predictor.predict_to_footnotes(sorted_elements=ordered)
        merges_by_cid = predictor.predict_merges(sorted_elements=ordered)

        docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
            conv_res,
            ordered,
            captions_by_cid,
            footnotes_by_cid,
            merges_by_cid,
        )

    return docling_doc
|
||||
@@ -27,7 +27,6 @@ from docling.models.document_picture_classifier import (
|
||||
DocumentPictureClassifier,
|
||||
DocumentPictureClassifierOptions,
|
||||
)
|
||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.ocr_mac_model import OcrMacModel
|
||||
@@ -40,6 +39,7 @@ from docling.models.picture_description_api_model import PictureDescriptionApiMo
|
||||
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
||||
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
@@ -76,7 +76,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
or self.pipeline_options.generate_table_images
|
||||
)
|
||||
|
||||
self.glm_model = GlmModel(options=GlmOptions())
|
||||
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||
|
||||
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
|
||||
raise RuntimeError(
|
||||
|
||||
Reference in New Issue
Block a user