diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py
index 3ba54b86..5a42feac 100644
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@@ -4,7 +4,6 @@ from pathlib import Path
 from typing import List, Union
 
 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import to_docling_document
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
@@ -35,6 +34,7 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
 from docling.datamodel.settings import settings
+from docling.utils.glm_utils import to_docling_document
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import create_hash
 
@@ -225,6 +225,7 @@ class GlmModel:
             elif isinstance(element, ContainerElement):
                 main_text.append(
                     BaseText(
+                        text="",
                         payload={
                             "children": TypeAdapter(List[Cluster]).dump_python(
                                 element.cluster.children
@@ -263,7 +264,7 @@ class GlmModel:
     def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
         with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
             ds_doc = self._to_legacy_document(conv_res)
-            ds_doc_dict = ds_doc.model_dump(by_alias=True)
+            ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
 
             glm_doc = self.model.apply_on_doc(ds_doc_dict)
 
diff --git a/docling/utils/glm_utils.py b/docling/utils/glm_utils.py
new file mode 100644
index 00000000..13681017
--- /dev/null
+++ b/docling/utils/glm_utils.py
@@ -0,0 +1,336 @@
+import re
+from pathlib import Path
+from typing import List
+
+import pandas as pd
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    ProvenanceItem,
+    Size,
+    TableCell,
+    TableData,
+)
+
+
+def resolve_item(paths, obj):
+    """Find item in document from a reference path"""
+
+    if len(paths) == 0:
+        return obj
+
+    if paths[0] == "#":
+        return resolve_item(paths[1:], obj)
+
+    try:
+        key = int(paths[0])
+    except:
+        key = paths[0]
+
+    if len(paths) == 1:
+        if isinstance(key, str) and key in obj:
+            return obj[key]
+        elif isinstance(key, int) and key < len(obj):
+            return obj[key]
+        else:
+            return None
+
+    elif len(paths) > 1:
+        if isinstance(key, str) and key in obj:
+            return resolve_item(paths[1:], obj[key])
+        elif isinstance(key, int) and key < len(obj):
+            return resolve_item(paths[1:], obj[key])
+        else:
+            return None
+
+    else:
+        return None
+
+
+def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
+    unique_objects = []
+    seen_spans = set()
+
+    for sublist in grid:
+        for obj in sublist:
+            # Convert the spans list to a tuple of tuples for hashing
+            spans_tuple = tuple(tuple(span) for span in obj["spans"])
+            if spans_tuple not in seen_spans:
+                seen_spans.add(spans_tuple)
+                unique_objects.append(obj)
+
+    return unique_objects
+
+
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+    origin = DocumentOrigin(
+        mimetype="application/pdf",
+        filename=doc_glm["file-info"]["filename"],
+        binary_hash=doc_glm["file-info"]["document-hash"],
+    )
+    doc_name = Path(origin.filename).stem
+
+    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
+
+    for page_dim in doc_glm["page-dimensions"]:
+        page_no = int(page_dim["page"])
+        size = Size(width=page_dim["width"], height=page_dim["height"])
+
+        doc.add_page(page_no=page_no, size=size)
+
+    if "properties" in doc_glm:
+        props = pd.DataFrame(
+            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
+        )
+    else:
+        props = pd.DataFrame()
+
+    current_list = None
+
+    for ix, pelem in enumerate(doc_glm["page-elements"]):
+        ptype = pelem["type"]
+        span_i = pelem["span"][0]
+        span_j = pelem["span"][1]
+
+        if "iref" not in pelem:
+            # print(json.dumps(pelem, indent=2))
+            continue
+
+        iref = pelem["iref"]
+
+        if re.match("#/figures/(\\d+)/captions/(.+)", iref):
+            # print(f"skip {iref}")
+            continue
+
+        if re.match("#/tables/(\\d+)/captions/(.+)", iref):
+            # print(f"skip {iref}")
+            continue
+
+        path = iref.split("/")
+        obj = resolve_item(path, doc_glm)
+
+        if obj is None:
+            current_list = None
+            print(f"warning: undefined {path}")
+            continue
+
+        if ptype == "figure":
+            current_list = None
+            text = ""
+            caption_refs = []
+            for caption in obj["captions"]:
+                text += caption["text"]
+
+                for nprov in caption["prov"]:
+                    npaths = nprov["$ref"].split("/")
+                    nelem = resolve_item(npaths, doc_glm)
+
+                    if nelem is None:
+                        # print(f"warning: undefined caption {npaths}")
+                        continue
+
+                    span_i = nelem["span"][0]
+                    span_j = nelem["span"][1]
+
+                    cap_text = caption["text"][span_i:span_j]
+
+                    # doc_glm["page-elements"].remove(nelem)
+
+                    prov = ProvenanceItem(
+                        page_no=nelem["page"],
+                        charspan=tuple(nelem["span"]),
+                        bbox=BoundingBox.from_tuple(
+                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                        ),
+                    )
+
+                    caption_obj = doc.add_text(
+                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
+                    )
+                    caption_refs.append(caption_obj.get_ref())
+
+            prov = ProvenanceItem(
+                page_no=pelem["page"],
+                charspan=(0, len(text)),
+                bbox=BoundingBox.from_tuple(
+                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+
+            pic = doc.add_picture(prov=prov)
+            pic.captions.extend(caption_refs)
+            _add_child_elements(pic, doc, obj, pelem)
+
+        elif ptype == "table":
+            current_list = None
+            text = ""
+            caption_refs = []
+            for caption in obj["captions"]:
+                text += caption["text"]
+
+                for nprov in caption["prov"]:
+                    npaths = nprov["$ref"].split("/")
+                    nelem = resolve_item(npaths, doc_glm)
+
+                    if nelem is None:
+                        # print(f"warning: undefined caption {npaths}")
+                        continue
+
+                    span_i = nelem["span"][0]
+                    span_j = nelem["span"][1]
+
+                    cap_text = caption["text"][span_i:span_j]
+
+                    # doc_glm["page-elements"].remove(nelem)
+
+                    prov = ProvenanceItem(
+                        page_no=nelem["page"],
+                        charspan=tuple(nelem["span"]),
+                        bbox=BoundingBox.from_tuple(
+                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                        ),
+                    )
+
+                    caption_obj = doc.add_text(
+                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
+                    )
+                    caption_refs.append(caption_obj.get_ref())
+
+            table_cells_glm = _flatten_table_grid(obj["data"])
+
+            table_cells = []
+            for tbl_cell_glm in table_cells_glm:
+                if tbl_cell_glm["bbox"] is not None:
+                    bbox = BoundingBox.from_tuple(
+                        tbl_cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                    )
+                else:
+                    bbox = None
+
+                is_col_header = False
+                is_row_header = False
+                is_row_section = False
+
+                if tbl_cell_glm["type"] == "col_header":
+                    is_col_header = True
+                elif tbl_cell_glm["type"] == "row_header":
+                    is_row_header = True
+                elif tbl_cell_glm["type"] == "row_section":
+                    is_row_section = True
+
+                table_cells.append(
+                    TableCell(
+                        row_span=tbl_cell_glm["row-span"][1]
+                        - tbl_cell_glm["row-span"][0],
+                        col_span=tbl_cell_glm["col-span"][1]
+                        - tbl_cell_glm["col-span"][0],
+                        start_row_offset_idx=tbl_cell_glm["row-span"][0],
+                        end_row_offset_idx=tbl_cell_glm["row-span"][1],
+                        start_col_offset_idx=tbl_cell_glm["col-span"][0],
+                        end_col_offset_idx=tbl_cell_glm["col-span"][1],
+                        text=tbl_cell_glm["text"],
+                        bbox=bbox,
+                        column_header=is_col_header,
+                        row_header=is_row_header,
+                        row_section=is_row_section,
+                    )
+                )
+
+            tbl_data = TableData(
+                num_rows=obj.get("#-rows", 0),
+                num_cols=obj.get("#-cols", 0),
+                table_cells=table_cells,
+            )
+
+            prov = ProvenanceItem(
+                page_no=pelem["page"],
+                charspan=(0, 0),
+                bbox=BoundingBox.from_tuple(
+                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+
+            tbl = doc.add_table(data=tbl_data, prov=prov)
+            tbl.captions.extend(caption_refs)
+
+        elif ptype in ["form", "key_value_region"]:
+            label = DocItemLabel(ptype)
+            container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)
+
+            _add_child_elements(container_el, doc, obj, pelem)
+
+        elif "text" in obj:
+            text = obj["text"][span_i:span_j]
+
+            type_label = pelem["type"]
+            name_label = pelem["name"]
+            if update_name_label and len(props) > 0 and type_label == "paragraph":
+                prop = props[
+                    (props["type"] == "semantic") & (props["subj_path"] == iref)
+                ]
+                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
+                    name_label = prop.iloc[0]["label"]
+
+            prov = ProvenanceItem(
+                page_no=pelem["page"],
+                charspan=(0, len(text)),
+                bbox=BoundingBox.from_tuple(
+                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+            label = DocItemLabel(name_label)
+
+            if label == DocItemLabel.LIST_ITEM:
+                if current_list is None:
+                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")
+
+                # TODO: Infer if this is a numbered or a bullet list item
+                doc.add_list_item(
+                    text=text, enumerated=False, prov=prov, parent=current_list
+                )
+            elif label == DocItemLabel.SECTION_HEADER:
+                current_list = None
+
+                doc.add_heading(text=text, prov=prov)
+            else:
+                current_list = None
+
+                doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)
+
+    return doc
+
+
+def _add_child_elements(container_el, doc, obj, pelem):
+    payload = obj.get("payload")
+    if payload is not None:
+        children = payload.get("children", [])
+
+        for child in children:
+            c_label = DocItemLabel(child["label"])
+            c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
+                doc.pages[pelem["page"]].size.height
+            )
+            c_text = " ".join(
+                [
+                    cell["text"].replace("\x02", "-").strip()
+                    for cell in child["cells"]
+                    if len(cell["text"].strip()) > 0
+                ]
+            )
+
+            c_prov = ProvenanceItem(
+                page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
+            )
+            if c_label == DocItemLabel.LIST_ITEM:
+                # TODO: Infer if this is a numbered or a bullet list item
+                doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
+            elif c_label == DocItemLabel.SECTION_HEADER:
+                doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
+            else:
+                doc.add_text(
+                    parent=container_el, label=c_label, text=c_text, prov=c_prov
+                )
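
For review context: `resolve_item` is a small JSON-pointer-style resolver over the raw GLM dict (handling irefs like `#/tables/0/captions/1`), coercing numeric path segments to list indices and returning `None` rather than raising on dangling references. A minimal sketch of its behavior, using a toy dictionary whose keys are illustrative rather than taken from a real GLM export:

```python
from docling.utils.glm_utils import resolve_item

# Toy stand-in for doc_glm; only the dict/list nesting shape matters here.
doc_glm = {"texts": [{"text": "Hello"}, {"text": "World"}]}

# "#/texts/1" splits into ["#", "texts", "1"]: the leading "#" is skipped,
# "texts" indexes the dict, and "1" is coerced to an int list index.
assert resolve_item("#/texts/1".split("/"), doc_glm) == {"text": "World"}

# Dangling references resolve to None instead of raising, which is why
# to_docling_document can log a warning and continue.
assert resolve_item("#/texts/5".split("/"), doc_glm) is None
```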
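`_flatten_table_grid` deduplicates the row-major cell grid by hashing each cell's `spans`, so a merged cell that occupies several grid positions contributes only one `TableCell` (its row/col extent is then recovered from the `row-span`/`col-span` offsets). A sketch with a hypothetical 1x2 merged cell; the grid below is hand-built for illustration:

```python
from docling.utils.glm_utils import _flatten_table_grid

# Hypothetical grid cell spanning two columns: the same object appears
# once per grid position it covers, with identical spans.
merged = {"spans": [[0, 0], [0, 1]], "text": "merged"}
grid = [[merged, merged]]

cells = _flatten_table_grid(grid)
assert len(cells) == 1  # duplicates collapse to a single cell
```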
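The `ds_glm_model.py` hunks are what make the relocated converter reachable: `BaseText` container stubs now serialize with an explicit empty `text`, and `exclude_none=True` keeps unset optional fields out of the dict handed to the GLM model. A sketch of the resulting call path, mirroring the changed lines in `GlmModel.__call__` (the wrapper function below is illustrative glue, not the actual implementation):

```python
from docling.utils.glm_utils import to_docling_document

def run_glm(ds_doc, nlp_model):  # hypothetical helper, for illustration only
    # Serialize the legacy document; exclude_none=True drops unset optional
    # fields from the payload passed to deepsearch-glm.
    ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
    glm_doc = nlp_model.apply_on_doc(ds_doc_dict)
    # The converter now lives in docling.utils.glm_utils rather than
    # deepsearch_glm.utils.doc_utils.
    return to_docling_document(glm_doc)
```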