fix: WIP to fix the glm_utils.to_docling_document() and add a unit test

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2025-07-26 03:55:00 +00:00 · 2025-01-09 14:59:39 +01:00 · 2025-01-09 14:59:39 +01:00 · 0ee690b5af
commit 0ee690b5af
parent 9a94b54f6c
4 changed files with 88 additions and 1 deletion
--- a/docling/utils/glm_utils.py
+++ b/docling/utils/glm_utils.py
@@ -165,7 +165,7 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

-        elif ptype == "table":
+        elif ptype in ["table", "table-of-contents"]:
            current_list = None
            text = ""
            caption_refs = []
--- a/tests/data/utils/01030000000016.json
+++ b/tests/data/utils/01030000000016.json
@@ -0,0 +1 @@
+{"_s3_data": {}, "applied_models": [], "body": [{"$ref": "#/texts/0"}, {"$ref": "#/texts/1"}], "conversion_settings": {}, "description": {"logs": []}, "dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#", "figures": [], "file-info": {"#-pages": 1, "document-hash": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63", "filename": "01030000000016.pdf", "page-hashes": [{"hash": "bd1d3243c5eb9572db6f790807ca26d917289b839e3b0f12bc4a8a7e99bbc895", "model": "default", "page": 1}]}, "footnotes": [], "meta": [{"$ref": "#/other/0"}, {"$ref": "#/page-footers/0"}], "model-application": {"message": "success", "success": true}, "other": [{"dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#/meta/2", "orig": "Table of contents", "prov": [{"$ref": "#/page-elements/2"}], "sref": "#/other/0", "subj_hash": 3945645157480977532, "text": "Table of contents", "text_hash": 10753538639672363281, "type": "table-of-contents"}], "page-dimensions": [{"height": 663.3070068359375, "page": 1, "width": 442.20501708984375}], "page-elements": [{"bbox": [152.0500030517578, 547.81103515625, 290.02099609375, 567.8159790039062], "iref": "#/texts/0", "name": "paragraph", "orig-order": 1, "page": 1, "span": [0, 17], "sref": "#/page-elements/0", "text-order": 0, "type": "paragraph"}, {"bbox": [56.69300079345703, 511.9410095214844, 117.2550048828125, 523.9440307617188], "iref": "#/texts/1", "name": "Text", "orig-order": 0, "page": 1, "span": [0, 12], "sref": "#/page-elements/1", "text-order": 1, "type": "paragraph"}, {"bbox": [56.37486267089844, 82.59698486328125, 386.371337890625, 521.0339965820312], "iref": "#/other/0", "name": "Document Index", "orig-order": 2, "page": 1, "span": [0, 0], "sref": "#/page-elements/2", "text-order": 2, "type": "table-of-contents"}, {"bbox": [381.4300231933594, 28.700000762939453, 385.4750061035156, 33.72200012207031], "iref": "#/page-footers/0", "name": "Page-footer", "orig-order": 3, "page": 1, "span": [0, 1], 
"sref": "#/page-elements/3", "text-order": 3, "type": "page-footer"}], "page-footers": [{"dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#/page-footers/0", "orig": "5", "prov": [{"$ref": "#/page-elements/3"}], "sref": "#/page-footers/0", "subj_hash": 10924513919643460592, "text": "5", "text_hash": 17767354399704235157, "type": "page-footer"}], "page-headers": [], "payload": null, "sref": "#", "subj_hash": 18446744073709551615, "tables": [], "texts": [{"dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#/texts/0", "orig": "Table of contents", "prov": [{"$ref": "#/page-elements/0"}], "sref": "#/texts/0", "subj_hash": 3945645157480977532, "text": "Table of contents", "text_hash": 10753538639672363281, "type": "paragraph"}, {"dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#/texts/1", "orig": "Introduction", "prov": [{"$ref": "#/page-elements/1"}], "sref": "#/texts/1", "subj_hash": 16581708288081805343, "text": "Introduction", "text_hash": 14273225408978377972, "type": "paragraph"}]}
--- a/tests/data/utils/01030000000016.pdf
+++ b/tests/data/utils/01030000000016.pdf
--- a/tests/test_glm_utils.py
+++ b/tests/test_glm_utils.py
@@ -0,0 +1,86 @@
+import json
+from pathlib import Path
+from typing import List
+
+import pytest
+from deepsearch_glm.andromeda_nlp import nlp_model  # type: ignore
+from docling_core.types.doc import DocItemLabel
+from docling_core.utils.legacy import (
+    doc_item_label_to_legacy_name,
+    docling_document_to_legacy,
+)
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.utils.glm_utils import to_docling_document
+
+
+@pytest.fixture
+def test_glm_paths():
+    """Provide the paths of the stored GLM JSON documents under test."""
+    glm_json = Path("tests/data/utils/01030000000016.json")
+    return [glm_json]
+
+
+def generate_glm_docs(test_glm_paths: List[Path]):
+    r"""
+    Rebuild the stored GLM test documents from their source PDFs.
+
+    This is a dataset-generation helper: call it only to (re)create the test
+    data; it is not needed during regular test runs.
+
+    Every path in ``test_glm_paths`` is the JSON output location; the PDF
+    that gets converted is the same path with a ``.pdf`` suffix.
+    """
+    # The NLP model is created first, before any PDF is converted
+    glm_model = nlp_model(loglevel="error", text_ordering=True)
+
+    # PDF conversion runs with OCR switched off
+    pdf_options = PdfPipelineOptions()
+    pdf_options.do_ocr = False
+    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
+    doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})
+
+    # Derive each source PDF path from its JSON target path
+    source_pdfs = [json_path.with_suffix(".pdf") for json_path in test_glm_paths]
+    conversions = doc_converter.convert_all(source_pdfs, raises_on_error=True)
+
+    # pdf -> DoclingDocument -> legacy document -> GLM document
+    for target_path, conversion in zip(test_glm_paths, conversions):
+        legacy_doc = docling_document_to_legacy(conversion.document)
+        legacy_dict = legacy_doc.model_dump(by_alias=True, exclude_none=True)
+        glm_output = glm_model.apply_on_doc(legacy_dict)
+
+        # Write the GLM document as JSON to its target path
+        with target_path.open("w") as out_file:
+            json.dump(glm_output, out_file)
+
+
+def test_convert_glm_to_docling(test_glm_paths):
+    """Convert every stored GLM document and check that a document comes back."""
+    name_mapping = {doc_item_label_to_legacy_name(v): v.value for v in DocItemLabel}
+    for glm_path in test_glm_paths:
+        # Explicit UTF-8 so parsing does not depend on the platform default encoding
+        with open(glm_path, "r", encoding="utf-8") as fd:
+            glm_doc = json.load(fd)
+        # Map the page_element.name of GLM into the label of docling; names
+        # without a mapping are left unchanged
+        for page_element in glm_doc["page-elements"]:
+            pname = page_element["name"]
+            page_element["name"] = name_mapping.get(pname, pname)
+        doc = to_docling_document(glm_doc)
+        print(doc)  # kept so that running this file as a script shows the result
+        assert doc is not None, f"no document produced for {glm_path}"
+
+
+if __name__ == "__main__":
+    # Script mode: run the conversion check directly, without pytest.
+    script_glm_paths = [
+        Path("tests/data/utils/01030000000016.json"),
+    ]
+
+    # Uncomment to regenerate the stored GLM documents first:
+    # generate_glm_docs(script_glm_paths)
+
+    test_convert_glm_to_docling(script_glm_paths)
				`@ -0,0 +1 @@`
				{"_s3_data": {}, "applied_models": [], "body": [{"$ref": "#/texts/0"}, {"$ref": "#/texts/1"}], "conversion_settings": {}, "description": {"logs": []}, "dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#", "figures": [], "file-info": {"#-pages": 1, "document-hash": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63", "filename": "01030000000016.pdf", "page-hashes": [{"hash": "bd1d3243c5eb9572db6f790807ca26d917289b839e3b0f12bc4a8a7e99bbc895", "model": "default", "page": 1}]}, "footnotes": [], "meta": [{"$ref": "#/other/0"}, {"$ref": "#/page-footers/0"}], "model-application": {"message": "success", "success": true}, "other": [{"dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#/meta/2", "orig": "Table of contents", "prov": [{"$ref": "#/page-elements/2"}], "sref": "#/other/0", "subj_hash": 3945645157480977532, "text": "Table of contents", "text_hash": 10753538639672363281, "type": "table-of-contents"}], "page-dimensions": [{"height": 663.3070068359375, "page": 1, "width": 442.20501708984375}], "page-elements": [{"bbox": [152.0500030517578, 547.81103515625, 290.02099609375, 567.8159790039062], "iref": "#/texts/0", "name": "paragraph", "orig-order": 1, "page": 1, "span": [0, 17], "sref": "#/page-elements/0", "text-order": 0, "type": "paragraph"}, {"bbox": [56.69300079345703, 511.9410095214844, 117.2550048828125, 523.9440307617188], "iref": "#/texts/1", "name": "Text", "orig-order": 0, "page": 1, "span": [0, 12], "sref": "#/page-elements/1", "text-order": 1, "type": "paragraph"}, {"bbox": [56.37486267089844, 82.59698486328125, 386.371337890625, 521.0339965820312], "iref": "#/other/0", "name": "Document Index", "orig-order": 2, "page": 1, "span": [0, 0], "sref": "#/page-elements/2", "text-order": 2, "type": "table-of-contents"}, {"bbox": [381.4300231933594, 28.700000762939453, 385.4750061035156, 33.72200012207031], "iref": "#/page-footers/0", "name": "Page-footer", "orig-order": 3, "page": 1, "span": [0, 1], 
"sref": "#/page-elements/3", "text-order": 3, "type": "page-footer"}], "page-footers": [{"dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#/page-footers/0", "orig": "5", "prov": [{"$ref": "#/page-elements/3"}], "sref": "#/page-footers/0", "subj_hash": 10924513919643460592, "text": "5", "text_hash": 17767354399704235157, "type": "page-footer"}], "page-headers": [], "payload": null, "sref": "#", "subj_hash": 18446744073709551615, "tables": [], "texts": [{"dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#/texts/0", "orig": "Table of contents", "prov": [{"$ref": "#/page-elements/0"}], "sref": "#/texts/0", "subj_hash": 3945645157480977532, "text": "Table of contents", "text_hash": 10753538639672363281, "type": "paragraph"}, {"dloc": "34c43ca2f99dc95e789311626e8e2b3a5afea659112a742b0ca9e0b80a824c63#/texts/1", "orig": "Introduction", "prov": [{"$ref": "#/page-elements/1"}], "sref": "#/texts/1", "subj_hash": 16581708288081805343, "text": "Introduction", "text_hash": 14273225408978377972, "type": "paragraph"}]}