docling/tests/test_glm_utils.py

import json
from pathlib import Path
from typing import List

import pytest
from deepsearch_glm.andromeda_nlp import nlp_model  # type: ignore
from docling_core.types.doc import DocItemLabel
from docling_core.utils.legacy import (
    doc_item_label_to_legacy_name,
    docling_document_to_legacy,
)

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.glm_utils import to_docling_document


@pytest.fixture
def test_glm_paths():
    return [
        Path("tests/data/utils/01030000000016.json"),
    ]


def generate_glm_docs(test_glm_paths: List[Path]):
    r"""
    Call this method only to generate the test dataset.
    No need to call this method during the regular testing.

    Run NLP model and convert PDF into GLM documents
    """
    # Initialize the NLP model
    model = nlp_model(loglevel="error", text_ordering=True)

    # Create the document converter
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    pdf_paths = [p.with_suffix(".pdf") for p in test_glm_paths]
    res = converter.convert_all(pdf_paths, raises_on_error=True)

    # convert pdf -> DoclingDocument -> legacy -> glm_doc
    for glm_path, conv_res in zip(test_glm_paths, res):
        doc = conv_res.document
        legacy_doc = docling_document_to_legacy(doc)
        legacy_doc_dict = legacy_doc.model_dump(by_alias=True, exclude_none=True)
        glm_doc = model.apply_on_doc(legacy_doc_dict)

        # Save the glm doc
        with open(glm_path, "w") as fd:
            json.dump(glm_doc, fd)


def test_convert_glm_to_docling(test_glm_paths):
    name_mapping = {doc_item_label_to_legacy_name(v): v.value for v in DocItemLabel}

    for glm_path in test_glm_paths:
        with open(glm_path, "r") as fd:
            glm_doc = json.load(fd)

        # Map the page_element.name of GLM into the label of docling
        for page_element in glm_doc["page-elements"]:
            pname = page_element["name"]
            if pname in name_mapping:
                page_element["name"] = name_mapping[pname]

        doc = to_docling_document(glm_doc)
        print(doc)


if __name__ == "__main__":
    # generate_glm_docs([
    #     Path("tests/data/utils/01030000000016.json"),
    # ])

    test_convert_glm_to_docling(
        [
            Path("tests/data/utils/01030000000016.json"),
        ]
    )