# Mirror of https://github.com/DS4SD/docling.git
# Synced 2025-07-26 20:14:47 +00:00
# 87 lines, 2.6 KiB, Python
import json
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
import pytest
|
|
from deepsearch_glm.andromeda_nlp import nlp_model # type: ignore
|
|
from docling_core.types.doc import DocItemLabel
|
|
from docling_core.utils.legacy import (
|
|
doc_item_label_to_legacy_name,
|
|
docling_document_to_legacy,
|
|
)
|
|
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
from docling.utils.glm_utils import to_docling_document
|
|
|
|
|
|
@pytest.fixture
def test_glm_paths() -> List[Path]:
    """Provide the paths of the stored GLM JSON documents used by the tests.

    The paths are relative, so they resolve against the current working
    directory of the test run.
    """
    data_dir = Path("tests/data/utils")
    return [data_dir / "01030000000016.json"]
|
|
|
|
|
|
def generate_glm_docs(test_glm_paths: List[Path]):
    r"""
    Regenerate the GLM JSON documents that serve as test data.

    This is a dataset-building helper; the regular test run does not need it.

    For every target path, the PDF with the same name but a ``.pdf`` suffix is
    converted (OCR disabled), turned into the legacy document format, passed
    through the NLP model, and the resulting GLM document is written as JSON
    to the target path.
    """
    # The NLP model that produces the GLM document from a legacy document dict.
    glm_model = nlp_model(loglevel="error", text_ordering=True)

    # PDF converter with OCR switched off.
    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = False
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
        }
    )

    # Each JSON target is paired with the PDF of the same name.
    source_pdfs = [target.with_suffix(".pdf") for target in test_glm_paths]
    conversions = doc_converter.convert_all(source_pdfs, raises_on_error=True)

    # pdf -> DoclingDocument -> legacy -> glm_doc
    for target, conversion in zip(test_glm_paths, conversions):
        legacy_dict = docling_document_to_legacy(conversion.document).model_dump(
            by_alias=True, exclude_none=True
        )
        glm_doc = glm_model.apply_on_doc(legacy_dict)

        # Persist the GLM document next to its source PDF.
        with target.open("w") as out_file:
            json.dump(glm_doc, out_file)
|
|
|
|
|
|
def test_convert_glm_to_docling(test_glm_paths):
    """Convert each stored GLM document into a docling document.

    Args:
        test_glm_paths: Paths of the GLM JSON files to load and convert
            (supplied by the ``test_glm_paths`` fixture under pytest, or
            passed directly when the module is run as a script).

    For every file, the ``name`` of each entry in ``"page-elements"`` is
    rewritten from its legacy name to the corresponding ``DocItemLabel``
    value when a mapping exists; the document is then handed to
    ``to_docling_document`` and the result is printed. The test passes as
    long as no step raises.
    """
    # legacy name -> DocItemLabel value
    name_mapping = {doc_item_label_to_legacy_name(v): v.value for v in DocItemLabel}

    for glm_path in test_glm_paths:
        # JSON text is UTF-8 by specification; do not rely on the platform's
        # default locale encoding when reading the stored fixture.
        with open(glm_path, "r", encoding="utf-8") as fd:
            glm_doc = json.load(fd)

        # Map the page_element.name of GLM into the label of docling;
        # names without a mapping are left untouched.
        for page_element in glm_doc["page-elements"]:
            pname = page_element["name"]
            if pname in name_mapping:
                page_element["name"] = name_mapping[pname]

        doc = to_docling_document(glm_doc)
        print(doc)
|
|
|
|
|
|
if __name__ == "__main__":
    # To rebuild the stored GLM document, run this instead:
    # generate_glm_docs([Path("tests/data/utils/01030000000016.json")])

    # Run the conversion check directly, without going through pytest.
    test_convert_glm_to_docling([Path("tests/data/utils/01030000000016.json")])
|