mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: add Docling JSON ingestion (#783)
* feat: add Docling JSON ingestion Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * update conversion as per review comments, add tests, revert Docling JSON disambiguation, document intricacies Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * Update docling/backend/json/docling_json_backend.py Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
58
tests/test_backend_docling_json.py
Normal file
58
tests/test_backend_docling_json.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Test methods in module docling.backend.json.docling_json_backend.py."""
|
||||
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DoclingDocument, InputDocument
|
||||
|
||||
# Docling JSON document used by the valid-ingestion test both as the backend
# input and as the expected result. The path is relative, so it presumably
# resolves against the repository root when pytest runs -- TODO confirm.
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/2206.01062.json")
|
||||
|
||||
|
||||
def test_convert_valid_docling_json():
    """Convert the ground-truth Docling JSON and compare with a direct load.

    The backend is built on top of ``GT_PATH``; it must report itself as
    valid, and the dict exported from the converted document must equal the
    dict exported from ``DoclingDocument.load_from_json`` on the same file.
    """
    backend_cls = DoclingJSONBackend
    source = GT_PATH
    input_document = InputDocument(
        path_or_stream=source,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
    )
    backend = backend_cls(in_doc=input_document, path_or_stream=source)
    assert backend.is_valid()

    # Actual side: what the backend produces from the file.
    actual = backend.convert().export_to_dict()

    # Expected side: the same file loaded directly through the data model.
    expected = DoclingDocument.load_from_json(GT_PATH).export_to_dict()

    assert actual == expected
|
||||
|
||||
|
||||
def test_invalid_docling_json():
    """Feed an empty JSON object to the backend and expect rejection.

    The payload ``{}`` is supplied as an in-memory stream. The backend must
    report itself as not valid, and calling ``convert()`` must raise a
    pydantic ``ValidationError``.
    """
    backend_cls = DoclingJSONBackend
    stream = BytesIO(b"{}")
    input_document = InputDocument(
        path_or_stream=stream,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
        filename="foo",
    )
    backend = backend_cls(in_doc=input_document, path_or_stream=stream)

    assert not backend.is_valid()

    with pytest.raises(ValidationError):
        backend.convert()
|
||||
Reference in New Issue
Block a user