mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
update conversion as per review comments, add tests, revert Docling JSON disambiguation, document intricacies
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
e972c9c60c
commit
9cdb176a8e
@ -16,11 +16,15 @@ class DoclingJSONBackend(DeclarativeDocumentBackend):
|
|||||||
self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
|
self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
self._my_in_doc = in_doc
|
|
||||||
|
# given we need to store any actual conversion exception for raising it from
|
||||||
|
# convert(), this captures the successful result or the actual error in a
|
||||||
|
# mutually exclusive way:
|
||||||
|
self._doc_or_err = self._get_doc_or_err()
|
||||||
|
|
||||||
@override
def is_valid(self) -> bool:
    """Report whether the input was parsed into a Docling document.

    Returns:
        True when the value stored in ``self._doc_or_err`` is a
        ``DoclingDocument``; False when it is anything else (i.e. the
        exception captured while loading the input).
    """
    outcome = self._doc_or_err
    return isinstance(outcome, DoclingDocument)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@override
|
@override
|
||||||
@ -32,15 +36,23 @@ class DoclingJSONBackend(DeclarativeDocumentBackend):
|
|||||||
def supported_formats(cls) -> set[InputFormat]:
|
def supported_formats(cls) -> set[InputFormat]:
|
||||||
return {InputFormat.JSON_DOCLING}
|
return {InputFormat.JSON_DOCLING}
|
||||||
|
|
||||||
|
def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
    """Load the input and validate it as Docling JSON without raising.

    The raw JSON is read from ``self.path_or_stream`` (a file opened as
    UTF-8 text when it is a ``Path``, or the full buffer when it is a
    ``BytesIO``) and handed to ``DoclingDocument.model_validate_json``.

    Returns:
        The validated ``DoclingDocument`` on success; otherwise the exception
        instance that was raised while reading or validating, including the
        ``RuntimeError`` produced for an unsupported ``path_or_stream`` type.
    """
    try:
        raw_json: Union[str, bytes]
        source = self.path_or_stream
        if isinstance(source, Path):
            with open(source, encoding="utf-8") as handle:
                raw_json = handle.read()
        elif isinstance(source, BytesIO):
            raw_json = source.getvalue()
        else:
            raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
        return DoclingDocument.model_validate_json(json_data=raw_json)
    except Exception as err:
        # Deliberately broad: the failure is handed back to the caller as a
        # value instead of propagating from here.
        return err
|
||||||
|
|
||||||
@override
def convert(self) -> DoclingDocument:
    """Return the parsed document, or raise the error captured while loading.

    No parsing happens here: the outcome was computed once by
    ``_get_doc_or_err()`` and stored in ``self._doc_or_err``.

    Returns:
        The ``DoclingDocument`` held in ``self._doc_or_err``.

    Raises:
        Exception: the exact exception instance stored in
            ``self._doc_or_err`` when loading or validation failed.
    """
    # `_doc_or_err` is either the document or the exception, never both, so a
    # single type check decides between returning and re-raising.
    if isinstance(self._doc_or_err, DoclingDocument):
        return self._doc_or_err
    raise self._doc_or_err
|
|
||||||
|
@ -297,7 +297,7 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
mime = mime or "text/plain"
|
mime = mime or "text/plain"
|
||||||
formats = MimeTypeToFormat.get(mime, [])
|
formats = MimeTypeToFormat.get(mime, [])
|
||||||
if formats:
|
if formats:
|
||||||
if len(formats) == 1 and mime not in ("text/plain", "application/json"):
|
if len(formats) == 1 and mime not in ("text/plain"):
|
||||||
return formats[0]
|
return formats[0]
|
||||||
else: # ambiguity in formats
|
else: # ambiguity in formats
|
||||||
return _DocumentConversionInput._guess_from_content(
|
return _DocumentConversionInput._guess_from_content(
|
||||||
@ -339,13 +339,6 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
||||||
input_format = InputFormat.XML_USPTO
|
input_format = InputFormat.XML_USPTO
|
||||||
|
|
||||||
elif mime == "application/json":
|
|
||||||
if (
|
|
||||||
InputFormat.JSON_DOCLING in formats
|
|
||||||
and '"schema_name": "DoclingDocument"' in content_str
|
|
||||||
):
|
|
||||||
input_format = InputFormat.JSON_DOCLING
|
|
||||||
|
|
||||||
return input_format
|
return input_format
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
58
tests/test_backend_docling_json.py
Normal file
58
tests/test_backend_docling_json.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
"""Test methods in module docling.backend.json.docling_json_backend.py."""
|
||||||
|
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import DoclingDocument, InputDocument
|
||||||
|
|
||||||
|
# Docling JSON ground-truth file (path relative to the working directory) that
# the valid-input test feeds to the backend and also loads as the expected result.
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/2206.01062.json")
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_valid_docling_json():
    """Check that a valid Docling JSON file round-trips through the backend."""
    backend_cls = DoclingJSONBackend
    source = GT_PATH
    backend = backend_cls(
        in_doc=InputDocument(
            path_or_stream=source,
            format=InputFormat.JSON_DOCLING,
            backend=backend_cls,
        ),
        path_or_stream=source,
    )
    assert backend.is_valid()

    # The backend's output must serialize to the same dict as the ground truth
    # loaded directly from disk.
    actual = backend.convert().export_to_dict()
    expected = DoclingDocument.load_from_json(GT_PATH).export_to_dict()

    assert actual == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_docling_json():
    """Check that JSON which is not a Docling document is rejected."""
    backend_cls = DoclingJSONBackend
    stream = BytesIO(b"{}")
    backend = backend_cls(
        in_doc=InputDocument(
            path_or_stream=stream,
            format=InputFormat.JSON_DOCLING,
            backend=backend_cls,
            filename="foo",
        ),
        path_or_stream=stream,
    )

    # Construction must not raise; the failure shows up via is_valid() ...
    assert not backend.is_valid()

    # ... and the validation error itself is raised on conversion.
    with pytest.raises(ValidationError):
        backend.convert()
|
@ -124,6 +124,25 @@ def test_guess_format(tmp_path):
|
|||||||
doc_path.write_text("xyz", encoding="utf-8")
|
doc_path.write_text("xyz", encoding="utf-8")
|
||||||
assert dci._guess_format(doc_path) == None
|
assert dci._guess_format(doc_path) == None
|
||||||
|
|
||||||
|
# Valid Docling JSON
|
||||||
|
test_str = '{"name": ""}'
|
||||||
|
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
|
||||||
|
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
|
||||||
|
doc_path = temp_dir / "test.json"
|
||||||
|
doc_path.write_text(test_str, encoding="utf-8")
|
||||||
|
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING
|
||||||
|
|
||||||
|
# Non-Docling JSON
|
||||||
|
# TODO: Docling JSON is currently the single supported JSON flavor and the pipeline
|
||||||
|
# will try to validate *any* JSON (based on suffix/MIME) as Docling JSON; proper
|
||||||
|
# disambiguation is to be handled as part of https://github.com/DS4SD/docling/issues/802
|
||||||
|
test_str = "{}"
|
||||||
|
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
|
||||||
|
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
|
||||||
|
doc_path = temp_dir / "test.json"
|
||||||
|
doc_path.write_text(test_str, encoding="utf-8")
|
||||||
|
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING
|
||||||
|
|
||||||
|
|
||||||
def _make_input_doc(path):
|
def _make_input_doc(path):
|
||||||
in_doc = InputDocument(
|
in_doc = InputDocument(
|
||||||
|
Loading…
Reference in New Issue
Block a user