From b559813b9becf7950bc539c1334e55ef17bed2ad Mon Sep 17 00:00:00 2001 From: "Peter W. J. Staar" <91719829+PeterStaar-IBM@users.noreply.github.com> Date: Thu, 20 Nov 2025 12:45:26 +0100 Subject: [PATCH] feat: add save and load for conversion result (#2648) * feat: added save_as_json and load_from_json to ConversionResult Signed-off-by: Peter Staar * added a test Signed-off-by: Peter Staar * fixed the save and load for ConversionResult Signed-off-by: Peter Staar * reformatted the code Signed-off-by: Peter Staar * fixed the signature Signed-off-by: Peter Staar * refactored load/save into ConversionAssets Signed-off-by: Peter Staar * added the DoclingVersion class Signed-off-by: Peter Staar * renamed time_stamp to timestamp Signed-off-by: Peter Staar --------- Signed-off-by: Peter Staar --- docling/cli/main.py | 22 ++-- docling/datamodel/base_models.py | 13 ++ docling/datamodel/document.py | 183 ++++++++++++++++++++++++++- tests/test_conversion_result_json.py | 44 +++++++ 4 files changed, 245 insertions(+), 17 deletions(-) create mode 100644 tests/test_conversion_result_json.py diff --git a/docling/cli/main.py b/docling/cli/main.py index 9607d2f4..f7444f3d 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -59,7 +59,7 @@ from docling.datamodel.base_models import ( InputFormat, OutputFormat, ) -from docling.datamodel.document import ConversionResult +from docling.datamodel.document import ConversionResult, DoclingVersion from docling.datamodel.pipeline_options import ( AsrPipelineOptions, ConvertPipelineOptions, @@ -168,19 +168,13 @@ def logo_callback(value: bool): def version_callback(value: bool): if value: - docling_version = importlib.metadata.version("docling") - docling_core_version = importlib.metadata.version("docling-core") - docling_ibm_models_version = importlib.metadata.version("docling-ibm-models") - docling_parse_version = importlib.metadata.version("docling-parse") - platform_str = platform.platform() - py_impl_version = sys.implementation.cache_tag - py_lang_version = platform.python_version() - print(f"Docling version: {docling_version}") - print(f"Docling Core version: {docling_core_version}") - print(f"Docling IBM Models version: {docling_ibm_models_version}") - print(f"Docling Parse version: {docling_parse_version}") - print(f"Python: {py_impl_version} ({py_lang_version})") - print(f"Platform: {platform_str}") + v = DoclingVersion() + print(f"Docling version: {v.docling_version}") + print(f"Docling Core version: {v.docling_core_version}") + print(f"Docling IBM Models version: {v.docling_ibm_models_version}") + print(f"Docling Parse version: {v.docling_parse_version}") + print(f"Python: {v.py_impl_version} ({v.py_lang_version})") + print(f"Platform: {v.platform_str}") raise typer.Exit() diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 3b50589b..61156f61 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -24,6 +24,7 @@ from pydantic import ( FieldSerializationInfo, computed_field, field_serializer, + field_validator, ) if TYPE_CHECKING: @@ -403,6 +404,18 @@ class PageConfidenceScores(BaseModel): table_score: ScoreValue = np.nan ocr_score: ScoreValue = np.nan + # Accept null/None or string "NaN" values on input and coerce to np.nan + @field_validator( + "parse_score", "layout_score", "table_score", "ocr_score", mode="before" + ) + @classmethod + def _coerce_none_or_nan_str(cls, v): + if v is None: + return np.nan + if isinstance(v, str) and v.strip().lower() in {"nan", "null", "none", ""}: + 
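+            # a JSON round-trip can render np.nan as null or the literal string "NaN"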
+            return np.nan
+        return v
+
     def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
         if score < 0.5:
             return QualityGrade.POOR
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index ec717941..c4784b68 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -1,8 +1,14 @@
 import csv
+import importlib
+import json
 import logging
+import platform
 import re
+import sys
 import tarfile
+import zipfile
 from collections.abc import Iterable, Mapping
+from datetime import datetime
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
@@ -223,14 +229,25 @@ class DocumentFormat(str, Enum):
     V1 = "v1"
 
 
-class ConversionResult(BaseModel):
-    input: InputDocument
+class DoclingVersion(BaseModel):
+    docling_version: str = importlib.metadata.version("docling")
+    docling_core_version: str = importlib.metadata.version("docling-core")
+    docling_ibm_models_version: str = importlib.metadata.version("docling-ibm-models")
+    docling_parse_version: str = importlib.metadata.version("docling-parse")
+    platform_str: str = platform.platform()
+    py_impl_version: str = sys.implementation.cache_tag
+    py_lang_version: str = platform.python_version()
+
+
+class ConversionAssets(BaseModel):
+    version: DoclingVersion = DoclingVersion()
+    # When the assets were saved (ISO string from datetime.now())
+    timestamp: Optional[str] = None
     status: ConversionStatus = ConversionStatus.PENDING  # failure, success
     errors: list[ErrorItem] = []  # structure to keep errors
     pages: list[Page] = []
-    assembled: AssembledUnit = AssembledUnit()
     timings: dict[str, ProfilingItem] = {}
     confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
@@ -241,6 +258,166 @@ def legacy_document(self):
         return docling_document_to_legacy(self.document)
 
+    def save(
+        self,
+        *,
+        filename: Union[str, Path],
+        indent: Optional[int] = 2,
+    ):
+        """Serialize the ConversionAssets to a ZIP archive of JSON files."""
+        if isinstance(filename, str):
+            filename = Path(filename)
+        # Build an in-memory ZIP archive containing JSON for each asset
+        buf = BytesIO()
+
+        def to_jsonable(obj):
+            try:
+                # pydantic v2 models
+                if hasattr(obj, "model_dump"):
+                    return obj.model_dump(mode="json")  # type: ignore[attr-defined]
+            except TypeError:
+                # some models may not accept mode argument
+                return obj.model_dump()  # type: ignore[attr-defined]
+
+            # enums
+            try:
+                from enum import Enum
+
+                if isinstance(obj, Enum):
+                    return obj.value
+            except Exception:
+                pass
+
+            # containers
+            if isinstance(obj, list):
+                return [to_jsonable(x) for x in obj]
+            if isinstance(obj, dict):
+                return {k: to_jsonable(v) for k, v in obj.items()}
+
+            # passthrough primitives
+            return obj
+
+        with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
+
+            def write_json(name: str, payload) -> None:
+                data = json.dumps(
+                    to_jsonable(payload), ensure_ascii=False, indent=indent
+                )
+                zf.writestr(name, data.encode("utf-8"))
+
+            # Update and persist a save timestamp
+            self.timestamp = datetime.now().isoformat()
+            write_json("timestamp.json", self.timestamp)
+
+            # Store each component in its own JSON file
+            write_json("version.json", self.version)
+            write_json("status.json", self.status)
+            write_json("errors.json", self.errors)
+            write_json("pages.json", self.pages)
+            write_json("timings.json", self.timings)
+            write_json("confidence.json", self.confidence)
+            # For the document, ensure stable schema via export_to_dict
+            doc_dict = self.document.export_to_dict()
+            zf.writestr(
+                "document.json",
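+                # doc_dict is already JSON-serializable, so plain json.dumps is enough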
+                json.dumps(doc_dict, ensure_ascii=False, indent=indent).encode("utf-8"),
+            )
+
+        # Persist the ZIP to disk
+        buf.seek(0)
+        if filename.parent and not filename.parent.exists():
+            filename.parent.mkdir(parents=True, exist_ok=True)
+        with filename.open("wb") as f:
+            f.write(buf.getvalue())
+
+    @classmethod
+    def load(cls, filename: Union[str, Path]) -> "ConversionAssets":
+        """Load a ConversionAssets from a ZIP archive written by save()."""
+        if isinstance(filename, str):
+            filename = Path(filename)
+
+        # Read the ZIP and deserialize all items
+        version_info: DoclingVersion = DoclingVersion()
+        timestamp: Optional[str] = None
+        status = ConversionStatus.PENDING
+        errors: list[ErrorItem] = []
+        pages: list[Page] = []
+        timings: dict[str, ProfilingItem] = {}
+        confidence = ConfidenceReport()
+        document: DoclingDocument = _EMPTY_DOCLING_DOC
+
+        with zipfile.ZipFile(filename, mode="r") as zf:
+
+            def read_json(name: str):
+                try:
+                    with zf.open(name, "r") as fp:
+                        return json.loads(fp.read().decode("utf-8"))
+                except KeyError:
+                    return None
+
+            # version
+            if (data := read_json("version.json")) is not None:
+                try:
+                    version_info = DoclingVersion.model_validate(data)
+                except Exception as exc:
+                    _log.error(f"Could not read version: {exc}")
+
+            # timestamp
+            if (data := read_json("timestamp.json")) is not None:
+                if isinstance(data, str):
+                    timestamp = data
+
+            # status
+            if (data := read_json("status.json")) is not None:
+                try:
+                    status = ConversionStatus(data)
+                except Exception:
+                    status = ConversionStatus.PENDING
+
+            # errors
+            if (data := read_json("errors.json")) is not None and isinstance(
+                data, list
+            ):
+                errors = [ErrorItem.model_validate(item) for item in data]
+
+            # pages
+            if (data := read_json("pages.json")) is not None and isinstance(data, list):
+                pages = [Page.model_validate(item) for item in data]
+
+            # timings
+            if (data := read_json("timings.json")) is not None and isinstance(
+                data, dict
+            ):
+                timings = {k: ProfilingItem.model_validate(v) for k, v in data.items()}
+
+            # confidence
+            if (data := read_json("confidence.json")) is not None and isinstance(
+                data, dict
+            ):
+                confidence = ConfidenceReport.model_validate(data)
+
+            # document
+            if (data := read_json("document.json")) is not None and isinstance(
+                data, dict
+            ):
+                document = DoclingDocument.model_validate(data)
+
+        return cls(
+            version=version_info,
+            timestamp=timestamp,
+            status=status,
+            errors=errors,
+            pages=pages,
+            timings=timings,
+            confidence=confidence,
+            document=document,
+        )
+
+
+class ConversionResult(ConversionAssets):
+    input: InputDocument
+    assembled: AssembledUnit = AssembledUnit()
+
 
 class _DummyBackend(AbstractDocumentBackend):
     def __init__(self, *args, **kwargs):
diff --git a/tests/test_conversion_result_json.py b/tests/test_conversion_result_json.py
new file mode 100644
index 00000000..1f93870c
--- /dev/null
+++ b/tests/test_conversion_result_json.py
@@ -0,0 +1,44 @@
+from io import BytesIO
+from pathlib import Path
+
+import pytest
+
+from docling.backend.pypdfium2_backend import (
+    PyPdfiumDocumentBackend,
+    PyPdfiumPageBackend,
+)
+from docling.datamodel.base_models import ConversionStatus, InputFormat
+from docling.datamodel.document import ConversionAssets
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def test_conversion_result_zip_roundtrip():
+    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
+
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_ocr = False
+    pipeline_options.images_scale = 1.0
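+    # keep the pipeline minimal so the round-trip test stays fast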
+    pipeline_options.generate_page_images = False
+    pipeline_options.do_table_structure = False
+    pipeline_options.table_structure_options.do_cell_matching = True
+    pipeline_options.generate_parsed_pages = True
+
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
+            )
+        }
+    )
+    conv_res = doc_converter.convert(pdf_doc)
+
+    fpath: Path = Path("./test-conversion.zip")
+
+    conv_res.save(filename=fpath)
+    assert fpath.exists() and fpath.stat().st_size > 0
+
+    loaded = ConversionAssets.load(filename=fpath)
+
+    assert loaded.status == conv_res.status
+    assert loaded.document.name == conv_res.document.name
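
A minimal round-trip sketch of the new API, following the save()/load() signatures introduced above. The paths "report.pdf" and "conversion.zip" are placeholders, not part of the patch:

```python
from docling.datamodel.document import ConversionAssets
from docling.document_converter import DocumentConverter

conv_res = DocumentConverter().convert("report.pdf")

# save() writes a ZIP archive with one JSON file per asset:
# version, timestamp, status, errors, pages, timings, confidence, document
conv_res.save(filename="conversion.zip")

# load() restores a ConversionAssets; pipeline-only fields such as
# `input` and `assembled` belong to ConversionResult and are not persisted
loaded = ConversionAssets.load(filename="conversion.zip")
print(loaded.version.docling_version, loaded.timestamp)
print(loaded.status, loaded.document.name)
```

ConversionResult inherits save() from ConversionAssets, so results coming straight out of DocumentConverter can be persisted directly, while load() always returns the persistence-oriented ConversionAssets base class.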