feat: add save and load for conversion result (#2648)

* feat: added save_as_json and load_from_json to ConversionResult

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added a test

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the save and load for ConversionResult

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformatted the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the signature

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* refactored load/save into ConversionAssets

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added the DoclingVersion class

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* renamed time_stamp to timestamp

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-11-20 12:45:26 +01:00
committed by GitHub
parent 6fb9a5f98a
commit b559813b9b
4 changed files with 245 additions and 17 deletions

View File

@@ -59,7 +59,7 @@ from docling.datamodel.base_models import (
InputFormat, InputFormat,
OutputFormat, OutputFormat,
) )
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult, DoclingVersion
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AsrPipelineOptions, AsrPipelineOptions,
ConvertPipelineOptions, ConvertPipelineOptions,
@@ -168,19 +168,13 @@ def logo_callback(value: bool):
def version_callback(value: bool): def version_callback(value: bool):
if value: if value:
docling_version = importlib.metadata.version("docling") v = DoclingVersion()
docling_core_version = importlib.metadata.version("docling-core") print(f"Docling version: {v.docling_version}")
docling_ibm_models_version = importlib.metadata.version("docling-ibm-models") print(f"Docling Core version: {v.docling_core_version}")
docling_parse_version = importlib.metadata.version("docling-parse") print(f"Docling IBM Models version: {v.docling_ibm_models_version}")
platform_str = platform.platform() print(f"Docling Parse version: {v.docling_parse_version}")
py_impl_version = sys.implementation.cache_tag print(f"Python: {v.py_impl_version} ({v.py_lang_version})")
py_lang_version = platform.python_version() print(f"Platform: {v.platform_str}")
print(f"Docling version: {docling_version}")
print(f"Docling Core version: {docling_core_version}")
print(f"Docling IBM Models version: {docling_ibm_models_version}")
print(f"Docling Parse version: {docling_parse_version}")
print(f"Python: {py_impl_version} ({py_lang_version})")
print(f"Platform: {platform_str}")
raise typer.Exit() raise typer.Exit()

View File

@@ -24,6 +24,7 @@ from pydantic import (
FieldSerializationInfo, FieldSerializationInfo,
computed_field, computed_field,
field_serializer, field_serializer,
field_validator,
) )
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -403,6 +404,18 @@ class PageConfidenceScores(BaseModel):
table_score: ScoreValue = np.nan table_score: ScoreValue = np.nan
ocr_score: ScoreValue = np.nan ocr_score: ScoreValue = np.nan
# Pre-validation hook: turn null-like inputs (None, or textual placeholders
# such as "NaN"/"null"/"none"/empty string) into np.nan before parsing.
@field_validator(
    "parse_score", "layout_score", "table_score", "ocr_score", mode="before"
)
@classmethod
def _coerce_none_or_nan_str(cls, v):
    """Map None and null-like strings to np.nan; pass all other values through."""
    null_like = {"nan", "null", "none", ""}
    if v is None or (isinstance(v, str) and v.strip().lower() in null_like):
        return np.nan
    return v
def _score_to_grade(self, score: ScoreValue) -> QualityGrade: def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
if score < 0.5: if score < 0.5:
return QualityGrade.POOR return QualityGrade.POOR

View File

@@ -1,8 +1,14 @@
import csv import csv
import importlib
import json
import logging import logging
import platform
import re import re
import sys
import tarfile import tarfile
import zipfile
from collections.abc import Iterable, Mapping from collections.abc import Iterable, Mapping
from datetime import datetime
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
@@ -223,14 +229,25 @@ class DocumentFormat(str, Enum):
V1 = "v1" V1 = "v1"
class ConversionResult(BaseModel): class DoclingVersion(BaseModel):
input: InputDocument docling_version: str = importlib.metadata.version("docling")
docling_core_version: str = importlib.metadata.version("docling-core")
docling_ibm_models_version: str = importlib.metadata.version("docling-ibm-models")
docling_parse_version: str = importlib.metadata.version("docling-parse")
platform_str: str = platform.platform()
py_impl_version: str = sys.implementation.cache_tag
py_lang_version: str = platform.python_version()
class ConversionAssets(BaseModel):
version: DoclingVersion = DoclingVersion()
# When the assets were saved (ISO string from datetime.now())
timestamp: Optional[str] = None
status: ConversionStatus = ConversionStatus.PENDING # failure, success status: ConversionStatus = ConversionStatus.PENDING # failure, success
errors: list[ErrorItem] = [] # structure to keep errors errors: list[ErrorItem] = [] # structure to keep errors
pages: list[Page] = [] pages: list[Page] = []
assembled: AssembledUnit = AssembledUnit()
timings: dict[str, ProfilingItem] = {} timings: dict[str, ProfilingItem] = {}
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport) confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
@@ -241,6 +258,166 @@ class ConversionResult(BaseModel):
def legacy_document(self): def legacy_document(self):
return docling_document_to_legacy(self.document) return docling_document_to_legacy(self.document)
def save(
    self,
    *,
    filename: Union[str, Path],
    indent: Optional[int] = 2,
):
    """Persist the ConversionAssets as a ZIP archive of JSON files.

    Each component (version, timestamp, status, errors, pages, timings,
    confidence, document) is written to its own ``*.json`` entry inside
    the archive. Also updates ``self.timestamp`` with the save time.

    Args:
        filename: Destination path of the ZIP archive.
        indent: JSON indentation level, or None for compact output.
    """
    if isinstance(filename, str):
        filename = Path(filename)

    def to_jsonable(obj):
        # Best-effort conversion of pydantic models, enums and containers
        # into JSON-serializable data.
        try:
            # pydantic v2 models
            if hasattr(obj, "model_dump"):
                return obj.model_dump(mode="json")  # type: ignore[attr-defined]
        except TypeError:
            # some models may not accept the mode argument
            return obj.model_dump()  # type: ignore[attr-defined]
        # enums (Enum is already imported at module level)
        if isinstance(obj, Enum):
            return obj.value
        # containers
        if isinstance(obj, list):
            return [to_jsonable(x) for x in obj]
        if isinstance(obj, dict):
            return {k: to_jsonable(v) for k, v in obj.items()}
        # passthrough primitives
        return obj

    # Build the archive fully in memory, then write it to disk in one go.
    buf = BytesIO()
    with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:

        def write_json(name: str, payload) -> None:
            data = json.dumps(
                to_jsonable(payload), ensure_ascii=False, indent=indent
            )
            zf.writestr(name, data.encode("utf-8"))

        # Update and persist a save timestamp
        # NOTE(review): naive local time — confirm whether UTC is required.
        self.timestamp = datetime.now().isoformat()
        write_json("timestamp.json", self.timestamp)

        # Store each component in its own JSON file
        write_json("version.json", self.version)
        write_json("status.json", self.status)
        write_json("errors.json", self.errors)
        write_json("pages.json", self.pages)
        write_json("timings.json", self.timings)
        write_json("confidence.json", self.confidence)

        # For the document, ensure stable schema via export_to_dict
        doc_dict = self.document.export_to_dict()
        zf.writestr(
            "document.json",
            json.dumps(doc_dict, ensure_ascii=False, indent=indent).encode("utf-8"),
        )

    # exist_ok=True makes a separate existence check unnecessary
    filename.parent.mkdir(parents=True, exist_ok=True)
    filename.write_bytes(buf.getvalue())
@classmethod
def load(cls, filename: Union[str, Path]) -> "ConversionAssets":
    """Load a ConversionAssets."""
    if isinstance(filename, str):
        filename = Path(filename)

    # Defaults used whenever an archive entry is missing or invalid.
    version_info: DoclingVersion = DoclingVersion()
    timestamp: Optional[str] = None
    status = ConversionStatus.PENDING
    errors: list[ErrorItem] = []
    pages: list[Page] = []
    timings: dict[str, ProfilingItem] = {}
    confidence = ConfidenceReport()
    document: DoclingDocument = _EMPTY_DOCLING_DOC

    with zipfile.ZipFile(filename, mode="r") as zf:

        def read_json(name: str):
            # Missing archive members are reported as None.
            try:
                with zf.open(name, "r") as fp:
                    return json.loads(fp.read().decode("utf-8"))
            except KeyError:
                return None

        # version
        data = read_json("version.json")
        if data is not None:
            try:
                version_info = DoclingVersion.model_validate(data)
            except Exception as exc:
                _log.error(f"Could not read version: {exc}")

        # timestamp
        data = read_json("timestamp.json")
        if isinstance(data, str):
            timestamp = data

        # status
        data = read_json("status.json")
        if data is not None:
            try:
                status = ConversionStatus(data)
            except Exception:
                status = ConversionStatus.PENDING

        # errors
        data = read_json("errors.json")
        if isinstance(data, list):
            errors = [ErrorItem.model_validate(item) for item in data]

        # pages
        data = read_json("pages.json")
        if isinstance(data, list):
            pages = [Page.model_validate(item) for item in data]

        # timings
        data = read_json("timings.json")
        if isinstance(data, dict):
            timings = {k: ProfilingItem.model_validate(v) for k, v in data.items()}

        # confidence
        data = read_json("confidence.json")
        if isinstance(data, dict):
            confidence = ConfidenceReport.model_validate(data)

        # document
        data = read_json("document.json")
        if isinstance(data, dict):
            document = DoclingDocument.model_validate(data)

    return cls(
        version=version_info,
        timestamp=timestamp,
        status=status,
        errors=errors,
        pages=pages,
        timings=timings,
        confidence=confidence,
        document=document,
    )
class ConversionResult(ConversionAssets):
    """Conversion assets plus the originating input document and assembly unit."""

    input: InputDocument
    # default_factory keeps the default per-instance and matches the
    # Field(default_factory=...) style already used for `confidence`
    assembled: AssembledUnit = Field(default_factory=AssembledUnit)
class _DummyBackend(AbstractDocumentBackend): class _DummyBackend(AbstractDocumentBackend):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):

View File

@@ -0,0 +1,44 @@
from io import BytesIO
from pathlib import Path
import pytest
from docling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend,
PyPdfiumPageBackend,
)
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionAssets
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
def test_conversion_result_json_roundtrip_string():
    """Round-trip a ConversionResult through save() / ConversionAssets.load()."""
    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.images_scale = 1.0
    pipeline_options.generate_page_images = False
    pipeline_options.do_table_structure = False
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.generate_parsed_pages = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
            )
        }
    )

    conv_res = doc_converter.convert(pdf_doc)

    fpath: Path = Path("./test-conversion.zip")
    try:
        # save() writes a ZIP archive of JSON assets (it returns None)
        conv_res.save(filename=fpath)

        loaded = ConversionAssets.load(filename=fpath)

        assert loaded.status == conv_res.status
        assert loaded.document.name == conv_res.document.name
    finally:
        # Remove the artifact so repeated test runs start from a clean tree.
        fpath.unlink(missing_ok=True)