mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: add save and load for conversion result (#2648)
* feat: added save_as_json and load_from_json to ConversionResult Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added a test Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the save and load for ConversionResult Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatted the code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the signature Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactored load/save into ConversionAssets Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the DoclingVersion class Signed-off-by: Peter Staar <taa@zurich.ibm.com> * renamed time_stamp to timestamp Signed-off-by: Peter Staar <taa@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
6fb9a5f98a
commit
b559813b9b
@@ -59,7 +59,7 @@ from docling.datamodel.base_models import (
|
||||
InputFormat,
|
||||
OutputFormat,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.document import ConversionResult, DoclingVersion
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AsrPipelineOptions,
|
||||
ConvertPipelineOptions,
|
||||
@@ -168,19 +168,13 @@ def logo_callback(value: bool):
|
||||
|
||||
def version_callback(value: bool):
    """Typer callback for ``--version``: print version/platform info and exit.

    Args:
        value: True when the flag was passed; False means "do nothing".

    Raises:
        typer.Exit: always, after printing, when ``value`` is truthy.
    """
    if value:
        # All component versions and runtime details are collected by the
        # shared DoclingVersion model so the CLI and saved assets agree.
        v = DoclingVersion()
        print(f"Docling version: {v.docling_version}")
        print(f"Docling Core version: {v.docling_core_version}")
        print(f"Docling IBM Models version: {v.docling_ibm_models_version}")
        print(f"Docling Parse version: {v.docling_parse_version}")
        print(f"Python: {v.py_impl_version} ({v.py_lang_version})")
        print(f"Platform: {v.platform_str}")
        raise typer.Exit()
|
||||
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ from pydantic import (
|
||||
FieldSerializationInfo,
|
||||
computed_field,
|
||||
field_serializer,
|
||||
field_validator,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -403,6 +404,18 @@ class PageConfidenceScores(BaseModel):
|
||||
table_score: ScoreValue = np.nan
|
||||
ocr_score: ScoreValue = np.nan
|
||||
|
||||
# Accept null/None or string "NaN" values on input and coerce to np.nan
@field_validator(
    "parse_score", "layout_score", "table_score", "ocr_score", mode="before"
)
@classmethod
def _coerce_none_or_nan_str(cls, v):
    """Pre-validator: map None and NaN-like strings to np.nan, pass the rest through."""
    nan_markers = {"nan", "null", "none", ""}
    is_nan_string = isinstance(v, str) and v.strip().lower() in nan_markers
    if v is None or is_nan_string:
        return np.nan
    return v
|
||||
|
||||
def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
|
||||
if score < 0.5:
|
||||
return QualityGrade.POOR
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
import csv
import importlib
import importlib.metadata
import json
import logging
import platform
import re
import sys
import tarfile
import zipfile
from collections.abc import Iterable, Mapping
from datetime import datetime
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
|
||||
@@ -223,14 +229,25 @@ class DocumentFormat(str, Enum):
|
||||
V1 = "v1"
|
||||
|
||||
|
||||
class ConversionResult(BaseModel):
|
||||
input: InputDocument
|
||||
def _distribution_version(name: str) -> str:
    """Best-effort lookup of an installed distribution's version.

    Returns "unknown" instead of raising when the distribution metadata is
    missing (e.g. partial or editable installs), so that importing this
    module can never fail on a version lookup.
    """
    try:
        return importlib.metadata.version(name)
    except importlib.metadata.PackageNotFoundError:
        return "unknown"


class DoclingVersion(BaseModel):
    """Versions of the docling stack plus the Python/platform runtime.

    NOTE: the defaults below are evaluated once, when this module is first
    imported — they describe the running environment, not a saved archive.
    """

    docling_version: str = _distribution_version("docling")
    docling_core_version: str = _distribution_version("docling-core")
    docling_ibm_models_version: str = _distribution_version("docling-ibm-models")
    docling_parse_version: str = _distribution_version("docling-parse")
    platform_str: str = platform.platform()
    py_impl_version: str = sys.implementation.cache_tag
    py_lang_version: str = platform.python_version()
|
||||
|
||||
|
||||
class ConversionAssets(BaseModel):
|
||||
version: DoclingVersion = DoclingVersion()
|
||||
# When the assets were saved (ISO string from datetime.now())
|
||||
timestamp: Optional[str] = None
|
||||
|
||||
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
||||
errors: list[ErrorItem] = [] # structure to keep errors
|
||||
|
||||
pages: list[Page] = []
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
timings: dict[str, ProfilingItem] = {}
|
||||
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
||||
|
||||
@@ -241,6 +258,166 @@ class ConversionResult(BaseModel):
|
||||
def legacy_document(self):
|
||||
return docling_document_to_legacy(self.document)
|
||||
|
||||
def save(
    self,
    *,
    filename: Union[str, Path],
    indent: Optional[int] = 2,
):
    """Serialize the full ConversionAssets to a ZIP archive of JSON files.

    Each component (version, status, errors, pages, timings, confidence,
    document) is written to its own ``*.json`` entry inside the archive.

    Args:
        filename: Destination path of the ZIP file; parent directories are
            created as needed.
        indent: Indentation passed to ``json.dumps`` for every entry.

    Side effects:
        Updates ``self.timestamp`` to ``datetime.now().isoformat()`` before
        writing, so the archive records when it was saved.
    """
    if isinstance(filename, str):
        filename = Path(filename)

    def to_jsonable(obj):
        # Recursively convert pydantic models, enums and containers into
        # plain JSON-serializable values; primitives pass through unchanged.
        if hasattr(obj, "model_dump"):
            try:
                # pydantic v2 models
                return obj.model_dump(mode="json")  # type: ignore[attr-defined]
            except TypeError:
                # some models may not accept the mode argument
                return obj.model_dump()  # type: ignore[attr-defined]

        # enums (Enum is imported at module level)
        if isinstance(obj, Enum):
            return obj.value

        # containers
        if isinstance(obj, list):
            return [to_jsonable(x) for x in obj]
        if isinstance(obj, dict):
            return {k: to_jsonable(v) for k, v in obj.items()}

        # passthrough primitives
        return obj

    # Build the archive fully in memory, then persist it in one write.
    buf = BytesIO()
    with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:

        def write_json(name: str, payload) -> None:
            data = json.dumps(
                to_jsonable(payload), ensure_ascii=False, indent=indent
            )
            zf.writestr(name, data.encode("utf-8"))

        # Update and persist a save timestamp
        self.timestamp = datetime.now().isoformat()
        write_json("timestamp.json", self.timestamp)

        # Store each component in its own JSON file
        write_json("version.json", self.version)
        write_json("status.json", self.status)
        write_json("errors.json", self.errors)
        write_json("pages.json", self.pages)
        write_json("timings.json", self.timings)
        write_json("confidence.json", self.confidence)
        # For the document, ensure stable schema via export_to_dict
        doc_dict = self.document.export_to_dict()
        zf.writestr(
            "document.json",
            json.dumps(doc_dict, ensure_ascii=False, indent=indent).encode("utf-8"),
        )

    # Persist the ZIP to disk; mkdir(exist_ok=True) is idempotent, so no
    # existence pre-check is needed.
    filename.parent.mkdir(parents=True, exist_ok=True)
    filename.write_bytes(buf.getvalue())
|
||||
|
||||
@classmethod
def load(cls, filename: Union[str, Path]) -> "ConversionAssets":
    """Load a ConversionAssets.

    Reads the ZIP archive written by ``save``. Deserialization is
    best-effort: entries that are missing, malformed, or fail validation
    fall back to the field's default value instead of raising, so a
    partially written archive still loads.
    """
    if isinstance(filename, str):
        filename = Path(filename)

    # Read the ZIP and deserialize all items.
    # Defaults used whenever an entry is absent or cannot be validated.
    version_info: DoclingVersion = DoclingVersion()
    timestamp: Optional[str] = None
    status = ConversionStatus.PENDING
    errors: list[ErrorItem] = []
    pages: list[Page] = []
    timings: dict[str, ProfilingItem] = {}
    confidence = ConfidenceReport()
    document: DoclingDocument = _EMPTY_DOCLING_DOC

    with zipfile.ZipFile(filename, mode="r") as zf:

        def read_json(name: str):
            # Parsed JSON payload of the entry, or None when the archive
            # has no entry with this name (zipfile raises KeyError).
            try:
                with zf.open(name, "r") as fp:
                    return json.loads(fp.read().decode("utf-8"))
            except KeyError:
                return None

        # version: keep the current-environment default on validation failure
        if (data := read_json("version.json")) is not None:
            try:
                version_info = DoclingVersion.model_validate(data)
            except Exception as exc:
                _log.error(f"Could not read version: {exc}")

        # timestamp: only accepted when stored as a plain string
        if (data := read_json("timestamp.json")) is not None:
            if isinstance(data, str):
                timestamp = data

        # status: unknown values fall back to PENDING
        if (data := read_json("status.json")) is not None:
            try:
                status = ConversionStatus(data)
            except Exception:
                status = ConversionStatus.PENDING

        # errors
        if (data := read_json("errors.json")) is not None and isinstance(
            data, list
        ):
            errors = [ErrorItem.model_validate(item) for item in data]

        # pages
        if (data := read_json("pages.json")) is not None and isinstance(data, list):
            pages = [Page.model_validate(item) for item in data]

        # timings
        if (data := read_json("timings.json")) is not None and isinstance(
            data, dict
        ):
            timings = {k: ProfilingItem.model_validate(v) for k, v in data.items()}

        # confidence
        if (data := read_json("confidence.json")) is not None and isinstance(
            data, dict
        ):
            confidence = ConfidenceReport.model_validate(data)

        # document
        if (data := read_json("document.json")) is not None and isinstance(
            data, dict
        ):
            document = DoclingDocument.model_validate(data)

    return cls(
        version=version_info,
        timestamp=timestamp,
        status=status,
        errors=errors,
        pages=pages,
        timings=timings,
        confidence=confidence,
        document=document,
    )
|
||||
|
||||
|
||||
class ConversionResult(ConversionAssets):
|
||||
input: InputDocument
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
|
||||
|
||||
class _DummyBackend(AbstractDocumentBackend):
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
||||
44
tests/test_conversion_result_json.py
Normal file
44
tests/test_conversion_result_json.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.pypdfium2_backend import (
|
||||
PyPdfiumDocumentBackend,
|
||||
PyPdfiumPageBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionAssets
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def test_conversion_result_json_roundtrip_string():
    """Round-trip a ConversionResult through ConversionAssets.save/load.

    Converts a sample PDF with a minimal pipeline, saves the result to a
    ZIP archive, reloads it, and checks that status and document identity
    survive the round trip. The archive is removed afterwards.
    """
    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

    # Keep the pipeline as cheap as possible: no OCR, no table structure,
    # no page images — only parsed pages, which save() serializes.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.images_scale = 1.0
    pipeline_options.generate_page_images = False
    pipeline_options.do_table_structure = False
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.generate_parsed_pages = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
            )
        }
    )
    conv_res = doc_converter.convert(pdf_doc)

    fpath: Path = Path("./test-conversion.zip")
    try:
        # save() writes the archive in place and returns None; filename is
        # a required keyword argument.
        conv_res.save(filename=fpath)
        assert fpath.exists() and fpath.stat().st_size > 0

        loaded = ConversionAssets.load(filename=fpath)

        assert loaded.status == conv_res.status
        assert loaded.document.name == conv_res.document.name
    finally:
        # Do not leave the archive behind in the working directory.
        fpath.unlink(missing_ok=True)
|
||||
Reference in New Issue
Block a user