mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: add save and load for conversion result (#2648)
* feat: added save_as_json and load_from_json to ConversionResult
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* added a test
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fixed the save and load for ConversionResult
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* reformatted the code
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fixed the signature
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* refactored load/save into ConversionAssets
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* added the DoclingVersion class
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* renamed time_stamp to timestamp
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---------
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
6fb9a5f98a
commit
b559813b9b
@@ -59,7 +59,7 @@ from docling.datamodel.base_models import (
|
|||||||
InputFormat,
|
InputFormat,
|
||||||
OutputFormat,
|
OutputFormat,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult, DoclingVersion
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AsrPipelineOptions,
|
AsrPipelineOptions,
|
||||||
ConvertPipelineOptions,
|
ConvertPipelineOptions,
|
||||||
@@ -168,19 +168,13 @@ def logo_callback(value: bool):
|
|||||||
|
|
||||||
def version_callback(value: bool):
|
def version_callback(value: bool):
|
||||||
if value:
|
if value:
|
||||||
docling_version = importlib.metadata.version("docling")
|
v = DoclingVersion()
|
||||||
docling_core_version = importlib.metadata.version("docling-core")
|
print(f"Docling version: {v.docling_version}")
|
||||||
docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
|
print(f"Docling Core version: {v.docling_core_version}")
|
||||||
docling_parse_version = importlib.metadata.version("docling-parse")
|
print(f"Docling IBM Models version: {v.docling_ibm_models_version}")
|
||||||
platform_str = platform.platform()
|
print(f"Docling Parse version: {v.docling_parse_version}")
|
||||||
py_impl_version = sys.implementation.cache_tag
|
print(f"Python: {v.py_impl_version} ({v.py_lang_version})")
|
||||||
py_lang_version = platform.python_version()
|
print(f"Platform: {v.platform_str}")
|
||||||
print(f"Docling version: {docling_version}")
|
|
||||||
print(f"Docling Core version: {docling_core_version}")
|
|
||||||
print(f"Docling IBM Models version: {docling_ibm_models_version}")
|
|
||||||
print(f"Docling Parse version: {docling_parse_version}")
|
|
||||||
print(f"Python: {py_impl_version} ({py_lang_version})")
|
|
||||||
print(f"Platform: {platform_str}")
|
|
||||||
raise typer.Exit()
|
raise typer.Exit()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ from pydantic import (
|
|||||||
FieldSerializationInfo,
|
FieldSerializationInfo,
|
||||||
computed_field,
|
computed_field,
|
||||||
field_serializer,
|
field_serializer,
|
||||||
|
field_validator,
|
||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -403,6 +404,18 @@ class PageConfidenceScores(BaseModel):
|
|||||||
table_score: ScoreValue = np.nan
|
table_score: ScoreValue = np.nan
|
||||||
ocr_score: ScoreValue = np.nan
|
ocr_score: ScoreValue = np.nan
|
||||||
|
|
||||||
|
# Accept null/None or string "NaN" values on input and coerce to np.nan
|
||||||
|
@field_validator(
    "parse_score", "layout_score", "table_score", "ocr_score", mode="before"
)
@classmethod
def _coerce_none_or_nan_str(cls, v):
    """Map missing-value markers to ``np.nan`` before field validation.

    ``None`` and strings that, once stripped and lower-cased, equal
    ``"nan"``, ``"null"``, ``"none"`` or the empty string are replaced by
    ``np.nan``. Every other value is returned unchanged.
    """
    missing_markers = {"nan", "null", "none", ""}
    is_missing = v is None or (
        isinstance(v, str) and v.strip().lower() in missing_markers
    )
    return np.nan if is_missing else v
|
||||||
|
|
||||||
def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
|
def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
|
||||||
if score < 0.5:
|
if score < 0.5:
|
||||||
return QualityGrade.POOR
|
return QualityGrade.POOR
|
||||||
|
|||||||
@@ -1,8 +1,14 @@
|
|||||||
import csv
|
import csv
|
||||||
|
import importlib
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import platform
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
import tarfile
|
import tarfile
|
||||||
|
import zipfile
|
||||||
from collections.abc import Iterable, Mapping
|
from collections.abc import Iterable, Mapping
|
||||||
|
from datetime import datetime
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
@@ -223,14 +229,25 @@ class DocumentFormat(str, Enum):
|
|||||||
V1 = "v1"
|
V1 = "v1"
|
||||||
|
|
||||||
|
|
||||||
class ConversionResult(BaseModel):
|
class DoclingVersion(BaseModel):
|
||||||
input: InputDocument
|
docling_version: str = importlib.metadata.version("docling")
|
||||||
|
docling_core_version: str = importlib.metadata.version("docling-core")
|
||||||
|
docling_ibm_models_version: str = importlib.metadata.version("docling-ibm-models")
|
||||||
|
docling_parse_version: str = importlib.metadata.version("docling-parse")
|
||||||
|
platform_str: str = platform.platform()
|
||||||
|
py_impl_version: str = sys.implementation.cache_tag
|
||||||
|
py_lang_version: str = platform.python_version()
|
||||||
|
|
||||||
|
|
||||||
|
class ConversionAssets(BaseModel):
|
||||||
|
version: DoclingVersion = DoclingVersion()
|
||||||
|
# When the assets were saved (ISO string from datetime.now())
|
||||||
|
timestamp: Optional[str] = None
|
||||||
|
|
||||||
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
||||||
errors: list[ErrorItem] = [] # structure to keep errors
|
errors: list[ErrorItem] = [] # structure to keep errors
|
||||||
|
|
||||||
pages: list[Page] = []
|
pages: list[Page] = []
|
||||||
assembled: AssembledUnit = AssembledUnit()
|
|
||||||
timings: dict[str, ProfilingItem] = {}
|
timings: dict[str, ProfilingItem] = {}
|
||||||
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
||||||
|
|
||||||
@@ -241,6 +258,166 @@ class ConversionResult(BaseModel):
|
|||||||
def legacy_document(self):
|
def legacy_document(self):
|
||||||
return docling_document_to_legacy(self.document)
|
return docling_document_to_legacy(self.document)
|
||||||
|
|
||||||
|
def save(
    self,
    *,
    filename: Union[str, Path],
    indent: Optional[int] = 2,
) -> None:
    """Write the conversion assets to a ZIP archive of JSON files.

    The archive contains one UTF-8 encoded JSON member per component, in
    this order: ``timestamp.json``, ``version.json``, ``status.json``,
    ``errors.json``, ``pages.json``, ``timings.json``, ``confidence.json``
    and ``document.json``.

    Side effect: ``self.timestamp`` is overwritten with
    ``datetime.now().isoformat()`` at the moment of saving.

    Args:
        filename: Destination path of the archive. Missing parent
            directories are created.
        indent: Indentation forwarded to ``json.dumps``.
    """
    filename = Path(filename)

    def to_jsonable(obj):
        """Recursively turn models, enums and containers into JSON-ready values."""
        if hasattr(obj, "model_dump"):
            try:
                return obj.model_dump(mode="json")  # type: ignore[attr-defined]
            except TypeError:
                # Fallback for objects whose model_dump takes no ``mode`` argument.
                return obj.model_dump()  # type: ignore[attr-defined]
        if isinstance(obj, Enum):
            return obj.value
        if isinstance(obj, (list, tuple)):
            return [to_jsonable(item) for item in obj]
        if isinstance(obj, dict):
            return {key: to_jsonable(value) for key, value in obj.items()}
        # Anything else is passed through for json.dumps to handle.
        return obj

    # Record the save time first so the archived value and the attribute agree.
    self.timestamp = datetime.now().isoformat()

    # Insertion order defines the member order inside the archive.
    components = {
        "timestamp.json": self.timestamp,
        "version.json": self.version,
        "status.json": self.status,
        "errors.json": self.errors,
        "pages.json": self.pages,
        "timings.json": self.timings,
        "confidence.json": self.confidence,
    }

    # Build the archive in memory so that a serialization error does not
    # leave a partially written file on disk.
    buf = BytesIO()
    with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:

        def write_json(name: str, jsonable) -> None:
            data = json.dumps(jsonable, ensure_ascii=False, indent=indent)
            zf.writestr(name, data.encode("utf-8"))

        for name, payload in components.items():
            write_json(name, to_jsonable(payload))

        # The document is serialized via its own export_to_dict().
        write_json("document.json", self.document.export_to_dict())

    filename.parent.mkdir(parents=True, exist_ok=True)
    filename.write_bytes(buf.getvalue())
|
||||||
|
|
||||||
|
@classmethod
def load(cls, filename: Union[str, Path]) -> "ConversionAssets":
    """Load conversion assets from a ZIP archive of JSON members.

    Each component is read from its own member (``version.json``,
    ``timestamp.json``, ``status.json``, ``errors.json``, ``pages.json``,
    ``timings.json``, ``confidence.json``, ``document.json``). A member that
    is absent, or whose JSON value has an unexpected container type, leaves
    the corresponding field at its default.

    An unreadable version record or an unknown status value is logged and
    replaced by the default instead of aborting the load. Exceptions from
    opening the archive, decoding JSON, or validating the remaining members
    are not caught here.

    Args:
        filename: Path of the archive to read.

    Returns:
        A new instance of ``cls`` populated from the archive.
    """
    filename = Path(filename)

    # Defaults used for any member that is missing from the archive.
    version_info: DoclingVersion = DoclingVersion()
    timestamp: Optional[str] = None
    status = ConversionStatus.PENDING
    errors: list[ErrorItem] = []
    pages: list[Page] = []
    timings: dict[str, ProfilingItem] = {}
    confidence = ConfidenceReport()
    document: DoclingDocument = _EMPTY_DOCLING_DOC

    with zipfile.ZipFile(filename, mode="r") as zf:

        def read_json(name: str):
            """Return the decoded JSON member, or None if it is not in the archive."""
            try:
                raw = zf.read(name)
            except KeyError:
                # ZipFile raises KeyError for a member that does not exist.
                return None
            return json.loads(raw.decode("utf-8"))

        # version
        if (data := read_json("version.json")) is not None:
            try:
                version_info = DoclingVersion.model_validate(data)
            except Exception as exc:
                # Best effort: keep the default version record.
                _log.error("Could not read version: %s", exc)

        # timestamp
        if isinstance(data := read_json("timestamp.json"), str):
            timestamp = data

        # status
        if (data := read_json("status.json")) is not None:
            try:
                status = ConversionStatus(data)
            except (ValueError, TypeError) as exc:
                # Best effort: keep PENDING, but leave a trace in the log.
                _log.warning(
                    "Unknown conversion status %r, keeping %s: %s",
                    data,
                    status,
                    exc,
                )

        # errors
        if isinstance(data := read_json("errors.json"), list):
            errors = [ErrorItem.model_validate(item) for item in data]

        # pages
        if isinstance(data := read_json("pages.json"), list):
            pages = [Page.model_validate(item) for item in data]

        # timings
        if isinstance(data := read_json("timings.json"), dict):
            timings = {
                key: ProfilingItem.model_validate(value)
                for key, value in data.items()
            }

        # confidence
        if isinstance(data := read_json("confidence.json"), dict):
            confidence = ConfidenceReport.model_validate(data)

        # document
        if isinstance(data := read_json("document.json"), dict):
            document = DoclingDocument.model_validate(data)

    return cls(
        version=version_info,
        timestamp=timestamp,
        status=status,
        errors=errors,
        pages=pages,
        timings=timings,
        confidence=confidence,
        document=document,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ConversionResult(ConversionAssets):
    """Conversion assets together with the input document they came from."""

    # The source document of this conversion; required.
    input: InputDocument
    # Built per instance via default_factory, matching the ``confidence``
    # field on the base class, instead of one instance created at import time.
    assembled: AssembledUnit = Field(default_factory=AssembledUnit)
|
||||||
|
|
||||||
|
|
||||||
class _DummyBackend(AbstractDocumentBackend):
|
class _DummyBackend(AbstractDocumentBackend):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
|||||||
44
tests/test_conversion_result_json.py
Normal file
44
tests/test_conversion_result_json.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from docling.backend.pypdfium2_backend import (
|
||||||
|
PyPdfiumDocumentBackend,
|
||||||
|
PyPdfiumPageBackend,
|
||||||
|
)
|
||||||
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
|
from docling.datamodel.document import ConversionAssets
|
||||||
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
|
def test_conversion_result_json_roundtrip_string():
    """Round-trip a converted PDF through ``save`` and ``ConversionAssets.load``."""
    import tempfile

    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.images_scale = 1.0
    pipeline_options.generate_page_images = False
    pipeline_options.do_table_structure = False
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.generate_parsed_pages = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
            )
        }
    )
    conv_res = doc_converter.convert(pdf_doc)

    # Use a temporary directory so the archive does not land in the working
    # tree and is removed once the round trip is done.
    with tempfile.TemporaryDirectory() as tmp_dir:
        fpath = Path(tmp_dir) / "test-conversion.zip"
        conv_res.save(filename=fpath)
        loaded = ConversionAssets.load(filename=fpath)

    assert loaded.status == conv_res.status
    assert loaded.document.name == conv_res.document.name
|
||||||
Reference in New Issue
Block a user