From e972c9c60c1816eb3ca69dc8bcf1e71636a5d678 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Tue, 21 Jan 2025 14:59:13 +0100 Subject: [PATCH] feat: add Docling JSON ingestion Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/backend/abstract_backend.py | 1 - docling/backend/json/__init__.py | 0 docling/backend/json/docling_json_backend.py | 46 ++++++++++++++++++++ docling/datamodel/base_models.py | 3 ++ docling/datamodel/document.py | 11 ++++- docling/document_converter.py | 4 ++ 6 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 docling/backend/json/__init__.py create mode 100644 docling/backend/json/docling_json_backend.py diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index b47b11cd..491330b3 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC): def supports_pagination(cls) -> bool: pass - @abstractmethod def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() diff --git a/docling/backend/json/__init__.py b/docling/backend/json/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docling/backend/json/docling_json_backend.py b/docling/backend/json/docling_json_backend.py new file mode 100644 index 00000000..b23f074a --- /dev/null +++ b/docling/backend/json/docling_json_backend.py @@ -0,0 +1,46 @@ +from io import BytesIO +from pathlib import Path +from typing import Union + +from docling_core.types.doc import DoclingDocument +from typing_extensions import override + +from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument + + +class DoclingJSONBackend(DeclarativeDocumentBackend): + @override + def __init__( + self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path] + ) -> None: + super().__init__(in_doc, path_or_stream) + self._my_in_doc = in_doc + + @override + def is_valid(self) -> bool: + return True + + @classmethod + @override + def supports_pagination(cls) -> bool: + return False + + @classmethod + @override + def supported_formats(cls) -> set[InputFormat]: + return {InputFormat.JSON_DOCLING} + + @override + def convert(self) -> DoclingDocument: + json_data: Union[str, bytes] + if isinstance(self.path_or_stream, Path): + with open(self.path_or_stream, encoding="utf-8") as f: + json_data = f.read() + elif isinstance(self.path_or_stream, BytesIO): + json_data = self.path_or_stream.getvalue() + else: + raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}") + doc = DoclingDocument.model_validate_json(json_data=json_data) + return doc diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 99d30108..d1e7ce3a 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -41,6 +41,7 @@ class InputFormat(str, Enum): MD = "md" XLSX = "xlsx" XML_USPTO = "xml_uspto" + JSON_DOCLING = "json_docling" class OutputFormat(str, Enum): @@ -62,6 +63,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.XLSX: ["xlsx"], InputFormat.XML_USPTO: ["xml", "txt"], + InputFormat.JSON_DOCLING: ["json"], } FormatToMimeType: Dict[InputFormat, List[str]] = { @@ -90,6 +92,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ], InputFormat.XML_USPTO: ["application/xml", "text/plain"], + InputFormat.JSON_DOCLING: ["application/json"], } MimeTypeToFormat: dict[str, list[InputFormat]] = { diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 136428e8..bdb7d831 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -297,7 +297,7 @@ class _DocumentConversionInput(BaseModel): mime = mime or "text/plain" formats = MimeTypeToFormat.get(mime, []) if formats: - if len(formats) == 1 and mime not in ("text/plain"): + if len(formats) == 1 and mime not in ("text/plain", "application/json"): return formats[0] else: # ambiguity in formats return _DocumentConversionInput._guess_from_content( @@ -339,6 +339,13 @@ class _DocumentConversionInput(BaseModel): if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"): input_format = InputFormat.XML_USPTO + elif mime == "application/json": + if ( + InputFormat.JSON_DOCLING in formats + and '"schema_name": "DoclingDocument"' in content_str + ): + input_format = InputFormat.JSON_DOCLING + return input_format @staticmethod @@ -350,6 +357,8 @@ class _DocumentConversionInput(BaseModel): mime = FormatToMimeType[InputFormat.HTML][0] elif ext in FormatToExtensions[InputFormat.MD]: mime = FormatToMimeType[InputFormat.MD][0] + elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]: + mime = FormatToMimeType[InputFormat.JSON_DOCLING][0] return mime @staticmethod diff --git a/docling/document_converter.py b/docling/document_converter.py index cb073949..13203ea7 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -11,6 +11,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.asciidoc_backend import AsciiDocBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.html_backend import HTMLDocumentBackend +from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend @@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.PDF: FormatOption( pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend ), + InputFormat.JSON_DOCLING: FormatOption( + pipeline_cls=SimplePipeline, backend=DoclingJSONBackend + ), } if (options := format_to_default_options.get(format)) is not None: return options