mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 23:42:22 +00:00
feat: add Docling JSON ingestion
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
c49b3526fb
commit
e972c9c60c
@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC):
|
|||||||
def supports_pagination(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def unload(self):
|
def unload(self):
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
self.path_or_stream.close()
|
self.path_or_stream.close()
|
||||||
|
0
docling/backend/json/__init__.py
Normal file
0
docling/backend/json/__init__.py
Normal file
46
docling/backend/json/docling_json_backend.py
Normal file
46
docling/backend/json/docling_json_backend.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from docling_core.types.doc import DoclingDocument
|
||||||
|
from typing_extensions import override
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
|
class DoclingJSONBackend(DeclarativeDocumentBackend):
    """Backend that ingests a document already serialized as DoclingDocument JSON.

    The source is parsed once, at construction time, with
    ``DoclingDocument.model_validate_json``. The outcome (either the parsed
    document or the exception raised while loading it) is kept so that
    ``is_valid()`` can report it and ``convert()`` can return or re-raise it.
    """

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        """Initialize the backend and eagerly load the JSON payload.

        Args:
            in_doc: The input document this backend is created for.
            path_or_stream: File path or in-memory byte stream holding the JSON.
        """
        super().__init__(in_doc, path_or_stream)
        self._my_in_doc = in_doc

        # Holds the successfully parsed document or the error hit while loading
        # it, in a mutually exclusive way, so that is_valid() is meaningful and
        # convert() can surface the original failure.
        self._doc_or_err: Union[DoclingDocument, Exception] = (
            self._load_doc_or_err()
        )

    @override
    def is_valid(self) -> bool:
        """Return True only if the source was parsed into a DoclingDocument."""
        return isinstance(self._doc_or_err, DoclingDocument)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not support pagination."""
        return False

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.JSON_DOCLING}

    @override
    def convert(self) -> DoclingDocument:
        """Return the document parsed at construction time.

        Returns:
            The DoclingDocument loaded from the JSON source. Repeated calls
            return the same instance.

        Raises:
            Exception: The original error captured while reading or validating
                the source (e.g. a RuntimeError for an unexpected source type,
                or whatever the file read / JSON validation raised).
        """
        if isinstance(self._doc_or_err, DoclingDocument):
            return self._doc_or_err
        raise self._doc_or_err

    def _load_doc_or_err(self) -> Union[DoclingDocument, Exception]:
        """Read and validate the JSON source, returning the error instead of raising."""
        try:
            json_data: Union[str, bytes]
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, encoding="utf-8") as f:
                    json_data = f.read()
            elif isinstance(self.path_or_stream, BytesIO):
                json_data = self.path_or_stream.getvalue()
            else:
                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
            return DoclingDocument.model_validate_json(json_data=json_data)
        except Exception as err:
            # Deliberately broad: the error is not swallowed, it is stored and
            # re-raised unchanged from convert().
            return err
|
@ -41,6 +41,7 @@ class InputFormat(str, Enum):
|
|||||||
MD = "md"
|
MD = "md"
|
||||||
XLSX = "xlsx"
|
XLSX = "xlsx"
|
||||||
XML_USPTO = "xml_uspto"
|
XML_USPTO = "xml_uspto"
|
||||||
|
JSON_DOCLING = "json_docling"
|
||||||
|
|
||||||
|
|
||||||
class OutputFormat(str, Enum):
|
class OutputFormat(str, Enum):
|
||||||
@ -62,6 +63,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||||
InputFormat.XLSX: ["xlsx"],
|
InputFormat.XLSX: ["xlsx"],
|
||||||
InputFormat.XML_USPTO: ["xml", "txt"],
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
||||||
|
InputFormat.JSON_DOCLING: ["json"],
|
||||||
}
|
}
|
||||||
|
|
||||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||||
@ -90,6 +92,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
],
|
],
|
||||||
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
||||||
|
InputFormat.JSON_DOCLING: ["application/json"],
|
||||||
}
|
}
|
||||||
|
|
||||||
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
||||||
|
@ -297,7 +297,7 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
mime = mime or "text/plain"
|
mime = mime or "text/plain"
|
||||||
formats = MimeTypeToFormat.get(mime, [])
|
formats = MimeTypeToFormat.get(mime, [])
|
||||||
if formats:
|
if formats:
|
||||||
if len(formats) == 1 and mime not in ("text/plain"):
|
if len(formats) == 1 and mime not in ("text/plain", "application/json"):
|
||||||
return formats[0]
|
return formats[0]
|
||||||
else: # ambiguity in formats
|
else: # ambiguity in formats
|
||||||
return _DocumentConversionInput._guess_from_content(
|
return _DocumentConversionInput._guess_from_content(
|
||||||
@ -339,6 +339,13 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
||||||
input_format = InputFormat.XML_USPTO
|
input_format = InputFormat.XML_USPTO
|
||||||
|
|
||||||
|
elif mime == "application/json":
|
||||||
|
if (
|
||||||
|
InputFormat.JSON_DOCLING in formats
|
||||||
|
and '"schema_name": "DoclingDocument"' in content_str
|
||||||
|
):
|
||||||
|
input_format = InputFormat.JSON_DOCLING
|
||||||
|
|
||||||
return input_format
|
return input_format
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -350,6 +357,8 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
mime = FormatToMimeType[InputFormat.HTML][0]
|
mime = FormatToMimeType[InputFormat.HTML][0]
|
||||||
elif ext in FormatToExtensions[InputFormat.MD]:
|
elif ext in FormatToExtensions[InputFormat.MD]:
|
||||||
mime = FormatToMimeType[InputFormat.MD][0]
|
mime = FormatToMimeType[InputFormat.MD][0]
|
||||||
|
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
|
||||||
|
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
||||||
return mime
|
return mime
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -11,6 +11,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
|
|||||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
|
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|||||||
InputFormat.PDF: FormatOption(
|
InputFormat.PDF: FormatOption(
|
||||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
||||||
),
|
),
|
||||||
|
InputFormat.JSON_DOCLING: FormatOption(
|
||||||
|
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
||||||
|
),
|
||||||
}
|
}
|
||||||
if (options := format_to_default_options.get(format)) is not None:
|
if (options := format_to_default_options.get(format)) is not None:
|
||||||
return options
|
return options
|
||||||
|
Loading…
Reference in New Issue
Block a user