mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
updated the base-model and added the asciidoc_backend
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
034a411057
commit
12033537e3
72
docling/backend/asciidoc_backend.py
Normal file
72
docling/backend/asciidoc_backend.py
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
import logging
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Set, Union
|
||||||
|
|
||||||
|
from docling_core.types.doc import (
|
||||||
|
DocItemLabel,
|
||||||
|
DoclingDocument,
|
||||||
|
DocumentOrigin,
|
||||||
|
GroupLabel,
|
||||||
|
TableCell,
|
||||||
|
TableData,
|
||||||
|
)
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
# Module-level logger named after this module; nothing in this file logs through it yet.
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ASCIIDocDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that builds a DoclingDocument from an AsciiDoc source.

    The AsciiDoc parsing itself is still a stub: ``parse`` returns the
    document it is given unchanged, so ``convert`` currently produces a
    document that carries only its name and origin metadata.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Keep a reference to the input source and mark the backend valid.

        Args:
            in_doc: Input document descriptor, forwarded to the base class.
            path_or_stream: Filesystem path or in-memory byte stream holding
                the AsciiDoc source.
        """
        super().__init__(in_doc, path_or_stream)

        self.path_or_stream = path_or_stream

        # The source is not inspected here; the backend is always flagged valid.
        self.valid = True

    def is_valid(self) -> bool:
        """Return the validity flag set in ``__init__``."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``False``; this backend reports no pagination support."""
        return False

    def unload(self):
        """Do nothing: this class opens no handles of its own to release."""
        return

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled here (AsciiDoc only)."""
        return {InputFormat.ASCIIDOC}

    def convert(self) -> DoclingDocument:
        """Build a DoclingDocument for the input and run it through ``parse``.

        The document name is the file stem when the input is a ``Path`` and
        the literal ``"stream"`` otherwise (for a stream the origin filename
        is left empty).

        Returns:
            The document returned by ``parse``.
        """
        fname = ""
        if isinstance(self.path_or_stream, Path):
            fname = self.path_or_stream.name

        # NOTE(review): "asciidoc" is not a type/subtype MIME string, and the
        # format table in base_models registers "application/asciidoc" --
        # confirm which value DocumentOrigin is expected to carry.
        # `document_hash` is not set in this class; presumably the base class
        # provides it -- verify.
        origin = DocumentOrigin(
            filename=fname,
            mimetype="asciidoc",
            binary_hash=self.document_hash,
        )

        docname = Path(fname).stem if fname else "stream"
        doc = DoclingDocument(name=docname, origin=origin)

        # The original called `self.parse_stream`, which this class never
        # defines; `parse` is the only parsing hook declared here.
        return self.parse(doc)

    def parse(self, doc: DoclingDocument) -> DoclingDocument:
        """Populate ``doc`` from the AsciiDoc source.

        Placeholder: no content is read yet and ``doc`` is returned as-is.

        Args:
            doc: The document to fill.

        Returns:
            The same ``doc`` object.
        """
        return doc
|
@ -30,6 +30,7 @@ class InputFormat(str, Enum):
|
|||||||
HTML = "html"
|
HTML = "html"
|
||||||
IMAGE = "image"
|
IMAGE = "image"
|
||||||
PDF = "pdf"
|
PDF = "pdf"
|
||||||
|
ASCIIDOC = "asciidoc"
|
||||||
|
|
||||||
|
|
||||||
class OutputFormat(str, Enum):
|
class OutputFormat(str, Enum):
|
||||||
@ -45,6 +46,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.PDF: ["pdf"],
|
InputFormat.PDF: ["pdf"],
|
||||||
InputFormat.HTML: ["html", "htm", "xhtml"],
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
||||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||||
|
InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],
|
||||||
}
|
}
|
||||||
|
|
||||||
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
||||||
@ -66,6 +68,7 @@ FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
|||||||
"image/bmp",
|
"image/bmp",
|
||||||
},
|
},
|
||||||
InputFormat.PDF: {"application/pdf"},
|
InputFormat.PDF: {"application/pdf"},
|
||||||
|
InputFormat.ASCIIDOC: {"application/asciidoc"},
|
||||||
}
|
}
|
||||||
MimeTypeToFormat = {
|
MimeTypeToFormat = {
|
||||||
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
||||||
|
Loading…
Reference in New Issue
Block a user