updated the base-model and added the asciidoc_backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-10-17 19:58:07 +02:00
parent 034a411057
commit 12033537e3
2 changed files with 75 additions and 0 deletions

View File

@ -0,0 +1,72 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
class ASCIIDocDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that turns an AsciiDoc source into a DoclingDocument.

    The source is either a filesystem path or an in-memory byte stream.
    Content parsing is not implemented yet: ``parse`` currently returns the
    document it is given unchanged, so ``convert`` yields a document that
    only carries its name and origin metadata.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Store the input source and mark the backend as valid.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: Path to the AsciiDoc file, or a stream of its bytes.
        """
        super().__init__(in_doc, path_or_stream)

        self.path_or_stream = path_or_stream
        # No validation of the source is performed at construction time.
        self.valid = True

    def is_valid(self) -> bool:
        """Return the validity flag set in ``__init__``."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not support pagination."""
        return False

    def unload(self):
        """Do nothing; this backend performs no cleanup on unload."""
        return

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.ASCIIDOC}

    def convert(self) -> DoclingDocument:
        """Build a DoclingDocument for the input and pass it to ``parse``.

        The document is named after the file stem when the source is a path,
        and ``"stream"`` when the source is a byte stream.

        Returns:
            The document returned by ``parse``.
        """
        # Only a Path carries a filename; a stream leaves it empty.
        fname = ""
        if isinstance(self.path_or_stream, Path):
            fname = self.path_or_stream.name

        # NOTE(review): FormatToMimeType registers "application/asciidoc" for
        # this format while the origin below uses "asciidoc" -- confirm which
        # value is intended.
        # `document_hash` is not defined in this class; presumably it is
        # provided by the base class -- verify.
        origin = DocumentOrigin(
            filename=fname,
            mimetype="asciidoc",
            binary_hash=self.document_hash,
        )

        docname = Path(fname).stem if fname else "stream"
        doc = DoclingDocument(name=docname, origin=origin)

        # The original code called `self.parse_stream`, which this class does
        # not define; `parse` is the method that actually exists.
        return self.parse(doc)

    def parse(self, doc: DoclingDocument):
        """Populate ``doc`` from the AsciiDoc source.

        Currently a placeholder that returns ``doc`` unchanged.
        """
        return doc

View File

@ -30,6 +30,7 @@ class InputFormat(str, Enum):
HTML = "html" HTML = "html"
IMAGE = "image" IMAGE = "image"
PDF = "pdf" PDF = "pdf"
ASCIIDOC = "asciidoc"
class OutputFormat(str, Enum): class OutputFormat(str, Enum):
@ -45,6 +46,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["pdf"], InputFormat.PDF: ["pdf"],
InputFormat.HTML: ["html", "htm", "xhtml"], InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"], InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],
} }
FormatToMimeType: Dict[InputFormat, Set[str]] = { FormatToMimeType: Dict[InputFormat, Set[str]] = {
@ -66,6 +68,7 @@ FormatToMimeType: Dict[InputFormat, Set[str]] = {
"image/bmp", "image/bmp",
}, },
InputFormat.PDF: {"application/pdf"}, InputFormat.PDF: {"application/pdf"},
InputFormat.ASCIIDOC: {"application/asciidoc"},
} }
MimeTypeToFormat = { MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes