Update all backends with proper filename in DocumentOrigin

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-22 14:04:50 +02:00
parent 789b29bb24
commit b1a2af6d39
9 changed files with 79 additions and 69 deletions

View File

@ -13,6 +13,7 @@ if TYPE_CHECKING:
class AbstractDocumentBackend(ABC):
@abstractmethod
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
self.file = in_doc.file
self.path_or_stream = path_or_stream
self.document_hash = in_doc.document_hash
self.input_format = in_doc.format

View File

@ -1,4 +1,5 @@
import logging
import os
import re
from io import BytesIO
from pathlib import Path
@ -67,21 +68,13 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
Parses the ASCII into a structured document model.
"""
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
filename=self.file.name or "file",
mimetype="text/asciidoc",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
doc = self._parse(doc)
@ -138,9 +131,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# Lists
elif self._is_list_item(line):
print("line: ", line)
_log.debug(f"line: {line}")
item = self._parse_list_item(line)
print("parsed list-item: ", item)
_log.debug(f"parsed list-item: {item}")
level = self._get_current_level(parents)
@ -160,9 +153,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
elif in_list and item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level])
# print(item["indent"], " => ", indents[level])
while item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level])
# print(item["indent"], " => ", indents[level])
parents[level] = None
indents[level] = None
level -= 1
@ -217,7 +210,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
caption_data = []
item = self._parse_picture(line)
print(item)
size = None
if "width" in item and "height" in item:
@ -355,7 +347,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# Fallback if no match
return {
"type": "list_item",
"marker": item_marker,
"marker": "-",
"text": line,
"numbered": False,
"indent": 0,

View File

@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
@ -66,7 +67,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(name="dummy")
origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="text/html",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
_log.debug("Trying to convert HTML...")
if self.is_valid():

View File

@ -237,21 +237,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
_log.debug("converting Markdown...")
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
filename=self.file.name or "file",
mimetype="text/markdown",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid():
# Parse the markdown into an abstract syntax tree (AST)

View File

@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
# Parses the PPTX into a structured document model.
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
filename=self.file.name or "file",
mimetype="application/vnd.ms-powerpoint",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(
name=docname, origin=origin
name=self.file.stem or "file", origin=origin
) # must add origin information
doc = self.walk_linear(self.pptx_obj, doc)

View File

@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model.
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
filename=self.file.name or "file",
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid():
assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)

View File

@ -51,27 +51,27 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
}
FormatToMimeType: Dict[InputFormat, Set[str]] = {
InputFormat.DOCX: {
FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.DOCX: [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
},
InputFormat.PPTX: {
],
InputFormat.PPTX: [
"application/vnd.openxmlformats-officedocument.presentationml.template",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
},
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
InputFormat.IMAGE: {
],
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.IMAGE: [
"image/png",
"image/jpeg",
"image/tiff",
"image/gif",
"image/bmp",
},
InputFormat.PDF: {"application/pdf"},
InputFormat.ASCIIDOC: {"text/asciidoc"},
InputFormat.MD: {"text/markdown", "text/x-markdown"},
],
InputFormat.PDF: ["application/pdf"],
InputFormat.ASCIIDOC: ["text/asciidoc"],
InputFormat.MD: ["text/markdown", "text/x-markdown"],
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes

View File

@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
ErrorItem,
FormatToExtensions,
FormatToMimeType,
InputFormat,
MimeTypeToFormat,
Page,
@ -480,28 +482,48 @@ class _DocumentConversionInput(BaseModel):
else:
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
def _guess_format(self, obj):
content = None
def _guess_format(self, obj: Union[Path, DocumentStream]):
content = b"" # empty binary blob
format = None
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
if mime is None:
ext = obj.suffix[1:]
mime = self._mime_from_extension(ext)
if mime is None: # must guess from
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
elif isinstance(obj, DocumentStream):
obj.stream.seek(0)
content = obj.stream.read(8192)
obj.stream.seek(0)
mime = filetype.guess_mime(content)
if mime is None:
ext = (
obj.name.rsplit(".", 1)[-1]
if ("." in obj.name and not obj.name.startswith("."))
else ""
)
mime = self._mime_from_extension(ext)
if mime is None:
mime = self._detect_html_xhtml(content)
if mime is None:
mime = "text/markdown"
mime = mime or self._detect_html_xhtml(content)
mime = mime or "text/plain"
format = MimeTypeToFormat.get(mime)
return format
def _mime_from_extension(self, ext: str):
    """Map a bare file extension to the primary MIME type of its format.

    Only the AsciiDoc, HTML and Markdown formats are consulted, in that
    order; the first format whose extension list contains ``ext`` wins.

    Args:
        ext: File extension without the leading dot, e.g. ``"md"``.
            Matching is case-insensitive.

    Returns:
        The first MIME type listed in ``FormatToMimeType`` for the
        matching format, or ``None`` when the extension is not recognised.
    """
    # The extension arrives exactly as spelled in the file name, so
    # "README.MD" would otherwise miss the lookup. This assumes the
    # FormatToExtensions entries are lower-case (the ASCIIDOC ones are).
    normalized = ext.lower()
    for fmt in (InputFormat.ASCIIDOC, InputFormat.HTML, InputFormat.MD):
        if normalized in FormatToExtensions[fmt]:
            return FormatToMimeType[fmt][0]
    return None
def _detect_html_xhtml(self, content):
content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments

View File

@ -1,11 +1,13 @@
import json
import logging
from io import BytesIO
from pathlib import Path
import yaml
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
@ -19,18 +21,24 @@ _log = logging.getLogger(__name__)
def main():
input_paths = [
Path("README.md"),
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/word_nested.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/test_02.asciidoc"),
Path("README.md"),
]
# To read from bytes instead:
# docs = [
# DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read()))
# for f in input_paths
# ]
## for defaults use:
# doc_converter = DocumentConverter()
@ -49,7 +57,8 @@ def main():
], # whitelist formats, non-matching files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
pipeline_cls=StandardPdfPipeline,
backend=DoclingParseDocumentBackend,
),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
@ -59,6 +68,7 @@ def main():
)
conv_results = doc_converter.convert_all(input_paths)
# conv_results = doc_converter.convert_all(docs)
for res in conv_results:
out_path = Path("scratch")