Update all backends with proper filename in DocumentOrigin

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-22 14:04:50 +02:00
parent 789b29bb24
commit b1a2af6d39
9 changed files with 79 additions and 69 deletions

View File

@ -13,6 +13,7 @@ if TYPE_CHECKING:
class AbstractDocumentBackend(ABC): class AbstractDocumentBackend(ABC):
@abstractmethod @abstractmethod
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
self.file = in_doc.file
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
self.document_hash = in_doc.document_hash self.document_hash = in_doc.document_hash
self.input_format = in_doc.format self.input_format = in_doc.format

View File

@ -1,4 +1,5 @@
import logging import logging
import os
import re import re
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
@ -67,21 +68,13 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
Parses the ASCII into a structured document model. Parses the ASCII into a structured document model.
""" """
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin( origin = DocumentOrigin(
filename=fname, filename=self.file.name or "file",
mimetype="text/asciidoc", mimetype="text/asciidoc",
binary_hash=self.document_hash, binary_hash=self.document_hash,
) )
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin) doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
doc = self._parse(doc) doc = self._parse(doc)
@ -138,9 +131,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# Lists # Lists
elif self._is_list_item(line): elif self._is_list_item(line):
print("line: ", line) _log.debug(f"line: {line}")
item = self._parse_list_item(line) item = self._parse_list_item(line)
print("parsed list-item: ", item) _log.debug(f"parsed list-item: {item}")
level = self._get_current_level(parents) level = self._get_current_level(parents)
@ -160,9 +153,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
elif in_list and item["indent"] < indents[level]: elif in_list and item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level]) # print(item["indent"], " => ", indents[level])
while item["indent"] < indents[level]: while item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level]) # print(item["indent"], " => ", indents[level])
parents[level] = None parents[level] = None
indents[level] = None indents[level] = None
level -= 1 level -= 1
@ -217,7 +210,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
caption_data = [] caption_data = []
item = self._parse_picture(line) item = self._parse_picture(line)
print(item)
size = None size = None
if "width" in item and "height" in item: if "width" in item and "height" in item:
@ -355,7 +347,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# Fallback if no match # Fallback if no match
return { return {
"type": "list_item", "type": "list_item",
"marker": item_marker, "marker": "-",
"text": line, "text": line,
"numbered": False, "numbered": False,
"indent": 0, "indent": 0,

View File

@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin,
GroupLabel, GroupLabel,
TableCell, TableCell,
TableData, TableData,
@ -66,7 +67,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff # access self.path_or_stream to load stuff
doc = DoclingDocument(name="dummy") origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="text/html",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
_log.debug("Trying to convert HTML...") _log.debug("Trying to convert HTML...")
if self.is_valid(): if self.is_valid():

View File

@ -237,21 +237,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
_log.debug("converting Markdown...") _log.debug("converting Markdown...")
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin( origin = DocumentOrigin(
filename=fname, filename=self.file.name or "file",
mimetype="text/markdown", mimetype="text/markdown",
binary_hash=self.document_hash, binary_hash=self.document_hash,
) )
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin) doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid(): if self.is_valid():
# Parse the markdown into an abstract syntax tree (AST) # Parse the markdown into an abstract syntax tree (AST)

View File

@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
# Parses the PPTX into a structured document model. # Parses the PPTX into a structured document model.
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash) # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin( origin = DocumentOrigin(
filename=fname, filename=self.file.name or "file",
mimetype="application/vnd.ms-powerpoint", mimetype="application/vnd.ms-powerpoint",
binary_hash=self.document_hash, binary_hash=self.document_hash,
) )
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument( doc = DoclingDocument(
name=docname, origin=origin name=self.file.stem or "file", origin=origin
) # must add origin information ) # must add origin information
doc = self.walk_linear(self.pptx_obj, doc) doc = self.walk_linear(self.pptx_obj, doc)

View File

@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model. # Parses the DOCX into a structured document model.
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin( origin = DocumentOrigin(
filename=fname, filename=self.file.name or "file",
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
binary_hash=self.document_hash, binary_hash=self.document_hash,
) )
if len(fname) > 0:
docname = Path(fname).stem doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin)
if self.is_valid(): if self.is_valid():
assert self.docx_obj is not None assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)

View File

@ -51,27 +51,27 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
} }
FormatToMimeType: Dict[InputFormat, Set[str]] = { FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.DOCX: { InputFormat.DOCX: [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template", "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
}, ],
InputFormat.PPTX: { InputFormat.PPTX: [
"application/vnd.openxmlformats-officedocument.presentationml.template", "application/vnd.openxmlformats-officedocument.presentationml.template",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow", "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
"application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}, ],
InputFormat.HTML: {"text/html", "application/xhtml+xml"}, InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.IMAGE: { InputFormat.IMAGE: [
"image/png", "image/png",
"image/jpeg", "image/jpeg",
"image/tiff", "image/tiff",
"image/gif", "image/gif",
"image/bmp", "image/bmp",
}, ],
InputFormat.PDF: {"application/pdf"}, InputFormat.PDF: ["application/pdf"],
InputFormat.ASCIIDOC: {"text/asciidoc"}, InputFormat.ASCIIDOC: ["text/asciidoc"],
InputFormat.MD: {"text/markdown", "text/x-markdown"}, InputFormat.MD: ["text/markdown", "text/x-markdown"],
} }
MimeTypeToFormat = { MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes

View File

@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
DocumentStream, DocumentStream,
ErrorItem, ErrorItem,
FormatToExtensions,
FormatToMimeType,
InputFormat, InputFormat,
MimeTypeToFormat, MimeTypeToFormat,
Page, Page,
@ -480,28 +482,48 @@ class _DocumentConversionInput(BaseModel):
else: else:
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}") raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
def _guess_format(self, obj): def _guess_format(self, obj: Union[Path, DocumentStream]):
content = None content = b"" # empty binary blob
format = None
if isinstance(obj, Path): if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj)) mime = filetype.guess_mime(str(obj))
if mime is None: if mime is None:
ext = obj.suffix[1:]
mime = self._mime_from_extension(ext)
if mime is None: # must guess from
with obj.open("rb") as f: with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB content = f.read(1024) # Read first 1KB
elif isinstance(obj, DocumentStream): elif isinstance(obj, DocumentStream):
obj.stream.seek(0)
content = obj.stream.read(8192) content = obj.stream.read(8192)
obj.stream.seek(0) obj.stream.seek(0)
mime = filetype.guess_mime(content) mime = filetype.guess_mime(content)
if mime is None:
ext = (
obj.name.rsplit(".", 1)[-1]
if ("." in obj.name and not obj.name.startswith("."))
else ""
)
mime = self._mime_from_extension(ext)
if mime is None: mime = mime or self._detect_html_xhtml(content)
mime = self._detect_html_xhtml(content) mime = mime or "text/plain"
if mime is None:
mime = "text/markdown"
format = MimeTypeToFormat.get(mime) format = MimeTypeToFormat.get(mime)
return format return format
def _mime_from_extension(self, ext):
    """Map a file extension to the MIME type of a text-based input format.

    Only AsciiDoc, HTML and Markdown are probed, in that order.

    Args:
        ext: File extension without the leading dot, e.g. ``"md"``.
            Matching is case-insensitive, so ``"MD"`` and ``"md"`` are
            treated alike.

    Returns:
        The first MIME type listed for the matching format in
        ``FormatToMimeType``, or ``None`` when the extension belongs to
        none of the probed formats.
    """
    # The extension table is written in lower case (at least the entries
    # visible alongside this method are), so normalize the input; otherwise
    # names such as "README.MD" or "INDEX.HTML" would not be recognized.
    ext = ext.lower()
    for fmt in (InputFormat.ASCIIDOC, InputFormat.HTML, InputFormat.MD):
        if ext in FormatToExtensions[fmt]:
            # The first list entry is used as the format's MIME type.
            return FormatToMimeType[fmt][0]
    return None
def _detect_html_xhtml(self, content): def _detect_html_xhtml(self, content):
content_str = content.decode("ascii", errors="ignore").lower() content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments # Remove XML comments

View File

@ -1,11 +1,13 @@
import json import json
import logging import logging
from io import BytesIO
from pathlib import Path from pathlib import Path
import yaml import yaml
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.document_converter import ( from docling.document_converter import (
DocumentConverter, DocumentConverter,
PdfFormatOption, PdfFormatOption,
@ -19,18 +21,24 @@ _log = logging.getLogger(__name__)
def main(): def main():
input_paths = [ input_paths = [
Path("README.md"),
Path("tests/data/wiki_duck.html"), Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"), Path("tests/data/word_sample.docx"),
Path("tests/data/word_nested.docx"),
Path("tests/data/lorem_ipsum.docx"), Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"), Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"), Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"), Path("tests/data/2206.01062.pdf"),
Path("tests/data/test_01.asciidoc"), Path("tests/data/test_01.asciidoc"),
Path("tests/data/test_01.asciidoc"), Path("tests/data/test_02.asciidoc"),
Path("README.md"), Path("README.md"),
] ]
# To read from bytes instead:
# docs = [
# DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read()))
# for f in input_paths
# ]
## for defaults use: ## for defaults use:
# doc_converter = DocumentConverter() # doc_converter = DocumentConverter()
@ -49,7 +57,8 @@ def main():
], # whitelist formats, non-matching files are ignored. ], # whitelist formats, non-matching files are ignored.
format_options={ format_options={
InputFormat.PDF: PdfFormatOption( InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend pipeline_cls=StandardPdfPipeline,
backend=DoclingParseDocumentBackend,
), ),
InputFormat.DOCX: WordFormatOption( InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
@ -59,6 +68,7 @@ def main():
) )
conv_results = doc_converter.convert_all(input_paths) conv_results = doc_converter.convert_all(input_paths)
# conv_results = doc_converter.convert_all(docs)
for res in conv_results: for res in conv_results:
out_path = Path("scratch") out_path = Path("scratch")