Update all backends with proper filename in DocumentOrigin

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-31 14:34:40 +00:00 · 2024-10-22 14:04:50 +02:00 · 2024-10-22 14:04:50 +02:00 · b1a2af6d39
commit b1a2af6d39
parent 789b29bb24
9 changed files with 79 additions and 69 deletions
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@ -13,6 +13,7 @@ if TYPE_CHECKING:
 class AbstractDocumentBackend(ABC):
    @abstractmethod
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        self.file = in_doc.file
        self.path_or_stream = path_or_stream
        self.document_hash = in_doc.document_hash
        self.input_format = in_doc.format
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@ -1,4 +1,5 @@
 import logging
+import os
 import re
 from io import BytesIO
 from pathlib import Path
@ -67,21 +68,13 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        Parses the ASCII into a structured document model.
        """

-        fname = ""
-        if isinstance(self.path_or_stream, Path):
-            fname = self.path_or_stream.name
-
        origin = DocumentOrigin(
-            filename=fname,
+            filename=self.file.name or "file",
            mimetype="text/asciidoc",
            binary_hash=self.document_hash,
        )
-        if len(fname) > 0:
-            docname = Path(fname).stem
-        else:
-            docname = "stream"

-        doc = DoclingDocument(name=docname, origin=origin)
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        doc = self._parse(doc)

@ -138,9 +131,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
            # Lists
            elif self._is_list_item(line):

-                print("line: ", line)
+                _log.debug(f"line: {line}")
                item = self._parse_list_item(line)
-                print("parsed list-item: ", item)
+                _log.debug(f"parsed list-item: {item}")

                level = self._get_current_level(parents)

@ -160,9 +153,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

                elif in_list and item["indent"] < indents[level]:

-                    print(item["indent"], " => ", indents[level])
+                    # print(item["indent"], " => ", indents[level])
                    while item["indent"] < indents[level]:
-                        print(item["indent"], " => ", indents[level])
+                        # print(item["indent"], " => ", indents[level])
                        parents[level] = None
                        indents[level] = None
                        level -= 1
@ -217,7 +210,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                caption_data = []

                item = self._parse_picture(line)
-                print(item)

                size = None
                if "width" in item and "height" in item:
@ -355,7 +347,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
            # Fallback if no match
            return {
                "type": "list_item",
-                "marker": item_marker,
+                "marker": "-",
                "text": line,
                "numbered": False,
                "indent": 0,
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
 from docling_core.types.doc import (
    DocItemLabel,
    DoclingDocument,
+    DocumentOrigin,
    GroupLabel,
    TableCell,
    TableData,
@ -66,7 +67,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

    def convert(self) -> DoclingDocument:
        # access self.path_or_stream to load stuff
-        doc = DoclingDocument(name="dummy")
+        origin = DocumentOrigin(
+            filename=self.file.name or "file",
+            mimetype="text/html",
+            binary_hash=self.document_hash,
+        )
+
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        _log.debug("Trying to convert HTML...")

        if self.is_valid():
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@ -237,21 +237,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
    def convert(self) -> DoclingDocument:
        _log.debug("converting Markdown...")

-        fname = ""
-        if isinstance(self.path_or_stream, Path):
-            fname = self.path_or_stream.name
-
        origin = DocumentOrigin(
-            filename=fname,
+            filename=self.file.name or "file",
            mimetype="text/markdown",
            binary_hash=self.document_hash,
        )
-        if len(fname) > 0:
-            docname = Path(fname).stem
-        else:
-            docname = "stream"

-        doc = DoclingDocument(name=docname, origin=origin)
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        if self.is_valid():
            # Parse the markdown into an abstract syntax tree (AST)
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        # Parses the PPTX into a structured document model.
        # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)

-        fname = ""
-        if isinstance(self.path_or_stream, Path):
-            fname = self.path_or_stream.name
-
        origin = DocumentOrigin(
-            filename=fname,
+            filename=self.file.name or "file",
            mimetype="application/vnd.ms-powerpoint",
            binary_hash=self.document_hash,
        )
-        if len(fname) > 0:
-            docname = Path(fname).stem
-        else:
-            docname = "stream"
+
        doc = DoclingDocument(
-            name=docname, origin=origin
+            name=self.file.stem or "file", origin=origin
        )  # must add origin information
        doc = self.walk_linear(self.pptx_obj, doc)

--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def convert(self) -> DoclingDocument:
        # Parses the DOCX into a structured document model.

-        fname = ""
-        if isinstance(self.path_or_stream, Path):
-            fname = self.path_or_stream.name
-
        origin = DocumentOrigin(
-            filename=fname,
+            filename=self.file.name or "file",
            mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            binary_hash=self.document_hash,
        )
-        if len(fname) > 0:
-            docname = Path(fname).stem
-        else:
-            docname = "stream"
-        doc = DoclingDocument(name=docname, origin=origin)
+
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        if self.is_valid():
            assert self.docx_obj is not None
            doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -51,27 +51,27 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
 }

-FormatToMimeType: Dict[InputFormat, Set[str]] = {
-    InputFormat.DOCX: {
+FormatToMimeType: Dict[InputFormat, List[str]] = {
+    InputFormat.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
-    },
-    InputFormat.PPTX: {
+    ],
+    InputFormat.PPTX: [
        "application/vnd.openxmlformats-officedocument.presentationml.template",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-    },
-    InputFormat.HTML: {"text/html", "application/xhtml+xml"},
-    InputFormat.IMAGE: {
+    ],
+    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
+    InputFormat.IMAGE: [
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/gif",
        "image/bmp",
-    },
-    InputFormat.PDF: {"application/pdf"},
-    InputFormat.ASCIIDOC: {"text/asciidoc"},
-    InputFormat.MD: {"text/markdown", "text/x-markdown"},
+    ],
+    InputFormat.PDF: ["application/pdf"],
+    InputFormat.ASCIIDOC: ["text/asciidoc"],
+    InputFormat.MD: ["text/markdown", "text/x-markdown"],
 }
 MimeTypeToFormat = {
    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
    ConversionStatus,
    DocumentStream,
    ErrorItem,
+    FormatToExtensions,
+    FormatToMimeType,
    InputFormat,
    MimeTypeToFormat,
    Page,
@ -480,28 +482,48 @@ class _DocumentConversionInput(BaseModel):
            else:
                raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

-    def _guess_format(self, obj):
-        content = None
+    def _guess_format(self, obj: Union[Path, DocumentStream]):
+        content = b""  # empty binary blob
+        format = None
+
        if isinstance(obj, Path):
            mime = filetype.guess_mime(str(obj))
            if mime is None:
+                ext = obj.suffix[1:]
+                mime = self._mime_from_extension(ext)
+            if mime is None:  # must guess from the file content
                with obj.open("rb") as f:
                    content = f.read(1024)  # Read first 1KB

        elif isinstance(obj, DocumentStream):
-            obj.stream.seek(0)
            content = obj.stream.read(8192)
            obj.stream.seek(0)
            mime = filetype.guess_mime(content)
+            if mime is None:
+                ext = (
+                    obj.name.rsplit(".", 1)[-1]
+                    if ("." in obj.name and not obj.name.startswith("."))
+                    else ""
+                )
+                mime = self._mime_from_extension(ext)

-        if mime is None:
-            mime = self._detect_html_xhtml(content)
-        if mime is None:
-            mime = "text/markdown"
+        mime = mime or self._detect_html_xhtml(content)
+        mime = mime or "text/plain"

        format = MimeTypeToFormat.get(mime)
        return format

+    def _mime_from_extension(self, ext):
+        mime = None
+        if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
+            mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
+        elif ext in FormatToExtensions[InputFormat.HTML]:
+            mime = FormatToMimeType[InputFormat.HTML][0]
+        elif ext in FormatToExtensions[InputFormat.MD]:
+            mime = FormatToMimeType[InputFormat.MD][0]
+
+        return mime
+
    def _detect_html_xhtml(self, content):
        content_str = content.decode("ascii", errors="ignore").lower()
        # Remove XML comments
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@ -1,11 +1,13 @@
 import json
 import logging
+from io import BytesIO
 from pathlib import Path

 import yaml

+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import InputFormat
+from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
@ -19,18 +21,24 @@ _log = logging.getLogger(__name__)

 def main():
    input_paths = [
-        Path("README.md"),
        Path("tests/data/wiki_duck.html"),
        Path("tests/data/word_sample.docx"),
+        Path("tests/data/word_nested.docx"),
        Path("tests/data/lorem_ipsum.docx"),
        Path("tests/data/powerpoint_sample.pptx"),
        Path("tests/data/2305.03393v1-pg9-img.png"),
        Path("tests/data/2206.01062.pdf"),
        Path("tests/data/test_01.asciidoc"),
-        Path("tests/data/test_01.asciidoc"),
+        Path("tests/data/test_02.asciidoc"),
        Path("README.md"),
    ]

+    # To read from bytes instead:
+    # docs = [
+    #    DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read()))
+    #    for f in input_paths
+    # ]
+
    ## for defaults use:
    # doc_converter = DocumentConverter()

@ -49,7 +57,8 @@ def main():
            ],  # whitelist formats, non-matching files are ignored.
            format_options={
                InputFormat.PDF: PdfFormatOption(
-                    pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
+                    pipeline_cls=StandardPdfPipeline,
+                    backend=DoclingParseDocumentBackend,
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
@ -59,6 +68,7 @@ def main():
    )

    conv_results = doc_converter.convert_all(input_paths)
+    # conv_results = doc_converter.convert_all(docs)

    for res in conv_results:
        out_path = Path("scratch")