Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction

This commit is contained in:
Michele Dolfi 2024-10-11 12:59:11 +02:00
commit 786b89efd9
12 changed files with 80 additions and 37 deletions

View File

@@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
if not success:
raise RuntimeError(
f"docling-parse could not load document {document_hash}."
f"docling-parse could not load document with hash {document_hash}."
)
def page_count(self) -> int:
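
Beyond the reworded message, this hunk shows the error-handling convention the branch standardizes on: backends fail fast with a RuntimeError that names the document hash, chaining the original exception with raise ... from. A minimal sketch of that convention (the backend class and its loading logic are illustrative, not the docling API):

from io import BytesIO
from pathlib import Path
from typing import Union

class FailFastBackendSketch:
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        try:
            # Stand-in for the real parser call (pdfium, docx.Document, ...).
            self.raw = (
                path_or_stream.getvalue()
                if isinstance(path_or_stream, BytesIO)
                else Path(path_or_stream).read_bytes()
            )
        except Exception as e:
            # Chain the cause so the parser's traceback is preserved.
            raise RuntimeError(
                f"backend could not load document with hash {document_hash}"
            ) from e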

View File

@@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
print("About to init HTML backend...")
_log.debug("About to init HTML backend...")
super().__init__(path_or_stream, document_hash)
self.soup = None
# HTML file:
@@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = byte_stream.getvalue().decode("utf-8")
print(text_stream)
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
_log.error("could not parse html: {}".format(e))
return doc
raise RuntimeError(
f"Could not initialize HTML backend for file with hash {document_hash}."
) from e
def is_valid(self) -> bool:
return True
@@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
print("Trying to convert HTML...")
_log.debug("Trying to convert HTML...")
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")
@@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def analyse_element(self, element, idx, doc):
"""
if element.name!=None:
print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
"""
if element.name in self.labels:
@@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=data, parent=self.parents[self.level])
def get_list_text(list_element, level=0):
def get_list_text(self, list_element, level=0):
"""Recursively extract text from <ul> or <ol> with proper indentation."""
result = []
bullet_char = "*" # Default bullet character for unordered lists
@@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
result.extend(get_list_text(nested_list, level + 1))
result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points
for li in list_element.find_all("li", recursive=False):
# Add bullet points for unordered lists
@@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
result.extend(get_list_text(nested_list, level + 1))
result.extend(self.get_list_text(nested_list, level + 1))
return result
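
The two edits above exist because get_list_text became a method: inside a class, the function's own name is not in scope within its body, so recursion into nested lists must go through self. A self-contained sketch of the same recursion as a plain function (bullet and indentation choices are illustrative):

from bs4 import BeautifulSoup  # pip install beautifulsoup4

def get_list_text(list_element, level=0):
    """Recursively extract text from <ul>/<ol>, indenting nested lists."""
    result = []
    ordered = list_element.name == "ol"
    for i, li in enumerate(list_element.find_all("li", recursive=False), start=1):
        marker = f"{i}." if ordered else "*"
        # Take only the <li>'s direct text so nested items are not duplicated.
        own_text = "".join(li.find_all(string=True, recursive=False)).strip()
        result.append(f"{'    ' * level}{marker} {own_text}")
        nested_list = li.find(["ul", "ol"])
        if nested_list:
            result.extend(get_list_text(nested_list, level + 1))
    return result

soup = BeautifulSoup("<ul><li>alpha<ol><li>beta</li></ol></li></ul>", "html.parser")
print("\n".join(get_list_text(soup.find("ul"))))
# * alpha
#     1. beta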

View File

@@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
self.path_or_stream = path_or_stream
self.pptx_obj = None
self.valid = True
self.valid = False
try:
self.pptx_obj = Presentation(self.path_or_stream)
self.valid = True
except Exception:
_log.error("could not parse pptx")
self.valid = False
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
return
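
Note the ordering of the validity flag in this hunk: it starts as False and flips to True only after Presentation() succeeds, so the backend can never report valid when the constructor bailed out part-way. The shape of the pattern, with a stand-in parser:

class ValidityFlagSketch:
    def __init__(self, payload: bytes):
        self.valid = False  # pessimistic default
        try:
            self.parsed = payload.decode("utf-8")  # stand-in for Presentation(...)
            self.valid = True  # reached only on success
        except Exception as e:
            raise RuntimeError("could not parse document") from e

    def is_valid(self) -> bool:
        return self.valid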

View File

@@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# self.initialise(path_or_stream)
# Word file:
self.path_or_stream = path_or_stream
self.valid = False
# Initialise the parents for the hierarchy
self.max_levels = 10
self.level_at_new_list = None
@@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"indents": [None],
}
self.docx_obj = None
try:
self.docx_obj = docx.Document(self.path_or_stream)
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
def is_valid(self) -> bool:
return True
@@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model.
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
docx_obj = None
try:
docx_obj = docx.Document(self.path_or_stream)
except Exception:
_log.error("could not parse docx")
return doc
# self.initialise()
doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc
def update_history(self, name, level, numid, ilevel):
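
The docx change moves parsing from convert() into the constructor, so a broken file is rejected at load time and convert() can rely on self.docx_obj unconditionally. A hedged sketch of the restructuring, assuming python-docx:

import docx  # python-docx

class WordBackendSketch:
    def __init__(self, path_or_stream, document_hash: str):
        self.valid = False
        try:
            # Parse once at construction; convert() no longer needs a fallback path.
            self.docx_obj = docx.Document(path_or_stream)
            self.valid = True
        except Exception as e:
            raise RuntimeError(
                f"MsWordDocumentBackend could not load document with hash {document_hash}"
            ) from e

    def convert(self):
        # Safe: __init__ either set self.docx_obj or raised.
        return self.docx_obj.element.body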

View File

@@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
self._pdoc = pdfium.PdfDocument(path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document {document_hash}"
f"pypdfium could not load document with hash {document_hash}"
) from e
def page_count(self) -> int:

View File

@@ -1,4 +1,5 @@
import logging
import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
@@ -171,7 +172,7 @@ class ConvertedDocument(BaseModel):
pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit()
legacy_output: DsDocument = _EMPTY_LEGACY_DOC
legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC
output: DoclingDocument = _EMPTY_DOCLING_DOC
def _to_legacy_document(self) -> DsDocument:
@@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
)
def _guess_format(self, obj):
content = None
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
elif isinstance(obj, DocumentStream):
mime = filetype.guess_mime(obj.stream.read(8192))
if mime is None:
# TODO improve this.
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
if obj.suffix == ".html":
mime = "text/html"
elif isinstance(obj, DocumentStream):
obj.stream.seek(0)
content = obj.stream.read(8192)
obj.stream.seek(0)
mime = filetype.guess_mime(content)
if mime is None:
mime = self._detect_html_xhtml(content)
format = MimeTypeToFormat.get(mime)
return format
def _detect_html_xhtml(self, content):
content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
content_str = content_str.lstrip()
if re.match(r"<\?xml", content_str):
if "xhtml" in content_str[:1000]:
return "application/xhtml+xml"
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
return "text/html"
return None
@classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths]
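
HTML is the motivating case for the new sniffing fallback: it has no magic bytes, so filetype.guess_mime returns None and _guess_format now inspects the leading content itself (note the seek(0) calls that leave the stream rewound for the actual parser). A self-contained restatement of the detection helper:

import re

def detect_html_xhtml(content: bytes):
    content_str = content.decode("ascii", errors="ignore").lower()
    content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)  # strip comments
    content_str = content_str.lstrip()
    if re.match(r"<\?xml", content_str):
        if "xhtml" in content_str[:1000]:
            return "application/xhtml+xml"
    if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
        return "text/html"
    return None

assert detect_html_xhtml(b"<!-- note -->\n<!DOCTYPE html><html></html>") == "text/html"
assert detect_html_xhtml(b'<?xml version="1.0"?><html xmlns="http://www.w3.org/1999/xhtml">') == "application/xhtml+xml"
assert detect_html_xhtml(b"%PDF-1.7") is None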

View File

@@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions):
)
class PipelineOptions(BaseModel): ...
class PipelineOptions(BaseModel):
create_legacy_output: bool = (
True  # This default will be set to False in a future version of docling
)
class PdfPipelineOptions(PipelineOptions):
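
Until the default flips, nothing changes for existing callers; consumers that have already migrated to DoclingDocument can opt out of the legacy build. A usage sketch (the import path is an assumption based on where these options live in this branch):

from docling.datamodel.pipeline_options import PdfPipelineOptions

opts = PdfPipelineOptions(create_legacy_output=False)  # skip DsDocument generation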

View File

@@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult
class GlmModel:
def __init__(self, config):
self.config = config
self.create_legacy_output = config.get("create_legacy_output", True)
self.model_names = self.config.get(
"model_names", ""
) # "language;term;reference"
@@ -42,6 +44,9 @@ class GlmModel:
)
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
legacy_doc: Optional[DsLegacyDocument] = None
if self.create_legacy_output:
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
# DEBUG code:
@@ -92,4 +97,4 @@ class GlmModel:
# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
return (legacy_doc, docling_doc)
return (docling_doc, legacy_doc)
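
Two coupled changes land here: legacy conversion now runs only when requested, and the tuple order flips so the primary DoclingDocument comes first. A stand-in illustrating the new call convention (not the real GlmModel internals):

from typing import Optional, Tuple

class GlmModelSketch:
    def __init__(self, config: dict):
        self.create_legacy_output = config.get("create_legacy_output", True)

    def __call__(self, ds_doc_dict: dict) -> Tuple[dict, Optional[dict]]:
        docling_doc = {"format": "docling", **ds_doc_dict}  # stand-in conversion
        legacy_doc = dict(ds_doc_dict) if self.create_legacy_output else None
        return (docling_doc, legacy_doc)  # primary first, optional legacy second

docling_doc, legacy_doc = GlmModelSketch({"create_legacy_output": False})({"name": "doc"})
assert legacy_doc is None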

View File

@@ -41,7 +41,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
artifacts_path = self.download_models_hf()
self.artifacts_path = Path(artifacts_path)
self.glm_model = GlmModel(config={})
self.glm_model = GlmModel(
config={"create_legacy_output": pipeline_options.create_legacy_output}
)
if (ocr_model := self.get_ocr_model()) is None:
raise RuntimeError(
@@ -140,7 +142,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.legacy_output, conv_res.output = self.glm_model(conv_res)
conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
return conv_res
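
A subtlety worth flagging in the surrounding context: := binds looser than is, so without parentheses "ocr_model := self.get_ocr_model() is None" would assign the boolean comparison to ocr_model rather than the model itself. The parenthesized form assigns the value first, then tests it:

def get_model():
    return None  # simulate a missing OCR model

# Unparenthesized: the walrus captures the comparison result (a bool).
if flag := get_model() is None:
    assert flag is True

# Parenthesized: the walrus captures the value, which is then tested.
if (model := get_model()) is None:
    assert model is None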

View File

@@ -120,7 +120,7 @@ def main():
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter()

View File

@@ -1,3 +1,4 @@
import json
import logging
from pathlib import Path
@@ -38,6 +39,7 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
InputFormat.PDF,
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
@@ -53,12 +55,15 @@
conv_results = doc_converter.convert_batch(input)
for res in conv_results:
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
out_path = Path("./scratch")
print(
f"Document {res.input.file.name} converted with status {res.status}."
f"\nSaved markdown output to: {str(out_path)}"
)
# print(res.experimental.export_to_markdown())
# Export Docling document format to markdown (experimental):
with out_path.open("w") as fp:
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
fp.write(res.output.export_to_markdown())
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
fp.write(json.dumps(res.output.export_to_dict()))
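
Since DoclingDocument is a pydantic model, the JSON written above can plausibly be validated back into a document for a round-trip check. A hedged sketch that would sit inside the loop (assumes pydantic v2's model_validate; this is not part of the example in this commit):

# Round-trip the JSON export back into the model (assumption: pydantic v2).
json_file = out_path / f"{res.input.file.name}.json"
reloaded = type(res.output).model_validate(json.loads(json_file.read_text()))
print(f"Round-trip OK: {len(reloaded.export_to_markdown())} chars of markdown")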

Binary file not shown.