Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction

2025-07-27 04:24:45 +00:00 · 2024-10-11 12:59:11 +02:00 · 2024-10-11 12:59:11 +02:00 · 786b89efd9
commit 786b89efd9
parent c6e1471e02 3ee97c42b2
12 changed files with 80 additions and 37 deletions
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
        if not success:
            raise RuntimeError(
-                f"docling-parse could not load document {document_hash}."
+                f"docling-parse could not load document with hash {document_hash}."
            )
    def page_count(self) -> int:
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        print("About to init HTML backend...")
+        _log.debug("About to init HTML backend...")
        super().__init__(path_or_stream, document_hash)
        self.soup = None
        # HTML file:
@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        try:
            if isinstance(self.path_or_stream, BytesIO):
-                text_stream = byte_stream.getvalue().decode("utf-8")
+                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                print(text_stream)
                self.soup = BeautifulSoup(text_stream, "html.parser")
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    html_content = f.read()
                    self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
-            _log.error("could not parse html: {}".format(e))
+            raise RuntimeError(
-            return doc
+                f"Could not initialize HTML backend for file with hash {document_hash}."
            ) from e
    def is_valid(self) -> bool:
        return True
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def convert(self) -> DoclingDocument:
        # access self.path_or_stream to load stuff
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-        print("Trying to convert HTML...")
+        _log.debug("Trying to convert HTML...")
        # Replace <br> tags with newline characters
        for br in self.soup.body.find_all("br"):
            br.replace_with("\n")
@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def analyse_element(self, element, idx, doc):
        """
        if element.name!=None:
-            print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
+            _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
        """
        if element.name in self.labels:
@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        doc.add_table(data=data, parent=self.parents[self.level])
-    def get_list_text(list_element, level=0):
+    def get_list_text(self, list_element, level=0):
        """Recursively extract text from <ul> or <ol> with proper indentation."""
        result = []
        bullet_char = "*"  # Default bullet character for unordered lists
@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
-                    result.extend(get_list_text(nested_list, level + 1))
+                    result.extend(self.get_list_text(nested_list, level + 1))
        elif list_element.name == "ul":  # For unordered lists, use bullet points
            for li in list_element.find_all("li", recursive=False):
                # Add bullet points for unordered lists
@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
-                    result.extend(get_list_text(nested_list, level + 1))
+                    result.extend(self.get_list_text(nested_list, level + 1))
        return result
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        self.path_or_stream = path_or_stream
        self.pptx_obj = None
-        self.valid = True
+        self.valid = False
        try:
            self.pptx_obj = Presentation(self.path_or_stream)
            self.valid = True
        except Exception:
-            _log.error("could not parse pptx")
+            raise RuntimeError(
-            self.valid = False
+                f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
            ) from e
        return
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        # self.initialise(path_or_stream)
        # Word file:
        self.path_or_stream = path_or_stream
        self.valid = False
        # Initialise the parents for the hierarchy
        self.max_levels = 10
        self.level_at_new_list = None
@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            "indents": [None],
        }
        self.docx_obj = None
        try:
            self.docx_obj = docx.Document(self.path_or_stream)
            self.valid = True
        except Exception as e:
            raise RuntimeError(
                f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
            ) from e
    def is_valid(self) -> bool:
        return True
@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def convert(self) -> DoclingDocument:
        # Parses the DOCX into a structured document model.
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
        docx_obj = None
        try:
            docx_obj = docx.Document(self.path_or_stream)
        except Exception:
            _log.error("could not parse docx")
            return doc
        # self.initialise()
-        doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
+        doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
        return doc
    def update_history(self, name, level, numid, ilevel):
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
            self._pdoc = pdfium.PdfDocument(path_or_stream)
        except PdfiumError as e:
            raise RuntimeError(
-                f"pypdfium could not load document {document_hash}"
+                f"pypdfium could not load document with hash {document_hash}"
            ) from e
    def page_count(self) -> int:
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -1,4 +1,5 @@
 import logging
 import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
@ -171,7 +172,7 @@ class ConvertedDocument(BaseModel):
    pages: List[Page] = []
    assembled: AssembledUnit = AssembledUnit()
-    legacy_output: DsDocument = _EMPTY_LEGACY_DOC
+    legacy_output: Optional[DsDocument] = None  # _EMPTY_LEGACY_DOC
    output: DoclingDocument = _EMPTY_DOCLING_DOC
    def _to_legacy_document(self) -> DsDocument:
@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
                )
    def _guess_format(self, obj):
        content = None
        if isinstance(obj, Path):
            mime = filetype.guess_mime(str(obj))
-        elif isinstance(obj, DocumentStream):
+            if mime is None:
-            mime = filetype.guess_mime(obj.stream.read(8192))
+                with obj.open("rb") as f:
-        if mime is None:
+                    content = f.read(1024)  # Read first 1KB
            # TODO improve this.
-            if obj.suffix == ".html":
+        elif isinstance(obj, DocumentStream):
-                mime = "text/html"
+            obj.stream.seek(0)
            content = obj.stream.read(8192)
            obj.stream.seek(0)
            mime = filetype.guess_mime(content)
        if mime is None:
            mime = self._detect_html_xhtml(content)
        format = MimeTypeToFormat.get(mime)
        return format
    def _detect_html_xhtml(self, content):
        content_str = content.decode("ascii", errors="ignore").lower()
        # Remove XML comments
        content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
        content_str = content_str.lstrip()
        if re.match(r"<\?xml", content_str):
            if "xhtml" in content_str[:1000]:
                return "application/xhtml+xml"
        if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
            return "text/html"
        return None
    @classmethod
    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
        paths = [Path(p) for p in paths]
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions):
    )
-class PipelineOptions(BaseModel): ...
+class PipelineOptions(BaseModel):
    create_legacy_output: bool = (
        True  # This default will be set to False in a future version of docling
    )
 class PdfPipelineOptions(PipelineOptions):
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult
 class GlmModel:
    def __init__(self, config):
        self.config = config
        self.create_legacy_output = config.get("create_legacy_output", True)
        self.model_names = self.config.get(
            "model_names", ""
        )  # "language;term;reference"
@ -42,7 +44,10 @@ class GlmModel:
        )
        docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
-        legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
+        legacy_doc: DsLegacyDocument = None
        if self.create_legacy_output:
            legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
        # DEBUG code:
        def draw_clusters_and_cells(ds_document, page_no):
@ -92,4 +97,4 @@ class GlmModel:
        # draw_clusters_and_cells(ds_doc, 0)
        # draw_clusters_and_cells(exported_doc, 0)
-        return (legacy_doc, docling_doc)
+        return (docling_doc, legacy_doc)
--- a/docling/pipeline/standard_pdf_model_pipeline.py
+++ b/docling/pipeline/standard_pdf_model_pipeline.py
@ -41,7 +41,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
            artifacts_path = self.download_models_hf()
        self.artifacts_path = Path(artifacts_path)
-        self.glm_model = GlmModel(config={})
+        self.glm_model = GlmModel(
            config={"create_legacy_output": pipeline_options.create_legacy_output}
        )
        if ocr_model := self.get_ocr_model() is None:
            raise RuntimeError(
@ -140,7 +142,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
            elements=all_elements, headers=all_headers, body=all_body
        )
-        conv_res.legacy_output, conv_res.output = self.glm_model(conv_res)
+        conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
        return conv_res
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@ -120,7 +120,7 @@ def main():
    ]
    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
-    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
+    # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)
    doc_converter = DocumentConverter()
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@ -1,3 +1,4 @@
 import json
 import logging
 from pathlib import Path
@ -38,6 +39,7 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
        InputFormat.PDF,
        # InputFormat.IMAGE,
        InputFormat.DOCX,
        InputFormat.HTML,
    ],  # whitelist formats, other files are ignored.
    format_options={
        InputFormat.PDF: PdfFormatOption(
@ -53,12 +55,15 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
 conv_results = doc_converter.convert_batch(input)
 for res in conv_results:
-    out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
+    out_path = Path("./scratch")
    print(
        f"Document {res.input.file.name} converted with status {res.status}."
        f"\nSaved markdown output to: {str(out_path)}"
    )
    # print(res.experimental.export_to_markdown())
    # Export Docling document format to markdown (experimental):
-    with out_path.open("w") as fp:
+    with (out_path / f"{res.input.file.name}.md").open("w") as fp:
        fp.write(res.output.export_to_markdown())
    with (out_path / f"{res.input.file.name}.json").open("w") as fp:
        fp.write(json.dumps(res.output.export_to_dict()))
--- a/tests/data/word_sample.docx
+++ b/tests/data/word_sample.docx