Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction

2025-07-27 04:24:45 +00:00 · 2024-10-11 16:31:28 +02:00 · 2024-10-11 16:31:28 +02:00 · 5b5c99e9da
commit 5b5c99e9da
parent ca2a96d982 d0fccb9342
9 changed files with 131 additions and 67 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -1,5 +1,5 @@
 import logging
-from io import BytesIO, TextIOWrapper
+from io import BytesIO
 from pathlib import Path
 from typing import Set, Union
@ -81,9 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                try:
                    self.analyse_element(element, idx, doc)
                except Exception as exc_child:
                    _log.error(" -> error treating child: ", exc_child)
                    _log.error(" => element: ", element, "\n")
-                    pass
+                    raise exc_child
        except Exception as exc:
            pass
@ -212,10 +213,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def handle_list(self, element, idx, doc):
        """Handles list tags (ul, ol) and their list items."""
-        # create a list group
+        if element.name == "ul":
-        self.parents[self.level + 1] = doc.add_group(
+            # create a list group
-            parent=self.parents[self.level], name="list", label=GroupLabel.LIST
+            self.parents[self.level + 1] = doc.add_group(
-        )
+                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
            )
        elif element.name == "ol":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level],
                name="ordered list",
                label=GroupLabel.ORDERED_LIST,
            )
        self.level += 1
        self.walk(element, doc)
@ -226,13 +235,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def handle_listitem(self, element, idx, doc):
        """Handles listitem tags (li)."""
        nested_lists = element.find(["ul", "ol"])
        parent_list_label = self.parents[self.level].label
        index_in_list = len(self.parents[self.level].children) + 1
        if nested_lists:
            name = element.name
            text = self.get_direct_text(element)
            marker = ""
            enumerated = False
            if parent_list_label == GroupLabel.ORDERED_LIST:
                marker = str(index_in_list)
                enumerated = True
            # create a list-item
-            self.parents[self.level + 1] = doc.add_text(
+            self.parents[self.level + 1] = doc.add_list_item(
-                label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
+                text=text,
                enumerated=enumerated,
                marker=marker,
                parent=self.parents[self.level],
            )
            self.level += 1
@ -244,8 +266,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        elif isinstance(element.text, str):
            text = element.text.strip()
-            doc.add_text(
+            marker = ""
-                label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
+            enumerated = False
            if parent_list_label == GroupLabel.ORDERED_LIST:
                marker = f"{str(index_in_list)}."
                enumerated = True
            doc.add_list_item(
                text=text,
                enumerated=enumerated,
                marker=marker,
                parent=self.parents[self.level],
            )
        else:
            _log.warn("list-item has no text: ", element)
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        try:
            self.pptx_obj = Presentation(self.path_or_stream)
            self.valid = True
-        except Exception:
+        except Exception as e:
            raise RuntimeError(
                f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
            ) from e
@ -134,6 +134,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
            prov = self.generate_prov(shape, slide_ind, shape.text.strip())
            if is_a_list:
                # TODO: determine if this is an unordered list or an ordered list.
                #  Set GroupLabel.ORDERED_LIST when it fits.
                new_list = doc.add_group(
                    label=GroupLabel.LIST, name=f"list", parent=parent_slide
                )
@ -157,9 +159,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                        e_is_a_list_item = True
                    else:
                        e_is_a_list_item = False
                    if e_is_a_list_item:
-                        doc.add_text(
+                        # TODO: Set marker and enumerated arguments if this is an enumeration element.
-                            label=DocItemLabel.LIST_ITEM,
+                        doc.add_list_item(
                            parent=new_list,
                            text=e.text.strip(),
                            prov=prov,
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -312,9 +312,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
            )
-            doc.add_text(
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
-                label=DocItemLabel.LIST_ITEM, parent=self.parents[level], text=text
+            doc.add_list_item(parent=self.parents[level], text=text)
            )
        elif (
            self.prev_numid() == numid and self.prev_indent() < ilevel
@ -323,12 +322,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                self.level_at_new_list + self.prev_indent() + 1,
                self.level_at_new_list + ilevel + 1,
            ):
                # TODO: determine if this is an unordered list or an ordered list.
                #  Set GroupLabel.ORDERED_LIST when it fits.
                self.parents[i] = doc.add_group(
                    label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
                )
-            doc.add_text(
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
-                label=DocItemLabel.LIST_ITEM,
+            doc.add_list_item(
                parent=self.parents[self.level_at_new_list + ilevel],
                text=text,
            )
@ -338,16 +339,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                if k > self.level_at_new_list + ilevel:
                    self.parents[k] = None
-            doc.add_text(
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
-                label=DocItemLabel.LIST_ITEM,
+            doc.add_list_item(
                parent=self.parents[self.level_at_new_list + ilevel],
                text=text,
            )
        elif self.prev_numid() == numid or self.prev_indent() == ilevel:
-            doc.add_text(
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
-                label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text
+            doc.add_list_item(parent=self.parents[level - 1], text=text)
            )
        return
    def handle_tables(self, element, docx_obj, doc):
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -3,7 +3,7 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import Dict, Iterable, List, Optional, Tuple, Union
 import filetype
 from docling_core.types import BaseText
@ -24,10 +24,6 @@ from pydantic import BaseModel
 from typing_extensions import deprecated
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import (
    AssembledUnit,
    ConversionStatus,
@ -372,14 +368,20 @@ class ConversionResult(BaseModel):
        strict_text: bool = False,
        image_placeholder: str = "<!-- image -->",
    ) -> str:
-        return self.legacy_output.export_to_markdown(
+        if self.legacy_output is None:
-            delim=delim,
+            raise RuntimeError(
-            main_text_start=main_text_start,
+                "No legacy output was produced, can not export as markdown. "
-            main_text_stop=main_text_stop,
+                "Please use output.export_to_markdown() instead."
-            main_text_labels=main_text_labels,
+            )
-            strict_text=strict_text,
+        else:
-            image_placeholder=image_placeholder,
+            return self.legacy_output.export_to_markdown(
-        )
+                delim=delim,
                main_text_start=main_text_start,
                main_text_stop=main_text_stop,
                main_text_labels=main_text_labels,
                strict_text=strict_text,
                image_placeholder=image_placeholder,
            )
    @deprecated("Use output.export_to_text() instead.")
    def render_as_text(
@ -394,13 +396,19 @@ class ConversionResult(BaseModel):
            "caption",
        ],
    ) -> str:
-        return self.legacy_output.export_to_markdown(
+        if self.legacy_output is None:
-            delim=delim,
+            raise RuntimeError(
-            main_text_start=main_text_start,
+                "No legacy output was produced, can not export as text. "
-            main_text_stop=main_text_stop,
+                "Please use output.export_to_markdown() instead."
-            main_text_labels=main_text_labels,
+            )
-            strict_text=True,
+        else:
-        )
+            return self.legacy_output.export_to_markdown(
                delim=delim,
                main_text_start=main_text_start,
                main_text_stop=main_text_stop,
                main_text_labels=main_text_labels,
                strict_text=True,
            )
    @deprecated("Use output.export_to_document_tokens() instead.")
    def render_as_doctags(
@ -426,21 +434,27 @@ class ConversionResult(BaseModel):
        add_table_cell_label: bool = True,
        add_table_cell_text: bool = True,
    ) -> str:
-        return self.legacy_output.export_to_document_tokens(
+        if self.legacy_output is None:
-            delim=delim,
+            raise RuntimeError(
-            main_text_start=main_text_start,
+                "No legacy output was produced, can not export as doctags. "
-            main_text_stop=main_text_stop,
+                "Please use output.export_to_markdown() instead."
-            main_text_labels=main_text_labels,
+            )
-            xsize=xsize,
+        else:
-            ysize=ysize,
+            return self.legacy_output.export_to_document_tokens(
-            add_location=add_location,
+                delim=delim,
-            add_content=add_content,
+                main_text_start=main_text_start,
-            add_page_index=add_page_index,
+                main_text_stop=main_text_stop,
-            # table specific flags
+                main_text_labels=main_text_labels,
-            add_table_cell_location=add_table_cell_location,
+                xsize=xsize,
-            add_table_cell_label=add_table_cell_label,
+                ysize=ysize,
-            add_table_cell_text=add_table_cell_text,
+                add_location=add_location,
-        )
+                add_content=add_content,
                add_page_index=add_page_index,
                # table specific flags
                add_table_cell_location=add_table_cell_location,
                add_table_cell_label=add_table_cell_label,
                add_table_cell_text=add_table_cell_text,
            )
    def render_element_images(
        self, element_types: Tuple[PageElement] = (FigureElement,)
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -1,7 +1,6 @@
 import logging
 from typing import Iterable
 import numpy
 from docling_core.types.experimental import BoundingBox, CoordOrigin
 from docling.datamodel.base_models import OcrCell, Page
--- a/docling/pipeline/simple_model_pipeline.py
+++ b/docling/pipeline/simple_model_pipeline.py
@ -1,7 +1,4 @@
 import logging
 from typing import Iterable
 from docling_core.types.experimental import NodeItem
 from docling.backend.abstract_backend import (
    AbstractDocumentBackend,
@ -9,7 +6,7 @@ from docling.backend.abstract_backend import (
 )
 from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.pipeline.base_model_pipeline import AbstractModelPipeline
 _log = logging.getLogger(__name__)
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@ -2,13 +2,10 @@ import json
 import logging
 from pathlib import Path
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.document_converter import (
    DocumentConverter,
    FormatOption,
    PdfFormatOption,
    WordFormatOption,
 )
@ -38,6 +35,7 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
        # InputFormat.IMAGE,
        InputFormat.DOCX,
        InputFormat.HTML,
        InputFormat.PPTX,
    ],  # whitelist formats, other files are ignored.
    format_options={
        InputFormat.PDF: PdfFormatOption(
@ -50,6 +48,30 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
    },
 )
 doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
    pdf=None,
    docx=WordFormatOption(
        pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
    ),
    formats=[
        InputFormat.PDF,
        # InputFormat.IMAGE,
        InputFormat.DOCX,
        InputFormat.HTML,
        InputFormat.PPTX,
    ],  # whitelist formats, other files are ignored.
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
        ),  # PdfFormatOption(backend=PyPdfiumDocumentBackend),
        InputFormat.DOCX: WordFormatOption(
            pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
        ),
        # InputFormat.IMAGE: PdfFormatOption(),
    },
 )
 conv_results = doc_converter.convert_all(input_paths)
 for res in conv_results:
--- a/tests/data/powerpoint_sample.pptx
+++ b/tests/data/powerpoint_sample.pptx
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@ -3,7 +3,6 @@ import warnings
 from pathlib import Path
 from typing import List
 from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
 from docling_core.types.experimental import DoclingDocument
 from pydantic import TypeAdapter