feat(backend): add generic options support and HTML image handling modes (#2011)

* feat: add backend options support to document backends Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * feat: enhance document backends with generic backend options and improve HTML image handling Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * Refactor tests for declarativebackend Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(HTML): improve image caption handling and ensure backend options are set correctly Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix: enhance HTML backend image handling and add support for local file paths Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: Add ground truth data for test data Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(HTML): skip loading SVG files in image data handling Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): simplify backend options and address gaps Backend options for DeclarativeDocumentBackend classes and only when necessary. Refactor caption parsing in 'img' elements and remove dummy text. Replace deprecated annotations from Typing library with native types. 
Replace typing annotations according to pydantic guidelines. Add some documentation with pydantic annotations. Fix diff issue with test files. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(html): add tests and fix bugs Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): refactor backend options Move backend option classes to their own module within the datamodel package. Rename 'source_location' to 'source_uri' in HTMLBackendOptions. Rename 'image_fetch' to 'fetch_images' in HTMLBackendOptions. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(markdown): create a class for the markdown backend options Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-10-21 12:52:17 +02:00
parent b66624bfff
commit a30e6a7614
31 changed files with 8088 additions and 7588 deletions
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -1,10 +1,12 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Set, Union
+from typing import TYPE_CHECKING, Union

 from docling_core.types.doc import DoclingDocument

+from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
+
 if TYPE_CHECKING:
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.document import InputDocument
@@ -35,7 +37,7 @@ class AbstractDocumentBackend(ABC):

    @classmethod
    @abstractmethod
-    def supported_formats(cls) -> Set["InputFormat"]:
+    def supported_formats(cls) -> set["InputFormat"]:
        pass


@@ -58,6 +60,20 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
    straight without a recognition pipeline.
    """

+    @abstractmethod
+    def __init__(
+        self,
+        in_doc: "InputDocument",
+        path_or_stream: Union[BytesIO, Path],
+        options: Union[BackendOptions, None] = None,
+    ) -> None:
+        """Initialize the backend and store its options.
+
+        Args:
+            in_doc: The input document this backend is created for.
+            path_or_stream: Path or in-memory stream holding the document.
+            options: Backend options. When omitted, a new
+                ``DeclarativeBackendOptions`` instance is created for this
+                backend.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # Build the default per instance: a default evaluated in the signature
+        # is a single mutable options object shared by every backend.
+        self.options: BackendOptions = (
+            options if options is not None else DeclarativeBackendOptions()
+        )
+
    @abstractmethod
    def convert(self) -> DoclingDocument:
        pass
+
+    @classmethod
+    def get_default_options(cls) -> BackendOptions:
+        """Return a new ``DeclarativeBackendOptions`` built with its defaults."""
+        return DeclarativeBackendOptions()
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@@ -2,7 +2,7 @@ import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Final, Set, Union
+from typing import Final, Union

 from docling_core.types.doc import (
    DocItemLabel,
@@ -27,7 +27,7 @@ DEFAULT_IMAGE_HEIGHT: Final = 128


 class AsciiDocBackend(DeclarativeDocumentBackend):
-    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        self.path_or_stream = path_or_stream
@@ -58,7 +58,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        return

    @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.ASCIIDOC}

    def convert(self) -> DoclingDocument:
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,13 +1,16 @@
+import base64
 import logging
+import os
 import re
-import traceback
+import warnings
 from contextlib import contextmanager
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse

+import requests
 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
 from bs4.element import PreformattedString
 from docling_core.types.doc import (
@@ -17,6 +20,7 @@ from docling_core.types.doc import (
    DocumentOrigin,
    GroupItem,
    GroupLabel,
+    PictureItem,
    RefItem,
    RichTableCell,
    TableCell,
@@ -24,13 +28,18 @@ from docling_core.types.doc import (
    TableItem,
    TextItem,
 )
-from docling_core.types.doc.document import ContentLayer, Formatting, Script
+from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
+from PIL import Image, UnidentifiedImageError
 from pydantic import AnyUrl, BaseModel, ValidationError
 from typing_extensions import override

-from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.abstract_backend import (
+    DeclarativeDocumentBackend,
+)
+from docling.datamodel.backend_options import HTMLBackendOptions
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
+from docling.exceptions import OperationNotAllowed

 _log = logging.getLogger(__name__)

@@ -43,6 +52,7 @@ _BLOCK_TAGS: Final = {
    "details",
    "figure",
    "footer",
+    "img",
    "h1",
    "h2",
    "h3",
@@ -186,11 +196,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
-        original_url: Optional[AnyUrl] = None,
+        options: HTMLBackendOptions = HTMLBackendOptions(),
    ):
-        super().__init__(in_doc, path_or_stream)
+        super().__init__(in_doc, path_or_stream, options)
        self.soup: Optional[Tag] = None
-        self.path_or_stream = path_or_stream
+        self.path_or_stream: Union[BytesIO, Path] = path_or_stream
+        # str(None) would give the truthy string "None"; keep an unset URI as None.
+        src_uri = options.source_uri
+        self.base_path: Optional[str] = str(src_uri) if src_uri is not None else None

        # Initialize the parents for the hierarchy
        self.max_levels = 10
@@ -200,7 +211,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        for i in range(self.max_levels):
            self.parents[i] = None
        self.hyperlink: Union[AnyUrl, Path, None] = None
-        self.original_url = original_url
        self.format_tags: list[str] = []

        try:
@@ -236,6 +246,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.HTML}

+    @classmethod
+    @override
+    def get_default_options(cls) -> HTMLBackendOptions:
+        """Return a new ``HTMLBackendOptions`` built with its field defaults."""
+        return HTMLBackendOptions()
+
    @override
    def convert(self) -> DoclingDocument:
        _log.debug("Starting HTML conversion...")
@@ -261,7 +276,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                content_layer=ContentLayer.FURNITURE,
            )
        # remove script and style tags
-        for tag in self.soup(["script", "style"]):
+        for tag in self.soup(["script", "noscript", "style"]):
            tag.decompose()
        # remove any hidden tag
        for tag in self.soup(hidden=True):
@@ -291,6 +306,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        self._walk(content, doc)
        return doc

+    @staticmethod
+    def _is_remote_url(value: str) -> bool:
+        """Tell whether ``value`` carries one of the remote URL schemes.
+
+        The schemes treated as remote are http, https, ftp, s3 and gs.
+        """
+        remote_schemes = ("http", "https", "ftp", "s3", "gs")
+        scheme = urlparse(value).scheme
+        return scheme in remote_schemes
+
+    def _resolve_relative_path(self, loc: str) -> str:
+        """Resolve a location found in the HTML against the document base path.
+
+        Args:
+            loc: A location taken from an attribute such as ``href`` or ``src``.
+
+        Returns:
+            ``loc`` unchanged when no base path is set or when ``loc`` already
+            carries a URI scheme; otherwise ``loc`` resolved against
+            ``self.base_path``.
+        """
+        abs_loc = loc
+
+        if self.base_path:
+            if loc.startswith("//"):
+                # Protocol-relative URL - default to https
+                abs_loc = "https:" + loc
+            elif len(urlparse(loc).scheme) < 2:
+                # No URI scheme, so the location is relative (a one-letter
+                # "scheme" is taken to be a Windows drive letter). Locations
+                # with a scheme (http:, data:, file:, ftp:, mailto:, ...) are
+                # left untouched: joining them as file-system paths would
+                # prepend the base directory and collapse "//".
+                if HTMLDocumentBackend._is_remote_url(self.base_path):  # remote fetch
+                    abs_loc = urljoin(self.base_path, loc)
+                else:  # local fetch
+                    # For local files, resolve relative to the HTML file location
+                    abs_loc = str(Path(self.base_path).parent / loc)
+
+        _log.debug(f"Resolved location {loc} to {abs_loc}")
+        return abs_loc
+
    @staticmethod
    def group_cell_elements(
        group_name: str,
@@ -520,7 +557,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                if name == "img":
                    flush_buffer()
                    im_ref3 = self._emit_image(node, doc)
-                    added_refs.append(im_ref3)
+                    if im_ref3:
+                        added_refs.append(im_ref3)
                elif name in _FORMAT_TAG_MAP:
                    with self._use_format([name]):
                        wk = self._walk(node, doc)
@@ -669,8 +707,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        else:
            if isinstance(this_href, str) and this_href:
                old_hyperlink = self.hyperlink
-                if self.original_url is not None:
-                    this_href = urljoin(str(self.original_url), str(this_href))
+                this_href = self._resolve_relative_path(this_href)
                # ugly fix for relative links since pydantic does not support them.
                try:
                    new_hyperlink = AnyUrl(this_href)
@@ -837,7 +874,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):
                im_ref = self._emit_image(img_tag, doc)
-                added_ref.append(im_ref)
+                if im_ref:
+                    added_ref.append(im_ref)
        return added_ref

    def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
@@ -1003,7 +1041,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            img_tag = tag.find("img")
            if isinstance(img_tag, Tag):
                im_ref = self._emit_image(img_tag, doc)
-                added_refs.append(im_ref)
+                if im_ref is not None:
+                    added_refs.append(im_ref)

        elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            heading_refs = self._handle_heading(tag, doc)
@@ -1061,7 +1100,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            for img_tag in tag("img"):
                if isinstance(img_tag, Tag):
                    im_ref2 = self._emit_image(tag, doc)
-                    added_refs.append(im_ref2)
+                    if im_ref2 is not None:
+                        added_refs.append(im_ref2)

        elif tag_name in {"pre"}:
            # handle monospace code snippets (pre).
@@ -1092,10 +1132,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                self._walk(tag, doc)
        return added_refs

-    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
+    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
        figure = img_tag.find_parent("figure")
        caption: AnnotatedTextList = AnnotatedTextList()

+        parent = self.parents[self.level]
+
        # check if the figure has a link - this is HACK:
        def get_img_hyperlink(img_tag):
            this_parent = img_tag.parent
@@ -1106,9 +1148,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            return None

        if img_hyperlink := get_img_hyperlink(img_tag):
-            caption.append(
-                AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
-            )
+            img_text = img_tag.get("alt") or ""
+            caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))

        if isinstance(figure, Tag):
            caption_tag = figure.find("figcaption", recursive=False)
@@ -1135,13 +1176,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                hyperlink=caption_anno_text.hyperlink,
            )

+        src_loc: str = self._get_attr_as_string(img_tag, "src")
+        if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
+            # Do not fetch the image, just add a placeholder
+            placeholder: PictureItem = doc.add_picture(
+                caption=caption_item,
+                parent=parent,
+                content_layer=self.content_layer,
+            )
+            return placeholder.get_ref()
+
+        src_loc = self._resolve_relative_path(src_loc)
+        img_ref = self._create_image_ref(src_loc)
+
        docling_pic = doc.add_picture(
+            image=img_ref,
            caption=caption_item,
-            parent=self.parents[self.level],
+            parent=parent,
            content_layer=self.content_layer,
        )
        return docling_pic.get_ref()

+    def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
+        """Load the image at ``src_url`` and wrap it in an ``ImageRef``.
+
+        Loading is best effort: a failure is reported with a warning and
+        ``None`` is returned instead of raising.
+
+        Args:
+            src_url: Resolved image location (remote URL, ``data:`` URI,
+                ``file://`` URI or local path).
+
+        Returns:
+            The image reference, or ``None`` if the image was skipped or could
+            not be loaded.
+        """
+        try:
+            img_data = self._load_image_data(src_url)
+            if img_data:
+                img = Image.open(BytesIO(img_data))
+                return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
+        except (
+            # Covers HTTPError as well as connection errors and timeouts.
+            requests.RequestException,
+            ValidationError,
+            UnidentifiedImageError,
+            OperationNotAllowed,
+            # A local file that cannot be read, or image data PIL fails to read.
+            OSError,
+            TypeError,
+            ValueError,
+        ) as e:
+            warnings.warn(f"Could not process an image from {src_url}: {e}")
+
+        return None
+
+    def _load_image_data(self, src_loc: str) -> Optional[bytes]:
+        """Return the raw bytes of the image located at ``src_loc``.
+
+        Remote URLs are downloaded, ``data:`` URIs are base64-decoded, and
+        anything else is read as a local file (a leading ``file://`` is
+        stripped). Locations ending in ``.svg`` are skipped.
+
+        Args:
+            src_loc: Resolved image location.
+
+        Returns:
+            The image bytes, or ``None`` for a skipped SVG location.
+
+        Raises:
+            OperationNotAllowed: If the required remote or local fetch option
+                is not enabled.
+            requests.RequestException: If the download fails, times out or
+                returns an error status.
+            ValueError: If the local file does not exist or is not readable.
+        """
+        if src_loc.lower().endswith(".svg"):
+            _log.debug(f"Skipping SVG file: {src_loc}")
+            return None
+
+        if HTMLDocumentBackend._is_remote_url(src_loc):
+            if not self.options.enable_remote_fetch:
+                raise OperationNotAllowed(
+                    "Fetching remote resources is only allowed when set explicitly. "
+                    "Set options.enable_remote_fetch=True."
+                )
+            # Bound the request (seconds): without a timeout, requests can wait
+            # indefinitely on an unresponsive server.
+            response = requests.get(src_loc, stream=True, timeout=30)
+            response.raise_for_status()
+            return response.content
+        elif src_loc.startswith("data:"):
+            data = re.sub(r"^data:image/.+;base64,", "", src_loc)
+            return base64.b64decode(data)
+
+        if src_loc.startswith("file://"):
+            src_loc = src_loc[7:]
+
+        if not self.options.enable_local_fetch:
+            raise OperationNotAllowed(
+                "Fetching local resources is only allowed when set explicitly. "
+                "Set options.enable_local_fetch=True."
+            )
+        # add check that file exists and can read
+        if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
+            with open(src_loc, "rb") as f:
+                return f.read()
+        else:
+            raise ValueError("File does not exist or it is not readable.")
+
    @staticmethod
    def get_text(item: PageElement) -> str:
        """Concatenate all child strings of a PageElement.
@@ -1238,3 +1344,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        )

        return int_spans
+
+    @staticmethod
+    def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
+        """Read an attribute of ``tag`` as a single string.
+
+        A missing or empty attribute yields ``default``; an attribute whose
+        value is a list yields the first item of that list.
+        """
+        raw = tag.get(attr)
+        if raw:
+            if isinstance(raw, list):
+                return raw[0]
+            return raw
+        return default
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -24,10 +24,16 @@ from docling_core.types.doc import (
 from docling_core.types.doc.document import Formatting
 from marko import Markdown
 from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
-from typing_extensions import Annotated
+from typing_extensions import Annotated, override

-from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.abstract_backend import (
+    DeclarativeDocumentBackend,
+)
 from docling.backend.html_backend import HTMLDocumentBackend
+from docling.datamodel.backend_options import (
+    HTMLBackendOptions,
+    MarkdownBackendOptions,
+)
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

@@ -88,8 +94,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

        return shortened_text

-    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
-        super().__init__(in_doc, path_or_stream)
+    @override
+    def __init__(
+        self,
+        in_doc: InputDocument,
+        path_or_stream: Union[BytesIO, Path],
+        options: MarkdownBackendOptions = MarkdownBackendOptions(),
+    ):
+        super().__init__(in_doc, path_or_stream, options)

        _log.debug("Starting MarkdownDocumentBackend...")

@@ -580,9 +592,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                    format=InputFormat.HTML,
                    backend=html_backend_cls,
                    filename=self.file.name,
+                    backend_options=self.options,
                )
                html_backend_obj = html_backend_cls(
-                    in_doc=in_doc, path_or_stream=stream
+                    in_doc=in_doc,
+                    path_or_stream=stream,
+                    options=cast(HTMLBackendOptions, self.options),
                )
                doc = html_backend_obj.convert()
        else:
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -1,7 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import Union

 from docling_core.types.doc import (
    BoundingBox,
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        self.path_or_stream = None

    @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.PPTX}

    def convert(self) -> DoclingDocument:
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -3,7 +3,7 @@ import re
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Callable, List, Optional, Union
+from typing import Any, Callable, Optional, Union

 from docling_core.types.doc import (
    DocItemLabel,
@@ -69,7 +69,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        self.numbered_headers: dict[int, int] = {}
        self.equation_bookends: str = "<eq>{EQ}</eq>"
        # Track processed textbox elements to avoid duplication
-        self.processed_textbox_elements: List[int] = []
+        self.processed_textbox_elements: list[int] = []
        self.docx_to_pdf_converter: Optional[Callable] = None
        self.docx_to_pdf_converter_init = False
        self.display_drawingml_warning = True
@@ -726,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        textbox_elements: list,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
        """Process textbox content and add it to the document structure."""
        level = self._get_level()
        # Create a textbox group to contain all text from the textbox
@@ -856,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        element: BaseOxmlElement,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
        paragraph = Paragraph(element, docx_obj)
        paragraph_elements = self._get_paragraph_elements(paragraph)
        text, equations = self._handle_equations_in_text(
@@ -1032,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        curr_level: Optional[int],
        text: str,
        is_numbered_style: bool = False,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
        level = self._get_level()
        if isinstance(curr_level, int):
            if curr_level > level:
@@ -1102,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        marker: str,
        enumerated: bool,
        level: int,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
        # This should not happen by construction
        if not isinstance(self.parents[level], ListGroup):
            return elem_ref
@@ -1148,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        ilevel: int,
        elements: list,
        is_numbered: bool = False,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
        # this method is always called with is_numbered. Numbered lists should be properly addressed.
        if not elements:
            return elem_ref
@@ -1244,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        element: BaseOxmlElement,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
        table: Table = Table(element, docx_obj)
        num_rows = len(table.rows)
        num_cols = len(table.columns)
@@ -1299,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                else:
                    text = text.replace("<eq>", "$").replace("</eq>", "$")

-                provs_in_cell: List[RefItem] = []
+                provs_in_cell: list[RefItem] = []
                _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
                ref_for_rich_cell = provs_in_cell[0]
                rich_table_cell = False

                def group_cell_elements(
-                    group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
+                    group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
                ) -> RefItem:
                    group_element = doc.add_group(
                        label=GroupLabel.UNSPECIFIED,
@@ -1379,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

    def _handle_pictures(
        self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
-    ) -> List[RefItem]:
+    ) -> list[RefItem]:
        def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
            image_data: Optional[bytes] = None
            rId = drawing_blip[0].get(
@@ -1391,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                image_data = image_part.blob  # Get the binary image data
            return image_data

-        elem_ref: List[RefItem] = []
+        elem_ref: list[RefItem] = []
        level = self._get_level()
        # Open the BytesIO object with PIL to create an Image
        image_data: Optional[bytes] = get_docx_image(drawing_blip)
--- a/docling/datamodel/backend_options.py
+++ b/docling/datamodel/backend_options.py
@@ -0,0 +1,53 @@
+from pathlib import PurePath
+from typing import Annotated, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel, Field
+
+
+class BaseBackendOptions(BaseModel):
+    """Common options for all declarative document backends."""
+
+    # Both fetch switches are opt-in: they default to False.
+    enable_remote_fetch: bool = Field(
+        False, description="Enable remote resource fetching."
+    )
+    enable_local_fetch: bool = Field(
+        False, description="Enable local resource fetching."
+    )
+
+
+class DeclarativeBackendOptions(BaseBackendOptions):
+    """Default backend options for a declarative document backend."""
+
+    # Tag used by the ``BackendOptions`` union to tell the option models apart;
+    # excluded from serialization and from the repr.
+    kind: Literal["declarative"] = Field("declarative", exclude=True, repr=False)
+
+
+class HTMLBackendOptions(BaseBackendOptions):
+    """Options specific to the HTML backend.
+
+    This class can be extended to include options specific to HTML processing.
+    """
+
+    # Tag used by the ``BackendOptions`` union to tell the option models apart;
+    # excluded from serialization and from the repr.
+    kind: Literal["html"] = Field("html", exclude=True, repr=False)
+    # Off by default.
+    fetch_images: bool = Field(
+        False,
+        description=(
+            "Whether the backend should access remote or local resources to parse "
+            "images in an HTML document."
+        ),
+    )
+    # Either a URL or a file-system path; unset (None) by default.
+    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
+        None,
+        description=(
+            "The URI that originates the HTML document. If provided, the backend "
+            "will use it to resolve relative paths in the HTML document."
+        ),
+    )
+
+
+class MarkdownBackendOptions(HTMLBackendOptions):
+    """Options specific to the Markdown backend."""
+
+    # Declares no fields of its own: everything, including the "html" value of
+    # ``kind``, is inherited from HTMLBackendOptions.
+
+
+# Union of the option models, discriminated by their ``kind`` field.
+# MarkdownBackendOptions is not listed on its own; it subclasses
+# HTMLBackendOptions and inherits its "html" tag.
+BackendOptions = Annotated[
+    Union[DeclarativeBackendOptions, HTMLBackendOptions], Field(discriminator="kind")
+]
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -8,14 +8,12 @@ from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
    TYPE_CHECKING,
-    Any,
-    Dict,
-    List,
+    Annotated,
    Literal,
    Optional,
-    Set,
    Type,
    Union,
+    cast,
 )

 import filetype
@@ -54,8 +52,10 @@ from typing_extensions import deprecated

 from docling.backend.abstract_backend import (
    AbstractDocumentBackend,
+    DeclarativeDocumentBackend,
    PaginatedDocumentBackend,
 )
+from docling.datamodel.backend_options import BackendOptions
 from docling.datamodel.base_models import (
    AssembledUnit,
    ConfidenceReport,
@@ -74,6 +74,7 @@ from docling.utils.utils import create_file_hash

 if TYPE_CHECKING:
    from docling.datamodel.base_models import BaseFormatOption
+    from docling.document_converter import FormatOption

 _log = logging.getLogger(__name__)

@@ -102,32 +103,58 @@ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")


 class InputDocument(BaseModel):
-    file: PurePath
-    document_hash: str  # = None
-    valid: bool = True
-    limits: DocumentLimits = DocumentLimits()
-    format: InputFormat  # = None
+    """A document as an input of a Docling conversion."""

-    filesize: Optional[int] = None
-    page_count: int = 0
+    file: Annotated[
+        PurePath, Field(description="A path representation the input document.")
+    ]
+    document_hash: Annotated[
+        str,
+        Field(description="A stable hash of the path or stream of the input document."),
+    ]
+    valid: bool = Field(True, description="Whether this is is a valid input document.")
+    backend_options: Optional[BackendOptions] = Field(
+        None, description="Custom options for declarative backends."
+    )
+    limits: DocumentLimits = Field(
+        DocumentLimits(), description="Limits in the input document for the conversion."
+    )
+    format: Annotated[InputFormat, Field(description="The document format.")]

-    _backend: AbstractDocumentBackend  # Internal PDF backend used
+    filesize: Optional[int] = Field(
+        None, description="Size of the input file, in bytes."
+    )
+    page_count: int = Field(0, description="Number of pages in the input document.")
+
+    _backend: AbstractDocumentBackend

    def __init__(
        self,
        path_or_stream: Union[BytesIO, Path],
        format: InputFormat,
        backend: Type[AbstractDocumentBackend],
+        backend_options: Optional[BackendOptions] = None,
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
-    ):
+    ) -> None:
        super().__init__(
-            file="", document_hash="", format=InputFormat.PDF
+            file="",
+            document_hash="",
+            format=InputFormat.PDF,
+            backend_options=backend_options,
        )  # initialize with dummy values
-
        self.limits = limits or DocumentLimits()
        self.format = format

+        # check for backend incompatibilities
+        if issubclass(backend, DeclarativeDocumentBackend) and backend_options:
+            if not issubclass(
+                type(backend_options), type(backend.get_default_options())
+            ):
+                raise ValueError(
+                    "Incompatible types between backend and backend_options arguments."
+                )
+
        try:
            if isinstance(path_or_stream, Path):
                self.file = path_or_stream
@@ -140,7 +167,8 @@ class InputDocument(BaseModel):

            elif isinstance(path_or_stream, BytesIO):
                assert filename is not None, (
-                    "Can't construct InputDocument from stream without providing filename arg."
+                    "Can't construct InputDocument from stream without providing "
+                    "filename arg."
                )
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes
@@ -175,7 +203,8 @@ class InputDocument(BaseModel):
        except RuntimeError as e:
            self.valid = False
            _log.exception(
-                f"An unexpected error occurred while opening the document {self.file.name}",
+                "An unexpected error occurred while opening the document "
+                f"{self.file.name}",
                exc_info=e,
            )
            # raise
@@ -185,7 +214,15 @@ class InputDocument(BaseModel):
        backend: Type[AbstractDocumentBackend],
        path_or_stream: Union[BytesIO, Path],
    ) -> None:
-        self._backend = backend(self, path_or_stream=path_or_stream)
+        if issubclass(backend, DeclarativeDocumentBackend) and self.backend_options:
+            self._backend = backend(
+                self,
+                path_or_stream=path_or_stream,
+                options=self.backend_options,
+            )
+        else:
+            self._backend = backend(self, path_or_stream=path_or_stream)
+
        if not self._backend.is_valid():
            self.valid = False

@@ -199,11 +236,11 @@ class ConversionResult(BaseModel):
    input: InputDocument

    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
-    errors: List[ErrorItem] = []  # structure to keep errors
+    errors: list[ErrorItem] = []  # structure to keep errors

-    pages: List[Page] = []
+    pages: list[Page] = []
    assembled: AssembledUnit = AssembledUnit()
-    timings: Dict[str, ProfilingItem] = {}
+    timings: dict[str, ProfilingItem] = {}
    confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)

    document: DoclingDocument = _EMPTY_DOCLING_DOC
@@ -222,7 +259,7 @@ class _DummyBackend(AbstractDocumentBackend):
        return False

    @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    def supported_formats(cls) -> set[InputFormat]:
        return set()

    @classmethod
@@ -235,7 +272,7 @@ class _DummyBackend(AbstractDocumentBackend):

 class _DocumentConversionInput(BaseModel):
    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
-    headers: Optional[Dict[str, str]] = None
+    headers: Optional[dict[str, str]] = None
    limits: Optional[DocumentLimits] = DocumentLimits()

    def docs(
@@ -250,33 +287,36 @@ class _DocumentConversionInput(BaseModel):
            )
            format = self._guess_format(obj)
            backend: Type[AbstractDocumentBackend]
-            if format not in format_options.keys():
+            backend_options: Optional[BackendOptions] = None
+            if not format or format not in format_options:
                _log.error(
-                    f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
+                    f"Input document {obj.name} with format {format} does not match "
+                    f"any allowed format: ({format_options.keys()})"
                )
                backend = _DummyBackend
            else:
-                backend = format_options[format].backend
+                options = format_options[format]
+                backend = options.backend
+                if "backend_options" in options.model_fields_set:
+                    backend_options = cast("FormatOption", options).backend_options

+            path_or_stream: Union[BytesIO, Path]
            if isinstance(obj, Path):
-                yield InputDocument(
-                    path_or_stream=obj,
-                    format=format,  # type: ignore[arg-type]
-                    filename=obj.name,
-                    limits=self.limits,
-                    backend=backend,
-                )
+                path_or_stream = obj
            elif isinstance(obj, DocumentStream):
-                yield InputDocument(
-                    path_or_stream=obj.stream,
-                    format=format,  # type: ignore[arg-type]
-                    filename=obj.name,
-                    limits=self.limits,
-                    backend=backend,
-                )
+                path_or_stream = obj.stream
            else:
                raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

+            yield InputDocument(
+                path_or_stream=path_or_stream,
+                format=format,  # type: ignore[arg-type]
+                filename=obj.name,
+                limits=self.limits,
+                backend=backend,
+                backend_options=backend_options,
+            )
+
    def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
        content = b""  # empty binary blob
        formats: list[InputFormat] = []
@@ -290,12 +330,13 @@ class _DocumentConversionInput(BaseModel):
                with obj.open("rb") as f:
                    content = f.read(1024)  # Read first 1KB
            if mime is not None and mime.lower() == "application/zip":
+                mime_root = "application/vnd.openxmlformats-officedocument"
                if obj.suffixes[-1].lower() == ".xlsx":
-                    mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                    mime = mime_root + ".spreadsheetml.sheet"
                elif obj.suffixes[-1].lower() == ".docx":
-                    mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                    mime = mime_root + ".wordprocessingml.document"
                elif obj.suffixes[-1].lower() == ".pptx":
-                    mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+                    mime = mime_root + ".presentationml.presentation"

        elif isinstance(obj, DocumentStream):
            content = obj.stream.read(8192)
@@ -310,12 +351,13 @@ class _DocumentConversionInput(BaseModel):
                mime = _DocumentConversionInput._mime_from_extension(ext.lower())
            if mime is not None and mime.lower() == "application/zip":
                objname = obj.name.lower()
+                mime_root = "application/vnd.openxmlformats-officedocument"
                if objname.endswith(".xlsx"):
-                    mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                    mime = mime_root + ".spreadsheetml.sheet"
                elif objname.endswith(".docx"):
-                    mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                    mime = mime_root + ".wordprocessingml.document"
                elif objname.endswith(".pptx"):
-                    mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+                    mime = mime_root + ".presentationml.presentation"

        if mime is not None and mime.lower() == "application/gzip":
            if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -9,11 +9,14 @@ from datetime import datetime
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Optional, Type, Union

-from pydantic import BaseModel, ConfigDict, model_validator, validate_call
+from pydantic import ConfigDict, model_validator, validate_call
+from typing_extensions import Self

-from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+)
 from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.csv_backend import CsvDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@@ -28,6 +31,7 @@ from docling.backend.noop_backend import NoOpBackend
 from docling.backend.webvtt_backend import WebVTTDocumentBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
+from docling.datamodel.backend_options import BackendOptions, HTMLBackendOptions
 from docling.datamodel.base_models import (
    BaseFormatOption,
    ConversionStatus,
@@ -61,11 +65,13 @@ _PIPELINE_CACHE_LOCK = threading.Lock()

 class FormatOption(BaseFormatOption):
    pipeline_cls: Type[BasePipeline]
+    backend_options: Optional[BackendOptions] = None

    @model_validator(mode="after")
-    def set_optional_field_default(self) -> "FormatOption":
+    def set_optional_field_default(self) -> Self:
        if self.pipeline_options is None:
            self.pipeline_options = self.pipeline_cls.get_default_options()
+
        return self


@@ -92,6 +98,7 @@ class PowerpointFormatOption(FormatOption):
 class MarkdownFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
+    backend_options: HTMLBackendOptions = HTMLBackendOptions()


 class AsciiDocFormatOption(FormatOption):
@@ -102,6 +109,7 @@ class AsciiDocFormatOption(FormatOption):
 class HTMLFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
+    backend_options: HTMLBackendOptions = HTMLBackendOptions()


 class PatentUsptoFormatOption(FormatOption):
@@ -150,7 +158,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
            pipeline_cls=SimplePipeline, backend=AsciiDocBackend
        ),
        InputFormat.HTML: FormatOption(
-            pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+            pipeline_cls=SimplePipeline,
+            backend=HTMLDocumentBackend,
+            backend_options=HTMLBackendOptions(),
        ),
        InputFormat.XML_USPTO: FormatOption(
            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
@@ -186,13 +196,13 @@ class DocumentConverter:

    def __init__(
        self,
-        allowed_formats: Optional[List[InputFormat]] = None,
-        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
+        allowed_formats: Optional[list[InputFormat]] = None,
+        format_options: Optional[dict[InputFormat, FormatOption]] = None,
    ):
        self.allowed_formats = (
            allowed_formats if allowed_formats is not None else list(InputFormat)
        )
-        self.format_to_options: Dict[InputFormat, FormatOption] = {
+        self.format_to_options: dict[InputFormat, FormatOption] = {
            format: (
                _get_default_option(format=format)
                if (custom_option := (format_options or {}).get(format)) is None
@@ -200,8 +210,8 @@ class DocumentConverter:
            )
            for format in self.allowed_formats
        }
-        self.initialized_pipelines: Dict[
-            Tuple[Type[BasePipeline], str], BasePipeline
+        self.initialized_pipelines: dict[
+            tuple[Type[BasePipeline], str], BasePipeline
        ] = {}

    def _get_initialized_pipelines(
@@ -228,7 +238,7 @@ class DocumentConverter:
    def convert(
        self,
        source: Union[Path, str, DocumentStream],  # TODO review naming
-        headers: Optional[Dict[str, str]] = None,
+        headers: Optional[dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
@@ -248,7 +258,7 @@ class DocumentConverter:
    def convert_all(
        self,
        source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
-        headers: Optional[Dict[str, str]] = None,
+        headers: Optional[dict[str, str]] = None,
        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
--- a/docling/document_extractor.py
+++ b/docling/document_extractor.py
@@ -8,9 +8,10 @@ from collections.abc import Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Optional, Type, Union

 from pydantic import ConfigDict, model_validator, validate_call
+from typing_extensions import Self

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -56,7 +57,7 @@ class ExtractionFormatOption(BaseFormatOption):
    pipeline_cls: Type[BaseExtractionPipeline]

    @model_validator(mode="after")
-    def set_optional_field_default(self) -> "ExtractionFormatOption":
+    def set_optional_field_default(self) -> Self:
        if self.pipeline_options is None:
            # `get_default_options` comes from BaseExtractionPipeline
            self.pipeline_options = self.pipeline_cls.get_default_options()  # type: ignore[assignment]
@@ -70,7 +71,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
    the VLM extractor. This duplication will be removed when we deduplicate
    the format registry between convert/extract.
    """
-    format_to_default_backend: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
+    format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
        InputFormat.IMAGE: PyPdfiumDocumentBackend,
        InputFormat.PDF: PyPdfiumDocumentBackend,
    }
@@ -98,24 +99,24 @@ class DocumentExtractor:

    def __init__(
        self,
-        allowed_formats: Optional[List[InputFormat]] = None,
+        allowed_formats: Optional[list[InputFormat]] = None,
        extraction_format_options: Optional[
-            Dict[InputFormat, ExtractionFormatOption]
+            dict[InputFormat, ExtractionFormatOption]
        ] = None,
    ) -> None:
-        self.allowed_formats: List[InputFormat] = (
+        self.allowed_formats: list[InputFormat] = (
            allowed_formats if allowed_formats is not None else list(InputFormat)
        )
        # Build per-format options with defaults, then apply any user overrides
        overrides = extraction_format_options or {}
-        self.extraction_format_to_options: Dict[InputFormat, ExtractionFormatOption] = {
+        self.extraction_format_to_options: dict[InputFormat, ExtractionFormatOption] = {
            fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
            for fmt in self.allowed_formats
        }

        # Cache pipelines by (class, options-hash)
-        self._initialized_pipelines: Dict[
-            Tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
+        self._initialized_pipelines: dict[
+            tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
        ] = {}

    # ---------------------------- Public API ---------------------------------
@@ -125,7 +126,7 @@ class DocumentExtractor:
        self,
        source: Union[Path, str, DocumentStream],
        template: ExtractionTemplateType,
-        headers: Optional[Dict[str, str]] = None,
+        headers: Optional[dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
@@ -147,7 +148,7 @@ class DocumentExtractor:
        self,
        source: Iterable[Union[Path, str, DocumentStream]],
        template: ExtractionTemplateType,
-        headers: Optional[Dict[str, str]] = None,
+        headers: Optional[dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
--- a/docling/models/readingorder_model.py
+++ b/docling/models/readingorder_model.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Dict, List

 from docling_core.types.doc import (
    DocItemLabel,
@@ -48,8 +47,8 @@ class ReadingOrderModel:

    def _assembled_to_readingorder_elements(
        self, conv_res: ConversionResult
-    ) -> List[ReadingOrderPageElement]:
-        elements: List[ReadingOrderPageElement] = []
+    ) -> list[ReadingOrderPageElement]:
+        elements: list[ReadingOrderPageElement] = []
        page_no_to_pages = {p.page_no: p for p in conv_res.pages}

        for element in conv_res.assembled.elements:
@@ -123,10 +122,10 @@ class ReadingOrderModel:
    def _readingorder_elements_to_docling_doc(
        self,
        conv_res: ConversionResult,
-        ro_elements: List[ReadingOrderPageElement],
-        el_to_captions_mapping: Dict[int, List[int]],
-        el_to_footnotes_mapping: Dict[int, List[int]],
-        el_merges_mapping: Dict[int, List[int]],
+        ro_elements: list[ReadingOrderPageElement],
+        el_to_captions_mapping: dict[int, list[int]],
+        el_to_footnotes_mapping: dict[int, list[int]],
+        el_merges_mapping: dict[int, list[int]],
    ) -> DoclingDocument:
        id_to_elem = {
            RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
--- a/docling/utils/api_image_request.py
+++ b/docling/utils/api_image_request.py
@@ -2,7 +2,7 @@ import base64
 import json
 import logging
 from io import BytesIO
-from typing import Dict, List, Optional
+from typing import Optional

 import requests
 from PIL import Image
@@ -19,7 +19,7 @@ def api_image_request(
    prompt: str,
    url: AnyUrl,
    timeout: float = 20,
-    headers: Optional[Dict[str, str]] = None,
+    headers: Optional[dict[str, str]] = None,
    **params,
 ) -> str:
    img_io = BytesIO()
@@ -69,8 +69,8 @@ def api_image_request_streaming(
    url: AnyUrl,
    *,
    timeout: float = 20,
-    headers: Optional[Dict[str, str]] = None,
-    generation_stoppers: List[GenerationStopper] = [],
+    headers: Optional[dict[str, str]] = None,
+    generation_stoppers: list[GenerationStopper] = [],
    **params,
 ) -> str:
    """
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -2,7 +2,6 @@ import bisect
 import logging
 import sys
 from collections import defaultdict
-from typing import Dict, List, Set, Tuple

 from docling_core.types.doc import DocItemLabel, Size
 from docling_core.types.doc.page import TextCell
@@ -39,7 +38,7 @@ class UnionFind:
            self.parent[root_y] = root_x
            self.rank[root_x] += 1

-    def get_groups(self) -> Dict[int, List[int]]:
+    def get_groups(self) -> dict[int, list[int]]:
        """Returns groups as {root: [elements]}."""
        groups = defaultdict(list)
        for elem in self.parent:
@@ -50,13 +49,13 @@ class UnionFind:
 class SpatialClusterIndex:
    """Efficient spatial indexing for clusters using R-tree and interval trees."""

-    def __init__(self, clusters: List[Cluster]):
+    def __init__(self, clusters: list[Cluster]):
        p = index.Property()
        p.dimension = 2
        self.spatial_index = index.Index(properties=p)
        self.x_intervals = IntervalTree()
        self.y_intervals = IntervalTree()
-        self.clusters_by_id: Dict[int, Cluster] = {}
+        self.clusters_by_id: dict[int, Cluster] = {}

        for cluster in clusters:
            self.add_cluster(cluster)
@@ -72,7 +71,7 @@ class SpatialClusterIndex:
        self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())
        del self.clusters_by_id[cluster.id]

-    def find_candidates(self, bbox: BoundingBox) -> Set[int]:
+    def find_candidates(self, bbox: BoundingBox) -> set[int]:
        """Find potential overlapping cluster IDs using all indexes."""
        spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
        x_candidates = self.x_intervals.find_containing(
@@ -123,13 +122,13 @@ class IntervalTree:
    """Memory-efficient interval tree for 1D overlap queries."""

    def __init__(self):
-        self.intervals: List[Interval] = []  # Sorted by min_val
+        self.intervals: list[Interval] = []  # Sorted by min_val

    def insert(self, min_val: float, max_val: float, id: int):
        interval = Interval(min_val, max_val, id)
        bisect.insort(self.intervals, interval)

-    def find_containing(self, point: float) -> Set[int]:
+    def find_containing(self, point: float) -> set[int]:
        """Find all intervals containing the point."""
        pos = bisect.bisect_left(self.intervals, point)
        result = set()
@@ -196,7 +195,7 @@ class LayoutPostprocessor:
    }

    def __init__(
-        self, page: Page, clusters: List[Cluster], options: LayoutOptions
+        self, page: Page, clusters: list[Cluster], options: LayoutOptions
    ) -> None:
        """Initialize processor with page and clusters."""

@@ -219,7 +218,7 @@ class LayoutPostprocessor:
            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
        )

-    def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
+    def postprocess(self) -> tuple[list[Cluster], list[TextCell]]:
        """Main processing pipeline."""
        self.regular_clusters = self._process_regular_clusters()
        self.special_clusters = self._process_special_clusters()
@@ -254,7 +253,7 @@ class LayoutPostprocessor:

        return final_clusters, self.cells

-    def _process_regular_clusters(self) -> List[Cluster]:
+    def _process_regular_clusters(self) -> list[Cluster]:
        """Process regular clusters with iterative refinement."""
        clusters = [
            c
@@ -311,7 +310,7 @@ class LayoutPostprocessor:

        return clusters

-    def _process_special_clusters(self) -> List[Cluster]:
+    def _process_special_clusters(self) -> list[Cluster]:
        special_clusters = [
            c
            for c in self.special_clusters
@@ -381,7 +380,7 @@ class LayoutPostprocessor:

        return picture_clusters + wrapper_clusters

-    def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
+    def _handle_cross_type_overlaps(self, special_clusters) -> list[Cluster]:
        """Handle overlaps between regular and wrapper clusters before child assignment.

        In particular, KEY_VALUE_REGION proposals that are almost identical to a TABLE
@@ -454,7 +453,7 @@ class LayoutPostprocessor:

    def _select_best_cluster_from_group(
        self,
-        group_clusters: List[Cluster],
+        group_clusters: list[Cluster],
        params: dict,
    ) -> Cluster:
        """Select best cluster from a group of overlapping clusters based on all rules."""
@@ -487,11 +486,11 @@ class LayoutPostprocessor:

    def _remove_overlapping_clusters(
        self,
-        clusters: List[Cluster],
+        clusters: list[Cluster],
        cluster_type: str,
        overlap_threshold: float = 0.8,
        containment_threshold: float = 0.8,
-    ) -> List[Cluster]:
+    ) -> list[Cluster]:
        if not clusters:
            return []

@@ -544,7 +543,7 @@ class LayoutPostprocessor:

    def _select_best_cluster(
        self,
-        clusters: List[Cluster],
+        clusters: list[Cluster],
        area_threshold: float,
        conf_threshold: float,
    ) -> Cluster:
@@ -572,7 +571,7 @@ class LayoutPostprocessor:

        return current_best if current_best else clusters[0]

-    def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
+    def _deduplicate_cells(self, cells: list[TextCell]) -> list[TextCell]:
        """Ensure each cell appears only once, maintaining order of first appearance."""
        seen_ids = set()
        unique_cells = []
@@ -583,8 +582,8 @@ class LayoutPostprocessor:
        return unique_cells

    def _assign_cells_to_clusters(
-        self, clusters: List[Cluster], min_overlap: float = 0.2
-    ) -> List[Cluster]:
+        self, clusters: list[Cluster], min_overlap: float = 0.2
+    ) -> list[Cluster]:
        """Assign cells to best overlapping cluster."""
        for cluster in clusters:
            cluster.cells = []
@@ -616,7 +615,7 @@ class LayoutPostprocessor:

        return clusters

-    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
+    def _find_unassigned_cells(self, clusters: list[Cluster]) -> list[TextCell]:
        """Find cells not assigned to any cluster."""
        assigned = {cell.index for cluster in clusters for cell in cluster.cells}
        return [
@@ -625,7 +624,7 @@ class LayoutPostprocessor:
            if cell.index not in assigned and cell.text.strip()
        ]

-    def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
+    def _adjust_cluster_bboxes(self, clusters: list[Cluster]) -> list[Cluster]:
        """Adjust cluster bounding boxes to contain their cells."""
        for cluster in clusters:
            if not cluster.cells:
@@ -651,13 +650,13 @@ class LayoutPostprocessor:

        return clusters

-    def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
+    def _sort_cells(self, cells: list[TextCell]) -> list[TextCell]:
        """Sort cells in native reading order."""
        return sorted(cells, key=lambda c: (c.index))

    def _sort_clusters(
-        self, clusters: List[Cluster], mode: str = "id"
-    ) -> List[Cluster]:
+        self, clusters: list[Cluster], mode: str = "id"
+    ) -> list[Cluster]:
        """Sort clusters in reading order (top-to-bottom, left-to-right)."""
        if mode == "id":  # sort in the order the cells are printed in the PDF.
            return sorted(
--- a/tests/data/groundtruth/docling_v2/example_01_images.html.json
+++ b/tests/data/groundtruth/docling_v2/example_01_images.html.json
--- a/tests/data/groundtruth/docling_v2/example_01_images.html.md
+++ b/tests/data/groundtruth/docling_v2/example_01_images.html.md
@@ -0,0 +1,20 @@
+# Introduction
+
+This is the first paragraph of the introduction.
+
+## Background
+
+Some background information here.
+
+Example image
+
+<!-- image -->
+
+- First item in unordered list
+- Second item in unordered list
+
+1. First item in ordered list
+2. Second item in ordered list
+
+42. First item in ordered list with start
+43. Second item in ordered list with start
--- a/tests/data/groundtruth/docling_v2/example_09.html.itxt
+++ b/tests/data/groundtruth/docling_v2/example_09.html.itxt
@@ -1,36 +0,0 @@
-item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: title: Introduction to parsing HTML files with Docling
-    item-2 at level 2: picture
-      item-2 at level 3: caption: Docling
-    item-3 at level 2: text: Docling simplifies document proc ... ntegrations with the gen AI ecosystem.
-    item-4 at level 2: section_header: Supported file formats
-      item-5 at level 3: text: Docling supports multiple file formats..
-      item-6 at level 3: list: group list
-        item-7 at level 4: list_item: Advanced PDF understanding
-        item-8 at level 4: picture
-          item-8 at level 5: caption: PDF
-        item-9 at level 4: list_item: Microsoft Office DOCX
-        item-10 at level 4: picture
-          item-10 at level 5: caption: DOCX
-        item-11 at level 4: list_item: HTML files (with optional support for images)
-        item-12 at level 4: picture
-          item-12 at level 5: caption: HTML
-      item-13 at level 3: section_header: Three backends for handling HTML files
-        item-14 at level 4: text: Docling has three backends for parsing HTML files:
-        item-15 at level 4: list: group ordered list
-          item-16 at level 5: list_item: 
-            item-17 at level 6: inline: group group
-              item-18 at level 7: text: HTMLDocumentBackend
-              item-19 at level 7: text: Ignores images
-          item-20 at level 5: list_item: 
-            item-21 at level 6: inline: group group
-              item-22 at level 7: text: HTMLDocumentBackendImagesInline
-              item-23 at level 7: text: Extracts images inline
-          item-24 at level 5: list_item: 
-            item-25 at level 6: inline: group group
-              item-26 at level 7: text: HTMLDocumentBackendImagesReferenced
-              item-27 at level 7: text: Extracts images as references
-  item-28 at level 1: caption: Docling
-  item-29 at level 1: caption: PDF
-  item-30 at level 1: caption: DOCX
-  item-31 at level 1: caption: HTML
--- a/tests/data/groundtruth/docling_v2/example_09.html.json
+++ b/tests/data/groundtruth/docling_v2/example_09.html.json
@@ -1,560 +0,0 @@
-{
-  "schema_name": "DoclingDocument",
-  "version": "1.7.0",
-  "name": "example_09",
-  "origin": {
-    "mimetype": "text/html",
-    "binary_hash": 6785336133244366107,
-    "filename": "example_09.html"
-  },
-  "furniture": {
-    "self_ref": "#/furniture",
-    "children": [],
-    "content_layer": "furniture",
-    "name": "_root_",
-    "label": "unspecified"
-  },
-  "body": {
-    "self_ref": "#/body",
-    "children": [
-      {
-        "$ref": "#/texts/0"
-      },
-      {
-        "$ref": "#/texts/1"
-      },
-      {
-        "$ref": "#/texts/6"
-      },
-      {
-        "$ref": "#/texts/8"
-      },
-      {
-        "$ref": "#/texts/10"
-      }
-    ],
-    "content_layer": "body",
-    "name": "_root_",
-    "label": "unspecified"
-  },
-  "groups": [
-    {
-      "self_ref": "#/groups/0",
-      "parent": {
-        "$ref": "#/texts/3"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/5"
-        },
-        {
-          "$ref": "#/pictures/1"
-        },
-        {
-          "$ref": "#/texts/7"
-        },
-        {
-          "$ref": "#/pictures/2"
-        },
-        {
-          "$ref": "#/texts/9"
-        },
-        {
-          "$ref": "#/pictures/3"
-        }
-      ],
-      "content_layer": "body",
-      "name": "list",
-      "label": "list"
-    },
-    {
-      "self_ref": "#/groups/1",
-      "parent": {
-        "$ref": "#/texts/11"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/13"
-        },
-        {
-          "$ref": "#/texts/16"
-        },
-        {
-          "$ref": "#/texts/19"
-        }
-      ],
-      "content_layer": "body",
-      "name": "ordered list",
-      "label": "list"
-    },
-    {
-      "self_ref": "#/groups/2",
-      "parent": {
-        "$ref": "#/texts/13"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/14"
-        },
-        {
-          "$ref": "#/texts/15"
-        }
-      ],
-      "content_layer": "body",
-      "name": "group",
-      "label": "inline"
-    },
-    {
-      "self_ref": "#/groups/3",
-      "parent": {
-        "$ref": "#/texts/16"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/17"
-        },
-        {
-          "$ref": "#/texts/18"
-        }
-      ],
-      "content_layer": "body",
-      "name": "group",
-      "label": "inline"
-    },
-    {
-      "self_ref": "#/groups/4",
-      "parent": {
-        "$ref": "#/texts/19"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/20"
-        },
-        {
-          "$ref": "#/texts/21"
-        }
-      ],
-      "content_layer": "body",
-      "name": "group",
-      "label": "inline"
-    }
-  ],
-  "texts": [
-    {
-      "self_ref": "#/texts/0",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [
-        {
-          "$ref": "#/pictures/0"
-        },
-        {
-          "$ref": "#/texts/2"
-        },
-        {
-          "$ref": "#/texts/3"
-        }
-      ],
-      "content_layer": "body",
-      "label": "title",
-      "prov": [],
-      "orig": "Introduction to parsing HTML files with Docling",
-      "text": "Introduction to parsing HTML files with Docling"
-    },
-    {
-      "self_ref": "#/texts/1",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "caption",
-      "prov": [],
-      "orig": "Docling",
-      "text": "Docling"
-    },
-    {
-      "self_ref": "#/texts/2",
-      "parent": {
-        "$ref": "#/texts/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem.",
-      "text": "Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem."
-    },
-    {
-      "self_ref": "#/texts/3",
-      "parent": {
-        "$ref": "#/texts/0"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/4"
-        },
-        {
-          "$ref": "#/groups/0"
-        },
-        {
-          "$ref": "#/texts/11"
-        }
-      ],
-      "content_layer": "body",
-      "label": "section_header",
-      "prov": [],
-      "orig": "Supported file formats",
-      "text": "Supported file formats",
-      "level": 1
-    },
-    {
-      "self_ref": "#/texts/4",
-      "parent": {
-        "$ref": "#/texts/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Docling supports multiple file formats..",
-      "text": "Docling supports multiple file formats.."
-    },
-    {
-      "self_ref": "#/texts/5",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "Advanced PDF understanding",
-      "text": "Advanced PDF understanding",
-      "enumerated": false,
-      "marker": ""
-    },
-    {
-      "self_ref": "#/texts/6",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "caption",
-      "prov": [],
-      "orig": "PDF",
-      "text": "PDF"
-    },
-    {
-      "self_ref": "#/texts/7",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "Microsoft Office DOCX",
-      "text": "Microsoft Office DOCX",
-      "enumerated": false,
-      "marker": ""
-    },
-    {
-      "self_ref": "#/texts/8",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "caption",
-      "prov": [],
-      "orig": "DOCX",
-      "text": "DOCX"
-    },
-    {
-      "self_ref": "#/texts/9",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "HTML files (with optional support for images)",
-      "text": "HTML files (with optional support for images)",
-      "enumerated": false,
-      "marker": ""
-    },
-    {
-      "self_ref": "#/texts/10",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "caption",
-      "prov": [],
-      "orig": "HTML",
-      "text": "HTML"
-    },
-    {
-      "self_ref": "#/texts/11",
-      "parent": {
-        "$ref": "#/texts/3"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/12"
-        },
-        {
-          "$ref": "#/groups/1"
-        }
-      ],
-      "content_layer": "body",
-      "label": "section_header",
-      "prov": [],
-      "orig": "Three backends for handling HTML files",
-      "text": "Three backends for handling HTML files",
-      "level": 2
-    },
-    {
-      "self_ref": "#/texts/12",
-      "parent": {
-        "$ref": "#/texts/11"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Docling has three backends for parsing HTML files:",
-      "text": "Docling has three backends for parsing HTML files:"
-    },
-    {
-      "self_ref": "#/texts/13",
-      "parent": {
-        "$ref": "#/groups/1"
-      },
-      "children": [
-        {
-          "$ref": "#/groups/2"
-        }
-      ],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "",
-      "text": "",
-      "enumerated": true,
-      "marker": ""
-    },
-    {
-      "self_ref": "#/texts/14",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "HTMLDocumentBackend",
-      "text": "HTMLDocumentBackend",
-      "formatting": {
-        "bold": true,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/15",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Ignores images",
-      "text": "Ignores images"
-    },
-    {
-      "self_ref": "#/texts/16",
-      "parent": {
-        "$ref": "#/groups/1"
-      },
-      "children": [
-        {
-          "$ref": "#/groups/3"
-        }
-      ],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "",
-      "text": "",
-      "enumerated": true,
-      "marker": ""
-    },
-    {
-      "self_ref": "#/texts/17",
-      "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "HTMLDocumentBackendImagesInline",
-      "text": "HTMLDocumentBackendImagesInline",
-      "formatting": {
-        "bold": true,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/18",
-      "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Extracts images inline",
-      "text": "Extracts images inline"
-    },
-    {
-      "self_ref": "#/texts/19",
-      "parent": {
-        "$ref": "#/groups/1"
-      },
-      "children": [
-        {
-          "$ref": "#/groups/4"
-        }
-      ],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "",
-      "text": "",
-      "enumerated": true,
-      "marker": ""
-    },
-    {
-      "self_ref": "#/texts/20",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "HTMLDocumentBackendImagesReferenced",
-      "text": "HTMLDocumentBackendImagesReferenced",
-      "formatting": {
-        "bold": true,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/21",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Extracts images as references",
-      "text": "Extracts images as references"
-    }
-  ],
-  "pictures": [
-    {
-      "self_ref": "#/pictures/0",
-      "parent": {
-        "$ref": "#/texts/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "picture",
-      "prov": [],
-      "captions": [
-        {
-          "$ref": "#/texts/1"
-        }
-      ],
-      "references": [],
-      "footnotes": [],
-      "annotations": []
-    },
-    {
-      "self_ref": "#/pictures/1",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "picture",
-      "prov": [],
-      "captions": [
-        {
-          "$ref": "#/texts/6"
-        }
-      ],
-      "references": [],
-      "footnotes": [],
-      "annotations": []
-    },
-    {
-      "self_ref": "#/pictures/2",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "picture",
-      "prov": [],
-      "captions": [
-        {
-          "$ref": "#/texts/8"
-        }
-      ],
-      "references": [],
-      "footnotes": [],
-      "annotations": []
-    },
-    {
-      "self_ref": "#/pictures/3",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "picture",
-      "prov": [],
-      "captions": [
-        {
-          "$ref": "#/texts/10"
-        }
-      ],
-      "references": [],
-      "footnotes": [],
-      "annotations": []
-    }
-  ],
-  "tables": [],
-  "key_value_items": [],
-  "form_items": [],
-  "pages": {}
-}
--- a/tests/data/groundtruth/docling_v2/example_09.html.md
+++ b/tests/data/groundtruth/docling_v2/example_09.html.md
@@ -1,32 +0,0 @@
-# Introduction to parsing HTML files with Docling
-
-Docling
-
-<!-- image -->
-
-Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem.
-
-## Supported file formats
-
-Docling supports multiple file formats..
-
- Advanced PDF understanding
-PDF
-
-<!-- image -->
- Microsoft Office DOCX
-DOCX
-
-<!-- image -->
- HTML files (with optional support for images)
-HTML
-
-<!-- image -->
-
-### Three backends for handling HTML files
-
-Docling has three backends for parsing HTML files:
-
-1. **HTMLDocumentBackend** Ignores images
-2. **HTMLDocumentBackendImagesInline** Extracts images inline
-3. **HTMLDocumentBackendImagesReferenced** Extracts images as references
--- a/tests/data/groundtruth/docling_v2/hyperlink_02.html.json
+++ b/tests/data/groundtruth/docling_v2/hyperlink_02.html.json
@@ -17,6 +17,12 @@
  "body": {
    "self_ref": "#/body",
    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/pictures/0"
+      },
      {
        "$ref": "#/groups/0"
      }
@@ -33,7 +39,7 @@
      },
      "children": [
        {
-          "$ref": "#/texts/0"
+          "$ref": "#/texts/1"
        }
      ],
      "content_layer": "body",
@@ -44,6 +50,18 @@
  "texts": [
    {
      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "furniture",
+      "label": "caption",
+      "prov": [],
+      "orig": "Image alt text",
+      "text": "Image alt text"
+    },
+    {
+      "self_ref": "#/texts/1",
      "parent": {
        "$ref": "#/groups/0"
      },
@@ -57,7 +75,26 @@
      "level": 1
    }
  ],
-  "pictures": [],
+  "pictures": [
+    {
+      "self_ref": "#/pictures/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "furniture",
+      "label": "picture",
+      "prov": [],
+      "captions": [
+        {
+          "$ref": "#/texts/0"
+        }
+      ],
+      "references": [],
+      "footnotes": [],
+      "annotations": []
+    }
+  ],
  "tables": [],
  "key_value_items": [],
  "form_items": [],
--- a/tests/data/groundtruth/docling_v2/hyperlink_05.html.itxt
+++ b/tests/data/groundtruth/docling_v2/hyperlink_05.html.itxt
@@ -1,7 +1,7 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: caption: Image Hyperlink.
+  item-1 at level 1: caption: Clickable Example
  item-2 at level 1: picture
-    item-2 at level 2: caption: Image Hyperlink.
+    item-2 at level 2: caption: Clickable Example
  item-3 at level 1: caption: This is an example caption for the image.
  item-4 at level 1: picture
    item-4 at level 2: caption: This is an example caption for the image.
--- a/tests/data/groundtruth/docling_v2/hyperlink_05.html.json
+++ b/tests/data/groundtruth/docling_v2/hyperlink_05.html.json
@@ -66,8 +66,8 @@
      "content_layer": "body",
      "label": "caption",
      "prov": [],
-      "orig": "Image Hyperlink.",
-      "text": "Image Hyperlink.",
+      "orig": "Clickable Example",
+      "text": "Clickable Example",
      "hyperlink": "https://www.example.com/"
    },
    {
--- a/tests/data/groundtruth/docling_v2/hyperlink_05.html.md
+++ b/tests/data/groundtruth/docling_v2/hyperlink_05.html.md
@@ -1,4 +1,4 @@
-Image Hyperlink.
+Clickable Example

 <!-- image -->

--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
--- a/tests/data/html/example_01.html
+++ b/tests/data/html/example_01.html
@@ -4,7 +4,7 @@
        <p>This is the first paragraph of the introduction.</p>
        <h2>Background</h2>
        <p>Some background information here.</p>
-        <img src="image1.png" alt="Example image"/>
+        <img src="example_image_01.png" alt="Example image"/>
        <ul>
            <li>First item in unordered list</li>
            <li>Second item in unordered list</li>
--- a/tests/data/html/example_09.html
+++ b/tests/data/html/example_09.html
@@ -1,21 +0,0 @@
-<html>
-    <body>
-        <h1>Introduction to parsing HTML files with <img src="https://docling-project.github.io/docling/assets/logo.png" alt="Docling" height="64"> Docling</h1>
-        <p>Docling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem.</p>
-        <h2>Supported file formats</h2>
-        <p>Docling supports multiple file formats..</p>
-        <ul>
-            <li><img src="https://github.com/docling-project/docling/tree/main/docs/assets/pdf.png" height="32" alt="PDF">Advanced PDF understanding</li>
-            <li><img src="https://github.com/docling-project/docling/tree/main/docs/assets/docx.png" height="32" alt="DOCX">Microsoft Office DOCX</li>
-            <li><img src="https://github.com/docling-project/docling/tree/main/docs/assets/html.png" height="32" alt="HTML">HTML files (with optional support for images)</li>
-        </ul>
-        <h3>Three backends for handling HTML files</h3>
-        <p>Docling has three backends for parsing HTML files:</p>
-        <ol>
-            <li><b>HTMLDocumentBackend</b> Ignores images</li>
-            <li><b>HTMLDocumentBackendImagesInline</b> Extracts images inline</li>
-            <li><b>HTMLDocumentBackendImagesReferenced</b> Extracts images as references</li>
-        </ol>
-    </body>
-</html>
-
--- a/tests/data/html/example_image_01.png
+++ b/tests/data/html/example_image_01.png
--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -1,9 +1,14 @@
 from io import BytesIO
-from pathlib import Path
+from pathlib import Path, PurePath
+from unittest.mock import Mock, mock_open, patch

+import pytest
+from docling_core.types.doc import PictureItem
 from docling_core.types.doc.document import ContentLayer
+from pydantic import AnyUrl, ValidationError

 from docling.backend.html_backend import HTMLDocumentBackend
+from docling.datamodel.backend_options import HTMLBackendOptions
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import (
    ConversionResult,
@@ -11,7 +16,7 @@ from docling.datamodel.document import (
    InputDocument,
    SectionHeaderItem,
 )
-from docling.document_converter import DocumentConverter
+from docling.document_converter import DocumentConverter, HTMLFormatOption

 from .test_data_gen_flag import GEN_TEST_DATA
 from .verify_utils import verify_document, verify_export
@@ -19,6 +24,68 @@ from .verify_utils import verify_document, verify_export
 GENERATE = GEN_TEST_DATA


+def test_html_backend_options():
+    options = HTMLBackendOptions()
+    assert options.kind == "html"
+    assert not options.fetch_images
+    assert options.source_uri is None
+
+    url = "http://example.com"
+    source_location = AnyUrl(url=url)
+    options = HTMLBackendOptions(source_uri=source_location)
+    assert options.source_uri == source_location
+
+    source_location = PurePath("/local/path/to/file.html")
+    options = HTMLBackendOptions(source_uri=source_location)
+    assert options.source_uri == source_location
+
+    with pytest.raises(ValidationError, match="Input is not a valid path"):
+        HTMLBackendOptions(source_uri=12345)
+
+
+def test_resolve_relative_path():
+    html_path = Path("./tests/data/html/example_01.html")
+    in_doc = InputDocument(
+        path_or_stream=html_path,
+        format=InputFormat.HTML,
+        backend=HTMLDocumentBackend,
+        filename="test",
+    )
+    html_doc = HTMLDocumentBackend(path_or_stream=html_path, in_doc=in_doc)
+    html_doc.base_path = "/local/path/to/file.html"
+
+    relative_path = "subdir/another.html"
+    expected_abs_loc = "/local/path/to/subdir/another.html"
+    assert html_doc._resolve_relative_path(relative_path) == expected_abs_loc
+
+    absolute_path = "/absolute/path/to/file.html"
+    assert html_doc._resolve_relative_path(absolute_path) == absolute_path
+
+    html_doc.base_path = "http://my_host.com"
+    protocol_relative_url = "//example.com/file.html"
+    expected_abs_loc = "https://example.com/file.html"
+    assert html_doc._resolve_relative_path(protocol_relative_url) == expected_abs_loc
+
+    html_doc.base_path = "http://example.com"
+    remote_relative_path = "subdir/file.html"
+    expected_abs_loc = "http://example.com/subdir/file.html"
+    assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc
+
+    html_doc.base_path = "http://example.com"
+    remote_relative_path = "https://my_host.com/my_page.html"
+    expected_abs_loc = "https://my_host.com/my_page.html"
+    assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc
+
+    html_doc.base_path = "http://example.com"
+    remote_relative_path = "/static/images/my_image.png"
+    expected_abs_loc = "http://example.com/static/images/my_image.png"
+    assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc
+
+    html_doc.base_path = None
+    relative_path = "subdir/file.html"
+    assert html_doc._resolve_relative_path(relative_path) == relative_path
+
+
 def test_heading_levels():
    in_path = Path("tests/data/html/wiki_duck.html")
    in_doc = InputDocument(
@@ -158,8 +225,6 @@ def test_e2e_html_conversions():
    converter = get_converter()

    for html_path in html_paths:
-        # print(f"converting {html_path}")
-
        gt_path = (
            html_path.parent.parent / "groundtruth" / "docling_v2" / html_path.name
        )
@@ -183,6 +248,76 @@ def test_e2e_html_conversions():
        assert verify_document(doc, str(gt_path) + ".json", GENERATE)


+@patch("docling.backend.html_backend.requests.get")
+@patch("docling.backend.html_backend.open", new_callable=mock_open)
+def test_e2e_html_conversion_with_images(mock_local, mock_remote):
+    source = "tests/data/html/example_01.html"
+    image_path = "tests/data/html/example_image_01.png"
+    with open(image_path, "rb") as f:
+        img_bytes = f.read()
+
+    # fetching image locally
+    mock_local.return_value.__enter__.return_value = BytesIO(img_bytes)
+    backend_options = HTMLBackendOptions(
+        enable_local_fetch=True, fetch_images=True, source_uri=source
+    )
+    converter = DocumentConverter(
+        allowed_formats=[InputFormat.HTML],
+        format_options={
+            InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
+        },
+    )
+    res_local = converter.convert(source)
+    mock_local.assert_called_once()
+    assert res_local.document
+    num_pic: int = 0
+    for element, _ in res_local.document.iterate_items():
+        if isinstance(element, PictureItem):
+            assert element.image
+            num_pic += 1
+    assert num_pic == 1, "No embedded picture was found in the converted file"
+
+    # fetching image remotely
+    mock_resp = Mock()
+    mock_resp.status_code = 200
+    mock_resp.content = img_bytes
+    mock_remote.return_value = mock_resp
+    source_location = "https://example.com/example_01.html"
+
+    backend_options = HTMLBackendOptions(
+        enable_remote_fetch=True, fetch_images=True, source_uri=source_location
+    )
+    converter = DocumentConverter(
+        allowed_formats=[InputFormat.HTML],
+        format_options={
+            InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
+        },
+    )
+    res_remote = converter.convert(source)
+    mock_remote.assert_called_once_with(
+        "https://example.com/example_image_01.png", stream=True
+    )
+    assert res_remote.document
+    num_pic = 0
+    for element, _ in res_remote.document.iterate_items():
+        if isinstance(element, PictureItem):
+            assert element.image
+            assert element.image.mimetype == "image/png"
+            num_pic += 1
+    assert num_pic == 1, "No embedded picture was found in the converted file"
+
+    # both methods should generate the same DoclingDocument
+    assert res_remote.document == res_local.document
+
+    # checking exported formats
+    gt_path = (
+        "tests/data/groundtruth/docling_v2/" + str(Path(source).stem) + "_images.html"
+    )
+    pred_md: str = res_local.document.export_to_markdown()
+    assert verify_export(pred_md, gt_path + ".md", generate=GENERATE)
+    assert verify_document(res_local.document, gt_path + ".json", GENERATE)
+
+
 def test_html_furniture():
    raw_html = (
        b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
@@ -211,3 +346,98 @@ def test_html_furniture():
        "Initial content with some **bold text**\n\n# Main Heading\n\nSome Content\n\n"
        "Some Footer Content"
    )
+
+
+def test_fetch_remote_images(monkeypatch):
+    source = "./tests/data/html/example_01.html"
+
+    # no image fetching: the fetch_images flag is False
+    backend_options = HTMLBackendOptions(
+        fetch_images=False, source_uri="http://example.com"
+    )
+    converter = DocumentConverter(
+        allowed_formats=[InputFormat.HTML],
+        format_options={
+            InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
+        },
+    )
+    with patch("docling.backend.html_backend.requests.get") as mocked_get:
+        res = converter.convert(source)
+        mocked_get.assert_not_called()
+    assert res.document
+
+    # no image fetching: the source location is not set and enable_local_fetch is False
+    backend_options = HTMLBackendOptions(fetch_images=True)
+    converter = DocumentConverter(
+        allowed_formats=[InputFormat.HTML],
+        format_options={
+            InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
+        },
+    )
+    with (
+        patch("docling.backend.html_backend.requests.get") as mocked_get,
+        pytest.warns(
+            match="Fetching local resources is only allowed when set explicitly"
+        ),
+    ):
+        res = converter.convert(source)
+        mocked_get.assert_not_called()
+    assert res.document
+
+    # no image fetching: the enable_remote_fetch flag is False
+    backend_options = HTMLBackendOptions(
+        fetch_images=True, source_uri="http://example.com"
+    )
+    converter = DocumentConverter(
+        allowed_formats=[InputFormat.HTML],
+        format_options={
+            InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
+        },
+    )
+    with (
+        patch("docling.backend.html_backend.requests.get") as mocked_get,
+        pytest.warns(
+            match="Fetching remote resources is only allowed when set explicitly"
+        ),
+    ):
+        res = converter.convert(source)
+        mocked_get.assert_not_called()
+    assert res.document
+
+    # image fetching: all conditions apply, source location is remote
+    backend_options = HTMLBackendOptions(
+        enable_remote_fetch=True, fetch_images=True, source_uri="http://example.com"
+    )
+    converter = DocumentConverter(
+        allowed_formats=[InputFormat.HTML],
+        format_options={
+            InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
+        },
+    )
+    with (
+        patch("docling.backend.html_backend.requests.get") as mocked_get,
+        pytest.warns(match="a bytes-like object is required"),
+    ):
+        res = converter.convert(source)
+        mocked_get.assert_called_once()
+    assert res.document
+
+    # image fetching: all conditions apply, local fetching allowed
+    backend_options = HTMLBackendOptions(
+        enable_local_fetch=True, fetch_images=True, source_uri=source
+    )
+    converter = DocumentConverter(
+        allowed_formats=[InputFormat.HTML],
+        format_options={
+            InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
+        },
+    )
+    with (
+        patch("docling.backend.html_backend.open") as mocked_open,
+        pytest.warns(match="a bytes-like object is required"),
+    ):
+        res = converter.convert(source)
+        mocked_open.assert_called_once_with(
+            "tests/data/html/example_image_01.png", "rb"
+        )
+        assert res.document
--- a/tests/test_backend_markdown.py
+++ b/tests/test_backend_markdown.py
@@ -6,13 +6,12 @@ from docling.datamodel.document import (
    ConversionResult,
    DoclingDocument,
    InputDocument,
-    SectionHeaderItem,
 )
 from docling.document_converter import DocumentConverter
 from tests.verify_utils import CONFID_PREC, COORD_PREC

 from .test_data_gen_flag import GEN_TEST_DATA
-from .verify_utils import verify_document, verify_export
+from .verify_utils import verify_document

 GENERATE = GEN_TEST_DATA

--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -1,10 +1,19 @@
 from io import BytesIO
 from pathlib import Path

+import pytest
+from pydantic import ValidationError
+
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.backend_options import (
+    BaseBackendOptions,
+    DeclarativeBackendOptions,
+    HTMLBackendOptions,
+)
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.document import InputDocument, _DocumentConversionInput
 from docling.datamodel.settings import DocumentLimits
@@ -15,6 +24,7 @@ def test_in_doc_from_valid_path():
    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    doc = _make_input_doc(test_doc_path)
    assert doc.valid is True
+    assert doc.backend_options is None


 def test_in_doc_from_invalid_path():
@@ -105,6 +115,38 @@ def test_in_doc_with_page_range():
    assert doc.valid is False


+def test_in_doc_with_backend_options():
+    test_doc_path = Path("./tests/data/html/example_01.html")
+    doc = InputDocument(
+        path_or_stream=test_doc_path,
+        format=InputFormat.HTML,
+        backend=HTMLDocumentBackend,
+        backend_options=HTMLBackendOptions(),
+    )
+    assert doc.valid
+    assert doc.backend_options
+    assert isinstance(doc.backend_options, HTMLBackendOptions)
+    assert not doc.backend_options.fetch_images
+    assert not doc.backend_options.enable_local_fetch
+    assert not doc.backend_options.enable_remote_fetch
+
+    with pytest.raises(ValueError, match="Incompatible types"):
+        doc = InputDocument(
+            path_or_stream=test_doc_path,
+            format=InputFormat.HTML,
+            backend=HTMLDocumentBackend,
+            backend_options=DeclarativeBackendOptions(),
+        )
+
+    with pytest.raises(ValidationError):
+        doc = InputDocument(
+            path_or_stream=test_doc_path,
+            format=InputFormat.HTML,
+            backend=HTMLDocumentBackend,
+            backend_options=BaseBackendOptions(),
+        )
+
+
 def test_guess_format(tmp_path):
    """Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
    dci = _DocumentConversionInput(path_or_stream_iterator=[])