Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-31 14:34:40 +00:00 · 2025-02-19 10:29:39 +01:00 · 2025-02-19 10:29:39 +01:00 · d788bf2a6e
commit d788bf2a6e
parent 8606b598dc 7450050ace
8 changed files with 327 additions and 424 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -1,9 +1,9 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Optional, Set, Union
+from typing import Optional, Union, cast

-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
 from docling_core.types.doc import (
    DocItemLabel,
    DoclingDocument,
@ -12,6 +12,7 @@ from docling_core.types.doc import (
    TableCell,
    TableData,
 )
+from typing_extensions import override

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@ -21,6 +22,7 @@ _log = logging.getLogger(__name__)


 class HTMLDocumentBackend(DeclarativeDocumentBackend):
+    @override
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)
        _log.debug("About to init HTML backend...")
@ -48,13 +50,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                f"Could not initialize HTML backend for file with hash {self.document_hash}."
            ) from e

+    @override
    def is_valid(self) -> bool:
        return self.soup is not None

    @classmethod
+    @override
    def supports_pagination(cls) -> bool:
        return False

+    @override
    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
@ -62,9 +67,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        self.path_or_stream = None

    @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.HTML}

+    @override
    def convert(self) -> DoclingDocument:
        # access self.path_or_stream to load stuff
        origin = DocumentOrigin(
@ -80,98 +87,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            assert self.soup is not None
            content = self.soup.body or self.soup
            # Replace <br> tags with newline characters
-            for br in content.find_all("br"):
-                br.replace_with("\n")
-            doc = self.walk(content, doc)
+            for br in content("br"):
+                br.replace_with(NavigableString("\n"))
+            self.walk(content, doc)
        else:
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )
        return doc

-    def walk(self, element: Tag, doc: DoclingDocument):
-        try:
-            # Iterate over elements in the body of the document
-            for idx, element in enumerate(element.children):
+    def walk(self, tag: Tag, doc: DoclingDocument) -> None:
+        # Iterate over elements in the body of the document
+        for element in tag.children:
+            if isinstance(element, Tag):
                try:
-                    self.analyse_element(element, idx, doc)
+                    self.analyze_tag(cast(Tag, element), doc)
                except Exception as exc_child:
-
-                    _log.error(" -> error treating child: ", exc_child)
-                    _log.error(" => element: ", element, "\n")
+                    _log.error(
+                        f"Error processing child from tag {tag.name}: {exc_child}"
+                    )
                    raise exc_child

-        except Exception as exc:
-            pass
+        return

-        return doc
-
-    def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
-        """
-        if element.name!=None:
-            _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
-        """
-
-        if element.name in self.labels:
-            self.labels[element.name] += 1
+    def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
+        if tag.name in self.labels:
+            self.labels[tag.name] += 1
        else:
-            self.labels[element.name] = 1
+            self.labels[tag.name] = 1

-        if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
-            self.handle_header(element, idx, doc)
-        elif element.name in ["p"]:
-            self.handle_paragraph(element, idx, doc)
-        elif element.name in ["pre"]:
-            self.handle_code(element, idx, doc)
-        elif element.name in ["ul", "ol"]:
-            self.handle_list(element, idx, doc)
-        elif element.name in ["li"]:
-            self.handle_listitem(element, idx, doc)
-        elif element.name == "table":
-            self.handle_table(element, idx, doc)
-        elif element.name == "figure":
-            self.handle_figure(element, idx, doc)
-        elif element.name == "img":
-            self.handle_image(element, idx, doc)
+        if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+            self.handle_header(tag, doc)
+        elif tag.name in ["p"]:
+            self.handle_paragraph(tag, doc)
+        elif tag.name in ["pre"]:
+            self.handle_code(tag, doc)
+        elif tag.name in ["ul", "ol"]:
+            self.handle_list(tag, doc)
+        elif tag.name in ["li"]:
+            self.handle_list_item(tag, doc)
+        elif tag.name == "table":
+            self.handle_table(tag, doc)
+        elif tag.name == "figure":
+            self.handle_figure(tag, doc)
+        elif tag.name == "img":
+            self.handle_image(doc)
        else:
-            self.walk(element, doc)
+            self.walk(tag, doc)

-    def get_direct_text(self, item: Tag):
-        """Get the direct text of the <li> element (ignoring nested lists)."""
-        text = item.find(string=True, recursive=False)
-        if isinstance(text, str):
-            return text.strip()
+    def get_text(self, item: PageElement) -> str:
+        """Get the text content of a tag."""
+        parts: list[str] = self.extract_text_recursively(item)

-        return ""
+        return "".join(parts) + " "

    # Function to recursively extract text from all child nodes
-    def extract_text_recursively(self, item: Tag):
-        result = []
+    def extract_text_recursively(self, item: PageElement) -> list[str]:
+        result: list[str] = []

-        if isinstance(item, str):
+        if isinstance(item, NavigableString):
            return [item]

-        if item.name not in ["ul", "ol"]:
-            try:
-                # Iterate over the children (and their text and tails)
-                for child in item:
-                    try:
-                        # Recursively get the child's text content
-                        result.extend(self.extract_text_recursively(child))
-                    except:
-                        pass
-            except:
-                _log.warn("item has no children")
-                pass
+        tag = cast(Tag, item)
+        if tag.name not in ["ul", "ol"]:
+            for child in tag:
+                # Recursively get the child's text content
+                result.extend(self.extract_text_recursively(child))

-        return "".join(result) + " "
+        return ["".join(result) + " "]

-    def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles header tags (h1, h2, etc.)."""
        hlevel = int(element.name.replace("h", ""))
-        slevel = hlevel - 1
-
-        label = DocItemLabel.SECTION_HEADER
        text = element.text.strip()

        if hlevel == 1:
@ -197,7 +184,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            elif hlevel < self.level:

                # remove the tail
-                for key, val in self.parents.items():
+                for key in self.parents.keys():
                    if key > hlevel:
                        self.parents[key] = None
                self.level = hlevel
@ -208,27 +195,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                level=hlevel,
            )

-    def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles monospace code snippets (pre)."""
        if element.text is None:
            return
        text = element.text.strip()
-        label = DocItemLabel.CODE
-        if len(text) == 0:
-            return
-        doc.add_code(parent=self.parents[self.level], text=text)
+        if text:
+            doc.add_code(parent=self.parents[self.level], text=text)

-    def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles paragraph tags (p)."""
        if element.text is None:
            return
        text = element.text.strip()
        label = DocItemLabel.PARAGRAPH
-        if len(text) == 0:
-            return
-        doc.add_text(parent=self.parents[self.level], label=label, text=text)
+        if text:
+            doc.add_text(parent=self.parents[self.level], label=label, text=text)

-    def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles list tags (ul, ol) and their list items."""

        if element.name == "ul":
@ -250,18 +234,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        self.parents[self.level + 1] = None
        self.level -= 1

-    def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles listitem tags (li)."""
-        nested_lists = element.find(["ul", "ol"])
+        nested_list = element.find(["ul", "ol"])

        parent_list_label = self.parents[self.level].label
        index_in_list = len(self.parents[self.level].children) + 1

-        if nested_lists:
-            name = element.name
+        if nested_list:
            # Text in list item can be hidden within hierarchy, hence
            # we need to extract it recursively
-            text = self.extract_text_recursively(element)
+            text: str = self.get_text(element)
            # Flatten text, remove break lines:
            text = text.replace("\n", "").replace("\r", "")
            text = " ".join(text.split()).strip()
@ -287,7 +270,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            self.parents[self.level + 1] = None
            self.level -= 1

-        elif isinstance(element.text, str):
+        elif element.text.strip():
            text = element.text.strip()

            marker = ""
@ -302,59 +285,79 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                parent=self.parents[self.level],
            )
        else:
-            _log.warn("list-item has no text: ", element)
-
-    def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
-        """Handles table tags."""
+            _log.warning(f"list-item has no text: {element}")

+    @staticmethod
+    def parse_table_data(element: Tag) -> Optional[TableData]:
        nested_tables = element.find("table")
        if nested_tables is not None:
-            _log.warn("detected nested tables: skipping for now")
-            return
+            _log.warning("Skipping nested table.")
+            return None

        # Count the number of rows (number of <tr> elements)
-        num_rows = len(element.find_all("tr"))
+        num_rows = len(element("tr"))

        # Find the number of columns (taking into account colspan)
        num_cols = 0
-        for row in element.find_all("tr"):
+        for row in element("tr"):
            col_count = 0
-            for cell in row.find_all(["td", "th"]):
-                colspan = int(cell.get("colspan", 1))
+            if not isinstance(row, Tag):
+                continue
+            for cell in row(["td", "th"]):
+                if not isinstance(cell, Tag):
+                    continue
+                val = cast(Tag, cell).get("colspan", "1")
+                colspan = int(val) if (isinstance(val, str) and val.isdecimal()) else 1
                col_count += colspan
            num_cols = max(num_cols, col_count)

-        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+        grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
-        for row_idx, row in enumerate(element.find_all("tr")):
+        for row_idx, row in enumerate(element("tr")):
+            if not isinstance(row, Tag):
+                continue

            # For each row, find all the column cells (both <td> and <th>)
-            cells = row.find_all(["td", "th"])
+            cells = row(["td", "th"])

            # Check if each cell in the row is a header -> means it is a column header
            col_header = True
-            for j, html_cell in enumerate(cells):
-                if html_cell.name == "td":
+            for html_cell in cells:
+                if isinstance(html_cell, Tag) and html_cell.name == "td":
                    col_header = False

+            # Extract the text content of each cell
            col_idx = 0
-            # Extract and print the text content of each cell
-            for _, html_cell in enumerate(cells):
+            for html_cell in cells:
+                if not isinstance(html_cell, Tag):
+                    continue

+                # extract inline formulas
+                for formula in html_cell("inline-formula"):
+                    math_parts = formula.text.split("$$")
+                    if len(math_parts) == 3:
+                        math_formula = f"$${math_parts[1]}$$"
+                        formula.replace_with(NavigableString(math_formula))
+
+                # TODO: extract content correctly from table-cells with lists
                text = html_cell.text
-                try:
-                    text = self.extract_table_cell_text(html_cell)
-                except Exception as exc:
-                    _log.warn("exception: ", exc)
-                    exit(-1)

                # label = html_cell.name
-
-                col_span = int(html_cell.get("colspan", 1))
-                row_span = int(html_cell.get("rowspan", 1))
+                col_val = html_cell.get("colspan", "1")
+                col_span = (
+                    int(col_val)
+                    if isinstance(col_val, str) and col_val.isdecimal()
+                    else 1
+                )
+                row_val = html_cell.get("rowspan", "1")
+                row_span = (
+                    int(row_val)
+                    if isinstance(row_val, str) and row_val.isdecimal()
+                    else 1
+                )

                while grid[row_idx][col_idx] is not None:
                    col_idx += 1
@ -362,7 +365,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    for c in range(col_span):
                        grid[row_idx + r][col_idx + c] = text

-                cell = TableCell(
+                table_cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
@ -373,57 +376,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
-                data.table_cells.append(cell)
+                data.table_cells.append(table_cell)

-        doc.add_table(data=data, parent=self.parents[self.level])
+        return data

-    def get_list_text(self, list_element: Tag, level=0):
+    def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
+        """Handles table tags."""
+
+        table_data = HTMLDocumentBackend.parse_table_data(element)
+
+        if table_data is not None:
+            doc.add_table(data=table_data, parent=self.parents[self.level])
+
+    def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
        """Recursively extract text from <ul> or <ol> with proper indentation."""
        result = []
        bullet_char = "*"  # Default bullet character for unordered lists

        if list_element.name == "ol":  # For ordered lists, use numbers
-            for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
+            for i, li in enumerate(list_element("li", recursive=False), 1):
+                if not isinstance(li, Tag):
+                    continue
                # Add numbering for ordered lists
                result.append(f"{'    ' * level}{i}. {li.get_text(strip=True)}")
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
-                if nested_list:
+                if isinstance(nested_list, Tag):
                    result.extend(self.get_list_text(nested_list, level + 1))
        elif list_element.name == "ul":  # For unordered lists, use bullet points
-            for li in list_element.find_all("li", recursive=False):
+            for li in list_element("li", recursive=False):
+                if not isinstance(li, Tag):
+                    continue
                # Add bullet points for unordered lists
                result.append(
                    f"{'    ' * level}{bullet_char} {li.get_text(strip=True)}"
                )
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
-                if nested_list:
+                if isinstance(nested_list, Tag):
                    result.extend(self.get_list_text(nested_list, level + 1))

        return result

-    def extract_table_cell_text(self, cell: Tag):
-        """Extract text from a table cell, including lists with indents."""
-        contains_lists = cell.find(["ul", "ol"])
-        if contains_lists is None:
-            return cell.text
-        else:
-            _log.debug(
-                "should extract the content correctly for table-cells with lists ..."
-            )
-            return cell.text
-
-    def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles image tags (img)."""

        # Extract the image URI from the <img> tag
        # image_uri = root.xpath('//figure//img/@src')[0]

        contains_captions = element.find(["figcaption"])
-        if contains_captions is None:
+        if not isinstance(contains_captions, Tag):
            doc.add_picture(parent=self.parents[self.level], caption=None)
-
        else:
            texts = []
            for item in contains_captions:
@ -437,6 +440,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                caption=fig_caption,
            )

-    def handle_image(self, element: Tag, idx, doc: DoclingDocument):
+    def handle_image(self, doc: DoclingDocument) -> None:
        """Handles image tags (img)."""
        doc.add_picture(parent=self.parents[self.level], caption=None)
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@ -4,7 +4,7 @@ from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union

-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 from docling_core.types.doc import (
    DocItemLabel,
    DoclingDocument,
@ -12,14 +12,13 @@ from docling_core.types.doc import (
    GroupItem,
    GroupLabel,
    NodeItem,
-    TableCell,
-    TableData,
    TextItem,
 )
 from lxml import etree
 from typing_extensions import TypedDict, override

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
    ) -> None:
        soup = BeautifulSoup(table_xml_component["content"], "html.parser")
        table_tag = soup.find("table")
-
-        nested_tables = table_tag.find("table")
-        if nested_tables:
-            _log.warning(f"Skipping nested table in {str(self.file)}")
+        if not isinstance(table_tag, Tag):
            return

-        # Count the number of rows (number of <tr> elements)
-        num_rows = len(table_tag.find_all("tr"))
-
-        # Find the number of columns (taking into account colspan)
-        num_cols = 0
-        for row in table_tag.find_all("tr"):
-            col_count = 0
-            for cell in row.find_all(["td", "th"]):
-                colspan = int(cell.get("colspan", 1))
-                col_count += colspan
-            num_cols = max(num_cols, col_count)
-
-        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
-
-        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
-
-        # Iterate over the rows in the table
-        for row_idx, row in enumerate(table_tag.find_all("tr")):
-            # For each row, find all the column cells (both <td> and <th>)
-            cells = row.find_all(["td", "th"])
-
-            # Check if each cell in the row is a header -> means it is a column header
-            col_header = True
-            for j, html_cell in enumerate(cells):
-                if html_cell.name == "td":
-                    col_header = False
-
-            # Extract and print the text content of each cell
-            col_idx = 0
-            for _, html_cell in enumerate(cells):
-                # extract inline formulas
-                for formula in html_cell.find_all("inline-formula"):
-                    math_parts = formula.text.split("$$")
-                    if len(math_parts) == 3:
-                        math_formula = f"$${math_parts[1]}$$"
-                        formula.replaceWith(math_formula)
-                text = html_cell.text
-
-                col_span = int(html_cell.get("colspan", 1))
-                row_span = int(html_cell.get("rowspan", 1))
-
-                while grid[row_idx][col_idx] is not None:
-                    col_idx += 1
-                for r in range(row_span):
-                    for c in range(col_span):
-                        grid[row_idx + r][col_idx + c] = text
-
-                cell = TableCell(
-                    text=text,
-                    row_span=row_span,
-                    col_span=col_span,
-                    start_row_offset_idx=row_idx,
-                    end_row_offset_idx=row_idx + row_span,
-                    start_col_offset_idx=col_idx,
-                    end_col_offset_idx=col_idx + col_span,
-                    col_header=col_header,
-                    row_header=((not col_header) and html_cell.name == "th"),
-                )
-                data.table_cells.append(cell)
+        data = HTMLDocumentBackend.parse_table_data(table_tag)

        # TODO: format label vs caption once styling is supported
        label = table_xml_component["label"]
@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
            else None
        )

-        doc.add_table(data=data, parent=parent, caption=table_caption)
+        if data is not None:
+            doc.add_table(data=data, parent=parent, caption=table_caption)

        return

@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
    def _walk_linear(
        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
    ) -> str:
-        # _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
        skip_tags = ["term"]
        flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
        new_parent: NodeItem = parent
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
 from enum import Enum, unique
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Final, Optional, Union
+from typing import Final, Optional, Union

 from bs4 import BeautifulSoup, Tag
 from docling_core.types.doc import (
@ -1406,6 +1406,10 @@ class XmlTable:
    http://oasis-open.org/specs/soextblx.dtd
    """

+    class ColInfo(TypedDict):
+        ncols: int
+        colinfo: list[dict]
+
    class MinColInfoType(TypedDict):
        offset: list[int]
        colwidth: list[int]
@ -1425,7 +1429,7 @@ class XmlTable:
        self.empty_text = ""
        self._soup = BeautifulSoup(input, features="xml")

-    def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
+    def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
        """Create a unified range along the table groups.

        Args:
@ -1532,19 +1536,26 @@ class XmlTable:
        Returns:
            A docling table object.
        """
-        tgs_align = []
-        tg_secs = table.find_all("tgroup")
+        tgs_align: list[XmlTable.ColInfo] = []
+        tg_secs = table("tgroup")
        if tg_secs:
            for tg_sec in tg_secs:
-                ncols = tg_sec.get("cols", None)
-                if ncols:
-                    ncols = int(ncols)
-                tg_align = {"ncols": ncols, "colinfo": []}
-                cs_secs = tg_sec.find_all("colspec")
+                if not isinstance(tg_sec, Tag):
+                    continue
+                col_val = tg_sec.get("cols")
+                ncols = (
+                    int(col_val)
+                    if isinstance(col_val, str) and col_val.isnumeric()
+                    else 1
+                )
+                tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
+                cs_secs = tg_sec("colspec")
                if cs_secs:
                    for cs_sec in cs_secs:
-                        colname = cs_sec.get("colname", None)
-                        colwidth = cs_sec.get("colwidth", None)
+                        if not isinstance(cs_sec, Tag):
+                            continue
+                        colname = cs_sec.get("colname")
+                        colwidth = cs_sec.get("colwidth")
                        tg_align["colinfo"].append(
                            {"colname": colname, "colwidth": colwidth}
                        )
@ -1565,16 +1576,23 @@ class XmlTable:
        table_data: list[TableCell] = []
        i_row_global = 0
        is_row_empty: bool = True
-        tg_secs = table.find_all("tgroup")
+        tg_secs = table("tgroup")
        if tg_secs:
            for itg, tg_sec in enumerate(tg_secs):
+                if not isinstance(tg_sec, Tag):
+                    continue
                tg_range = tgs_range[itg]
-                row_secs = tg_sec.find_all(["row", "tr"])
+                row_secs = tg_sec(["row", "tr"])

                if row_secs:
                    for row_sec in row_secs:
-                        entry_secs = row_sec.find_all(["entry", "td"])
-                        is_header: bool = row_sec.parent.name in ["thead"]
+                        if not isinstance(row_sec, Tag):
+                            continue
+                        entry_secs = row_sec(["entry", "td"])
+                        is_header: bool = (
+                            row_sec.parent is not None
+                            and row_sec.parent.name == "thead"
+                        )

                        ncols = 0
                        local_row: list[TableCell] = []
@ -1582,23 +1600,26 @@ class XmlTable:
                        if entry_secs:
                            wrong_nbr_cols = False
                            for ientry, entry_sec in enumerate(entry_secs):
+                                if not isinstance(entry_sec, Tag):
+                                    continue
                                text = entry_sec.get_text().strip()

                                # start-end
-                                namest = entry_sec.attrs.get("namest", None)
-                                nameend = entry_sec.attrs.get("nameend", None)
-                                if isinstance(namest, str) and namest.isnumeric():
-                                    namest = int(namest)
-                                else:
-                                    namest = ientry + 1
+                                namest = entry_sec.get("namest")
+                                nameend = entry_sec.get("nameend")
+                                start = (
+                                    int(namest)
+                                    if isinstance(namest, str) and namest.isnumeric()
+                                    else ientry + 1
+                                )
                                if isinstance(nameend, str) and nameend.isnumeric():
-                                    nameend = int(nameend)
+                                    end = int(nameend)
                                    shift = 0
                                else:
-                                    nameend = ientry + 2
+                                    end = ientry + 2
                                    shift = 1

-                                if nameend > len(tg_range["cell_offst"]):
+                                if end > len(tg_range["cell_offst"]):
                                    wrong_nbr_cols = True
                                    self.nbr_messages += 1
                                    if self.nbr_messages <= self.max_nbr_messages:
@ -1608,8 +1629,8 @@ class XmlTable:
                                    break

                                range_ = [
-                                    tg_range["cell_offst"][namest - 1],
-                                    tg_range["cell_offst"][nameend - 1] - shift,
+                                    tg_range["cell_offst"][start - 1],
+                                    tg_range["cell_offst"][end - 1] - shift,
                                ]

                                # add row and replicate cell if needed
@ -1668,7 +1689,7 @@ class XmlTable:
            A docling table data.
        """
        section = self._soup.find("table")
-        if section is not None:
+        if isinstance(section, Tag):
            table = self._parse_table(section)
            if table.num_rows == 0 or table.num_cols == 0:
                _log.warning("The parsed USPTO table is empty")
--- a/poetry.lock
+++ b/poetry.lock
@ -282,17 +282,18 @@ testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-ch

 [[package]]
 name = "beautifulsoup4"
-version = "4.12.3"
+version = "4.13.3"
 description = "Screen-scraping library"
 optional = false
-python-versions = ">=3.6.0"
+python-versions = ">=3.7.0"
 files = [
-    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
-    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+    {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
+    {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
 ]

 [package.dependencies]
 soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"

 [package.extras]
 cchardet = ["cchardet"]
@ -820,13 +821,13 @@ files = [

 [[package]]
 name = "docling-core"
-version = "2.19.0"
+version = "2.19.1"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_core-2.19.0-py3-none-any.whl", hash = "sha256:caa1e13d98fa9a00608091c386609c75b3560c7291e842c252f0b6f8d5812dbd"},
-    {file = "docling_core-2.19.0.tar.gz", hash = "sha256:ebf3062e31155bb5f0e6132056a2d239a0e6e693a75c5758886909bb9fef461a"},
+    {file = "docling_core-2.19.1-py3-none-any.whl", hash = "sha256:ca7bd4dacd75611c5ea4f205192b71a8f22205e615eff1a16aac7082644d3b2e"},
+    {file = "docling_core-2.19.1.tar.gz", hash = "sha256:e2769b816c669cdf27024dd3b219d3ecaf2161691dd5e8e5e8ce439557ea0928"},
 ]

 [package.dependencies]
@ -1317,13 +1318,13 @@ colorama = ">=0.4"

 [[package]]
 name = "griffe-pydantic"
-version = "1.1.0"
+version = "1.1.2"
 description = "Griffe extension for Pydantic."
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "griffe_pydantic-1.1.0-py3-none-any.whl", hash = "sha256:ac9cc2d9b016cf302d8d9f577c9b3ca2793d88060f500d0b2a65f33a4a785cf1"},
-    {file = "griffe_pydantic-1.1.0.tar.gz", hash = "sha256:9c5a701cc485dab087857c1ac960b44671acee5008aaae0752f610b2aa82b068"},
+    {file = "griffe_pydantic-1.1.2-py3-none-any.whl", hash = "sha256:8ad53218ca6e9c24ccec83588eb435f562b30355f641fe336e81b1e00ea05f3c"},
+    {file = "griffe_pydantic-1.1.2.tar.gz", hash = "sha256:381eacd8854a85811522b4f6dc9a1ef0fb5931825081379d70ff3a425b0d4ea1"},
 ]

 [package.dependencies]
@ -7021,18 +7022,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]

 [[package]]
 name = "transformers"
-version = "4.48.3"
+version = "4.49.0"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 optional = false
 python-versions = ">=3.9.0"
 files = [
-    {file = "transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36"},
-    {file = "transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb"},
+    {file = "transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03"},
+    {file = "transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e"},
 ]

 [package.dependencies]
 filelock = "*"
-huggingface-hub = ">=0.24.0,<1.0"
+huggingface-hub = ">=0.26.0,<1.0"
 numpy = ">=1.17"
 packaging = ">=20.0"
 pyyaml = ">=5.1"
@ -7045,13 +7046,13 @@ tqdm = ">=4.27"
 [package.extras]
 accelerate = ["accelerate (>=0.26.0)"]
 agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"]
-all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
+all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
 audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 benchmark = ["optimum-benchmark (>=0.3.0)"]
 codecarbon = ["codecarbon (>=2.8.1)"]
 deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
 deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
-dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
 dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
 dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
 flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
@ -7084,8 +7085,8 @@ tokenizers = ["tokenizers (>=0.21,<0.22)"]
 torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"]
 torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
 torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
-torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
-video = ["av (==9.2.0)"]
+torchhub = ["filelock", "huggingface-hub (>=0.26.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
+video = ["av"]
 vision = ["Pillow (>=10.0.1,<=15.0)"]

 [[package]]
@ -7810,4 +7811,4 @@ vlm = ["transformers", "transformers"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "b19c39233b5c7ca2a4feed4886542395492ed43f4957f9c6f097b03e8d5b6148"
+content-hash = "3f657e7af78058e75dfb9f32e373f7f70e5e68a42a5b3603189e2251be90f349"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -45,7 +45,7 @@ scipy = [
 typer = "^0.12.5"
 python-docx = "^1.1.2"
 python-pptx = "^1.0.2"
-beautifulsoup4 = ">=4.12.3,<4.13.0"
+beautifulsoup4 = "^4.12.3"
 pandas = "^2.1.4"
 marko = "^2.1.2"
 openpyxl = "^3.1.5"
@ -164,7 +164,6 @@ module = [
    "easyocr.*",
    "ocrmac.*",
    "lxml.*",
-    "bs4.*",
    "huggingface_hub.*",
    "transformers.*",
 ]
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
@ -410,68 +410,65 @@ item-0 at level 0: unspecified: group _root_
      item-396 at level 3: list: group list
        item-397 at level 4: list_item: list of books (useful looking abstracts)
        item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
-        item-399 at level 4: list_item: 
-        item-400 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
-      item-401 at level 3: table with [3x2]
-      item-402 at level 3: picture
-      item-403 at level 3: list: group list
-        item-404 at level 4: list_item: Ducks
-        item-405 at level 4: list_item: Game birds
-        item-406 at level 4: list_item: Bird common names
-      item-407 at level 3: list: group list
-        item-408 at level 4: list_item: All accuracy disputes
-        item-409 at level 4: list_item: Accuracy disputes from February 2020
-        item-410 at level 4: list_item: CS1 Finnish-language sources (fi)
-        item-411 at level 4: list_item: CS1 Latvian-language sources (lv)
-        item-412 at level 4: list_item: CS1 Swedish-language sources (sv)
-        item-413 at level 4: list_item: Articles with short description
-        item-414 at level 4: list_item: Short description is different from Wikidata
-        item-415 at level 4: list_item: Wikipedia indefinitely move-protected pages
-        item-416 at level 4: list_item: Wikipedia indefinitely semi-protected pages
-        item-417 at level 4: list_item: Articles with 'species' microformats
-        item-418 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
-        item-419 at level 4: list_item: Articles containing Dutch-language text
-        item-420 at level 4: list_item: Articles containing German-language text
-        item-421 at level 4: list_item: Articles containing Norwegian-language text
-        item-422 at level 4: list_item: Articles containing Lithuanian-language text
-        item-423 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
-        item-424 at level 4: list_item: All articles with self-published sources
-        item-425 at level 4: list_item: Articles with self-published sources from February 2020
-        item-426 at level 4: list_item: All articles with unsourced statements
-        item-427 at level 4: list_item: Articles with unsourced statements from January 2022
-        item-428 at level 4: list_item: CS1: long volume value
-        item-429 at level 4: list_item: Pages using Sister project links with wikidata mismatch
-        item-430 at level 4: list_item: Pages using Sister project links with hidden wikidata
-        item-431 at level 4: list_item: Webarchive template wayback links
-        item-432 at level 4: list_item: Articles with Project Gutenberg links
-        item-433 at level 4: list_item: Articles containing video clips
-      item-434 at level 3: list: group list
-        item-435 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
-        item-436 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
-      item-437 at level 3: list: group list
-        item-438 at level 4: list_item: Privacy policy
-        item-439 at level 4: list_item: About Wikipedia
-        item-440 at level 4: list_item: Disclaimers
-        item-441 at level 4: list_item: Contact Wikipedia
-        item-442 at level 4: list_item: Code of Conduct
-        item-443 at level 4: list_item: Developers
-        item-444 at level 4: list_item: Statistics
-        item-445 at level 4: list_item: Cookie statement
-        item-446 at level 4: list_item: Mobile view
+        item-399 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
+      item-400 at level 3: table with [3x2]
+      item-401 at level 3: picture
+      item-402 at level 3: list: group list
+        item-403 at level 4: list_item: Ducks
+        item-404 at level 4: list_item: Game birds
+        item-405 at level 4: list_item: Bird common names
+      item-406 at level 3: list: group list
+        item-407 at level 4: list_item: All accuracy disputes
+        item-408 at level 4: list_item: Accuracy disputes from February 2020
+        item-409 at level 4: list_item: CS1 Finnish-language sources (fi)
+        item-410 at level 4: list_item: CS1 Latvian-language sources (lv)
+        item-411 at level 4: list_item: CS1 Swedish-language sources (sv)
+        item-412 at level 4: list_item: Articles with short description
+        item-413 at level 4: list_item: Short description is different from Wikidata
+        item-414 at level 4: list_item: Wikipedia indefinitely move-protected pages
+        item-415 at level 4: list_item: Wikipedia indefinitely semi-protected pages
+        item-416 at level 4: list_item: Articles with 'species' microformats
+        item-417 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
+        item-418 at level 4: list_item: Articles containing Dutch-language text
+        item-419 at level 4: list_item: Articles containing German-language text
+        item-420 at level 4: list_item: Articles containing Norwegian-language text
+        item-421 at level 4: list_item: Articles containing Lithuanian-language text
+        item-422 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
+        item-423 at level 4: list_item: All articles with self-published sources
+        item-424 at level 4: list_item: Articles with self-published sources from February 2020
+        item-425 at level 4: list_item: All articles with unsourced statements
+        item-426 at level 4: list_item: Articles with unsourced statements from January 2022
+        item-427 at level 4: list_item: CS1: long volume value
+        item-428 at level 4: list_item: Pages using Sister project links with wikidata mismatch
+        item-429 at level 4: list_item: Pages using Sister project links with hidden wikidata
+        item-430 at level 4: list_item: Webarchive template wayback links
+        item-431 at level 4: list_item: Articles with Project Gutenberg links
+        item-432 at level 4: list_item: Articles containing video clips
+      item-433 at level 3: list: group list
+        item-434 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
+        item-435 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
+      item-436 at level 3: list: group list
+        item-437 at level 4: list_item: Privacy policy
+        item-438 at level 4: list_item: About Wikipedia
+        item-439 at level 4: list_item: Disclaimers
+        item-440 at level 4: list_item: Contact Wikipedia
+        item-441 at level 4: list_item: Code of Conduct
+        item-442 at level 4: list_item: Developers
+        item-443 at level 4: list_item: Statistics
+        item-444 at level 4: list_item: Cookie statement
+        item-445 at level 4: list_item: Mobile view
+      item-446 at level 3: list: group list
      item-447 at level 3: list: group list
-        item-448 at level 4: list_item: 
-        item-449 at level 4: list_item: 
-      item-450 at level 3: list: group list
-  item-451 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
-  item-452 at level 1: caption: Male mallard.
-  item-453 at level 1: caption: Wood ducks.
-  item-454 at level 1: caption: Mallard landing in approach
-  item-455 at level 1: caption: Male Mandarin duck
-  item-456 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
-  item-457 at level 1: caption: Female mallard in Cornwall, England
-  item-458 at level 1: caption: Pecten along the bill
-  item-459 at level 1: caption: Mallard duckling preening
-  item-460 at level 1: caption: A Muscovy duckling
-  item-461 at level 1: caption: Ringed teal
-  item-462 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
-  item-463 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
+  item-448 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
+  item-449 at level 1: caption: Male mallard.
+  item-450 at level 1: caption: Wood ducks.
+  item-451 at level 1: caption: Mallard landing in approach
+  item-452 at level 1: caption: Male Mandarin duck
+  item-453 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
+  item-454 at level 1: caption: Female mallard in Cornwall, England
+  item-455 at level 1: caption: Pecten along the bill
+  item-456 at level 1: caption: Mallard duckling preening
+  item-457 at level 1: caption: A Muscovy duckling
+  item-458 at level 1: caption: Ringed teal
+  item-459 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
+  item-460 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
@ -1413,9 +1413,6 @@
        },
        {
          "$ref": "#/texts/350"
-        },
-        {
-          "$ref": "#/texts/351"
        }
      ],
      "content_layer": "body",
@ -1428,14 +1425,14 @@
        "$ref": "#/texts/341"
      },
      "children": [
+        {
+          "$ref": "#/texts/351"
+        },
        {
          "$ref": "#/texts/352"
        },
        {
          "$ref": "#/texts/353"
-        },
-        {
-          "$ref": "#/texts/354"
        }
      ],
      "content_layer": "body",
@ -1448,6 +1445,9 @@
        "$ref": "#/texts/341"
      },
      "children": [
+        {
+          "$ref": "#/texts/354"
+        },
        {
          "$ref": "#/texts/355"
        },
@ -1522,9 +1522,6 @@
        },
        {
          "$ref": "#/texts/379"
-        },
-        {
-          "$ref": "#/texts/380"
        }
      ],
      "content_layer": "body",
@ -1538,10 +1535,10 @@
      },
      "children": [
        {
-          "$ref": "#/texts/381"
+          "$ref": "#/texts/380"
        },
        {
-          "$ref": "#/texts/382"
+          "$ref": "#/texts/381"
        }
      ],
      "content_layer": "body",
@ -1554,6 +1551,9 @@
        "$ref": "#/texts/341"
      },
      "children": [
+        {
+          "$ref": "#/texts/382"
+        },
        {
          "$ref": "#/texts/383"
        },
@ -1577,9 +1577,6 @@
        },
        {
          "$ref": "#/texts/390"
-        },
-        {
-          "$ref": "#/texts/391"
        }
      ],
      "content_layer": "body",
@ -1591,14 +1588,7 @@
      "parent": {
        "$ref": "#/texts/341"
      },
-      "children": [
-        {
-          "$ref": "#/texts/392"
-        },
-        {
-          "$ref": "#/texts/393"
-        }
-      ],
+      "children": [],
      "content_layer": "body",
      "name": "list",
      "label": "list"
@ -6774,27 +6764,13 @@
      "content_layer": "body",
      "label": "list_item",
      "prov": [],
-      "orig": "",
-      "text": "",
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/351",
-      "parent": {
-        "$ref": "#/groups/42"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
      "orig": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
      "text": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
      "enumerated": false,
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/352",
+      "self_ref": "#/texts/351",
      "parent": {
        "$ref": "#/groups/43"
      },
@ -6808,7 +6784,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/353",
+      "self_ref": "#/texts/352",
      "parent": {
        "$ref": "#/groups/43"
      },
@ -6822,7 +6798,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/354",
+      "self_ref": "#/texts/353",
      "parent": {
        "$ref": "#/groups/43"
      },
@ -6836,7 +6812,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/355",
+      "self_ref": "#/texts/354",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6850,7 +6826,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/356",
+      "self_ref": "#/texts/355",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6864,7 +6840,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/357",
+      "self_ref": "#/texts/356",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6878,7 +6854,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/358",
+      "self_ref": "#/texts/357",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6892,7 +6868,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/359",
+      "self_ref": "#/texts/358",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6906,7 +6882,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/360",
+      "self_ref": "#/texts/359",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6920,7 +6896,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/361",
+      "self_ref": "#/texts/360",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6934,7 +6910,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/362",
+      "self_ref": "#/texts/361",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6948,7 +6924,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/363",
+      "self_ref": "#/texts/362",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6962,7 +6938,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/364",
+      "self_ref": "#/texts/363",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6976,7 +6952,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/365",
+      "self_ref": "#/texts/364",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -6990,7 +6966,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/366",
+      "self_ref": "#/texts/365",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7004,7 +6980,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/367",
+      "self_ref": "#/texts/366",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7018,7 +6994,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/368",
+      "self_ref": "#/texts/367",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7032,7 +7008,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/369",
+      "self_ref": "#/texts/368",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7046,7 +7022,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/370",
+      "self_ref": "#/texts/369",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7060,7 +7036,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/371",
+      "self_ref": "#/texts/370",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7074,7 +7050,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/372",
+      "self_ref": "#/texts/371",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7088,7 +7064,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/373",
+      "self_ref": "#/texts/372",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7102,7 +7078,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/374",
+      "self_ref": "#/texts/373",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7116,7 +7092,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/375",
+      "self_ref": "#/texts/374",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7130,7 +7106,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/376",
+      "self_ref": "#/texts/375",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7144,7 +7120,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/377",
+      "self_ref": "#/texts/376",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7158,7 +7134,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/378",
+      "self_ref": "#/texts/377",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7172,7 +7148,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/379",
+      "self_ref": "#/texts/378",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7186,7 +7162,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/380",
+      "self_ref": "#/texts/379",
      "parent": {
        "$ref": "#/groups/44"
      },
@ -7200,7 +7176,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/381",
+      "self_ref": "#/texts/380",
      "parent": {
        "$ref": "#/groups/45"
      },
@ -7214,7 +7190,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/382",
+      "self_ref": "#/texts/381",
      "parent": {
        "$ref": "#/groups/45"
      },
@ -7228,7 +7204,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/383",
+      "self_ref": "#/texts/382",
      "parent": {
        "$ref": "#/groups/46"
      },
@ -7242,7 +7218,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/384",
+      "self_ref": "#/texts/383",
      "parent": {
        "$ref": "#/groups/46"
      },
@ -7256,7 +7232,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/385",
+      "self_ref": "#/texts/384",
      "parent": {
        "$ref": "#/groups/46"
      },
@ -7270,7 +7246,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/386",
+      "self_ref": "#/texts/385",
      "parent": {
        "$ref": "#/groups/46"
      },
@ -7284,7 +7260,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/387",
+      "self_ref": "#/texts/386",
      "parent": {
        "$ref": "#/groups/46"
      },
@ -7298,7 +7274,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/388",
+      "self_ref": "#/texts/387",
      "parent": {
        "$ref": "#/groups/46"
      },
@ -7312,7 +7288,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/389",
+      "self_ref": "#/texts/388",
      "parent": {
        "$ref": "#/groups/46"
      },
@ -7326,7 +7302,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/390",
+      "self_ref": "#/texts/389",
      "parent": {
        "$ref": "#/groups/46"
      },
@ -7340,7 +7316,7 @@
      "marker": "-"
    },
    {
-      "self_ref": "#/texts/391",
+      "self_ref": "#/texts/390",
      "parent": {
        "$ref": "#/groups/46"
      },
@ -7352,34 +7328,6 @@
      "text": "Mobile view",
      "enumerated": false,
      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/392",
-      "parent": {
-        "$ref": "#/groups/47"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "",
-      "text": "",
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/393",
-      "parent": {
-        "$ref": "#/groups/47"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "",
-      "text": "",
-      "enumerated": false,
-      "marker": "-"
    }
  ],
  "pictures": [
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
@ -473,7 +473,6 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a

 - list of books (useful looking abstracts)
 - Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
- 
 - Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl

 | Authority control databases    | Authority control databases                  |
@ -526,7 +525,4 @@ additional terms may apply. By using this site, you agree to the Terms of Use an
 - Developers
 - Statistics
 - Cookie statement
- Mobile view
-
- 
-
+- Mobile view