From 413f6e8a88ee604666bbf2fde6f8b8c9fa1cbc70 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Wed, 26 Feb 2025 15:47:12 +0100 Subject: [PATCH] fix(html): use 'start' attribute in ordered lists When parsing ordered lists in HTML, take into account the 'start' attribute if it exists. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 18 ++++++++-- tests/test_backend_html.py | 58 ++++++++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 00ef05b4..93e44db6 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -256,10 +256,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[self.level], name="list", label=GroupLabel.LIST ) elif element.name == "ol": + start_attr = element.get("start") + start: int = ( + int(start_attr) + if isinstance(start_attr, str) and start_attr.isnumeric() + else 1 + ) # create a list group self.parents[self.level + 1] = doc.add_group( parent=self.parents[self.level], - name="ordered list", + name="ordered list" + (f" start {start}" if start != 1 else ""), label=GroupLabel.ORDERED_LIST, ) self.level += 1 @@ -270,7 +276,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.level -= 1 def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None: - """Handles listitem tags (li).""" + """Handles list item tags (li).""" nested_list = element.find(["ul", "ol"]) parent = self.parents[self.level] @@ -279,6 +285,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return parent_label: str = parent.label index_in_list = len(parent.children) + 1 + if ( + parent_label == GroupLabel.ORDERED_LIST + and isinstance(parent, GroupItem) + and parent.name + ): + start_in_list: str = parent.name.split(" ")[-1] + start: int = int(start_in_list) if start_in_list.isnumeric() else 1 + index_in_list += start - 1 if nested_list: # Text in list item can be hidden within hierarchy, hence diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 6c1db062..a04ae219 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -1,4 +1,4 @@ -import os +from io import BytesIO from pathlib import Path from docling.backend.html_backend import HTMLDocumentBackend @@ -41,6 +41,62 @@ def test_heading_levels(): assert found_lvl_2 and found_lvl_3 +def test_ordered_lists(): + test_set: list[tuple[bytes, str]] = [] + + test_set.append( + ( + b"