From de7b963b09a34916f0a8d99649269aeb37db1408 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Thu, 27 Feb 2025 09:46:57 +0100 Subject: [PATCH] fix(html): use 'start' attribute when parsing ordered lists from HTML docs (#1062) * fix(html): use 'start' attribute in ordered lists When parsing ordered lists in HTML, take into account the 'start' attribute if it exists. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * chore(html): reduce verbosity in HTML backend Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 24 +++++++++++--- tests/test_backend_html.py | 58 ++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 00ef05b4..f2320693 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -256,10 +256,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[self.level], name="list", label=GroupLabel.LIST ) elif element.name == "ol": + start_attr = element.get("start") + start: int = ( + int(start_attr) + if isinstance(start_attr, str) and start_attr.isnumeric() + else 1 + ) # create a list group self.parents[self.level + 1] = doc.add_group( parent=self.parents[self.level], - name="ordered list", + name="ordered list" + (f" start {start}" if start != 1 else ""), label=GroupLabel.ORDERED_LIST, ) self.level += 1 @@ -270,15 +276,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.level -= 1 def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None: - """Handles listitem tags (li).""" + """Handles list item tags (li).""" nested_list = element.find(["ul", "ol"]) parent = self.parents[self.level] if parent is None: - _log.warning(f"list-item has 
no parent in DoclingDocument: {element}") + _log.debug(f"list-item has no parent in DoclingDocument: {element}") return parent_label: str = parent.label index_in_list = len(parent.children) + 1 + if ( + parent_label == GroupLabel.ORDERED_LIST + and isinstance(parent, GroupItem) + and parent.name + ): + start_in_list: str = parent.name.split(" ")[-1] + start: int = int(start_in_list) if start_in_list.isnumeric() else 1 + index_in_list += start - 1 if nested_list: # Text in list item can be hidden within hierarchy, hence @@ -324,13 +338,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=parent, ) else: - _log.warning(f"list-item has no text: {element}") + _log.debug(f"list-item has no text: {element}") @staticmethod def parse_table_data(element: Tag) -> Optional[TableData]: nested_tables = element.find("table") if nested_tables is not None: - _log.warning("Skipping nested table.") + _log.debug("Skipping nested table.") return None # Count the number of rows (number of elements) diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 6c1db062..a04ae219 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -1,4 +1,4 @@ -import os +from io import BytesIO from pathlib import Path from docling.backend.html_backend import HTMLDocumentBackend @@ -41,6 +41,62 @@ def test_heading_levels(): assert found_lvl_2 and found_lvl_3 +def test_ordered_lists(): + test_set: list[tuple[bytes, str]] = [] + + test_set.append( + ( + b"
<ol><li>1st item</li><li>2nd item</li></ol>
", + "1. 1st item\n2. 2nd item", + ) + ) + test_set.append( + ( + b'
<ol start="1"><li>1st item</li><li>2nd item</li></ol>
', + "1. 1st item\n2. 2nd item", + ) + ) + test_set.append( + ( + b'
<ol start="2"><li>1st item</li><li>2nd item</li></ol>
', + "2. 1st item\n3. 2nd item", + ) + ) + test_set.append( + ( + b'
<ol start="0"><li>1st item</li><li>2nd item</li></ol>
', + "0. 1st item\n1. 2nd item", + ) + ) + test_set.append( + ( + b'
<ol start="-5"><li>1st item</li><li>2nd item</li></ol>
', + "1. 1st item\n2. 2nd item", + ) + ) + test_set.append( + ( + b'
<ol start="foo"><li>1st item</li><li>2nd item</li></ol>
', + "1. 1st item\n2. 2nd item", + ) + ) + + for pair in test_set: + in_doc = InputDocument( + path_or_stream=BytesIO(pair[0]), + format=InputFormat.HTML, + backend=HTMLDocumentBackend, + filename="test", + ) + backend = HTMLDocumentBackend( + in_doc=in_doc, + path_or_stream=BytesIO(pair[0]), + ) + doc: DoclingDocument = backend.convert() + assert doc + assert doc.export_to_markdown() == pair[1] + + def get_html_paths(): # Define the directory you want to search