From de7b963b09a34916f0a8d99649269aeb37db1408 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Thu, 27 Feb 2025 09:46:57 +0100 Subject: [PATCH] fix(html): use 'start' attribute when parsing ordered lists from HTML docs (#1062) * fix(html): use 'start' attribute in ordered lists When parsing ordered lists in HTML, take into account the 'start' attribute if it exists. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * chore(html): reduce verbosity in HTML backend Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 24 +++++++++++--- tests/test_backend_html.py | 58 ++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 00ef05b4..f2320693 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -256,10 +256,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[self.level], name="list", label=GroupLabel.LIST ) elif element.name == "ol": + start_attr = element.get("start") + start: int = ( + int(start_attr) + if isinstance(start_attr, str) and start_attr.isnumeric() + else 1 + ) # create a list group self.parents[self.level + 1] = doc.add_group( parent=self.parents[self.level], - name="ordered list", + name="ordered list" + (f" start {start}" if start != 1 else ""), label=GroupLabel.ORDERED_LIST, ) self.level += 1 @@ -270,15 +276,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.level -= 1 def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None: - """Handles listitem tags (li).""" + """Handles list item tags (li).""" nested_list = element.find(["ul", "ol"]) parent = self.parents[self.level] if parent is None: - _log.warning(f"list-item has no parent in DoclingDocument: {element}") + _log.debug(f"list-item has no parent in DoclingDocument: {element}") return parent_label: str = parent.label index_in_list = len(parent.children) + 1 + if ( + parent_label == GroupLabel.ORDERED_LIST + and isinstance(parent, GroupItem) + and parent.name + ): + start_in_list: str = parent.name.split(" ")[-1] + start: int = int(start_in_list) if start_in_list.isnumeric() else 1 + index_in_list += start - 1 if nested_list: # Text in list item can be hidden within hierarchy, hence @@ -324,13 +338,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=parent, ) else: - _log.warning(f"list-item has no text: {element}") + _log.debug(f"list-item has no text: {element}") @staticmethod def parse_table_data(element: Tag) -> Optional[TableData]: nested_tables = element.find("table") if nested_tables is not None: - _log.warning("Skipping nested table.") + _log.debug("Skipping nested table.") return None # Count the number of rows (number of