From 0cba30e2544c68441551e78e77017379be74bdd3 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 24 Feb 2025 09:29:06 +0100 Subject: [PATCH] reformatted code of html backend Signed-off-by: Peter Staar --- docling/backend/html_backend.py | 68 +++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 99fec909..9913ecdf 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -5,8 +5,6 @@ from typing import Final, Optional, Union, cast from bs4 import BeautifulSoup, NavigableString, PageElement, Tag from bs4.element import PreformattedString - -from docling_core.types.doc.document import ContentLayer from docling_core.types.doc import ( DocItem, DocItemLabel, @@ -17,6 +15,7 @@ from docling_core.types.doc import ( TableCell, TableData, ) +from docling_core.types.doc.document import ContentLayer from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend @@ -106,7 +105,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if self.is_valid(): self.content_layer = ContentLayer.FURNITURE - + assert self.soup is not None content = self.soup.body or self.soup # Replace <br> tags with newline characters @@ -204,13 +203,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if hlevel == 1: self.content_layer = ContentLayer.BODY - - for key, in self.parents.keys(): + + for key in self.parents.keys(): self.parents[key] = None self.level = 1 self.parents[self.level] = doc.add_text( - parent=self.parents[0], label=DocItemLabel.TITLE, text=text, content_layer=self.content_layer + parent=self.parents[0], + label=DocItemLabel.TITLE, + text=text, + content_layer=self.content_layer, ) else: if hlevel > self.level: @@ -221,7 +223,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): name=f"header-{i}", label=GroupLabel.SECTION, parent=self.parents[i - 1], - content_layer=self.content_layer + content_layer=self.content_layer, ) self.level = hlevel @@ -237,7 +239,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[hlevel - 1], text=text, level=hlevel, - content_layer=self.content_layer + content_layer=self.content_layer, ) def handle_code(self, element: Tag, doc: DoclingDocument) -> None: @@ -246,7 +248,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return text = element.text.strip() if text: - doc.add_code(parent=self.parents[self.level], text=text, content_layer=self.content_layer) + doc.add_code( + parent=self.parents[self.level], + text=text, + content_layer=self.content_layer, + ) def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: """Handles paragraph tags (p).""" @@ -255,7 +261,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text = element.text.strip() label = DocItemLabel.TEXT if text: - doc.add_text(parent=self.parents[self.level], label=label, text=text, content_layer=self.content_layer) + doc.add_text( + parent=self.parents[self.level], + label=label, + text=text, + content_layer=self.content_layer, + ) def handle_list(self, element: Tag, doc: DoclingDocument) -> None: """Handles list tags (ul, ol) and their list items.""" @@ -263,7 +274,10 @@ class
HTMLDocumentBackend(DeclarativeDocumentBackend): if element.name == "ul": # create a list group self.parents[self.level + 1] = doc.add_group( - parent=self.parents[self.level], name="list", label=GroupLabel.LIST, content_layer=self.content_layer + parent=self.parents[self.level], + name="list", + label=GroupLabel.LIST, + content_layer=self.content_layer, ) elif element.name == "ol": start_attr = element.get("start") @@ -277,7 +291,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[self.level], name="ordered list" + (f" start {start}" if start != 1 else ""), label=GroupLabel.ORDERED_LIST, - content_layer=self.content_layer + content_layer=self.content_layer, ) self.level += 1 @@ -326,7 +340,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): enumerated=enumerated, marker=marker, parent=parent, - content_layer=self.content_layer + content_layer=self.content_layer, ) self.level += 1 @@ -348,7 +362,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): enumerated=enumerated, marker=marker, parent=parent, - content_layer=self.content_layer + content_layer=self.content_layer, ) else: _log.debug(f"list-item has no text: {element}") @@ -452,7 +466,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): table_data = HTMLDocumentBackend.parse_table_data(element) if table_data is not None: - doc.add_table(data=table_data, parent=self.parents[self.level], content_layer=self.content_layer) + doc.add_table( + data=table_data, + parent=self.parents[self.level], + content_layer=self.content_layer, + ) def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]: """Recursively extract text from