reformatted the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-27 12:34:22 +00:00 · 2024-11-05 07:25:21 +01:00 · 2024-11-05 07:25:21 +01:00 · ddd1474c8d
commit ddd1474c8d
parent 3257034631
1 changed files with 23 additions and 20 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,12 +1,11 @@
-import re
 import logging
+import re
 from io import BytesIO
 from pathlib import Path
 from typing import Set, Union

 from bs4 import BeautifulSoup
 from bs4.element import Tag
-
 from docling_core.types.doc import (
    DocItemLabel,
    DoclingDocument,
@@ -171,14 +170,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            if self.is_body():
                self.handle_svg(element, idx, doc)

-        elif isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'):
+        elif (
+            isinstance(element, Tag)
+            and element.name in ["section"]
+            and element.has_attr("data-content")
+        ):
            try:
                # Decode the data-content attribute
                # data_content = html.unescape(element['data-content'])
-                data_content = element['data-content']
+                data_content = element["data-content"]

                # Parse the decoded HTML content
-                content_soup = BeautifulSoup(data_content, 'html.parser')
+                content_soup = BeautifulSoup(data_content, "html.parser")

                for jdx, _ in enumerate(content_soup):
                    self.analyse_element(_, jdx, doc)
@@ -311,7 +314,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            # Flatten text, remove break lines:
            text = text.replace("\n", " ").replace("\r", "")
            text = " ".join(text.split()).strip()
-            text = re.sub(r'\s{2,}', ' ', text)
+            text = re.sub(r"\s{2,}", " ", text)

            marker = ""
            enumerated = False
@@ -337,7 +340,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        elif isinstance(element.text, str):
            text = element.text.strip()
            text = text.replace("\n", " ").replace("\r", "")
-            text = re.sub(r'\s{2,}', ' ', text)
+            text = re.sub(r"\s{2,}", " ", text)

            marker = ""
            enumerated = False