updated the html backend to add svg, remove empty list-items and use data-content fields

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-12-12 14:48:21 +00:00 · 2024-11-05 06:37:43 +01:00
parent 5fc4d5bd3d
commit f276c0cc90
1 changed files with 38 additions and 49 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -47,16 +47,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        try:
            if isinstance(self.path_or_stream, BytesIO):
                text_stream = self.path_or_stream.getvalue().decode("utf-8")
-                print("BytesIO")
+                _log.debug("reading from BytesIO")
                self.soup = BeautifulSoup(text_stream, "html.parser")
            if isinstance(self.path_or_stream, Path):
-                print("file")
+                _log.debug("reading from file")
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    html_content = f.read()
                    with open("./scratch/file.html", "w") as fw:
                        fw.write(html_content)
                    self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
            raise RuntimeError(
@@ -115,31 +111,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    try:
                        self.analyse_element(child, idx, doc)
                    except Exception as exc:
-                        _log.error(f" -> error treating child: {exc}")
+                        _log.info(f" -> error treating child: {exc}")
                        raise exc
            elif isinstance(element, Tag):
                try:
                    self.analyse_element(element, 0, doc)
                except Exception as exc:
-                    _log.error(f" -> error treating elem: {exc}")
+                    _log.info(f" -> error treating elem: {exc}")
                    raise exc
            else:
-                _log.warn(f"ignoring element of type {type(element)}")
+                _log.debug(f"ignoring element of type {type(element)}")
        except Exception as exc:
-            _log.warn(f"error walking element: {type(element)}")
+            _log.debug(f"error walking element: {type(element)}")
            pass
        return doc
    def is_body(self):
        """Tell whether elements should currently be handled as body content.

        Returns True when `self.contains_h1` is falsy; otherwise returns
        the value of `self.detected_h1`.
        """
        if not self.contains_h1:
            return True
        return self.detected_h1
    def analyse_element(self, element, idx, doc):
        if element.name!=None:
-            #_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
+            _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
            print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
        #print(element.name)
        if element.name in self.labels:
            self.labels[element.name] += 1
@@ -150,53 +146,43 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            self.detected_h1 = True
        if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
-            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+            if self.is_body():
                self.handle_header(element, idx, doc)
        elif element.name in ["p"]:
-            print(" --> detected ...")
+            if self.is_body():
            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
                self.handle_paragraph(element, idx, doc)
                print(" --> registered ...")
        elif element.name in ["ul", "ol"]:
-            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+            if self.is_body():
                self.handle_list(element, idx, doc)
        elif element.name in ["li"]:
-            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+            if self.is_body():
                self.handle_listitem(element, idx, doc)
        elif element.name == "table":
-            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+            if self.is_body():
                self.handle_table(element, idx, doc)
        elif element.name == "figure":
-            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+            if self.is_body():
                self.handle_figure(element, idx, doc)
        elif element.name == "img":
-            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+            if self.is_body():
                self.handle_image(element, idx, doc)
        elif element.name == "svg":
-            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+            if self.is_body():
-                #self.handle_image(element, idx, doc)
+                self.handle_svg(element, idx, doc)
                _log.warn("Add `svg` elements")
-        elif True and isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'):
+        elif isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'):
            try:
                #print("\n\n\nattempt decoding: ", element['data-content'])
                # Decode the data-content attribute
                #data_content = html.unescape(element['data-content'])
                #print(data_content)
                data_content = element['data-content']
                # Parse the decoded HTML content
                content_soup = BeautifulSoup(data_content, 'html.parser')
                print("\n\n\nsoup: ", content_soup)
                for jdx, _ in enumerate(content_soup):
                    print(_)
                    self.analyse_element(_, jdx, doc)
            except:
-                _log.warn("could not parse the `data-content` attribute")
+                _log.debug("could not parse the `data-content` attribute")
            self.walk(element, doc)
@@ -278,16 +264,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def handle_paragraph(self, element, idx, doc):
        """Handles paragraph tags (p)."""
        if element.text is None:            
            print(" -> text is None ...")
            return
        text = element.text.strip()
-        print("doc is adding paragraph: ", text)
+        if len(text) == 0:
            return
        label = DocItemLabel.PARAGRAPH
        if len(text) == 0:
            print(" -> text is zero length ...")
            return
        print("doc is adding paragraph: ", text)
        doc.add_text(parent=self.parents[self.level], label=label, text=text)
    def handle_list(self, element, idx, doc):
@@ -325,7 +308,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            # we need to extract it recursively
            text = self.extract_text_recursively(element)
            # Flatten text, remove break lines:
-            text = text.replace("\n", "").replace("\r", "")
+            text = text.replace("\n", " ").replace("\r", "")
            text = " ".join(text.split()).strip()
            marker = ""
@@ -357,6 +340,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            if parent_list_label == GroupLabel.ORDERED_LIST:
                marker = f"{str(index_in_list)}."
                enumerated = True
            if len(text) > 0:
                doc.add_list_item(
                    text=text,
                    enumerated=enumerated,
@@ -502,3 +487,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def handle_image(self, element, idx, doc):
        """Handles image tags (img).

        Adds a picture without caption to `doc`, attached to the parent
        stored for the current nesting level. `element` and `idx` are
        accepted for signature parity with the other handlers but unused.
        """
        current_parent = self.parents[self.level]
        doc.add_picture(parent=current_parent, caption=None)
    def handle_svg(self, element, idx, doc):
        """Handles svg tags.

        Adds a picture without caption to `doc`, attached to the parent
        stored for the current nesting level. The svg content itself is
        not inspected: `element` and `idx` are unused.
        """
        current_parent = self.parents[self.level]
        doc.add_picture(parent=current_parent, caption=None)