From c52e68c52b926022cec1911dd5a0e5ea5069a070 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Thu, 31 Oct 2024 15:50:26 +0100
Subject: [PATCH] feat: add ability to detect h1 and filter from there on

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling/backend/html_backend.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 7d14c2eb..b34e6cc3 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -81,6 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             # Replace <br> tags with newline characters
             for br in self.soup.body.find_all("br"):
                 br.replace_with("\n")
+
+            self.contains_h1 = True
+            self.detected_h1 = False
+                
             doc = self.walk(self.soup.body, doc)
         else:
             raise RuntimeError(
@@ -116,20 +120,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         else:
             self.labels[element.name] = 1
 
+        if element.name in ["h1"]:
+            self.detected_h1 = True
+            
         if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
-            self.handle_header(element, idx, doc)
+            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+                self.handle_header(element, idx, doc)
         elif element.name in ["p"]:
-            self.handle_paragraph(element, idx, doc)
+            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+                self.handle_paragraph(element, idx, doc)
         elif element.name in ["ul", "ol"]:
-            self.handle_list(element, idx, doc)
+            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+                self.handle_list(element, idx, doc)
         elif element.name in ["li"]:
-            self.handle_listitem(element, idx, doc)
+            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+                self.handle_listitem(element, idx, doc)
         elif element.name == "table":
-            self.handle_table(element, idx, doc)
+            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+                self.handle_table(element, idx, doc)
         elif element.name == "figure":
-            self.handle_figure(element, idx, doc)
+            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+                self.handle_figure(element, idx, doc)
         elif element.name == "img":
-            self.handle_image(element, idx, doc)
+            if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
+                self.handle_image(element, idx, doc)
         else:
             self.walk(element, doc)