feat: add ability to detect h1 and filter from there on

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-10-31 15:50:26 +01:00
parent 9d8865856d
commit c52e68c52b

View File

@ -81,6 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Replace <br> tags with newline characters # Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"): for br in self.soup.body.find_all("br"):
br.replace_with("\n") br.replace_with("\n")
self.contains_h1 = True
self.detected_h1 = False
doc = self.walk(self.soup.body, doc) doc = self.walk(self.soup.body, doc)
else: else:
raise RuntimeError( raise RuntimeError(
@ -116,19 +120,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
else: else:
self.labels[element.name] = 1 self.labels[element.name] = 1
if element.name in ["h1"]:
self.detected_h1 = True
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_header(element, idx, doc) self.handle_header(element, idx, doc)
elif element.name in ["p"]: elif element.name in ["p"]:
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_paragraph(element, idx, doc) self.handle_paragraph(element, idx, doc)
elif element.name in ["ul", "ol"]: elif element.name in ["ul", "ol"]:
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_list(element, idx, doc) self.handle_list(element, idx, doc)
elif element.name in ["li"]: elif element.name in ["li"]:
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_listitem(element, idx, doc) self.handle_listitem(element, idx, doc)
elif element.name == "table": elif element.name == "table":
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_table(element, idx, doc) self.handle_table(element, idx, doc)
elif element.name == "figure": elif element.name == "figure":
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_figure(element, idx, doc) self.handle_figure(element, idx, doc)
elif element.name == "img": elif element.name == "img":
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_image(element, idx, doc) self.handle_image(element, idx, doc)
else: else:
self.walk(element, doc) self.walk(element, doc)