mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
feat: add ability to detect h1 and filter from there on
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
9d8865856d
commit
c52e68c52b
@@ -81,6 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Replace <br> tags with newline characters
|
||||
for br in self.soup.body.find_all("br"):
|
||||
br.replace_with("\n")
|
||||
|
||||
self.contains_h1 = True
|
||||
self.detected_h1 = False
|
||||
|
||||
doc = self.walk(self.soup.body, doc)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
@@ -116,19 +120,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
self.labels[element.name] = 1
|
||||
|
||||
if element.name in ["h1"]:
|
||||
self.detected_h1 = True
|
||||
|
||||
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_header(element, idx, doc)
|
||||
elif element.name in ["p"]:
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_paragraph(element, idx, doc)
|
||||
elif element.name in ["ul", "ol"]:
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_list(element, idx, doc)
|
||||
elif element.name in ["li"]:
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_listitem(element, idx, doc)
|
||||
elif element.name == "table":
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_table(element, idx, doc)
|
||||
elif element.name == "figure":
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_figure(element, idx, doc)
|
||||
elif element.name == "img":
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_image(element, idx, doc)
|
||||
else:
|
||||
self.walk(element, doc)
|
||||
|
Loading…
Reference in New Issue
Block a user