fix: HTML backend, fixes for Lists and nested texts (#180)

* Fixes for HTML backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* removed prints

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* cleaning up

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-10-25 20:14:04 +02:00 committed by GitHub
parent 88c1673057
commit 7d19418b77
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -136,7 +136,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def get_direct_text(self, item): def get_direct_text(self, item):
"""Get the direct text of the <li> element (ignoring nested lists).""" """Get the direct text of the <li> element (ignoring nested lists)."""
text = item.find(string=True, recursive=False) text = item.find(string=True, recursive=False)
if isinstance(text, str): if isinstance(text, str):
return text.strip() return text.strip()
@@ -149,21 +148,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(item, str): if isinstance(item, str):
return [item] return [item]
result.append(self.get_direct_text(item)) if item.name not in ["ul", "ol"]:
try:
# Iterate over the children (and their text and tails)
for child in item:
try:
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
except:
pass
except:
_log.warn("item has no children")
pass
try: return "".join(result) + " "
# Iterate over the children (and their text and tails)
for child in item:
try:
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
except:
pass
except:
_log.warn("item has no children")
pass
return " ".join(result)
def handle_header(self, element, idx, doc): def handle_header(self, element, idx, doc):
"""Handles header tags (h1, h2, etc.).""" """Handles header tags (h1, h2, etc.)."""
@@ -255,7 +253,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if nested_lists: if nested_lists:
name = element.name name = element.name
text = self.get_direct_text(element) # Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively
text = self.extract_text_recursively(element)
# Flatten text, remove break lines:
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
marker = "" marker = ""
enumerated = False enumerated = False
@@ -263,14 +266,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = str(index_in_list) marker = str(index_in_list)
enumerated = True enumerated = True
# create a list-item if len(text) > 0:
self.parents[self.level + 1] = doc.add_list_item( # create a list-item
text=text, self.parents[self.level + 1] = doc.add_list_item(
enumerated=enumerated, text=text,
marker=marker, enumerated=enumerated,
parent=self.parents[self.level], marker=marker,
) parent=self.parents[self.level],
self.level += 1 )
self.level += 1
self.walk(element, doc) self.walk(element, doc)