Fixes for HTML backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-10-25 16:27:54 +02:00
parent 77a89c3334
commit 7332360e27

View File

@ -136,6 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def get_direct_text(self, item): def get_direct_text(self, item):
"""Get the direct text of the <li> element (ignoring nested lists).""" """Get the direct text of the <li> element (ignoring nested lists)."""
text = item.find(string=True, recursive=False) text = item.find(string=True, recursive=False)
print(text)
if isinstance(text, str): if isinstance(text, str):
return text.strip() return text.strip()
@ -149,21 +150,22 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(item, str): if isinstance(item, str):
return [item] return [item]
result.append(self.get_direct_text(item)) # result.append(self.get_direct_text(item))
try: if item.name not in ["ul", "ol"]:
# Iterate over the children (and their text and tails) try:
for child in item: # Iterate over the children (and their text and tails)
try: for child in item:
# Recursively get the child's text content try:
result.extend(self.extract_text_recursively(child)) # Recursively get the child's text content
except: result.extend(self.extract_text_recursively(child))
pass except:
except: pass
_log.warn("item has no children") except:
pass _log.warn("item has no children")
pass
return " ".join(result) return "".join(result) + " "
def handle_header(self, element, idx, doc): def handle_header(self, element, idx, doc):
"""Handles header tags (h1, h2, etc.).""" """Handles header tags (h1, h2, etc.)."""
@ -255,7 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if nested_lists: if nested_lists:
name = element.name name = element.name
text = self.get_direct_text(element) # text = self.get_direct_text(element)
# Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively
text = self.extract_text_recursively(element)
# Flatten text, remove break lines:
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
marker = "" marker = ""
enumerated = False enumerated = False
@ -263,14 +271,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = str(index_in_list) marker = str(index_in_list)
enumerated = True enumerated = True
# create a list-item if len(text) > 0:
self.parents[self.level + 1] = doc.add_list_item( # create a list-item
text=text, self.parents[self.level + 1] = doc.add_list_item(
enumerated=enumerated, text=text,
marker=marker, enumerated=enumerated,
parent=self.parents[self.level], marker=marker,
) parent=self.parents[self.level],
self.level += 1 )
self.level += 1
self.walk(element, doc) self.walk(element, doc)