fix: HTML backend, fixes for Lists and nested texts (#180)

* Fixes for HTML backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* removed prints

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* cleaning up

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-10-25 20:14:04 +02:00 committed by GitHub
parent 88c1673057
commit 7d19418b77
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -136,7 +136,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def get_direct_text(self, item): def get_direct_text(self, item):
"""Get the direct text of the <li> element (ignoring nested lists).""" """Get the direct text of the <li> element (ignoring nested lists)."""
text = item.find(string=True, recursive=False) text = item.find(string=True, recursive=False)
if isinstance(text, str): if isinstance(text, str):
return text.strip() return text.strip()
@@ -149,8 +148,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(item, str): if isinstance(item, str):
return [item] return [item]
result.append(self.get_direct_text(item)) if item.name not in ["ul", "ol"]:
try: try:
# Iterate over the children (and their text and tails) # Iterate over the children (and their text and tails)
for child in item: for child in item:
@@ -163,7 +161,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
_log.warn("item has no children") _log.warn("item has no children")
pass pass
return " ".join(result) return "".join(result) + " "
def handle_header(self, element, idx, doc): def handle_header(self, element, idx, doc):
"""Handles header tags (h1, h2, etc.).""" """Handles header tags (h1, h2, etc.)."""
@@ -255,7 +253,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if nested_lists: if nested_lists:
name = element.name name = element.name
text = self.get_direct_text(element) # Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively
text = self.extract_text_recursively(element)
# Flatten text, remove break lines:
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
marker = "" marker = ""
enumerated = False enumerated = False
@@ -263,6 +266,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = str(index_in_list) marker = str(index_in_list)
enumerated = True enumerated = True
if len(text) > 0:
# create a list-item # create a list-item
self.parents[self.level + 1] = doc.add_list_item( self.parents[self.level + 1] = doc.add_list_item(
text=text, text=text,