mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Fixes for HTML backend
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
77a89c3334
commit
7332360e27
@ -136,6 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def get_direct_text(self, item):
|
||||
"""Get the direct text of the <li> element (ignoring nested lists)."""
|
||||
text = item.find(string=True, recursive=False)
|
||||
print(text)
|
||||
|
||||
if isinstance(text, str):
|
||||
return text.strip()
|
||||
@ -149,21 +150,22 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if isinstance(item, str):
|
||||
return [item]
|
||||
|
||||
result.append(self.get_direct_text(item))
|
||||
# result.append(self.get_direct_text(item))
|
||||
|
||||
try:
|
||||
# Iterate over the children (and their text and tails)
|
||||
for child in item:
|
||||
try:
|
||||
# Recursively get the child's text content
|
||||
result.extend(self.extract_text_recursively(child))
|
||||
except:
|
||||
pass
|
||||
except:
|
||||
_log.warn("item has no children")
|
||||
pass
|
||||
if item.name not in ["ul", "ol"]:
|
||||
try:
|
||||
# Iterate over the children (and their text and tails)
|
||||
for child in item:
|
||||
try:
|
||||
# Recursively get the child's text content
|
||||
result.extend(self.extract_text_recursively(child))
|
||||
except:
|
||||
pass
|
||||
except:
|
||||
_log.warn("item has no children")
|
||||
pass
|
||||
|
||||
return " ".join(result)
|
||||
return "".join(result) + " "
|
||||
|
||||
def handle_header(self, element, idx, doc):
|
||||
"""Handles header tags (h1, h2, etc.)."""
|
||||
@ -255,7 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
if nested_lists:
|
||||
name = element.name
|
||||
text = self.get_direct_text(element)
|
||||
# text = self.get_direct_text(element)
|
||||
# Text in list item can be hidden within hierarchy, hence
|
||||
# we need to extract it recursively
|
||||
text = self.extract_text_recursively(element)
|
||||
# Flatten text, remove break lines:
|
||||
text = text.replace("\n", "").replace("\r", "")
|
||||
text = " ".join(text.split()).strip()
|
||||
|
||||
marker = ""
|
||||
enumerated = False
|
||||
@ -263,14 +271,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
marker = str(index_in_list)
|
||||
enumerated = True
|
||||
|
||||
# create a list-item
|
||||
self.parents[self.level + 1] = doc.add_list_item(
|
||||
text=text,
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.level += 1
|
||||
if len(text) > 0:
|
||||
# create a list-item
|
||||
self.parents[self.level + 1] = doc.add_list_item(
|
||||
text=text,
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
self.walk(element, doc)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user