mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 22:28:31 +00:00
fix: Proper heading support in rich tables for HTML backend (#2394)
* Fix for the proper headers support in rich tables in HTML Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * cleaning up Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Compatibility with older Python versions Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fixing Furniture before the first heading rule Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added minimalistic test case Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * added html for the test Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> --------- Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
@@ -272,9 +272,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for br in content("br"):
|
||||
br.replace_with(NavigableString("\n"))
|
||||
# set default content layer
|
||||
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
|
||||
|
||||
# Furniture before the first heading rule, except for headers in tables
|
||||
header = None
|
||||
# Find all headers first
|
||||
all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
|
||||
# Keep only those that do NOT have a <table> in a parent chain
|
||||
clean_headers = [h for h in all_headers if not h.find_parent("table")]
|
||||
# Pick the first header from the remaining
|
||||
if len(clean_headers):
|
||||
header = clean_headers[0]
|
||||
# Set starting content layer
|
||||
self.content_layer = (
|
||||
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
||||
ContentLayer.BODY if header is None else ContentLayer.FURNITURE
|
||||
)
|
||||
# reset context
|
||||
self.ctx = _Context()
|
||||
@@ -309,9 +319,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
group_name: str,
|
||||
doc: DoclingDocument,
|
||||
docling_table: TableItem,
|
||||
) -> tuple[bool, RefItem]:
|
||||
) -> tuple[bool, Union[RefItem, None]]:
|
||||
rich_table_cell = False
|
||||
ref_for_rich_cell = provs_in_cell[0]
|
||||
ref_for_rich_cell = None
|
||||
if len(provs_in_cell) > 0:
|
||||
ref_for_rich_cell = provs_in_cell[0]
|
||||
if len(provs_in_cell) > 1:
|
||||
# Cell has multiple elements, we need to group them
|
||||
rich_table_cell = True
|
||||
@@ -324,7 +336,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if isinstance(pr_item, TextItem):
|
||||
# Cell has only one element and it's just a text
|
||||
rich_table_cell = False
|
||||
doc.delete_items(node_items=[pr_item])
|
||||
try:
|
||||
doc.delete_items(node_items=[pr_item])
|
||||
except Exception as e:
|
||||
_log.error(f"Error while making rich table: {e}.")
|
||||
else:
|
||||
rich_table_cell = True
|
||||
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
|
||||
@@ -391,17 +406,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
provs_in_cell: list[RefItem] = []
|
||||
# Parse table cell sub-tree for Rich Cells content:
|
||||
table_level = self.level
|
||||
provs_in_cell = self._walk(html_cell, doc)
|
||||
# After walking sub-tree in cell, restore previously set level
|
||||
self.level = table_level
|
||||
|
||||
rich_table_cell = False
|
||||
ref_for_rich_cell = None
|
||||
if len(provs_in_cell) > 0:
|
||||
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
|
||||
rich_table_cell, ref_for_rich_cell = (
|
||||
HTMLDocumentBackend.process_rich_table_cells(
|
||||
provs_in_cell, group_name, doc, docling_table
|
||||
)
|
||||
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
|
||||
rich_table_cell, ref_for_rich_cell = (
|
||||
HTMLDocumentBackend.process_rich_table_cells(
|
||||
provs_in_cell, group_name, doc, docling_table
|
||||
)
|
||||
)
|
||||
|
||||
# Extracting text
|
||||
text = self.get_text(html_cell).strip()
|
||||
@@ -774,13 +791,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for key in self.parents.keys():
|
||||
self.parents[key] = None
|
||||
self.level = 0
|
||||
docling_title = self.parents[self.level + 1] = doc.add_title(
|
||||
self.parents[self.level + 1] = doc.add_title(
|
||||
text_clean,
|
||||
content_layer=self.content_layer,
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
added_ref = [docling_title.get_ref()]
|
||||
p1 = self.parents[self.level + 1]
|
||||
if p1 is not None:
|
||||
added_ref = [p1.get_ref()]
|
||||
# the other levels need to be lowered by 1 if a title was set
|
||||
else:
|
||||
level -= 1
|
||||
@@ -802,7 +821,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
_log.debug(f"Remove the tail of level {key}")
|
||||
self.parents[key] = None
|
||||
self.level = level
|
||||
docling_heading = self.parents[self.level + 1] = doc.add_heading(
|
||||
self.parents[self.level + 1] = doc.add_heading(
|
||||
parent=self.parents[self.level],
|
||||
text=text_clean,
|
||||
orig=annotated_text.text,
|
||||
@@ -811,7 +830,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
added_ref = [docling_heading.get_ref()]
|
||||
p2 = self.parents[self.level + 1]
|
||||
if p2 is not None:
|
||||
added_ref = [p2.get_ref()]
|
||||
self.level += 1
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
|
||||
Reference in New Issue
Block a user