Fix HTML backend accordion extraction and hidden-content handling (#1)

* fix(html-backend): improve accordion extraction and hidden content handling

   - Add specialized handlers for Bootstrap accordion components to properly extract
     questions from panel-title elements
   - Implement is_hidden_element() method to detect and skip content with hidden
     classes, styles, and attributes
   - Update walk(), analyze_tag(), and extract_text_recursively() to filter out
     hidden elements
   - Add comprehensive test suite with direct method tests and example HTML files

   This fixes two issues:
   1. Missing questions in accordion components
   2. Unwanted extraction of hidden metadata content

   Tests: tests/test_html_enhanced.py

Signed-off-by: Ulan.Yisaev <ulan.yisaev@nortal.com>

* + html-backend itelsd

Signed-off-by: Ulan.Yisaev <ulan.yisaev@nortal.com>

* run pre-commit run --all-files

---------

Signed-off-by: Ulan.Yisaev <ulan.yisaev@nortal.com>
Co-authored-by: Ulan.Yisaev <ulan.yisaev@nortal.com>
This commit is contained in:
Ulan Yisaev 2025-03-09 18:13:24 +02:00 committed by GitHub
parent 4d64c4c0b6
commit 4c88d4fe14
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 299 additions and 0 deletions

View File

@ -40,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
"table",
"figure",
"img",
"div", # Add div to ensure panel-titles are considered
]
@ -126,6 +127,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
# Skip if the current tag is hidden
if self.is_hidden_element(tag):
return
# Iterate over elements in the body of the document
text: str = ""
for element in tag.children:
@ -161,6 +166,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
# Skip hidden elements
if self.is_hidden_element(tag):
return
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(tag, doc)
elif tag.name in ["p"]:
@ -177,6 +186,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.handle_figure(tag, doc)
elif tag.name == "img":
self.handle_image(tag, doc)
# Special handling for accordion panel titles (Bootstrap)
elif tag.name == "div" and "panel-title" in tag.get("class", []):
self.handle_panel_title(tag, doc)
# Special handling for entire accordion panels
elif tag.name == "div" and "panel" in tag.get("class", []):
self.handle_panel(tag, doc)
else:
self.walk(tag, doc)
@ -194,6 +209,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return [item]
tag = cast(Tag, item)
if self.is_hidden_element(tag):
return []
if tag.name not in ["ul", "ol"]:
for child in tag:
# Recursively get the child's text content
@ -201,6 +219,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return ["".join(result) + " "]
def is_hidden_element(self, tag: Tag) -> bool:
    """Return True when *tag* itself is marked as not visible.

    An element is considered hidden when any of the following holds:

    * its ``class`` attribute contains ``hidden``, ``d-none``, ``hide`` or
      ``invisible``;
    * its ``class`` attribute contains ``collapse`` without an accompanying
      ``show`` or ``in`` class (an expanded Bootstrap collapse is visible);
    * its inline ``style`` sets ``display:none`` or ``visibility:hidden``
      (matched ignoring whitespace and letter case);
    * it carries the ``hidden`` attribute.

    Only the element's own attributes are inspected; ancestors are not
    consulted. Anything that is not a ``Tag`` is reported as not hidden.
    """
    if not isinstance(tag, Tag):
        return False

    # The class attribute may arrive as a list of names or as a single
    # space-separated string; normalise it to a set for membership tests.
    classes = tag.get("class") or []
    if isinstance(classes, str):
        classes = classes.split()
    class_names = set(classes)

    if class_names & {"hidden", "d-none", "hide", "invisible"}:
        return True

    # A Bootstrap collapse is only hidden while it is not expanded:
    # "in" marks the expanded state in Bootstrap 3, "show" in later versions.
    if "collapse" in class_names and not class_names & {"show", "in"}:
        return True

    # Strip all whitespace and lowercase so "display: none" and
    # "DISPLAY:NONE" are recognised just like "display:none".
    style = tag.get("style") or ""
    if not isinstance(style, str):
        style = " ".join(style)
    compact_style = "".join(style.split()).lower()
    if "display:none" in compact_style or "visibility:hidden" in compact_style:
        return True

    # "hidden" is a boolean attribute: its presence alone hides the element.
    return tag.has_attr("hidden")
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles header tags (h1, h2, etc.)."""
hlevel = int(element.name.replace("h", ""))
@ -544,3 +588,58 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
caption=None,
content_layer=self.content_layer,
)
def handle_panel_title(self, element: Tag, doc: DoclingDocument) -> None:
    """Add the question held by a Bootstrap accordion ``panel-title`` to *doc*.

    The question is taken from the first ``<a>`` inside *element* and added
    as a ``DocItemLabel.TEXT`` item under the current parent
    (``self.parents[self.level]``).

    Nothing is added when *element* or its anchor is hidden, or when the
    anchor contains only whitespace. When *element* has no anchor at all,
    its children are processed with ``self.walk`` instead so that a
    plain-text title is not silently discarded.
    """
    if self.is_hidden_element(element):
        return

    anchor = element.find("a")
    if anchor is None:
        # No toggle link present: fall back to the generic traversal, the
        # same fallback handle_panel uses for panels without a body.
        self.walk(element, doc)
        return

    if self.is_hidden_element(anchor):
        return

    # Strip before testing so a whitespace-only link does not produce an
    # empty text item in the document.
    question_text = (anchor.text or "").strip()
    if not question_text:
        return

    doc.add_text(
        parent=self.parents[self.level],
        label=DocItemLabel.TEXT,
        text=question_text,
        content_layer=self.content_layer,
    )
def handle_panel(self, element: Tag, doc: DoclingDocument) -> None:
    """Convert one Bootstrap accordion ``panel`` into a question plus answer group.

    When the panel contains a visible ``div.panel-body``:

    1. the first visible ``div.panel-title`` is emitted through
       ``handle_panel_title``;
    2. a ``GroupLabel.SECTION`` group named ``"panel-answer"`` is added under
       the current parent, and the body is walked one level deeper with that
       group as parent.

    The body is located with ``find`` and only its own visibility is tested,
    so a body nested inside a collapsed wrapper is still extracted.

    When there is no visible body, the panel's children are processed with
    ``self.walk`` only. A hidden panel is skipped entirely.
    """
    if self.is_hidden_element(element):
        return

    panel_body = element.find("div", class_="panel-body")
    if panel_body is None or self.is_hidden_element(panel_body):
        # Generic traversal only. The title is not handled separately here:
        # analyze_tag already dispatches panel-title divs, so emitting it
        # first and then walking the panel would add the question twice.
        # NOTE(review): assumes walk() routes child tags through
        # analyze_tag() -- confirm against the full walk() implementation.
        self.walk(element, doc)
        return

    # Question first, so it precedes its answer group in the document.
    panel_title = element.find("div", class_="panel-title")
    if panel_title is not None and not self.is_hidden_element(panel_title):
        self.handle_panel_title(panel_title, doc)

    panel_group = doc.add_group(
        parent=self.parents[self.level],
        name="panel-answer",
        label=GroupLabel.SECTION,
        content_layer=self.content_layer,
    )

    # Descend one level so the body content is attached to the answer group.
    previous_level = self.level
    self.level += 1
    self.parents[self.level] = panel_group
    try:
        self.walk(panel_body, doc)
    finally:
        # Restore the nesting level even if walking the body raises, so
        # later siblings are not attached at the wrong depth.
        self.level = previous_level

View File

@ -0,0 +1,45 @@
<!DOCTYPE html>
<html>
<head>
<title>Accordion Test</title>
</head>
<body>
<!--
Fixture: a Bootstrap accordion (panel-group) with two panels.
Each panel has a panel-title whose link holds the question, and a
panel-body holding the answer paragraph. Every panel-body sits inside a
"panel-collapse collapse" wrapper and ends with a "keywords hidden" div.
-->
<div class="row">
<div class="col-xs-12">
<h3>Account Information FAQ</h3>
<div class="row">
<div class="col-xs-12">
<div class="panel-group" id="accordion-36" role="tablist" aria-multiselectable="true">
<div class="panel panel-default">
<div class="panel-heading" id="accordion-36h0kkk">
<div class="panel-title">
<a class="collapsed" role="button" data-toggle="collapse" data-path="faq/account-information" data-id="digitally-signed-statement" data-target="#accordion-36h0kkk.panel-collapse" aria-controls="accordion-36h0kkk">1. How can I get a digitally signed bank statement?</a>
</div>
</div>
<div class="panel-collapse collapse" id="accordion-36h0kkk" role="tabpanel" aria-labelledby="accordion-36h0kkk">
<div class="panel-body">
<p>You can download your statement from the online banking portal..</p>
<div class="keywords hidden">Account Information FAQ</div>
</div>
</div>
</div>
<div class="panel panel-default">
<div class="panel-heading" id="accordion-36h1kkk">
<div class="panel-title">
<a class="collapsed" role="button" data-toggle="collapse" data-path="faq/account-information" data-id="change-contact-details" data-target="#accordion-36h1kkk.panel-collapse" aria-controls="accordion-36h1kkk">2. How do I update my contact information?</a>
</div>
</div>
<div class="panel-collapse collapse" id="accordion-36h1kkk" role="tabpanel" aria-labelledby="accordion-36h1kkk">
<div class="panel-body">
<p>You can update your contact details through the online banking portal..</p>
<div class="keywords hidden">Account Information FAQ</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</body>
</html>

View File

@ -0,0 +1,50 @@
<!DOCTYPE html>
<html>
<head>
<title>Hidden Elements Test</title>
</head>
<body>
<!--
Fixture: one container mixing visible paragraphs with blocks hidden in
different ways (class "hidden", inline display:none, the hidden attribute,
and the classes "d-none", "invisible" and "collapse"). The text of each
paragraph states whether it is expected to be extracted.
-->
<div class="container">
<h3>Visible Elements Test</h3>
<!-- Visible content that should be extracted -->
<p>This is a regular paragraph that should be extracted.</p>
<!-- Content with class="hidden" that should be skipped -->
<div class="hidden">
<p>This text has class="hidden" and should NOT be extracted.</p>
</div>
<!-- Content with style="display:none" that should be skipped -->
<div style="display:none">
<p>This text has style="display:none" and should NOT be extracted.</p>
</div>
<!-- Content with hidden attribute that should be skipped -->
<div hidden>
<p>This text has the hidden attribute and should NOT be extracted.</p>
</div>
<!-- Content with class="d-none" (Bootstrap) that should be skipped -->
<div class="d-none">
<p>This text has class="d-none" and should NOT be extracted.</p>
</div>
<!-- Content with class="invisible" (Bootstrap) that should be skipped -->
<div class="invisible">
<p>This text has class="invisible" and should NOT be extracted.</p>
</div>
<!-- Content with class="collapse" (Bootstrap) that should be skipped -->
<div class="collapse">
<p>This text has class="collapse" and should NOT be extracted.</p>
</div>
<!-- Visible content that should be extracted -->
<div class="visible-content">
<p>This is another regular paragraph that should be extracted.</p>
<div class="keywords hidden">Keywords that should NOT be extracted.</div>
</div>
</div>
</body>
</html>

105
tests/test_html_enhanced.py Normal file
View File

@ -0,0 +1,105 @@
import sys
from pathlib import Path
import re
# Add the root directory to the system path
sys.path.insert(0, str(Path(__file__).parent.parent))
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc.document import ContentLayer
from docling.datamodel.document import InputDocument, DoclingDocument
def test_is_hidden_element():
    """Exercise HTMLDocumentBackend.is_hidden_element on standalone tags.

    Each case is a one-element HTML snippet paired with the expected result.
    The cases cover the hiding classes, inline styles, the ``hidden``
    attribute, a multi-class value, and plain visible elements.
    """
    from bs4 import BeautifulSoup

    # The backend constructor requires an input document; wiki_duck.html is
    # used as that input, its content is not examined by this test.
    in_path = Path("tests/data/html/wiki_duck.html")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
    )
    backend = HTMLDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )

    cases = [
        # Hidden by class
        ('<div class="hidden">Test</div>', True),
        ('<div class="d-none">Test</div>', True),
        ('<div class="hide">Test</div>', True),
        ('<div class="invisible">Test</div>', True),
        ('<div class="collapse">Test</div>', True),
        # Hiding class combined with an unrelated class
        ('<div class="keywords hidden">Test</div>', True),
        # Hidden by inline style
        ('<div style="display:none">Test</div>', True),
        ('<div style="visibility:hidden">Test</div>', True),
        # Hidden by attribute
        ("<div hidden>Test</div>", True),
        # Not hidden
        ("<div>Test</div>", False),
        ('<div class="visible-content">Test</div>', False),
    ]
    for markup, expected in cases:
        tag = BeautifulSoup(markup, "html.parser").div
        # Identity check: the method is annotated to return a real bool.
        assert backend.is_hidden_element(tag) is expected, markup

    print("All is_hidden_element tests passed!")
def test_panel_title_extraction():
    """Call handle_panel_title directly and check the question reaches the document."""
    from bs4 import BeautifulSoup

    # The backend constructor requires an input document; wiki_duck.html is
    # used as that input.
    source_path = Path("tests/data/html/wiki_duck.html")
    backend = HTMLDocumentBackend(
        in_doc=InputDocument(
            path_or_stream=source_path,
            format=InputFormat.HTML,
            backend=HTMLDocumentBackend,
        ),
        path_or_stream=source_path,
    )

    # State that handle_panel_title reads: the content layer and the
    # current parent (root level, no parent item).
    backend.content_layer = ContentLayer.BODY
    backend.level = 0
    backend.parents = {0: None}

    # Empty target document and a standalone panel-title tag to feed in.
    target_doc = DoclingDocument(name="test")
    markup = """
<div class="panel-title">
<a class="collapsed" role="button">How can I get a digitally signed bank statement?</a>
</div>
"""
    title_tag = BeautifulSoup(markup, "html.parser").div

    backend.handle_panel_title(title_tag, target_doc)

    # Exactly one item must have been added, and it must carry the question.
    assert len(target_doc.body.children) == 1
    exported = target_doc.export_to_markdown()
    assert "How can I get a digitally signed bank statement?" in exported
    print("Panel title extraction test passed!")
if __name__ == "__main__":
    # Allow running this module as a plain script: execute each test in
    # order; a failing assertion aborts before the final message.
    for run_test in (test_is_hidden_element, test_panel_title_extraction):
        run_test()
    print("All tests passed successfully!")