mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Fix html backend accordion hidden (#1)
* fix(html-backend): improve accordion extraction and hidden content handling

  - Add specialized handlers for Bootstrap accordion components to properly extract questions from panel-title elements
  - Implement is_hidden_element() method to detect and skip content with hidden classes, styles, and attributes
  - Update walk(), analyze_tag(), and extract_text_recursively() to filter out hidden elements
  - Add comprehensive test suite with direct method tests and example HTML files

  This fixes two issues:
  1. Missing questions in accordion components
  2. Unwanted extraction of hidden metadata content

  Tests: tests/test_html_enhanced.py

  Signed-off-by: Ulan.Yisaev <ulan.yisaev@nortal.com>

* + html-backend itelsd

  Signed-off-by: Ulan.Yisaev <ulan.yisaev@nortal.com>

* run pre-commit run --all-files

---------

Signed-off-by: Ulan.Yisaev <ulan.yisaev@nortal.com>
Co-authored-by: Ulan.Yisaev <ulan.yisaev@nortal.com>
This commit is contained in:
parent
4d64c4c0b6
commit
4c88d4fe14
@ -40,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
||||
"table",
|
||||
"figure",
|
||||
"img",
|
||||
"div", # Add div to ensure panel-titles are considered
|
||||
]
|
||||
|
||||
|
||||
@ -126,6 +127,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
|
||||
# Skip if the current tag is hidden
|
||||
if self.is_hidden_element(tag):
|
||||
return
|
||||
|
||||
# Iterate over elements in the body of the document
|
||||
text: str = ""
|
||||
for element in tag.children:
|
||||
@ -161,6 +166,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
|
||||
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
# Skip hidden elements
|
||||
if self.is_hidden_element(tag):
|
||||
return
|
||||
|
||||
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
self.handle_header(tag, doc)
|
||||
elif tag.name in ["p"]:
|
||||
@ -177,6 +186,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.handle_figure(tag, doc)
|
||||
elif tag.name == "img":
|
||||
self.handle_image(tag, doc)
|
||||
# Special handling for accordion panel titles (Bootstrap)
|
||||
elif tag.name == "div" and "panel-title" in tag.get("class", []):
|
||||
self.handle_panel_title(tag, doc)
|
||||
# Special handling for entire accordion panels
|
||||
elif tag.name == "div" and "panel" in tag.get("class", []):
|
||||
self.handle_panel(tag, doc)
|
||||
else:
|
||||
self.walk(tag, doc)
|
||||
|
||||
@ -194,6 +209,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return [item]
|
||||
|
||||
tag = cast(Tag, item)
|
||||
if self.is_hidden_element(tag):
|
||||
return []
|
||||
|
||||
if tag.name not in ["ul", "ol"]:
|
||||
for child in tag:
|
||||
# Recursively get the child's text content
|
||||
@ -201,6 +219,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return ["".join(result) + " "]
|
||||
|
||||
def is_hidden_element(self, tag: Tag) -> bool:
    """Return True when *tag* itself is marked as hidden.

    An element is treated as hidden when any of the following holds:

    * it carries one of the classes ``hidden``, ``d-none``, ``hide`` or
      ``invisible``;
    * it carries the Bootstrap ``collapse`` class without ``show``
      (Bootstrap 4/5) or ``in`` (Bootstrap 3), i.e. it is collapsed;
    * its inline ``style`` declares ``display:none`` or
      ``visibility:hidden`` (whitespace and letter case are ignored);
    * it has the HTML ``hidden`` attribute.

    Only the element itself is inspected; ancestors are not consulted.
    Anything that is not a ``Tag`` is reported as not hidden.
    """
    if not isinstance(tag, Tag):
        return False

    # ``class`` is normally a list of tokens, but tolerate a plain string
    # or an explicit None value.
    classes = tag.get("class") or []
    if isinstance(classes, str):
        classes = classes.split()
    class_set = set(classes)

    if class_set & {"hidden", "d-none", "hide", "invisible"}:
        return True

    # An expanded collapsible (``collapse show`` / ``collapse in``) is
    # visible on the page, so only a bare ``collapse`` counts as hidden.
    if "collapse" in class_set and not class_set & {"show", "in"}:
        return True

    style = tag.get("style") or ""
    if isinstance(style, list):
        style = " ".join(style)
    # Normalise so that "display: none" and "DISPLAY:NONE" are recognised
    # as well as the compact lower-case spelling.
    compact_style = "".join(style.split()).lower()
    if "display:none" in compact_style or "visibility:hidden" in compact_style:
        return True

    return tag.has_attr("hidden")
|
||||
|
||||
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles header tags (h1, h2, etc.)."""
|
||||
hlevel = int(element.name.replace("h", ""))
|
||||
@ -544,3 +588,58 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
caption=None,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
def handle_panel_title(self, element: Tag, doc: DoclingDocument) -> None:
    """Add the question text of a Bootstrap accordion panel title to *doc*.

    The question is taken from the first ``<a>`` inside *element*. When the
    title contains no anchor at all, the title's own text is used instead
    so the question is not silently lost. Nothing is added when the title
    or its anchor is hidden, or when the resulting text is empty after
    stripping whitespace.

    The text item is attached to ``self.parents[self.level]`` with label
    ``DocItemLabel.TEXT`` and the backend's current ``content_layer``.
    """
    if self.is_hidden_element(element):
        return

    anchor = element.find("a")
    if anchor is None:
        # No link wrapper around the question: use the title text itself.
        question_text = element.text.strip()
    elif self.is_hidden_element(anchor):
        return
    else:
        question_text = anchor.text.strip()

    # Strip before testing so a whitespace-only title does not produce an
    # empty text item in the document.
    if not question_text:
        return

    doc.add_text(
        parent=self.parents[self.level],
        label=DocItemLabel.TEXT,
        text=question_text,
        content_layer=self.content_layer,
    )
|
||||
|
||||
def handle_panel(self, element: Tag, doc: DoclingDocument) -> None:
    """Convert one Bootstrap accordion panel into a question plus answer group.

    When the panel has a visible ``div.panel-body``:

    1. the first visible ``div.panel-title`` is emitted through
       ``handle_panel_title``;
    2. a ``GroupLabel.SECTION`` group named ``"panel-answer"`` is created
       under the current parent and the body is walked with that group as
       the parent one level deeper.

    When there is no visible body the panel is simply walked like any other
    container. Hidden panels are skipped entirely.
    """
    if self.is_hidden_element(element):
        return

    panel_body = element.find("div", class_="panel-body")
    if panel_body is None or self.is_hidden_element(panel_body):
        # Fall back to the generic traversal. The title is deliberately not
        # emitted here first: analyze_tag dispatches ``div.panel-title`` to
        # handle_panel_title, so emitting it and then walking the panel
        # would presumably add the same question twice.
        self.walk(element, doc)
        return

    # Question first, so it precedes its answer in document order.
    panel_title = element.find("div", class_="panel-title")
    if panel_title is not None and not self.is_hidden_element(panel_title):
        self.handle_panel_title(panel_title, doc)

    panel_group = doc.add_group(
        parent=self.parents[self.level],
        name="panel-answer",
        label=GroupLabel.SECTION,
        content_layer=self.content_layer,
    )

    # Descend one level so the body content is attached to the answer group.
    previous_level = self.level
    self.level += 1
    self.parents[self.level] = panel_group
    try:
        self.walk(panel_body, doc)
    finally:
        # Restore the nesting level even if walking the body fails, so later
        # siblings are not attached at the wrong depth.
        self.level = previous_level
|
||||
|
45
tests/data/html/accordion_test.html
Normal file
45
tests/data/html/accordion_test.html
Normal file
@ -0,0 +1,45 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Accordion Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<div class="row">
|
||||
<div class="col-xs-12">
|
||||
<h3>Account Information FAQ</h3>
|
||||
<div class="row">
|
||||
<div class="col-xs-12">
|
||||
<div class="panel-group" id="accordion-36" role="tablist" aria-multiselectable="true">
|
||||
<div class="panel panel-default">
|
||||
<div class="panel-heading" id="accordion-36h0kkk">
|
||||
<div class="panel-title">
|
||||
<a class="collapsed" role="button" data-toggle="collapse" data-path="faq/account-information" data-id="digitally-signed-statement" data-target="#accordion-36h0kkk.panel-collapse" aria-controls="accordion-36h0kkk">1. How can I get a digitally signed bank statement?</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="panel-collapse collapse" id="accordion-36h0kkk" role="tabpanel" aria-labelledby="accordion-36h0kkk">
|
||||
<div class="panel-body">
|
||||
<p>You can download your statement from the online banking portal..</p>
|
||||
<div class="keywords hidden">Account Information FAQ</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="panel panel-default">
|
||||
<div class="panel-heading" id="accordion-36h1kkk">
|
||||
<div class="panel-title">
|
||||
<a class="collapsed" role="button" data-toggle="collapse" data-path="faq/account-information" data-id="change-contact-details" data-target="#accordion-36h1kkk.panel-collapse" aria-controls="accordion-36h1kkk">2. How do I update my contact information?</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="panel-collapse collapse" id="accordion-36h1kkk" role="tabpanel" aria-labelledby="accordion-36h1kkk">
|
||||
<div class="panel-body">
|
||||
<p>You can update your contact details through the online banking portal..</p>
|
||||
<div class="keywords hidden">Account Information FAQ</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
50
tests/data/html/hidden_elements_test.html
Normal file
50
tests/data/html/hidden_elements_test.html
Normal file
@ -0,0 +1,50 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Hidden Elements Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h3>Visible Elements Test</h3>
|
||||
|
||||
<!-- Visible content that should be extracted -->
|
||||
<p>This is a regular paragraph that should be extracted.</p>
|
||||
|
||||
<!-- Content with class="hidden" that should be skipped -->
|
||||
<div class="hidden">
|
||||
<p>This text has class="hidden" and should NOT be extracted.</p>
|
||||
</div>
|
||||
|
||||
<!-- Content with style="display:none" that should be skipped -->
|
||||
<div style="display:none">
|
||||
<p>This text has style="display:none" and should NOT be extracted.</p>
|
||||
</div>
|
||||
|
||||
<!-- Content with hidden attribute that should be skipped -->
|
||||
<div hidden>
|
||||
<p>This text has the hidden attribute and should NOT be extracted.</p>
|
||||
</div>
|
||||
|
||||
<!-- Content with class="d-none" (Bootstrap) that should be skipped -->
|
||||
<div class="d-none">
|
||||
<p>This text has class="d-none" and should NOT be extracted.</p>
|
||||
</div>
|
||||
|
||||
<!-- Content with class="invisible" (Bootstrap) that should be skipped -->
|
||||
<div class="invisible">
|
||||
<p>This text has class="invisible" and should NOT be extracted.</p>
|
||||
</div>
|
||||
|
||||
<!-- Content with class="collapse" (Bootstrap) that should be skipped -->
|
||||
<div class="collapse">
|
||||
<p>This text has class="collapse" and should NOT be extracted.</p>
|
||||
</div>
|
||||
|
||||
<!-- Visible content that should be extracted -->
|
||||
<div class="visible-content">
|
||||
<p>This is another regular paragraph that should be extracted.</p>
|
||||
<div class="keywords hidden">Keywords that should NOT be extracted.</div>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
105
tests/test_html_enhanced.py
Normal file
105
tests/test_html_enhanced.py
Normal file
@ -0,0 +1,105 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
# Add the root directory to the system path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
from docling.datamodel.document import InputDocument, DoclingDocument
|
||||
|
||||
|
||||
def test_is_hidden_element():
    """Check is_hidden_element against hidden and visible elements."""
    from bs4 import BeautifulSoup

    # Resolve the fixture relative to this file so the test does not depend
    # on the directory pytest is started from.
    in_path = Path(__file__).parent / "data" / "html" / "wiki_duck.html"
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
    )
    backend = HTMLDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )

    def first_div(markup):
        """Parse *markup* and return its first <div> tag."""
        return BeautifulSoup(markup, "html.parser").div

    hidden_markup = [
        '<div class="hidden">Test</div>',  # generic hidden class
        '<div class="d-none">Test</div>',  # Bootstrap d-none
        '<div class="hide">Test</div>',
        '<div class="invisible">Test</div>',
        '<div class="collapse">Test</div>',  # collapsed Bootstrap panel
        '<div class="keywords hidden">Test</div>',  # hidden among other classes
        '<div style="display:none">Test</div>',  # inline style
        '<div style="visibility:hidden">Test</div>',
        "<div hidden>Test</div>",  # HTML hidden attribute
    ]
    for markup in hidden_markup:
        assert backend.is_hidden_element(first_div(markup)) is True, markup

    visible_markup = [
        "<div>Test</div>",
        '<div class="visible-content">Test</div>',
    ]
    for markup in visible_markup:
        assert backend.is_hidden_element(first_div(markup)) is False, markup

    # Anything that is not a Tag is never reported as hidden.
    assert backend.is_hidden_element("plain text") is False

    print("All is_hidden_element tests passed!")
|
||||
|
||||
|
||||
def test_panel_title_extraction():
    """Check that handle_panel_title adds the anchor text to the document."""
    from bs4 import BeautifulSoup

    # Resolve the fixture relative to this file so the test does not depend
    # on the directory pytest is started from.
    in_path = Path(__file__).parent / "data" / "html" / "wiki_duck.html"
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
    )
    backend = HTMLDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )

    # State that handle_panel_title reads from the backend: the content
    # layer and the parent stack (a None parent is used at level 0 here).
    backend.content_layer = ContentLayer.BODY
    backend.level = 0
    backend.parents = {0: None}

    # Empty document that receives the extracted question.
    doc = DoclingDocument(name="test")

    html = """
    <div class="panel-title">
    <a class="collapsed" role="button">How can I get a digitally signed bank statement?</a>
    </div>
    """
    panel_title = BeautifulSoup(html, "html.parser").div

    backend.handle_panel_title(panel_title, doc)

    # Exactly one item must have been added to the document body ...
    assert len(doc.body.children) == 1

    # ... and it must carry the question text.
    markdown_content = doc.export_to_markdown()
    assert "How can I get a digitally signed bank statement?" in markdown_content

    print("Panel title extraction test passed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script mode: run every test in declaration order without pytest.
    for run_test in (test_is_hidden_element, test_panel_title_extraction):
        run_test()
    print("All tests passed successfully!")
|
Loading…
Reference in New Issue
Block a user