diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index d14b422f..6088a49d 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -40,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
     "table",
     "figure",
     "img",
+    "div",  # Add div to ensure panel-titles are considered
 ]
 
 
@@ -126,6 +127,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
+        # Skip if the current tag is hidden
+        if self.is_hidden_element(tag):
+            return
+
+        # Iterate over elements in the body of the document
         text: str = ""
         for element in tag.children:
@@ -161,6 +166,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return
 
     def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
+        # Skip hidden elements
+        if self.is_hidden_element(tag):
+            return
+
+        if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
             self.handle_header(tag, doc)
         elif tag.name in ["p"]:
@@ -177,6 +186,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.handle_figure(tag, doc)
         elif tag.name == "img":
             self.handle_image(tag, doc)
+        # Special handling for accordion panel titles (Bootstrap)
+        elif tag.name == "div" and "panel-title" in tag.get("class", []):
+            self.handle_panel_title(tag, doc)
+        # Special handling for entire accordion panels
+        elif tag.name == "div" and "panel" in tag.get("class", []):
+            self.handle_panel(tag, doc)
         else:
             self.walk(tag, doc)
 
@@ -194,6 +209,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             return [item]
 
         tag = cast(Tag, item)
+        if self.is_hidden_element(tag):
+            return []
+
         if tag.name not in ["ul", "ol"]:
             for child in tag:
                 # Recursively get the child's text content
@@ -201,6 +219,37 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
         return ["".join(result) + " "]
 
+    def is_hidden_element(self, tag: Tag) -> bool:
+        """Return True when ``tag`` itself is marked as not rendered.
+
+        An element counts as hidden when it carries a well-known "hidden"
+        CSS class, a collapsed Bootstrap ``collapse`` class, an inline
+        ``display: none`` / ``visibility: hidden`` style, or the HTML
+        ``hidden`` attribute. Ancestors of ``tag`` are not inspected.
+        """
+        if not isinstance(tag, Tag):
+            return False
+
+        classes = tag.get("class", [])
+        if isinstance(classes, str):
+            classes = classes.split()
+
+        if {"hidden", "d-none", "hide", "invisible"}.intersection(classes):
+            return True
+
+        # Bootstrap marks an expanded collapsible with "in" (v3) or "show"
+        # (v4+) next to "collapse"; only a collapsed one is not rendered.
+        if "collapse" in classes and not {"in", "show"}.intersection(classes):
+            return True
+
+        # Inline styles are often written with spaces or upper case
+        # ("display: none"), so normalise before matching.
+        style = "".join(str(tag.get("style") or "").lower().split())
+        if "display:none" in style or "visibility:hidden" in style:
+            return True
+
+        return tag.has_attr("hidden")
+
     def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles header tags (h1, h2, etc.)."""
         hlevel = int(element.name.replace("h", ""))
@@ -544,3 +593,68 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 caption=None,
                 content_layer=self.content_layer,
             )
+
+    def handle_panel_title(self, element: Tag, doc: DoclingDocument) -> None:
+        """Add the title of a Bootstrap accordion panel as a text item.
+
+        The text is taken from the first ``<a>`` inside ``element``; when the
+        title has no anchor, the element's own text is used so the title is
+        not dropped. Hidden or empty titles add nothing.
+        """
+        if self.is_hidden_element(element):
+            return
+
+        anchor = element.find("a")
+        if anchor is not None and self.is_hidden_element(anchor):
+            return
+
+        source = element if anchor is None else anchor
+        question_text = source.text.strip()
+        if not question_text:
+            return
+
+        doc.add_text(
+            parent=self.parents[self.level],
+            label=DocItemLabel.TEXT,
+            text=question_text,
+            content_layer=self.content_layer,
+        )
+
+    def handle_panel(self, element: Tag, doc: DoclingDocument) -> None:
+        """Handle a Bootstrap accordion panel (title plus body).
+
+        The panel title is added as a text item and the ``panel-body`` is
+        walked under a new "panel-answer" section group. A panel without a
+        visible ``panel-body`` is walked like any other container.
+        """
+        if self.is_hidden_element(element):
+            return
+
+        panel_body = element.find("div", class_="panel-body")
+        if panel_body is None or self.is_hidden_element(panel_body):
+            # NOTE(review): walk() is expected to hand the nested panel-title
+            # div back to analyze_tag(); emitting the title here as well
+            # would add it twice.
+            self.walk(element, doc)
+            return
+
+        panel_title = element.find("div", class_="panel-title")
+        if panel_title is not None:
+            self.handle_panel_title(panel_title, doc)
+
+        panel_group = doc.add_group(
+            parent=self.parents[self.level],
+            name="panel-answer",
+            label=GroupLabel.SECTION,
+            content_layer=self.content_layer,
+        )
+
+        # Nest the body under the new group and always restore the previous
+        # level, even when walking the body raises.
+        saved_level = self.level
+        self.level += 1
+        self.parents[self.level] = panel_group
+        try:
+            self.walk(panel_body, doc)
+        finally:
+            self.level = saved_level
diff --git a/tests/data/html/accordion_test.html b/tests/data/html/accordion_test.html
new file mode 100644
index 00000000..b8af93d8
--- /dev/null
+++ b/tests/data/html/accordion_test.html
@@ -0,0 +1,45 @@
+
+
+
+ Accordion Test
+
+
+
+
+

Account Information FAQ

+
+
+
+
+ +
+
+

You can download your statement from the online banking portal..

+ +
+
+
+
+ +
+
+

You can update your contact details through the online banking portal..

+ +
+
+
+
+
+
+
+
+ + \ No newline at end of file diff --git a/tests/data/html/hidden_elements_test.html b/tests/data/html/hidden_elements_test.html new file mode 100644 index 00000000..c9528ab1 --- /dev/null +++ b/tests/data/html/hidden_elements_test.html @@ -0,0 +1,50 @@ + + + + Hidden Elements Test + + +
+

Visible Elements Test

+ + +

This is a regular paragraph that should be extracted.

+ + + + + +
+

This text has style="display:none" and should NOT be extracted.

+
+ + + + + +
+

This text has class="d-none" and should NOT be extracted.

+
+ + + + + +
+

This text has class="collapse" and should NOT be extracted.

+
+ + +
+

This is another regular paragraph that should be extracted.

+ +
+
+
+
\ No newline at end of file
diff --git a/tests/test_html_enhanced.py b/tests/test_html_enhanced.py
new file mode 100644
index 00000000..c3603a62
--- /dev/null
+++ b/tests/test_html_enhanced.py
@@ -0,0 +1,92 @@
+"""Tests for hidden-element detection and Bootstrap panel-title handling."""
+
+import sys
+from pathlib import Path
+
+# Add the root directory to the system path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from bs4 import BeautifulSoup
+
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling_core.types.doc.document import ContentLayer
+from docling.datamodel.document import InputDocument, DoclingDocument
+
+# Resolve the fixture relative to this file so the tests do not depend on
+# the current working directory.
+WIKI_DUCK_HTML = Path(__file__).parent / "data" / "html" / "wiki_duck.html"
+
+
+def _make_backend() -> HTMLDocumentBackend:
+    """Create a minimal HTMLDocumentBackend on an existing HTML fixture."""
+    in_doc = InputDocument(
+        path_or_stream=WIKI_DUCK_HTML,
+        format=InputFormat.HTML,
+        backend=HTMLDocumentBackend,
+    )
+    return HTMLDocumentBackend(in_doc=in_doc, path_or_stream=WIKI_DUCK_HTML)
+
+
+def _first_div(html: str):
+    """Parse ``html`` and return its first ``<div>`` tag."""
+    return BeautifulSoup(html, "html.parser").div
+
+
+def test_is_hidden_element():
+    """Test the is_hidden_element method directly."""
+    backend = _make_backend()
+
+    # Hidden by class
+    tag = _first_div('<div class="hidden">Test</div>')
+    assert backend.is_hidden_element(tag) is True
+
+    # Hidden by d-none class (Bootstrap)
+    tag = _first_div('<div class="d-none">Test</div>')
+    assert backend.is_hidden_element(tag) is True
+
+    # Hidden by style
+    tag = _first_div('<div style="display:none">Test</div>')
+    assert backend.is_hidden_element(tag) is True
+
+    # Hidden by attribute
+    tag = _first_div("<div hidden>Test</div>")
+    assert backend.is_hidden_element(tag) is True
+
+    # Not hidden
+    tag = _first_div("<div>Test</div>")
+    assert backend.is_hidden_element(tag) is False
+
+
+def test_panel_title_extraction():
+    """Test the handle_panel_title method directly."""
+    backend = _make_backend()
+
+    # Initialize the attributes handle_panel_title reads
+    backend.content_layer = ContentLayer.BODY
+    backend.level = 0
+    backend.parents = {0: None}
+
+    doc = DoclingDocument(name="test")
+
+    html = """
+    <div class="panel-title">
+        <a href="#">How can I get a digitally signed bank statement?</a>
+    </div>
+    """
+    panel_title = _first_div(html)
+
+    backend.handle_panel_title(panel_title, doc)
+
+    # Exactly one item must have been added to the document body
+    assert len(doc.body.children) == 1
+
+    # Export to markdown to check the content
+    markdown_content = doc.export_to_markdown()
+    assert "How can I get a digitally signed bank statement?" in markdown_content
+
+
+if __name__ == "__main__":
+    test_is_hidden_element()
+    test_panel_title_extraction()
+    print("All tests passed successfully!")