diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 7c716908..d951680f 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -181,6 +181,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.handle_figure(tag, doc)
         elif tag.name == "img":
             self.handle_image(tag, doc)
+        elif tag.name == "a":
+            self.handle_anchor(tag, doc)
         elif tag.name == "details":
             self.handle_details(tag, doc)
         else:
@@ -293,7 +295,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
     def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles list tags (ul, ol) and their list items."""
-
         if element.name == "ul":
             # create a list group
             self.parents[self.level + 1] = doc.add_group(
@@ -508,9 +509,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
     def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles table tags."""
-
         table_data = HTMLDocumentBackend.parse_table_data(element)
-
         if table_data is not None:
             doc.add_table(
                 data=table_data,
@@ -545,12 +544,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 nested_list = li.find(["ul", "ol"])
                 if isinstance(nested_list, Tag):
                     result.extend(self.get_list_text(nested_list, level + 1))
-
         return result
 
     def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles image tags (img)."""
-
         # Extract the image URI from the tag
         # image_uri = root.xpath('//figure//img/@src')[0]
 
@@ -580,9 +577,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles image tags (img)."""
         _log.debug(f"ignoring tags at the moment: {element}")
-
         doc.add_picture(
             parent=self.parents[self.level],
             caption=None,
             content_layer=self.content_layer,
         )
+
+    def handle_anchor(self, element: Tag, doc: DoclingDocument) -> None:
+        """Handles anchor tags (<a>) by adding their visible text with a hyperlink.
+
+        The stripped link text becomes the text item; when the anchor has no
+        visible text, the href itself is used as the text. Anchors that have
+        neither visible text nor an href are skipped.
+        """
+        text = element.get_text().strip()
+        # get() yields None when href is absent; only a real string can be stripped.
+        raw_href = element.get("href")
+        href = raw_href.strip() if isinstance(raw_href, str) else ""
+
+        # If no text is present, use the hyperlink itself as the text.
+        display_text = text or href
+        if not display_text:
+            return
+
+        doc.add_text(
+            parent=self.parents[self.level],
+            label=DocItemLabel.TEXT,
+            text=display_text,
+            hyperlink=href or None,
+            content_layer=self.content_layer,
+        )
diff --git a/tests/data/html/example_08.html b/tests/data/html/example_08.html
new file mode 100644
index 00000000..ce1f6048
--- /dev/null
+++ b/tests/data/html/example_08.html
@@ -0,0 +1,9 @@
+
+
+ Example Website
+
+ Contact Us
+ Go to Top
+ Wikipedia + +