From baa6d87ed105039ac022e43e6bfdb41ea784626a Mon Sep 17 00:00:00 2001 From: ka-weihe Date: Tue, 15 Apr 2025 23:31:39 +0200 Subject: [PATCH 1/4] Update html_backend.py Signed-off-by: ka-weihe --- docling/backend/html_backend.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index aa2637f2..7c89538b 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -177,6 +177,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.handle_figure(tag, doc) elif tag.name == "img": self.handle_image(tag, doc) + elif tag.name == "a": + # New branch to handle anchor tags. + self.handle_anchor(tag, doc) else: self.walk(tag, doc) @@ -272,7 +275,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def handle_list(self, element: Tag, doc: DoclingDocument) -> None: """Handles list tags (ul, ol) and their list items.""" - if element.name == "ul": # create a list group self.parents[self.level + 1] = doc.add_group( @@ -464,9 +466,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def handle_table(self, element: Tag, doc: DoclingDocument) -> None: """Handles table tags.""" - table_data = HTMLDocumentBackend.parse_table_data(element) - if table_data is not None: doc.add_table( data=table_data, @@ -501,12 +501,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): nested_list = li.find(["ul", "ol"]) if isinstance(nested_list, Tag): result.extend(self.get_list_text(nested_list, level + 1)) - return result def handle_figure(self, element: Tag, doc: DoclingDocument) -> None: """Handles image tags (img).""" - # Extract the image URI from the tag # image_uri = root.xpath('//figure//img/@src')[0] @@ -536,9 +534,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def handle_image(self, element: Tag, doc: DoclingDocument) -> None: """Handles image tags (img).""" _log.debug(f"ignoring tags at the moment: 
{element}") - doc.add_picture( parent=self.parents[self.level], caption=None, content_layer=self.content_layer, ) + + def handle_anchor(self, element: Tag, doc: DoclingDocument) -> None: + """Handles anchor tags (<a>) by extracting the visible text and href attribute.""" + # Extract the anchor text and the URL (href attribute) + text = element.get_text().strip() + href = element.get("href", "").strip() + + if text: + # Combine the text with the hyperlink if available + combined_text = f"{text} (Link: {href})" if href else text + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=combined_text, + content_layer=self.content_layer, + ) + elif href: + # If no visible text, add the link itself + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=f"Link: {href}", + content_layer=self.content_layer, + ) From 61a5d95b206562e1bc91d561f689ddbbcd3d3570 Mon Sep 17 00:00:00 2001 From: ka-weihe Date: Wed, 16 Apr 2025 12:59:26 +0200 Subject: [PATCH 2/4] Apply suggestion Signed-off-by: ka-weihe --- docling/backend/html_backend.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 7c89538b..516c24e1 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -541,25 +541,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) def handle_anchor(self, element: Tag, doc: DoclingDocument) -> None: - """Handles anchor tags (<a>) by extracting the visible text and href attribute.""" - # Extract the anchor text and the URL (href attribute) + """Handles anchor tags (<a>) by extracting the visible text and setting the hyperlink property.""" + # Extract the visible text and href URL text = element.get_text().strip() href = element.get("href", "").strip() - - if text: - # Combine the text with the hyperlink if available - combined_text = f"{text} (Link: {href})" if href else text + + # If no 
text is present, use the hyperlink itself as the text. + display_text = text if text else href + + if display_text or href: doc.add_text( parent=self.parents[self.level], label=DocItemLabel.TEXT, - text=combined_text, - content_layer=self.content_layer, - ) - elif href: - # If no visible text, add the link itself - doc.add_text( - parent=self.parents[self.level], - label=DocItemLabel.TEXT, - text=f"Link: {href}", + text=display_text, + hyperlink=href if href else None, # Pass the hyperlink as a separate parameter content_layer=self.content_layer, ) From cb949f4ca27bcb22867d613b6ce2caa4913663d3 Mon Sep 17 00:00:00 2001 From: ka-weihe Date: Wed, 16 Apr 2025 13:00:52 +0200 Subject: [PATCH 3/4] Remove comments Signed-off-by: ka-weihe --- docling/backend/html_backend.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 516c24e1..8b595ae1 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -178,7 +178,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): elif tag.name == "img": self.handle_image(tag, doc) elif tag.name == "a": - # New branch to handle anchor tags. 
self.handle_anchor(tag, doc) else: self.walk(tag, doc) @@ -554,6 +553,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[self.level], label=DocItemLabel.TEXT, text=display_text, - hyperlink=href if href else None, # Pass the hyperlink as a separate parameter + hyperlink=href if href else None, content_layer=self.content_layer, ) From 0b0c6b985b0a208a2cef4cf570e82a3b6eef39de Mon Sep 17 00:00:00 2001 From: ka-weihe Date: Sun, 4 May 2025 22:24:51 +0200 Subject: [PATCH 4/4] Create example_08.html Signed-off-by: ka-weihe --- tests/data/html/example_08.html | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 tests/data/html/example_08.html diff --git a/tests/data/html/example_08.html b/tests/data/html/example_08.html new file mode 100644 index 00000000..ce1f6048 --- /dev/null +++ b/tests/data/html/example_08.html @@ -0,0 +1,9 @@ + + + Example Website
+
+ Contact Us
+ Go to Top
+ Wikipedia + +