fix(HTML): parse footer tag as a group in furniture content layer (#2106)

* fix(HTML): parse footer tag as a section in furniture

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(HTML): add test for body vs furniture in HTML parser.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-08-20 08:42:25 +02:00
committed by GitHub
parent 8820b5558b
commit c5f2e2fdd6
5 changed files with 131 additions and 131 deletions

View File

@@ -1,6 +1,8 @@
from io import BytesIO
from pathlib import Path
from docling_core.types.doc.document import ContentLayer
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
@@ -179,3 +181,33 @@ def test_e2e_html_conversions():
)
assert verify_document(doc, str(gt_path) + ".json", GENERATE)
def test_html_furniture():
raw_html = (
b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
b"<h1>Main Heading</h1>"
b"<p>Some Content</p>"
b"<footer><p>Some Footer Content</p></footer></body></html"
)
in_doc = InputDocument(
path_or_stream=BytesIO(raw_html),
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
filename="test",
)
backend = HTMLDocumentBackend(
in_doc=in_doc,
path_or_stream=BytesIO(raw_html),
)
doc: DoclingDocument = backend.convert()
md_body = doc.export_to_markdown()
assert md_body == "# Main Heading\n\nSome Content"
md_all = doc.export_to_markdown(
included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
)
assert md_all == (
"Initial content with some bold text\n\n# Main Heading\n\nSome Content\n\n"
"Some Footer Content"
)