mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix(HTML): parse footer tag as a group in furniture content layer (#2106)
* fix(HTML): parse footer tag as a section in furniture Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(HTML): add test for body vs furniture in HTML parser. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
8820b5558b
commit
c5f2e2fdd6
@@ -1,6 +1,8 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
@@ -179,3 +181,33 @@ def test_e2e_html_conversions():
|
||||
)
|
||||
|
||||
assert verify_document(doc, str(gt_path) + ".json", GENERATE)
|
||||
|
||||
|
||||
def test_html_furniture():
    """Check which parts of a small HTML page land in the Markdown export.

    An in-memory HTML page is converted with ``HTMLDocumentBackend`` and
    exported to Markdown twice:

    * with the default arguments, where only the heading and the paragraph
      following it are expected;
    * with both ``ContentLayer.BODY`` and ``ContentLayer.FURNITURE``
      included, where the leading paragraph and the ``<footer>`` text are
      expected as well.
    """
    # NOTE(review): the fixture ends with "</html" (no closing ">"). It is
    # reproduced unchanged here -- confirm whether that is intentional.
    html_bytes = (
        b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
        b"<h1>Main Heading</h1>"
        b"<p>Some Content</p>"
        b"<footer><p>Some Footer Content</p></footer></body></html"
    )

    expected_body_md = "# Main Heading\n\nSome Content"
    expected_full_md = (
        "Initial content with some bold text\n\n# Main Heading\n\nSome Content\n\n"
        "Some Footer Content"
    )

    # The input document and the backend each receive their own stream
    # built from the same bytes.
    input_document = InputDocument(
        path_or_stream=BytesIO(html_bytes),
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    html_backend = HTMLDocumentBackend(
        in_doc=input_document,
        path_or_stream=BytesIO(html_bytes),
    )
    converted: DoclingDocument = html_backend.convert()

    # Default export: only the heading and the paragraph after it.
    body_markdown = converted.export_to_markdown()
    assert body_markdown == expected_body_md

    # Export with the furniture layer added: leading paragraph and footer too.
    layers = {ContentLayer.BODY, ContentLayer.FURNITURE}
    full_markdown = converted.export_to_markdown(included_content_layers=layers)
    assert full_markdown == expected_full_md
|
||||
|
||||
Reference in New Issue
Block a user