fix(HTML): parse footer tag as a group in furniture content layer (#2106)

* fix(HTML): parse footer tag as a section in furniture

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(HTML): add test for body vs furniture in HTML parser.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-08-20 08:42:25 +02:00
committed by GitHub
parent 8820b5558b
commit c5f2e2fdd6
5 changed files with 131 additions and 131 deletions

View File

@@ -1378,45 +1378,15 @@ item-0 at level 0: unspecified: group _root_
item-1365 at level 4: list_item: Articles with Project Gutenberg links
item-1366 at level 4: list_item: Articles containing video clips
item-1367 at level 3: list: group list
item-1368 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC) .
item-1369 at level 4: list_item:
item-1370 at level 5: inline: group group
item-1371 at level 6: text: Text is available under the
item-1372 at level 6: text: Creative Commons Attribution-ShareAlike License 4.0
item-1373 at level 6: text: ; additional terms may apply. By using this site, you agree to the
item-1374 at level 6: text: Terms of Use
item-1375 at level 6: text: and
item-1376 at level 6: text: Privacy Policy
item-1377 at level 6: text: . Wikipedia® is a registered trademark of the
item-1378 at level 6: text: Wikimedia Foundation, Inc.
item-1379 at level 6: text: , a non-profit organization.
item-1380 at level 3: list: group list
item-1381 at level 4: list_item: Privacy policy
item-1382 at level 4: list_item: About Wikipedia
item-1383 at level 4: list_item: Disclaimers
item-1384 at level 4: list_item: Contact Wikipedia
item-1385 at level 4: list_item: Code of Conduct
item-1386 at level 4: list_item: Developers
item-1387 at level 4: list_item: Statistics
item-1388 at level 4: list_item: Cookie statement
item-1389 at level 4: list_item: Mobile view
item-1390 at level 3: list: group list
item-1391 at level 4: picture
item-1391 at level 5: caption: Image Hyperlink.
item-1392 at level 4: picture
item-1392 at level 5: caption: Image Hyperlink.
item-1393 at level 3: list: group list
item-1394 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
item-1395 at level 1: caption: Male mallard .
item-1396 at level 1: caption: Wood ducks .
item-1397 at level 1: caption: Mallard landing in approach
item-1398 at level 1: caption: Male Mandarin duck
item-1399 at level 1: caption: Flying steamer ducks in Ushuaia , Argentina
item-1400 at level 1: caption: Female mallard in Cornwall , England
item-1401 at level 1: caption: Pecten along the bill
item-1402 at level 1: caption: A Muscovy duckling
item-1403 at level 1: caption: Ringed teal
item-1404 at level 1: caption: Indian Runner ducks , a common breed of domestic ducks
item-1405 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka [ 49 ]
item-1406 at level 1: caption: Image Hyperlink.
item-1407 at level 1: caption: Image Hyperlink.
item-1368 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
item-1369 at level 1: caption: Male mallard .
item-1370 at level 1: caption: Wood ducks .
item-1371 at level 1: caption: Mallard landing in approach
item-1372 at level 1: caption: Male Mandarin duck
item-1373 at level 1: caption: Flying steamer ducks in Ushuaia , Argentina
item-1374 at level 1: caption: Female mallard in Cornwall , England
item-1375 at level 1: caption: Pecten along the bill
item-1376 at level 1: caption: A Muscovy duckling
item-1377 at level 1: caption: Ringed teal
item-1378 at level 1: caption: Indian Runner ducks , a common breed of domestic ducks
item-1379 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka [ 49 ]

View File

@@ -5488,6 +5488,26 @@
"parent": {
"$ref": "#/texts/1157"
},
"children": [
{
"$ref": "#/groups/171"
},
{
"$ref": "#/groups/173"
},
{
"$ref": "#/groups/174"
}
],
"content_layer": "furniture",
"name": "footer",
"label": "section"
},
{
"self_ref": "#/groups/171",
"parent": {
"$ref": "#/groups/170"
},
"children": [
{
"$ref": "#/texts/1226"
@@ -5496,12 +5516,12 @@
"$ref": "#/texts/1227"
}
],
"content_layer": "body",
"content_layer": "furniture",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/171",
"self_ref": "#/groups/172",
"parent": {
"$ref": "#/texts/1227"
},
@@ -5534,14 +5554,14 @@
"$ref": "#/texts/1236"
}
],
"content_layer": "body",
"content_layer": "furniture",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/172",
"self_ref": "#/groups/173",
"parent": {
"$ref": "#/texts/1157"
"$ref": "#/groups/170"
},
"children": [
{
@@ -5572,14 +5592,14 @@
"$ref": "#/texts/1245"
}
],
"content_layer": "body",
"content_layer": "furniture",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/173",
"self_ref": "#/groups/174",
"parent": {
"$ref": "#/texts/1157"
"$ref": "#/groups/170"
},
"children": [
{
@@ -5589,12 +5609,12 @@
"$ref": "#/pictures/24"
}
],
"content_layer": "body",
"content_layer": "furniture",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/174",
"self_ref": "#/groups/175",
"parent": {
"$ref": "#/texts/1157"
},
@@ -21472,13 +21492,7 @@
"$ref": "#/groups/170"
},
{
"$ref": "#/groups/172"
},
{
"$ref": "#/groups/173"
},
{
"$ref": "#/groups/174"
"$ref": "#/groups/175"
}
],
"content_layer": "body",
@@ -22463,10 +22477,10 @@
{
"self_ref": "#/texts/1226",
"parent": {
"$ref": "#/groups/170"
"$ref": "#/groups/171"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "This page was last edited on 21 September 2024, at 12:11 (UTC) .",
@@ -22477,14 +22491,14 @@
{
"self_ref": "#/texts/1227",
"parent": {
"$ref": "#/groups/170"
"$ref": "#/groups/171"
},
"children": [
{
"$ref": "#/groups/171"
"$ref": "#/groups/172"
}
],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "",
@@ -22495,10 +22509,10 @@
{
"self_ref": "#/texts/1228",
"parent": {
"$ref": "#/groups/171"
"$ref": "#/groups/172"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "Text is available under the",
@@ -22507,10 +22521,10 @@
{
"self_ref": "#/texts/1229",
"parent": {
"$ref": "#/groups/171"
"$ref": "#/groups/172"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "Creative Commons Attribution-ShareAlike License 4.0",
@@ -22520,10 +22534,10 @@
{
"self_ref": "#/texts/1230",
"parent": {
"$ref": "#/groups/171"
"$ref": "#/groups/172"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "; additional terms may apply. By using this site, you agree to the",
@@ -22532,10 +22546,10 @@
{
"self_ref": "#/texts/1231",
"parent": {
"$ref": "#/groups/171"
"$ref": "#/groups/172"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "Terms of Use",
@@ -22545,10 +22559,10 @@
{
"self_ref": "#/texts/1232",
"parent": {
"$ref": "#/groups/171"
"$ref": "#/groups/172"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "and",
@@ -22557,10 +22571,10 @@
{
"self_ref": "#/texts/1233",
"parent": {
"$ref": "#/groups/171"
"$ref": "#/groups/172"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "Privacy Policy",
@@ -22570,10 +22584,10 @@
{
"self_ref": "#/texts/1234",
"parent": {
"$ref": "#/groups/171"
"$ref": "#/groups/172"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": ". Wikipedia® is a registered trademark of the",
@@ -22582,10 +22596,10 @@
{
"self_ref": "#/texts/1235",
"parent": {
"$ref": "#/groups/171"
"$ref": "#/groups/172"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "Wikimedia Foundation, Inc.",
@@ -22595,10 +22609,10 @@
{
"self_ref": "#/texts/1236",
"parent": {
"$ref": "#/groups/171"
"$ref": "#/groups/172"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": ", a non-profit organization.",
@@ -22607,10 +22621,10 @@
{
"self_ref": "#/texts/1237",
"parent": {
"$ref": "#/groups/172"
"$ref": "#/groups/173"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "Privacy policy",
@@ -22622,10 +22636,10 @@
{
"self_ref": "#/texts/1238",
"parent": {
"$ref": "#/groups/172"
"$ref": "#/groups/173"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "About Wikipedia",
@@ -22637,10 +22651,10 @@
{
"self_ref": "#/texts/1239",
"parent": {
"$ref": "#/groups/172"
"$ref": "#/groups/173"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "Disclaimers",
@@ -22652,10 +22666,10 @@
{
"self_ref": "#/texts/1240",
"parent": {
"$ref": "#/groups/172"
"$ref": "#/groups/173"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "Contact Wikipedia",
@@ -22667,10 +22681,10 @@
{
"self_ref": "#/texts/1241",
"parent": {
"$ref": "#/groups/172"
"$ref": "#/groups/173"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "Code of Conduct",
@@ -22682,10 +22696,10 @@
{
"self_ref": "#/texts/1242",
"parent": {
"$ref": "#/groups/172"
"$ref": "#/groups/173"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "Developers",
@@ -22697,10 +22711,10 @@
{
"self_ref": "#/texts/1243",
"parent": {
"$ref": "#/groups/172"
"$ref": "#/groups/173"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "Statistics",
@@ -22712,10 +22726,10 @@
{
"self_ref": "#/texts/1244",
"parent": {
"$ref": "#/groups/172"
"$ref": "#/groups/173"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "Cookie statement",
@@ -22727,10 +22741,10 @@
{
"self_ref": "#/texts/1245",
"parent": {
"$ref": "#/groups/172"
"$ref": "#/groups/173"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "list_item",
"prov": [],
"orig": "Mobile view",
@@ -22745,7 +22759,7 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "caption",
"prov": [],
"orig": "Image Hyperlink.",
@@ -22758,7 +22772,7 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "caption",
"prov": [],
"orig": "Image Hyperlink.",
@@ -23144,10 +23158,10 @@
{
"self_ref": "#/pictures/23",
"parent": {
"$ref": "#/groups/173"
"$ref": "#/groups/174"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "picture",
"prov": [],
"captions": [
@@ -23162,10 +23176,10 @@
{
"self_ref": "#/pictures/24",
"parent": {
"$ref": "#/groups/173"
"$ref": "#/groups/174"
},
"children": [],
"content_layer": "body",
"content_layer": "furniture",
"label": "picture",
"prov": [],
"captions": [

View File

@@ -555,24 +555,4 @@ Hidden categories:
- [Pages using Sister project links with hidden wikidata](/wiki/Category:Pages_using_Sister_project_links_with_hidden_wikidata)
- [Webarchive template wayback links](/wiki/Category:Webarchive_template_wayback_links)
- [Articles with Project Gutenberg links](/wiki/Category:Articles_with_Project_Gutenberg_links)
- [Articles containing video clips](/wiki/Category:Articles_containing_video_clips)
- This page was last edited on 21 September 2024, at 12:11 (UTC) .
- Text is available under the [Creative Commons Attribution-ShareAlike License 4.0](//en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License) ; additional terms may apply. By using this site, you agree to the [Terms of Use](//foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use) and [Privacy Policy](//foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy) . Wikipedia® is a registered trademark of the [Wikimedia Foundation, Inc.](//wikimediafoundation.org) , a non-profit organization.
- [Privacy policy](https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy)
- [About Wikipedia](/wiki/Wikipedia:About)
- [Disclaimers](/wiki/Wikipedia:General_disclaimer)
- [Contact Wikipedia](//en.wikipedia.org/wiki/Wikipedia:Contact_us)
- [Code of Conduct](https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct)
- [Developers](https://developer.wikimedia.org/)
- [Statistics](https://stats.wikimedia.org/#/en.wikipedia.org)
- [Cookie statement](https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement)
- [Mobile view](//en.m.wikipedia.org/w/index.php?title=Duck&mobileaction=toggle_view_mobile)
Image Hyperlink.
<!-- image -->
Image Hyperlink.
<!-- image -->
- [Articles containing video clips](/wiki/Category:Articles_containing_video_clips)

View File

@@ -1,6 +1,8 @@
from io import BytesIO
from pathlib import Path
from docling_core.types.doc.document import ContentLayer
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
@@ -179,3 +181,33 @@ def test_e2e_html_conversions():
)
assert verify_document(doc, str(gt_path) + ".json", GENERATE)
def test_html_furniture():
    """Check how the HTML backend splits content between body and furniture.

    A small in-memory HTML page is converted and exported to Markdown twice:

    * with the default export settings, where only the heading and the
      paragraph following it are expected;
    * with both ``ContentLayer.BODY`` and ``ContentLayer.FURNITURE`` included,
      where the paragraph preceding the first heading and the ``<footer>``
      paragraph are expected as well.
    """
    # NOTE(review): the leading paragraph is absent from the default export,
    # so the backend presumably places content before the first heading in the
    # furniture layer -- confirm against HTMLDocumentBackend.
    raw_html = (
        b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
        b"<h1>Main Heading</h1>"
        b"<p>Some Content</p>"
        # The closing tag must be terminated (``</html>``) so the test does not
        # rely on the parser's recovery from a truncated end tag at EOF.
        b"<footer><p>Some Footer Content</p></footer></body></html>"
    )

    # Two independent streams over the same bytes: one for the input document
    # descriptor and one for the backend itself, so neither consumes the other.
    in_doc = InputDocument(
        path_or_stream=BytesIO(raw_html),
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    backend = HTMLDocumentBackend(
        in_doc=in_doc,
        path_or_stream=BytesIO(raw_html),
    )
    doc: DoclingDocument = backend.convert()

    # Default export: furniture content must not show up.
    md_body = doc.export_to_markdown()
    assert md_body == "# Main Heading\n\nSome Content"

    # Explicitly requesting both layers brings back the leading paragraph and
    # the footer paragraph, in document order.
    md_all = doc.export_to_markdown(
        included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
    )
    assert md_all == (
        "Initial content with some bold text\n\n# Main Heading\n\nSome Content\n\n"
        "Some Footer Content"
    )