mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix(HTML): parse footer tag as a group in furniture content layer (#2106)
* fix(HTML): parse footer tag as a section in furniture
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* fix(HTML): add test for body vs furniture in HTML parser.
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---------
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
8820b5558b
commit
c5f2e2fdd6
@@ -1378,45 +1378,15 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-1365 at level 4: list_item: Articles with Project Gutenberg links
|
||||
item-1366 at level 4: list_item: Articles containing video clips
|
||||
item-1367 at level 3: list: group list
|
||||
item-1368 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC) .
|
||||
item-1369 at level 4: list_item:
|
||||
item-1370 at level 5: inline: group group
|
||||
item-1371 at level 6: text: Text is available under the
|
||||
item-1372 at level 6: text: Creative Commons Attribution-ShareAlike License 4.0
|
||||
item-1373 at level 6: text: ; additional terms may apply. By using this site, you agree to the
|
||||
item-1374 at level 6: text: Terms of Use
|
||||
item-1375 at level 6: text: and
|
||||
item-1376 at level 6: text: Privacy Policy
|
||||
item-1377 at level 6: text: . Wikipedia® is a registered trademark of the
|
||||
item-1378 at level 6: text: Wikimedia Foundation, Inc.
|
||||
item-1379 at level 6: text: , a non-profit organization.
|
||||
item-1380 at level 3: list: group list
|
||||
item-1381 at level 4: list_item: Privacy policy
|
||||
item-1382 at level 4: list_item: About Wikipedia
|
||||
item-1383 at level 4: list_item: Disclaimers
|
||||
item-1384 at level 4: list_item: Contact Wikipedia
|
||||
item-1385 at level 4: list_item: Code of Conduct
|
||||
item-1386 at level 4: list_item: Developers
|
||||
item-1387 at level 4: list_item: Statistics
|
||||
item-1388 at level 4: list_item: Cookie statement
|
||||
item-1389 at level 4: list_item: Mobile view
|
||||
item-1390 at level 3: list: group list
|
||||
item-1391 at level 4: picture
|
||||
item-1391 at level 5: caption: Image Hyperlink.
|
||||
item-1392 at level 4: picture
|
||||
item-1392 at level 5: caption: Image Hyperlink.
|
||||
item-1393 at level 3: list: group list
|
||||
item-1394 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
|
||||
item-1395 at level 1: caption: Male mallard .
|
||||
item-1396 at level 1: caption: Wood ducks .
|
||||
item-1397 at level 1: caption: Mallard landing in approach
|
||||
item-1398 at level 1: caption: Male Mandarin duck
|
||||
item-1399 at level 1: caption: Flying steamer ducks in Ushuaia , Argentina
|
||||
item-1400 at level 1: caption: Female mallard in Cornwall , England
|
||||
item-1401 at level 1: caption: Pecten along the bill
|
||||
item-1402 at level 1: caption: A Muscovy duckling
|
||||
item-1403 at level 1: caption: Ringed teal
|
||||
item-1404 at level 1: caption: Indian Runner ducks , a common breed of domestic ducks
|
||||
item-1405 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka [ 49 ]
|
||||
item-1406 at level 1: caption: Image Hyperlink.
|
||||
item-1407 at level 1: caption: Image Hyperlink.
|
||||
item-1368 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
|
||||
item-1369 at level 1: caption: Male mallard .
|
||||
item-1370 at level 1: caption: Wood ducks .
|
||||
item-1371 at level 1: caption: Mallard landing in approach
|
||||
item-1372 at level 1: caption: Male Mandarin duck
|
||||
item-1373 at level 1: caption: Flying steamer ducks in Ushuaia , Argentina
|
||||
item-1374 at level 1: caption: Female mallard in Cornwall , England
|
||||
item-1375 at level 1: caption: Pecten along the bill
|
||||
item-1376 at level 1: caption: A Muscovy duckling
|
||||
item-1377 at level 1: caption: Ringed teal
|
||||
item-1378 at level 1: caption: Indian Runner ducks , a common breed of domestic ducks
|
||||
item-1379 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka [ 49 ]
|
||||
@@ -5488,6 +5488,26 @@
|
||||
"parent": {
|
||||
"$ref": "#/texts/1157"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/171"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/174"
|
||||
}
|
||||
],
|
||||
"content_layer": "furniture",
|
||||
"name": "footer",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/171",
|
||||
"parent": {
|
||||
"$ref": "#/groups/170"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1226"
|
||||
@@ -5496,12 +5516,12 @@
|
||||
"$ref": "#/texts/1227"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/171",
|
||||
"self_ref": "#/groups/172",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1227"
|
||||
},
|
||||
@@ -5534,14 +5554,14 @@
|
||||
"$ref": "#/texts/1236"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/172",
|
||||
"self_ref": "#/groups/173",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1157"
|
||||
"$ref": "#/groups/170"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
@@ -5572,14 +5592,14 @@
|
||||
"$ref": "#/texts/1245"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/173",
|
||||
"self_ref": "#/groups/174",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1157"
|
||||
"$ref": "#/groups/170"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
@@ -5589,12 +5609,12 @@
|
||||
"$ref": "#/pictures/24"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/174",
|
||||
"self_ref": "#/groups/175",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1157"
|
||||
},
|
||||
@@ -21472,13 +21492,7 @@
|
||||
"$ref": "#/groups/170"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/174"
|
||||
"$ref": "#/groups/175"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@@ -22463,10 +22477,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1226",
|
||||
"parent": {
|
||||
"$ref": "#/groups/170"
|
||||
"$ref": "#/groups/171"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "This page was last edited on 21 September 2024, at 12:11 (UTC) .",
|
||||
@@ -22477,14 +22491,14 @@
|
||||
{
|
||||
"self_ref": "#/texts/1227",
|
||||
"parent": {
|
||||
"$ref": "#/groups/170"
|
||||
"$ref": "#/groups/171"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
@@ -22495,10 +22509,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1228",
|
||||
"parent": {
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Text is available under the",
|
||||
@@ -22507,10 +22521,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1229",
|
||||
"parent": {
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Creative Commons Attribution-ShareAlike License 4.0",
|
||||
@@ -22520,10 +22534,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1230",
|
||||
"parent": {
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "; additional terms may apply. By using this site, you agree to the",
|
||||
@@ -22532,10 +22546,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1231",
|
||||
"parent": {
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Terms of Use",
|
||||
@@ -22545,10 +22559,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1232",
|
||||
"parent": {
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "and",
|
||||
@@ -22557,10 +22571,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1233",
|
||||
"parent": {
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Privacy Policy",
|
||||
@@ -22570,10 +22584,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1234",
|
||||
"parent": {
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ". Wikipedia® is a registered trademark of the",
|
||||
@@ -22582,10 +22596,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1235",
|
||||
"parent": {
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Wikimedia Foundation, Inc.",
|
||||
@@ -22595,10 +22609,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1236",
|
||||
"parent": {
|
||||
"$ref": "#/groups/171"
|
||||
"$ref": "#/groups/172"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ", a non-profit organization.",
|
||||
@@ -22607,10 +22621,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1237",
|
||||
"parent": {
|
||||
"$ref": "#/groups/172"
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Privacy policy",
|
||||
@@ -22622,10 +22636,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1238",
|
||||
"parent": {
|
||||
"$ref": "#/groups/172"
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "About Wikipedia",
|
||||
@@ -22637,10 +22651,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1239",
|
||||
"parent": {
|
||||
"$ref": "#/groups/172"
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Disclaimers",
|
||||
@@ -22652,10 +22666,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1240",
|
||||
"parent": {
|
||||
"$ref": "#/groups/172"
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Contact Wikipedia",
|
||||
@@ -22667,10 +22681,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1241",
|
||||
"parent": {
|
||||
"$ref": "#/groups/172"
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Code of Conduct",
|
||||
@@ -22682,10 +22696,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1242",
|
||||
"parent": {
|
||||
"$ref": "#/groups/172"
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Developers",
|
||||
@@ -22697,10 +22711,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1243",
|
||||
"parent": {
|
||||
"$ref": "#/groups/172"
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Statistics",
|
||||
@@ -22712,10 +22726,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1244",
|
||||
"parent": {
|
||||
"$ref": "#/groups/172"
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Cookie statement",
|
||||
@@ -22727,10 +22741,10 @@
|
||||
{
|
||||
"self_ref": "#/texts/1245",
|
||||
"parent": {
|
||||
"$ref": "#/groups/172"
|
||||
"$ref": "#/groups/173"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Mobile view",
|
||||
@@ -22745,7 +22759,7 @@
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "caption",
|
||||
"prov": [],
|
||||
"orig": "Image Hyperlink.",
|
||||
@@ -22758,7 +22772,7 @@
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "caption",
|
||||
"prov": [],
|
||||
"orig": "Image Hyperlink.",
|
||||
@@ -23144,10 +23158,10 @@
|
||||
{
|
||||
"self_ref": "#/pictures/23",
|
||||
"parent": {
|
||||
"$ref": "#/groups/173"
|
||||
"$ref": "#/groups/174"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [
|
||||
@@ -23162,10 +23176,10 @@
|
||||
{
|
||||
"self_ref": "#/pictures/24",
|
||||
"parent": {
|
||||
"$ref": "#/groups/173"
|
||||
"$ref": "#/groups/174"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"content_layer": "furniture",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [
|
||||
|
||||
@@ -555,24 +555,4 @@ Hidden categories:
|
||||
- [Pages using Sister project links with hidden wikidata](/wiki/Category:Pages_using_Sister_project_links_with_hidden_wikidata)
|
||||
- [Webarchive template wayback links](/wiki/Category:Webarchive_template_wayback_links)
|
||||
- [Articles with Project Gutenberg links](/wiki/Category:Articles_with_Project_Gutenberg_links)
|
||||
- [Articles containing video clips](/wiki/Category:Articles_containing_video_clips)
|
||||
|
||||
- This page was last edited on 21 September 2024, at 12:11 (UTC) .
|
||||
- Text is available under the [Creative Commons Attribution-ShareAlike License 4.0](//en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License) ; additional terms may apply. By using this site, you agree to the [Terms of Use](//foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use) and [Privacy Policy](//foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy) . Wikipedia® is a registered trademark of the [Wikimedia Foundation, Inc.](//wikimediafoundation.org) , a non-profit organization.
|
||||
|
||||
- [Privacy policy](https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy)
|
||||
- [About Wikipedia](/wiki/Wikipedia:About)
|
||||
- [Disclaimers](/wiki/Wikipedia:General_disclaimer)
|
||||
- [Contact Wikipedia](//en.wikipedia.org/wiki/Wikipedia:Contact_us)
|
||||
- [Code of Conduct](https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct)
|
||||
- [Developers](https://developer.wikimedia.org/)
|
||||
- [Statistics](https://stats.wikimedia.org/#/en.wikipedia.org)
|
||||
- [Cookie statement](https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement)
|
||||
- [Mobile view](//en.m.wikipedia.org/w/index.php?title=Duck&mobileaction=toggle_view_mobile)
|
||||
|
||||
Image Hyperlink.
|
||||
|
||||
<!-- image -->
|
||||
Image Hyperlink.
|
||||
|
||||
<!-- image -->
|
||||
- [Articles containing video clips](/wiki/Category:Articles_containing_video_clips)
|
||||
@@ -1,6 +1,8 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
@@ -179,3 +181,33 @@ def test_e2e_html_conversions():
|
||||
)
|
||||
|
||||
assert verify_document(doc, str(gt_path) + ".json", GENERATE)
|
||||
|
||||
|
||||
def test_html_furniture():
    """Check how a small HTML page is exported with and without furniture.

    The fixture has a paragraph before the first heading, a heading, a
    paragraph after it, and a ``<footer>``. The assertions below expect the
    default Markdown export to contain only the heading and the paragraph
    that follows it, and the export with both ``BODY`` and ``FURNITURE``
    layers to also contain the leading paragraph and the footer text.
    """
    # The document is well-formed on purpose (note the closing ``</html>``),
    # so the test does not depend on how the parser recovers from a
    # truncated end tag.
    raw_html = (
        b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
        b"<h1>Main Heading</h1>"
        b"<p>Some Content</p>"
        b"<footer><p>Some Footer Content</p></footer></body></html>"
    )

    # Separate BytesIO objects are handed to the input document and to the
    # backend, so neither one reads from a stream the other has consumed.
    in_doc = InputDocument(
        path_or_stream=BytesIO(raw_html),
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    backend = HTMLDocumentBackend(
        in_doc=in_doc,
        path_or_stream=BytesIO(raw_html),
    )
    doc: DoclingDocument = backend.convert()

    # Export with default arguments: leading paragraph and footer are absent.
    md_body = doc.export_to_markdown()
    assert md_body == "# Main Heading\n\nSome Content"

    # Export with the furniture layer explicitly included alongside the body.
    md_all = doc.export_to_markdown(
        included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
    )
    assert md_all == (
        "Initial content with some bold text\n\n# Main Heading\n\nSome Content\n\n"
        "Some Footer Content"
    )
|
||||
|
||||
Reference in New Issue
Block a user