From 08beb406d9ab5a7ee2fabe8a6fa81605b95204d6 Mon Sep 17 00:00:00 2001 From: Roman Kayan BAZG Date: Sun, 25 May 2025 18:14:32 +0200 Subject: [PATCH] fix: in .simplify_text_elements(), always put a space between chunks; the check for alphanumeric characters creates more problems than it solves. Also commit the new test files that were forgotten in the last commit. --- docling/backend/html_backend.py | 9 +- .../docling_v2/hyperlink_01.html.itxt | 6 + .../docling_v2/hyperlink_01.html.json | 110 ++++++ .../docling_v2/hyperlink_01.html.md | 3 + .../docling_v2/hyperlink_02.html.itxt | 3 + .../docling_v2/hyperlink_02.html.json | 83 +++++ .../docling_v2/hyperlink_02.html.md | 1 + .../docling_v2/hyperlink_03.html.itxt | 11 + .../docling_v2/hyperlink_03.html.json | 200 +++++++++++ .../docling_v2/hyperlink_03.html.md | 6 + .../docling_v2/wiki_duck.html.itxt | 168 ++++----- .../docling_v2/wiki_duck.html.json | 336 +++++++++--------- .../groundtruth/docling_v2/wiki_duck.html.md | 40 +-- tests/data/html/hyperlink_01.html | 17 + tests/data/html/hyperlink_02.html | 18 + tests/data/html/hyperlink_03.html | 31 ++ 16 files changed, 762 insertions(+), 280 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/hyperlink_01.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/hyperlink_01.html.json create mode 100644 tests/data/groundtruth/docling_v2/hyperlink_01.html.md create mode 100644 tests/data/groundtruth/docling_v2/hyperlink_02.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/hyperlink_02.html.json create mode 100644 tests/data/groundtruth/docling_v2/hyperlink_02.html.md create mode 100644 tests/data/groundtruth/docling_v2/hyperlink_03.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/hyperlink_03.html.json create mode 100644 tests/data/groundtruth/docling_v2/hyperlink_03.html.md create mode 100644 tests/data/html/hyperlink_01.html create mode 100644 tests/data/html/hyperlink_02.html create mode 100644 tests/data/html/hyperlink_03.html 
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 75c961e4..0dda2f57 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -83,14 +83,7 @@ class AnnotatedTextList(list): hyperlink = self[0].hyperlink for i in range(1, len(self)): if hyperlink == self[i].hyperlink: - sep = "" - if ( - text - and re.match(r"\w", text[-1]) - and self[i].text - and re.match(r"\w", self[i].text[0]) - ): - sep = " " + sep = " " text += sep + self[i].text else: simplified.append(AnnotatedText(text=text, hyperlink=hyperlink)) diff --git a/tests/data/groundtruth/docling_v2/hyperlink_01.html.itxt b/tests/data/groundtruth/docling_v2/hyperlink_01.html.itxt new file mode 100644 index 00000000..13f67657 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/hyperlink_01.html.itxt @@ -0,0 +1,6 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Something + item-2 at level 2: inline: group group + item-3 at level 3: text: Please follow the link to: + item-4 at level 3: text: This page + item-5 at level 3: text: . 
\ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/hyperlink_01.html.json b/tests/data/groundtruth/docling_v2/hyperlink_01.html.json new file mode 100644 index 00000000..2178d1c1 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/hyperlink_01.html.json @@ -0,0 +1,110 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "hyperlink_01", + "origin": { + "mimetype": "text/html", + "binary_hash": 17149231461445569313, + "filename": "hyperlink_01.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "label": "title", + "prov": [], + "orig": "Something", + "text": "Something" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Please follow the link to:", + "text": "Please follow the link to:" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "This page", + "text": "This page", + "hyperlink": "#" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": ".", + "text": "." 
+ } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/hyperlink_01.html.md b/tests/data/groundtruth/docling_v2/hyperlink_01.html.md new file mode 100644 index 00000000..14b49160 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/hyperlink_01.html.md @@ -0,0 +1,3 @@ +# Something + +Please follow the link to: [This page](#) . \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/hyperlink_02.html.itxt b/tests/data/groundtruth/docling_v2/hyperlink_02.html.itxt new file mode 100644 index 00000000..6738ae86 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/hyperlink_02.html.itxt @@ -0,0 +1,3 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-1 + item-2 at level 2: section_header: Home \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/hyperlink_02.html.json b/tests/data/groundtruth/docling_v2/hyperlink_02.html.json new file mode 100644 index 00000000..24ed4a54 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/hyperlink_02.html.json @@ -0,0 +1,83 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "hyperlink_02", + "origin": { + "mimetype": "text/html", + "binary_hash": 15683290523889238210, + "filename": "hyperlink_02.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/pictures/0" + }, + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + } + ], + "content_layer": "body", + "name": "header-1", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + 
"parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Home", + "text": "Home", + "hyperlink": "/home.html", + "level": 1 + } + ], + "pictures": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "furniture", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "annotations": [] + } + ], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/hyperlink_02.html.md b/tests/data/groundtruth/docling_v2/hyperlink_02.html.md new file mode 100644 index 00000000..1009cceb --- /dev/null +++ b/tests/data/groundtruth/docling_v2/hyperlink_02.html.md @@ -0,0 +1 @@ +[## Home](/home.html) \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/hyperlink_03.html.itxt b/tests/data/groundtruth/docling_v2/hyperlink_03.html.itxt new file mode 100644 index 00000000..89469454 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/hyperlink_03.html.itxt @@ -0,0 +1,11 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: list: group list + item-2 at level 2: list_item: My Section + item-3 at level 3: list: group list + item-4 at level 4: list_item: Some page + item-5 at level 5: list: group list + item-6 at level 6: list_item: A sub page + item-7 at level 5: list: group list + item-8 at level 6: list_item: This is my Homepage + item-9 at level 6: list_item: Main navigation + item-10 at level 2: list_item: My organisation \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/hyperlink_03.html.json b/tests/data/groundtruth/docling_v2/hyperlink_03.html.json new file mode 100644 index 00000000..de662d19 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/hyperlink_03.html.json @@ -0,0 +1,200 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + 
"name": "hyperlink_03", + "origin": { + "mimetype": "text/html", + "binary_hash": 14556394815653517177, + "filename": "hyperlink_03.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/5" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/1" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/texts/2" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "My Section", + "text": "My Section", + "hyperlink": "#", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/groups/3" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Some page", + "text": "Some page", + "hyperlink": "/start.html", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/2", + "parent": 
{ + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "A sub page", + "text": "A sub page", + "hyperlink": "/home2.html", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "This is my Homepage", + "text": "This is my Homepage", + "hyperlink": "/home.html", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Main navigation", + "text": "Main navigation", + "hyperlink": "#main-navigation", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "My organisation", + "text": "My organisation", + "hyperlink": "#", + "enumerated": false, + "marker": "-" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/hyperlink_03.html.md b/tests/data/groundtruth/docling_v2/hyperlink_03.html.md new file mode 100644 index 00000000..179f6631 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/hyperlink_03.html.md @@ -0,0 +1,6 @@ +- [My Section](#) + - [Some page](/start.html) + - [A sub page](/home2.html) + - [This is my Homepage](/home.html) + - [Main navigation](#main-navigation) +- [My organisation](#) \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt index 13c82e45..a7c55134 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt @@ -251,35 +251,35 @@ 
item-0 at level 0: unspecified: group _root_ item-250 at level 3: inline: group group item-251 at level 4: text: The word duck comes from item-252 at level 4: text: Old English - item-253 at level 4: text: dūce'diver', a derivative of the ... because of the way many species in the + item-253 at level 4: text: dūce 'diver', a derivative of th ... because of the way many species in the item-254 at level 4: text: dabbling duck item-255 at level 4: text: group feed by upending; compare with item-256 at level 4: text: Dutch item-257 at level 4: text: duiken and item-258 at level 4: text: German - item-259 at level 4: text: tauchen'to dive'. + item-259 at level 4: text: tauchen 'to dive'. item-260 at level 3: picture item-260 at level 4: caption: Pacific black duck displaying the characteristic upending "duck" item-261 at level 3: inline: group group - item-262 at level 4: text: This word replaced Old English e ... r example, Dutch eend, German Ente and + item-262 at level 4: text: This word replaced Old English e ... example, Dutch eend , German Ente and item-263 at level 4: text: Norwegian - item-264 at level 4: text: and. The word ened/ænid was inherited from + item-264 at level 4: text: and . The word ened / ænid was inherited from item-265 at level 4: text: Proto-Indo-European item-266 at level 4: text: ; item-267 at level 4: text: cf. item-268 at level 4: text: Latin - item-269 at level 4: text: anas"duck", + item-269 at level 4: text: anas "duck", item-270 at level 4: text: Lithuanian - item-271 at level 4: text: ántis'duck', + item-271 at level 4: text: ántis 'duck', item-272 at level 4: text: Ancient Greek - item-273 at level 4: text: νῆσσα/νῆττα(nēssa/nētta) 'duck', and + item-273 at level 4: text: νῆσσα / νῆττα ( nēssa / nētta ) 'duck', and item-274 at level 4: text: Sanskrit - item-275 at level 4: text: ātí'water bird', among others. + item-275 at level 4: text: ātí 'water bird', among others. 
item-276 at level 3: inline: group group item-277 at level 4: text: A duckling is a young duck in downy plumage - item-278 at level 4: text: [1] + item-278 at level 4: text: [ 1 ] item-279 at level 4: text: or baby duck, - item-280 at level 4: text: [2] + item-280 at level 4: text: [ 2 ] item-281 at level 4: text: but in the food trade a young do ... , is sometimes labelled as a duckling. item-282 at level 3: inline: group group item-283 at level 4: text: A male is called a @@ -287,8 +287,8 @@ item-0 at level 0: unspecified: group _root_ item-285 at level 4: text: and the female is called a duck, or in item-286 at level 4: text: ornithology item-287 at level 4: text: a hen. - item-288 at level 4: text: [3] - item-289 at level 4: text: [4] + item-288 at level 4: text: [ 3 ] + item-289 at level 4: text: [ 4 ] item-290 at level 3: picture item-290 at level 4: caption: Male mallard. item-291 at level 3: picture @@ -303,83 +303,83 @@ item-0 at level 0: unspecified: group _root_ item-299 at level 4: text: , and the item-300 at level 4: text: magpie goose item-301 at level 4: text: . - item-302 at level 4: text: [5] + item-302 at level 4: text: [ 5 ] item-303 at level 4: text: All except the screamers belong to the item-304 at level 4: text: biological family item-305 at level 4: text: Anatidae item-306 at level 4: text: . - item-307 at level 4: text: [5] + item-307 at level 4: text: [ 5 ] item-308 at level 4: text: Within the family, ducks are spl ... erable disagreement among taxonomists. - item-309 at level 4: text: [5] + item-309 at level 4: text: [ 5 ] item-310 at level 4: text: Some base their decisions on item-311 at level 4: text: morphological characteristics item-312 at level 4: text: , others on shared behaviours or genetic studies. - item-313 at level 4: text: [6] - item-314 at level 4: text: [7] + item-313 at level 4: text: [ 6 ] + item-314 at level 4: text: [ 7 ] item-315 at level 4: text: The number of suggested subfamil ... 
taining ducks ranges from two to five. - item-316 at level 4: text: [8] - item-317 at level 4: text: [9] + item-316 at level 4: text: [ 8 ] + item-317 at level 4: text: [ 9 ] item-318 at level 4: text: The significant level of item-319 at level 4: text: hybridisation item-320 at level 4: text: that occurs among wild ducks com ... relationships between various species. - item-321 at level 4: text: [9] + item-321 at level 4: text: [ 9 ] item-322 at level 3: picture item-322 at level 4: caption: Mallard landing in approach item-323 at level 3: inline: group group item-324 at level 4: text: In most modern classifications, ... split into a varying number of tribes. - item-325 at level 4: text: [10] + item-325 at level 4: text: [ 10 ] item-326 at level 4: text: The largest of these, the Anatin ... imarily at the surface of fresh water. - item-327 at level 4: text: [11] + item-327 at level 4: text: [ 11 ] item-328 at level 4: text: The 'diving ducks', also named f ... ng method, make up the tribe Aythyini. - item-329 at level 4: text: [12] + item-329 at level 4: text: [ 12 ] item-330 at level 4: text: The 'sea ducks' of the tribe Mer ... majority of their lives in saltwater. - item-331 at level 4: text: [13] + item-331 at level 4: text: [ 13 ] item-332 at level 4: text: The tribe Oxyurini contains the ... r small size and stiff, upright tails. - item-333 at level 4: text: [14] + item-333 at level 4: text: [ 14 ] item-334 at level 3: inline: group group item-335 at level 4: text: A number of other species called ... ed in other subfamilies or tribes. The item-336 at level 4: text: whistling ducks item-337 at level 4: text: are assigned either to a tribe ( ... y Anatinae or the subfamily Anserinae, - item-338 at level 4: text: [15] + item-338 at level 4: text: [ 15 ] item-339 at level 4: text: or to their own subfamily (Dendrocygninae) or family (Dendrocyganidae). 
- item-340 at level 4: text: [9] - item-341 at level 4: text: [16] + item-340 at level 4: text: [ 9 ] + item-341 at level 4: text: [ 16 ] item-342 at level 4: text: The item-343 at level 4: text: freckled duck item-344 at level 4: text: of Australia is either the sole ... ctonettini in the subfamily Anserinae, - item-345 at level 4: text: [15] + item-345 at level 4: text: [ 15 ] item-346 at level 4: text: or in its own family, the Stictonettinae. - item-347 at level 4: text: [9] + item-347 at level 4: text: [ 9 ] item-348 at level 4: text: The item-349 at level 4: text: shelducks item-350 at level 4: text: make up the tribe Tadornini in t ... ily Anserinae in some classifications, - item-351 at level 4: text: [15] + item-351 at level 4: text: [ 15 ] item-352 at level 4: text: and their own subfamily, Tadorninae, in others, - item-353 at level 4: text: [17] + item-353 at level 4: text: [ 17 ] item-354 at level 4: text: while the item-355 at level 4: text: steamer ducks item-356 at level 4: text: are either placed in the family Anserinae in the tribe Tachyerini - item-357 at level 4: text: [15] + item-357 at level 4: text: [ 15 ] item-358 at level 4: text: or lumped with the shelducks in the tribe Tadorini. - item-359 at level 4: text: [9] + item-359 at level 4: text: [ 9 ] item-360 at level 4: text: The item-361 at level 4: text: perching ducks item-362 at level 4: text: make up in the tribe Cairinini i ... members assigned to the tribe Anatini. - item-363 at level 4: text: [9] + item-363 at level 4: text: [ 9 ] item-364 at level 4: text: The item-365 at level 4: text: torrent duck item-366 at level 4: text: is generally included in the sub ... e in the monotypic tribe Merganettini, - item-367 at level 4: text: [15] + item-367 at level 4: text: [ 15 ] item-368 at level 4: text: but is sometimes included in the tribe Tadornini. 
- item-369 at level 4: text: [18] + item-369 at level 4: text: [ 18 ] item-370 at level 4: text: The item-371 at level 4: text: pink-eared duck item-372 at level 4: text: is sometimes included as a true duck either in the tribe Anatini - item-373 at level 4: text: [15] + item-373 at level 4: text: [ 15 ] item-374 at level 4: text: or the tribe Malacorhynchini, - item-375 at level 4: text: [19] + item-375 at level 4: text: [ 19 ] item-376 at level 4: text: and other times is included with the shelducks in the tribe Tadornini. - item-377 at level 4: text: [15] + item-377 at level 4: text: [ 15 ] item-378 at level 2: section_header: Morphology item-379 at level 3: picture item-379 at level 4: caption: Male Mandarin duck @@ -418,13 +418,13 @@ item-0 at level 0: unspecified: group _root_ item-411 at level 4: text: Ducks have a item-412 at level 4: text: cosmopolitan distribution item-413 at level 4: text: , and are found on every continent except Antarctica. - item-414 at level 4: text: [5] + item-414 at level 4: text: [ 5 ] item-415 at level 4: text: Several species manage to live on subantarctic islands, including item-416 at level 4: text: South Georgia item-417 at level 4: text: and the item-418 at level 4: text: Auckland Islands item-419 at level 4: text: . - item-420 at level 4: text: [20] + item-420 at level 4: text: [ 20 ] item-421 at level 4: text: Ducks have reached a number of isolated oceanic islands, including the item-422 at level 4: text: Hawaiian Islands item-423 at level 4: text: , @@ -436,17 +436,17 @@ item-0 at level 0: unspecified: group _root_ item-429 at level 4: text: and less often item-430 at level 4: text: residents item-431 at level 4: text: . - item-432 at level 4: text: [21] - item-433 at level 4: text: [22] + item-432 at level 4: text: [ 21 ] + item-433 at level 4: text: [ 22 ] item-434 at level 4: text: A handful are item-435 at level 4: text: endemic item-436 at level 4: text: to such far-flung islands. 
- item-437 at level 4: text: [21] + item-437 at level 4: text: [ 21 ] item-438 at level 3: picture item-438 at level 4: caption: Female mallard in Cornwall, England item-439 at level 3: inline: group group item-440 at level 4: text: Some duck species, mainly those ... that form after localised heavy rain. - item-441 at level 4: text: [23] + item-441 at level 4: text: [ 23 ] item-442 at level 2: section_header: Behaviour item-443 at level 3: section_header: Feeding item-444 at level 4: picture @@ -462,7 +462,7 @@ item-0 at level 0: unspecified: group _root_ item-452 at level 4: inline: group group item-453 at level 5: text: Dabbling ducks item-454 at level 5: text: feed on the surface of water or ... -ending without completely submerging. - item-455 at level 5: text: [24] + item-455 at level 5: text: [ 24 ] item-456 at level 5: text: Along the edge of the bill, there is a comb-like structure called a item-457 at level 5: text: pecten item-458 at level 5: text: . This strains the water squirti ... thers and to hold slippery food items. @@ -486,7 +486,7 @@ item-0 at level 0: unspecified: group _root_ item-476 at level 5: text: published an article advising th ... hould not be fed with bread because it item-477 at level 5: text: damages the health of the ducks item-478 at level 5: text: and pollutes waterways. - item-479 at level 5: text: [25] + item-479 at level 5: text: [ 25 ] item-480 at level 3: section_header: Breeding item-481 at level 4: picture item-481 at level 5: caption: A Muscovy duckling @@ -494,9 +494,9 @@ item-0 at level 0: unspecified: group _root_ item-483 at level 5: text: Ducks generally item-484 at level 5: text: only have one partner at a time item-485 at level 5: text: , although the partnership usually only lasts one year. - item-486 at level 5: text: [26] + item-486 at level 5: text: [ 26 ] item-487 at level 5: text: Larger species and the more sede ... e pair-bonds that last numerous years. 
- item-488 at level 5: text: [27] + item-488 at level 5: text: [ 27 ] item-489 at level 5: text: Most duck species breed once a y ... ng to do so in favourable conditions ( item-490 at level 5: text: spring item-491 at level 5: text: /summer or wet seasons). Ducks also tend to make a @@ -504,12 +504,12 @@ item-0 at level 0: unspecified: group _root_ item-493 at level 5: text: before breeding, and, after hatc ... out of (such as nesting in an enclosed item-494 at level 5: text: courtyard item-495 at level 5: text: ) or are not prospering due to g ... e nest and led her ducklings to water. - item-496 at level 5: text: [28] + item-496 at level 5: text: [ 28 ] item-497 at level 3: section_header: Communication item-498 at level 4: inline: group group item-499 at level 5: text: Female item-500 at level 5: text: mallard - item-501 at level 5: text: ducks (as well as several other species in the genus Anas, such as the + item-501 at level 5: text: ducks (as well as several other species in the genus Anas , such as the item-502 at level 5: text: American item-503 at level 5: text: and item-504 at level 5: text: Pacific black ducks @@ -520,11 +520,11 @@ item-0 at level 0: unspecified: group _root_ item-509 at level 5: text: and item-510 at level 5: text: common teal item-511 at level 5: text: ) make the classic "quack" sound ... at is sometimes written as "breeeeze", - item-512 at level 5: text: [29] + item-512 at level 5: text: [ 29 ] item-513 at level 5: text: [ item-514 at level 5: text: self-published source? - item-515 at level 5: text: ]but, despite widespread misconc ... , most species of duck do not "quack". - item-516 at level 5: text: [30] + item-515 at level 5: text: ] but, despite widespread miscon ... , most species of duck do not "quack". + item-516 at level 5: text: [ 30 ] item-517 at level 5: text: In general, ducks make a range of item-518 at level 5: text: calls item-519 at level 5: text: , including whistles, cooing, yodels and grunts. 
For example, the @@ -540,13 +540,13 @@ item-0 at level 0: unspecified: group _root_ item-529 at level 5: text: in 2003 as part of the item-530 at level 5: text: British Association item-531 at level 5: text: 's Festival of Science. - item-532 at level 5: text: [31] + item-532 at level 5: text: [ 31 ] item-533 at level 5: text: It was also debunked in item-534 at level 5: text: one of the earlier episodes item-535 at level 5: text: of the popular Discovery Channel television show item-536 at level 5: text: MythBusters item-537 at level 5: text: . - item-538 at level 5: text: [32] + item-538 at level 5: text: [ 32 ] item-539 at level 3: section_header: Predators item-540 at level 4: picture item-540 at level 5: caption: Ringed teal @@ -584,39 +584,39 @@ item-0 at level 0: unspecified: group _root_ item-572 at level 5: text: in California dating to 7800 – 6400 item-573 at level 5: text: BP item-574 at level 5: text: have turned up bones of ducks, i ... st one now-extinct flightless species. - item-575 at level 5: text: [33] + item-575 at level 5: text: [ 33 ] item-576 at level 5: text: Ducks were captured in "significant numbers" by item-577 at level 5: text: Holocene item-578 at level 5: text: inhabitants of the lower item-579 at level 5: text: Ohio River item-580 at level 5: text: valley, suggesting they took adv ... ounty provided by migrating waterfowl. - item-581 at level 5: text: [34] + item-581 at level 5: text: [ 34 ] item-582 at level 5: text: Neolithic hunters in locations as far apart as the Caribbean, - item-583 at level 5: text: [35] + item-583 at level 5: text: [ 35 ] item-584 at level 5: text: Scandinavia, - item-585 at level 5: text: [36] + item-585 at level 5: text: [ 36 ] item-586 at level 5: text: Egypt, - item-587 at level 5: text: [37] + item-587 at level 5: text: [ 37 ] item-588 at level 5: text: Switzerland, - item-589 at level 5: text: [38] + item-589 at level 5: text: [ 38 ] item-590 at level 5: text: and China relied on ducks as a s ... 
f protein for some or all of the year. - item-591 at level 5: text: [39] + item-591 at level 5: text: [ 39 ] item-592 at level 5: text: Archeological evidence shows that item-593 at level 5: text: Māori people item-594 at level 5: text: in New Zealand hunted the flightless item-595 at level 5: text: Finsch's duck item-596 at level 5: text: , possibly to extinction, though ... may also have contributed to its fate. - item-597 at level 5: text: [40] + item-597 at level 5: text: [ 40 ] item-598 at level 5: text: A similar end awaited the item-599 at level 5: text: Chatham duck item-600 at level 5: text: , a species with reduced flying ... was colonised by Polynesian settlers. - item-601 at level 5: text: [41] + item-601 at level 5: text: [ 41 ] item-602 at level 5: text: It is probable that duck eggs we ... ugh hard evidence of this is uncommon. - item-603 at level 5: text: [35] - item-604 at level 5: text: [42] + item-603 at level 5: text: [ 35 ] + item-604 at level 5: text: [ 42 ] item-605 at level 4: inline: group group item-606 at level 5: text: In many areas, wild ducks (inclu ... he wild) are hunted for food or sport, - item-607 at level 5: text: [43] + item-607 at level 5: text: [ 43 ] item-608 at level 5: text: by shooting, or by being trapped using item-609 at level 5: text: duck decoys item-610 at level 5: text: . Because an idle floating duck ... n "an easy target". These ducks may be @@ -624,7 +624,7 @@ item-0 at level 0: unspecified: group _root_ item-612 at level 5: text: such as item-613 at level 5: text: PCBs item-614 at level 5: text: . - item-615 at level 5: text: [44] + item-615 at level 5: text: [ 44 ] item-616 at level 3: section_header: Domestication item-617 at level 4: picture item-617 at level 5: caption: Indian Runner ducks, a common breed of domestic ducks @@ -632,18 +632,18 @@ item-0 at level 0: unspecified: group _root_ item-619 at level 5: text: Ducks have many economic uses, b ... 
eggs, and feathers (particularly their item-620 at level 5: text: down item-621 at level 5: text: ). Approximately 3 billion ducks ... ughtered each year for meat worldwide. - item-622 at level 5: text: [45] + item-622 at level 5: text: [ 45 ] item-623 at level 5: text: They are also kept and bred by a ... domestic ducks are descended from the item-624 at level 5: text: mallard - item-625 at level 5: text: (Anas platyrhynchos), apart from the + item-625 at level 5: text: ( Anas platyrhynchos ), apart from the item-626 at level 5: text: Muscovy duck - item-627 at level 5: text: (Cairina moschata). - item-628 at level 5: text: [46] - item-629 at level 5: text: [47] + item-627 at level 5: text: ( Cairina moschata ). + item-628 at level 5: text: [ 46 ] + item-629 at level 5: text: [ 47 ] item-630 at level 5: text: The item-631 at level 5: text: Call duck item-632 at level 5: text: is another example of a domestic ... as it weighs less than 1 kg (2.2 lb). - item-633 at level 5: text: [48] + item-633 at level 5: text: [ 48 ] item-634 at level 3: section_header: Heraldry item-635 at level 4: picture item-635 at level 5: caption: Three black-colored ducks in the coat of arms of Maaninka[49] @@ -655,13 +655,13 @@ item-0 at level 0: unspecified: group _root_ item-641 at level 5: text: ( item-642 at level 5: text: Latvia item-643 at level 5: text: ) - item-644 at level 5: text: [50] + item-644 at level 5: text: [ 50 ] item-645 at level 5: text: and the coat of arms of item-646 at level 5: text: Föglö item-647 at level 5: text: ( item-648 at level 5: text: Åland item-649 at level 5: text: ). 
- item-650 at level 5: text: [51] + item-650 at level 5: text: [ 51 ] item-651 at level 3: section_header: Cultural references item-652 at level 4: inline: group group item-653 at level 5: text: In 2002, psychologist @@ -673,7 +673,7 @@ item-0 at level 0: unspecified: group _root_ item-659 at level 5: text: , finished a year-long item-660 at level 5: text: LaughLab item-661 at level 5: text: experiment, concluding that of a ... involving an animal, make it a duck." - item-662 at level 5: text: [52] + item-662 at level 5: text: [ 52 ] item-663 at level 5: text: The word "duck" may have become an item-664 at level 5: text: inherently funny word item-665 at level 5: text: in many languages, possibly beca ... n their looks or behavior. Of the many @@ -689,8 +689,8 @@ item-0 at level 0: unspecified: group _root_ item-675 at level 5: text: . item-676 at level 5: text: Howard the Duck item-677 at level 5: text: started as a comic book character in 1973 - item-678 at level 5: text: [53] - item-679 at level 5: text: [54] + item-678 at level 5: text: [ 53 ] + item-679 at level 5: text: [ 54 ] item-680 at level 5: text: and was made into a item-681 at level 5: text: movie item-682 at level 5: text: in 1986. @@ -703,16 +703,16 @@ item-0 at level 0: unspecified: group _root_ item-689 at level 5: text: National Hockey League item-690 at level 5: text: professional team of the item-691 at level 5: text: Anaheim Ducks - item-692 at level 5: text: , who were founded with the name the Mighty Ducks of Anaheim.[ + item-692 at level 5: text: , who were founded with the name the Mighty Ducks of Anaheim. 
[ item-693 at level 5: text: citation needed - item-694 at level 5: text: ]The duck is also the nickname of the + item-694 at level 5: text: ] The duck is also the nickname of the item-695 at level 5: text: University of Oregon item-696 at level 5: text: sports teams as well as the item-697 at level 5: text: Long Island Ducks item-698 at level 5: text: minor league item-699 at level 5: text: baseball item-700 at level 5: text: team. - item-701 at level 5: text: [55] + item-701 at level 5: text: [ 55 ] item-702 at level 2: section_header: See also item-703 at level 3: list: group list item-704 at level 4: list_item: Birds portal diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index ad03897d..47519c34 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -6904,8 +6904,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "dūce'diver', a derivative of the verb*dūcan'to duck, bend down low as if to get under something, or dive', because of the way many species in the", - "text": "dūce'diver', a derivative of the verb*dūcan'to duck, bend down low as if to get under something, or dive', because of the way many species in the" + "orig": "dūce 'diver', a derivative of the verb * dūcan 'to duck, bend down low as if to get under something, or dive', because of the way many species in the", + "text": "dūce 'diver', a derivative of the verb * dūcan 'to duck, bend down low as if to get under something, or dive', because of the way many species in the" }, { "self_ref": "#/texts/241", @@ -6979,8 +6979,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "tauchen'to dive'.", - "text": "tauchen'to dive'." + "orig": "tauchen 'to dive'.", + "text": "tauchen 'to dive'." 
}, { "self_ref": "#/texts/247", @@ -7003,8 +7003,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "This word replaced Old English ened/ænid'duck', possibly to avoid confusion with other words, such as ende'end' with similar forms. Other Germanic languages still have similar words for duck, for example, Dutch eend, German Ente and", - "text": "This word replaced Old English ened/ænid'duck', possibly to avoid confusion with other words, such as ende'end' with similar forms. Other Germanic languages still have similar words for duck, for example, Dutch eend, German Ente and" + "orig": "This word replaced Old English ened / ænid 'duck', possibly to avoid confusion with other words, such as ende 'end' with similar forms. Other Germanic languages still have similar words for duck , for example, Dutch eend , German Ente and", + "text": "This word replaced Old English ened / ænid 'duck', possibly to avoid confusion with other words, such as ende 'end' with similar forms. Other Germanic languages still have similar words for duck , for example, Dutch eend , German Ente and" }, { "self_ref": "#/texts/249", @@ -7028,8 +7028,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "and. The word ened/ænid was inherited from", - "text": "and. The word ened/ænid was inherited from" + "orig": "and . The word ened / ænid was inherited from", + "text": "and . 
The word ened / ænid was inherited from" }, { "self_ref": "#/texts/251", @@ -7091,8 +7091,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "anas\"duck\",", - "text": "anas\"duck\"," + "orig": "anas \"duck\",", + "text": "anas \"duck\"," }, { "self_ref": "#/texts/256", @@ -7116,8 +7116,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "ántis'duck',", - "text": "ántis'duck'," + "orig": "ántis 'duck',", + "text": "ántis 'duck'," }, { "self_ref": "#/texts/258", @@ -7141,8 +7141,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "νῆσσα/νῆττα(nēssa/nētta) 'duck', and", - "text": "νῆσσα/νῆττα(nēssa/nētta) 'duck', and" + "orig": "νῆσσα / νῆττα ( nēssa / nētta ) 'duck', and", + "text": "νῆσσα / νῆττα ( nēssa / nētta ) 'duck', and" }, { "self_ref": "#/texts/260", @@ -7166,8 +7166,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "ātí'water bird', among others.", - "text": "ātí'water bird', among others." + "orig": "ātí 'water bird', among others.", + "text": "ātí 'water bird', among others." 
}, { "self_ref": "#/texts/262", @@ -7190,8 +7190,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[1]", - "text": "[1]", + "orig": "[ 1 ]", + "text": "[ 1 ]", "hyperlink": "#cite_note-1" }, { @@ -7215,8 +7215,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[2]", - "text": "[2]", + "orig": "[ 2 ]", + "text": "[ 2 ]", "hyperlink": "#cite_note-2" }, { @@ -7302,8 +7302,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[3]", - "text": "[3]", + "orig": "[ 3 ]", + "text": "[ 3 ]", "hyperlink": "#cite_note-3" }, { @@ -7315,8 +7315,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[4]", - "text": "[4]", + "orig": "[ 4 ]", + "text": "[ 4 ]", "hyperlink": "#cite_note-4" }, { @@ -7478,8 +7478,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[5]", - "text": "[5]", + "orig": "[ 5 ]", + "text": "[ 5 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992536-5" }, { @@ -7541,8 +7541,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[5]", - "text": "[5]", + "orig": "[ 5 ]", + "text": "[ 5 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992536-5" }, { @@ -7566,8 +7566,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[5]", - "text": "[5]", + "orig": "[ 5 ]", + "text": "[ 5 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992536-5" }, { @@ -7616,8 +7616,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[6]", - "text": "[6]", + "orig": "[ 6 ]", + "text": "[ 6 ]", "hyperlink": "#cite_note-FOOTNOTELivezey1986737–738-6" }, { @@ -7629,8 +7629,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[7]", - "text": "[7]", + "orig": "[ 7 ]", + "text": "[ 7 ]", "hyperlink": "#cite_note-FOOTNOTEMadsenMcHughde_Kloet1988452-7" }, { @@ -7654,8 +7654,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[8]", - "text": "[8]", + "orig": "[ 8 ]", + "text": "[ 8 ]", "hyperlink": 
"#cite_note-FOOTNOTEDonne-GousséLaudetHänni2002353–354-8" }, { @@ -7667,8 +7667,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[9]", - "text": "[9]", + "orig": "[ 9 ]", + "text": "[ 9 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992540-9" }, { @@ -7717,8 +7717,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[9]", - "text": "[9]", + "orig": "[ 9 ]", + "text": "[ 9 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992540-9" }, { @@ -7754,8 +7754,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[10]", - "text": "[10]", + "orig": "[ 10 ]", + "text": "[ 10 ]", "hyperlink": "#cite_note-FOOTNOTEElphickDunningSibley2001191-10" }, { @@ -7779,8 +7779,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[11]", - "text": "[11]", + "orig": "[ 11 ]", + "text": "[ 11 ]", "hyperlink": "#cite_note-FOOTNOTEKear2005448-11" }, { @@ -7804,8 +7804,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[12]", - "text": "[12]", + "orig": "[ 12 ]", + "text": "[ 12 ]", "hyperlink": "#cite_note-FOOTNOTEKear2005622–623-12" }, { @@ -7829,8 +7829,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[13]", - "text": "[13]", + "orig": "[ 13 ]", + "text": "[ 13 ]", "hyperlink": "#cite_note-FOOTNOTEKear2005686-13" }, { @@ -7854,8 +7854,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[14]", - "text": "[14]", + "orig": "[ 14 ]", + "text": "[ 14 ]", "hyperlink": "#cite_note-FOOTNOTEElphickDunningSibley2001193-14" }, { @@ -7904,8 +7904,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[15]", - "text": "[15]", + "orig": "[ 15 ]", + "text": "[ 15 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992537-15" }, { @@ -7929,8 +7929,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[9]", - "text": "[9]", + "orig": "[ 9 ]", + "text": "[ 9 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992540-9" }, { @@ -7942,8 +7942,8 @@ 
"content_layer": "body", "label": "text", "prov": [], - "orig": "[16]", - "text": "[16]", + "orig": "[ 16 ]", + "text": "[ 16 ]", "hyperlink": "#cite_note-FOOTNOTEAmerican_Ornithologists'_Union1998xix-16" }, { @@ -7992,8 +7992,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[15]", - "text": "[15]", + "orig": "[ 15 ]", + "text": "[ 15 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992537-15" }, { @@ -8017,8 +8017,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[9]", - "text": "[9]", + "orig": "[ 9 ]", + "text": "[ 9 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992540-9" }, { @@ -8067,8 +8067,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[15]", - "text": "[15]", + "orig": "[ 15 ]", + "text": "[ 15 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992537-15" }, { @@ -8092,8 +8092,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[17]", - "text": "[17]", + "orig": "[ 17 ]", + "text": "[ 17 ]", "hyperlink": "#cite_note-FOOTNOTEAmerican_Ornithologists'_Union1998-17" }, { @@ -8142,8 +8142,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[15]", - "text": "[15]", + "orig": "[ 15 ]", + "text": "[ 15 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992537-15" }, { @@ -8167,8 +8167,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[9]", - "text": "[9]", + "orig": "[ 9 ]", + "text": "[ 9 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992540-9" }, { @@ -8217,8 +8217,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[9]", - "text": "[9]", + "orig": "[ 9 ]", + "text": "[ 9 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992540-9" }, { @@ -8267,8 +8267,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[15]", - "text": "[15]", + "orig": "[ 15 ]", + "text": "[ 15 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992537-15" }, { @@ -8292,8 +8292,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": 
"[18]", - "text": "[18]", + "orig": "[ 18 ]", + "text": "[ 18 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992538-18" }, { @@ -8342,8 +8342,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[15]", - "text": "[15]", + "orig": "[ 15 ]", + "text": "[ 15 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992537-15" }, { @@ -8367,8 +8367,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[19]", - "text": "[19]", + "orig": "[ 19 ]", + "text": "[ 19 ]", "hyperlink": "#cite_note-FOOTNOTEChristidisBoles200862-19" }, { @@ -8392,8 +8392,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[15]", - "text": "[15]", + "orig": "[ 15 ]", + "text": "[ 15 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992537-15" }, { @@ -8839,8 +8839,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[5]", - "text": "[5]", + "orig": "[ 5 ]", + "text": "[ 5 ]", "hyperlink": "#cite_note-FOOTNOTECarboneras1992536-5" }, { @@ -8914,8 +8914,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[20]", - "text": "[20]", + "orig": "[ 20 ]", + "text": "[ 20 ]", "hyperlink": "#cite_note-FOOTNOTEShirihai2008239,_245-20" }, { @@ -9064,8 +9064,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[21]", - "text": "[21]", + "orig": "[ 21 ]", + "text": "[ 21 ]", "hyperlink": "#cite_note-FOOTNOTEPrattBrunerBerrett198798–107-21" }, { @@ -9077,8 +9077,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[22]", - "text": "[22]", + "orig": "[ 22 ]", + "text": "[ 22 ]", "hyperlink": "#cite_note-FOOTNOTEFitterFitterHosking200052–3-22" }, { @@ -9127,8 +9127,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[21]", - "text": "[21]", + "orig": "[ 21 ]", + "text": "[ 21 ]", "hyperlink": "#cite_note-FOOTNOTEPrattBrunerBerrett198798–107-21" }, { @@ -9164,8 +9164,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[23]", - "text": "[23]", + "orig": "[ 23 ]", + "text": 
"[ 23 ]", "hyperlink": "#cite_note-23" }, { @@ -9352,8 +9352,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[24]", - "text": "[24]", + "orig": "[ 24 ]", + "text": "[ 24 ]", "hyperlink": "#cite_note-24" }, { @@ -9601,8 +9601,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[25]", - "text": "[25]", + "orig": "[ 25 ]", + "text": "[ 25 ]", "hyperlink": "#cite_note-25" }, { @@ -9683,8 +9683,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[26]", - "text": "[26]", + "orig": "[ 26 ]", + "text": "[ 26 ]", "hyperlink": "#cite_note-26" }, { @@ -9708,8 +9708,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[27]", - "text": "[27]", + "orig": "[ 27 ]", + "text": "[ 27 ]", "hyperlink": "#cite_note-27" }, { @@ -9808,8 +9808,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[28]", - "text": "[28]", + "orig": "[ 28 ]", + "text": "[ 28 ]", "hyperlink": "#cite_note-28" }, { @@ -9866,8 +9866,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "ducks (as well as several other species in the genus Anas, such as the", - "text": "ducks (as well as several other species in the genus Anas, such as the" + "orig": "ducks (as well as several other species in the genus Anas , such as the", + "text": "ducks (as well as several other species in the genus Anas , such as the" }, { "self_ref": "#/texts/471", @@ -10003,8 +10003,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[29]", - "text": "[29]", + "orig": "[ 29 ]", + "text": "[ 29 ]", "hyperlink": "#cite_note-29" }, { @@ -10041,8 +10041,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "]but, despite widespread misconceptions, most species of duck do not \"quack\".", - "text": "]but, despite widespread misconceptions, most species of duck do not \"quack\"." 
+ "orig": "] but, despite widespread misconceptions, most species of duck do not \"quack\".", + "text": "] but, despite widespread misconceptions, most species of duck do not \"quack\"." }, { "self_ref": "#/texts/485", @@ -10053,8 +10053,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[30]", - "text": "[30]", + "orig": "[ 30 ]", + "text": "[ 30 ]", "hyperlink": "#cite_note-30" }, { @@ -10240,8 +10240,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[31]", - "text": "[31]", + "orig": "[ 31 ]", + "text": "[ 31 ]", "hyperlink": "#cite_note-31" }, { @@ -10315,8 +10315,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[32]", - "text": "[32]", + "orig": "[ 32 ]", + "text": "[ 32 ]", "hyperlink": "#cite_note-32" }, { @@ -10770,8 +10770,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[33]", - "text": "[33]", + "orig": "[ 33 ]", + "text": "[ 33 ]", "hyperlink": "#cite_note-FOOTNOTEErlandson1994171-33" }, { @@ -10845,8 +10845,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[34]", - "text": "[34]", + "orig": "[ 34 ]", + "text": "[ 34 ]", "hyperlink": "#cite_note-FOOTNOTEJeffries2008168,_243-34" }, { @@ -10870,8 +10870,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[35]", - "text": "[35]", + "orig": "[ 35 ]", + "text": "[ 35 ]", "hyperlink": "#cite_note-FOOTNOTESued-Badillo200365-35" }, { @@ -10895,8 +10895,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[36]", - "text": "[36]", + "orig": "[ 36 ]", + "text": "[ 36 ]", "hyperlink": "#cite_note-FOOTNOTEThorpe199668-36" }, { @@ -10920,8 +10920,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[37]", - "text": "[37]", + "orig": "[ 37 ]", + "text": "[ 37 ]", "hyperlink": "#cite_note-FOOTNOTEMaisels199942-37" }, { @@ -10945,8 +10945,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[38]", - "text": "[38]", + "orig": "[ 38 ]", + "text": "[ 38 
]", "hyperlink": "#cite_note-FOOTNOTERau1876133-38" }, { @@ -10970,8 +10970,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[39]", - "text": "[39]", + "orig": "[ 39 ]", + "text": "[ 39 ]", "hyperlink": "#cite_note-FOOTNOTEHigman201223-39" }, { @@ -11045,8 +11045,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[40]", - "text": "[40]", + "orig": "[ 40 ]", + "text": "[ 40 ]", "hyperlink": "#cite_note-FOOTNOTEHume201253-40" }, { @@ -11095,8 +11095,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[41]", - "text": "[41]", + "orig": "[ 41 ]", + "text": "[ 41 ]", "hyperlink": "#cite_note-FOOTNOTEHume201252-41" }, { @@ -11120,8 +11120,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[35]", - "text": "[35]", + "orig": "[ 35 ]", + "text": "[ 35 ]", "hyperlink": "#cite_note-FOOTNOTESued-Badillo200365-35" }, { @@ -11133,8 +11133,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[42]", - "text": "[42]", + "orig": "[ 42 ]", + "text": "[ 42 ]", "hyperlink": "#cite_note-FOOTNOTEFieldhouse2002167-42" }, { @@ -11158,8 +11158,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[43]", - "text": "[43]", + "orig": "[ 43 ]", + "text": "[ 43 ]", "hyperlink": "#cite_note-43" }, { @@ -11258,8 +11258,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[44]", - "text": "[44]", + "orig": "[ 44 ]", + "text": "[ 44 ]", "hyperlink": "#cite_note-44" }, { @@ -11340,8 +11340,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[45]", - "text": "[45]", + "orig": "[ 45 ]", + "text": "[ 45 ]", "hyperlink": "#cite_note-45" }, { @@ -11378,8 +11378,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "(Anas platyrhynchos), apart from the", - "text": "(Anas platyrhynchos), apart from the" + "orig": "( Anas platyrhynchos ), apart from the", + "text": "( Anas platyrhynchos ), apart from the" }, { "self_ref": "#/texts/589", @@ -11403,8 
+11403,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "(Cairina moschata).", - "text": "(Cairina moschata)." + "orig": "( Cairina moschata ).", + "text": "( Cairina moschata )." }, { "self_ref": "#/texts/591", @@ -11415,8 +11415,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[46]", - "text": "[46]", + "orig": "[ 46 ]", + "text": "[ 46 ]", "hyperlink": "#cite_note-46" }, { @@ -11428,8 +11428,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[47]", - "text": "[47]", + "orig": "[ 47 ]", + "text": "[ 47 ]", "hyperlink": "#cite_note-47" }, { @@ -11478,8 +11478,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[48]", - "text": "[48]", + "orig": "[ 48 ]", + "text": "[ 48 ]", "hyperlink": "#cite_note-48" }, { @@ -11610,8 +11610,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[50]", - "text": "[50]", + "orig": "[ 50 ]", + "text": "[ 50 ]", "hyperlink": "#cite_note-50" }, { @@ -11685,8 +11685,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[51]", - "text": "[51]", + "orig": "[ 51 ]", + "text": "[ 51 ]", "hyperlink": "#cite_note-51" }, { @@ -11830,8 +11830,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[52]", - "text": "[52]", + "orig": "[ 52 ]", + "text": "[ 52 ]", "hyperlink": "#cite_note-52" }, { @@ -12030,8 +12030,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[53]", - "text": "[53]", + "orig": "[ 53 ]", + "text": "[ 53 ]", "hyperlink": "#cite_note-53" }, { @@ -12043,8 +12043,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[54]", - "text": "[54]", + "orig": "[ 54 ]", + "text": "[ 54 ]", "hyperlink": "#cite_note-54" }, { @@ -12193,8 +12193,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": ", who were founded with the name the Mighty Ducks of Anaheim.[", - "text": ", who were founded with the name the Mighty Ducks of Anaheim.[" + "orig": ", who were founded with 
the name the Mighty Ducks of Anaheim. [", + "text": ", who were founded with the name the Mighty Ducks of Anaheim. [" }, { "self_ref": "#/texts/653", @@ -12218,8 +12218,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "]The duck is also the nickname of the", - "text": "]The duck is also the nickname of the" + "orig": "] The duck is also the nickname of the", + "text": "] The duck is also the nickname of the" }, { "self_ref": "#/texts/655", @@ -12305,8 +12305,8 @@ "content_layer": "body", "label": "text", "prov": [], - "orig": "[55]", - "text": "[55]", + "orig": "[ 55 ]", + "text": "[ 55 ]", "hyperlink": "#cite_note-55" }, { diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md index 49bfd5c1..18496c81 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md @@ -231,17 +231,17 @@ Ducks are sometimes confused with several types of unrelated water birds with si ## Etymology -The word duck comes from [Old English](/wiki/Old_English) dūce'diver', a derivative of the verb*dūcan'to duck, bend down low as if to get under something, or dive', because of the way many species in the [dabbling duck](/wiki/Dabbling_duck) group feed by upending; compare with [Dutch](/wiki/Dutch_language) duiken and [German](/wiki/German_language) tauchen'to dive'. +The word duck comes from [Old English](/wiki/Old_English) dūce 'diver', a derivative of the verb * dūcan 'to duck, bend down low as if to get under something, or dive', because of the way many species in the [dabbling duck](/wiki/Dabbling_duck) group feed by upending; compare with [Dutch](/wiki/Dutch_language) duiken and [German](/wiki/German_language) tauchen 'to dive'. Pacific black duck displaying the characteristic upending "duck" -This word replaced Old English ened/ænid'duck', possibly to avoid confusion with other words, such as ende'end' with similar forms. 
Other Germanic languages still have similar words for duck, for example, Dutch eend, German Ente and [Norwegian](/wiki/Norwegian_language) and. The word ened/ænid was inherited from [Proto-Indo-European](/wiki/Proto-Indo-European_language) ; [cf.](/wiki/Cf.) [Latin](/wiki/Latin) anas"duck", [Lithuanian](/wiki/Lithuanian_language) ántis'duck', [Ancient Greek](/wiki/Ancient_Greek_language) νῆσσα/νῆττα(nēssa/nētta) 'duck', and [Sanskrit](/wiki/Sanskrit) ātí'water bird', among others. +This word replaced Old English ened / ænid 'duck', possibly to avoid confusion with other words, such as ende 'end' with similar forms. Other Germanic languages still have similar words for duck , for example, Dutch eend , German Ente and [Norwegian](/wiki/Norwegian_language) and . The word ened / ænid was inherited from [Proto-Indo-European](/wiki/Proto-Indo-European_language) ; [cf.](/wiki/Cf.) [Latin](/wiki/Latin) anas "duck", [Lithuanian](/wiki/Lithuanian_language) ántis 'duck', [Ancient Greek](/wiki/Ancient_Greek_language) νῆσσα / νῆττα ( nēssa / nētta ) 'duck', and [Sanskrit](/wiki/Sanskrit) ātí 'water bird', among others. -A duckling is a young duck in downy plumage [[1]](#cite_note-1) or baby duck, [[2]](#cite_note-2) but in the food trade a young domestic duck which has just reached adult size and bulk and its meat is still fully tender, is sometimes labelled as a duckling. +A duckling is a young duck in downy plumage [[ 1 ]](#cite_note-1) or baby duck, [[ 2 ]](#cite_note-2) but in the food trade a young domestic duck which has just reached adult size and bulk and its meat is still fully tender, is sometimes labelled as a duckling. -A male is called a [drake](https://en.wiktionary.org/wiki/drake) and the female is called a duck, or in [ornithology](/wiki/Ornithology) a hen. [[3]](#cite_note-3) [[4]](#cite_note-4) +A male is called a [drake](https://en.wiktionary.org/wiki/drake) and the female is called a duck, or in [ornithology](/wiki/Ornithology) a hen. 
[[ 3 ]](#cite_note-3) [[ 4 ]](#cite_note-4) Male mallard. @@ -253,15 +253,15 @@ Wood ducks. ## Taxonomy -All ducks belong to the [biological order](/wiki/Order_(biology)) [Anseriformes](/wiki/Anseriformes) , a group that contains the ducks, geese and swans, as well as the [screamers](/wiki/Screamer) , and the [magpie goose](/wiki/Magpie_goose) . [[5]](#cite_note-FOOTNOTECarboneras1992536-5) All except the screamers belong to the [biological family](/wiki/Family_(biology)) [Anatidae](/wiki/Anatidae) . [[5]](#cite_note-FOOTNOTECarboneras1992536-5) Within the family, ducks are split into a variety of subfamilies and 'tribes'. The number and composition of these subfamilies and tribes is the cause of considerable disagreement among taxonomists. [[5]](#cite_note-FOOTNOTECarboneras1992536-5) Some base their decisions on [morphological characteristics](/wiki/Morphology_(biology)) , others on shared behaviours or genetic studies. [[6]](#cite_note-FOOTNOTELivezey1986737–738-6) [[7]](#cite_note-FOOTNOTEMadsenMcHughde_Kloet1988452-7) The number of suggested subfamilies containing ducks ranges from two to five. [[8]](#cite_note-FOOTNOTEDonne-GousséLaudetHänni2002353–354-8) [[9]](#cite_note-FOOTNOTECarboneras1992540-9) The significant level of [hybridisation](/wiki/Hybrid_(biology)) that occurs among wild ducks complicates efforts to tease apart the relationships between various species. [[9]](#cite_note-FOOTNOTECarboneras1992540-9) +All ducks belong to the [biological order](/wiki/Order_(biology)) [Anseriformes](/wiki/Anseriformes) , a group that contains the ducks, geese and swans, as well as the [screamers](/wiki/Screamer) , and the [magpie goose](/wiki/Magpie_goose) . [[ 5 ]](#cite_note-FOOTNOTECarboneras1992536-5) All except the screamers belong to the [biological family](/wiki/Family_(biology)) [Anatidae](/wiki/Anatidae) . [[ 5 ]](#cite_note-FOOTNOTECarboneras1992536-5) Within the family, ducks are split into a variety of subfamilies and 'tribes'. 
The number and composition of these subfamilies and tribes is the cause of considerable disagreement among taxonomists. [[ 5 ]](#cite_note-FOOTNOTECarboneras1992536-5) Some base their decisions on [morphological characteristics](/wiki/Morphology_(biology)) , others on shared behaviours or genetic studies. [[ 6 ]](#cite_note-FOOTNOTELivezey1986737–738-6) [[ 7 ]](#cite_note-FOOTNOTEMadsenMcHughde_Kloet1988452-7) The number of suggested subfamilies containing ducks ranges from two to five. [[ 8 ]](#cite_note-FOOTNOTEDonne-GousséLaudetHänni2002353–354-8) [[ 9 ]](#cite_note-FOOTNOTECarboneras1992540-9) The significant level of [hybridisation](/wiki/Hybrid_(biology)) that occurs among wild ducks complicates efforts to tease apart the relationships between various species. [[ 9 ]](#cite_note-FOOTNOTECarboneras1992540-9) Mallard landing in approach -In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes. [[10]](#cite_note-FOOTNOTEElphickDunningSibley2001191-10) The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks – named for their method of feeding primarily at the surface of fresh water. [[11]](#cite_note-FOOTNOTEKear2005448-11) The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini. [[12]](#cite_note-FOOTNOTEKear2005622–623-12) The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater. [[13]](#cite_note-FOOTNOTEKear2005686-13) The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails. [[14]](#cite_note-FOOTNOTEElphickDunningSibley2001193-14) +In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes. 
[[ 10 ]](#cite_note-FOOTNOTEElphickDunningSibley2001191-10) The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks – named for their method of feeding primarily at the surface of fresh water. [[ 11 ]](#cite_note-FOOTNOTEKear2005448-11) The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini. [[ 12 ]](#cite_note-FOOTNOTEKear2005622–623-12) The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater. [[ 13 ]](#cite_note-FOOTNOTEKear2005686-13) The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails. [[ 14 ]](#cite_note-FOOTNOTEElphickDunningSibley2001193-14) -A number of other species called ducks are not considered to be 'true ducks', and are typically placed in other subfamilies or tribes. The [whistling ducks](/wiki/Whistling_duck) are assigned either to a tribe (Dendrocygnini) in the subfamily Anatinae or the subfamily Anserinae, [[15]](#cite_note-FOOTNOTECarboneras1992537-15) or to their own subfamily (Dendrocygninae) or family (Dendrocyganidae). [[9]](#cite_note-FOOTNOTECarboneras1992540-9) [[16]](#cite_note-FOOTNOTEAmerican_Ornithologists'_Union1998xix-16) The [freckled duck](/wiki/Freckled_duck) of Australia is either the sole member of the tribe Stictonettini in the subfamily Anserinae, [[15]](#cite_note-FOOTNOTECarboneras1992537-15) or in its own family, the Stictonettinae. 
[[9]](#cite_note-FOOTNOTECarboneras1992540-9) The [shelducks](/wiki/Shelduck) make up the tribe Tadornini in the family Anserinae in some classifications, [[15]](#cite_note-FOOTNOTECarboneras1992537-15) and their own subfamily, Tadorninae, in others, [[17]](#cite_note-FOOTNOTEAmerican_Ornithologists'_Union1998-17) while the [steamer ducks](/wiki/Steamer_duck) are either placed in the family Anserinae in the tribe Tachyerini [[15]](#cite_note-FOOTNOTECarboneras1992537-15) or lumped with the shelducks in the tribe Tadorini. [[9]](#cite_note-FOOTNOTECarboneras1992540-9) The [perching ducks](/wiki/Perching_duck) make up in the tribe Cairinini in the subfamily Anserinae in some classifications, while that tribe is eliminated in other classifications and its members assigned to the tribe Anatini. [[9]](#cite_note-FOOTNOTECarboneras1992540-9) The [torrent duck](/wiki/Torrent_duck) is generally included in the subfamily Anserinae in the monotypic tribe Merganettini, [[15]](#cite_note-FOOTNOTECarboneras1992537-15) but is sometimes included in the tribe Tadornini. [[18]](#cite_note-FOOTNOTECarboneras1992538-18) The [pink-eared duck](/wiki/Pink-eared_duck) is sometimes included as a true duck either in the tribe Anatini [[15]](#cite_note-FOOTNOTECarboneras1992537-15) or the tribe Malacorhynchini, [[19]](#cite_note-FOOTNOTEChristidisBoles200862-19) and other times is included with the shelducks in the tribe Tadornini. [[15]](#cite_note-FOOTNOTECarboneras1992537-15) +A number of other species called ducks are not considered to be 'true ducks', and are typically placed in other subfamilies or tribes. The [whistling ducks](/wiki/Whistling_duck) are assigned either to a tribe (Dendrocygnini) in the subfamily Anatinae or the subfamily Anserinae, [[ 15 ]](#cite_note-FOOTNOTECarboneras1992537-15) or to their own subfamily (Dendrocygninae) or family (Dendrocyganidae). 
[[ 9 ]](#cite_note-FOOTNOTECarboneras1992540-9) [[ 16 ]](#cite_note-FOOTNOTEAmerican_Ornithologists'_Union1998xix-16) The [freckled duck](/wiki/Freckled_duck) of Australia is either the sole member of the tribe Stictonettini in the subfamily Anserinae, [[ 15 ]](#cite_note-FOOTNOTECarboneras1992537-15) or in its own family, the Stictonettinae. [[ 9 ]](#cite_note-FOOTNOTECarboneras1992540-9) The [shelducks](/wiki/Shelduck) make up the tribe Tadornini in the family Anserinae in some classifications, [[ 15 ]](#cite_note-FOOTNOTECarboneras1992537-15) and their own subfamily, Tadorninae, in others, [[ 17 ]](#cite_note-FOOTNOTEAmerican_Ornithologists'_Union1998-17) while the [steamer ducks](/wiki/Steamer_duck) are either placed in the family Anserinae in the tribe Tachyerini [[ 15 ]](#cite_note-FOOTNOTECarboneras1992537-15) or lumped with the shelducks in the tribe Tadorini. [[ 9 ]](#cite_note-FOOTNOTECarboneras1992540-9) The [perching ducks](/wiki/Perching_duck) make up in the tribe Cairinini in the subfamily Anserinae in some classifications, while that tribe is eliminated in other classifications and its members assigned to the tribe Anatini. [[ 9 ]](#cite_note-FOOTNOTECarboneras1992540-9) The [torrent duck](/wiki/Torrent_duck) is generally included in the subfamily Anserinae in the monotypic tribe Merganettini, [[ 15 ]](#cite_note-FOOTNOTECarboneras1992537-15) but is sometimes included in the tribe Tadornini. [[ 18 ]](#cite_note-FOOTNOTECarboneras1992538-18) The [pink-eared duck](/wiki/Pink-eared_duck) is sometimes included as a true duck either in the tribe Anatini [[ 15 ]](#cite_note-FOOTNOTECarboneras1992537-15) or the tribe Malacorhynchini, [[ 19 ]](#cite_note-FOOTNOTEChristidisBoles200862-19) and other times is included with the shelducks in the tribe Tadornini. 
[[ 15 ]](#cite_note-FOOTNOTECarboneras1992537-15) ## Morphology @@ -279,13 +279,13 @@ Flying steamer ducks in Ushuaia, Argentina -Ducks have a [cosmopolitan distribution](/wiki/Cosmopolitan_distribution) , and are found on every continent except Antarctica. [[5]](#cite_note-FOOTNOTECarboneras1992536-5) Several species manage to live on subantarctic islands, including [South Georgia](/wiki/South_Georgia_and_the_South_Sandwich_Islands) and the [Auckland Islands](/wiki/Auckland_Islands) . [[20]](#cite_note-FOOTNOTEShirihai2008239,_245-20) Ducks have reached a number of isolated oceanic islands, including the [Hawaiian Islands](/wiki/Hawaiian_Islands) , [Micronesia](/wiki/Micronesia) and the [Galápagos Islands](/wiki/Gal%C3%A1pagos_Islands) , where they are often [vagrants](/wiki/Glossary_of_bird_terms#vagrants) and less often [residents](/wiki/Glossary_of_bird_terms#residents) . [[21]](#cite_note-FOOTNOTEPrattBrunerBerrett198798–107-21) [[22]](#cite_note-FOOTNOTEFitterFitterHosking200052–3-22) A handful are [endemic](/wiki/Endemic) to such far-flung islands. [[21]](#cite_note-FOOTNOTEPrattBrunerBerrett198798–107-21) +Ducks have a [cosmopolitan distribution](/wiki/Cosmopolitan_distribution) , and are found on every continent except Antarctica. [[ 5 ]](#cite_note-FOOTNOTECarboneras1992536-5) Several species manage to live on subantarctic islands, including [South Georgia](/wiki/South_Georgia_and_the_South_Sandwich_Islands) and the [Auckland Islands](/wiki/Auckland_Islands) . [[ 20 ]](#cite_note-FOOTNOTEShirihai2008239,_245-20) Ducks have reached a number of isolated oceanic islands, including the [Hawaiian Islands](/wiki/Hawaiian_Islands) , [Micronesia](/wiki/Micronesia) and the [Galápagos Islands](/wiki/Gal%C3%A1pagos_Islands) , where they are often [vagrants](/wiki/Glossary_of_bird_terms#vagrants) and less often [residents](/wiki/Glossary_of_bird_terms#residents) . 
[[ 21 ]](#cite_note-FOOTNOTEPrattBrunerBerrett198798–107-21) [[ 22 ]](#cite_note-FOOTNOTEFitterFitterHosking200052–3-22) A handful are [endemic](/wiki/Endemic) to such far-flung islands. [[ 21 ]](#cite_note-FOOTNOTEPrattBrunerBerrett198798–107-21) Female mallard in Cornwall, England -Some duck species, mainly those breeding in the temperate and Arctic Northern Hemisphere, are migratory; those in the tropics are generally not. Some ducks, particularly in Australia where rainfall is erratic, are nomadic, seeking out the temporary lakes and pools that form after localised heavy rain. [[23]](#cite_note-23) +Some duck species, mainly those breeding in the temperate and Arctic Northern Hemisphere, are migratory; those in the tropics are generally not. Some ducks, particularly in Australia where rainfall is erratic, are nomadic, seeking out the temporary lakes and pools that form after localised heavy rain. [[ 23 ]](#cite_note-23) ## Behaviour @@ -301,7 +301,7 @@ Mallard duckling preening Ducks eat food sources such as [grasses](/wiki/Poaceae) , aquatic plants, fish, insects, small amphibians, worms, and small [molluscs](/wiki/Mollusc) . -[Dabbling ducks](/wiki/Dabbling_duck) feed on the surface of water or on land, or as deep as they can reach by up-ending without completely submerging. [[24]](#cite_note-24) Along the edge of the bill, there is a comb-like structure called a [pecten](/wiki/Pecten_(biology)) . This strains the water squirting from the side of the bill and traps any food. The pecten is also used to preen feathers and to hold slippery food items. +[Dabbling ducks](/wiki/Dabbling_duck) feed on the surface of water or on land, or as deep as they can reach by up-ending without completely submerging. [[ 24 ]](#cite_note-24) Along the edge of the bill, there is a comb-like structure called a [pecten](/wiki/Pecten_(biology)) . This strains the water squirting from the side of the bill and traps any food. 
The pecten is also used to preen feathers and to hold slippery food items. [Diving ducks](/wiki/Diving_duck) and [sea ducks](/wiki/Sea_duck) forage deep underwater. To be able to submerge more easily, the diving ducks are heavier than dabbling ducks, and therefore have more difficulty taking off to fly. @@ -309,7 +309,7 @@ A few specialized species such as the [mergansers](/wiki/Merganser) are adapted The others have the characteristic wide flat bill adapted to [dredging](/wiki/Dredging) -type jobs such as pulling up waterweed, pulling worms and small molluscs out of mud, searching for insect larvae, and bulk jobs such as dredging out, holding, turning head first, and swallowing a squirming frog. To avoid injury when digging into sediment it has no [cere](/wiki/Cere) , but the nostrils come out through hard horn. -[The Guardian](/wiki/The_Guardian) published an article advising that ducks should not be fed with bread because it [damages the health of the ducks](/wiki/Angel_wing) and pollutes waterways. [[25]](#cite_note-25) +[The Guardian](/wiki/The_Guardian) published an article advising that ducks should not be fed with bread because it [damages the health of the ducks](/wiki/Angel_wing) and pollutes waterways. [[ 25 ]](#cite_note-25) ### Breeding @@ -317,13 +317,13 @@ A Muscovy duckling -Ducks generally [only have one partner at a time](/wiki/Monogamy_in_animals) , although the partnership usually only lasts one year. [[26]](#cite_note-26) Larger species and the more sedentary species (like fast-river specialists) tend to have pair-bonds that last numerous years. [[27]](#cite_note-27) Most duck species breed once a year, choosing to do so in favourable conditions ( [spring](/wiki/Spring_(season)) /summer or wet seasons). Ducks also tend to make a [nest](/wiki/Bird_nest) before breeding, and, after hatching, lead their ducklings to water. 
Mother ducks are very caring and protective of their young, but may abandon some of their ducklings if they are physically stuck in an area they cannot get out of (such as nesting in an enclosed [courtyard](/wiki/Courtyard) ) or are not prospering due to genetic defects or sickness brought about by hypothermia, starvation, or disease. Ducklings can also be orphaned by inconsistent late hatching where a few eggs hatch after the mother has abandoned the nest and led her ducklings to water. [[28]](#cite_note-28) +Ducks generally [only have one partner at a time](/wiki/Monogamy_in_animals) , although the partnership usually only lasts one year. [[ 26 ]](#cite_note-26) Larger species and the more sedentary species (like fast-river specialists) tend to have pair-bonds that last numerous years. [[ 27 ]](#cite_note-27) Most duck species breed once a year, choosing to do so in favourable conditions ( [spring](/wiki/Spring_(season)) /summer or wet seasons). Ducks also tend to make a [nest](/wiki/Bird_nest) before breeding, and, after hatching, lead their ducklings to water. Mother ducks are very caring and protective of their young, but may abandon some of their ducklings if they are physically stuck in an area they cannot get out of (such as nesting in an enclosed [courtyard](/wiki/Courtyard) ) or are not prospering due to genetic defects or sickness brought about by hypothermia, starvation, or disease. Ducklings can also be orphaned by inconsistent late hatching where a few eggs hatch after the mother has abandoned the nest and led her ducklings to water. 
[[ 28 ]](#cite_note-28) ### Communication -Female [mallard](/wiki/Mallard) ducks (as well as several other species in the genus Anas, such as the [American](/wiki/American_black_duck) and [Pacific black ducks](/wiki/Pacific_black_duck) , [spot-billed duck](/wiki/Spot-billed_duck) , [northern pintail](/wiki/Northern_pintail) and [common teal](/wiki/Common_teal) ) make the classic "quack" sound while males make a similar but raspier sound that is sometimes written as "breeeeze", [[29]](#cite_note-29) [ [self-published source?](/wiki/Wikipedia:Verifiability#Self-published_sources) ]but, despite widespread misconceptions, most species of duck do not "quack". [[30]](#cite_note-30) In general, ducks make a range of [calls](/wiki/Bird_vocalisation) , including whistles, cooing, yodels and grunts. For example, the [scaup](/wiki/Scaup) – which are [diving ducks](/wiki/Diving_duck) – make a noise like "scaup" (hence their name). Calls may be loud displaying calls or quieter contact calls. +Female [mallard](/wiki/Mallard) ducks (as well as several other species in the genus Anas , such as the [American](/wiki/American_black_duck) and [Pacific black ducks](/wiki/Pacific_black_duck) , [spot-billed duck](/wiki/Spot-billed_duck) , [northern pintail](/wiki/Northern_pintail) and [common teal](/wiki/Common_teal) ) make the classic "quack" sound while males make a similar but raspier sound that is sometimes written as "breeeeze", [[ 29 ]](#cite_note-29) [ [self-published source?](/wiki/Wikipedia:Verifiability#Self-published_sources) ] but, despite widespread misconceptions, most species of duck do not "quack". [[ 30 ]](#cite_note-30) In general, ducks make a range of [calls](/wiki/Bird_vocalisation) , including whistles, cooing, yodels and grunts. For example, the [scaup](/wiki/Scaup) – which are [diving ducks](/wiki/Diving_duck) – make a noise like "scaup" (hence their name). Calls may be loud displaying calls or quieter contact calls. 
-A common [urban legend](/wiki/Urban_legend) claims that duck quacks do not echo; however, this has been proven to be false. This myth was first debunked by the Acoustics Research Centre at the [University of Salford](/wiki/University_of_Salford) in 2003 as part of the [British Association](/wiki/British_Association) 's Festival of Science. [[31]](#cite_note-31) It was also debunked in [one of the earlier episodes](/wiki/MythBusters_(2003_season)#Does_a_Duck's_Quack_Echo?) of the popular Discovery Channel television show [MythBusters](/wiki/MythBusters) . [[32]](#cite_note-32) +A common [urban legend](/wiki/Urban_legend) claims that duck quacks do not echo; however, this has been proven to be false. This myth was first debunked by the Acoustics Research Centre at the [University of Salford](/wiki/University_of_Salford) in 2003 as part of the [British Association](/wiki/British_Association) 's Festival of Science. [[ 31 ]](#cite_note-31) It was also debunked in [one of the earlier episodes](/wiki/MythBusters_(2003_season)#Does_a_Duck's_Quack_Echo?) of the popular Discovery Channel television show [MythBusters](/wiki/MythBusters) . [[ 32 ]](#cite_note-32) ### Predators @@ -339,9 +339,9 @@ Adult ducks are fast fliers, but may be caught on the water by large aquatic pre ### Hunting -Humans have hunted ducks since prehistoric times. Excavations of [middens](/wiki/Midden) in California dating to 7800 – 6400 [BP](/wiki/Before_present) have turned up bones of ducks, including at least one now-extinct flightless species. [[33]](#cite_note-FOOTNOTEErlandson1994171-33) Ducks were captured in "significant numbers" by [Holocene](/wiki/Holocene) inhabitants of the lower [Ohio River](/wiki/Ohio_River) valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl. 
[[34]](#cite_note-FOOTNOTEJeffries2008168,_243-34) Neolithic hunters in locations as far apart as the Caribbean, [[35]](#cite_note-FOOTNOTESued-Badillo200365-35) Scandinavia, [[36]](#cite_note-FOOTNOTEThorpe199668-36) Egypt, [[37]](#cite_note-FOOTNOTEMaisels199942-37) Switzerland, [[38]](#cite_note-FOOTNOTERau1876133-38) and China relied on ducks as a source of protein for some or all of the year. [[39]](#cite_note-FOOTNOTEHigman201223-39) Archeological evidence shows that [Māori people](/wiki/M%C4%81ori_people) in New Zealand hunted the flightless [Finsch's duck](/wiki/Finsch%27s_duck) , possibly to extinction, though rat predation may also have contributed to its fate. [[40]](#cite_note-FOOTNOTEHume201253-40) A similar end awaited the [Chatham duck](/wiki/Chatham_duck) , a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers. [[41]](#cite_note-FOOTNOTEHume201252-41) It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon. [[35]](#cite_note-FOOTNOTESued-Badillo200365-35) [[42]](#cite_note-FOOTNOTEFieldhouse2002167-42) +Humans have hunted ducks since prehistoric times. Excavations of [middens](/wiki/Midden) in California dating to 7800 – 6400 [BP](/wiki/Before_present) have turned up bones of ducks, including at least one now-extinct flightless species. [[ 33 ]](#cite_note-FOOTNOTEErlandson1994171-33) Ducks were captured in "significant numbers" by [Holocene](/wiki/Holocene) inhabitants of the lower [Ohio River](/wiki/Ohio_River) valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl. 
[[ 34 ]](#cite_note-FOOTNOTEJeffries2008168,_243-34) Neolithic hunters in locations as far apart as the Caribbean, [[ 35 ]](#cite_note-FOOTNOTESued-Badillo200365-35) Scandinavia, [[ 36 ]](#cite_note-FOOTNOTEThorpe199668-36) Egypt, [[ 37 ]](#cite_note-FOOTNOTEMaisels199942-37) Switzerland, [[ 38 ]](#cite_note-FOOTNOTERau1876133-38) and China relied on ducks as a source of protein for some or all of the year. [[ 39 ]](#cite_note-FOOTNOTEHigman201223-39) Archeological evidence shows that [Māori people](/wiki/M%C4%81ori_people) in New Zealand hunted the flightless [Finsch's duck](/wiki/Finsch%27s_duck) , possibly to extinction, though rat predation may also have contributed to its fate. [[ 40 ]](#cite_note-FOOTNOTEHume201253-40) A similar end awaited the [Chatham duck](/wiki/Chatham_duck) , a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers. [[ 41 ]](#cite_note-FOOTNOTEHume201252-41) It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon. [[ 35 ]](#cite_note-FOOTNOTESued-Badillo200365-35) [[ 42 ]](#cite_note-FOOTNOTEFieldhouse2002167-42) -In many areas, wild ducks (including ducks farmed and released into the wild) are hunted for food or sport, [[43]](#cite_note-43) by shooting, or by being trapped using [duck decoys](/wiki/Duck_decoy_(structure)) . Because an idle floating duck or a duck squatting on land cannot react to fly or move quickly, "a sitting duck" has come to mean "an easy target". These ducks may be [contaminated by pollutants](/wiki/Duck_(food)#Pollution) such as [PCBs](/wiki/Polychlorinated_biphenyl) . [[44]](#cite_note-44) +In many areas, wild ducks (including ducks farmed and released into the wild) are hunted for food or sport, [[ 43 ]](#cite_note-43) by shooting, or by being trapped using [duck decoys](/wiki/Duck_decoy_(structure)) . 
Because an idle floating duck or a duck squatting on land cannot react to fly or move quickly, "a sitting duck" has come to mean "an easy target". These ducks may be [contaminated by pollutants](/wiki/Duck_(food)#Pollution) such as [PCBs](/wiki/Polychlorinated_biphenyl) . [[ 44 ]](#cite_note-44) ### Domestication @@ -349,7 +349,7 @@ Indian Runner ducks, a common breed of domestic ducks -Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their [down](/wiki/Down_feather) ). Approximately 3 billion ducks are slaughtered each year for meat worldwide. [[45]](#cite_note-45) They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the [mallard](/wiki/Mallard) (Anas platyrhynchos), apart from the [Muscovy duck](/wiki/Muscovy_duck) (Cairina moschata). [[46]](#cite_note-46) [[47]](#cite_note-47) The [Call duck](/wiki/Call_duck) is another example of a domestic duck breed. Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1 kg (2.2 lb). [[48]](#cite_note-48) +Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their [down](/wiki/Down_feather) ). Approximately 3 billion ducks are slaughtered each year for meat worldwide. [[ 45 ]](#cite_note-45) They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the [mallard](/wiki/Mallard) ( Anas platyrhynchos ), apart from the [Muscovy duck](/wiki/Muscovy_duck) ( Cairina moschata ). [[ 46 ]](#cite_note-46) [[ 47 ]](#cite_note-47) The [Call duck](/wiki/Call_duck) is another example of a domestic duck breed. 
Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1 kg (2.2 lb). [[ 48 ]](#cite_note-48) ### Heraldry @@ -357,13 +357,13 @@ Three black-colored ducks in the coat of arms of Maaninka[49] -Ducks appear on several [coats of arms](/wiki/Coats_of_arms) , including the coat of arms of [Lubāna](/wiki/Lub%C4%81na) ( [Latvia](/wiki/Latvia) ) [[50]](#cite_note-50) and the coat of arms of [Föglö](/wiki/F%C3%B6gl%C3%B6) ( [Åland](/wiki/%C3%85land) ). [[51]](#cite_note-51) +Ducks appear on several [coats of arms](/wiki/Coats_of_arms) , including the coat of arms of [Lubāna](/wiki/Lub%C4%81na) ( [Latvia](/wiki/Latvia) ) [[ 50 ]](#cite_note-50) and the coat of arms of [Föglö](/wiki/F%C3%B6gl%C3%B6) ( [Åland](/wiki/%C3%85land) ). [[ 51 ]](#cite_note-51) ### Cultural references -In 2002, psychologist [Richard Wiseman](/wiki/Richard_Wiseman) and colleagues at the [University of Hertfordshire](/wiki/University_of_Hertfordshire) , [UK](/wiki/UK) , finished a year-long [LaughLab](/wiki/LaughLab) experiment, concluding that of all animals, ducks attract the most humor and silliness; he said, "If you're going to tell a joke involving an animal, make it a duck." [[52]](#cite_note-52) The word "duck" may have become an [inherently funny word](/wiki/Inherently_funny_word) in many languages, possibly because ducks are seen as silly in their looks or behavior. Of the many [ducks in fiction](/wiki/List_of_fictional_ducks) , many are cartoon characters, such as [Walt Disney](/wiki/The_Walt_Disney_Company) 's [Donald Duck](/wiki/Donald_Duck) , and [Warner Bros.](/wiki/Warner_Bros.) ' [Daffy Duck](/wiki/Daffy_Duck) . [Howard the Duck](/wiki/Howard_the_Duck) started as a comic book character in 1973 [[53]](#cite_note-53) [[54]](#cite_note-54) and was made into a [movie](/wiki/Howard_the_Duck_(film)) in 1986. 
+In 2002, psychologist [Richard Wiseman](/wiki/Richard_Wiseman) and colleagues at the [University of Hertfordshire](/wiki/University_of_Hertfordshire) , [UK](/wiki/UK) , finished a year-long [LaughLab](/wiki/LaughLab) experiment, concluding that of all animals, ducks attract the most humor and silliness; he said, "If you're going to tell a joke involving an animal, make it a duck." [[ 52 ]](#cite_note-52) The word "duck" may have become an [inherently funny word](/wiki/Inherently_funny_word) in many languages, possibly because ducks are seen as silly in their looks or behavior. Of the many [ducks in fiction](/wiki/List_of_fictional_ducks) , many are cartoon characters, such as [Walt Disney](/wiki/The_Walt_Disney_Company) 's [Donald Duck](/wiki/Donald_Duck) , and [Warner Bros.](/wiki/Warner_Bros.) ' [Daffy Duck](/wiki/Daffy_Duck) . [Howard the Duck](/wiki/Howard_the_Duck) started as a comic book character in 1973 [[ 53 ]](#cite_note-53) [[ 54 ]](#cite_note-54) and was made into a [movie](/wiki/Howard_the_Duck_(film)) in 1986. -The 1992 Disney film [The Mighty Ducks](/wiki/The_Mighty_Ducks_(film)) , starring [Emilio Estevez](/wiki/Emilio_Estevez) , chose the duck as the mascot for the fictional youth hockey team who are protagonists of the movie, based on the duck being described as a fierce fighter. This led to the duck becoming the nickname and mascot for the eventual [National Hockey League](/wiki/National_Hockey_League) professional team of the [Anaheim Ducks](/wiki/Anaheim_Ducks) , who were founded with the name the Mighty Ducks of Anaheim.[ [citation needed](/wiki/Wikipedia:Citation_needed) ]The duck is also the nickname of the [University of Oregon](/wiki/University_of_Oregon) sports teams as well as the [Long Island Ducks](/wiki/Long_Island_Ducks) minor league [baseball](/wiki/Baseball) team. 
[[55]](#cite_note-55) +The 1992 Disney film [The Mighty Ducks](/wiki/The_Mighty_Ducks_(film)) , starring [Emilio Estevez](/wiki/Emilio_Estevez) , chose the duck as the mascot for the fictional youth hockey team who are protagonists of the movie, based on the duck being described as a fierce fighter. This led to the duck becoming the nickname and mascot for the eventual [National Hockey League](/wiki/National_Hockey_League) professional team of the [Anaheim Ducks](/wiki/Anaheim_Ducks) , who were founded with the name the Mighty Ducks of Anaheim. [ [citation needed](/wiki/Wikipedia:Citation_needed) ] The duck is also the nickname of the [University of Oregon](/wiki/University_of_Oregon) sports teams as well as the [Long Island Ducks](/wiki/Long_Island_Ducks) minor league [baseball](/wiki/Baseball) team. [[ 55 ]](#cite_note-55) ## See also diff --git a/tests/data/html/hyperlink_01.html b/tests/data/html/hyperlink_01.html new file mode 100644 index 00000000..f8c13b06 --- /dev/null +++ b/tests/data/html/hyperlink_01.html @@ -0,0 +1,17 @@ + + + +

Something

+

+ Please follow the link to: + + This page + + . +

+
+ +
+ + + \ No newline at end of file diff --git a/tests/data/html/hyperlink_02.html b/tests/data/html/hyperlink_02.html new file mode 100644 index 00000000..a18e6d52 --- /dev/null +++ b/tests/data/html/hyperlink_02.html @@ -0,0 +1,18 @@ + + + + + + + \ No newline at end of file diff --git a/tests/data/html/hyperlink_03.html b/tests/data/html/hyperlink_03.html new file mode 100644 index 00000000..f2b4f7a1 --- /dev/null +++ b/tests/data/html/hyperlink_03.html @@ -0,0 +1,31 @@ + + + + + + + \ No newline at end of file