From 86f70128aae9a4c4187c69757e77ebf90c8cf0c0 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Tue, 29 Jul 2025 11:05:35 +0200 Subject: [PATCH] fix(HTML): replace non-standard Unicode characters (#2006) chore(HTML): replace non-standard Unicode characters for better downstream tasks Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 77 ++++++++++++++++--- .../docling_v2/example_09.html.json | 2 +- .../groundtruth/docling_v2/example_09.html.md | 2 +- tests/data/groundtruth/docling_v2/mixed.md.md | 2 +- .../docling_v2/wiki_duck.html.itxt | 14 ++-- .../docling_v2/wiki_duck.html.json | 32 ++++---- .../groundtruth/docling_v2/wiki_duck.html.md | 32 ++++---- tests/test_backend_html.py | 16 ++++ 8 files changed, 125 insertions(+), 52 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 0c07994a..5f6366dd 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -125,8 +125,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): # set the title as furniture, since it is part of the document metadata title = self.soup.title if title: + title_text = title.get_text(separator=" ", strip=True) + title_clean = HTMLDocumentBackend._clean_unicode(title_text) doc.add_title( - text=title.get_text(separator=" ", strip=True), + text=title_clean, + orig=title_text, content_layer=ContentLayer.FURNITURE, ) # remove scripts/styles @@ -168,10 +171,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return for part in text.split("\n"): seg = part.strip() + seg_clean = HTMLDocumentBackend._clean_unicode(seg) if seg: doc.add_text( - DocItemLabel.TEXT, - seg, + label=DocItemLabel.TEXT, + text=seg_clean, + orig=seg, parent=self.parents[self.level], content_layer=self.content_layer, ) @@ -203,13 +208,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.content_layer = 
ContentLayer.BODY level = int(tag_name[1]) text = tag.get_text(strip=True, separator=" ") + text_clean = HTMLDocumentBackend._clean_unicode(text) # the first level is for the title item if level == 1: for key in self.parents.keys(): self.parents[key] = None self.level = 0 self.parents[self.level + 1] = doc.add_title( - text, content_layer=self.content_layer + text=text_clean, orig=text, content_layer=self.content_layer ) # the other levels need to be lowered by 1 if a title was set else: @@ -234,7 +240,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.level = level self.parents[self.level + 1] = doc.add_heading( parent=self.parents[self.level], - text=text, + text=text_clean, + orig=text, level=self.level, content_layer=self.content_layer, ) @@ -296,13 +303,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if text_part: parts.append(text_part) li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip() + li_clean = HTMLDocumentBackend._clean_unicode(li_text) # 3) add the list item if li_text: self.parents[self.level + 1] = doc.add_list_item( - text=li_text, + text=li_clean, enumerated=is_ordered, marker=marker, + orig=li_text, parent=list_group, content_layer=self.content_layer, ) @@ -344,11 +353,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): elif tag_name in {"p", "address", "summary"}: for part in tag.text.split("\n"): seg = part.strip() + seg_clean = HTMLDocumentBackend._clean_unicode(seg) if seg: doc.add_text( - parent=self.parents[self.level], label=DocItemLabel.TEXT, - text=seg, + text=seg_clean, + orig=seg, + parent=self.parents[self.level], content_layer=self.content_layer, ) for img_tag in tag("img"): @@ -370,10 +381,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): elif tag_name in {"pre", "code"}: # handle monospace code snippets (pre). 
text = tag.get_text(strip=True) + text_clean = HTMLDocumentBackend._clean_unicode(text) if text: doc.add_code( parent=self.parents[self.level], - text=text, + text=text_clean, + orig=text, content_layer=self.content_layer, ) @@ -402,8 +415,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): caption_item: Optional[TextItem] = None if caption: + caption_clean = HTMLDocumentBackend._clean_unicode(caption) caption_item = doc.add_text( - DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer + label=DocItemLabel.CAPTION, + text=caption_clean, + orig=caption, + content_layer=self.content_layer, ) doc.add_picture( @@ -442,6 +459,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "".join(parts) + @staticmethod + def _clean_unicode(text: str) -> str: + """Replace typical Unicode characters in HTML for text processing. + + Several Unicode characters (e.g., non-printable or formatting) are typically + found in HTML but are worth replacing to sanitize text and ensure consistency + in text processing tasks. + + Args: + text: The original text. + + Returns: + The sanitized text without typical Unicode characters. 
+ """ + replacements = { + "\u00a0": " ", # non-breaking space + "\u200b": "", # zero-width space + "\u200c": "", # zero-width non-joiner + "\u200d": "", # zero-width joiner + "\u2010": "-", # hyphen + "\u2011": "-", # non-breaking hyphen + "\u2012": "-", # dash + "\u2013": "-", # dash + "\u2014": "-", # dash + "\u2015": "-", # horizontal bar + "\u2018": "'", # left single quotation mark + "\u2019": "'", # right single quotation mark + "\u201c": '"', # left double quotation mark + "\u201d": '"', # right double quotation mark + "\u2026": "...", # ellipsis + "\u00ad": "", # soft hyphen + "\ufeff": "", # zero width non-break space + "\u202f": " ", # narrow non-break space + "\u2060": "", # word joiner + } + for raw, clean in replacements.items(): + text = text.replace(raw, clean) + + return text + @staticmethod def _get_cell_spans(cell: Tag) -> tuple[int, int]: """Extract colspan and rowspan values from a table cell tag. diff --git a/tests/data/groundtruth/docling_v2/example_09.html.json b/tests/data/groundtruth/docling_v2/example_09.html.json index 536f5aa3..f6e19253 100644 --- a/tests/data/groundtruth/docling_v2/example_09.html.json +++ b/tests/data/groundtruth/docling_v2/example_09.html.json @@ -133,7 +133,7 @@ "label": "text", "prov": [], "orig": "Docling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem.", - "text": "Docling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem." + "text": "Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem." 
}, { "self_ref": "#/texts/3", diff --git a/tests/data/groundtruth/docling_v2/example_09.html.md b/tests/data/groundtruth/docling_v2/example_09.html.md index 1502f624..dd849c6a 100644 --- a/tests/data/groundtruth/docling_v2/example_09.html.md +++ b/tests/data/groundtruth/docling_v2/example_09.html.md @@ -4,7 +4,7 @@ Docling -Docling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem. +Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem. ## Supported file formats diff --git a/tests/data/groundtruth/docling_v2/mixed.md.md b/tests/data/groundtruth/docling_v2/mixed.md.md index 6cd5d52b..5524e09d 100644 --- a/tests/data/groundtruth/docling_v2/mixed.md.md +++ b/tests/data/groundtruth/docling_v2/mixed.md.md @@ -17,7 +17,7 @@ And here is more HTML: Some paragraph. -Now a div — almost there... +Now a div - almost there... - foo - bar diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt index 0ec336dc..4f65770c 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt @@ -296,7 +296,7 @@ item-0 at level 0: unspecified: group _root_ item-285 at level 4: text: Main article: Domestic duck item-286 at level 4: picture item-286 at level 5: caption: Indian Runner ducks, a common breed of domestic ducks - item-287 at level 4: text: Ducks have many economic uses, b ... it weighs less than 1 kg (2.2 lb).[48] + item-287 at level 4: text: Ducks have many economic uses, b ... 
it weighs less than 1 kg (2.2 lb).[48] item-288 at level 3: section_header: Heraldry item-289 at level 4: picture item-289 at level 5: caption: Three black-colored ducks in the coat of arms of Maaninka[49] @@ -319,17 +319,17 @@ item-0 at level 0: unspecified: group _root_ item-306 at level 3: section_header: Citations item-307 at level 4: list: group ordered list item-308 at level 5: list_item: ^ "Duckling". The American Herit ... n Company. 2006. Retrieved 2015-05-22. - item-309 at level 5: list_item: ^ "Duckling". Kernerman English ... Ltd. 2000–2006. Retrieved 2015-05-22. + item-309 at level 5: list_item: ^ "Duckling". Kernerman English ... Ltd. 2000-2006. Retrieved 2015-05-22. item-310 at level 5: list_item: ^ Dohner, Janet Vorwald (2001). ... University Press. ISBN 978-0300138139. item-311 at level 5: list_item: ^ Visca, Curt; Visca, Kelley (20 ... Publishing Group. ISBN 9780823961566. item-312 at level 5: list_item: ^ a b c d Carboneras 1992, p. 536. - item-313 at level 5: list_item: ^ Livezey 1986, pp. 737–738. + item-313 at level 5: list_item: ^ Livezey 1986, pp. 737-738. item-314 at level 5: list_item: ^ Madsen, McHugh & de Kloet 1988, p. 452. - item-315 at level 5: list_item: ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354. + item-315 at level 5: list_item: ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353-354. item-316 at level 5: list_item: ^ a b c d e f Carboneras 1992, p. 540. item-317 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 191. item-318 at level 5: list_item: ^ Kear 2005, p. 448. - item-319 at level 5: list_item: ^ Kear 2005, p. 622–623. + item-319 at level 5: list_item: ^ Kear 2005, p. 622-623. item-320 at level 5: list_item: ^ Kear 2005, p. 686. item-321 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 193. item-322 at level 5: list_item: ^ a b c d e f g Carboneras 1992, p. 537. @@ -338,8 +338,8 @@ item-0 at level 0: unspecified: group _root_ item-325 at level 5: list_item: ^ Carboneras 1992, p. 538. 
item-326 at level 5: list_item: ^ Christidis & Boles 2008, p. 62. item-327 at level 5: list_item: ^ Shirihai 2008, pp. 239, 245. - item-328 at level 5: list_item: ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107. - item-329 at level 5: list_item: ^ Fitter, Fitter & Hosking 2000, pp. 52–3. + item-328 at level 5: list_item: ^ a b Pratt, Bruner & Berrett 1987, pp. 98-107. + item-329 at level 5: list_item: ^ Fitter, Fitter & Hosking 2000, pp. 52-3. item-330 at level 5: list_item: ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27. item-331 at level 5: list_item: ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02. item-332 at level 5: list_item: ^ Karl Mathiesen (16 March 2015) ... Guardian. Retrieved 13 November 2016. diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index 31978f31..6c897cf0 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -5271,7 +5271,7 @@ "label": "text", "prov": [], "orig": "In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes.[10] The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks – named for their method of feeding primarily at the surface of fresh water.[11] The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini.[12] The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater.[13] The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails.[14]", - "text": "In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes.[10] The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks – named 
for their method of feeding primarily at the surface of fresh water.[11] The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini.[12] The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater.[13] The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails.[14]" + "text": "In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes.[10] The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks - named for their method of feeding primarily at the surface of fresh water.[11] The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini.[12] The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater.[13] The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails.[14]" }, { "self_ref": "#/texts/247", @@ -5652,7 +5652,7 @@ "label": "text", "prov": [], "orig": "Female mallard ducks (as well as several other species in the genus Anas, such as the American and Pacific black ducks, spot-billed duck, northern pintail and common teal) make the classic \"quack\" sound while males make a similar but raspier sound that is sometimes written as \"breeeeze\",[29][self-published source?] but, despite widespread misconceptions, most species of duck do not \"quack\".[30] In general, ducks make a range of calls, including whistles, cooing, yodels and grunts. For example, the scaup – which are diving ducks – make a noise like \"scaup\" (hence their name). 
Calls may be loud displaying calls or quieter contact calls.", - "text": "Female mallard ducks (as well as several other species in the genus Anas, such as the American and Pacific black ducks, spot-billed duck, northern pintail and common teal) make the classic \"quack\" sound while males make a similar but raspier sound that is sometimes written as \"breeeeze\",[29][self-published source?] but, despite widespread misconceptions, most species of duck do not \"quack\".[30] In general, ducks make a range of calls, including whistles, cooing, yodels and grunts. For example, the scaup – which are diving ducks – make a noise like \"scaup\" (hence their name). Calls may be loud displaying calls or quieter contact calls." + "text": "Female mallard ducks (as well as several other species in the genus Anas, such as the American and Pacific black ducks, spot-billed duck, northern pintail and common teal) make the classic \"quack\" sound while males make a similar but raspier sound that is sometimes written as \"breeeeze\",[29][self-published source?] but, despite widespread misconceptions, most species of duck do not \"quack\".[30] In general, ducks make a range of calls, including whistles, cooing, yodels and grunts. For example, the scaup - which are diving ducks - make a noise like \"scaup\" (hence their name). Calls may be loud displaying calls or quieter contact calls." }, { "self_ref": "#/texts/272", @@ -5796,7 +5796,7 @@ "label": "text", "prov": [], "orig": "Humans have hunted ducks since prehistoric times. 
Excavations of middens in California dating to 7800 – 6400 BP have turned up bones of ducks, including at least one now-extinct flightless species.[33] Ducks were captured in \"significant numbers\" by Holocene inhabitants of the lower Ohio River valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl.[34] Neolithic hunters in locations as far apart as the Caribbean,[35] Scandinavia,[36] Egypt,[37] Switzerland,[38] and China relied on ducks as a source of protein for some or all of the year.[39] Archeological evidence shows that Māori people in New Zealand hunted the flightless Finsch's duck, possibly to extinction, though rat predation may also have contributed to its fate.[40] A similar end awaited the Chatham duck, a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers.[41] It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon.[35][42]", - "text": "Humans have hunted ducks since prehistoric times. 
Excavations of middens in California dating to 7800 – 6400 BP have turned up bones of ducks, including at least one now-extinct flightless species.[33] Ducks were captured in \"significant numbers\" by Holocene inhabitants of the lower Ohio River valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl.[34] Neolithic hunters in locations as far apart as the Caribbean,[35] Scandinavia,[36] Egypt,[37] Switzerland,[38] and China relied on ducks as a source of protein for some or all of the year.[39] Archeological evidence shows that Māori people in New Zealand hunted the flightless Finsch's duck, possibly to extinction, though rat predation may also have contributed to its fate.[40] A similar end awaited the Chatham duck, a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers.[41] It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon.[35][42]" + "text": "Humans have hunted ducks since prehistoric times. 
Excavations of middens in California dating to 7800 - 6400 BP have turned up bones of ducks, including at least one now-extinct flightless species.[33] Ducks were captured in \"significant numbers\" by Holocene inhabitants of the lower Ohio River valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl.[34] Neolithic hunters in locations as far apart as the Caribbean,[35] Scandinavia,[36] Egypt,[37] Switzerland,[38] and China relied on ducks as a source of protein for some or all of the year.[39] Archeological evidence shows that Māori people in New Zealand hunted the flightless Finsch's duck, possibly to extinction, though rat predation may also have contributed to its fate.[40] A similar end awaited the Chatham duck, a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers.[41] It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon.[35][42]" }, { "self_ref": "#/texts/281", @@ -5867,7 +5867,7 @@ "label": "text", "prov": [], "orig": "Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their down). Approximately 3 billion ducks are slaughtered each year for meat worldwide.[45] They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the mallard (Anas platyrhynchos), apart from the Muscovy duck (Cairina moschata).[46][47] The Call duck is another example of a domestic duck breed. Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1 kg (2.2 lb).[48]", - "text": "Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their down). 
Approximately 3 billion ducks are slaughtered each year for meat worldwide.[45] They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the mallard (Anas platyrhynchos), apart from the Muscovy duck (Cairina moschata).[46][47] The Call duck is another example of a domestic duck breed. Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1 kg (2.2 lb).[48]" + "text": "Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their down). Approximately 3 billion ducks are slaughtered each year for meat worldwide.[45] They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the mallard (Anas platyrhynchos), apart from the Muscovy duck (Cairina moschata).[46][47] The Call duck is another example of a domestic duck breed. Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1 kg (2.2 lb).[48]" }, { "self_ref": "#/texts/286", @@ -6136,7 +6136,7 @@ "label": "list_item", "prov": [], "orig": "^ \"Duckling\". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22.", - "text": "^ \"Duckling\". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22.", + "text": "^ \"Duckling\". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000-2006. Retrieved 2015-05-22.", "enumerated": true, "marker": "" }, @@ -6192,7 +6192,7 @@ "label": "list_item", "prov": [], "orig": "^ Livezey 1986, pp. 
737–738.", - "text": "^ Livezey 1986, pp. 737–738.", + "text": "^ Livezey 1986, pp. 737-738.", "enumerated": true, "marker": "" }, @@ -6220,7 +6220,7 @@ "label": "list_item", "prov": [], "orig": "^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.", - "text": "^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.", + "text": "^ Donne-Goussé, Laudet & Hänni 2002, pp. 353-354.", "enumerated": true, "marker": "" }, @@ -6276,7 +6276,7 @@ "label": "list_item", "prov": [], "orig": "^ Kear 2005, p. 622–623.", - "text": "^ Kear 2005, p. 622–623.", + "text": "^ Kear 2005, p. 622-623.", "enumerated": true, "marker": "" }, @@ -6402,7 +6402,7 @@ "label": "list_item", "prov": [], "orig": "^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.", - "text": "^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.", + "text": "^ a b Pratt, Bruner & Berrett 1987, pp. 98-107.", "enumerated": true, "marker": "" }, @@ -6416,7 +6416,7 @@ "label": "list_item", "prov": [], "orig": "^ Fitter, Fitter & Hosking 2000, pp. 52–3.", - "text": "^ Fitter, Fitter & Hosking 2000, pp. 52–3.", + "text": "^ Fitter, Fitter & Hosking 2000, pp. 52-3.", "enumerated": true, "marker": "" }, @@ -6472,7 +6472,7 @@ "label": "list_item", "prov": [], "orig": "^ Rohwer, Frank C.; Anderson, Michael G. (1988). \"Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl\". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5_4. ISBN 978-1-4615-6789-9.", - "text": "^ Rohwer, Frank C.; Anderson, Michael G. (1988). \"Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl\". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5_4. ISBN 978-1-4615-6789-9.", + "text": "^ Rohwer, Frank C.; Anderson, Michael G. (1988). \"Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl\". Current Ornithology. pp. 187-221. doi:10.1007/978-1-4615-6787-5_4. 
ISBN 978-1-4615-6789-9.", "enumerated": true, "marker": "" }, @@ -6486,7 +6486,7 @@ "label": "list_item", "prov": [], "orig": "^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). \"Long-Term Pair Bonds in Harlequin Ducks\". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797.", - "text": "^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). \"Long-Term Pair Bonds in Harlequin Ducks\". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797.", + "text": "^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). \"Long-Term Pair Bonds in Harlequin Ducks\". The Condor. 102 (1): 201-205. doi:10.1093/condor/102.1.201. hdl:10315/13797.", "enumerated": true, "marker": "" }, @@ -6951,7 +6951,7 @@ "label": "list_item", "prov": [], "orig": "Donne-Goussé, Carole; Laudet, Vincent; Hänni, Catherine (July 2002). \"A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis\". Molecular Phylogenetics and Evolution. 23 (3): 339–356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID 12099792.", - "text": "Donne-Goussé, Carole; Laudet, Vincent; Hänni, Catherine (July 2002). \"A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis\". Molecular Phylogenetics and Evolution. 23 (3): 339–356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID 12099792.", + "text": "Donne-Goussé, Carole; Laudet, Vincent; Hänni, Catherine (July 2002). \"A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis\". Molecular Phylogenetics and Evolution. 23 (3): 339-356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID 12099792.", "enumerated": false, "marker": "" }, @@ -6993,7 +6993,7 @@ "label": "list_item", "prov": [], "orig": "Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol. I: A–K. 
Santa Barbara: ABC-CLIO. ISBN 978-1-61069-412-4.", - "text": "Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol. I: A–K. Santa Barbara: ABC-CLIO. ISBN 978-1-61069-412-4.", + "text": "Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol. I: A-K. Santa Barbara: ABC-CLIO. ISBN 978-1-61069-412-4.", "enumerated": false, "marker": "" }, @@ -7077,7 +7077,7 @@ "label": "list_item", "prov": [], "orig": "Livezey, Bradley C. (October 1986). \"A phylogenetic analysis of recent Anseriform genera using morphological characters\" (PDF). The Auk. 103 (4): 737–754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09.", - "text": "Livezey, Bradley C. (October 1986). \"A phylogenetic analysis of recent Anseriform genera using morphological characters\" (PDF). The Auk. 103 (4): 737–754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09.", + "text": "Livezey, Bradley C. (October 1986). \"A phylogenetic analysis of recent Anseriform genera using morphological characters\" (PDF). The Auk. 103 (4): 737-754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09.", "enumerated": false, "marker": "" }, @@ -7091,7 +7091,7 @@ "label": "list_item", "prov": [], "orig": "Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). \"A partial classification of waterfowl (Anatidae) based on single-copy DNA\" (PDF). The Auk. 105 (3): 452–459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09.", - "text": "Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). \"A partial classification of waterfowl (Anatidae) based on single-copy DNA\" (PDF). The Auk. 105 (3): 452–459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09.", + "text": "Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). 
\"A partial classification of waterfowl (Anatidae) based on single-copy DNA\" (PDF). The Auk. 105 (3): 452-459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09.", "enumerated": false, "marker": "" }, diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md index d121e122..5826cc06 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md @@ -285,7 +285,7 @@ Mallard landing in approach -In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes.[10] The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks – named for their method of feeding primarily at the surface of fresh water.[11] The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini.[12] The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater.[13] The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails.[14] +In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes.[10] The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks - named for their method of feeding primarily at the surface of fresh water.[11] The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini.[12] The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater.[13] The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails.[14] A number of other species called ducks are not considered to be 'true ducks', and are typically placed in other 
subfamilies or tribes. The whistling ducks are assigned either to a tribe (Dendrocygnini) in the subfamily Anatinae or the subfamily Anserinae,[15] or to their own subfamily (Dendrocygninae) or family (Dendrocyganidae).[9][16] The freckled duck of Australia is either the sole member of the tribe Stictonettini in the subfamily Anserinae,[15] or in its own family, the Stictonettinae.[9] The shelducks make up the tribe Tadornini in the family Anserinae in some classifications,[15] and their own subfamily, Tadorninae, in others,[17] while the steamer ducks are either placed in the family Anserinae in the tribe Tachyerini[15] or lumped with the shelducks in the tribe Tadorini.[9] The perching ducks make up in the tribe Cairinini in the subfamily Anserinae in some classifications, while that tribe is eliminated in other classifications and its members assigned to the tribe Anatini.[9] The torrent duck is generally included in the subfamily Anserinae in the monotypic tribe Merganettini,[15] but is sometimes included in the tribe Tadornini.[18] The pink-eared duck is sometimes included as a true duck either in the tribe Anatini[15] or the tribe Malacorhynchini,[19] and other times is included with the shelducks in the tribe Tadornini.[15] @@ -345,7 +345,7 @@ Ducks generally only have one partner at a time, although the partnership usuall ### Communication -Female mallard ducks (as well as several other species in the genus Anas, such as the American and Pacific black ducks, spot-billed duck, northern pintail and common teal) make the classic "quack" sound while males make a similar but raspier sound that is sometimes written as "breeeeze",[29][self-published source?] but, despite widespread misconceptions, most species of duck do not "quack".[30] In general, ducks make a range of calls, including whistles, cooing, yodels and grunts. For example, the scaup – which are diving ducks – make a noise like "scaup" (hence their name). 
Calls may be loud displaying calls or quieter contact calls. +Female mallard ducks (as well as several other species in the genus Anas, such as the American and Pacific black ducks, spot-billed duck, northern pintail and common teal) make the classic "quack" sound while males make a similar but raspier sound that is sometimes written as "breeeeze",[29][self-published source?] but, despite widespread misconceptions, most species of duck do not "quack".[30] In general, ducks make a range of calls, including whistles, cooing, yodels and grunts. For example, the scaup - which are diving ducks - make a noise like "scaup" (hence their name). Calls may be loud displaying calls or quieter contact calls. A common urban legend claims that duck quacks do not echo; however, this has been proven to be false. This myth was first debunked by the Acoustics Research Centre at the University of Salford in 2003 as part of the British Association's Festival of Science.[31] It was also debunked in one of the earlier episodes of the popular Discovery Channel television show MythBusters.[32] @@ -365,7 +365,7 @@ Adult ducks are fast fliers, but may be caught on the water by large aquatic pre Main article: Waterfowl hunting -Humans have hunted ducks since prehistoric times. 
Excavations of middens in California dating to 7800 – 6400 BP have turned up bones of ducks, including at least one now-extinct flightless species.[33] Ducks were captured in "significant numbers" by Holocene inhabitants of the lower Ohio River valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl.[34] Neolithic hunters in locations as far apart as the Caribbean,[35] Scandinavia,[36] Egypt,[37] Switzerland,[38] and China relied on ducks as a source of protein for some or all of the year.[39] Archeological evidence shows that Māori people in New Zealand hunted the flightless Finsch's duck, possibly to extinction, though rat predation may also have contributed to its fate.[40] A similar end awaited the Chatham duck, a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers.[41] It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon.[35][42] +Humans have hunted ducks since prehistoric times. 
Excavations of middens in California dating to 7800 - 6400 BP have turned up bones of ducks, including at least one now-extinct flightless species.[33] Ducks were captured in "significant numbers" by Holocene inhabitants of the lower Ohio River valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl.[34] Neolithic hunters in locations as far apart as the Caribbean,[35] Scandinavia,[36] Egypt,[37] Switzerland,[38] and China relied on ducks as a source of protein for some or all of the year.[39] Archeological evidence shows that Māori people in New Zealand hunted the flightless Finsch's duck, possibly to extinction, though rat predation may also have contributed to its fate.[40] A similar end awaited the Chatham duck, a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers.[41] It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon.[35][42] In many areas, wild ducks (including ducks farmed and released into the wild) are hunted for food or sport,[43] by shooting, or by being trapped using duck decoys. Because an idle floating duck or a duck squatting on land cannot react to fly or move quickly, "a sitting duck" has come to mean "an easy target". These ducks may be contaminated by pollutants such as PCBs.[44] @@ -377,7 +377,7 @@ Indian Runner ducks, a common breed of domestic ducks -Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their down). Approximately 3 billion ducks are slaughtered each year for meat worldwide.[45] They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the mallard (Anas platyrhynchos), apart from the Muscovy duck (Cairina moschata).[46][47] The Call duck is another example of a domestic duck breed. 
Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1 kg (2.2 lb).[48] +Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their down). Approximately 3 billion ducks are slaughtered each year for meat worldwide.[45] They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the mallard (Anas platyrhynchos), apart from the Muscovy duck (Cairina moschata).[46][47] The Call duck is another example of a domestic duck breed. Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1 kg (2.2 lb).[48] ### Heraldry @@ -410,17 +410,17 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a ### Citations 1. ^ "Duckling". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22. -2. ^ "Duckling". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22. +2. ^ "Duckling". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000-2006. Retrieved 2015-05-22. 3. ^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN 978-0300138139. 4. ^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN 9780823961566. 5. ^ a b c d Carboneras 1992, p. 536. -6. ^ Livezey 1986, pp. 737–738. +6. ^ Livezey 1986, pp. 737-738. 7. ^ Madsen, McHugh & de Kloet 1988, p. 452. -8. ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354. +8. 
^ Donne-Goussé, Laudet & Hänni 2002, pp. 353-354. 9. ^ a b c d e f Carboneras 1992, p. 540. 10. ^ Elphick, Dunning & Sibley 2001, p. 191. 11. ^ Kear 2005, p. 448. -12. ^ Kear 2005, p. 622–623. +12. ^ Kear 2005, p. 622-623. 13. ^ Kear 2005, p. 686. 14. ^ Elphick, Dunning & Sibley 2001, p. 193. 15. ^ a b c d e f g Carboneras 1992, p. 537. @@ -429,13 +429,13 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a 18. ^ Carboneras 1992, p. 538. 19. ^ Christidis & Boles 2008, p. 62. 20. ^ Shirihai 2008, pp. 239, 245. -21. ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107. -22. ^ Fitter, Fitter & Hosking 2000, pp. 52–3. +21. ^ a b Pratt, Bruner & Berrett 1987, pp. 98-107. +22. ^ Fitter, Fitter & Hosking 2000, pp. 52-3. 23. ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27. 24. ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02. 25. ^ Karl Mathiesen (16 March 2015). "Don't feed the ducks bread, say conservationists". The Guardian. Retrieved 13 November 2016. -26. ^ Rohwer, Frank C.; Anderson, Michael G. (1988). "Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5\_4. ISBN 978-1-4615-6789-9. -27. ^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). "Long-Term Pair Bonds in Harlequin Ducks". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797. +26. ^ Rohwer, Frank C.; Anderson, Michael G. (1988). "Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl". Current Ornithology. pp. 187-221. doi:10.1007/978-1-4615-6787-5\_4. ISBN 978-1-4615-6789-9. +27. ^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). "Long-Term Pair Bonds in Harlequin Ducks". The Condor. 102 (1): 201-205. doi:10.1093/condor/102.1.201. hdl:10315/13797. 28. ^ "If You Find An Orphaned Duckling - Wildlife Rehabber". 
wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22. 29. ^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN 9780557901562.[self-published source] 30. ^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN 9780762797707. @@ -470,17 +470,17 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a - American Ornithologists' Union (1998). Checklist of North American Birds (PDF). Washington, DC: American Ornithologists' Union. ISBN 978-1-891276-00-2. Archived (PDF) from the original on 2022-10-09. - Carboneras, Carlos (1992). del Hoyo, Josep; Elliott, Andrew; Sargatal, Jordi (eds.). Handbook of the Birds of the World. Vol. 1: Ostrich to Ducks. Barcelona: Lynx Edicions. ISBN 978-84-87334-10-8. - Christidis, Les; Boles, Walter E., eds. (2008). Systematics and Taxonomy of Australian Birds. Collingwood, VIC: Csiro Publishing. ISBN 978-0-643-06511-6. -- Donne-Goussé, Carole; Laudet, Vincent; Hänni, Catherine (July 2002). "A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis". Molecular Phylogenetics and Evolution. 23 (3): 339–356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID 12099792. +- Donne-Goussé, Carole; Laudet, Vincent; Hänni, Catherine (July 2002). "A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis". Molecular Phylogenetics and Evolution. 23 (3): 339-356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID 12099792. - Elphick, Chris; Dunning, John B. Jr.; Sibley, David, eds. (2001). The Sibley Guide to Bird Life and Behaviour. London: Christopher Helm. ISBN 978-0-7136-6250-4. - Erlandson, Jon M. (1994). Early Hunter-Gatherers of the California Coast. New York, NY: Springer Science & Business Media. ISBN 978-1-4419-3231-0. -- Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol. I: A–K. 
Santa Barbara: ABC-CLIO. ISBN 978-1-61069-412-4. +- Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol. I: A-K. Santa Barbara: ABC-CLIO. ISBN 978-1-61069-412-4. - Fitter, Julian; Fitter, Daniel; Hosking, David (2000). Wildlife of the Galápagos. Princeton, NJ: Princeton University Press. ISBN 978-0-691-10295-5. - Higman, B. W. (2012). How Food Made History. Chichester, UK: John Wiley & Sons. ISBN 978-1-4051-8947-7. - Hume, Julian H. (2012). Extinct Birds. London: Christopher Helm. ISBN 978-1-4729-3744-5. - Jeffries, Richard (2008). Holocene Hunter-Gatherers of the Lower Ohio River Valley. Tuscaloosa: University of Alabama Press. ISBN 978-0-8173-1658-7. - Kear, Janet, ed. (2005). Ducks, Geese and Swans: Species Accounts (Cairina to Mergus). Bird Families of the World. Oxford: Oxford University Press. ISBN 978-0-19-861009-0. -- Livezey, Bradley C. (October 1986). "A phylogenetic analysis of recent Anseriform genera using morphological characters" (PDF). The Auk. 103 (4): 737–754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09. -- Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). "A partial classification of waterfowl (Anatidae) based on single-copy DNA" (PDF). The Auk. 105 (3): 452–459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09. +- Livezey, Bradley C. (October 1986). "A phylogenetic analysis of recent Anseriform genera using morphological characters" (PDF). The Auk. 103 (4): 737-754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09. +- Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). "A partial classification of waterfowl (Anatidae) based on single-copy DNA" (PDF). The Auk. 105 (3): 452-459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09. - Maisels, Charles Keith (1999). Early Civilizations of the Old World. London: Routledge. ISBN 978-0-415-10975-8. - Pratt, H. 
Douglas; Bruner, Phillip L.; Berrett, Delwyn G. (1987). A Field Guide to the Birds of Hawaii and the Tropical Pacific. Princeton, NJ: Princeton University Press. ISBN 0-691-02399-9. - Rau, Charles (1876). Early Man in Europe. New York: Harper & Brothers. LCCN 05040168. diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 014f4c4c..c5509e6e 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -100,6 +100,22 @@ def test_ordered_lists(): assert doc.export_to_markdown() == pair[1], f"Error in case {idx}" +def test_unicode_characters(): + raw_html = "