From 6beec77788cfc1f9bdb5536a784d495745b510c3 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Fri, 27 Jun 2025 10:21:57 +0200 Subject: [PATCH] update backends to leverage new list modeling Signed-off-by: Panos Vagenas --- docling/backend/html_backend.py | 73 ++- docling/backend/md_backend.py | 35 +- docling/backend/mspowerpoint_backend.py | 9 +- docling/backend/msword_backend.py | 67 ++- .../data/groundtruth/docling_v2/blocks.md.md | 2 +- .../docling_v2/example_01.html.itxt | 5 +- .../docling_v2/example_01.html.json | 54 ++- .../groundtruth/docling_v2/example_01.html.md | 7 +- .../docling_v2/example_02.html.json | 4 +- .../groundtruth/docling_v2/example_02.html.md | 4 +- .../docling_v2/example_03.html.json | 8 +- .../groundtruth/docling_v2/example_03.html.md | 8 +- .../docling_v2/inline_and_formatting.md.md | 14 +- .../docling_v2/inline_and_formatting.md.yaml | 14 +- .../docling_v2/mixed_without_h1.md.md | 2 +- .../groundtruth/docling_v2/textbox.docx.itxt | 80 ++-- .../groundtruth/docling_v2/textbox.docx.json | 453 ++++++++++-------- .../groundtruth/docling_v2/textbox.docx.md | 6 +- .../docling_v2/wiki_duck.html.json | 110 ++--- .../groundtruth/docling_v2/wiki_duck.html.md | 110 ++--- tests/data/html/example_01.html | 4 + 21 files changed, 599 insertions(+), 470 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 7c716908..3b9a55a5 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -17,6 +17,7 @@ from docling_core.types.doc import ( TableData, ) from docling_core.types.doc.document import ContentLayer +from pydantic import BaseModel from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend @@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [ ] +class _Context(BaseModel): + list_ordered_flag_by_ref: dict[str, bool] = {} + list_start_by_ref: dict[str, int] = {} + + class HTMLDocumentBackend(DeclarativeDocumentBackend): @override def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): @@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.max_levels = 10 self.level = 0 self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} + self.ctx = _Context() for i in range(self.max_levels): self.parents[i] = None @@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.content_layer = ( ContentLayer.BODY if headers is None else ContentLayer.FURNITURE ) + self.ctx = _Context() # reset context self.walk(content, doc) else: raise RuntimeError( @@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def handle_list(self, element: Tag, doc: DoclingDocument) -> None: """Handles list tags (ul, ol) and their list items.""" - if element.name == "ul": - # create a list group - self.parents[self.level + 1] = doc.add_group( - parent=self.parents[self.level], - name="list", - label=GroupLabel.LIST, - content_layer=self.content_layer, - ) - elif element.name == "ol": + start: Optional[int] = None + if is_ordered := element.name == "ol": start_attr = element.get("start") - start: int = ( - int(start_attr) - if isinstance(start_attr, str) and start_attr.isnumeric() - else 1 - ) - # create a list group - self.parents[self.level + 1] = doc.add_group( - parent=self.parents[self.level], - name="ordered list" + (f" start {start}" if start != 1 else ""), - label=GroupLabel.ORDERED_LIST, - content_layer=self.content_layer, - ) + if isinstance(start_attr, str) and start_attr.isnumeric(): + start = int(start_attr) + name = "ordered list" + (f" start {start}" if start is not None else "") + else: + name = "list" + # create a list group + list_group = doc.add_list_group( + name=name, + parent=self.parents[self.level], + content_layer=self.content_layer, + ) + self.parents[self.level + 1] = list_group + self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered + if is_ordered and start is not None: + self.ctx.list_start_by_ref[list_group.self_ref] = start + self.level += 1 self.walk(element, doc) @@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if parent is None: _log.debug(f"list-item has no parent in DoclingDocument: {element}") return - parent_label: str = parent.label - index_in_list = len(parent.children) + 1 - if ( - parent_label == GroupLabel.ORDERED_LIST - and isinstance(parent, GroupItem) - and parent.name - ): - start_in_list: str = parent.name.split(" ")[-1] - start: int = int(start_in_list) if start_in_list.isnumeric() else 1 - index_in_list += start - 1 + enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False) + if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)): + marker = f"{start + len(parent.children)}." + else: + marker = "" if nested_list: # Text in list item can be hidden within hierarchy, hence @@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text = text.replace("\n", "").replace("\r", "") text = " ".join(text.split()).strip() - marker = "" - enumerated = False - if parent_label == GroupLabel.ORDERED_LIST: - marker = str(index_in_list) - enumerated = True - if len(text) > 0: # create a list-item self.parents[self.level + 1] = doc.add_list_item( @@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): elif element.text.strip(): text = element.text.strip() - marker = "" - enumerated = False - if parent_label == GroupLabel.ORDERED_LIST: - marker = f"{index_in_list!s}." - enumerated = True doc.add_list_item( text=text, enumerated=enumerated, diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index e2e970da..214e8d1e 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -14,13 +14,12 @@ from docling_core.types.doc import ( DocItemLabel, DoclingDocument, DocumentOrigin, - GroupLabel, NodeItem, TableCell, TableData, TextItem, ) -from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList +from docling_core.types.doc.document import Formatting from marko import Markdown from pydantic import AnyUrl, BaseModel, Field, TypeAdapter from typing_extensions import Annotated @@ -51,6 +50,7 @@ class _HeadingCreationPayload(BaseModel): class _ListItemCreationPayload(BaseModel): kind: Literal["list_item"] = "list_item" + enumerated: bool _CreationPayload = Annotated[ @@ -187,15 +187,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc: DoclingDocument, parent_item: Optional[NodeItem], text: str, + enumerated: bool, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, ): - if not isinstance(parent_item, (OrderedList, UnorderedList)): - _log.warning("ListItem would have not had a list parent, adding one.") - parent_item = doc.add_unordered_list(parent=parent_item) item = doc.add_list_item( text=text, - enumerated=(isinstance(parent_item, OrderedList)), + enumerated=enumerated, parent=parent_item, formatting=formatting, hyperlink=hyperlink, @@ -238,6 +236,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): creation_stack: list[ _CreationPayload ], # stack for lazy item creation triggered deep in marko's AST (on RawText) + list_ordered_flag_by_ref: dict[str, bool], parent_item: Optional[NodeItem] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -275,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self._close_table(doc) _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") if has_non_empty_list_items: - label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST - parent_item = doc.add_group( - label=label, name="list", parent=parent_item - ) + parent_item = doc.add_list_group(name="list", parent=parent_item) + list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered elif ( isinstance(element, marko.block.ListItem) @@ -289,16 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self._close_table(doc) _log.debug(" - List item") + enumerated = ( + list_ordered_flag_by_ref.get(parent_item.self_ref, False) + if parent_item + else False + ) if len(child.children) > 1: # inline group will be created further down parent_item = self._create_list_item( doc=doc, parent_item=parent_item, text="", + enumerated=enumerated, formatting=formatting, hyperlink=hyperlink, ) else: - creation_stack.append(_ListItemCreationPayload()) + creation_stack.append(_ListItemCreationPayload(enumerated=enumerated)) elif isinstance(element, marko.inline.Image): self._close_table(doc) @@ -349,10 +352,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): while len(creation_stack) > 0: to_create = creation_stack.pop() if isinstance(to_create, _ListItemCreationPayload): + enumerated = ( + list_ordered_flag_by_ref.get( + parent_item.self_ref, False + ) + if parent_item + else False + ) parent_item = self._create_list_item( doc=doc, parent_item=parent_item, text=snippet_text, + enumerated=enumerated, formatting=formatting, hyperlink=hyperlink, ) @@ -453,6 +464,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc=doc, visited=visited, creation_stack=creation_stack, + list_ordered_flag_by_ref=list_ordered_flag_by_ref, parent_item=parent_item, formatting=formatting, hyperlink=hyperlink, @@ -497,6 +509,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): parent_item=None, visited=set(), creation_stack=[], + list_ordered_flag_by_ref={}, ) self._close_table(doc=doc) # handle any last hanging table diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 63aa9e93..f512fb7e 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB return prov - def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): + def handle_text_elements( + self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size + ): is_list_group_created = False enum_list_item_value = 0 new_list = None @@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB enumerated = bullet_type == "Numbered" if not is_list_group_created: - new_list = doc.add_group( - label=GroupLabel.ORDERED_LIST - if enumerated - else GroupLabel.LIST, + new_list = doc.add_list_group( name="list", parent=parent_slide, ) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 3e84d643..abbcc6f6 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -10,11 +10,12 @@ from docling_core.types.doc import ( DocumentOrigin, GroupLabel, ImageRef, + ListGroup, NodeItem, TableCell, TableData, ) -from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList +from docling_core.types.doc.document import Formatting from docx import Document from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc @@ -688,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): paragraph_elements: list, ) -> Optional[NodeItem]: return ( - doc.add_group(label=GroupLabel.INLINE, parent=prev_parent) + doc.add_inline_group(parent=prev_parent) if len(paragraph_elements) > 1 else prev_parent ) @@ -781,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: # Inline equation level = self._get_level() - inline_equation = doc.add_group( - label=GroupLabel.INLINE, parent=self.parents[level - 1] - ) + inline_equation = doc.add_inline_group(parent=self.parents[level - 1]) text_tmp = text for eq in equations: if len(text_tmp) == 0: @@ -931,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level: int, ) -> None: # This should not happen by construction - if not isinstance(self.parents[level], (OrderedList, UnorderedList)): + if not isinstance(self.parents[level], ListGroup): return + if not elements: + return + if len(elements) == 1: text, format, hyperlink = elements[0] - doc.add_list_item( - marker=marker, - enumerated=enumerated, - parent=self.parents[level], - text=text, - formatting=format, - hyperlink=hyperlink, - ) + if text: + doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text=text, + formatting=format, + hyperlink=hyperlink, + ) else: new_item = doc.add_list_item( marker=marker, @@ -950,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[level], text="", ) - new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item) + new_parent = doc.add_inline_group(parent=new_item) for text, format, hyperlink in elements: - doc.add_text( - label=DocItemLabel.TEXT, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) + if text: + doc.add_text( + label=DocItemLabel.TEXT, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) def _add_list_item( self, @@ -979,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if self._prev_numid() is None: # Open new list self.level_at_new_list = level - self.parents[level] = doc.add_group( - label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] + self.parents[level] = doc.add_list_group( + name="list", parent=self.parents[level - 1] ) # Set marker and enumerated arguments if this is an enumeration element. @@ -1001,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level_at_new_list + prev_indent + 1, self.level_at_new_list + ilevel + 1, ): - # Determine if this is an unordered list or an ordered list. - # Set GroupLabel.ORDERED_LIST when it fits. self.listIter = 0 - if is_numbered: - self.parents[i] = doc.add_group( - label=GroupLabel.ORDERED_LIST, - name="list", - parent=self.parents[i - 1], - ) - else: - self.parents[i] = doc.add_group( - label=GroupLabel.LIST, name="list", parent=self.parents[i - 1] - ) + self.parents[i] = doc.add_list_group( + name="list", parent=self.parents[i - 1] + ) # TODO: Set marker and enumerated arguments if this is an enumeration element. self.listIter += 1 diff --git a/tests/data/groundtruth/docling_v2/blocks.md.md b/tests/data/groundtruth/docling_v2/blocks.md.md index 5269e7d8..6a194066 100644 --- a/tests/data/groundtruth/docling_v2/blocks.md.md +++ b/tests/data/groundtruth/docling_v2/blocks.md.md @@ -6,7 +6,7 @@ Empty unordered list: Ordered list: -- bar +1. bar Empty ordered list: diff --git a/tests/data/groundtruth/docling_v2/example_01.html.itxt b/tests/data/groundtruth/docling_v2/example_01.html.itxt index 88c5a327..5db91c54 100644 --- a/tests/data/groundtruth/docling_v2/example_01.html.itxt +++ b/tests/data/groundtruth/docling_v2/example_01.html.itxt @@ -9,4 +9,7 @@ item-0 at level 0: unspecified: group _root_ item-8 at level 4: list_item: Second item in unordered list item-9 at level 3: list: group ordered list item-10 at level 4: list_item: First item in ordered list - item-11 at level 4: list_item: Second item in ordered list \ No newline at end of file + item-11 at level 4: list_item: Second item in ordered list + item-12 at level 3: list: group ordered list start 42 + item-13 at level 4: list_item: First item in ordered list with start + item-14 at level 4: list_item: Second item in ordered list with start \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_01.html.json b/tests/data/groundtruth/docling_v2/example_01.html.json index 3a158001..2e8e5104 100644 --- a/tests/data/groundtruth/docling_v2/example_01.html.json +++ b/tests/data/groundtruth/docling_v2/example_01.html.json @@ -4,7 +4,7 @@ "name": "example_01", "origin": { "mimetype": "text/html", - "binary_hash": 13782069548509991617, + "binary_hash": 13726679883013609282, "filename": "example_01.html" }, "furniture": { @@ -59,6 +59,23 @@ "content_layer": "body", "name": "ordered list", "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/2" + }, + "children": [ + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + } + ], + "content_layer": "body", + "name": "ordered list start 42", + "label": "list" } ], "texts": [ @@ -110,6 +127,9 @@ }, { "$ref": "#/groups/1" + }, + { + "$ref": "#/groups/2" } ], "content_layer": "body", @@ -170,7 +190,7 @@ "prov": [], "orig": "First item in ordered list", "text": "First item in ordered list", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -184,8 +204,36 @@ "prov": [], "orig": "Second item in ordered list", "text": "Second item in ordered list", - "enumerated": false, + "enumerated": true, "marker": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "First item in ordered list with start", + "text": "First item in ordered list with start", + "enumerated": true, + "marker": "42." + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Second item in ordered list with start", + "text": "Second item in ordered list with start", + "enumerated": true, + "marker": "43." } ], "pictures": [ diff --git a/tests/data/groundtruth/docling_v2/example_01.html.md b/tests/data/groundtruth/docling_v2/example_01.html.md index ba42409d..f36f7852 100644 --- a/tests/data/groundtruth/docling_v2/example_01.html.md +++ b/tests/data/groundtruth/docling_v2/example_01.html.md @@ -11,5 +11,8 @@ Some background information here. - First item in unordered list - Second item in unordered list -- First item in ordered list -- Second item in ordered list \ No newline at end of file +1. First item in ordered list +2. Second item in ordered list + +42. First item in ordered list with start +43. Second item in ordered list with start \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_02.html.json b/tests/data/groundtruth/docling_v2/example_02.html.json index b8a6c9c4..786a26c4 100644 --- a/tests/data/groundtruth/docling_v2/example_02.html.json +++ b/tests/data/groundtruth/docling_v2/example_02.html.json @@ -167,7 +167,7 @@ "prov": [], "orig": "First item in ordered list", "text": "First item in ordered list", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -181,7 +181,7 @@ "prov": [], "orig": "Second item in ordered list", "text": "Second item in ordered list", - "enumerated": false, + "enumerated": true, "marker": "" } ], diff --git a/tests/data/groundtruth/docling_v2/example_02.html.md b/tests/data/groundtruth/docling_v2/example_02.html.md index 8c0fecd4..2b0a2ceb 100644 --- a/tests/data/groundtruth/docling_v2/example_02.html.md +++ b/tests/data/groundtruth/docling_v2/example_02.html.md @@ -9,5 +9,5 @@ Some background information here. - First item in unordered list - Second item in unordered list -- First item in ordered list -- Second item in ordered list \ No newline at end of file +1. First item in ordered list +2. Second item in ordered list \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_03.html.json b/tests/data/groundtruth/docling_v2/example_03.html.json index b9d331a6..8c8f91ba 100644 --- a/tests/data/groundtruth/docling_v2/example_03.html.json +++ b/tests/data/groundtruth/docling_v2/example_03.html.json @@ -257,7 +257,7 @@ "prov": [], "orig": "First item in ordered list", "text": "First item in ordered list", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -271,7 +271,7 @@ "prov": [], "orig": "Nested ordered item 1", "text": "Nested ordered item 1", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -285,7 +285,7 @@ "prov": [], "orig": "Nested ordered item 2", "text": "Nested ordered item 2", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -299,7 +299,7 @@ "prov": [], "orig": "Second item in ordered list", "text": "Second item in ordered list", - "enumerated": false, + "enumerated": true, "marker": "" }, { diff --git a/tests/data/groundtruth/docling_v2/example_03.html.md b/tests/data/groundtruth/docling_v2/example_03.html.md index ceff1e39..23158ed9 100644 --- a/tests/data/groundtruth/docling_v2/example_03.html.md +++ b/tests/data/groundtruth/docling_v2/example_03.html.md @@ -13,10 +13,10 @@ Some background information here. - Nested item 2 - Second item in unordered list -- First item in ordered list - - Nested ordered item 1 - - Nested ordered item 2 -- Second item in ordered list +1. First item in ordered list + 1. Nested ordered item 1 + 2. Nested ordered item 2 +2. Second item in ordered list ## Data Table diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md index 3f18368d..130375c8 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md @@ -6,13 +6,13 @@ Foo *emphasis* **strong emphasis** ***both*** . Create your feature branch: `git checkout -b feature/AmazingFeature` . -- Pull the [**repository**](https://github.com/docling-project/docling) . -- Create your feature branch ( `git checkout -b feature/AmazingFeature` ) -- Commit your changes ( `git commit -m 'Add some AmazingFeature'` ) -- Push to the branch ( `git push origin feature/AmazingFeature` ) -- Open a Pull Request -- **Whole list item has same formatting** -- List item has *mixed or partial* formatting +1. Pull the [**repository**](https://github.com/docling-project/docling) . +2. Create your feature branch ( `git checkout -b feature/AmazingFeature` ) +3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` ) +4. Push to the branch ( `git push origin feature/AmazingFeature` ) +5. Open a Pull Request +6. **Whole list item has same formatting** +7. List item has *mixed or partial* formatting # *Whole heading is italic* diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml index 409c6c03..771a606a 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml @@ -252,7 +252,7 @@ texts: - children: - $ref: '#/groups/3' content_layer: body - enumerated: false + enumerated: true label: list_item marker: '' orig: '' @@ -298,7 +298,7 @@ texts: - children: - $ref: '#/groups/4' content_layer: body - enumerated: false + enumerated: true label: list_item marker: '' orig: '' @@ -341,7 +341,7 @@ texts: - children: - $ref: '#/groups/5' content_layer: body - enumerated: false + enumerated: true label: list_item marker: '' orig: '' @@ -384,7 +384,7 @@ texts: - children: - $ref: '#/groups/6' content_layer: body - enumerated: false + enumerated: true label: list_item marker: '' orig: '' @@ -426,7 +426,7 @@ texts: text: ) - children: [] content_layer: body - enumerated: false + enumerated: true label: list_item marker: '' orig: Open a Pull Request @@ -437,7 +437,7 @@ texts: text: Open a Pull Request - children: [] content_layer: body - enumerated: false + enumerated: true formatting: bold: true italic: false @@ -455,7 +455,7 @@ texts: - children: - $ref: '#/groups/7' content_layer: body - enumerated: false + enumerated: true label: list_item marker: '' orig: '' diff --git a/tests/data/groundtruth/docling_v2/mixed_without_h1.md.md b/tests/data/groundtruth/docling_v2/mixed_without_h1.md.md index 9445866b..5f76d50c 100644 --- a/tests/data/groundtruth/docling_v2/mixed_without_h1.md.md +++ b/tests/data/groundtruth/docling_v2/mixed_without_h1.md.md @@ -3,6 +3,6 @@ - A. first - subitem - B. second - - strange + 1. strange The end! diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.itxt b/tests/data/groundtruth/docling_v2/textbox.docx.itxt index cfd56bd7..4558be5e 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt +++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt @@ -29,58 +29,62 @@ item-0 at level 0: unspecified: group _root_ item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. item-25 at level 2: paragraph: item-26 at level 1: list: group list - item-27 at level 2: list_item: + item-27 at level 1: paragraph: item-28 at level 1: paragraph: item-29 at level 1: paragraph: item-30 at level 1: paragraph: item-31 at level 1: paragraph: - item-32 at level 1: paragraph: - item-33 at level 1: section: group textbox - item-34 at level 2: paragraph: Health Bureau: - item-35 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. - item-36 at level 2: list: group list - item-37 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. - item-38 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. - item-39 at level 2: paragraph: - item-40 at level 1: list: group list - item-41 at level 2: list_item: - item-42 at level 1: paragraph: - item-43 at level 1: section: group textbox - item-44 at level 2: paragraph: Department of Education: + item-32 at level 1: section: group textbox + item-33 at level 2: paragraph: Health Bureau: + item-34 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. + item-35 at level 2: list: group list + item-36 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. + item-37 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. + item-38 at level 2: paragraph: + item-39 at level 1: list: group list + item-40 at level 1: paragraph: + item-41 at level 1: section: group textbox + item-42 at level 2: paragraph: Department of Education: Collabo ... vention measures at all school levels. + item-43 at level 1: paragraph: + item-44 at level 1: paragraph: item-45 at level 1: paragraph: item-46 at level 1: paragraph: item-47 at level 1: paragraph: item-48 at level 1: paragraph: item-49 at level 1: paragraph: - item-50 at level 1: paragraph: - item-51 at level 1: paragraph: - item-52 at level 1: paragraph: - item-53 at level 1: paragraph: - item-54 at level 1: paragraph: - item-55 at level 1: section: group textbox - item-56 at level 2: paragraph: Whether the epidemic has eased. - item-57 at level 2: paragraph: + item-50 at level 1: section: group textbox + item-51 at level 2: inline: group group + item-52 at level 3: paragraph: The Health Bureau will handle + item-53 at level 3: paragraph: reporting and specimen collection + item-54 at level 3: paragraph: . + item-55 at level 2: paragraph: + item-56 at level 1: paragraph: + item-57 at level 1: paragraph: item-58 at level 1: paragraph: item-59 at level 1: section: group textbox - item-60 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. - item-61 at level 2: paragraph: No + item-60 at level 2: paragraph: Whether the epidemic has eased. + item-61 at level 2: paragraph: item-62 at level 1: paragraph: - item-63 at level 1: paragraph: - item-64 at level 1: section: group textbox - item-65 at level 2: paragraph: Yes + item-63 at level 1: section: group textbox + item-64 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. + item-65 at level 2: paragraph: No item-66 at level 1: paragraph: - item-67 at level 1: section: group textbox - item-68 at level 2: paragraph: Yes - item-69 at level 1: paragraph: + item-67 at level 1: paragraph: + item-68 at level 1: section: group textbox + item-69 at level 2: paragraph: Yes item-70 at level 1: paragraph: item-71 at level 1: section: group textbox - item-72 at level 2: paragraph: Case closed. - item-73 at level 2: paragraph: - item-74 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. - item-75 at level 1: paragraph: - item-76 at level 1: section: group textbox - item-77 at level 2: paragraph: No - item-78 at level 1: paragraph: + item-72 at level 2: paragraph: Yes + item-73 at level 1: paragraph: + item-74 at level 1: paragraph: + item-75 at level 1: section: group textbox + item-76 at level 2: paragraph: Case closed. + item-77 at level 2: paragraph: + item-78 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. item-79 at level 1: paragraph: - item-80 at level 1: paragraph: \ No newline at end of file + item-80 at level 1: section: group textbox + item-81 at level 2: paragraph: No + item-82 at level 1: paragraph: + item-83 at level 1: paragraph: + item-84 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.json b/tests/data/groundtruth/docling_v2/textbox.docx.json index 3c03abdb..9300c933 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.json +++ b/tests/data/groundtruth/docling_v2/textbox.docx.json @@ -65,6 +65,9 @@ { "$ref": "#/groups/6" }, + { + "$ref": "#/texts/19" + }, { "$ref": "#/texts/20" }, @@ -77,9 +80,6 @@ { "$ref": "#/texts/23" }, - { - "$ref": "#/texts/24" - }, { "$ref": "#/groups/7" }, @@ -87,11 +87,17 @@ "$ref": "#/groups/9" }, { - "$ref": "#/texts/31" + "$ref": "#/texts/29" }, { "$ref": "#/groups/10" }, + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, { "$ref": "#/texts/33" }, @@ -108,67 +114,64 @@ "$ref": "#/texts/37" }, { - "$ref": "#/texts/38" - }, - { - "$ref": "#/texts/39" - }, - { - "$ref": "#/texts/40" - }, - { - "$ref": "#/texts/41" + "$ref": "#/groups/11" }, { "$ref": "#/texts/42" }, { - "$ref": "#/groups/11" + "$ref": "#/texts/43" }, { - "$ref": "#/texts/45" - }, - { - "$ref": "#/groups/12" - }, - { - "$ref": "#/texts/48" - }, - { - "$ref": "#/texts/49" + "$ref": "#/texts/44" }, { "$ref": "#/groups/13" }, { - "$ref": "#/texts/51" + "$ref": "#/texts/47" }, { "$ref": "#/groups/14" }, { - "$ref": "#/texts/53" + "$ref": "#/texts/50" }, { - "$ref": "#/texts/54" + "$ref": "#/texts/51" }, { "$ref": "#/groups/15" }, { - "$ref": "#/texts/58" + "$ref": "#/texts/53" }, { "$ref": "#/groups/16" }, + { + "$ref": "#/texts/55" + }, + { + "$ref": "#/texts/56" + }, + { + "$ref": "#/groups/17" + }, { "$ref": "#/texts/60" }, { - "$ref": "#/texts/61" + "$ref": "#/groups/18" }, { "$ref": "#/texts/62" + }, + { + "$ref": "#/texts/63" + }, + { + "$ref": "#/texts/64" } ], "content_layer": "body", @@ -277,11 +280,7 @@ "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/19" - } - ], + "children": [], "content_layer": "body", "name": "list", "label": "list" @@ -293,16 +292,16 @@ }, "children": [ { - "$ref": "#/texts/25" + "$ref": "#/texts/24" }, { - "$ref": "#/texts/26" + "$ref": "#/texts/25" }, { "$ref": "#/groups/8" }, { - "$ref": "#/texts/29" + "$ref": "#/texts/28" } ], "content_layer": "body", @@ -316,10 +315,10 @@ }, "children": [ { - "$ref": "#/texts/27" + "$ref": "#/texts/26" }, { - "$ref": "#/texts/28" + "$ref": "#/texts/27" } ], "content_layer": "body", @@ -331,11 +330,7 @@ "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/30" - } - ], + "children": [], "content_layer": "body", "name": "list", "label": "list" @@ -347,7 +342,7 @@ }, "children": [ { - "$ref": "#/texts/32" + "$ref": "#/texts/30" } ], "content_layer": "body", @@ -361,10 +356,10 @@ }, "children": [ { - "$ref": "#/texts/43" + "$ref": "#/groups/12" }, { - "$ref": "#/texts/44" + "$ref": "#/texts/41" } ], "content_layer": "body", @@ -374,19 +369,22 @@ { "self_ref": "#/groups/12", "parent": { - "$ref": "#/body" + "$ref": "#/groups/11" }, "children": [ { - "$ref": "#/texts/46" + "$ref": "#/texts/38" }, { - "$ref": "#/texts/47" + "$ref": "#/texts/39" + }, + { + "$ref": "#/texts/40" } ], "content_layer": "body", - "name": "textbox", - "label": "section" + "name": "group", + "label": "inline" }, { "self_ref": "#/groups/13", @@ -395,7 +393,10 @@ }, "children": [ { - "$ref": "#/texts/50" + "$ref": "#/texts/45" + }, + { + "$ref": "#/texts/46" } ], "content_layer": "body", @@ -409,7 +410,10 @@ }, "children": [ { - "$ref": "#/texts/52" + "$ref": "#/texts/48" + }, + { + "$ref": "#/texts/49" } ], "content_layer": "body", @@ -423,13 +427,7 @@ }, "children": [ { - "$ref": "#/texts/55" - }, - { - "$ref": "#/texts/56" - }, - { - "$ref": "#/texts/57" + "$ref": "#/texts/52" } ], "content_layer": "body", @@ -442,6 +440,26 @@ "$ref": "#/body" }, "children": [ + { + "$ref": "#/texts/54" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/17", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/57" + }, + { + "$ref": "#/texts/58" + }, { "$ref": "#/texts/59" } @@ -449,6 +467,20 @@ "content_layer": "body", "name": "textbox", "label": "section" + }, + { + "self_ref": "#/groups/18", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/61" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" } ], "texts": [ @@ -745,16 +777,14 @@ { "self_ref": "#/texts/19", "parent": { - "$ref": "#/groups/6" + "$ref": "#/body" }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "paragraph", "prov": [], "orig": "", - "text": "", - "enumerated": false, - "marker": "" + "text": "" }, { "self_ref": "#/texts/20", @@ -806,18 +836,6 @@ }, { "self_ref": "#/texts/24", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/25", "parent": { "$ref": "#/groups/7" }, @@ -836,7 +854,7 @@ } }, { - "self_ref": "#/texts/26", + "self_ref": "#/texts/25", "parent": { "$ref": "#/groups/7" }, @@ -855,7 +873,7 @@ } }, { - "self_ref": "#/texts/27", + "self_ref": "#/texts/26", "parent": { "$ref": "#/groups/8" }, @@ -876,7 +894,7 @@ "marker": "" }, { - "self_ref": "#/texts/28", + "self_ref": "#/texts/27", "parent": { "$ref": "#/groups/8" }, @@ -897,7 +915,7 @@ "marker": "" }, { - "self_ref": "#/texts/29", + "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/7" }, @@ -909,21 +927,7 @@ "text": "" }, { - "self_ref": "#/texts/30", - "parent": { - "$ref": "#/groups/9" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "", - "text": "", - "enumerated": false, - "marker": "" - }, - { - "self_ref": "#/texts/31", + "self_ref": "#/texts/29", "parent": { "$ref": "#/body" }, @@ -935,7 +939,7 @@ "text": "" }, { - "self_ref": "#/texts/32", + "self_ref": "#/texts/30", "parent": { "$ref": "#/groups/10" }, @@ -953,6 +957,30 @@ "script": "baseline" } }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/33", "parent": { @@ -1016,43 +1044,64 @@ { "self_ref": "#/texts/38", "parent": { - "$ref": "#/body" + "$ref": "#/groups/12" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "", - "text": "" + "orig": "The Health Bureau will handle", + "text": "The Health Bureau will handle", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } }, { "self_ref": "#/texts/39", "parent": { - "$ref": "#/body" + "$ref": "#/groups/12" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "", - "text": "" + "orig": "reporting and specimen collection", + "text": "reporting and specimen collection", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } }, { "self_ref": "#/texts/40", "parent": { - "$ref": "#/body" + "$ref": "#/groups/12" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "", - "text": "" + "orig": ".", + "text": ".", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } }, { "self_ref": "#/texts/41", "parent": { - "$ref": "#/body" + "$ref": "#/groups/11" }, "children": [], "content_layer": "body", @@ -1076,7 +1125,31 @@ { "self_ref": "#/texts/43", "parent": { - "$ref": "#/groups/11" + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", @@ -1093,9 +1166,9 @@ } }, { - "self_ref": "#/texts/44", + "self_ref": "#/texts/46", "parent": { - "$ref": "#/groups/11" + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", @@ -1105,7 +1178,7 @@ "text": "" }, { - "self_ref": "#/texts/45", + "self_ref": "#/texts/47", "parent": { "$ref": "#/body" }, @@ -1117,9 +1190,9 @@ "text": "" }, { - "self_ref": "#/texts/46", + "self_ref": "#/texts/48", "parent": { - "$ref": "#/groups/12" + "$ref": "#/groups/14" }, "children": [], "content_layer": "body", @@ -1136,9 +1209,9 @@ } }, { - "self_ref": "#/texts/47", + "self_ref": "#/texts/49", "parent": { - "$ref": "#/groups/12" + "$ref": "#/groups/14" }, "children": [], "content_layer": "body", @@ -1154,48 +1227,17 @@ "script": "baseline" } }, - { - "self_ref": "#/texts/48", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/49", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, { "self_ref": "#/texts/50", "parent": { - "$ref": "#/groups/13" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Yes", - "text": "Yes", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } + "orig": "", + "text": "" }, { "self_ref": "#/texts/51", @@ -1212,7 +1254,7 @@ { "self_ref": "#/texts/52", "parent": { - "$ref": "#/groups/14" + "$ref": "#/groups/15" }, "children": [], "content_layer": "body", @@ -1242,6 +1284,25 @@ }, { "self_ref": "#/texts/54", + "parent": { + "$ref": "#/groups/16" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/55", "parent": { "$ref": "#/body" }, @@ -1253,9 +1314,21 @@ "text": "" }, { - "self_ref": "#/texts/55", + "self_ref": "#/texts/56", "parent": { - "$ref": "#/groups/15" + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/57", + "parent": { + "$ref": "#/groups/17" }, "children": [], "content_layer": "body", @@ -1271,41 +1344,10 @@ "script": "baseline" } }, - { - "self_ref": "#/texts/56", - "parent": { - "$ref": "#/groups/15" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/57", - "parent": { - "$ref": "#/groups/15" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.", - "text": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, { "self_ref": "#/texts/58", "parent": { - "$ref": "#/body" + "$ref": "#/groups/17" }, "children": [], "content_layer": "body", @@ -1317,14 +1359,14 @@ { "self_ref": "#/texts/59", "parent": { - "$ref": "#/groups/16" + "$ref": "#/groups/17" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "No", - "text": "No", + "orig": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.", + "text": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.", "formatting": { "bold": false, "italic": false, @@ -1347,6 +1389,25 @@ }, { "self_ref": "#/texts/61", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "No", + "text": "No", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/62", "parent": { "$ref": "#/body" }, @@ -1358,7 +1419,19 @@ "text": "" }, { - "self_ref": "#/texts/62", + "self_ref": "#/texts/63", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/64", "parent": { "$ref": "#/body" }, diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.md b/tests/data/groundtruth/docling_v2/textbox.docx.md index b52ef634..293c4d8c 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.md +++ b/tests/data/groundtruth/docling_v2/textbox.docx.md @@ -22,8 +22,6 @@ Yes - A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network. - A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System. -- - **Health Bureau:** Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control. @@ -31,11 +29,11 @@ Upon receiving a report from the kindergarten, conduct a preliminary assessment - If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection. - Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act. -- - Department of Education: Collaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels. +The Health Bureau will handle **reporting and specimen collection** . + **Whether the epidemic has eased.** **Whether the test results are positive for a legally designated infectious disease.** diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index 76538b91..4a46406c 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -5753,7 +5753,7 @@ "prov": [], "orig": "^ \"Duckling\". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22.", "text": "^ \"Duckling\". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5767,7 +5767,7 @@ "prov": [], "orig": "^ \"Duckling\". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22.", "text": "^ \"Duckling\". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5781,7 +5781,7 @@ "prov": [], "orig": "^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN 978-0300138139.", "text": "^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN 978-0300138139.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5795,7 +5795,7 @@ "prov": [], "orig": "^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN 9780823961566.", "text": "^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN 9780823961566.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5809,7 +5809,7 @@ "prov": [], "orig": "^ a b c d Carboneras 1992, p. 536.", "text": "^ a b c d Carboneras 1992, p. 536.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5823,7 +5823,7 @@ "prov": [], "orig": "^ Livezey 1986, pp. 737–738.", "text": "^ Livezey 1986, pp. 737–738.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5837,7 +5837,7 @@ "prov": [], "orig": "^ Madsen, McHugh & de Kloet 1988, p. 452.", "text": "^ Madsen, McHugh & de Kloet 1988, p. 452.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5851,7 +5851,7 @@ "prov": [], "orig": "^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.", "text": "^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5865,7 +5865,7 @@ "prov": [], "orig": "^ a b c d e f Carboneras 1992, p. 540.", "text": "^ a b c d e f Carboneras 1992, p. 540.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5879,7 +5879,7 @@ "prov": [], "orig": "^ Elphick, Dunning & Sibley 2001, p. 191.", "text": "^ Elphick, Dunning & Sibley 2001, p. 191.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5893,7 +5893,7 @@ "prov": [], "orig": "^ Kear 2005, p. 448.", "text": "^ Kear 2005, p. 448.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5907,7 +5907,7 @@ "prov": [], "orig": "^ Kear 2005, p. 622–623.", "text": "^ Kear 2005, p. 622–623.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5921,7 +5921,7 @@ "prov": [], "orig": "^ Kear 2005, p. 686.", "text": "^ Kear 2005, p. 686.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5935,7 +5935,7 @@ "prov": [], "orig": "^ Elphick, Dunning & Sibley 2001, p. 193.", "text": "^ Elphick, Dunning & Sibley 2001, p. 193.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5949,7 +5949,7 @@ "prov": [], "orig": "^ a b c d e f g Carboneras 1992, p. 537.", "text": "^ a b c d e f g Carboneras 1992, p. 537.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5963,7 +5963,7 @@ "prov": [], "orig": "^ American Ornithologists' Union 1998, p. xix.", "text": "^ American Ornithologists' Union 1998, p. xix.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5977,7 +5977,7 @@ "prov": [], "orig": "^ American Ornithologists' Union 1998.", "text": "^ American Ornithologists' Union 1998.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -5991,7 +5991,7 @@ "prov": [], "orig": "^ Carboneras 1992, p. 538.", "text": "^ Carboneras 1992, p. 538.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6005,7 +6005,7 @@ "prov": [], "orig": "^ Christidis & Boles 2008, p. 62.", "text": "^ Christidis & Boles 2008, p. 62.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6019,7 +6019,7 @@ "prov": [], "orig": "^ Shirihai 2008, pp. 239, 245.", "text": "^ Shirihai 2008, pp. 239, 245.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6033,7 +6033,7 @@ "prov": [], "orig": "^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.", "text": "^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6047,7 +6047,7 @@ "prov": [], "orig": "^ Fitter, Fitter & Hosking 2000, pp. 52–3.", "text": "^ Fitter, Fitter & Hosking 2000, pp. 52–3.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6061,7 +6061,7 @@ "prov": [], "orig": "^ \"Pacific Black Duck\". www.wiresnr.org. Retrieved 2018-04-27.", "text": "^ \"Pacific Black Duck\". www.wiresnr.org. Retrieved 2018-04-27.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6075,7 +6075,7 @@ "prov": [], "orig": "^ Ogden, Evans. \"Dabbling Ducks\". CWE. Retrieved 2006-11-02.", "text": "^ Ogden, Evans. \"Dabbling Ducks\". CWE. Retrieved 2006-11-02.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6089,7 +6089,7 @@ "prov": [], "orig": "^ Karl Mathiesen (16 March 2015). \"Don't feed the ducks bread, say conservationists\". The Guardian. Retrieved 13 November 2016.", "text": "^ Karl Mathiesen (16 March 2015). \"Don't feed the ducks bread, say conservationists\". The Guardian. Retrieved 13 November 2016.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6103,7 +6103,7 @@ "prov": [], "orig": "^ Rohwer, Frank C.; Anderson, Michael G. (1988). \"Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl\". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5_4. ISBN 978-1-4615-6789-9.", "text": "^ Rohwer, Frank C.; Anderson, Michael G. (1988). \"Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl\". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5_4. ISBN 978-1-4615-6789-9.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6117,7 +6117,7 @@ "prov": [], "orig": "^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). \"Long-Term Pair Bonds in Harlequin Ducks\". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797.", "text": "^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). \"Long-Term Pair Bonds in Harlequin Ducks\". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6131,7 +6131,7 @@ "prov": [], "orig": "^ \"If You Find An Orphaned Duckling - Wildlife Rehabber\". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22.", "text": "^ \"If You Find An Orphaned Duckling - Wildlife Rehabber\". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6145,7 +6145,7 @@ "prov": [], "orig": "^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN 9780557901562.[self-published source]", "text": "^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN 9780557901562.[self-published source]", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6159,7 +6159,7 @@ "prov": [], "orig": "^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN 9780762797707.", "text": "^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN 9780762797707.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6173,7 +6173,7 @@ "prov": [], "orig": "^ Amos, Jonathan (2003-09-08). \"Sound science is quackers\". BBC News. Retrieved 2006-11-02.", "text": "^ Amos, Jonathan (2003-09-08). \"Sound science is quackers\". BBC News. Retrieved 2006-11-02.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6187,7 +6187,7 @@ "prov": [], "orig": "^ \"Mythbusters Episode 8\". 12 December 2003.", "text": "^ \"Mythbusters Episode 8\". 12 December 2003.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6201,7 +6201,7 @@ "prov": [], "orig": "^ Erlandson 1994, p. 171.", "text": "^ Erlandson 1994, p. 171.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6215,7 +6215,7 @@ "prov": [], "orig": "^ Jeffries 2008, pp. 168, 243.", "text": "^ Jeffries 2008, pp. 168, 243.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6229,7 +6229,7 @@ "prov": [], "orig": "^ a b Sued-Badillo 2003, p. 65.", "text": "^ a b Sued-Badillo 2003, p. 65.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6243,7 +6243,7 @@ "prov": [], "orig": "^ Thorpe 1996, p. 68.", "text": "^ Thorpe 1996, p. 68.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6257,7 +6257,7 @@ "prov": [], "orig": "^ Maisels 1999, p. 42.", "text": "^ Maisels 1999, p. 42.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6271,7 +6271,7 @@ "prov": [], "orig": "^ Rau 1876, p. 133.", "text": "^ Rau 1876, p. 133.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6285,7 +6285,7 @@ "prov": [], "orig": "^ Higman 2012, p. 23.", "text": "^ Higman 2012, p. 23.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6299,7 +6299,7 @@ "prov": [], "orig": "^ Hume 2012, p. 53.", "text": "^ Hume 2012, p. 53.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6313,7 +6313,7 @@ "prov": [], "orig": "^ Hume 2012, p. 52.", "text": "^ Hume 2012, p. 52.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6327,7 +6327,7 @@ "prov": [], "orig": "^ Fieldhouse 2002, p. 167.", "text": "^ Fieldhouse 2002, p. 167.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6341,7 +6341,7 @@ "prov": [], "orig": "^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN 9781853263774.", "text": "^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN 9781853263774.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6355,7 +6355,7 @@ "prov": [], "orig": "^ \"Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl\" (PDF). New York State Department of Environmental Conservation. US Department of Commerce. December 2008. p. 3. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019.", "text": "^ \"Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl\" (PDF). New York State Department of Environmental Conservation. US Department of Commerce. December 2008. p. 3. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6369,7 +6369,7 @@ "prov": [], "orig": "^ \"FAOSTAT\". www.fao.org. Retrieved 2019-10-25.", "text": "^ \"FAOSTAT\". www.fao.org. Retrieved 2019-10-25.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6383,7 +6383,7 @@ "prov": [], "orig": "^ \"Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin\". Digimorph.org. Retrieved 2012-12-23.", "text": "^ \"Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin\". Digimorph.org. Retrieved 2012-12-23.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6397,7 +6397,7 @@ "prov": [], "orig": "^ Sy Montgomery. \"Mallard; Encyclopædia Britannica\". Britannica.com. Retrieved 2012-12-23.", "text": "^ Sy Montgomery. \"Mallard; Encyclopædia Britannica\". Britannica.com. Retrieved 2012-12-23.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6411,7 +6411,7 @@ "prov": [], "orig": "^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. pp. 135. ISBN 978-1-908843-15-9.", "text": "^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. pp. 135. ISBN 978-1-908843-15-9.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6425,7 +6425,7 @@ "prov": [], "orig": "^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p. 147. ISBN 951-773-085-3.", "text": "^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p. 147. ISBN 951-773-085-3.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6439,7 +6439,7 @@ "prov": [], "orig": "^ \"Lubānas simbolika\" (in Latvian). Retrieved September 9, 2021.", "text": "^ \"Lubānas simbolika\" (in Latvian). Retrieved September 9, 2021.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6453,7 +6453,7 @@ "prov": [], "orig": "^ \"Föglö\" (in Swedish). Retrieved September 9, 2021.", "text": "^ \"Föglö\" (in Swedish). Retrieved September 9, 2021.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6467,7 +6467,7 @@ "prov": [], "orig": "^ Young, Emma. \"World's funniest joke revealed\". New Scientist. Retrieved 7 January 2019.", "text": "^ Young, Emma. \"World's funniest joke revealed\". New Scientist. Retrieved 7 January 2019.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6481,7 +6481,7 @@ "prov": [], "orig": "^ \"Howard the Duck (character)\". Grand Comics Database.", "text": "^ \"Howard the Duck (character)\". Grand Comics Database.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6495,7 +6495,7 @@ "prov": [], "orig": "^ Sanderson, Peter; Gilbert, Laura (2008). \"1970s\". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p. 161. ISBN 978-0756641238. December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck.", "text": "^ Sanderson, Peter; Gilbert, Laura (2008). \"1970s\". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p. 161. ISBN 978-0756641238. December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck.", - "enumerated": false, + "enumerated": true, "marker": "" }, { @@ -6509,7 +6509,7 @@ "prov": [], "orig": "^ \"The Duck\". University of Oregon Athletics. Retrieved 2022-01-20.", "text": "^ \"The Duck\". University of Oregon Athletics. Retrieved 2022-01-20.", - "enumerated": false, + "enumerated": true, "marker": "" }, { diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md index 28c0fe4e..9467bc4e 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md @@ -380,61 +380,61 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a ### Citations -- ^ "Duckling". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22. -- ^ "Duckling". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22. -- ^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN 978-0300138139. -- ^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN 9780823961566. -- ^ a b c d Carboneras 1992, p. 536. -- ^ Livezey 1986, pp. 737–738. -- ^ Madsen, McHugh & de Kloet 1988, p. 452. -- ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354. -- ^ a b c d e f Carboneras 1992, p. 540. -- ^ Elphick, Dunning & Sibley 2001, p. 191. -- ^ Kear 2005, p. 448. -- ^ Kear 2005, p. 622–623. -- ^ Kear 2005, p. 686. -- ^ Elphick, Dunning & Sibley 2001, p. 193. -- ^ a b c d e f g Carboneras 1992, p. 537. -- ^ American Ornithologists' Union 1998, p. xix. -- ^ American Ornithologists' Union 1998. -- ^ Carboneras 1992, p. 538. -- ^ Christidis & Boles 2008, p. 62. -- ^ Shirihai 2008, pp. 239, 245. -- ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107. -- ^ Fitter, Fitter & Hosking 2000, pp. 52–3. -- ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27. -- ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02. -- ^ Karl Mathiesen (16 March 2015). "Don't feed the ducks bread, say conservationists". The Guardian. Retrieved 13 November 2016. -- ^ Rohwer, Frank C.; Anderson, Michael G. (1988). "Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5\_4. ISBN 978-1-4615-6789-9. -- ^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). "Long-Term Pair Bonds in Harlequin Ducks". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797. -- ^ "If You Find An Orphaned Duckling - Wildlife Rehabber". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22. -- ^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN 9780557901562.[self-published source] -- ^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN 9780762797707. -- ^ Amos, Jonathan (2003-09-08). "Sound science is quackers". BBC News. Retrieved 2006-11-02. -- ^ "Mythbusters Episode 8". 12 December 2003. -- ^ Erlandson 1994, p. 171. -- ^ Jeffries 2008, pp. 168, 243. -- ^ a b Sued-Badillo 2003, p. 65. -- ^ Thorpe 1996, p. 68. -- ^ Maisels 1999, p. 42. -- ^ Rau 1876, p. 133. -- ^ Higman 2012, p. 23. -- ^ Hume 2012, p. 53. -- ^ Hume 2012, p. 52. -- ^ Fieldhouse 2002, p. 167. -- ^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN 9781853263774. -- ^ "Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl" (PDF). New York State Department of Environmental Conservation. US Department of Commerce. December 2008. p. 3. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019. -- ^ "FAOSTAT". www.fao.org. Retrieved 2019-10-25. -- ^ "Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin". Digimorph.org. Retrieved 2012-12-23. -- ^ Sy Montgomery. "Mallard; Encyclopædia Britannica". Britannica.com. Retrieved 2012-12-23. -- ^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. pp. 135. ISBN 978-1-908843-15-9. -- ^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p. 147. ISBN 951-773-085-3. -- ^ "Lubānas simbolika" (in Latvian). Retrieved September 9, 2021. -- ^ "Föglö" (in Swedish). Retrieved September 9, 2021. -- ^ Young, Emma. "World's funniest joke revealed". New Scientist. Retrieved 7 January 2019. -- ^ "Howard the Duck (character)". Grand Comics Database. -- ^ Sanderson, Peter; Gilbert, Laura (2008). "1970s". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p. 161. ISBN 978-0756641238. December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck. -- ^ "The Duck". University of Oregon Athletics. Retrieved 2022-01-20. +1. ^ "Duckling". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22. +2. ^ "Duckling". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22. +3. ^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN 978-0300138139. +4. ^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN 9780823961566. +5. ^ a b c d Carboneras 1992, p. 536. +6. ^ Livezey 1986, pp. 737–738. +7. ^ Madsen, McHugh & de Kloet 1988, p. 452. +8. ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354. +9. ^ a b c d e f Carboneras 1992, p. 540. +10. ^ Elphick, Dunning & Sibley 2001, p. 191. +11. ^ Kear 2005, p. 448. +12. ^ Kear 2005, p. 622–623. +13. ^ Kear 2005, p. 686. +14. ^ Elphick, Dunning & Sibley 2001, p. 193. +15. ^ a b c d e f g Carboneras 1992, p. 537. +16. ^ American Ornithologists' Union 1998, p. xix. +17. ^ American Ornithologists' Union 1998. +18. ^ Carboneras 1992, p. 538. +19. ^ Christidis & Boles 2008, p. 62. +20. ^ Shirihai 2008, pp. 239, 245. +21. ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107. +22. ^ Fitter, Fitter & Hosking 2000, pp. 52–3. +23. ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27. +24. ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02. +25. ^ Karl Mathiesen (16 March 2015). "Don't feed the ducks bread, say conservationists". The Guardian. Retrieved 13 November 2016. +26. ^ Rohwer, Frank C.; Anderson, Michael G. (1988). "Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5\_4. ISBN 978-1-4615-6789-9. +27. ^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). "Long-Term Pair Bonds in Harlequin Ducks". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797. +28. ^ "If You Find An Orphaned Duckling - Wildlife Rehabber". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22. +29. ^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN 9780557901562.[self-published source] +30. ^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN 9780762797707. +31. ^ Amos, Jonathan (2003-09-08). "Sound science is quackers". BBC News. Retrieved 2006-11-02. +32. ^ "Mythbusters Episode 8". 12 December 2003. +33. ^ Erlandson 1994, p. 171. +34. ^ Jeffries 2008, pp. 168, 243. +35. ^ a b Sued-Badillo 2003, p. 65. +36. ^ Thorpe 1996, p. 68. +37. ^ Maisels 1999, p. 42. +38. ^ Rau 1876, p. 133. +39. ^ Higman 2012, p. 23. +40. ^ Hume 2012, p. 53. +41. ^ Hume 2012, p. 52. +42. ^ Fieldhouse 2002, p. 167. +43. ^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN 9781853263774. +44. ^ "Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl" (PDF). New York State Department of Environmental Conservation. US Department of Commerce. December 2008. p. 3. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019. +45. ^ "FAOSTAT". www.fao.org. Retrieved 2019-10-25. +46. ^ "Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin". Digimorph.org. Retrieved 2012-12-23. +47. ^ Sy Montgomery. "Mallard; Encyclopædia Britannica". Britannica.com. Retrieved 2012-12-23. +48. ^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. pp. 135. ISBN 978-1-908843-15-9. +49. ^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p. 147. ISBN 951-773-085-3. +50. ^ "Lubānas simbolika" (in Latvian). Retrieved September 9, 2021. +51. ^ "Föglö" (in Swedish). Retrieved September 9, 2021. +52. ^ Young, Emma. "World's funniest joke revealed". New Scientist. Retrieved 7 January 2019. +53. ^ "Howard the Duck (character)". Grand Comics Database. +54. ^ Sanderson, Peter; Gilbert, Laura (2008). "1970s". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p. 161. ISBN 978-0756641238. December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck. +55. ^ "The Duck". University of Oregon Athletics. Retrieved 2022-01-20. ### Sources diff --git a/tests/data/html/example_01.html b/tests/data/html/example_01.html index 792dc6c2..2f86b5b0 100644 --- a/tests/data/html/example_01.html +++ b/tests/data/html/example_01.html @@ -13,5 +13,9 @@
  • First item in ordered list
  • Second item in ordered list
  • +
      +
    1. First item in ordered list with start
    2. +
    3. Second item in ordered list with start
    4. +