From df5c15195bcac0864c8b1ad0eae75e451ab8bc73 Mon Sep 17 00:00:00 2001 From: William Easton Date: Sat, 12 Jul 2025 20:10:27 -0500 Subject: [PATCH] Support hierarchical markdown --- docling/backend/md_backend.py | 100 ++++---- tests/conftest.py | 10 + .../docling_v2/inline_and_formatting.md.yaml | 218 ++++++++++-------- tests/test_backend_markdown.py | 200 +++++++++++++--- 4 files changed, 356 insertions(+), 172 deletions(-) create mode 100644 tests/conftest.py diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index fb42547e..480ecb44 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -5,21 +5,24 @@ from copy import deepcopy from enum import Enum from io import BytesIO from pathlib import Path -from typing import List, Literal, Optional, Set, Union +from typing import List, Literal, Optional, Set, Union, override import marko import marko.element import marko.inline from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, + GroupItem, NodeItem, TableCell, TableData, TextItem, ) -from docling_core.types.doc.document import Formatting +from docling_core.types.doc.document import Formatting, SectionHeaderItem, TitleItem +from docling_core.types.doc.labels import GroupLabel from marko import Markdown from pydantic import AnyUrl, BaseModel, Field, TypeAdapter from typing_extensions import Annotated @@ -45,7 +48,7 @@ class _PendingCreationType(str, Enum): class _HeadingCreationPayload(BaseModel): kind: Literal["heading"] = "heading" - level: int + heading_item: TitleItem | SectionHeaderItem class _ListItemCreationPayload(BaseModel): @@ -63,6 +66,12 @@ _CreationPayload = Annotated[ class MarkdownDocumentBackend(DeclarativeDocumentBackend): + def _get_current_heading_level(self) -> int: + return max(self.header_to_group.keys()) if self.header_to_group else 0 + + def _get_current_heading_group(self) -> Optional[Union[DocItem, GroupItem]]: + return self.header_to_group.get(self._get_current_heading_level(), None) + def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10): # This regex will match any sequence of underscores pattern = r"_+" @@ -100,6 +109,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self.md_table_buffer: list[str] = [] self._html_blocks: int = 0 + self.header_to_group: dict[int, GroupItem] = {} + try: if isinstance(self.path_or_stream, BytesIO): text_stream = self.path_or_stream.getvalue().decode("utf-8") @@ -125,7 +136,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): ) from e return - def _close_table(self, doc: DoclingDocument): + def _close_table(self, doc: DoclingDocument, parent_item: Optional[NodeItem]): if self.in_table: _log.debug("=== TABLE START ===") for md_table_row in self.md_table_buffer: @@ -179,7 +190,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): for tcell in tcells: table_data.table_cells.append(tcell) if len(tcells) > 0: - doc.add_table(data=table_data) + _ = doc.add_table(data=table_data, parent=parent_item) return def _create_list_item( @@ -208,7 +219,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): level: int, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, - ): + ) -> TitleItem | SectionHeaderItem: if level == 1: item = doc.add_title( text=text, @@ -244,25 +255,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): if element in visited: return + if parent_item is None: + parent_item = self._get_current_heading_group() + # Iterates over all elements in the AST # Check for different element types and process relevant details if isinstance(element, marko.block.Heading) and len(element.children) > 0: - self._close_table(doc) + self._close_table(doc, parent_item) _log.debug( f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore ) - if len(element.children) > 1: # inline group will be created further down - parent_item = self._create_heading_item( - doc=doc, - parent_item=parent_item, - text="", - level=element.level, - formatting=formatting, - hyperlink=hyperlink, - ) - else: - creation_stack.append(_HeadingCreationPayload(level=element.level)) + while self._get_current_heading_level() >= element.level: + _ = self.header_to_group.pop(self._get_current_heading_level()) + + parent_item = doc.add_group( + name=f"header-{element.level}", + label=GroupLabel.SECTION, + parent=self._get_current_heading_group(), + ) + + self.header_to_group[element.level] = parent_item + + parent_item = self._create_heading_item( + doc=doc, + parent_item=parent_item, + text="", + level=element.level, + formatting=formatting, + ) + + + if len(element.children) > 1: + parent_item = doc.add_inline_group(parent=parent_item) + elif len(element.children) == 1: + creation_stack.append(_HeadingCreationPayload(heading_item=parent_item)) elif isinstance(element, marko.block.List): has_non_empty_list_items = False @@ -271,7 +298,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): has_non_empty_list_items = True break - self._close_table(doc) + self._close_table(doc, parent_item) _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") if has_non_empty_list_items: parent_item = doc.add_list_group(name="list", parent=parent_item) @@ -283,7 +310,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): and isinstance((child := element.children[0]), marko.block.Paragraph) and len(child.children) > 0 ): - self._close_table(doc) + self._close_table(doc, parent_item) _log.debug(" - List item") enumerated = ( @@ -304,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): creation_stack.append(_ListItemCreationPayload(enumerated=enumerated)) elif isinstance(element, marko.inline.Image): - self._close_table(doc) + self._close_table(doc, parent_item) _log.debug(f" - Image with alt: {element.title}, url: {element.dest}") fig_caption: Optional[TextItem] = None @@ -346,7 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): else: self.md_table_buffer.append(snippet_text) elif snippet_text: - self._close_table(doc) + self._close_table(doc, parent_item) if creation_stack: while len(creation_stack) > 0: @@ -368,17 +395,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): hyperlink=hyperlink, ) elif isinstance(to_create, _HeadingCreationPayload): - # not keeping as parent_item as logic for correctly tracking - # that not implemented yet (section components not captured - # as heading children in marko) - self._create_heading_item( - doc=doc, - parent_item=parent_item, - text=snippet_text, - level=to_create.level, - formatting=formatting, - hyperlink=hyperlink, - ) + to_create.heading_item.text = snippet_text + to_create.heading_item.formatting = formatting + to_create.heading_item.hyperlink = hyperlink else: doc.add_text( label=DocItemLabel.TEXT, @@ -389,7 +408,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): ) elif isinstance(element, marko.inline.CodeSpan): - self._close_table(doc) + self._close_table(doc, parent_item) _log.debug(f" - Code Span: {element.children}") snippet_text = str(element.children).strip() doc.add_code( @@ -405,7 +424,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): and isinstance((child := element.children[0]), marko.inline.RawText) and len(snippet_text := (child.children.strip())) > 0 ): - self._close_table(doc) + self._close_table(doc, parent_item) _log.debug(f" - Code Block: {element.children}") doc.add_code( parent=parent_item, @@ -421,7 +440,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.block.HTMLBlock): self._html_blocks += 1 - self._close_table(doc) + self._close_table(doc, parent_item) _log.debug(f"HTML Block: {element}") if ( len(element.body) > 0 @@ -438,13 +457,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): ) else: if not isinstance(element, str): - self._close_table(doc) + self._close_table(doc, parent_item) _log.debug(f"Some other element: {element}") - if ( - isinstance(element, (marko.block.Paragraph, marko.block.Heading)) - and len(element.children) > 1 - ): + if isinstance(element, marko.block.Paragraph) and len(element.children) > 1: parent_item = doc.add_inline_group(parent=parent_item) processed_block_types = ( @@ -511,7 +527,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): creation_stack=[], list_ordered_flag_by_ref={}, ) - self._close_table(doc=doc) # handle any last hanging table + self._close_table(doc=doc, parent_item=None) # handle any last hanging table # if HTML blocks were detected, export to HTML and delegate to HTML backend if self._html_blocks > 0: diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..439961cf --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +from pathlib import Path + +import pytest + +TEST_DATA_DIR = Path("./tests/data/") + + +@pytest.fixture +def test_data_directory() -> Path: + return TEST_DATA_DIR diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml index 801d2b76..83dc8f2f 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml @@ -1,18 +1,7 @@ body: children: - - $ref: '#/texts/0' - - $ref: '#/texts/1' - $ref: '#/groups/0' - - $ref: '#/groups/1' - - $ref: '#/groups/2' - - $ref: '#/texts/32' - - $ref: '#/groups/8' - - $ref: '#/groups/11' - - $ref: '#/texts/43' - - $ref: '#/texts/47' - - $ref: '#/texts/48' - - $ref: '#/groups/13' - - $ref: '#/tables/0' + - $ref: '#/groups/9' content_layer: body label: unspecified name: _root_ @@ -25,6 +14,18 @@ furniture: name: _root_ self_ref: '#/furniture' groups: +- children: + - $ref: '#/texts/0' + - $ref: '#/texts/1' + - $ref: '#/groups/1' + - $ref: '#/groups/2' + - $ref: '#/groups/3' + content_layer: body + label: section + name: header-1 + parent: + $ref: '#/body' + self_ref: '#/groups/0' - children: - $ref: '#/texts/2' - $ref: '#/texts/3' @@ -35,8 +36,8 @@ groups: label: inline name: group parent: - $ref: '#/body' - self_ref: '#/groups/0' + $ref: '#/groups/0' + self_ref: '#/groups/1' - children: - $ref: '#/texts/7' - $ref: '#/texts/8' @@ -45,8 +46,8 @@ groups: label: inline name: group parent: - $ref: '#/body' - self_ref: '#/groups/1' + $ref: '#/groups/0' + self_ref: '#/groups/2' - children: - $ref: '#/texts/10' - $ref: '#/texts/14' @@ -59,8 +60,8 @@ groups: label: list name: list parent: - $ref: '#/body' - self_ref: '#/groups/2' + $ref: '#/groups/0' + self_ref: '#/groups/3' - children: - $ref: '#/texts/11' - $ref: '#/texts/12' @@ -70,7 +71,7 @@ groups: name: group parent: $ref: '#/texts/10' - self_ref: '#/groups/3' + self_ref: '#/groups/4' - children: - $ref: '#/texts/15' - $ref: '#/texts/16' @@ -80,7 +81,7 @@ groups: name: group parent: $ref: '#/texts/14' - self_ref: '#/groups/4' + self_ref: '#/groups/5' - children: - $ref: '#/texts/19' - $ref: '#/texts/20' @@ -90,7 +91,7 @@ groups: name: group parent: $ref: '#/texts/18' - self_ref: '#/groups/5' + self_ref: '#/groups/6' - children: - $ref: '#/texts/23' - $ref: '#/texts/24' @@ -100,7 +101,7 @@ groups: name: group parent: $ref: '#/texts/22' - self_ref: '#/groups/6' + self_ref: '#/groups/7' - children: - $ref: '#/texts/29' - $ref: '#/texts/30' @@ -110,7 +111,19 @@ groups: name: group parent: $ref: '#/texts/28' - self_ref: '#/groups/7' + self_ref: '#/groups/8' +- children: + - $ref: '#/texts/32' + - $ref: '#/groups/10' + - $ref: '#/groups/13' + - $ref: '#/groups/14' + - $ref: '#/groups/16' + content_layer: body + label: section + name: header-1 + parent: + $ref: '#/body' + self_ref: '#/groups/9' - children: - $ref: '#/texts/33' - $ref: '#/texts/36' @@ -118,8 +131,8 @@ groups: label: list name: list parent: - $ref: '#/body' - self_ref: '#/groups/8' + $ref: '#/groups/9' + self_ref: '#/groups/10' - children: - $ref: '#/texts/34' - $ref: '#/texts/35' @@ -128,7 +141,7 @@ groups: name: group parent: $ref: '#/texts/33' - self_ref: '#/groups/9' + self_ref: '#/groups/11' - children: - $ref: '#/texts/37' - $ref: '#/texts/38' @@ -139,7 +152,7 @@ groups: name: group parent: $ref: '#/texts/36' - self_ref: '#/groups/10' + self_ref: '#/groups/12' - children: - $ref: '#/texts/41' - $ref: '#/texts/42' @@ -147,8 +160,17 @@ groups: label: inline name: group parent: - $ref: '#/body' - self_ref: '#/groups/11' + $ref: '#/groups/9' + self_ref: '#/groups/13' +- children: + - $ref: '#/texts/43' + - $ref: '#/texts/47' + content_layer: body + label: section + name: header-2 + parent: + $ref: '#/groups/9' + self_ref: '#/groups/14' - children: - $ref: '#/texts/44' - $ref: '#/texts/45' @@ -158,14 +180,24 @@ groups: name: group parent: $ref: '#/texts/43' - self_ref: '#/groups/12' + self_ref: '#/groups/15' +- children: + - $ref: '#/texts/48' + - $ref: '#/groups/17' + - $ref: '#/tables/0' + content_layer: body + label: section + name: header-2 + parent: + $ref: '#/groups/9' + self_ref: '#/groups/16' - children: [] content_layer: body label: inline name: group parent: - $ref: '#/body' - self_ref: '#/groups/13' + $ref: '#/groups/16' + self_ref: '#/groups/17' key_value_items: [] name: inline_and_formatting origin: @@ -308,7 +340,7 @@ tables: footnotes: [] label: table parent: - $ref: '#/body' + $ref: '#/groups/16' prov: [] references: [] self_ref: '#/tables/0' @@ -316,9 +348,9 @@ texts: - children: [] content_layer: body label: title - orig: Contribution guideline example + orig: '' parent: - $ref: '#/body' + $ref: '#/groups/0' prov: [] self_ref: '#/texts/0' text: Contribution guideline example @@ -327,7 +359,7 @@ texts: label: text orig: This is simple. parent: - $ref: '#/body' + $ref: '#/groups/0' prov: [] self_ref: '#/texts/1' text: This is simple. @@ -336,7 +368,7 @@ texts: label: text orig: Foo parent: - $ref: '#/groups/0' + $ref: '#/groups/1' prov: [] self_ref: '#/texts/2' text: Foo @@ -351,7 +383,7 @@ texts: label: text orig: emphasis parent: - $ref: '#/groups/0' + $ref: '#/groups/1' prov: [] self_ref: '#/texts/3' text: emphasis @@ -366,7 +398,7 @@ texts: label: text orig: strong emphasis parent: - $ref: '#/groups/0' + $ref: '#/groups/1' prov: [] self_ref: '#/texts/4' text: strong emphasis @@ -381,7 +413,7 @@ texts: label: text orig: both parent: - $ref: '#/groups/0' + $ref: '#/groups/1' prov: [] self_ref: '#/texts/5' text: both @@ -390,7 +422,7 @@ texts: label: text orig: . parent: - $ref: '#/groups/0' + $ref: '#/groups/1' prov: [] self_ref: '#/texts/6' text: . @@ -399,7 +431,7 @@ texts: label: text orig: 'Create your feature branch:' parent: - $ref: '#/groups/1' + $ref: '#/groups/2' prov: [] self_ref: '#/texts/7' text: 'Create your feature branch:' @@ -411,7 +443,7 @@ texts: label: code orig: git checkout -b feature/AmazingFeature parent: - $ref: '#/groups/1' + $ref: '#/groups/2' prov: [] references: [] self_ref: '#/texts/8' @@ -421,19 +453,19 @@ texts: label: text orig: . parent: - $ref: '#/groups/1' + $ref: '#/groups/2' prov: [] self_ref: '#/texts/9' text: . - children: - - $ref: '#/groups/3' + - $ref: '#/groups/4' content_layer: body enumerated: true label: list_item marker: '' orig: '' parent: - $ref: '#/groups/2' + $ref: '#/groups/3' prov: [] self_ref: '#/texts/10' text: '' @@ -442,7 +474,7 @@ texts: label: text orig: Pull the parent: - $ref: '#/groups/3' + $ref: '#/groups/4' prov: [] self_ref: '#/texts/11' text: Pull the @@ -458,7 +490,7 @@ texts: label: text orig: repository parent: - $ref: '#/groups/3' + $ref: '#/groups/4' prov: [] self_ref: '#/texts/12' text: repository @@ -467,19 +499,19 @@ texts: label: text orig: . parent: - $ref: '#/groups/3' + $ref: '#/groups/4' prov: [] self_ref: '#/texts/13' text: . - children: - - $ref: '#/groups/4' + - $ref: '#/groups/5' content_layer: body enumerated: true label: list_item marker: '' orig: '' parent: - $ref: '#/groups/2' + $ref: '#/groups/3' prov: [] self_ref: '#/texts/14' text: '' @@ -488,7 +520,7 @@ texts: label: text orig: Create your feature branch ( parent: - $ref: '#/groups/4' + $ref: '#/groups/5' prov: [] self_ref: '#/texts/15' text: Create your feature branch ( @@ -500,7 +532,7 @@ texts: label: code orig: git checkout -b feature/AmazingFeature parent: - $ref: '#/groups/4' + $ref: '#/groups/5' prov: [] references: [] self_ref: '#/texts/16' @@ -510,19 +542,19 @@ texts: label: text orig: ) parent: - $ref: '#/groups/4' + $ref: '#/groups/5' prov: [] self_ref: '#/texts/17' text: ) - children: - - $ref: '#/groups/5' + - $ref: '#/groups/6' content_layer: body enumerated: true label: list_item marker: '' orig: '' parent: - $ref: '#/groups/2' + $ref: '#/groups/3' prov: [] self_ref: '#/texts/18' text: '' @@ -531,7 +563,7 @@ texts: label: text orig: Commit your changes ( parent: - $ref: '#/groups/5' + $ref: '#/groups/6' prov: [] self_ref: '#/texts/19' text: Commit your changes ( @@ -543,7 +575,7 @@ texts: label: code orig: git commit -m 'Add some AmazingFeature' parent: - $ref: '#/groups/5' + $ref: '#/groups/6' prov: [] references: [] self_ref: '#/texts/20' @@ -553,19 +585,19 @@ texts: label: text orig: ) parent: - $ref: '#/groups/5' + $ref: '#/groups/6' prov: [] self_ref: '#/texts/21' text: ) - children: - - $ref: '#/groups/6' + - $ref: '#/groups/7' content_layer: body enumerated: true label: list_item marker: '' orig: '' parent: - $ref: '#/groups/2' + $ref: '#/groups/3' prov: [] self_ref: '#/texts/22' text: '' @@ -574,7 +606,7 @@ texts: label: text orig: Push to the branch ( parent: - $ref: '#/groups/6' + $ref: '#/groups/7' prov: [] self_ref: '#/texts/23' text: Push to the branch ( @@ -586,7 +618,7 @@ texts: label: code orig: git push origin feature/AmazingFeature parent: - $ref: '#/groups/6' + $ref: '#/groups/7' prov: [] references: [] self_ref: '#/texts/24' @@ -596,7 +628,7 @@ texts: label: text orig: ) parent: - $ref: '#/groups/6' + $ref: '#/groups/7' prov: [] self_ref: '#/texts/25' text: ) @@ -607,7 +639,7 @@ texts: marker: '' orig: Open a Pull Request parent: - $ref: '#/groups/2' + $ref: '#/groups/3' prov: [] self_ref: '#/texts/26' text: Open a Pull Request @@ -624,19 +656,19 @@ texts: marker: '' orig: Whole list item has same formatting parent: - $ref: '#/groups/2' + $ref: '#/groups/3' prov: [] self_ref: '#/texts/27' text: Whole list item has same formatting - children: - - $ref: '#/groups/7' + - $ref: '#/groups/8' content_layer: body enumerated: true label: list_item marker: '' orig: '' parent: - $ref: '#/groups/2' + $ref: '#/groups/3' prov: [] self_ref: '#/texts/28' text: '' @@ -645,7 +677,7 @@ texts: label: text orig: List item has parent: - $ref: '#/groups/7' + $ref: '#/groups/8' prov: [] self_ref: '#/texts/29' text: List item has @@ -660,7 +692,7 @@ texts: label: text orig: mixed or partial parent: - $ref: '#/groups/7' + $ref: '#/groups/8' prov: [] self_ref: '#/texts/30' text: mixed or partial @@ -669,7 +701,7 @@ texts: label: text orig: formatting parent: - $ref: '#/groups/7' + $ref: '#/groups/8' prov: [] self_ref: '#/texts/31' text: formatting @@ -682,21 +714,21 @@ texts: strikethrough: false underline: false label: title - orig: Whole heading is italic + orig: '' parent: - $ref: '#/body' + $ref: '#/groups/9' prov: [] self_ref: '#/texts/32' text: Whole heading is italic - children: - - $ref: '#/groups/9' + - $ref: '#/groups/11' content_layer: body enumerated: false label: list_item marker: '' orig: '' parent: - $ref: '#/groups/8' + $ref: '#/groups/10' prov: [] self_ref: '#/texts/33' text: '' @@ -711,7 +743,7 @@ texts: label: text orig: First parent: - $ref: '#/groups/9' + $ref: '#/groups/11' prov: [] self_ref: '#/texts/34' text: First @@ -720,19 +752,19 @@ texts: label: text orig: ': Lorem ipsum.' parent: - $ref: '#/groups/9' + $ref: '#/groups/11' prov: [] self_ref: '#/texts/35' text: ': Lorem ipsum.' - children: - - $ref: '#/groups/10' + - $ref: '#/groups/12' content_layer: body enumerated: false label: list_item marker: '' orig: '' parent: - $ref: '#/groups/8' + $ref: '#/groups/10' prov: [] self_ref: '#/texts/36' text: '' @@ -747,7 +779,7 @@ texts: label: text orig: Second parent: - $ref: '#/groups/10' + $ref: '#/groups/12' prov: [] self_ref: '#/texts/37' text: Second @@ -756,7 +788,7 @@ texts: label: text orig: ': Dolor' parent: - $ref: '#/groups/10' + $ref: '#/groups/12' prov: [] self_ref: '#/texts/38' text: ': Dolor' @@ -768,7 +800,7 @@ texts: label: code orig: sit parent: - $ref: '#/groups/10' + $ref: '#/groups/12' prov: [] references: [] self_ref: '#/texts/39' @@ -778,7 +810,7 @@ texts: label: text orig: amet. parent: - $ref: '#/groups/10' + $ref: '#/groups/12' prov: [] self_ref: '#/texts/40' text: amet. @@ -787,7 +819,7 @@ texts: label: text orig: Some parent: - $ref: '#/groups/11' + $ref: '#/groups/13' prov: [] self_ref: '#/texts/41' text: Some @@ -805,19 +837,19 @@ texts: label: code orig: formatted_code parent: - $ref: '#/groups/11' + $ref: '#/groups/13' prov: [] references: [] self_ref: '#/texts/42' text: formatted_code - children: - - $ref: '#/groups/12' + - $ref: '#/groups/15' content_layer: body label: section_header level: 1 orig: '' parent: - $ref: '#/body' + $ref: '#/groups/14' prov: [] self_ref: '#/texts/43' text: '' @@ -832,7 +864,7 @@ texts: label: text orig: Partially formatted parent: - $ref: '#/groups/12' + $ref: '#/groups/15' prov: [] self_ref: '#/texts/44' text: Partially formatted @@ -841,7 +873,7 @@ texts: label: text orig: heading to_escape parent: - $ref: '#/groups/12' + $ref: '#/groups/15' prov: [] self_ref: '#/texts/45' text: heading to_escape @@ -853,7 +885,7 @@ texts: label: code orig: not_to_escape parent: - $ref: '#/groups/12' + $ref: '#/groups/15' prov: [] references: [] self_ref: '#/texts/46' @@ -864,7 +896,7 @@ texts: label: text orig: $$E=mc^2$$ parent: - $ref: '#/body' + $ref: '#/groups/14' prov: [] self_ref: '#/texts/47' text: $$E=mc^2$$ @@ -872,9 +904,9 @@ texts: content_layer: body label: section_header level: 1 - orig: Table Heading + orig: '' parent: - $ref: '#/body' + $ref: '#/groups/16' prov: [] self_ref: '#/texts/48' text: Table Heading diff --git a/tests/test_backend_markdown.py b/tests/test_backend_markdown.py index 30865668..fa3e0004 100644 --- a/tests/test_backend_markdown.py +++ b/tests/test_backend_markdown.py @@ -1,57 +1,183 @@ +import io from pathlib import Path +from textwrap import dedent +from typing import Annotated + +import pytest +from _pytest.mark import ParameterSet +from docling_core.types.doc.document import DoclingDocument, GroupItem, RefItem from docling.backend.md_backend import MarkdownDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import DoclingDocument, InputDocument +from docling.datamodel.document import ( + InputDocument, +) +from tests.conftest import TEST_DATA_DIR from tests.verify_utils import CONFID_PREC, COORD_PREC from .test_data_gen_flag import GEN_TEST_DATA +GENERATE = True or GEN_TEST_DATA -def test_convert_valid(): - fmt = InputFormat.MD - cls = MarkdownDocumentBackend +ALSO_GENERATE_YAML = ["inline_and_formatting"] +"""A list of document names that should also be generated as yaml""" - root_path = Path("tests") / "data" - relevant_paths = sorted((root_path / "md").rglob("*.md")) - assert len(relevant_paths) > 0 +# Test Input Directories +INPUT_DIR = TEST_DATA_DIR / "md" - yaml_filter = ["inline_and_formatting"] +# Test Output Directories +SNAPSHOT_DIR = TEST_DATA_DIR / "groundtruth" / "docling_v2" - for in_path in relevant_paths: - md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md" - yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml" +TestCase = Annotated[tuple[str, Path, Path], "test_name, in_file, snapshot_file"] - in_doc = InputDocument( - path_or_stream=in_path, - format=fmt, - backend=cls, + +def markdown_test_data() -> list[ParameterSet]: + """Returns test cases for each of our input markdown files""" + + test_case_paths = sorted(INPUT_DIR.glob(pattern="*.md"), key=lambda x: x.name) + + test_cases: list[ParameterSet] = [] + + for test_case_path in test_case_paths: + name: str = test_case_path.stem + + markdown_document_path: Path = test_case_path.resolve() + + markdown_snapshot_path: Path = SNAPSHOT_DIR / f"{name}.md.md" + yaml_snapshot_path: Path | None = ( + SNAPSHOT_DIR / f"{name}.md.yaml" if name in ALSO_GENERATE_YAML else None ) - backend = cls( - in_doc=in_doc, - path_or_stream=in_path, + + test_cases.append( + pytest.param( + markdown_document_path, + markdown_snapshot_path, + yaml_snapshot_path, + id=name, + ) ) - assert backend.is_valid() - act_doc = backend.convert() - act_data = act_doc.export_to_markdown() + return test_cases - if GEN_TEST_DATA: - with open(md_gt_path, mode="w", encoding="utf-8") as f: - f.write(f"{act_data}\n") - if in_path.stem in yaml_filter: - with open(yaml_gt_path, mode="w", encoding="utf-8") as f: - act_doc.save_as_yaml( - yaml_gt_path, - coord_precision=COORD_PREC, - confid_precision=CONFID_PREC, - ) +@pytest.mark.parametrize( + ("markdown_document_path", "markdown_snapshot_path", "yaml_snapshot_path"), + markdown_test_data(), +) +def test_convert_markdown( + markdown_document_path: Path, + markdown_snapshot_path: Path, + yaml_snapshot_path: Path | None, +): + """Test that the Markdown backend can: + 1) convert the input markdown file to a DoclingDocument + 2) export the markdown (and optionally, yaml) and verify it matches the committed snapshot + """ + + if not GENERATE and not markdown_snapshot_path.exists(): + pytest.skip( + f"Test requires {markdown_snapshot_path} to exist, you may need to generate it with GENERATE=True" + ) + + document_backend = MarkdownDocumentBackend( + in_doc=InputDocument( + path_or_stream=markdown_document_path, + format=InputFormat.MD, + backend=MarkdownDocumentBackend, + ), + path_or_stream=markdown_document_path, + ) + + assert document_backend.is_valid() + + try: + out_docling_document: DoclingDocument = document_backend.convert() + except Exception as e: + pytest.skip(f"Error converting {markdown_document_path}: {e}") + + # Validate the YAML/JSON Export + if yaml_snapshot_path: + if GENERATE: + out_docling_document.save_as_yaml( + yaml_snapshot_path, + coord_precision=COORD_PREC, + confid_precision=CONFID_PREC, + ) else: - with open(md_gt_path, encoding="utf-8") as f: - exp_data = f.read().rstrip() - assert act_data == exp_data + assert out_docling_document == DoclingDocument.load_from_yaml( + yaml_snapshot_path + ) - if in_path.stem in yaml_filter: - exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path) - assert act_doc == exp_doc + # Validate the Markdown Export + out_markdown: str = out_docling_document.export_to_markdown() + + if GENERATE: + _ = markdown_snapshot_path.write_text(out_markdown + "\n") + else: + assert ( + out_markdown == markdown_snapshot_path.read_text(encoding="utf-8") + ) + + +def test_convert_headers_to_groups(): + """Test that the Markdown backend can convert headers into hierarchical groups""" + + input_document = dedent(""" + # Header 1 + + some content under the header 1 + + ## Header 2a + + some content under the header 2 + + ### Header 3 + + some content under the header 3 + + ## Header 2b + """) + + in_doc = InputDocument( + path_or_stream=io.BytesIO(input_document.encode("utf-8")), + format=InputFormat.MD, + filename="headers_to_groups.md", + backend=MarkdownDocumentBackend, + ) + backend = MarkdownDocumentBackend( + in_doc=in_doc, + path_or_stream=io.BytesIO(input_document.encode("utf-8")), + ) + + act_doc: DoclingDocument = backend.convert() + + assert len(act_doc.body.children) == 1 + body_first_child_ref: RefItem = act_doc.body.children[0] + assert isinstance(body_first_child_ref, RefItem) + + assert body_first_child_ref.cref == "#/groups/0" + + body_first_child: GroupItem = body_first_child_ref.resolve(act_doc) + + # The first child should have the header, content and two subheaders + assert len(body_first_child.children) == 4 + + act_data = act_doc.export_to_markdown() + + expected_output = dedent(""" + # Header 1 + + some content under the header 1 + + ## Header 2a + + some content under the header 2 + + ### Header 3 + + some content under the header 3 + + ## Header 2b + """).strip() + + assert act_data == expected_output