Support hierarchical markdown

This commit is contained in:
William Easton 2025-07-12 20:10:27 -05:00
parent 95e70962f1
commit df5c15195b
No known key found for this signature in database
4 changed files with 356 additions and 172 deletions

View File

@ -5,21 +5,24 @@ from copy import deepcopy
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import List, Literal, Optional, Set, Union from typing import List, Literal, Optional, Set, Union, override
import marko import marko
import marko.element import marko.element
import marko.inline import marko.inline
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
GroupItem,
NodeItem, NodeItem,
TableCell, TableCell,
TableData, TableData,
TextItem, TextItem,
) )
from docling_core.types.doc.document import Formatting from docling_core.types.doc.document import Formatting, SectionHeaderItem, TitleItem
from docling_core.types.doc.labels import GroupLabel
from marko import Markdown from marko import Markdown
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
from typing_extensions import Annotated from typing_extensions import Annotated
@ -45,7 +48,7 @@ class _PendingCreationType(str, Enum):
class _HeadingCreationPayload(BaseModel): class _HeadingCreationPayload(BaseModel):
kind: Literal["heading"] = "heading" kind: Literal["heading"] = "heading"
level: int heading_item: TitleItem | SectionHeaderItem
class _ListItemCreationPayload(BaseModel): class _ListItemCreationPayload(BaseModel):
@ -63,6 +66,12 @@ _CreationPayload = Annotated[
class MarkdownDocumentBackend(DeclarativeDocumentBackend): class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def _get_current_heading_level(self) -> int:
return max(self.header_to_group.keys()) if self.header_to_group else 0
def _get_current_heading_group(self) -> Optional[Union[DocItem, GroupItem]]:
return self.header_to_group.get(self._get_current_heading_level(), None)
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10): def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores # This regex will match any sequence of underscores
pattern = r"_+" pattern = r"_+"
@ -100,6 +109,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.md_table_buffer: list[str] = [] self.md_table_buffer: list[str] = []
self._html_blocks: int = 0 self._html_blocks: int = 0
self.header_to_group: dict[int, GroupItem] = {}
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8") text_stream = self.path_or_stream.getvalue().decode("utf-8")
@ -125,7 +136,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e ) from e
return return
def _close_table(self, doc: DoclingDocument): def _close_table(self, doc: DoclingDocument, parent_item: Optional[NodeItem]):
if self.in_table: if self.in_table:
_log.debug("=== TABLE START ===") _log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer: for md_table_row in self.md_table_buffer:
@ -179,7 +190,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
for tcell in tcells: for tcell in tcells:
table_data.table_cells.append(tcell) table_data.table_cells.append(tcell)
if len(tcells) > 0: if len(tcells) > 0:
doc.add_table(data=table_data) _ = doc.add_table(data=table_data, parent=parent_item)
return return
def _create_list_item( def _create_list_item(
@ -208,7 +219,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
level: int, level: int,
formatting: Optional[Formatting] = None, formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None,
): ) -> TitleItem | SectionHeaderItem:
if level == 1: if level == 1:
item = doc.add_title( item = doc.add_title(
text=text, text=text,
@ -244,25 +255,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if element in visited: if element in visited:
return return
if parent_item is None:
parent_item = self._get_current_heading_group()
# Iterates over all elements in the AST # Iterates over all elements in the AST
# Check for different element types and process relevant details # Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0: if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self._close_table(doc) self._close_table(doc, parent_item)
_log.debug( _log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
) )
if len(element.children) > 1: # inline group will be created further down while self._get_current_heading_level() >= element.level:
parent_item = self._create_heading_item( _ = self.header_to_group.pop(self._get_current_heading_level())
doc=doc,
parent_item=parent_item, parent_item = doc.add_group(
text="", name=f"header-{element.level}",
level=element.level, label=GroupLabel.SECTION,
formatting=formatting, parent=self._get_current_heading_group(),
hyperlink=hyperlink, )
)
else: self.header_to_group[element.level] = parent_item
creation_stack.append(_HeadingCreationPayload(level=element.level))
parent_item = self._create_heading_item(
doc=doc,
parent_item=parent_item,
text="",
level=element.level,
formatting=formatting,
)
if len(element.children) > 1:
parent_item = doc.add_inline_group(parent=parent_item)
elif len(element.children) == 1:
creation_stack.append(_HeadingCreationPayload(heading_item=parent_item))
elif isinstance(element, marko.block.List): elif isinstance(element, marko.block.List):
has_non_empty_list_items = False has_non_empty_list_items = False
@ -271,7 +298,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
has_non_empty_list_items = True has_non_empty_list_items = True
break break
self._close_table(doc) self._close_table(doc, parent_item)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items: if has_non_empty_list_items:
parent_item = doc.add_list_group(name="list", parent=parent_item) parent_item = doc.add_list_group(name="list", parent=parent_item)
@ -283,7 +310,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
and isinstance((child := element.children[0]), marko.block.Paragraph) and isinstance((child := element.children[0]), marko.block.Paragraph)
and len(child.children) > 0 and len(child.children) > 0
): ):
self._close_table(doc) self._close_table(doc, parent_item)
_log.debug(" - List item") _log.debug(" - List item")
enumerated = ( enumerated = (
@ -304,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated)) creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
elif isinstance(element, marko.inline.Image): elif isinstance(element, marko.inline.Image):
self._close_table(doc) self._close_table(doc, parent_item)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}") _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
fig_caption: Optional[TextItem] = None fig_caption: Optional[TextItem] = None
@ -346,7 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else: else:
self.md_table_buffer.append(snippet_text) self.md_table_buffer.append(snippet_text)
elif snippet_text: elif snippet_text:
self._close_table(doc) self._close_table(doc, parent_item)
if creation_stack: if creation_stack:
while len(creation_stack) > 0: while len(creation_stack) > 0:
@ -368,17 +395,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
hyperlink=hyperlink, hyperlink=hyperlink,
) )
elif isinstance(to_create, _HeadingCreationPayload): elif isinstance(to_create, _HeadingCreationPayload):
# not keeping as parent_item as logic for correctly tracking to_create.heading_item.text = snippet_text
# that not implemented yet (section components not captured to_create.heading_item.formatting = formatting
# as heading children in marko) to_create.heading_item.hyperlink = hyperlink
self._create_heading_item(
doc=doc,
parent_item=parent_item,
text=snippet_text,
level=to_create.level,
formatting=formatting,
hyperlink=hyperlink,
)
else: else:
doc.add_text( doc.add_text(
label=DocItemLabel.TEXT, label=DocItemLabel.TEXT,
@ -389,7 +408,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) )
elif isinstance(element, marko.inline.CodeSpan): elif isinstance(element, marko.inline.CodeSpan):
self._close_table(doc) self._close_table(doc, parent_item)
_log.debug(f" - Code Span: {element.children}") _log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip() snippet_text = str(element.children).strip()
doc.add_code( doc.add_code(
@ -405,7 +424,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
and isinstance((child := element.children[0]), marko.inline.RawText) and isinstance((child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (child.children.strip())) > 0 and len(snippet_text := (child.children.strip())) > 0
): ):
self._close_table(doc) self._close_table(doc, parent_item)
_log.debug(f" - Code Block: {element.children}") _log.debug(f" - Code Block: {element.children}")
doc.add_code( doc.add_code(
parent=parent_item, parent=parent_item,
@ -421,7 +440,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.HTMLBlock): elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1 self._html_blocks += 1
self._close_table(doc) self._close_table(doc, parent_item)
_log.debug(f"HTML Block: {element}") _log.debug(f"HTML Block: {element}")
if ( if (
len(element.body) > 0 len(element.body) > 0
@ -438,13 +457,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) )
else: else:
if not isinstance(element, str): if not isinstance(element, str):
self._close_table(doc) self._close_table(doc, parent_item)
_log.debug(f"Some other element: {element}") _log.debug(f"Some other element: {element}")
if ( if isinstance(element, marko.block.Paragraph) and len(element.children) > 1:
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
and len(element.children) > 1
):
parent_item = doc.add_inline_group(parent=parent_item) parent_item = doc.add_inline_group(parent=parent_item)
processed_block_types = ( processed_block_types = (
@ -511,7 +527,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
creation_stack=[], creation_stack=[],
list_ordered_flag_by_ref={}, list_ordered_flag_by_ref={},
) )
self._close_table(doc=doc) # handle any last hanging table self._close_table(doc=doc, parent_item=None) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend # if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0: if self._html_blocks > 0:

10
tests/conftest.py Normal file
View File

@ -0,0 +1,10 @@
from pathlib import Path
import pytest
TEST_DATA_DIR = Path("./tests/data/")
@pytest.fixture
def test_data_directory() -> Path:
    """Provide the module-level ``TEST_DATA_DIR`` path to tests as a fixture."""
    return TEST_DATA_DIR

View File

@ -1,18 +1,7 @@
body: body:
children: children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/0' - $ref: '#/groups/0'
- $ref: '#/groups/1' - $ref: '#/groups/9'
- $ref: '#/groups/2'
- $ref: '#/texts/32'
- $ref: '#/groups/8'
- $ref: '#/groups/11'
- $ref: '#/texts/43'
- $ref: '#/texts/47'
- $ref: '#/texts/48'
- $ref: '#/groups/13'
- $ref: '#/tables/0'
content_layer: body content_layer: body
label: unspecified label: unspecified
name: _root_ name: _root_
@ -25,6 +14,18 @@ furniture:
name: _root_ name: _root_
self_ref: '#/furniture' self_ref: '#/furniture'
groups: groups:
- children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/1'
- $ref: '#/groups/2'
- $ref: '#/groups/3'
content_layer: body
label: section
name: header-1
parent:
$ref: '#/body'
self_ref: '#/groups/0'
- children: - children:
- $ref: '#/texts/2' - $ref: '#/texts/2'
- $ref: '#/texts/3' - $ref: '#/texts/3'
@ -35,8 +36,8 @@ groups:
label: inline label: inline
name: group name: group
parent: parent:
$ref: '#/body' $ref: '#/groups/0'
self_ref: '#/groups/0' self_ref: '#/groups/1'
- children: - children:
- $ref: '#/texts/7' - $ref: '#/texts/7'
- $ref: '#/texts/8' - $ref: '#/texts/8'
@ -45,8 +46,8 @@ groups:
label: inline label: inline
name: group name: group
parent: parent:
$ref: '#/body' $ref: '#/groups/0'
self_ref: '#/groups/1' self_ref: '#/groups/2'
- children: - children:
- $ref: '#/texts/10' - $ref: '#/texts/10'
- $ref: '#/texts/14' - $ref: '#/texts/14'
@ -59,8 +60,8 @@ groups:
label: list label: list
name: list name: list
parent: parent:
$ref: '#/body' $ref: '#/groups/0'
self_ref: '#/groups/2' self_ref: '#/groups/3'
- children: - children:
- $ref: '#/texts/11' - $ref: '#/texts/11'
- $ref: '#/texts/12' - $ref: '#/texts/12'
@ -70,7 +71,7 @@ groups:
name: group name: group
parent: parent:
$ref: '#/texts/10' $ref: '#/texts/10'
self_ref: '#/groups/3' self_ref: '#/groups/4'
- children: - children:
- $ref: '#/texts/15' - $ref: '#/texts/15'
- $ref: '#/texts/16' - $ref: '#/texts/16'
@ -80,7 +81,7 @@ groups:
name: group name: group
parent: parent:
$ref: '#/texts/14' $ref: '#/texts/14'
self_ref: '#/groups/4' self_ref: '#/groups/5'
- children: - children:
- $ref: '#/texts/19' - $ref: '#/texts/19'
- $ref: '#/texts/20' - $ref: '#/texts/20'
@ -90,7 +91,7 @@ groups:
name: group name: group
parent: parent:
$ref: '#/texts/18' $ref: '#/texts/18'
self_ref: '#/groups/5' self_ref: '#/groups/6'
- children: - children:
- $ref: '#/texts/23' - $ref: '#/texts/23'
- $ref: '#/texts/24' - $ref: '#/texts/24'
@ -100,7 +101,7 @@ groups:
name: group name: group
parent: parent:
$ref: '#/texts/22' $ref: '#/texts/22'
self_ref: '#/groups/6' self_ref: '#/groups/7'
- children: - children:
- $ref: '#/texts/29' - $ref: '#/texts/29'
- $ref: '#/texts/30' - $ref: '#/texts/30'
@ -110,7 +111,19 @@ groups:
name: group name: group
parent: parent:
$ref: '#/texts/28' $ref: '#/texts/28'
self_ref: '#/groups/7' self_ref: '#/groups/8'
- children:
- $ref: '#/texts/32'
- $ref: '#/groups/10'
- $ref: '#/groups/13'
- $ref: '#/groups/14'
- $ref: '#/groups/16'
content_layer: body
label: section
name: header-1
parent:
$ref: '#/body'
self_ref: '#/groups/9'
- children: - children:
- $ref: '#/texts/33' - $ref: '#/texts/33'
- $ref: '#/texts/36' - $ref: '#/texts/36'
@ -118,8 +131,8 @@ groups:
label: list label: list
name: list name: list
parent: parent:
$ref: '#/body' $ref: '#/groups/9'
self_ref: '#/groups/8' self_ref: '#/groups/10'
- children: - children:
- $ref: '#/texts/34' - $ref: '#/texts/34'
- $ref: '#/texts/35' - $ref: '#/texts/35'
@ -128,7 +141,7 @@ groups:
name: group name: group
parent: parent:
$ref: '#/texts/33' $ref: '#/texts/33'
self_ref: '#/groups/9' self_ref: '#/groups/11'
- children: - children:
- $ref: '#/texts/37' - $ref: '#/texts/37'
- $ref: '#/texts/38' - $ref: '#/texts/38'
@ -139,7 +152,7 @@ groups:
name: group name: group
parent: parent:
$ref: '#/texts/36' $ref: '#/texts/36'
self_ref: '#/groups/10' self_ref: '#/groups/12'
- children: - children:
- $ref: '#/texts/41' - $ref: '#/texts/41'
- $ref: '#/texts/42' - $ref: '#/texts/42'
@ -147,8 +160,17 @@ groups:
label: inline label: inline
name: group name: group
parent: parent:
$ref: '#/body' $ref: '#/groups/9'
self_ref: '#/groups/11' self_ref: '#/groups/13'
- children:
- $ref: '#/texts/43'
- $ref: '#/texts/47'
content_layer: body
label: section
name: header-2
parent:
$ref: '#/groups/9'
self_ref: '#/groups/14'
- children: - children:
- $ref: '#/texts/44' - $ref: '#/texts/44'
- $ref: '#/texts/45' - $ref: '#/texts/45'
@ -158,14 +180,24 @@ groups:
name: group name: group
parent: parent:
$ref: '#/texts/43' $ref: '#/texts/43'
self_ref: '#/groups/12' self_ref: '#/groups/15'
- children:
- $ref: '#/texts/48'
- $ref: '#/groups/17'
- $ref: '#/tables/0'
content_layer: body
label: section
name: header-2
parent:
$ref: '#/groups/9'
self_ref: '#/groups/16'
- children: [] - children: []
content_layer: body content_layer: body
label: inline label: inline
name: group name: group
parent: parent:
$ref: '#/body' $ref: '#/groups/16'
self_ref: '#/groups/13' self_ref: '#/groups/17'
key_value_items: [] key_value_items: []
name: inline_and_formatting name: inline_and_formatting
origin: origin:
@ -308,7 +340,7 @@ tables:
footnotes: [] footnotes: []
label: table label: table
parent: parent:
$ref: '#/body' $ref: '#/groups/16'
prov: [] prov: []
references: [] references: []
self_ref: '#/tables/0' self_ref: '#/tables/0'
@ -316,9 +348,9 @@ texts:
- children: [] - children: []
content_layer: body content_layer: body
label: title label: title
orig: Contribution guideline example orig: ''
parent: parent:
$ref: '#/body' $ref: '#/groups/0'
prov: [] prov: []
self_ref: '#/texts/0' self_ref: '#/texts/0'
text: Contribution guideline example text: Contribution guideline example
@ -327,7 +359,7 @@ texts:
label: text label: text
orig: This is simple. orig: This is simple.
parent: parent:
$ref: '#/body' $ref: '#/groups/0'
prov: [] prov: []
self_ref: '#/texts/1' self_ref: '#/texts/1'
text: This is simple. text: This is simple.
@ -336,7 +368,7 @@ texts:
label: text label: text
orig: Foo orig: Foo
parent: parent:
$ref: '#/groups/0' $ref: '#/groups/1'
prov: [] prov: []
self_ref: '#/texts/2' self_ref: '#/texts/2'
text: Foo text: Foo
@ -351,7 +383,7 @@ texts:
label: text label: text
orig: emphasis orig: emphasis
parent: parent:
$ref: '#/groups/0' $ref: '#/groups/1'
prov: [] prov: []
self_ref: '#/texts/3' self_ref: '#/texts/3'
text: emphasis text: emphasis
@ -366,7 +398,7 @@ texts:
label: text label: text
orig: strong emphasis orig: strong emphasis
parent: parent:
$ref: '#/groups/0' $ref: '#/groups/1'
prov: [] prov: []
self_ref: '#/texts/4' self_ref: '#/texts/4'
text: strong emphasis text: strong emphasis
@ -381,7 +413,7 @@ texts:
label: text label: text
orig: both orig: both
parent: parent:
$ref: '#/groups/0' $ref: '#/groups/1'
prov: [] prov: []
self_ref: '#/texts/5' self_ref: '#/texts/5'
text: both text: both
@ -390,7 +422,7 @@ texts:
label: text label: text
orig: . orig: .
parent: parent:
$ref: '#/groups/0' $ref: '#/groups/1'
prov: [] prov: []
self_ref: '#/texts/6' self_ref: '#/texts/6'
text: . text: .
@ -399,7 +431,7 @@ texts:
label: text label: text
orig: 'Create your feature branch:' orig: 'Create your feature branch:'
parent: parent:
$ref: '#/groups/1' $ref: '#/groups/2'
prov: [] prov: []
self_ref: '#/texts/7' self_ref: '#/texts/7'
text: 'Create your feature branch:' text: 'Create your feature branch:'
@ -411,7 +443,7 @@ texts:
label: code label: code
orig: git checkout -b feature/AmazingFeature orig: git checkout -b feature/AmazingFeature
parent: parent:
$ref: '#/groups/1' $ref: '#/groups/2'
prov: [] prov: []
references: [] references: []
self_ref: '#/texts/8' self_ref: '#/texts/8'
@ -421,19 +453,19 @@ texts:
label: text label: text
orig: . orig: .
parent: parent:
$ref: '#/groups/1' $ref: '#/groups/2'
prov: [] prov: []
self_ref: '#/texts/9' self_ref: '#/texts/9'
text: . text: .
- children: - children:
- $ref: '#/groups/3' - $ref: '#/groups/4'
content_layer: body content_layer: body
enumerated: true enumerated: true
label: list_item label: list_item
marker: '' marker: ''
orig: '' orig: ''
parent: parent:
$ref: '#/groups/2' $ref: '#/groups/3'
prov: [] prov: []
self_ref: '#/texts/10' self_ref: '#/texts/10'
text: '' text: ''
@ -442,7 +474,7 @@ texts:
label: text label: text
orig: Pull the orig: Pull the
parent: parent:
$ref: '#/groups/3' $ref: '#/groups/4'
prov: [] prov: []
self_ref: '#/texts/11' self_ref: '#/texts/11'
text: Pull the text: Pull the
@ -458,7 +490,7 @@ texts:
label: text label: text
orig: repository orig: repository
parent: parent:
$ref: '#/groups/3' $ref: '#/groups/4'
prov: [] prov: []
self_ref: '#/texts/12' self_ref: '#/texts/12'
text: repository text: repository
@ -467,19 +499,19 @@ texts:
label: text label: text
orig: . orig: .
parent: parent:
$ref: '#/groups/3' $ref: '#/groups/4'
prov: [] prov: []
self_ref: '#/texts/13' self_ref: '#/texts/13'
text: . text: .
- children: - children:
- $ref: '#/groups/4' - $ref: '#/groups/5'
content_layer: body content_layer: body
enumerated: true enumerated: true
label: list_item label: list_item
marker: '' marker: ''
orig: '' orig: ''
parent: parent:
$ref: '#/groups/2' $ref: '#/groups/3'
prov: [] prov: []
self_ref: '#/texts/14' self_ref: '#/texts/14'
text: '' text: ''
@ -488,7 +520,7 @@ texts:
label: text label: text
orig: Create your feature branch ( orig: Create your feature branch (
parent: parent:
$ref: '#/groups/4' $ref: '#/groups/5'
prov: [] prov: []
self_ref: '#/texts/15' self_ref: '#/texts/15'
text: Create your feature branch ( text: Create your feature branch (
@ -500,7 +532,7 @@ texts:
label: code label: code
orig: git checkout -b feature/AmazingFeature orig: git checkout -b feature/AmazingFeature
parent: parent:
$ref: '#/groups/4' $ref: '#/groups/5'
prov: [] prov: []
references: [] references: []
self_ref: '#/texts/16' self_ref: '#/texts/16'
@ -510,19 +542,19 @@ texts:
label: text label: text
orig: ) orig: )
parent: parent:
$ref: '#/groups/4' $ref: '#/groups/5'
prov: [] prov: []
self_ref: '#/texts/17' self_ref: '#/texts/17'
text: ) text: )
- children: - children:
- $ref: '#/groups/5' - $ref: '#/groups/6'
content_layer: body content_layer: body
enumerated: true enumerated: true
label: list_item label: list_item
marker: '' marker: ''
orig: '' orig: ''
parent: parent:
$ref: '#/groups/2' $ref: '#/groups/3'
prov: [] prov: []
self_ref: '#/texts/18' self_ref: '#/texts/18'
text: '' text: ''
@ -531,7 +563,7 @@ texts:
label: text label: text
orig: Commit your changes ( orig: Commit your changes (
parent: parent:
$ref: '#/groups/5' $ref: '#/groups/6'
prov: [] prov: []
self_ref: '#/texts/19' self_ref: '#/texts/19'
text: Commit your changes ( text: Commit your changes (
@ -543,7 +575,7 @@ texts:
label: code label: code
orig: git commit -m 'Add some AmazingFeature' orig: git commit -m 'Add some AmazingFeature'
parent: parent:
$ref: '#/groups/5' $ref: '#/groups/6'
prov: [] prov: []
references: [] references: []
self_ref: '#/texts/20' self_ref: '#/texts/20'
@ -553,19 +585,19 @@ texts:
label: text label: text
orig: ) orig: )
parent: parent:
$ref: '#/groups/5' $ref: '#/groups/6'
prov: [] prov: []
self_ref: '#/texts/21' self_ref: '#/texts/21'
text: ) text: )
- children: - children:
- $ref: '#/groups/6' - $ref: '#/groups/7'
content_layer: body content_layer: body
enumerated: true enumerated: true
label: list_item label: list_item
marker: '' marker: ''
orig: '' orig: ''
parent: parent:
$ref: '#/groups/2' $ref: '#/groups/3'
prov: [] prov: []
self_ref: '#/texts/22' self_ref: '#/texts/22'
text: '' text: ''
@ -574,7 +606,7 @@ texts:
label: text label: text
orig: Push to the branch ( orig: Push to the branch (
parent: parent:
$ref: '#/groups/6' $ref: '#/groups/7'
prov: [] prov: []
self_ref: '#/texts/23' self_ref: '#/texts/23'
text: Push to the branch ( text: Push to the branch (
@ -586,7 +618,7 @@ texts:
label: code label: code
orig: git push origin feature/AmazingFeature orig: git push origin feature/AmazingFeature
parent: parent:
$ref: '#/groups/6' $ref: '#/groups/7'
prov: [] prov: []
references: [] references: []
self_ref: '#/texts/24' self_ref: '#/texts/24'
@ -596,7 +628,7 @@ texts:
label: text label: text
orig: ) orig: )
parent: parent:
$ref: '#/groups/6' $ref: '#/groups/7'
prov: [] prov: []
self_ref: '#/texts/25' self_ref: '#/texts/25'
text: ) text: )
@ -607,7 +639,7 @@ texts:
marker: '' marker: ''
orig: Open a Pull Request orig: Open a Pull Request
parent: parent:
$ref: '#/groups/2' $ref: '#/groups/3'
prov: [] prov: []
self_ref: '#/texts/26' self_ref: '#/texts/26'
text: Open a Pull Request text: Open a Pull Request
@ -624,19 +656,19 @@ texts:
marker: '' marker: ''
orig: Whole list item has same formatting orig: Whole list item has same formatting
parent: parent:
$ref: '#/groups/2' $ref: '#/groups/3'
prov: [] prov: []
self_ref: '#/texts/27' self_ref: '#/texts/27'
text: Whole list item has same formatting text: Whole list item has same formatting
- children: - children:
- $ref: '#/groups/7' - $ref: '#/groups/8'
content_layer: body content_layer: body
enumerated: true enumerated: true
label: list_item label: list_item
marker: '' marker: ''
orig: '' orig: ''
parent: parent:
$ref: '#/groups/2' $ref: '#/groups/3'
prov: [] prov: []
self_ref: '#/texts/28' self_ref: '#/texts/28'
text: '' text: ''
@ -645,7 +677,7 @@ texts:
label: text label: text
orig: List item has orig: List item has
parent: parent:
$ref: '#/groups/7' $ref: '#/groups/8'
prov: [] prov: []
self_ref: '#/texts/29' self_ref: '#/texts/29'
text: List item has text: List item has
@ -660,7 +692,7 @@ texts:
label: text label: text
orig: mixed or partial orig: mixed or partial
parent: parent:
$ref: '#/groups/7' $ref: '#/groups/8'
prov: [] prov: []
self_ref: '#/texts/30' self_ref: '#/texts/30'
text: mixed or partial text: mixed or partial
@ -669,7 +701,7 @@ texts:
label: text label: text
orig: formatting orig: formatting
parent: parent:
$ref: '#/groups/7' $ref: '#/groups/8'
prov: [] prov: []
self_ref: '#/texts/31' self_ref: '#/texts/31'
text: formatting text: formatting
@ -682,21 +714,21 @@ texts:
strikethrough: false strikethrough: false
underline: false underline: false
label: title label: title
orig: Whole heading is italic orig: ''
parent: parent:
$ref: '#/body' $ref: '#/groups/9'
prov: [] prov: []
self_ref: '#/texts/32' self_ref: '#/texts/32'
text: Whole heading is italic text: Whole heading is italic
- children: - children:
- $ref: '#/groups/9' - $ref: '#/groups/11'
content_layer: body content_layer: body
enumerated: false enumerated: false
label: list_item label: list_item
marker: '' marker: ''
orig: '' orig: ''
parent: parent:
$ref: '#/groups/8' $ref: '#/groups/10'
prov: [] prov: []
self_ref: '#/texts/33' self_ref: '#/texts/33'
text: '' text: ''
@ -711,7 +743,7 @@ texts:
label: text label: text
orig: First orig: First
parent: parent:
$ref: '#/groups/9' $ref: '#/groups/11'
prov: [] prov: []
self_ref: '#/texts/34' self_ref: '#/texts/34'
text: First text: First
@ -720,19 +752,19 @@ texts:
label: text label: text
orig: ': Lorem ipsum.' orig: ': Lorem ipsum.'
parent: parent:
$ref: '#/groups/9' $ref: '#/groups/11'
prov: [] prov: []
self_ref: '#/texts/35' self_ref: '#/texts/35'
text: ': Lorem ipsum.' text: ': Lorem ipsum.'
- children: - children:
- $ref: '#/groups/10' - $ref: '#/groups/12'
content_layer: body content_layer: body
enumerated: false enumerated: false
label: list_item label: list_item
marker: '' marker: ''
orig: '' orig: ''
parent: parent:
$ref: '#/groups/8' $ref: '#/groups/10'
prov: [] prov: []
self_ref: '#/texts/36' self_ref: '#/texts/36'
text: '' text: ''
@ -747,7 +779,7 @@ texts:
label: text label: text
orig: Second orig: Second
parent: parent:
$ref: '#/groups/10' $ref: '#/groups/12'
prov: [] prov: []
self_ref: '#/texts/37' self_ref: '#/texts/37'
text: Second text: Second
@ -756,7 +788,7 @@ texts:
label: text label: text
orig: ': Dolor' orig: ': Dolor'
parent: parent:
$ref: '#/groups/10' $ref: '#/groups/12'
prov: [] prov: []
self_ref: '#/texts/38' self_ref: '#/texts/38'
text: ': Dolor' text: ': Dolor'
@ -768,7 +800,7 @@ texts:
label: code label: code
orig: sit orig: sit
parent: parent:
$ref: '#/groups/10' $ref: '#/groups/12'
prov: [] prov: []
references: [] references: []
self_ref: '#/texts/39' self_ref: '#/texts/39'
@ -778,7 +810,7 @@ texts:
label: text label: text
orig: amet. orig: amet.
parent: parent:
$ref: '#/groups/10' $ref: '#/groups/12'
prov: [] prov: []
self_ref: '#/texts/40' self_ref: '#/texts/40'
text: amet. text: amet.
@ -787,7 +819,7 @@ texts:
label: text label: text
orig: Some orig: Some
parent: parent:
$ref: '#/groups/11' $ref: '#/groups/13'
prov: [] prov: []
self_ref: '#/texts/41' self_ref: '#/texts/41'
text: Some text: Some
@ -805,19 +837,19 @@ texts:
label: code label: code
orig: formatted_code orig: formatted_code
parent: parent:
$ref: '#/groups/11' $ref: '#/groups/13'
prov: [] prov: []
references: [] references: []
self_ref: '#/texts/42' self_ref: '#/texts/42'
text: formatted_code text: formatted_code
- children: - children:
- $ref: '#/groups/12' - $ref: '#/groups/15'
content_layer: body content_layer: body
label: section_header label: section_header
level: 1 level: 1
orig: '' orig: ''
parent: parent:
$ref: '#/body' $ref: '#/groups/14'
prov: [] prov: []
self_ref: '#/texts/43' self_ref: '#/texts/43'
text: '' text: ''
@ -832,7 +864,7 @@ texts:
label: text label: text
orig: Partially formatted orig: Partially formatted
parent: parent:
$ref: '#/groups/12' $ref: '#/groups/15'
prov: [] prov: []
self_ref: '#/texts/44' self_ref: '#/texts/44'
text: Partially formatted text: Partially formatted
@ -841,7 +873,7 @@ texts:
label: text label: text
orig: heading to_escape orig: heading to_escape
parent: parent:
$ref: '#/groups/12' $ref: '#/groups/15'
prov: [] prov: []
self_ref: '#/texts/45' self_ref: '#/texts/45'
text: heading to_escape text: heading to_escape
@ -853,7 +885,7 @@ texts:
label: code label: code
orig: not_to_escape orig: not_to_escape
parent: parent:
$ref: '#/groups/12' $ref: '#/groups/15'
prov: [] prov: []
references: [] references: []
self_ref: '#/texts/46' self_ref: '#/texts/46'
@ -864,7 +896,7 @@ texts:
label: text label: text
orig: $$E=mc^2$$ orig: $$E=mc^2$$
parent: parent:
$ref: '#/body' $ref: '#/groups/14'
prov: [] prov: []
self_ref: '#/texts/47' self_ref: '#/texts/47'
text: $$E=mc^2$$ text: $$E=mc^2$$
@ -872,9 +904,9 @@ texts:
content_layer: body content_layer: body
label: section_header label: section_header
level: 1 level: 1
orig: Table Heading orig: ''
parent: parent:
$ref: '#/body' $ref: '#/groups/16'
prov: [] prov: []
self_ref: '#/texts/48' self_ref: '#/texts/48'
text: Table Heading text: Table Heading

View File

@ -1,57 +1,183 @@
import io
from pathlib import Path from pathlib import Path
from textwrap import dedent
from typing import Annotated
import pytest
from _pytest.mark import ParameterSet
from docling_core.types.doc.document import DoclingDocument, GroupItem, RefItem
from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DoclingDocument, InputDocument from docling.datamodel.document import (
InputDocument,
)
from tests.conftest import TEST_DATA_DIR
from tests.verify_utils import CONFID_PREC, COORD_PREC from tests.verify_utils import CONFID_PREC, COORD_PREC
from .test_data_gen_flag import GEN_TEST_DATA from .test_data_gen_flag import GEN_TEST_DATA
# Follow the shared GEN_TEST_DATA flag. Prefixing it with `True or` would
# short-circuit to True and make the flag dead.
# NOTE(review): the code that consumes GENERATE is outside this view; presumably
# it rewrites the ground-truth snapshots when set — confirm.
GENERATE = GEN_TEST_DATA
def test_convert_valid(): ALSO_GENERATE_YAML = ["inline_and_formatting"]
fmt = InputFormat.MD """A list of document names that should also be generated as yaml"""
cls = MarkdownDocumentBackend
root_path = Path("tests") / "data" # Test Input Directories
relevant_paths = sorted((root_path / "md").rglob("*.md")) INPUT_DIR = TEST_DATA_DIR / "md"
assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting"] # Test Output Directories
SNAPSHOT_DIR = TEST_DATA_DIR / "groundtruth" / "docling_v2"
for in_path in relevant_paths: TestCase = Annotated[tuple[str, Path, Path], "test_name, in_file, snapshot_file"]
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
in_doc = InputDocument(
path_or_stream=in_path, def markdown_test_data() -> list[ParameterSet]:
format=fmt, """Returns test cases for each of our input markdown files"""
backend=cls,
test_case_paths = sorted(INPUT_DIR.glob(pattern="*.md"), key=lambda x: x.name)
test_cases: list[ParameterSet] = []
for test_case_path in test_case_paths:
name: str = test_case_path.stem
markdown_document_path: Path = test_case_path.resolve()
markdown_snapshot_path: Path = SNAPSHOT_DIR / f"{name}.md.md"
yaml_snapshot_path: Path | None = (
SNAPSHOT_DIR / f"{name}.md.yaml" if name in ALSO_GENERATE_YAML else None
) )
backend = cls(
in_doc=in_doc, test_cases.append(
path_or_stream=in_path, pytest.param(
markdown_document_path,
markdown_snapshot_path,
yaml_snapshot_path,
id=name,
)
) )
assert backend.is_valid()
act_doc = backend.convert() return test_cases
act_data = act_doc.export_to_markdown()
if GEN_TEST_DATA:
with open(md_gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n")
if in_path.stem in yaml_filter: @pytest.mark.parametrize(
with open(yaml_gt_path, mode="w", encoding="utf-8") as f: ("markdown_document_path", "markdown_snapshot_path", "yaml_snapshot_path"),
act_doc.save_as_yaml( markdown_test_data(),
yaml_gt_path, )
coord_precision=COORD_PREC, def test_convert_markdown(
confid_precision=CONFID_PREC, markdown_document_path: Path,
) markdown_snapshot_path: Path,
yaml_snapshot_path: Path | None,
):
"""Test that the Markdown backend can:
1) convert the input markdown file to a DoclingDocument
2) export the markdown (and optionally, yaml) and verify it matches the committed snapshot
"""
if not GENERATE and not markdown_snapshot_path.exists():
pytest.skip(
f"Test requires {markdown_snapshot_path} to exist, you may need to generate it with GENERATE=True"
)
document_backend = MarkdownDocumentBackend(
in_doc=InputDocument(
path_or_stream=markdown_document_path,
format=InputFormat.MD,
backend=MarkdownDocumentBackend,
),
path_or_stream=markdown_document_path,
)
assert document_backend.is_valid()
try:
out_docling_document: DoclingDocument = document_backend.convert()
except Exception as e:
pytest.skip(f"Error converting {markdown_document_path}: {e}")
# Validate the YAML/JSON Export
if yaml_snapshot_path:
if GENERATE:
out_docling_document.save_as_yaml(
yaml_snapshot_path,
coord_precision=COORD_PREC,
confid_precision=CONFID_PREC,
)
else: else:
with open(md_gt_path, encoding="utf-8") as f: assert out_docling_document == DoclingDocument.load_from_yaml(
exp_data = f.read().rstrip() yaml_snapshot_path
assert act_data == exp_data )
if in_path.stem in yaml_filter: # Validate the Markdown Export
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path) out_markdown: str = out_docling_document.export_to_markdown()
assert act_doc == exp_doc
if GENERATE:
_ = markdown_snapshot_path.write_text(out_markdown + "\n")
else:
assert (
out_markdown == markdown_snapshot_path.read_text(encoding="utf-8")
)
def test_convert_headers_to_groups():
    """Test that the Markdown backend can convert headers into hierarchical groups.

    Converts an in-memory markdown document with three heading levels and checks that:
    1) the body has exactly one child, a reference to ``#/groups/0``
    2) that group has four children (the header, its content and two subheaders)
    3) exporting the converted document to markdown reproduces the input text
    """
    # NOTE(review): blocks are separated by blank lines on the assumption that
    # export_to_markdown emits a blank line between blocks -- confirm against
    # the exporter before relying on the exact expected string below.
    input_document = dedent("""
        # Header 1

        some content under the header 1

        ## Header 2a

        some content under the header 2

        ### Header 3

        some content under the header 3

        ## Header 2b
    """)
    markdown_bytes = input_document.encode("utf-8")

    # Separate streams for the InputDocument and the backend; presumably each
    # consumer reads from the start, so they must not share a read position.
    in_doc = InputDocument(
        path_or_stream=io.BytesIO(markdown_bytes),
        format=InputFormat.MD,
        filename="headers_to_groups.md",
        backend=MarkdownDocumentBackend,
    )
    backend = MarkdownDocumentBackend(
        in_doc=in_doc,
        path_or_stream=io.BytesIO(markdown_bytes),
    )

    # Fail early with a clear signal if the backend rejected the input.
    assert backend.is_valid()

    act_doc: DoclingDocument = backend.convert()

    # The whole document hangs off a single top-level group.
    assert len(act_doc.body.children) == 1

    body_first_child_ref: RefItem = act_doc.body.children[0]
    assert isinstance(body_first_child_ref, RefItem)
    assert body_first_child_ref.cref == "#/groups/0"

    body_first_child = body_first_child_ref.resolve(act_doc)
    # Check the resolved node at runtime rather than trusting an annotation.
    assert isinstance(body_first_child, GroupItem)

    # The first child should have the header, content and two subheaders
    assert len(body_first_child.children) == 4

    act_data = act_doc.export_to_markdown()

    expected_output = dedent("""
        # Header 1

        some content under the header 1

        ## Header 2a

        some content under the header 2

        ### Header 3

        some content under the header 3

        ## Header 2b
    """).strip()

    assert act_data == expected_output