This commit is contained in:
William Easton 2025-07-23 14:02:04 +02:00 committed by GitHub
commit 92deef45f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 356 additions and 172 deletions

View File

@ -5,21 +5,24 @@ from copy import deepcopy
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import List, Literal, Optional, Set, Union
from typing import List, Literal, Optional, Set, Union, override
import marko
import marko.element
import marko.inline
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
NodeItem,
TableCell,
TableData,
TextItem,
)
from docling_core.types.doc.document import Formatting
from docling_core.types.doc.document import Formatting, SectionHeaderItem, TitleItem
from docling_core.types.doc.labels import GroupLabel
from marko import Markdown
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
from typing_extensions import Annotated
@ -45,7 +48,7 @@ class _PendingCreationType(str, Enum):
class _HeadingCreationPayload(BaseModel):
kind: Literal["heading"] = "heading"
level: int
heading_item: TitleItem | SectionHeaderItem
class _ListItemCreationPayload(BaseModel):
@ -63,6 +66,12 @@ _CreationPayload = Annotated[
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def _get_current_heading_level(self) -> int:
return max(self.header_to_group.keys()) if self.header_to_group else 0
def _get_current_heading_group(self) -> Optional[Union[DocItem, GroupItem]]:
return self.header_to_group.get(self._get_current_heading_level(), None)
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"
@ -100,6 +109,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.md_table_buffer: list[str] = []
self._html_blocks: int = 0
self.header_to_group: dict[int, GroupItem] = {}
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
@ -125,7 +136,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e
return
def _close_table(self, doc: DoclingDocument):
def _close_table(self, doc: DoclingDocument, parent_item: Optional[NodeItem]):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
@ -179,7 +190,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
for tcell in tcells:
table_data.table_cells.append(tcell)
if len(tcells) > 0:
doc.add_table(data=table_data)
_ = doc.add_table(data=table_data, parent=parent_item)
return
def _create_list_item(
@ -208,7 +219,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
level: int,
formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
) -> TitleItem | SectionHeaderItem:
if level == 1:
item = doc.add_title(
text=text,
@ -244,25 +255,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if element in visited:
return
if parent_item is None:
parent_item = self._get_current_heading_group()
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self._close_table(doc)
self._close_table(doc, parent_item)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
if len(element.children) > 1: # inline group will be created further down
parent_item = self._create_heading_item(
doc=doc,
parent_item=parent_item,
text="",
level=element.level,
formatting=formatting,
hyperlink=hyperlink,
)
else:
creation_stack.append(_HeadingCreationPayload(level=element.level))
while self._get_current_heading_level() >= element.level:
_ = self.header_to_group.pop(self._get_current_heading_level())
parent_item = doc.add_group(
name=f"header-{element.level}",
label=GroupLabel.SECTION,
parent=self._get_current_heading_group(),
)
self.header_to_group[element.level] = parent_item
parent_item = self._create_heading_item(
doc=doc,
parent_item=parent_item,
text="",
level=element.level,
formatting=formatting,
)
if len(element.children) > 1:
parent_item = doc.add_inline_group(parent=parent_item)
elif len(element.children) == 1:
creation_stack.append(_HeadingCreationPayload(heading_item=parent_item))
elif isinstance(element, marko.block.List):
has_non_empty_list_items = False
@ -271,7 +298,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
has_non_empty_list_items = True
break
self._close_table(doc)
self._close_table(doc, parent_item)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
parent_item = doc.add_list_group(name="list", parent=parent_item)
@ -283,7 +310,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
and isinstance((child := element.children[0]), marko.block.Paragraph)
and len(child.children) > 0
):
self._close_table(doc)
self._close_table(doc, parent_item)
_log.debug(" - List item")
enumerated = (
@ -304,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
elif isinstance(element, marko.inline.Image):
self._close_table(doc)
self._close_table(doc, parent_item)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
fig_caption: Optional[TextItem] = None
@ -346,7 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else:
self.md_table_buffer.append(snippet_text)
elif snippet_text:
self._close_table(doc)
self._close_table(doc, parent_item)
if creation_stack:
while len(creation_stack) > 0:
@ -368,17 +395,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
hyperlink=hyperlink,
)
elif isinstance(to_create, _HeadingCreationPayload):
# not keeping as parent_item as logic for correctly tracking
# that not implemented yet (section components not captured
# as heading children in marko)
self._create_heading_item(
doc=doc,
parent_item=parent_item,
text=snippet_text,
level=to_create.level,
formatting=formatting,
hyperlink=hyperlink,
)
to_create.heading_item.text = snippet_text
to_create.heading_item.formatting = formatting
to_create.heading_item.hyperlink = hyperlink
else:
doc.add_text(
label=DocItemLabel.TEXT,
@ -389,7 +408,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
)
elif isinstance(element, marko.inline.CodeSpan):
self._close_table(doc)
self._close_table(doc, parent_item)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_code(
@ -405,7 +424,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
and isinstance((child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (child.children.strip())) > 0
):
self._close_table(doc)
self._close_table(doc, parent_item)
_log.debug(f" - Code Block: {element.children}")
doc.add_code(
parent=parent_item,
@ -421,7 +440,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
self._close_table(doc)
self._close_table(doc, parent_item)
_log.debug(f"HTML Block: {element}")
if (
len(element.body) > 0
@ -438,13 +457,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
)
else:
if not isinstance(element, str):
self._close_table(doc)
self._close_table(doc, parent_item)
_log.debug(f"Some other element: {element}")
if (
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
and len(element.children) > 1
):
if isinstance(element, marko.block.Paragraph) and len(element.children) > 1:
parent_item = doc.add_inline_group(parent=parent_item)
processed_block_types = (
@ -511,7 +527,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
creation_stack=[],
list_ordered_flag_by_ref={},
)
self._close_table(doc=doc) # handle any last hanging table
self._close_table(doc=doc, parent_item=None) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:

10
tests/conftest.py Normal file
View File

@ -0,0 +1,10 @@
from pathlib import Path
import pytest
# Directory holding the test fixtures. Anchored to this file's location rather
# than the current working directory so the tests find their data no matter
# where pytest is invoked from.
TEST_DATA_DIR = Path(__file__).resolve().parent / "data"
@pytest.fixture
def test_data_directory() -> Path:
    """Pytest fixture exposing the shared test-data directory.

    Returns:
        The module-level ``TEST_DATA_DIR`` path.
    """
    return TEST_DATA_DIR

View File

@ -1,18 +1,7 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/0'
- $ref: '#/groups/1'
- $ref: '#/groups/2'
- $ref: '#/texts/32'
- $ref: '#/groups/8'
- $ref: '#/groups/11'
- $ref: '#/texts/43'
- $ref: '#/texts/47'
- $ref: '#/texts/48'
- $ref: '#/groups/13'
- $ref: '#/tables/0'
- $ref: '#/groups/9'
content_layer: body
label: unspecified
name: _root_
@ -25,6 +14,18 @@ furniture:
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/1'
- $ref: '#/groups/2'
- $ref: '#/groups/3'
content_layer: body
label: section
name: header-1
parent:
$ref: '#/body'
self_ref: '#/groups/0'
- children:
- $ref: '#/texts/2'
- $ref: '#/texts/3'
@ -35,8 +36,8 @@ groups:
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/0'
$ref: '#/groups/0'
self_ref: '#/groups/1'
- children:
- $ref: '#/texts/7'
- $ref: '#/texts/8'
@ -45,8 +46,8 @@ groups:
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/1'
$ref: '#/groups/0'
self_ref: '#/groups/2'
- children:
- $ref: '#/texts/10'
- $ref: '#/texts/14'
@ -59,8 +60,8 @@ groups:
label: list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/2'
$ref: '#/groups/0'
self_ref: '#/groups/3'
- children:
- $ref: '#/texts/11'
- $ref: '#/texts/12'
@ -70,7 +71,7 @@ groups:
name: group
parent:
$ref: '#/texts/10'
self_ref: '#/groups/3'
self_ref: '#/groups/4'
- children:
- $ref: '#/texts/15'
- $ref: '#/texts/16'
@ -80,7 +81,7 @@ groups:
name: group
parent:
$ref: '#/texts/14'
self_ref: '#/groups/4'
self_ref: '#/groups/5'
- children:
- $ref: '#/texts/19'
- $ref: '#/texts/20'
@ -90,7 +91,7 @@ groups:
name: group
parent:
$ref: '#/texts/18'
self_ref: '#/groups/5'
self_ref: '#/groups/6'
- children:
- $ref: '#/texts/23'
- $ref: '#/texts/24'
@ -100,7 +101,7 @@ groups:
name: group
parent:
$ref: '#/texts/22'
self_ref: '#/groups/6'
self_ref: '#/groups/7'
- children:
- $ref: '#/texts/29'
- $ref: '#/texts/30'
@ -110,7 +111,19 @@ groups:
name: group
parent:
$ref: '#/texts/28'
self_ref: '#/groups/7'
self_ref: '#/groups/8'
- children:
- $ref: '#/texts/32'
- $ref: '#/groups/10'
- $ref: '#/groups/13'
- $ref: '#/groups/14'
- $ref: '#/groups/16'
content_layer: body
label: section
name: header-1
parent:
$ref: '#/body'
self_ref: '#/groups/9'
- children:
- $ref: '#/texts/33'
- $ref: '#/texts/36'
@ -118,8 +131,8 @@ groups:
label: list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/8'
$ref: '#/groups/9'
self_ref: '#/groups/10'
- children:
- $ref: '#/texts/34'
- $ref: '#/texts/35'
@ -128,7 +141,7 @@ groups:
name: group
parent:
$ref: '#/texts/33'
self_ref: '#/groups/9'
self_ref: '#/groups/11'
- children:
- $ref: '#/texts/37'
- $ref: '#/texts/38'
@ -139,7 +152,7 @@ groups:
name: group
parent:
$ref: '#/texts/36'
self_ref: '#/groups/10'
self_ref: '#/groups/12'
- children:
- $ref: '#/texts/41'
- $ref: '#/texts/42'
@ -147,8 +160,17 @@ groups:
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/11'
$ref: '#/groups/9'
self_ref: '#/groups/13'
- children:
- $ref: '#/texts/43'
- $ref: '#/texts/47'
content_layer: body
label: section
name: header-2
parent:
$ref: '#/groups/9'
self_ref: '#/groups/14'
- children:
- $ref: '#/texts/44'
- $ref: '#/texts/45'
@ -158,14 +180,24 @@ groups:
name: group
parent:
$ref: '#/texts/43'
self_ref: '#/groups/12'
self_ref: '#/groups/15'
- children:
- $ref: '#/texts/48'
- $ref: '#/groups/17'
- $ref: '#/tables/0'
content_layer: body
label: section
name: header-2
parent:
$ref: '#/groups/9'
self_ref: '#/groups/16'
- children: []
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/13'
$ref: '#/groups/16'
self_ref: '#/groups/17'
key_value_items: []
name: inline_and_formatting
origin:
@ -308,7 +340,7 @@ tables:
footnotes: []
label: table
parent:
$ref: '#/body'
$ref: '#/groups/16'
prov: []
references: []
self_ref: '#/tables/0'
@ -316,9 +348,9 @@ texts:
- children: []
content_layer: body
label: title
orig: Contribution guideline example
orig: ''
parent:
$ref: '#/body'
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/0'
text: Contribution guideline example
@ -327,7 +359,7 @@ texts:
label: text
orig: This is simple.
parent:
$ref: '#/body'
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/1'
text: This is simple.
@ -336,7 +368,7 @@ texts:
label: text
orig: Foo
parent:
$ref: '#/groups/0'
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/2'
text: Foo
@ -351,7 +383,7 @@ texts:
label: text
orig: emphasis
parent:
$ref: '#/groups/0'
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/3'
text: emphasis
@ -366,7 +398,7 @@ texts:
label: text
orig: strong emphasis
parent:
$ref: '#/groups/0'
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/4'
text: strong emphasis
@ -381,7 +413,7 @@ texts:
label: text
orig: both
parent:
$ref: '#/groups/0'
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/5'
text: both
@ -390,7 +422,7 @@ texts:
label: text
orig: .
parent:
$ref: '#/groups/0'
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/6'
text: .
@ -399,7 +431,7 @@ texts:
label: text
orig: 'Create your feature branch:'
parent:
$ref: '#/groups/1'
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/7'
text: 'Create your feature branch:'
@ -411,7 +443,7 @@ texts:
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/1'
$ref: '#/groups/2'
prov: []
references: []
self_ref: '#/texts/8'
@ -421,19 +453,19 @@ texts:
label: text
orig: .
parent:
$ref: '#/groups/1'
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/9'
text: .
- children:
- $ref: '#/groups/3'
- $ref: '#/groups/4'
content_layer: body
enumerated: true
label: list_item
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/10'
text: ''
@ -442,7 +474,7 @@ texts:
label: text
orig: Pull the
parent:
$ref: '#/groups/3'
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/11'
text: Pull the
@ -458,7 +490,7 @@ texts:
label: text
orig: repository
parent:
$ref: '#/groups/3'
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/12'
text: repository
@ -467,19 +499,19 @@ texts:
label: text
orig: .
parent:
$ref: '#/groups/3'
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/13'
text: .
- children:
- $ref: '#/groups/4'
- $ref: '#/groups/5'
content_layer: body
enumerated: true
label: list_item
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/14'
text: ''
@ -488,7 +520,7 @@ texts:
label: text
orig: Create your feature branch (
parent:
$ref: '#/groups/4'
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/15'
text: Create your feature branch (
@ -500,7 +532,7 @@ texts:
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/4'
$ref: '#/groups/5'
prov: []
references: []
self_ref: '#/texts/16'
@ -510,19 +542,19 @@ texts:
label: text
orig: )
parent:
$ref: '#/groups/4'
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/17'
text: )
- children:
- $ref: '#/groups/5'
- $ref: '#/groups/6'
content_layer: body
enumerated: true
label: list_item
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/18'
text: ''
@ -531,7 +563,7 @@ texts:
label: text
orig: Commit your changes (
parent:
$ref: '#/groups/5'
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/19'
text: Commit your changes (
@ -543,7 +575,7 @@ texts:
label: code
orig: git commit -m 'Add some AmazingFeature'
parent:
$ref: '#/groups/5'
$ref: '#/groups/6'
prov: []
references: []
self_ref: '#/texts/20'
@ -553,19 +585,19 @@ texts:
label: text
orig: )
parent:
$ref: '#/groups/5'
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/21'
text: )
- children:
- $ref: '#/groups/6'
- $ref: '#/groups/7'
content_layer: body
enumerated: true
label: list_item
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/22'
text: ''
@ -574,7 +606,7 @@ texts:
label: text
orig: Push to the branch (
parent:
$ref: '#/groups/6'
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/23'
text: Push to the branch (
@ -586,7 +618,7 @@ texts:
label: code
orig: git push origin feature/AmazingFeature
parent:
$ref: '#/groups/6'
$ref: '#/groups/7'
prov: []
references: []
self_ref: '#/texts/24'
@ -596,7 +628,7 @@ texts:
label: text
orig: )
parent:
$ref: '#/groups/6'
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/25'
text: )
@ -607,7 +639,7 @@ texts:
marker: ''
orig: Open a Pull Request
parent:
$ref: '#/groups/2'
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/26'
text: Open a Pull Request
@ -624,19 +656,19 @@ texts:
marker: ''
orig: Whole list item has same formatting
parent:
$ref: '#/groups/2'
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/27'
text: Whole list item has same formatting
- children:
- $ref: '#/groups/7'
- $ref: '#/groups/8'
content_layer: body
enumerated: true
label: list_item
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/28'
text: ''
@ -645,7 +677,7 @@ texts:
label: text
orig: List item has
parent:
$ref: '#/groups/7'
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/29'
text: List item has
@ -660,7 +692,7 @@ texts:
label: text
orig: mixed or partial
parent:
$ref: '#/groups/7'
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/30'
text: mixed or partial
@ -669,7 +701,7 @@ texts:
label: text
orig: formatting
parent:
$ref: '#/groups/7'
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/31'
text: formatting
@ -682,21 +714,21 @@ texts:
strikethrough: false
underline: false
label: title
orig: Whole heading is italic
orig: ''
parent:
$ref: '#/body'
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/32'
text: Whole heading is italic
- children:
- $ref: '#/groups/9'
- $ref: '#/groups/11'
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: ''
parent:
$ref: '#/groups/8'
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/33'
text: ''
@ -711,7 +743,7 @@ texts:
label: text
orig: First
parent:
$ref: '#/groups/9'
$ref: '#/groups/11'
prov: []
self_ref: '#/texts/34'
text: First
@ -720,19 +752,19 @@ texts:
label: text
orig: ': Lorem ipsum.'
parent:
$ref: '#/groups/9'
$ref: '#/groups/11'
prov: []
self_ref: '#/texts/35'
text: ': Lorem ipsum.'
- children:
- $ref: '#/groups/10'
- $ref: '#/groups/12'
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: ''
parent:
$ref: '#/groups/8'
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/36'
text: ''
@ -747,7 +779,7 @@ texts:
label: text
orig: Second
parent:
$ref: '#/groups/10'
$ref: '#/groups/12'
prov: []
self_ref: '#/texts/37'
text: Second
@ -756,7 +788,7 @@ texts:
label: text
orig: ': Dolor'
parent:
$ref: '#/groups/10'
$ref: '#/groups/12'
prov: []
self_ref: '#/texts/38'
text: ': Dolor'
@ -768,7 +800,7 @@ texts:
label: code
orig: sit
parent:
$ref: '#/groups/10'
$ref: '#/groups/12'
prov: []
references: []
self_ref: '#/texts/39'
@ -778,7 +810,7 @@ texts:
label: text
orig: amet.
parent:
$ref: '#/groups/10'
$ref: '#/groups/12'
prov: []
self_ref: '#/texts/40'
text: amet.
@ -787,7 +819,7 @@ texts:
label: text
orig: Some
parent:
$ref: '#/groups/11'
$ref: '#/groups/13'
prov: []
self_ref: '#/texts/41'
text: Some
@ -805,19 +837,19 @@ texts:
label: code
orig: formatted_code
parent:
$ref: '#/groups/11'
$ref: '#/groups/13'
prov: []
references: []
self_ref: '#/texts/42'
text: formatted_code
- children:
- $ref: '#/groups/12'
- $ref: '#/groups/15'
content_layer: body
label: section_header
level: 1
orig: ''
parent:
$ref: '#/body'
$ref: '#/groups/14'
prov: []
self_ref: '#/texts/43'
text: ''
@ -832,7 +864,7 @@ texts:
label: text
orig: Partially formatted
parent:
$ref: '#/groups/12'
$ref: '#/groups/15'
prov: []
self_ref: '#/texts/44'
text: Partially formatted
@ -841,7 +873,7 @@ texts:
label: text
orig: heading to_escape
parent:
$ref: '#/groups/12'
$ref: '#/groups/15'
prov: []
self_ref: '#/texts/45'
text: heading to_escape
@ -853,7 +885,7 @@ texts:
label: code
orig: not_to_escape
parent:
$ref: '#/groups/12'
$ref: '#/groups/15'
prov: []
references: []
self_ref: '#/texts/46'
@ -864,7 +896,7 @@ texts:
label: text
orig: $$E=mc^2$$
parent:
$ref: '#/body'
$ref: '#/groups/14'
prov: []
self_ref: '#/texts/47'
text: $$E=mc^2$$
@ -872,9 +904,9 @@ texts:
content_layer: body
label: section_header
level: 1
orig: Table Heading
orig: ''
parent:
$ref: '#/body'
$ref: '#/groups/16'
prov: []
self_ref: '#/texts/48'
text: Table Heading

View File

@ -1,57 +1,183 @@
import io
from pathlib import Path
from textwrap import dedent
from typing import Annotated
import pytest
from _pytest.mark import ParameterSet
from docling_core.types.doc.document import DoclingDocument, GroupItem, RefItem
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DoclingDocument, InputDocument
from docling.datamodel.document import (
InputDocument,
)
from tests.conftest import TEST_DATA_DIR
from tests.verify_utils import CONFID_PREC, COORD_PREC
from .test_data_gen_flag import GEN_TEST_DATA
# Regenerate the snapshot files only when the shared test-data generation flag
# is set. The previous `True or GEN_TEST_DATA` always evaluated to True, which
# made every run overwrite the snapshots and skip all comparisons.
GENERATE = GEN_TEST_DATA
def test_convert_valid():
fmt = InputFormat.MD
cls = MarkdownDocumentBackend
ALSO_GENERATE_YAML = ["inline_and_formatting"]
"""A list of document names that should also be generated as yaml"""
root_path = Path("tests") / "data"
relevant_paths = sorted((root_path / "md").rglob("*.md"))
assert len(relevant_paths) > 0
# Test Input Directories
INPUT_DIR = TEST_DATA_DIR / "md"
yaml_filter = ["inline_and_formatting"]
# Test Output Directories
SNAPSHOT_DIR = TEST_DATA_DIR / "groundtruth" / "docling_v2"
for in_path in relevant_paths:
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
TestCase = Annotated[tuple[str, Path, Path], "test_name, in_file, snapshot_file"]
in_doc = InputDocument(
path_or_stream=in_path,
format=fmt,
backend=cls,
def markdown_test_data() -> list[ParameterSet]:
"""Returns test cases for each of our input markdown files"""
test_case_paths = sorted(INPUT_DIR.glob(pattern="*.md"), key=lambda x: x.name)
test_cases: list[ParameterSet] = []
for test_case_path in test_case_paths:
name: str = test_case_path.stem
markdown_document_path: Path = test_case_path.resolve()
markdown_snapshot_path: Path = SNAPSHOT_DIR / f"{name}.md.md"
yaml_snapshot_path: Path | None = (
SNAPSHOT_DIR / f"{name}.md.yaml" if name in ALSO_GENERATE_YAML else None
)
backend = cls(
in_doc=in_doc,
path_or_stream=in_path,
test_cases.append(
pytest.param(
markdown_document_path,
markdown_snapshot_path,
yaml_snapshot_path,
id=name,
)
)
assert backend.is_valid()
act_doc = backend.convert()
act_data = act_doc.export_to_markdown()
return test_cases
if GEN_TEST_DATA:
with open(md_gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n")
if in_path.stem in yaml_filter:
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
act_doc.save_as_yaml(
yaml_gt_path,
coord_precision=COORD_PREC,
confid_precision=CONFID_PREC,
)
@pytest.mark.parametrize(
("markdown_document_path", "markdown_snapshot_path", "yaml_snapshot_path"),
markdown_test_data(),
)
def test_convert_markdown(
markdown_document_path: Path,
markdown_snapshot_path: Path,
yaml_snapshot_path: Path | None,
):
"""Test that the Markdown backend can:
1) convert the input markdown file to a DoclingDocument
2) export the markdown (and optionally, yaml) and verify it matches the committed snapshot
"""
if not GENERATE and not markdown_snapshot_path.exists():
pytest.skip(
f"Test requires {markdown_snapshot_path} to exist, you may need to generate it with GENERATE=True"
)
document_backend = MarkdownDocumentBackend(
in_doc=InputDocument(
path_or_stream=markdown_document_path,
format=InputFormat.MD,
backend=MarkdownDocumentBackend,
),
path_or_stream=markdown_document_path,
)
assert document_backend.is_valid()
try:
out_docling_document: DoclingDocument = document_backend.convert()
except Exception as e:
pytest.skip(f"Error converting {markdown_document_path}: {e}")
# Validate the YAML/JSON Export
if yaml_snapshot_path:
if GENERATE:
out_docling_document.save_as_yaml(
yaml_snapshot_path,
coord_precision=COORD_PREC,
confid_precision=CONFID_PREC,
)
else:
with open(md_gt_path, encoding="utf-8") as f:
exp_data = f.read().rstrip()
assert act_data == exp_data
assert out_docling_document == DoclingDocument.load_from_yaml(
yaml_snapshot_path
)
if in_path.stem in yaml_filter:
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
assert act_doc == exp_doc
# Validate the Markdown Export
out_markdown: str = out_docling_document.export_to_markdown()
if GENERATE:
_ = markdown_snapshot_path.write_text(out_markdown + "\n")
else:
assert (
out_markdown == markdown_snapshot_path.read_text(encoding="utf-8")
)
def test_convert_headers_to_groups():
    """Test that the Markdown backend can convert headers into hierarchical groups"""
    # Markdown input: one level-1 header, two level-2 headers, and a level-3
    # header placed after the first level-2 header.
    input_document = dedent("""
# Header 1
some content under the header 1
## Header 2a
some content under the header 2
### Header 3
some content under the header 3
## Header 2b
""")
    # The InputDocument and the backend are each given their own BytesIO built
    # from the same encoded text.
    in_doc = InputDocument(
        path_or_stream=io.BytesIO(input_document.encode("utf-8")),
        format=InputFormat.MD,
        filename="headers_to_groups.md",
        backend=MarkdownDocumentBackend,
    )
    backend = MarkdownDocumentBackend(
        in_doc=in_doc,
        path_or_stream=io.BytesIO(input_document.encode("utf-8")),
    )
    act_doc: DoclingDocument = backend.convert()
    # The document body must hold exactly one child: a reference to the first
    # group of the document.
    assert len(act_doc.body.children) == 1
    body_first_child_ref: RefItem = act_doc.body.children[0]
    assert isinstance(body_first_child_ref, RefItem)
    assert body_first_child_ref.cref == "#/groups/0"
    body_first_child: GroupItem = body_first_child_ref.resolve(act_doc)
    # The first child should have the header, content and two subheaders
    assert len(body_first_child.children) == 4
    # Round trip: exporting the converted document back to markdown must
    # reproduce the expected text (compared after stripping surrounding
    # whitespace from the expectation).
    act_data = act_doc.export_to_markdown()
    expected_output = dedent("""
# Header 1
some content under the header 1
## Header 2a
some content under the header 2
### Header 3
some content under the header 3
## Header 2b
""").strip()
    assert act_data == expected_output