mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 03:24:59 +00:00
Support hierarchical markdown
This commit is contained in:
parent
95e70962f1
commit
df5c15195b
@ -5,21 +5,24 @@ from copy import deepcopy
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Literal, Optional, Set, Union
|
||||
from typing import List, Literal, Optional, Set, Union, override
|
||||
|
||||
import marko
|
||||
import marko.element
|
||||
import marko.inline
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupItem,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from docling_core.types.doc.document import Formatting, SectionHeaderItem, TitleItem
|
||||
from docling_core.types.doc.labels import GroupLabel
|
||||
from marko import Markdown
|
||||
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
||||
from typing_extensions import Annotated
|
||||
@ -45,7 +48,7 @@ class _PendingCreationType(str, Enum):
|
||||
|
||||
class _HeadingCreationPayload(BaseModel):
    """Pending-creation payload for a heading whose text is not yet known.

    The heading item is created up front (with empty text) when the heading
    element is encountered; this payload carries that item so its text,
    formatting and hyperlink can be filled in once the inline text is reached.
    """

    # Discriminator value identifying this payload kind.
    kind: Literal["heading"] = "heading"
    # The already-created heading item to be completed later. Spelled with
    # ``Union`` (as elsewhere in this module) because ``A | B`` between
    # classes is evaluated eagerly here and requires Python 3.10+.
    heading_item: Union[TitleItem, SectionHeaderItem]
|
||||
|
||||
|
||||
class _ListItemCreationPayload(BaseModel):
|
||||
@ -63,6 +66,12 @@ _CreationPayload = Annotated[
|
||||
|
||||
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def _get_current_heading_level(self) -> int:
|
||||
return max(self.header_to_group.keys()) if self.header_to_group else 0
|
||||
|
||||
def _get_current_heading_group(self) -> Optional[Union[DocItem, GroupItem]]:
    """Return the group registered for the current heading level, if any.

    Returns:
        The entry of ``self.header_to_group`` stored under the level reported
        by ``_get_current_heading_level()``, or ``None`` when no entry exists
        for that level.
    """
    current_level = self._get_current_heading_level()
    return self.header_to_group.get(current_level)
|
||||
|
||||
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||
# This regex will match any sequence of underscores
|
||||
pattern = r"_+"
|
||||
@ -100,6 +109,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.md_table_buffer: list[str] = []
|
||||
self._html_blocks: int = 0
|
||||
|
||||
self.header_to_group: dict[int, GroupItem] = {}
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||
@ -125,7 +136,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
) from e
|
||||
return
|
||||
|
||||
def _close_table(self, doc: DoclingDocument):
|
||||
def _close_table(self, doc: DoclingDocument, parent_item: Optional[NodeItem]):
|
||||
if self.in_table:
|
||||
_log.debug("=== TABLE START ===")
|
||||
for md_table_row in self.md_table_buffer:
|
||||
@ -179,7 +190,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
for tcell in tcells:
|
||||
table_data.table_cells.append(tcell)
|
||||
if len(tcells) > 0:
|
||||
doc.add_table(data=table_data)
|
||||
_ = doc.add_table(data=table_data, parent=parent_item)
|
||||
return
|
||||
|
||||
def _create_list_item(
|
||||
@ -208,7 +219,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
level: int,
|
||||
formatting: Optional[Formatting] = None,
|
||||
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||
):
|
||||
) -> TitleItem | SectionHeaderItem:
|
||||
if level == 1:
|
||||
item = doc.add_title(
|
||||
text=text,
|
||||
@ -244,25 +255,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
if element in visited:
|
||||
return
|
||||
|
||||
if parent_item is None:
|
||||
parent_item = self._get_current_heading_group()
|
||||
|
||||
# Iterates over all elements in the AST
|
||||
# Check for different element types and process relevant details
|
||||
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
||||
self._close_table(doc)
|
||||
self._close_table(doc, parent_item)
|
||||
_log.debug(
|
||||
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
||||
)
|
||||
|
||||
if len(element.children) > 1: # inline group will be created further down
|
||||
parent_item = self._create_heading_item(
|
||||
doc=doc,
|
||||
parent_item=parent_item,
|
||||
text="",
|
||||
level=element.level,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
creation_stack.append(_HeadingCreationPayload(level=element.level))
|
||||
while self._get_current_heading_level() >= element.level:
|
||||
_ = self.header_to_group.pop(self._get_current_heading_level())
|
||||
|
||||
parent_item = doc.add_group(
|
||||
name=f"header-{element.level}",
|
||||
label=GroupLabel.SECTION,
|
||||
parent=self._get_current_heading_group(),
|
||||
)
|
||||
|
||||
self.header_to_group[element.level] = parent_item
|
||||
|
||||
parent_item = self._create_heading_item(
|
||||
doc=doc,
|
||||
parent_item=parent_item,
|
||||
text="",
|
||||
level=element.level,
|
||||
formatting=formatting,
|
||||
)
|
||||
|
||||
|
||||
if len(element.children) > 1:
|
||||
parent_item = doc.add_inline_group(parent=parent_item)
|
||||
elif len(element.children) == 1:
|
||||
creation_stack.append(_HeadingCreationPayload(heading_item=parent_item))
|
||||
|
||||
elif isinstance(element, marko.block.List):
|
||||
has_non_empty_list_items = False
|
||||
@ -271,7 +298,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
has_non_empty_list_items = True
|
||||
break
|
||||
|
||||
self._close_table(doc)
|
||||
self._close_table(doc, parent_item)
|
||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||
if has_non_empty_list_items:
|
||||
parent_item = doc.add_list_group(name="list", parent=parent_item)
|
||||
@ -283,7 +310,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
||||
and len(child.children) > 0
|
||||
):
|
||||
self._close_table(doc)
|
||||
self._close_table(doc, parent_item)
|
||||
_log.debug(" - List item")
|
||||
|
||||
enumerated = (
|
||||
@ -304,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
|
||||
|
||||
elif isinstance(element, marko.inline.Image):
|
||||
self._close_table(doc)
|
||||
self._close_table(doc, parent_item)
|
||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||
|
||||
fig_caption: Optional[TextItem] = None
|
||||
@ -346,7 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
self.md_table_buffer.append(snippet_text)
|
||||
elif snippet_text:
|
||||
self._close_table(doc)
|
||||
self._close_table(doc, parent_item)
|
||||
|
||||
if creation_stack:
|
||||
while len(creation_stack) > 0:
|
||||
@ -368,17 +395,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
elif isinstance(to_create, _HeadingCreationPayload):
|
||||
# not keeping as parent_item as logic for correctly tracking
|
||||
# that not implemented yet (section components not captured
|
||||
# as heading children in marko)
|
||||
self._create_heading_item(
|
||||
doc=doc,
|
||||
parent_item=parent_item,
|
||||
text=snippet_text,
|
||||
level=to_create.level,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
to_create.heading_item.text = snippet_text
|
||||
to_create.heading_item.formatting = formatting
|
||||
to_create.heading_item.hyperlink = hyperlink
|
||||
else:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
@ -389,7 +408,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.CodeSpan):
|
||||
self._close_table(doc)
|
||||
self._close_table(doc, parent_item)
|
||||
_log.debug(f" - Code Span: {element.children}")
|
||||
snippet_text = str(element.children).strip()
|
||||
doc.add_code(
|
||||
@ -405,7 +424,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
and isinstance((child := element.children[0]), marko.inline.RawText)
|
||||
and len(snippet_text := (child.children.strip())) > 0
|
||||
):
|
||||
self._close_table(doc)
|
||||
self._close_table(doc, parent_item)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
doc.add_code(
|
||||
parent=parent_item,
|
||||
@ -421,7 +440,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif isinstance(element, marko.block.HTMLBlock):
|
||||
self._html_blocks += 1
|
||||
self._close_table(doc)
|
||||
self._close_table(doc, parent_item)
|
||||
_log.debug(f"HTML Block: {element}")
|
||||
if (
|
||||
len(element.body) > 0
|
||||
@ -438,13 +457,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
else:
|
||||
if not isinstance(element, str):
|
||||
self._close_table(doc)
|
||||
self._close_table(doc, parent_item)
|
||||
_log.debug(f"Some other element: {element}")
|
||||
|
||||
if (
|
||||
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
|
||||
and len(element.children) > 1
|
||||
):
|
||||
if isinstance(element, marko.block.Paragraph) and len(element.children) > 1:
|
||||
parent_item = doc.add_inline_group(parent=parent_item)
|
||||
|
||||
processed_block_types = (
|
||||
@ -511,7 +527,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
creation_stack=[],
|
||||
list_ordered_flag_by_ref={},
|
||||
)
|
||||
self._close_table(doc=doc) # handle any last hanging table
|
||||
self._close_table(doc=doc, parent_item=None) # handle any last hanging table
|
||||
|
||||
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||
if self._html_blocks > 0:
|
||||
|
10
tests/conftest.py
Normal file
10
tests/conftest.py
Normal file
@ -0,0 +1,10 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
TEST_DATA_DIR = Path("./tests/data/")
|
||||
|
||||
|
||||
@pytest.fixture
def test_data_directory() -> Path:
    """Expose the module-level ``TEST_DATA_DIR`` path to tests as a fixture."""
    return TEST_DATA_DIR
|
@ -1,18 +1,7 @@
|
||||
body:
|
||||
children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/texts/1'
|
||||
- $ref: '#/groups/0'
|
||||
- $ref: '#/groups/1'
|
||||
- $ref: '#/groups/2'
|
||||
- $ref: '#/texts/32'
|
||||
- $ref: '#/groups/8'
|
||||
- $ref: '#/groups/11'
|
||||
- $ref: '#/texts/43'
|
||||
- $ref: '#/texts/47'
|
||||
- $ref: '#/texts/48'
|
||||
- $ref: '#/groups/13'
|
||||
- $ref: '#/tables/0'
|
||||
- $ref: '#/groups/9'
|
||||
content_layer: body
|
||||
label: unspecified
|
||||
name: _root_
|
||||
@ -25,6 +14,18 @@ furniture:
|
||||
name: _root_
|
||||
self_ref: '#/furniture'
|
||||
groups:
|
||||
- children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/texts/1'
|
||||
- $ref: '#/groups/1'
|
||||
- $ref: '#/groups/2'
|
||||
- $ref: '#/groups/3'
|
||||
content_layer: body
|
||||
label: section
|
||||
name: header-1
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/0'
|
||||
- children:
|
||||
- $ref: '#/texts/2'
|
||||
- $ref: '#/texts/3'
|
||||
@ -35,8 +36,8 @@ groups:
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/0'
|
||||
$ref: '#/groups/0'
|
||||
self_ref: '#/groups/1'
|
||||
- children:
|
||||
- $ref: '#/texts/7'
|
||||
- $ref: '#/texts/8'
|
||||
@ -45,8 +46,8 @@ groups:
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/1'
|
||||
$ref: '#/groups/0'
|
||||
self_ref: '#/groups/2'
|
||||
- children:
|
||||
- $ref: '#/texts/10'
|
||||
- $ref: '#/texts/14'
|
||||
@ -59,8 +60,8 @@ groups:
|
||||
label: list
|
||||
name: list
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/2'
|
||||
$ref: '#/groups/0'
|
||||
self_ref: '#/groups/3'
|
||||
- children:
|
||||
- $ref: '#/texts/11'
|
||||
- $ref: '#/texts/12'
|
||||
@ -70,7 +71,7 @@ groups:
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/10'
|
||||
self_ref: '#/groups/3'
|
||||
self_ref: '#/groups/4'
|
||||
- children:
|
||||
- $ref: '#/texts/15'
|
||||
- $ref: '#/texts/16'
|
||||
@ -80,7 +81,7 @@ groups:
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/14'
|
||||
self_ref: '#/groups/4'
|
||||
self_ref: '#/groups/5'
|
||||
- children:
|
||||
- $ref: '#/texts/19'
|
||||
- $ref: '#/texts/20'
|
||||
@ -90,7 +91,7 @@ groups:
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/18'
|
||||
self_ref: '#/groups/5'
|
||||
self_ref: '#/groups/6'
|
||||
- children:
|
||||
- $ref: '#/texts/23'
|
||||
- $ref: '#/texts/24'
|
||||
@ -100,7 +101,7 @@ groups:
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/22'
|
||||
self_ref: '#/groups/6'
|
||||
self_ref: '#/groups/7'
|
||||
- children:
|
||||
- $ref: '#/texts/29'
|
||||
- $ref: '#/texts/30'
|
||||
@ -110,7 +111,19 @@ groups:
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/28'
|
||||
self_ref: '#/groups/7'
|
||||
self_ref: '#/groups/8'
|
||||
- children:
|
||||
- $ref: '#/texts/32'
|
||||
- $ref: '#/groups/10'
|
||||
- $ref: '#/groups/13'
|
||||
- $ref: '#/groups/14'
|
||||
- $ref: '#/groups/16'
|
||||
content_layer: body
|
||||
label: section
|
||||
name: header-1
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/9'
|
||||
- children:
|
||||
- $ref: '#/texts/33'
|
||||
- $ref: '#/texts/36'
|
||||
@ -118,8 +131,8 @@ groups:
|
||||
label: list
|
||||
name: list
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/8'
|
||||
$ref: '#/groups/9'
|
||||
self_ref: '#/groups/10'
|
||||
- children:
|
||||
- $ref: '#/texts/34'
|
||||
- $ref: '#/texts/35'
|
||||
@ -128,7 +141,7 @@ groups:
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/33'
|
||||
self_ref: '#/groups/9'
|
||||
self_ref: '#/groups/11'
|
||||
- children:
|
||||
- $ref: '#/texts/37'
|
||||
- $ref: '#/texts/38'
|
||||
@ -139,7 +152,7 @@ groups:
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/36'
|
||||
self_ref: '#/groups/10'
|
||||
self_ref: '#/groups/12'
|
||||
- children:
|
||||
- $ref: '#/texts/41'
|
||||
- $ref: '#/texts/42'
|
||||
@ -147,8 +160,17 @@ groups:
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/11'
|
||||
$ref: '#/groups/9'
|
||||
self_ref: '#/groups/13'
|
||||
- children:
|
||||
- $ref: '#/texts/43'
|
||||
- $ref: '#/texts/47'
|
||||
content_layer: body
|
||||
label: section
|
||||
name: header-2
|
||||
parent:
|
||||
$ref: '#/groups/9'
|
||||
self_ref: '#/groups/14'
|
||||
- children:
|
||||
- $ref: '#/texts/44'
|
||||
- $ref: '#/texts/45'
|
||||
@ -158,14 +180,24 @@ groups:
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/43'
|
||||
self_ref: '#/groups/12'
|
||||
self_ref: '#/groups/15'
|
||||
- children:
|
||||
- $ref: '#/texts/48'
|
||||
- $ref: '#/groups/17'
|
||||
- $ref: '#/tables/0'
|
||||
content_layer: body
|
||||
label: section
|
||||
name: header-2
|
||||
parent:
|
||||
$ref: '#/groups/9'
|
||||
self_ref: '#/groups/16'
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/13'
|
||||
$ref: '#/groups/16'
|
||||
self_ref: '#/groups/17'
|
||||
key_value_items: []
|
||||
name: inline_and_formatting
|
||||
origin:
|
||||
@ -308,7 +340,7 @@ tables:
|
||||
footnotes: []
|
||||
label: table
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
$ref: '#/groups/16'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/tables/0'
|
||||
@ -316,9 +348,9 @@ texts:
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: title
|
||||
orig: Contribution guideline example
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/0'
|
||||
text: Contribution guideline example
|
||||
@ -327,7 +359,7 @@ texts:
|
||||
label: text
|
||||
orig: This is simple.
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/1'
|
||||
text: This is simple.
|
||||
@ -336,7 +368,7 @@ texts:
|
||||
label: text
|
||||
orig: Foo
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/2'
|
||||
text: Foo
|
||||
@ -351,7 +383,7 @@ texts:
|
||||
label: text
|
||||
orig: emphasis
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/3'
|
||||
text: emphasis
|
||||
@ -366,7 +398,7 @@ texts:
|
||||
label: text
|
||||
orig: strong emphasis
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/4'
|
||||
text: strong emphasis
|
||||
@ -381,7 +413,7 @@ texts:
|
||||
label: text
|
||||
orig: both
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/5'
|
||||
text: both
|
||||
@ -390,7 +422,7 @@ texts:
|
||||
label: text
|
||||
orig: .
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/6'
|
||||
text: .
|
||||
@ -399,7 +431,7 @@ texts:
|
||||
label: text
|
||||
orig: 'Create your feature branch:'
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/7'
|
||||
text: 'Create your feature branch:'
|
||||
@ -411,7 +443,7 @@ texts:
|
||||
label: code
|
||||
orig: git checkout -b feature/AmazingFeature
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/8'
|
||||
@ -421,19 +453,19 @@ texts:
|
||||
label: text
|
||||
orig: .
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/9'
|
||||
text: .
|
||||
- children:
|
||||
- $ref: '#/groups/3'
|
||||
- $ref: '#/groups/4'
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/10'
|
||||
text: ''
|
||||
@ -442,7 +474,7 @@ texts:
|
||||
label: text
|
||||
orig: Pull the
|
||||
parent:
|
||||
$ref: '#/groups/3'
|
||||
$ref: '#/groups/4'
|
||||
prov: []
|
||||
self_ref: '#/texts/11'
|
||||
text: Pull the
|
||||
@ -458,7 +490,7 @@ texts:
|
||||
label: text
|
||||
orig: repository
|
||||
parent:
|
||||
$ref: '#/groups/3'
|
||||
$ref: '#/groups/4'
|
||||
prov: []
|
||||
self_ref: '#/texts/12'
|
||||
text: repository
|
||||
@ -467,19 +499,19 @@ texts:
|
||||
label: text
|
||||
orig: .
|
||||
parent:
|
||||
$ref: '#/groups/3'
|
||||
$ref: '#/groups/4'
|
||||
prov: []
|
||||
self_ref: '#/texts/13'
|
||||
text: .
|
||||
- children:
|
||||
- $ref: '#/groups/4'
|
||||
- $ref: '#/groups/5'
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/14'
|
||||
text: ''
|
||||
@ -488,7 +520,7 @@ texts:
|
||||
label: text
|
||||
orig: Create your feature branch (
|
||||
parent:
|
||||
$ref: '#/groups/4'
|
||||
$ref: '#/groups/5'
|
||||
prov: []
|
||||
self_ref: '#/texts/15'
|
||||
text: Create your feature branch (
|
||||
@ -500,7 +532,7 @@ texts:
|
||||
label: code
|
||||
orig: git checkout -b feature/AmazingFeature
|
||||
parent:
|
||||
$ref: '#/groups/4'
|
||||
$ref: '#/groups/5'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/16'
|
||||
@ -510,19 +542,19 @@ texts:
|
||||
label: text
|
||||
orig: )
|
||||
parent:
|
||||
$ref: '#/groups/4'
|
||||
$ref: '#/groups/5'
|
||||
prov: []
|
||||
self_ref: '#/texts/17'
|
||||
text: )
|
||||
- children:
|
||||
- $ref: '#/groups/5'
|
||||
- $ref: '#/groups/6'
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/18'
|
||||
text: ''
|
||||
@ -531,7 +563,7 @@ texts:
|
||||
label: text
|
||||
orig: Commit your changes (
|
||||
parent:
|
||||
$ref: '#/groups/5'
|
||||
$ref: '#/groups/6'
|
||||
prov: []
|
||||
self_ref: '#/texts/19'
|
||||
text: Commit your changes (
|
||||
@ -543,7 +575,7 @@ texts:
|
||||
label: code
|
||||
orig: git commit -m 'Add some AmazingFeature'
|
||||
parent:
|
||||
$ref: '#/groups/5'
|
||||
$ref: '#/groups/6'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/20'
|
||||
@ -553,19 +585,19 @@ texts:
|
||||
label: text
|
||||
orig: )
|
||||
parent:
|
||||
$ref: '#/groups/5'
|
||||
$ref: '#/groups/6'
|
||||
prov: []
|
||||
self_ref: '#/texts/21'
|
||||
text: )
|
||||
- children:
|
||||
- $ref: '#/groups/6'
|
||||
- $ref: '#/groups/7'
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/22'
|
||||
text: ''
|
||||
@ -574,7 +606,7 @@ texts:
|
||||
label: text
|
||||
orig: Push to the branch (
|
||||
parent:
|
||||
$ref: '#/groups/6'
|
||||
$ref: '#/groups/7'
|
||||
prov: []
|
||||
self_ref: '#/texts/23'
|
||||
text: Push to the branch (
|
||||
@ -586,7 +618,7 @@ texts:
|
||||
label: code
|
||||
orig: git push origin feature/AmazingFeature
|
||||
parent:
|
||||
$ref: '#/groups/6'
|
||||
$ref: '#/groups/7'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/24'
|
||||
@ -596,7 +628,7 @@ texts:
|
||||
label: text
|
||||
orig: )
|
||||
parent:
|
||||
$ref: '#/groups/6'
|
||||
$ref: '#/groups/7'
|
||||
prov: []
|
||||
self_ref: '#/texts/25'
|
||||
text: )
|
||||
@ -607,7 +639,7 @@ texts:
|
||||
marker: ''
|
||||
orig: Open a Pull Request
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/26'
|
||||
text: Open a Pull Request
|
||||
@ -624,19 +656,19 @@ texts:
|
||||
marker: ''
|
||||
orig: Whole list item has same formatting
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/27'
|
||||
text: Whole list item has same formatting
|
||||
- children:
|
||||
- $ref: '#/groups/7'
|
||||
- $ref: '#/groups/8'
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/28'
|
||||
text: ''
|
||||
@ -645,7 +677,7 @@ texts:
|
||||
label: text
|
||||
orig: List item has
|
||||
parent:
|
||||
$ref: '#/groups/7'
|
||||
$ref: '#/groups/8'
|
||||
prov: []
|
||||
self_ref: '#/texts/29'
|
||||
text: List item has
|
||||
@ -660,7 +692,7 @@ texts:
|
||||
label: text
|
||||
orig: mixed or partial
|
||||
parent:
|
||||
$ref: '#/groups/7'
|
||||
$ref: '#/groups/8'
|
||||
prov: []
|
||||
self_ref: '#/texts/30'
|
||||
text: mixed or partial
|
||||
@ -669,7 +701,7 @@ texts:
|
||||
label: text
|
||||
orig: formatting
|
||||
parent:
|
||||
$ref: '#/groups/7'
|
||||
$ref: '#/groups/8'
|
||||
prov: []
|
||||
self_ref: '#/texts/31'
|
||||
text: formatting
|
||||
@ -682,21 +714,21 @@ texts:
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: title
|
||||
orig: Whole heading is italic
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
$ref: '#/groups/9'
|
||||
prov: []
|
||||
self_ref: '#/texts/32'
|
||||
text: Whole heading is italic
|
||||
- children:
|
||||
- $ref: '#/groups/9'
|
||||
- $ref: '#/groups/11'
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/8'
|
||||
$ref: '#/groups/10'
|
||||
prov: []
|
||||
self_ref: '#/texts/33'
|
||||
text: ''
|
||||
@ -711,7 +743,7 @@ texts:
|
||||
label: text
|
||||
orig: First
|
||||
parent:
|
||||
$ref: '#/groups/9'
|
||||
$ref: '#/groups/11'
|
||||
prov: []
|
||||
self_ref: '#/texts/34'
|
||||
text: First
|
||||
@ -720,19 +752,19 @@ texts:
|
||||
label: text
|
||||
orig: ': Lorem ipsum.'
|
||||
parent:
|
||||
$ref: '#/groups/9'
|
||||
$ref: '#/groups/11'
|
||||
prov: []
|
||||
self_ref: '#/texts/35'
|
||||
text: ': Lorem ipsum.'
|
||||
- children:
|
||||
- $ref: '#/groups/10'
|
||||
- $ref: '#/groups/12'
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/8'
|
||||
$ref: '#/groups/10'
|
||||
prov: []
|
||||
self_ref: '#/texts/36'
|
||||
text: ''
|
||||
@ -747,7 +779,7 @@ texts:
|
||||
label: text
|
||||
orig: Second
|
||||
parent:
|
||||
$ref: '#/groups/10'
|
||||
$ref: '#/groups/12'
|
||||
prov: []
|
||||
self_ref: '#/texts/37'
|
||||
text: Second
|
||||
@ -756,7 +788,7 @@ texts:
|
||||
label: text
|
||||
orig: ': Dolor'
|
||||
parent:
|
||||
$ref: '#/groups/10'
|
||||
$ref: '#/groups/12'
|
||||
prov: []
|
||||
self_ref: '#/texts/38'
|
||||
text: ': Dolor'
|
||||
@ -768,7 +800,7 @@ texts:
|
||||
label: code
|
||||
orig: sit
|
||||
parent:
|
||||
$ref: '#/groups/10'
|
||||
$ref: '#/groups/12'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/39'
|
||||
@ -778,7 +810,7 @@ texts:
|
||||
label: text
|
||||
orig: amet.
|
||||
parent:
|
||||
$ref: '#/groups/10'
|
||||
$ref: '#/groups/12'
|
||||
prov: []
|
||||
self_ref: '#/texts/40'
|
||||
text: amet.
|
||||
@ -787,7 +819,7 @@ texts:
|
||||
label: text
|
||||
orig: Some
|
||||
parent:
|
||||
$ref: '#/groups/11'
|
||||
$ref: '#/groups/13'
|
||||
prov: []
|
||||
self_ref: '#/texts/41'
|
||||
text: Some
|
||||
@ -805,19 +837,19 @@ texts:
|
||||
label: code
|
||||
orig: formatted_code
|
||||
parent:
|
||||
$ref: '#/groups/11'
|
||||
$ref: '#/groups/13'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/42'
|
||||
text: formatted_code
|
||||
- children:
|
||||
- $ref: '#/groups/12'
|
||||
- $ref: '#/groups/15'
|
||||
content_layer: body
|
||||
label: section_header
|
||||
level: 1
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
$ref: '#/groups/14'
|
||||
prov: []
|
||||
self_ref: '#/texts/43'
|
||||
text: ''
|
||||
@ -832,7 +864,7 @@ texts:
|
||||
label: text
|
||||
orig: Partially formatted
|
||||
parent:
|
||||
$ref: '#/groups/12'
|
||||
$ref: '#/groups/15'
|
||||
prov: []
|
||||
self_ref: '#/texts/44'
|
||||
text: Partially formatted
|
||||
@ -841,7 +873,7 @@ texts:
|
||||
label: text
|
||||
orig: heading to_escape
|
||||
parent:
|
||||
$ref: '#/groups/12'
|
||||
$ref: '#/groups/15'
|
||||
prov: []
|
||||
self_ref: '#/texts/45'
|
||||
text: heading to_escape
|
||||
@ -853,7 +885,7 @@ texts:
|
||||
label: code
|
||||
orig: not_to_escape
|
||||
parent:
|
||||
$ref: '#/groups/12'
|
||||
$ref: '#/groups/15'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/46'
|
||||
@ -864,7 +896,7 @@ texts:
|
||||
label: text
|
||||
orig: $$E=mc^2$$
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
$ref: '#/groups/14'
|
||||
prov: []
|
||||
self_ref: '#/texts/47'
|
||||
text: $$E=mc^2$$
|
||||
@ -872,9 +904,9 @@ texts:
|
||||
content_layer: body
|
||||
label: section_header
|
||||
level: 1
|
||||
orig: Table Heading
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
$ref: '#/groups/16'
|
||||
prov: []
|
||||
self_ref: '#/texts/48'
|
||||
text: Table Heading
|
||||
|
@ -1,57 +1,183 @@
|
||||
import io
|
||||
from pathlib import Path
|
||||
from textwrap import dedent
|
||||
from typing import Annotated
|
||||
|
||||
import pytest
|
||||
from _pytest.mark import ParameterSet
|
||||
from docling_core.types.doc.document import DoclingDocument, GroupItem, RefItem
|
||||
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DoclingDocument, InputDocument
|
||||
from docling.datamodel.document import (
|
||||
InputDocument,
|
||||
)
|
||||
from tests.conftest import TEST_DATA_DIR
|
||||
from tests.verify_utils import CONFID_PREC, COORD_PREC
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
|
||||
# Regenerate snapshots only when the shared flag asks for it. Forcing this to
# ``True`` makes every run overwrite the committed snapshots and bypasses all
# comparisons, so the snapshot tests could never fail.
GENERATE = GEN_TEST_DATA
|
||||
|
||||
def test_convert_valid():
|
||||
fmt = InputFormat.MD
|
||||
cls = MarkdownDocumentBackend
|
||||
ALSO_GENERATE_YAML = ["inline_and_formatting"]
|
||||
"""A list of document names that should also be generated as yaml"""
|
||||
|
||||
root_path = Path("tests") / "data"
|
||||
relevant_paths = sorted((root_path / "md").rglob("*.md"))
|
||||
assert len(relevant_paths) > 0
|
||||
# Test Input Directories
|
||||
INPUT_DIR = TEST_DATA_DIR / "md"
|
||||
|
||||
yaml_filter = ["inline_and_formatting"]
|
||||
# Test Output Directories
|
||||
SNAPSHOT_DIR = TEST_DATA_DIR / "groundtruth" / "docling_v2"
|
||||
|
||||
for in_path in relevant_paths:
|
||||
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
||||
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
|
||||
TestCase = Annotated[tuple[str, Path, Path], "test_name, in_file, snapshot_file"]
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=in_path,
|
||||
format=fmt,
|
||||
backend=cls,
|
||||
|
||||
def markdown_test_data() -> list[ParameterSet]:
    """Build one pytest parameter set per Markdown file directly in ``INPUT_DIR``.

    Each parameter set holds, in order: the resolved input path, the Markdown
    snapshot path (``<stem>.md.md`` under ``SNAPSHOT_DIR``) and the YAML
    snapshot path (``<stem>.md.yaml``), which is ``None`` unless the stem is
    listed in ``ALSO_GENERATE_YAML``. The file stem is used as the test id.

    Returns:
        The parameter sets, ordered by input file name.
    """

    def _as_param(source: Path) -> ParameterSet:
        # Translate a single input file into its pytest parameter set.
        stem = source.stem
        yaml_snapshot = None
        if stem in ALSO_GENERATE_YAML:
            yaml_snapshot = SNAPSHOT_DIR / f"{stem}.md.yaml"
        return pytest.param(
            source.resolve(),
            SNAPSHOT_DIR / f"{stem}.md.md",
            yaml_snapshot,
            id=stem,
        )

    sources = sorted(INPUT_DIR.glob(pattern="*.md"), key=lambda path: path.name)
    return [_as_param(source) for source in sources]
|
||||
|
||||
if GEN_TEST_DATA:
|
||||
with open(md_gt_path, mode="w", encoding="utf-8") as f:
|
||||
f.write(f"{act_data}\n")
|
||||
|
||||
if in_path.stem in yaml_filter:
|
||||
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
|
||||
act_doc.save_as_yaml(
|
||||
yaml_gt_path,
|
||||
coord_precision=COORD_PREC,
|
||||
confid_precision=CONFID_PREC,
|
||||
)
|
||||
@pytest.mark.parametrize(
    ("markdown_document_path", "markdown_snapshot_path", "yaml_snapshot_path"),
    markdown_test_data(),
)
def test_convert_markdown(
    markdown_document_path: Path,
    markdown_snapshot_path: Path,
    yaml_snapshot_path: Path | None,
):
    """Test that the Markdown backend can:
    1) convert the input markdown file to a DoclingDocument
    2) export the markdown (and optionally, yaml) and verify it matches the committed snapshot

    When ``GENERATE`` is truthy the snapshots are (re)written instead of
    being compared.

    Args:
        markdown_document_path: Markdown file to convert.
        markdown_snapshot_path: Snapshot of the expected Markdown export.
        yaml_snapshot_path: Snapshot of the expected YAML export, or ``None``
            when no YAML snapshot is kept for this document.
    """

    if not GENERATE and not markdown_snapshot_path.exists():
        pytest.skip(
            f"Test requires {markdown_snapshot_path} to exist, you may need to generate it with GENERATE=True"
        )

    document_backend = MarkdownDocumentBackend(
        in_doc=InputDocument(
            path_or_stream=markdown_document_path,
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        ),
        path_or_stream=markdown_document_path,
    )

    assert document_backend.is_valid()

    # Let conversion errors propagate: reporting them as "skipped" would hide
    # exactly the regressions this test exists to catch.
    out_docling_document: DoclingDocument = document_backend.convert()

    # Validate the YAML/JSON Export
    if yaml_snapshot_path:
        if GENERATE:
            out_docling_document.save_as_yaml(
                yaml_snapshot_path,
                coord_precision=COORD_PREC,
                confid_precision=CONFID_PREC,
            )
        else:
            assert out_docling_document == DoclingDocument.load_from_yaml(
                yaml_snapshot_path
            )

    # Validate the Markdown Export
    out_markdown: str = out_docling_document.export_to_markdown()

    if GENERATE:
        # Write with an explicit encoding so it matches the read below
        # regardless of the platform default.
        _ = markdown_snapshot_path.write_text(out_markdown + "\n", encoding="utf-8")
    else:
        # The snapshot is stored with a trailing newline that the export does
        # not have; strip it before comparing, otherwise this can never match.
        expected_markdown = markdown_snapshot_path.read_text(encoding="utf-8").rstrip()
        assert out_markdown == expected_markdown
|
||||
|
||||
|
||||
def test_convert_headers_to_groups():
    """Test that the Markdown backend can convert headers into hierarchical groups

    An in-memory document with nested headings (levels 1, 2, 3, then 2 again)
    is converted; the test checks that the body holds a single top-level group
    with four children and that exporting back to Markdown reproduces the
    input text.
    """

    input_document = dedent("""
    # Header 1

    some content under the header 1

    ## Header 2a

    some content under the header 2

    ### Header 3

    some content under the header 3

    ## Header 2b
    """)

    # The same bytes are wrapped in two separate BytesIO objects: one for the
    # InputDocument and one for the backend.
    in_doc = InputDocument(
        path_or_stream=io.BytesIO(input_document.encode("utf-8")),
        format=InputFormat.MD,
        filename="headers_to_groups.md",
        backend=MarkdownDocumentBackend,
    )
    backend = MarkdownDocumentBackend(
        in_doc=in_doc,
        path_or_stream=io.BytesIO(input_document.encode("utf-8")),
    )

    act_doc: DoclingDocument = backend.convert()

    # Everything is expected to hang off one group created for "# Header 1".
    assert len(act_doc.body.children) == 1
    body_first_child_ref: RefItem = act_doc.body.children[0]
    assert isinstance(body_first_child_ref, RefItem)

    assert body_first_child_ref.cref == "#/groups/0"

    body_first_child: GroupItem = body_first_child_ref.resolve(act_doc)

    # The first child should have the header, content and two subheaders
    assert len(body_first_child.children) == 4

    act_data = act_doc.export_to_markdown()

    # Same text as the input, with the surrounding newlines stripped.
    expected_output = dedent("""
    # Header 1

    some content under the header 1

    ## Header 2a

    some content under the header 2

    ### Header 3

    some content under the header 3

    ## Header 2b
    """).strip()

    assert act_data == expected_output
|
||||
|
Loading…
Reference in New Issue
Block a user