diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index fb42547e..cd267406 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -5,7 +5,7 @@ from copy import deepcopy from enum import Enum from io import BytesIO from pathlib import Path -from typing import List, Literal, Optional, Set, Union +from typing import Literal, Optional, Union, cast import marko import marko.element @@ -14,6 +14,7 @@ from docling_core.types.doc import ( DocItemLabel, DoclingDocument, DocumentOrigin, + ListItem, NodeItem, TableCell, TableData, @@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) - _log.debug("MD INIT!!!") + _log.debug("Starting MarkdownDocumentBackend...") # Markdown file: self.path_or_stream = path_or_stream @@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): for md_table_row in self.md_table_buffer: _log.debug(md_table_row) _log.debug("=== TABLE END ===") - tcells: List[TableCell] = [] + tcells: list[TableCell] = [] result_table = [] for n, md_table_row in enumerate(self.md_table_buffer): data = [] @@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): element: marko.element.Element, depth: int, doc: DoclingDocument, - visited: Set[marko.element.Element], + visited: set[marko.element.Element], creation_stack: list[ _CreationPayload ], # stack for lazy item creation triggered deep in marko's AST (on RawText) list_ordered_flag_by_ref: dict[str, bool], + list_last_item_by_ref: dict[str, ListItem], parent_item: Optional[NodeItem] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif ( isinstance(element, marko.block.ListItem) - and len(element.children) == 1 + and len(element.children) > 0 and isinstance((child := element.children[0]), marko.block.Paragraph) and len(child.children) > 0 ): @@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): if parent_item else False ) - if len(child.children) > 1: # inline group will be created further down + non_list_children: list[marko.element.Element] = [ + item + for item in child.children + if not isinstance(item, marko.block.ListItem) + ] + if len(non_list_children) > 1: # inline group will be created further down + parent_ref: Optional[str] = ( + parent_item.self_ref if parent_item else None + ) parent_item = self._create_list_item( doc=doc, parent_item=parent_item, @@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): formatting=formatting, hyperlink=hyperlink, ) + if parent_ref: + list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item) else: creation_stack.append(_ListItemCreationPayload(enumerated=enumerated)) @@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): element.dest ) - elif isinstance(element, marko.inline.RawText): - _log.debug(f" - Paragraph (raw text): {element.children}") - snippet_text = element.children.strip() + elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)): + _log.debug(f" - RawText/Literal: {element.children}") + snippet_text = ( + element.children.strip() if isinstance(element.children, str) else "" + ) # Detect start of the table: if "|" in snippet_text or self.in_table: # most likely part of the markdown table @@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): if parent_item else False ) + parent_ref = parent_item.self_ref if parent_item else None parent_item = self._create_list_item( doc=doc, parent_item=parent_item, @@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): formatting=formatting, hyperlink=hyperlink, ) + if parent_ref: + list_last_item_by_ref[parent_ref] = cast( + ListItem, parent_item + ) + elif isinstance(to_create, _HeadingCreationPayload): # not keeping as parent_item as logic for correctly tracking # that not implemented yet (section components not captured @@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): element, processed_block_types ): for child in element.children: + if ( + isinstance(element, marko.block.ListItem) + and isinstance(child, marko.block.List) + and parent_item + and list_last_item_by_ref.get(parent_item.self_ref, None) + ): + _log.debug( + f"walking into new List hanging from item of parent list {parent_item.self_ref}" + ) + parent_item = list_last_item_by_ref[parent_item.self_ref] + self._iterate_elements( element=child, depth=depth + 1, @@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): visited=visited, creation_stack=creation_stack, list_ordered_flag_by_ref=list_ordered_flag_by_ref, + list_last_item_by_ref=list_last_item_by_ref, parent_item=parent_item, formatting=formatting, hyperlink=hyperlink, @@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): return False @classmethod - def supported_formats(cls) -> Set[InputFormat]: + def supported_formats(cls) -> set[InputFormat]: return {InputFormat.MD} def convert(self) -> DoclingDocument: @@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): visited=set(), creation_stack=[], list_ordered_flag_by_ref={}, + list_last_item_by_ref={}, ) self._close_table(doc=doc) # handle any last hanging table @@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): ]: html_str = _restore_original_html(txt=html_str, regex=regex) self._html_blocks = 0 - # delegate to HTML backend stream = BytesIO(bytes(html_str, encoding="utf-8")) in_doc = InputDocument( diff --git a/tests/data/groundtruth/docling_v2/mixed_without_h1.md.md b/tests/data/groundtruth/docling_v2/mixed_without_h1.md.md index 5f76d50c..c3c162b7 100644 --- a/tests/data/groundtruth/docling_v2/mixed_without_h1.md.md +++ b/tests/data/groundtruth/docling_v2/mixed_without_h1.md.md @@ -3,6 +3,6 @@ - A. first - subitem - B. second - 1. strange +- 2 . strange The end! diff --git a/tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml b/tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml new file mode 100644 index 00000000..402f47d1 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml @@ -0,0 +1,139 @@ +body: + children: + - $ref: '#/texts/0' + - $ref: '#/texts/1' + - $ref: '#/groups/0' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/body' +form_items: [] +furniture: + children: [] + content_layer: furniture + label: unspecified + name: _root_ + self_ref: '#/furniture' +groups: +- children: + - $ref: '#/texts/2' + content_layer: body + label: section + name: header-1 + parent: + $ref: '#/body' + self_ref: '#/groups/0' +- children: + - $ref: '#/texts/3' + - $ref: '#/texts/5' + - $ref: '#/texts/6' + content_layer: body + label: list + name: list + parent: + $ref: '#/texts/2' + self_ref: '#/groups/1' +- children: + - $ref: '#/texts/4' + content_layer: body + label: list + name: list + parent: + $ref: '#/texts/3' + self_ref: '#/groups/2' +key_value_items: [] +name: mixed_without_h1 +origin: + binary_hash: 7394721163373597328 + filename: mixed_without_h1.md + mimetype: text/html +pages: {} +pictures: [] +schema_name: DoclingDocument +tables: [] +texts: +- children: [] + content_layer: furniture + label: title + orig: mixed_without_h1 + parent: + $ref: '#/body' + prov: [] + self_ref: '#/texts/0' + text: mixed_without_h1 +- children: [] + content_layer: furniture + label: text + orig: Content before first heading + parent: + $ref: '#/body' + prov: [] + self_ref: '#/texts/1' + text: Content before first heading +- children: + - $ref: '#/groups/1' + - $ref: '#/texts/7' + content_layer: body + label: section_header + level: 1 + orig: Some heading + parent: + $ref: '#/groups/0' + prov: [] + self_ref: '#/texts/2' + text: Some heading +- children: + - $ref: '#/groups/2' + content_layer: body + enumerated: false + label: list_item + marker: '' + orig: A. first + parent: + $ref: '#/groups/1' + prov: [] + self_ref: '#/texts/3' + text: A. first +- children: [] + content_layer: body + enumerated: false + label: list_item + marker: '' + orig: subitem + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/4' + text: subitem +- children: [] + content_layer: body + enumerated: false + label: list_item + marker: '' + orig: B. second + parent: + $ref: '#/groups/1' + prov: [] + self_ref: '#/texts/5' + text: B. second +- children: [] + content_layer: body + enumerated: false + label: list_item + marker: '' + orig: 2 . strange + parent: + $ref: '#/groups/1' + prov: [] + self_ref: '#/texts/6' + text: 2 . strange +- children: [] + content_layer: body + label: text + orig: The end! + parent: + $ref: '#/texts/2' + prov: [] + self_ref: '#/texts/7' + text: The end! +version: 1.5.0 diff --git a/tests/data/md/mixed_without_h1.md b/tests/data/md/mixed_without_h1.md index efc85a00..9a5593dd 100644 --- a/tests/data/md/mixed_without_h1.md +++ b/tests/data/md/mixed_without_h1.md @@ -7,6 +7,6 @@ Content before first heading - A. first - subitem - B. second -- 2. strange +- 2\. strange The end! diff --git a/tests/test_backend_markdown.py b/tests/test_backend_markdown.py index 30865668..8a0d7b4f 100644 --- a/tests/test_backend_markdown.py +++ b/tests/test_backend_markdown.py @@ -16,7 +16,7 @@ def test_convert_valid(): relevant_paths = sorted((root_path / "md").rglob("*.md")) assert len(relevant_paths) > 0 - yaml_filter = ["inline_and_formatting"] + yaml_filter = ["inline_and_formatting", "mixed_without_h1"] for in_path in relevant_paths: md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md" @@ -41,12 +41,11 @@ def test_convert_valid(): f.write(f"{act_data}\n") if in_path.stem in yaml_filter: - with open(yaml_gt_path, mode="w", encoding="utf-8") as f: - act_doc.save_as_yaml( - yaml_gt_path, - coord_precision=COORD_PREC, - confid_precision=CONFID_PREC, - ) + act_doc.save_as_yaml( + yaml_gt_path, + coord_precision=COORD_PREC, + confid_precision=CONFID_PREC, + ) else: with open(md_gt_path, encoding="utf-8") as f: exp_data = f.read().rstrip() @@ -54,4 +53,4 @@ def test_convert_valid(): if in_path.stem in yaml_filter: exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path) - assert act_doc == exp_doc + assert act_doc == exp_doc, f"export to yaml failed on {in_path}" diff --git a/uv.lock b/uv.lock index 4975799a..bb31e3fe 100644 --- a/uv.lock +++ b/uv.lock @@ -982,7 +982,7 @@ examples = [ [[package]] name = "docling-core" -version = "2.42.0" +version = "2.43.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jsonref" }, @@ -996,9 +996,9 @@ dependencies = [ { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/60/c9/f5555f8efbbbecce858e78791fbe0b9465c3c91ea917a3a3acdb1c3c9887/docling_core-2.42.0.tar.gz", hash = "sha256:cf2bb9e889920bac1d94412bd89c10e647419b6d5f7fe5e6f71ed732eb8f24f6", size = 154657, upload-time = "2025-07-09T12:27:34.858Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/eb/c5af5ab617ca162fac7a1b9b89db6e52c33beb50b083b4eed858cea4f4b3/docling_core-2.43.1.tar.gz", hash = "sha256:8bc76879439e4ef6f65e60621fc70e6c81e02cb7490b08a12e416bfb05593180", size = 155583, upload-time = "2025-07-23T14:18:34.149Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0d/e4/685bb1b38ca120fdffc1c24f1ce54229ff996e5cad50a9c9dd39b671cb83/docling_core-2.42.0-py3-none-any.whl", hash = "sha256:0774391f335217a5aec8357977e66b63b6e8c9d821c56103de54c526eab92ed6", size = 158101, upload-time = "2025-07-09T12:27:33.147Z" }, + { url = "https://files.pythonhosted.org/packages/de/a1/25eafa2cfd8e73ff15a23e74d3698dac7608e1ca984081728788dd1444df/docling_core-2.43.1-py3-none-any.whl", hash = "sha256:24364a2344b3324a55fb4eba8cf2396345a90ca782766daa78412b6cdef00776", size = 159318, upload-time = "2025-07-23T14:18:32.576Z" }, ] [package.optional-dependencies]