From 93683299737a8892e9cbdbc36abc6d628305b8f7 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Thu, 19 Jun 2025 16:40:05 +0200 Subject: [PATCH] add change and updated test data Signed-off-by: Panos Vagenas --- docling/backend/md_backend.py | 172 +++++++++++++----- .../docling_v2/inline_and_formatting.md.md | 4 +- .../docling_v2/inline_and_formatting.md.yaml | 18 +- 3 files changed, 145 insertions(+), 49 deletions(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index b8b0e6d0..9d6c9515 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -2,9 +2,10 @@ import logging import re import warnings from copy import deepcopy +from enum import Enum from io import BytesIO from pathlib import Path -from typing import List, Optional, Set, Union +from typing import List, Literal, Optional, Set, Union import marko import marko.element @@ -21,7 +22,8 @@ from docling_core.types.doc import ( ) from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList from marko import Markdown -from pydantic import AnyUrl, TypeAdapter +from pydantic import AnyUrl, BaseModel, Field, TypeAdapter +from typing_extensions import Annotated from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend @@ -35,6 +37,31 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#" _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#" +class _PendingCreationType(str, Enum): + """Kinds of document items pending creation.""" + + HEADING = "heading" + LIST_ITEM = "list_item" + + +class _HeadingCreationPayload(BaseModel): + kind: Literal["heading"] = "heading" + level: int + + +class _ListItemCreationPayload(BaseModel): + kind: Literal["list_item"] = "list_item" + + +_CreationPayload = Annotated[ + Union[ + _HeadingCreationPayload, + _ListItemCreationPayload, + ], + Field(discriminator="kind"), +] + + class MarkdownDocumentBackend(DeclarativeDocumentBackend): def _shorten_underscore_sequences(self, 
markdown_text: str, max_length: int = 10): # This regex will match any sequence of underscores @@ -155,6 +182,52 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=table_data) return + def _create_list_item( + self, + doc: DoclingDocument, + parent_item: Optional[NodeItem], + text: str, + formatting: Optional[Formatting] = None, + hyperlink: Optional[Union[AnyUrl, Path]] = None, + ): + if not isinstance(parent_item, (OrderedList, UnorderedList)): + _log.warning("ListItem would have not had a list parent, adding one.") + parent_item = doc.add_unordered_list(parent=parent_item) + item = doc.add_list_item( + text=text, + enumerated=(isinstance(parent_item, OrderedList)), + parent=parent_item, + formatting=formatting, + hyperlink=hyperlink, + ) + return item + + def _create_heading_item( + self, + doc: DoclingDocument, + parent_item: Optional[NodeItem], + text: str, + level: int, + formatting: Optional[Formatting] = None, + hyperlink: Optional[Union[AnyUrl, Path]] = None, + ): + if level == 1: + item = doc.add_title( + text=text, + parent=parent_item, + formatting=formatting, + hyperlink=hyperlink, + ) + else: + item = doc.add_heading( + text=text, + level=level - 1, + parent=parent_item, + formatting=formatting, + hyperlink=hyperlink, + ) + return item + def _iterate_elements( # noqa: C901 self, *, @@ -162,6 +235,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): depth: int, doc: DoclingDocument, visited: Set[marko.element.Element], + creation_stack: list[ + _CreationPayload + ], # stack for lazy item creation triggered deep in marko's AST (on RawText) parent_item: Optional[NodeItem] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -177,28 +253,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore ) - if len(element.children) == 1: - child = element.children[0] - 
snippet_text = str(child.children) # type: ignore - visited.add(child) - else: - snippet_text = "" # inline group will be created - - if element.level == 1: - parent_item = doc.add_title( - text=snippet_text, - parent=parent_item, + if len(element.children) > 1: # inline group will be created further down + parent_item = self._create_heading_item( + doc=doc, + parent_item=parent_item, + text="", + level=element.level, formatting=formatting, hyperlink=hyperlink, ) else: - parent_item = doc.add_heading( - text=snippet_text, - level=element.level - 1, - parent=parent_item, - formatting=formatting, - hyperlink=hyperlink, - ) + creation_stack.append(_HeadingCreationPayload(level=element.level)) elif isinstance(element, marko.block.List): has_non_empty_list_items = False @@ -224,22 +289,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self._close_table(doc) _log.debug(" - List item") - if len(child.children) == 1: - snippet_text = str(child.children[0].children) # type: ignore - visited.add(child) + if len(child.children) > 1: # inline group will be created further down + parent_item = self._create_list_item( + doc=doc, + parent_item=parent_item, + text="", + formatting=formatting, + hyperlink=hyperlink, + ) else: - snippet_text = "" # inline group will be created - is_numbered = isinstance(parent_item, OrderedList) - if not isinstance(parent_item, (OrderedList, UnorderedList)): - _log.warning("ListItem would have not had a list parent, adding one.") - parent_item = doc.add_unordered_list(parent=parent_item) - parent_item = doc.add_list_item( - enumerated=is_numbered, - parent=parent_item, - text=snippet_text, - formatting=formatting, - hyperlink=hyperlink, - ) + creation_stack.append(_ListItemCreationPayload()) elif isinstance(element, marko.inline.Image): self._close_table(doc) @@ -285,13 +344,38 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self.md_table_buffer.append(snippet_text) elif snippet_text: self._close_table(doc) - 
doc.add_text( - label=DocItemLabel.TEXT, - parent=parent_item, - text=snippet_text, - formatting=formatting, - hyperlink=hyperlink, - ) + + if creation_stack: + while creation_stack: + to_create = creation_stack.pop() + if isinstance(to_create, _ListItemCreationPayload): + parent_item = self._create_list_item( + doc=doc, + parent_item=parent_item, + text=snippet_text, + formatting=formatting, + hyperlink=hyperlink, + ) + elif isinstance(to_create, _HeadingCreationPayload): + # not keeping as parent_item, as the logic for correctly tracking + # that is not implemented yet (section components are not captured + # as heading children in marko) + self._create_heading_item( + doc=doc, + parent_item=parent_item, + text=snippet_text, + level=to_create.level, + formatting=formatting, + hyperlink=hyperlink, + ) + else: + doc.add_text( + label=DocItemLabel.TEXT, + parent=parent_item, + text=snippet_text, + formatting=formatting, + hyperlink=hyperlink, + ) elif isinstance(element, marko.inline.CodeSpan): self._close_table(doc) @@ -353,7 +437,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): parent_item = doc.add_inline_group(parent=parent_item) processed_block_types = ( - # marko.block.Heading, marko.block.CodeBlock, marko.block.FencedCode, marko.inline.RawText, @@ -369,6 +452,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): depth=depth + 1, doc=doc, visited=visited, + creation_stack=creation_stack, parent_item=parent_item, formatting=formatting, hyperlink=hyperlink, @@ -405,6 +489,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # Parse the markdown into an abstract syntax tree (AST) marko_parser = Markdown() parsed_ast = marko_parser.parse(self.markdown) + _log.debug("parsed_ast=%r", parsed_ast) # Start iterating from the root of the AST self._iterate_elements( element=parsed_ast, @@ -412,6 +497,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc=doc, parent_item=None, visited=set(), + creation_stack=[], ) self._close_table(doc=doc) 
# handle any last hanging table diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md index 98dc0040..10abe23d 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md @@ -11,10 +11,10 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` . 3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` ) 4. Push to the branch ( `git push origin feature/AmazingFeature` ) 5. Open a Pull Request -6. [<RawText children='Whole list item has same formatting'>] +6. **Whole list item has same formatting** 7. List item has *mixed or partial* formatting -# [<RawText children='Whole heading is italic'>] +*# Whole heading is italic* Bar diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml index 95c86a96..a7a49d70 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml @@ -424,14 +424,19 @@ texts: - children: [] content_layer: body enumerated: true + formatting: + bold: true + italic: false + strikethrough: false + underline: false label: list_item marker: '-' - orig: '[]' + orig: Whole list item has same formatting parent: $ref: '#/groups/2' prov: [] self_ref: '#/texts/27' - text: '[]' + text: Whole list item has same formatting - children: - $ref: '#/groups/7' content_layer: body @@ -478,13 +483,18 @@ texts: text: formatting - children: [] content_layer: body + formatting: + bold: false + italic: true + strikethrough: false + underline: false label: title - orig: '[]' + orig: Whole heading is italic parent: $ref: '#/body' prov: [] self_ref: '#/texts/32' - text: '[]' + text: Whole heading is italic - children: [] content_layer: body label: text