feat(markdown): add formatting & improve inline support (#1804)

feat(markdown): support formatting & hyperlinks

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas
2025-06-18 15:57:57 +02:00
committed by GitHub
parent 215b540f6c
commit 861abcdcb0
5 changed files with 722 additions and 88 deletions

View File

@@ -1,17 +1,15 @@
import logging
import re
import warnings
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Set, Union
import marko
import marko.element
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
TableData,
TextItem,
)
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from marko import Markdown
from pydantic import AnyUrl, TypeAdapter
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False
self.md_table_buffer: list[str] = []
self.inline_texts: list[str] = []
self._html_blocks: int = 0
try:
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=table_data)
return
def _process_inline_text(
self, parent_item: Optional[NodeItem], doc: DoclingDocument
):
txt = " ".join(self.inline_texts)
if len(txt) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent_item,
text=txt,
)
self.inline_texts = []
def _iterate_elements( # noqa: C901
self,
*,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
if element in visited:
return
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
if element.level == 1:
doc_label = DocItemLabel.TITLE
if len(element.children) == 1:
child = element.children[0]
snippet_text = str(child.children) # type: ignore
visited.add(child)
else:
doc_label = DocItemLabel.SECTION_HEADER
snippet_text = "" # inline group will be created
# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
strings: List[str] = []
# Define a recursive function to traverse the tree
def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
if isinstance(node.children, list):
for child in node.children:
traverse(child)
# If "children" is text, add it to header text
elif isinstance(node.children, str):
strings.append(node.children)
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
if doc_label == DocItemLabel.SECTION_HEADER:
parent_item = doc.add_heading(
text=snippet_text,
level=element.level - 1,
parent=parent_item,
)
else:
parent_item = doc.add_text(
label=doc_label, parent=parent_item, text=snippet_text
)
if element.level == 1:
parent_item = doc.add_title(
text=snippet_text,
parent=parent_item,
formatting=formatting,
hyperlink=hyperlink,
)
else:
parent_item = doc.add_heading(
text=snippet_text,
level=element.level - 1,
parent=parent_item,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.block.List):
has_non_empty_list_items = False
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
break
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif (
isinstance(element, marko.block.ListItem)
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.block.Paragraph)
and len(element.children) == 1
and isinstance((child := element.children[0]), marko.block.Paragraph)
and len(child.children) > 0
):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(" - List item")
snippet_text = str(first_child.children[0].children) # type: ignore
is_numbered = False
if (
parent_item is not None
and isinstance(parent_item, DocItem)
and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_item, text=snippet_text
if len(child.children) == 1:
snippet_text = str(child.children[0].children) # type: ignore
visited.add(child)
else:
snippet_text = "" # inline group will be created
is_numbered = isinstance(parent_item, OrderedList)
if not isinstance(parent_item, (OrderedList, UnorderedList)):
_log.warning("ListItem would have not had a list parent, adding one.")
parent_item = doc.add_unordered_list(parent=parent_item)
parent_item = doc.add_list_item(
enumerated=is_numbered,
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
visited.add(first_child)
elif isinstance(element, marko.inline.Image):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=element.title
label=DocItemLabel.CAPTION,
text=element.title,
formatting=formatting,
hyperlink=hyperlink,
)
doc.add_picture(parent=parent_item, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
self._process_inline_text(parent_item, doc)
elif isinstance(element, marko.inline.Emphasis):
_log.debug(f" - Emphasis: {element.children}")
formatting = deepcopy(formatting) if formatting else Formatting()
formatting.italic = True
elif isinstance(element, marko.inline.StrongEmphasis):
_log.debug(f" - StrongEmphasis: {element.children}")
formatting = deepcopy(formatting) if formatting else Formatting()
formatting.bold = True
elif isinstance(element, marko.inline.Link):
_log.debug(f" - Link: {element.children}")
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
element.dest
)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
else:
self.md_table_buffer.append(snippet_text)
else:
elif snippet_text:
self._close_table(doc)
# most likely just inline text
self.inline_texts.append(str(element.children))
doc.add_text(
label=DocItemLabel.TEXT,
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.inline.CodeSpan):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_code(parent=parent_item, text=snippet_text)
doc.add_code(
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
and isinstance((child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (child.children.strip())) > 0
):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Block: {element.children}")
doc.add_code(parent=parent_item, text=snippet_text)
doc.add_code(
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.inline.LineBreak):
if self.in_table:
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
self._process_inline_text(parent_item, doc)
self._close_table(doc)
_log.debug(f"HTML Block: {element}")
if (
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_item, text=text_to_add)
doc.add_code(
parent=parent_item,
text=text_to_add,
formatting=formatting,
hyperlink=hyperlink,
)
else:
if not isinstance(element, str):
self._close_table(doc)
_log.debug(f"Some other element: {element}")
if (
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
and len(element.children) > 1
):
parent_item = doc.add_inline_group(parent=parent_item)
processed_block_types = (
marko.block.Heading,
# marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
marko.inline.RawText,
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc=doc,
visited=visited,
parent_item=parent_item,
formatting=formatting,
hyperlink=hyperlink,
)
def is_valid(self) -> bool:
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
parent_item=None,
visited=set(),
)
self._process_inline_text(None, doc) # handle last hanging inline text
self._close_table(doc=doc) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend