mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
feat(markdown): add formatting & improve inline support (#1804)
feat(markdown): support formatting & hyperlinks Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
@@ -1,17 +1,15 @@
|
||||
import logging
|
||||
import re
|
||||
import warnings
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Set, Union
|
||||
|
||||
import marko
|
||||
import marko.element
|
||||
import marko.ext
|
||||
import marko.ext.gfm
|
||||
import marko.inline
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
|
||||
TableData,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||
from marko import Markdown
|
||||
from pydantic import AnyUrl, TypeAdapter
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
self.in_table = False
|
||||
self.md_table_buffer: list[str] = []
|
||||
self.inline_texts: list[str] = []
|
||||
self._html_blocks: int = 0
|
||||
|
||||
try:
|
||||
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc.add_table(data=table_data)
|
||||
return
|
||||
|
||||
def _process_inline_text(
|
||||
self, parent_item: Optional[NodeItem], doc: DoclingDocument
|
||||
):
|
||||
txt = " ".join(self.inline_texts)
|
||||
if len(txt) > 0:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=parent_item,
|
||||
text=txt,
|
||||
)
|
||||
self.inline_texts = []
|
||||
|
||||
def _iterate_elements( # noqa: C901
|
||||
self,
|
||||
*,
|
||||
element: marko.element.Element,
|
||||
depth: int,
|
||||
doc: DoclingDocument,
|
||||
visited: Set[marko.element.Element],
|
||||
parent_item: Optional[NodeItem] = None,
|
||||
formatting: Optional[Formatting] = None,
|
||||
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||
):
|
||||
if element in visited:
|
||||
return
|
||||
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Check for different element types and process relevant details
|
||||
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(
|
||||
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
||||
)
|
||||
if element.level == 1:
|
||||
doc_label = DocItemLabel.TITLE
|
||||
|
||||
if len(element.children) == 1:
|
||||
child = element.children[0]
|
||||
snippet_text = str(child.children) # type: ignore
|
||||
visited.add(child)
|
||||
else:
|
||||
doc_label = DocItemLabel.SECTION_HEADER
|
||||
snippet_text = "" # inline group will be created
|
||||
|
||||
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
||||
# hence we need to traverse the tree to get full text of a header
|
||||
strings: List[str] = []
|
||||
|
||||
# Define a recursive function to traverse the tree
|
||||
def traverse(node: marko.block.BlockElement):
|
||||
# Check if the node has a "children" attribute
|
||||
if hasattr(node, "children"):
|
||||
# If "children" is a list, continue traversal
|
||||
if isinstance(node.children, list):
|
||||
for child in node.children:
|
||||
traverse(child)
|
||||
# If "children" is text, add it to header text
|
||||
elif isinstance(node.children, str):
|
||||
strings.append(node.children)
|
||||
|
||||
traverse(element)
|
||||
snippet_text = "".join(strings)
|
||||
if len(snippet_text) > 0:
|
||||
if doc_label == DocItemLabel.SECTION_HEADER:
|
||||
parent_item = doc.add_heading(
|
||||
text=snippet_text,
|
||||
level=element.level - 1,
|
||||
parent=parent_item,
|
||||
)
|
||||
else:
|
||||
parent_item = doc.add_text(
|
||||
label=doc_label, parent=parent_item, text=snippet_text
|
||||
)
|
||||
if element.level == 1:
|
||||
parent_item = doc.add_title(
|
||||
text=snippet_text,
|
||||
parent=parent_item,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
parent_item = doc.add_heading(
|
||||
text=snippet_text,
|
||||
level=element.level - 1,
|
||||
parent=parent_item,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.List):
|
||||
has_non_empty_list_items = False
|
||||
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
break
|
||||
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||
if has_non_empty_list_items:
|
||||
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
||||
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif (
|
||||
isinstance(element, marko.block.ListItem)
|
||||
and len(element.children) > 0
|
||||
and isinstance((first_child := element.children[0]), marko.block.Paragraph)
|
||||
and len(element.children) == 1
|
||||
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
||||
and len(child.children) > 0
|
||||
):
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(" - List item")
|
||||
|
||||
snippet_text = str(first_child.children[0].children) # type: ignore
|
||||
is_numbered = False
|
||||
if (
|
||||
parent_item is not None
|
||||
and isinstance(parent_item, DocItem)
|
||||
and parent_item.label == GroupLabel.ORDERED_LIST
|
||||
):
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
enumerated=is_numbered, parent=parent_item, text=snippet_text
|
||||
if len(child.children) == 1:
|
||||
snippet_text = str(child.children[0].children) # type: ignore
|
||||
visited.add(child)
|
||||
else:
|
||||
snippet_text = "" # inline group will be created
|
||||
is_numbered = isinstance(parent_item, OrderedList)
|
||||
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
||||
_log.warning("ListItem would have not had a list parent, adding one.")
|
||||
parent_item = doc.add_unordered_list(parent=parent_item)
|
||||
parent_item = doc.add_list_item(
|
||||
enumerated=is_numbered,
|
||||
parent=parent_item,
|
||||
text=snippet_text,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
visited.add(first_child)
|
||||
|
||||
elif isinstance(element, marko.inline.Image):
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||
|
||||
fig_caption: Optional[TextItem] = None
|
||||
if element.title is not None and element.title != "":
|
||||
fig_caption = doc.add_text(
|
||||
label=DocItemLabel.CAPTION, text=element.title
|
||||
label=DocItemLabel.CAPTION,
|
||||
text=element.title,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
doc.add_picture(parent=parent_item, caption=fig_caption)
|
||||
|
||||
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
||||
self._process_inline_text(parent_item, doc)
|
||||
elif isinstance(element, marko.inline.Emphasis):
|
||||
_log.debug(f" - Emphasis: {element.children}")
|
||||
formatting = deepcopy(formatting) if formatting else Formatting()
|
||||
formatting.italic = True
|
||||
|
||||
elif isinstance(element, marko.inline.StrongEmphasis):
|
||||
_log.debug(f" - StrongEmphasis: {element.children}")
|
||||
formatting = deepcopy(formatting) if formatting else Formatting()
|
||||
formatting.bold = True
|
||||
|
||||
elif isinstance(element, marko.inline.Link):
|
||||
_log.debug(f" - Link: {element.children}")
|
||||
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
|
||||
element.dest
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.RawText):
|
||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
||||
else:
|
||||
self.md_table_buffer.append(snippet_text)
|
||||
else:
|
||||
elif snippet_text:
|
||||
self._close_table(doc)
|
||||
# most likely just inline text
|
||||
self.inline_texts.append(str(element.children))
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=parent_item,
|
||||
text=snippet_text,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.CodeSpan):
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - Code Span: {element.children}")
|
||||
snippet_text = str(element.children).strip()
|
||||
doc.add_code(parent=parent_item, text=snippet_text)
|
||||
doc.add_code(
|
||||
parent=parent_item,
|
||||
text=snippet_text,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif (
|
||||
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
||||
and len(element.children) > 0
|
||||
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
||||
and len(snippet_text := (first_child.children.strip())) > 0
|
||||
and isinstance((child := element.children[0]), marko.inline.RawText)
|
||||
and len(snippet_text := (child.children.strip())) > 0
|
||||
):
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
doc.add_code(parent=parent_item, text=snippet_text)
|
||||
doc.add_code(
|
||||
parent=parent_item,
|
||||
text=snippet_text,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.LineBreak):
|
||||
if self.in_table:
|
||||
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif isinstance(element, marko.block.HTMLBlock):
|
||||
self._html_blocks += 1
|
||||
self._process_inline_text(parent_item, doc)
|
||||
self._close_table(doc)
|
||||
_log.debug(f"HTML Block: {element}")
|
||||
if (
|
||||
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# wrap in markers to enable post-processing in convert()
|
||||
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
||||
doc.add_code(parent=parent_item, text=text_to_add)
|
||||
doc.add_code(
|
||||
parent=parent_item,
|
||||
text=text_to_add,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
if not isinstance(element, str):
|
||||
self._close_table(doc)
|
||||
_log.debug(f"Some other element: {element}")
|
||||
|
||||
if (
|
||||
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
|
||||
and len(element.children) > 1
|
||||
):
|
||||
parent_item = doc.add_inline_group(parent=parent_item)
|
||||
|
||||
processed_block_types = (
|
||||
marko.block.Heading,
|
||||
# marko.block.Heading,
|
||||
marko.block.CodeBlock,
|
||||
marko.block.FencedCode,
|
||||
marko.inline.RawText,
|
||||
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc=doc,
|
||||
visited=visited,
|
||||
parent_item=parent_item,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent_item=None,
|
||||
visited=set(),
|
||||
)
|
||||
self._process_inline_text(None, doc) # handle last hanging inline text
|
||||
self._close_table(doc=doc) # handle any last hanging table
|
||||
|
||||
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||
|
||||
Reference in New Issue
Block a user