Drafting Markdown backend via Marko library

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-17 16:40:18 +02:00 · 2024-10-17 16:40:18 +02:00 · 5986213cfe
commit 5986213cfe
parent d5460e2d1f
6 changed files with 840 additions and 483 deletions
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@ -0,0 +1,175 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    ProvenanceItem,
+    Size,
+    TableCell,
+    TableData,
+)
+
+from docling.backend.abstract_backend import (
+    DeclarativeDocumentBackend,
+    PaginatedDocumentBackend,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+import marko
+from marko.block import Heading, List, ListItem, Paragraph, BlockQuote, FencedCode, Table, TableRow, TableCell
+from marko.inline import Image, Link, Emphasis, Strong
+
+_log = logging.getLogger(__name__)
+
+
+class MarkdownToDoclingRenderer(marko.Renderer):
+    """Render marko elements into nested plain-Python structures.
+
+    Every ``render_*`` method below returns a ``dict`` carrying a ``"type"``
+    key plus element-specific fields, instead of a Markdown/HTML string.
+    """
+
+    def render_children(self, element):
+        """Render the children of ``element``.
+
+        marko's stock ``render_children`` concatenates the rendered children
+        with ``"".join``, which raises ``TypeError`` as soon as one child
+        renders to a ``dict``. This override keeps the joined string when
+        every child rendered to ``str`` and otherwise returns the rendered
+        children as a list.
+
+        Returns:
+            A ``str`` when all children rendered to text, else a ``list`` of
+            the rendered children in document order.
+        """
+        rendered = [self.render(child) for child in element.children]
+        if all(isinstance(part, str) for part in rendered):
+            return "".join(rendered)
+        return rendered
+
+    def render_heading(self, element: Heading):
+        """Return a ``heading`` dict with the heading level and its content."""
+        return {
+            "type": "heading",
+            "level": element.level,
+            "content": self.render_children(element),
+        }
+
+    def render_paragraph(self, element: Paragraph):
+        """Return a ``paragraph`` dict wrapping the rendered children."""
+        return {
+            "type": "paragraph",
+            "content": self.render_children(element),
+        }
+
+    def render_list(self, element: List):
+        """Return a ``list`` dict with its ordered flag and rendered items."""
+        return {
+            "type": "list",
+            "ordered": element.ordered,
+            "items": [self.render(child) for child in element.children],
+        }
+
+    def render_list_item(self, element: ListItem):
+        """Return a ``list_item`` dict wrapping the rendered children."""
+        return {
+            "type": "list_item",
+            "content": self.render_children(element),
+        }
+
+    def render_image(self, element: Image):
+        """Return an ``image`` dict with the element's title and destination."""
+        # NOTE(review): "alt" is filled from element.title; confirm that this
+        # is the intended field for alternative text.
+        return {
+            "type": "image",
+            "alt": element.title,
+            "url": element.dest,
+        }
+
+    def render_table(self, element: Table):
+        """Return a ``table`` dict holding one rendered entry per child row."""
+        return {
+            "type": "table",
+            "rows": [self.render(row) for row in element.children],
+        }
+
+    def render_table_row(self, element: TableRow):
+        """Return a ``table_row`` dict holding one rendered entry per cell."""
+        return {
+            "type": "table_row",
+            "cells": [self.render(cell) for cell in element.children],
+        }
+
+    def render_table_cell(self, element: TableCell):
+        """Return a ``table_cell`` dict wrapping the rendered children."""
+        return {
+            "type": "table_cell",
+            "content": self.render_children(element),
+        }
+
+    def render(self, element):
+        """Render ``element``; plain strings are passed through unchanged."""
+        if isinstance(element, str):
+            return element
+        return super().render(element)
+
+class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
+    """Backend that loads Markdown text and parses it with marko."""
+
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Read the whole Markdown source into ``self.markdown``.
+
+        Args:
+            in_doc: Input document descriptor, forwarded to the base class.
+            path_or_stream: Either an in-memory ``BytesIO`` holding UTF-8
+                encoded Markdown or a ``Path`` to a UTF-8 text file.
+
+        Raises:
+            RuntimeError: If the source cannot be read or decoded.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # Markdown file:
+        self.path_or_stream = path_or_stream
+
+        # Stays None when the source is neither a BytesIO nor a Path.
+        self.markdown: Union[str, None] = None
+        self.valid = False
+
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.markdown = self.path_or_stream.getvalue().decode("utf-8")
+            elif isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                    self.markdown = f.read()
+        except Exception as e:
+            raise RuntimeError(
+                f"Could not initialize MD backend for file with hash {self.document_hash}."
+            ) from e
+
+        # The backend is usable only once the Markdown text has been loaded.
+        self.valid = self.markdown is not None
+
+    def page_count(self) -> int:
+        """Return 0; this backend reports no pages."""
+        return 0
+
+    def is_valid(self) -> bool:
+        """Return whether the Markdown source was loaded successfully."""
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Return False; this backend does not paginate."""
+        return False
+
+    def unload(self):
+        """Close an in-memory stream, if any, and drop the source reference."""
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled here (Markdown only)."""
+        return {InputFormat.MD}
+
+    def convert(self) -> DoclingDocument:
+        """Parse the loaded Markdown and return a ``DoclingDocument``.
+
+        Returns:
+            A ``DoclingDocument`` named after the source file stem (or
+            ``"file"`` when no path is available). Populating it from the
+            rendered structure is still to be implemented.
+
+        Raises:
+            RuntimeError: If the backend failed to initialize.
+        """
+        if not self.is_valid():
+            raise RuntimeError(
+                f"Cannot convert md with {self.document_hash} because the backend failed to init."
+            )
+
+        if isinstance(self.path_or_stream, Path):
+            doc_name = self.path_or_stream.stem or "file"
+        else:
+            doc_name = "file"
+        doc = DoclingDocument(name=doc_name)
+
+        # Parse and render
+        parser = marko.Markdown(renderer=MarkdownToDoclingRenderer)
+        parsed_object = parser.parse(self.markdown)
+        # Render the parsed Markdown into a structured object
+        markdown_object = parser.render(parsed_object)
+        _log.debug("Rendered Markdown structure: %s", markdown_object)
+
+        # TODO: walk markdown_object and add its content to doc.
+        return doc
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -30,6 +30,7 @@ class InputFormat(str, Enum):
    HTML = "html"
    IMAGE = "image"
    PDF = "pdf"
+    MD = "md"


 class OutputFormat(str, Enum):
@ -43,6 +44,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
+    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
 }
@ -66,6 +68,7 @@ FormatToMimeType: Dict[InputFormat, Set[str]] = {
        "image/bmp",
    },
    InputFormat.PDF: {"application/pdf"},
+    InputFormat.MD: {"text/markdown", "text/x-markdown"},
 }
 MimeTypeToFormat = {
    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -12,6 +12,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
    ConversionResult,
@ -52,6 +53,11 @@ class PowerpointFormatOption(FormatOption):
    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend


+class MarkdownFormatOption(FormatOption):
+    """Format option pairing ``SimplePipeline`` with ``MarkdownDocumentBackend``."""
+
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
+
+
 class HTMLFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
@ -74,6 +80,9 @@ _format_to_default_options = {
    InputFormat.PPTX: FormatOption(
        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
    ),
+    InputFormat.MD: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
+    ),
    InputFormat.HTML: FormatOption(
        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
    ),
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@ -19,6 +19,7 @@ _log = logging.getLogger(__name__)

 def main():
    input_paths = [
+        Path("README.md"),
        Path("tests/data/wiki_duck.html"),
        Path("tests/data/word_sample.docx"),
        Path("tests/data/lorem_ipsum.docx"),
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -57,6 +57,7 @@ python-docx = "^1.1.2"
 python-pptx = "^1.0.2"
 beautifulsoup4 = "^4.12.3"
 pandas = "^2.1.4"
+marko = "^2.1.2"

 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}