work in progress on MD backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-08-02 07:22:14 +00:00 · 2024-10-18 14:39:22 +02:00 · 2024-10-18 14:39:22 +02:00 · 1df89f79ff
commit 1df89f79ff
parent 5986213cfe
3 changed files with 120 additions and 106 deletions
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@ -24,107 +24,23 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 import marko
-from marko.block import Heading, List, ListItem, Paragraph, BlockQuote, FencedCode, Table, TableRow, TableCell
+from marko.ext.gfm import gfm  # GitHub Flavored Markdown plugin (tables, task lists, etc.)
-from marko.inline import Image, Link, Emphasis, Strong
+from marko.block import BlockElement
 from marko.inline import InlineElement
 _log = logging.getLogger(__name__)
-class MarkdownToDoclingRenderer(marko.Renderer):
+class MarkdownDocumentBackend(DeclarativeDocumentBackend):
    """
    # This is text analog of object based methods...
    def render_heading(self, element: Heading):
        return f"{'#' * element.level} {self.render_children(element)}\n\n"
    def render_list(self, element: List):
        if element.ordered:
            return ''.join(f"{i+1}. {self.render(child)}\n" for i, child in enumerate(element.children))
        else:
            return ''.join(f"* {self.render(child)}\n" for child in element.children)
    def render_list_item(self, element: ListItem):
        return self.render_children(element)
    def render_paragraph(self, element: Paragraph):
        return f"{self.render_children(element)}\n\n"
    def render_image(self, element: Image):
        return f"![{element.title}]({element.dest})\n\n"
    def render_table(self, element: Table):
        rows = [self.render(child) for child in element.children]
        return '\n'.join(rows) + '\n'
    def render_table_row(self, element: TableRow):
        cells = ' | '.join(self.render(cell) for cell in element.children)
        return f"| {cells} |"
    def render_table_cell(self, element: TableCell):
        return self.render_children(element)
    """
    def render_heading(self, element: Heading):
        return {
            "type": "heading",
            "level": element.level,
            "content": self.render_children(element),
        }
    def render_paragraph(self, element: Paragraph):
        return {
            "type": "paragraph",
            "content": self.render_children(element),
        }
    def render_list(self, element: List):
        return {
            "type": "list",
            "ordered": element.ordered,
            "items": [self.render(child) for child in element.children]
        }
    def render_list_item(self, element: ListItem):
        return {
            "type": "list_item",
            "content": self.render_children(element),
        }
    def render_image(self, element: Image):
        return {
            "type": "image",
            "alt": element.title,
            "url": element.dest,
        }
    def render_table(self, element: Table):
        return {
            "type": "table",
            "rows": [self.render(row) for row in element.children]
        }
    def render_table_row(self, element: TableRow):
        return {
            "type": "table_row",
            "cells": [self.render(cell) for cell in element.children]
        }
    def render_table_cell(self, element: TableCell):
        return {
            "type": "table_cell",
            "content": self.render_children(element)
        }
    def render(self, element):
        if isinstance(element, str):
            return element
        return super().render(element)
 class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)
        _log.info("MD INIT!!!")
        # Markdown file:
        self.path_or_stream = path_or_stream
-
+        self.valid = True
-        self.valid = False
+        self.markdown = ""  # To store original Markdown string
        try:
            if isinstance(self.path_or_stream, BytesIO):
@ -134,40 +50,78 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacke
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    md_content = f.read()
                    self.markdown = md_content
            self.valid = True
            _log.info(self.markdown)
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize MD backend for file with hash {self.document_hash}."
            ) from e
        return
-    def page_count(self) -> int:
+    # Function to iterate over all elements in the AST
-        return 0
+    def iterate_elements(self, element, depth=0):
        # Print the element type and optionally its content
        print(f"{'  ' * depth}- {type(element).__name__}", end="")
        if isinstance(element, BlockElement):
            print(" (Block Element)")
        elif isinstance(element, InlineElement):
            print(" (Inline Element)")
        # Check for different element types and print relevant details
        if isinstance(element, marko.block.Heading):
            print(f" - Heading level {element.level}, content: {element.children[0].children}")
        elif isinstance(element, marko.block.List):
            print(f" - List {'ordered' if element.ordered else 'unordered'}")
        elif isinstance(element, marko.block.ListItem):
            print(" - List item")
        elif isinstance(element, marko.block.Paragraph):
            print(f" - Paragraph: {element.children[0].children}")
        elif isinstance(element, marko.inline.Image):
            print(f" - Image with alt: {element.title}, url: {element.dest}")
        # elif isinstance(element, marko.block.Table):
        # 
            print(" - Table")
        # Iterate through the element's children (if any)
        if hasattr(element, 'children'):
            for child in element.children:
                self.iterate_elements(child, depth + 1)
    def is_valid(self) -> bool:
        """Return the ``valid`` flag stored on this backend instance."""
        valid_flag = self.valid
        return valid_flag
    @classmethod
    def supports_pagination(cls) -> bool:
        """Report whether this backend exposes pages; it currently does not.

        This is the single definition of the method: the class previously
        defined it twice with the same result, the later one silently
        shadowing the earlier.
        """
        # TODO: return True once it is decided how Markdown maps onto pages.
        return False

    def unload(self):
        """Release the input held by this backend.

        Closes the stream when the input is an in-memory ``BytesIO`` and then
        drops the reference, so ``path_or_stream`` is ``None`` afterwards.
        """
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None
    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the input formats this backend accepts: only ``InputFormat.MD``."""
        formats: Set[InputFormat] = set()
        formats.add(InputFormat.MD)
        return formats
    def convert(self) -> DoclingDocument:
-            # Parse and render
+        print("converting Markdown...")
-            parser = marko.Markdown(renderer=MarkdownToDoclingRenderer)
+        doc = DoclingDocument(name="Test")
-            parsed_object = parser.parse(markdown_text)
+        doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
-            # Render the parsed Markdown into a structured object
+
-            markdown_object = parser.render(parsed_object)
+        if self.is_valid():
            # Parse the markdown into an abstract syntax tree (AST)
            parser = marko.Markdown(extensions=['gfm'])
            parsed_ast = parser.parse(self.markdown)
            # Start iterating from the root of the AST
            self.iterate_elements(parsed_ast)
            print(marko_doc)
            # doc = self.walk(self.soup.body, doc)
        else:
            raise RuntimeError(
                f"Cannot convert md with {self.document_hash} because the backend failed to init."
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -496,6 +496,8 @@ class _DocumentConversionInput(BaseModel):
        if mime is None:
            mime = self._detect_html_xhtml(content)
        if mime is None:
            mime = "text/markdown"
        format = MimeTypeToFormat.get(mime)
        return format
--- a/docs/examples/run_md.py
+++ b/docs/examples/run_md.py
@ -0,0 +1,58 @@
 import json
 import logging
 from pathlib import Path
 import yaml
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
 )
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 import os
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
 def main():
    """Convert each input Markdown file and export it to ``scratch/``.

    For every path in ``input_paths`` the file is run through
    ``MarkdownDocumentBackend.convert()`` and the resulting document is
    written as ``<name>.md``, ``<name>.json`` and ``<name>.yaml`` inside the
    ``scratch`` directory, which is created if it does not exist.
    """
    input_paths = [
        Path("README.md"),
    ]
    out_path = Path("scratch")
    # The output directory is not guaranteed to exist; without this the first
    # open() below fails with FileNotFoundError.
    out_path.mkdir(parents=True, exist_ok=True)

    for path in input_paths:
        in_doc = InputDocument(
            path_or_stream=path,
            # The input is Markdown, so declare it as such rather than PDF.
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        )
        mdb = MarkdownDocumentBackend(in_doc=in_doc, path_or_stream=path)
        document = mdb.convert()

        # Export the Docling document to Markdown, JSON and YAML.
        fn = path.name
        doc_dict = document.export_to_dict()  # computed once, used twice
        with (out_path / f"{fn}.md").open("w", encoding="utf-8") as fp:
            fp.write(document.export_to_markdown())
        with (out_path / f"{fn}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc_dict))
        with (out_path / f"{fn}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(doc_dict))

        # Report only after the files have actually been written.
        print(
            f"Document {path} converted."
            f"\nSaved markdown output to: {str(out_path)}"
        )
 if __name__ == "__main__":
    main()