diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index a4b67aa9..e4244f87 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -24,107 +24,23 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument import marko -from marko.block import Heading, List, ListItem, Paragraph, BlockQuote, FencedCode, Table, TableRow, TableCell -from marko.inline import Image, Link, Emphasis, Strong +from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.) +from marko.block import BlockElement +from marko.inline import InlineElement _log = logging.getLogger(__name__) -class MarkdownToDoclingRenderer(marko.Renderer): - """ - # This is text analog of object based methods... - def render_heading(self, element: Heading): - return f"{'#' * element.level} {self.render_children(element)}\n\n" - - def render_list(self, element: List): - if element.ordered: - return ''.join(f"{i+1}. {self.render(child)}\n" for i, child in enumerate(element.children)) - else: - return ''.join(f"* {self.render(child)}\n" for child in element.children) - - def render_list_item(self, element: ListItem): - return self.render_children(element) - - def render_paragraph(self, element: Paragraph): - return f"{self.render_children(element)}\n\n" - - def render_image(self, element: Image): - return f"![{element.title}]({element.dest})\n\n" - - def render_table(self, element: Table): - rows = [self.render(child) for child in element.children] - return '\n'.join(rows) + '\n' - - def render_table_row(self, element: TableRow): - cells = ' | '.join(self.render(cell) for cell in element.children) - return f"| {cells} |" - - def render_table_cell(self, element: TableCell): - return self.render_children(element) - """ - def render_heading(self, element: Heading): - return { - "type": "heading", - "level": element.level, - "content": self.render_children(element), - } - - def render_paragraph(self, element: Paragraph): - return { - "type": "paragraph", - "content": self.render_children(element), - } - - def render_list(self, element: List): - return { - "type": "list", - "ordered": element.ordered, - "items": [self.render(child) for child in element.children] - } - - def render_list_item(self, element: ListItem): - return { - "type": "list_item", - "content": self.render_children(element), - } - - def render_image(self, element: Image): - return { - "type": "image", - "alt": element.title, - "url": element.dest, - } - - def render_table(self, element: Table): - return { - "type": "table", - "rows": [self.render(row) for row in element.children] - } - - def render_table_row(self, element: TableRow): - return { - "type": "table_row", - "cells": [self.render(cell) for cell in element.children] - } - - def render_table_cell(self, element: TableCell): - return { - "type": "table_cell", - "content": self.render_children(element) - } - - def render(self, element): - if isinstance(element, str): - return element - return super().render(element) - -class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend): +class MarkdownDocumentBackend(DeclarativeDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) + + _log.info("MD INIT!!!") + # Markdown file: self.path_or_stream = path_or_stream - - self.valid = False + self.valid = True + self.markdown = "" # To store original Markdown string try: if isinstance(self.path_or_stream, BytesIO): @@ -134,40 +50,78 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacke with open(self.path_or_stream, "r", encoding="utf-8") as f: md_content = f.read() self.markdown = md_content + self.valid = True + + _log.info(self.markdown) except Exception as e: raise RuntimeError( f"Could not initialize MD backend for file with hash {self.document_hash}." ) from e return - def page_count(self) -> int: - return 0 + # Function to iterate over all elements in the AST + def iterate_elements(self, element, depth=0): + # Print the element type and optionally its content + print(f"{' ' * depth}- {type(element).__name__}", end="") + + if isinstance(element, BlockElement): + print(" (Block Element)") + elif isinstance(element, InlineElement): + print(" (Inline Element)") + + # Check for different element types and print relevant details + if isinstance(element, marko.block.Heading): + print(f" - Heading level {element.level}, content: {element.children[0].children}") + + elif isinstance(element, marko.block.List): + print(f" - List {'ordered' if element.ordered else 'unordered'}") + + elif isinstance(element, marko.block.ListItem): + print(" - List item") + + elif isinstance(element, marko.block.Paragraph): + print(f" - Paragraph: {element.children[0].children}") + + elif isinstance(element, marko.inline.Image): + print(f" - Image with alt: {element.title}, url: {element.dest}") + + # elif isinstance(element, marko.block.Table): + # + print(" - Table") + + # Iterate through the element's children (if any) + if hasattr(element, 'children'): + for child in element.children: + self.iterate_elements(child, depth + 1) def is_valid(self) -> bool: return self.valid - @classmethod - def supports_pagination(cls) -> bool: - return False - def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() self.path_or_stream = None + @classmethod + def supports_pagination(cls) -> bool: + return False # True? if so, how to handle pages... + @classmethod def supported_formats(cls) -> Set[InputFormat]: return {InputFormat.MD} def convert(self) -> DoclingDocument: - # Parse and render - parser = marko.Markdown(renderer=MarkdownToDoclingRenderer) - parsed_object = parser.parse(markdown_text) - # Render the parsed Markdown into a structured object - markdown_object = parser.render(parsed_object) + print("converting Markdown...") + doc = DoclingDocument(name="Test") + doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion") + + if self.is_valid(): + # Parse the markdown into an abstract syntax tree (AST) + parser = marko.Markdown(extensions=['gfm']) + parsed_ast = parser.parse(self.markdown) + # Start iterating from the root of the AST + self.iterate_elements(parsed_ast) - print(marko_doc) - # doc = self.walk(self.soup.body, doc) else: raise RuntimeError( f"Cannot convert md with {self.document_hash} because the backend failed to init." diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index e688af20..be213bc5 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -496,6 +496,8 @@ class _DocumentConversionInput(BaseModel): if mime is None: mime = self._detect_html_xhtml(content) + if mime is None: + mime = "text/markdown" format = MimeTypeToFormat.get(mime) return format diff --git a/docs/examples/run_md.py b/docs/examples/run_md.py new file mode 100644 index 00000000..126f4ec4 --- /dev/null +++ b/docs/examples/run_md.py @@ -0,0 +1,58 @@ +import json +import logging +from pathlib import Path + +import yaml + +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.document_converter import ( + DocumentConverter, + PdfFormatOption, + WordFormatOption, +) +from docling.pipeline.simple_pipeline import SimplePipeline +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline +import os +from docling.backend.md_backend import MarkdownDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument + +_log = logging.getLogger(__name__) + + +def main(): + input_paths = [ + Path("README.md") + ] + + for path in input_paths: + in_doc = InputDocument( + path_or_stream=path, + format=InputFormat.PDF, + backend=MarkdownDocumentBackend, + ) + mdb = MarkdownDocumentBackend(in_doc = in_doc, path_or_stream = path) + document = mdb.convert() + + out_path = Path("scratch") + print( + f"Document {path} converted." + f"\nSaved markdown output to: {str(out_path)}" + ) + + # Export Docling document format to markdowndoc: + fn = os.path.basename(path) + + with (out_path / f"{fn}.md").open("w") as fp: + fp.write(document.export_to_markdown()) + + with (out_path / f"{fn}.json").open("w") as fp: + fp.write(json.dumps(document.export_to_dict())) + + with (out_path / f"{fn}.yaml").open("w") as fp: + fp.write(yaml.safe_dump(document.export_to_dict())) + + +if __name__ == "__main__": + main()