work in progress on MD backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-10-18 14:39:22 +02:00
parent 5986213cfe
commit 1df89f79ff
3 changed files with 120 additions and 106 deletions

View File

@ -24,107 +24,23 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
import marko import marko
from marko.block import Heading, List, ListItem, Paragraph, BlockQuote, FencedCode, Table, TableRow, TableCell from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.)
from marko.inline import Image, Link, Emphasis, Strong from marko.block import BlockElement
from marko.inline import InlineElement
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class MarkdownToDoclingRenderer(marko.Renderer): class MarkdownDocumentBackend(DeclarativeDocumentBackend):
"""
# This is text analog of object based methods...
def render_heading(self, element: Heading):
return f"{'#' * element.level} {self.render_children(element)}\n\n"
def render_list(self, element: List):
if element.ordered:
return ''.join(f"{i+1}. {self.render(child)}\n" for i, child in enumerate(element.children))
else:
return ''.join(f"* {self.render(child)}\n" for child in element.children)
def render_list_item(self, element: ListItem):
return self.render_children(element)
def render_paragraph(self, element: Paragraph):
return f"{self.render_children(element)}\n\n"
def render_image(self, element: Image):
return f"![{element.title}]({element.dest})\n\n"
def render_table(self, element: Table):
rows = [self.render(child) for child in element.children]
return '\n'.join(rows) + '\n'
def render_table_row(self, element: TableRow):
cells = ' | '.join(self.render(cell) for cell in element.children)
return f"| {cells} |"
def render_table_cell(self, element: TableCell):
return self.render_children(element)
"""
def render_heading(self, element: Heading):
    """Render a heading node as a plain dictionary.

    The result carries the fixed tag ``"heading"``, the heading level copied
    from ``element.level``, and whatever ``self.render_children`` produces
    for the node as its content.
    """
    heading_level = element.level
    rendered_children = self.render_children(element)
    return dict(type="heading", level=heading_level, content=rendered_children)
def render_paragraph(self, element: Paragraph):
    """Render a paragraph node as a dictionary tagged ``"paragraph"``.

    The ``"content"`` entry is the result of ``self.render_children`` for
    the node.
    """
    body = self.render_children(element)
    return dict(type="paragraph", content=body)
def render_list(self, element: List):
    """Render a list node as a dictionary tagged ``"list"``.

    ``"ordered"`` mirrors ``element.ordered`` and ``"items"`` holds the
    result of ``self.render`` for each child, in document order.
    """
    is_ordered = element.ordered
    rendered_items = []
    for child in element.children:
        rendered_items.append(self.render(child))
    return dict(type="list", ordered=is_ordered, items=rendered_items)
def render_list_item(self, element: ListItem):
    """Render a list item as a dictionary tagged ``"list_item"``.

    The ``"content"`` entry is the result of ``self.render_children`` for
    the item.
    """
    item_body = self.render_children(element)
    return dict(type="list_item", content=item_body)
def render_image(self, element: Image):
    """Render an image node as a dictionary tagged ``"image"``.

    Keys of the returned dictionary:
        "alt":   the image's alternative text, rendered from the node's
                 children.
        "title": the optional quoted title from ``element.title``.
        "url":   the image destination from ``element.dest``.

    The previous version stored ``element.title`` under ``"alt"``. In marko
    the alt text of ``![alt](url "title")`` is held in the element's
    children, while ``title`` is the optional quoted title, so the alt text
    was lost and the title was mislabelled.
    """
    # NOTE(review): render_children joins the rendered children as strings,
    # so this assumes the alt text contains no nested image -- confirm.
    alt_text = self.render_children(element)
    return {
        "type": "image",
        "alt": alt_text,
        "title": element.title,
        "url": element.dest,
    }
def render_table(self, element: Table):
    """Render a table node as a dictionary tagged ``"table"``.

    ``"rows"`` holds the result of ``self.render`` for each child of the
    table, in order.
    """
    rendered_rows = []
    for row in element.children:
        rendered_rows.append(self.render(row))
    return dict(type="table", rows=rendered_rows)
def render_table_row(self, element: TableRow):
    """Render a table row as a dictionary tagged ``"table_row"``.

    ``"cells"`` holds the result of ``self.render`` for each child of the
    row, in order.
    """
    rendered_cells = []
    for cell in element.children:
        rendered_cells.append(self.render(cell))
    return dict(type="table_row", cells=rendered_cells)
def render_table_cell(self, element: TableCell):
    """Render a table cell as a dictionary tagged ``"table_cell"``.

    The ``"content"`` entry is the result of ``self.render_children`` for
    the cell.
    """
    cell_body = self.render_children(element)
    return dict(type="table_cell", content=cell_body)
def render(self, element):
    """Render ``element``, passing plain strings through untouched.

    Anything that is not a ``str`` is delegated to the parent class's
    ``render``; a ``str`` is returned exactly as received.
    """
    if not isinstance(element, str):
        return super().render(element)
    # Already text: nothing to dispatch on.
    return element
class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
_log.info("MD INIT!!!")
# Markdown file: # Markdown file:
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
self.valid = True
self.valid = False self.markdown = "" # To store original Markdown string
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
@ -134,40 +50,78 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacke
with open(self.path_or_stream, "r", encoding="utf-8") as f: with open(self.path_or_stream, "r", encoding="utf-8") as f:
md_content = f.read() md_content = f.read()
self.markdown = md_content self.markdown = md_content
self.valid = True
_log.info(self.markdown)
except Exception as e: except Exception as e:
raise RuntimeError( raise RuntimeError(
f"Could not initialize MD backend for file with hash {self.document_hash}." f"Could not initialize MD backend for file with hash {self.document_hash}."
) from e ) from e
return return
def page_count(self) -> int: # Function to iterate over all elements in the AST
return 0 def iterate_elements(self, element, depth=0):
# Print the element type and optionally its content
print(f"{' ' * depth}- {type(element).__name__}", end="")
if isinstance(element, BlockElement):
print(" (Block Element)")
elif isinstance(element, InlineElement):
print(" (Inline Element)")
# Check for different element types and print relevant details
if isinstance(element, marko.block.Heading):
print(f" - Heading level {element.level}, content: {element.children[0].children}")
elif isinstance(element, marko.block.List):
print(f" - List {'ordered' if element.ordered else 'unordered'}")
elif isinstance(element, marko.block.ListItem):
print(" - List item")
elif isinstance(element, marko.block.Paragraph):
print(f" - Paragraph: {element.children[0].children}")
elif isinstance(element, marko.inline.Image):
print(f" - Image with alt: {element.title}, url: {element.dest}")
# elif isinstance(element, marko.block.Table):
#
print(" - Table")
# Iterate through the element's children (if any)
if hasattr(element, 'children'):
for child in element.children:
self.iterate_elements(child, depth + 1)
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
@classmethod
def supports_pagination(cls) -> bool:
return False
def unload(self): def unload(self):
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close() self.path_or_stream.close()
self.path_or_stream = None self.path_or_stream = None
@classmethod
def supports_pagination(cls) -> bool:
return False # True? if so, how to handle pages...
@classmethod @classmethod
def supported_formats(cls) -> Set[InputFormat]: def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.MD} return {InputFormat.MD}
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# Parse and render print("converting Markdown...")
parser = marko.Markdown(renderer=MarkdownToDoclingRenderer) doc = DoclingDocument(name="Test")
parsed_object = parser.parse(markdown_text) doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
# Render the parsed Markdown into a structured object
markdown_object = parser.render(parsed_object) if self.is_valid():
# Parse the markdown into an abstract syntax tree (AST)
parser = marko.Markdown(extensions=['gfm'])
parsed_ast = parser.parse(self.markdown)
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast)
print(marko_doc)
# doc = self.walk(self.soup.body, doc)
else: else:
raise RuntimeError( raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init." f"Cannot convert md with {self.document_hash} because the backend failed to init."

View File

@ -496,6 +496,8 @@ class _DocumentConversionInput(BaseModel):
if mime is None: if mime is None:
mime = self._detect_html_xhtml(content) mime = self._detect_html_xhtml(content)
if mime is None:
mime = "text/markdown"
format = MimeTypeToFormat.get(mime) format = MimeTypeToFormat.get(mime)
return format return format

58
docs/examples/run_md.py Normal file
View File

@ -0,0 +1,58 @@
import json
import logging
from pathlib import Path
import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
import os
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
def main():
    """Convert the example Markdown inputs and export each result.

    Every path in the hard-coded input list is converted with
    ``MarkdownDocumentBackend``. The resulting document is written to the
    ``scratch`` directory three times: as Markdown (``<name>.md``), as JSON
    (``<name>.json``) and as YAML (``<name>.yaml``).
    """
    input_paths = [
        Path("README.md"),
    ]

    out_path = Path("scratch")
    # Create the output directory up front; the writes below fail with
    # FileNotFoundError when it does not exist yet.
    out_path.mkdir(parents=True, exist_ok=True)

    for path in input_paths:
        in_doc = InputDocument(
            path_or_stream=path,
            # The input is Markdown and is handled by the Markdown backend,
            # so declare the matching format (this used to say PDF).
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        )
        mdb = MarkdownDocumentBackend(in_doc=in_doc, path_or_stream=path)
        document = mdb.convert()

        # Export the Docling document to Markdown, JSON and YAML.
        fn = path.name
        exported = document.export_to_dict()
        with (out_path / f"{fn}.md").open("w", encoding="utf-8") as fp:
            fp.write(document.export_to_markdown())

        with (out_path / f"{fn}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(exported))

        with (out_path / f"{fn}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(exported))

        # Report only after the files exist, so the message is true.
        print(
            f"Document {path} converted."
            f"\nSaved markdown output to: {str(out_path)}"
        )
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()