work in progress on MD backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-10-18 14:39:22 +02:00
parent 5986213cfe
commit 1df89f79ff
3 changed files with 120 additions and 106 deletions

View File

@ -24,107 +24,23 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
import marko
from marko.block import Heading, List, ListItem, Paragraph, BlockQuote, FencedCode, Table, TableRow, TableCell
from marko.inline import Image, Link, Emphasis, Strong
from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.)
from marko.block import BlockElement
from marko.inline import InlineElement
_log = logging.getLogger(__name__)
class MarkdownToDoclingRenderer(marko.Renderer):
    """Render marko AST elements into plain ``dict`` nodes instead of text.

    Each handled element type becomes a dictionary whose ``"type"`` key names
    the node; the remaining keys carry that node's payload (level, content,
    items, rows, ...).

    NOTE(review): ``render_children`` is inherited from ``marko.Renderer``.
    It presumably concatenates the rendered children as strings; confirm that
    it copes with children that render to dicts before relying on nesting.
    """

    @staticmethod
    def _node(kind, **fields):
        """Build a node dict: ``"type"`` first, then *fields* in call order."""
        return {"type": kind, **fields}

    def render_heading(self, element: Heading):
        """Return a ``heading`` node with the heading level and its content."""
        return self._node(
            "heading",
            level=element.level,
            content=self.render_children(element),
        )

    def render_paragraph(self, element: Paragraph):
        """Return a ``paragraph`` node wrapping the rendered children."""
        return self._node("paragraph", content=self.render_children(element))

    def render_list(self, element: List):
        """Return a ``list`` node with its ordering flag and rendered items."""
        return self._node(
            "list",
            ordered=element.ordered,
            items=list(map(self.render, element.children)),
        )

    def render_list_item(self, element: ListItem):
        """Return a ``list_item`` node wrapping the rendered children."""
        return self._node("list_item", content=self.render_children(element))

    def render_image(self, element: Image):
        """Return an ``image`` node built from the element's title and dest."""
        return self._node("image", alt=element.title, url=element.dest)

    def render_table(self, element: Table):
        """Return a ``table`` node holding one rendered entry per child row."""
        return self._node(
            "table",
            rows=list(map(self.render, element.children)),
        )

    def render_table_row(self, element: TableRow):
        """Return a ``table_row`` node holding one rendered entry per cell."""
        return self._node(
            "table_row",
            cells=list(map(self.render, element.children)),
        )

    def render_table_cell(self, element: TableCell):
        """Return a ``table_cell`` node wrapping the rendered children."""
        return self._node("table_cell", content=self.render_children(element))

    def render(self, element):
        """Pass plain strings through; dispatch anything else to the base class."""
        return element if isinstance(element, str) else super().render(element)
class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.info("MD INIT!!!")
# Markdown file:
self.path_or_stream = path_or_stream
self.valid = False
self.valid = True
self.markdown = "" # To store original Markdown string
try:
if isinstance(self.path_or_stream, BytesIO):
@ -134,40 +50,78 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacke
with open(self.path_or_stream, "r", encoding="utf-8") as f:
md_content = f.read()
self.markdown = md_content
self.valid = True
_log.info(self.markdown)
except Exception as e:
raise RuntimeError(
f"Could not initialize MD backend for file with hash {self.document_hash}."
) from e
return
def page_count(self) -> int:
    """Return the number of pages, which this backend always reports as 0."""
    return 0
# Function to iterate over all elements in the AST
def iterate_elements(self, element, depth=0):
    """Print an indented outline of the marko AST rooted at *element*.

    Debugging aid. For every node it prints the node's type name tagged as
    block or inline, then a detail line for headings, lists, list items,
    paragraphs and images, and finally recurses into the node's children.

    Args:
        element: AST node to describe.
        depth: Nesting level of *element*; sets the leading indentation.
    """
    print(f"{' ' * depth}- {type(element).__name__}", end="")
    if isinstance(element, BlockElement):
        print(" (Block Element)")
    elif isinstance(element, InlineElement):
        print(" (Inline Element)")
    else:
        # The header line was opened with end=""; always terminate it.
        print()

    # NOTE(review): leaf nodes such as raw text appear to keep their payload
    # as a plain string in ``children``. Only real child sequences are
    # traversed, so such text is not walked one character at a time.
    children = getattr(element, "children", None)
    child_nodes = children if isinstance(children, (list, tuple)) else ()

    def leading_content():
        # Content of the first child, or None when there is no usable child
        # (e.g. an empty heading), instead of raising on children[0].
        if not child_nodes:
            return None
        return getattr(child_nodes[0], "children", None)

    # Print type-specific details.
    if isinstance(element, marko.block.Heading):
        print(f" - Heading level {element.level}, content: {leading_content()}")
    elif isinstance(element, marko.block.List):
        print(f" - List {'ordered' if element.ordered else 'unordered'}")
    elif isinstance(element, marko.block.ListItem):
        print(" - List item")
    elif isinstance(element, marko.block.Paragraph):
        print(f" - Paragraph: {leading_content()}")
    elif isinstance(element, marko.inline.Image):
        print(f" - Image with alt: {element.title}, url: {element.dest}")
    # TODO: print details for table elements once they are handled.

    for child in child_nodes:
        self.iterate_elements(child, depth + 1)
def is_valid(self) -> bool:
    """Return the backend's ``valid`` flag."""
    return self.valid
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend does not support pagination."""
    return False
def unload(self):
    """Close the input if it is an in-memory stream, then drop the reference."""
    stream = self.path_or_stream
    if isinstance(stream, BytesIO):
        stream.close()
    self.path_or_stream = None
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend does not support pagination."""
    # Open question from the author: should this be True, and if so, how
    # would pages be handled?
    return False
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats this backend handles (Markdown only)."""
    return {InputFormat.MD}
def convert(self) -> DoclingDocument:
# Parse and render
parser = marko.Markdown(renderer=MarkdownToDoclingRenderer)
parsed_object = parser.parse(markdown_text)
# Render the parsed Markdown into a structured object
markdown_object = parser.render(parsed_object)
print("converting Markdown...")
doc = DoclingDocument(name="Test")
doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
if self.is_valid():
# Parse the markdown into an abstract syntax tree (AST)
parser = marko.Markdown(extensions=['gfm'])
parsed_ast = parser.parse(self.markdown)
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast)
print(marko_doc)
# doc = self.walk(self.soup.body, doc)
else:
raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init."

View File

@ -496,6 +496,8 @@ class _DocumentConversionInput(BaseModel):
if mime is None:
mime = self._detect_html_xhtml(content)
if mime is None:
mime = "text/markdown"
format = MimeTypeToFormat.get(mime)
return format

58
docs/examples/run_md.py Normal file
View File

@ -0,0 +1,58 @@
import json
import logging
from pathlib import Path
import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
import os
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
def main():
    """Convert the sample Markdown files and export each result.

    Every input path is wrapped in an ``InputDocument``, converted with
    ``MarkdownDocumentBackend`` and written to the ``scratch`` directory as
    ``<name>.md``, ``<name>.json`` and ``<name>.yaml``.
    """
    input_paths = [Path("README.md")]

    out_path = Path("scratch")
    # Opening the export files fails if the target directory is missing.
    out_path.mkdir(parents=True, exist_ok=True)

    for path in input_paths:
        in_doc = InputDocument(
            path_or_stream=path,
            # The example feeds Markdown to the Markdown backend, so the
            # document must be declared as Markdown, not PDF.
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        )
        mdb = MarkdownDocumentBackend(in_doc=in_doc, path_or_stream=path)
        document = mdb.convert()

        # Export the Docling document as Markdown, JSON and YAML.
        fn = path.name
        doc_dict = document.export_to_dict()

        with (out_path / f"{fn}.md").open("w", encoding="utf-8") as fp:
            fp.write(document.export_to_markdown())

        with (out_path / f"{fn}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc_dict))

        with (out_path / f"{fn}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(doc_dict))

        # Report only after the files have actually been written.
        print(
            f"Document {path} converted."
            f"\nSaved markdown output to: {str(out_path)}"
        )


if __name__ == "__main__":
    main()