Drafting Markdown backend via Marko library

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-10-17 16:40:18 +02:00
parent d5460e2d1f
commit 5986213cfe
6 changed files with 840 additions and 483 deletions

View File

@ -0,0 +1,175 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ProvenanceItem,
Size,
TableCell,
TableData,
)
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
import marko
from marko.block import Heading, List, ListItem, Paragraph, BlockQuote, FencedCode, Table, TableRow, TableCell
from marko.inline import Image, Link, Emphasis, Strong
_log = logging.getLogger(__name__)
class MarkdownToDoclingRenderer(marko.Renderer):
    """Render a Marko syntax tree into nested plain-Python structures.

    Each supported element is turned into a ``dict`` carrying a ``"type"``
    key plus element-specific fields. Elements without a dedicated
    ``render_*`` method fall back to ``render_children``.
    """

    def render_heading(self, element: Heading):
        """Return a ``heading`` node with its level and rendered content."""
        return {
            "type": "heading",
            "level": element.level,
            "content": self.render_children(element),
        }

    def render_paragraph(self, element: Paragraph):
        """Return a ``paragraph`` node with its rendered content."""
        return {
            "type": "paragraph",
            "content": self.render_children(element),
        }

    def render_list(self, element: List):
        """Return a ``list`` node holding one rendered entry per child."""
        return {
            "type": "list",
            "ordered": element.ordered,
            "items": [self.render(child) for child in element.children],
        }

    def render_list_item(self, element: ListItem):
        """Return a ``list_item`` node with its rendered content."""
        return {
            "type": "list_item",
            "content": self.render_children(element),
        }

    def render_image(self, element: Image):
        """Return an ``image`` node with alt text, title and destination."""
        return {
            "type": "image",
            # The alt text is the image's child content; ``title`` is the
            # optional tooltip and is reported under its own key.
            "alt": self.render_children(element),
            "title": element.title,
            "url": element.dest,
        }

    def render_table(self, element: Table):
        """Return a ``table`` node holding one rendered entry per row."""
        return {
            "type": "table",
            "rows": [self.render(row) for row in element.children],
        }

    def render_table_row(self, element: TableRow):
        """Return a ``table_row`` node holding one rendered entry per cell."""
        return {
            "type": "table_row",
            "cells": [self.render(cell) for cell in element.children],
        }

    def render_table_cell(self, element: TableCell):
        """Return a ``table_cell`` node with its rendered content."""
        return {
            "type": "table_cell",
            "content": self.render_children(element),
        }

    def render_children(self, element):
        """Render the children of ``element``.

        Returns a single string when every child renders to text (the same
        result the base renderer produces). When at least one child renders
        to a structured node, the rendered children are returned as a list,
        because structured nodes cannot be string-joined.
        """
        children = element.children
        if isinstance(children, str):
            # Leaf text: the children attribute is the text itself.
            return children
        rendered = [self.render(child) for child in children]
        if all(isinstance(part, str) for part in rendered):
            return "".join(rendered)
        return rendered

    def render(self, element):
        """Render ``element``; plain strings are passed through unchanged."""
        if isinstance(element, str):
            return element
        return super().render(element)
class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    """Document backend that loads Markdown text and parses it with Marko."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Read the Markdown source from a ``BytesIO`` stream or a ``Path``.

        Raises:
            RuntimeError: if reading or decoding the input fails.
        """
        super().__init__(in_doc, path_or_stream)

        # Markdown file:
        self.path_or_stream = path_or_stream
        self.valid = False
        # Always defined, so later attribute access cannot fail even when the
        # input is neither a BytesIO nor a Path.
        self.markdown = ""

        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.markdown = self.path_or_stream.getvalue().decode("utf-8")
                self.valid = True
            elif isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    self.markdown = f.read()
                self.valid = True
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize MD backend for file with hash {self.document_hash}."
            ) from e

    def page_count(self) -> int:
        """Return 0; this backend does not report pages."""
        return 0

    def is_valid(self) -> bool:
        """Return True when the Markdown source was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False; this backend does not support pagination."""
        return False

    def unload(self):
        """Close the underlying stream (if any) and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.MD}

    def convert(self) -> DoclingDocument:
        """Parse the loaded Markdown and return a ``DoclingDocument``.

        Raises:
            RuntimeError: if the backend did not initialize successfully.
        """
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert md with {self.document_hash} because the backend failed to init."
            )

        # Use the file stem as the document name when a path is available;
        # streams (or an already unloaded backend) get a generic name.
        if isinstance(self.path_or_stream, Path):
            doc_name = self.path_or_stream.stem or "file"
        else:
            doc_name = "file"
        # NOTE(review): no DocumentOrigin is attached here — confirm what the
        # other backends pass for filename/mimetype/hash before adding one.
        doc = DoclingDocument(name=doc_name)

        # Parse and render
        parser = marko.Markdown(renderer=MarkdownToDoclingRenderer)
        parsed_object = parser.parse(self.markdown)
        # Render the parsed Markdown into a structured object
        markdown_object = parser.render(parsed_object)
        _log.debug("Rendered Markdown structure: %s", markdown_object)

        # TODO: walk markdown_object and populate doc with its items.
        return doc

View File

@ -30,6 +30,7 @@ class InputFormat(str, Enum):
HTML = "html"
IMAGE = "image"
PDF = "pdf"
MD = "md"
class OutputFormat(str, Enum):
@ -43,6 +44,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
}
@ -66,6 +68,7 @@ FormatToMimeType: Dict[InputFormat, Set[str]] = {
"image/bmp",
},
InputFormat.PDF: {"application/pdf"},
InputFormat.MD: {"text/markdown", "text/x-markdown"},
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes

View File

@ -12,6 +12,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.document import (
ConversionResult,
@ -52,6 +53,11 @@ class PowerpointFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
class MarkdownFormatOption(FormatOption):
    """Format option pairing ``SimplePipeline`` with ``MarkdownDocumentBackend``."""

    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
class HTMLFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
@ -74,6 +80,9 @@ _format_to_default_options = {
InputFormat.PPTX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
),
InputFormat.MD: FormatOption(
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
),
InputFormat.HTML: FormatOption(
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),

View File

@ -19,6 +19,7 @@ _log = logging.getLogger(__name__)
def main():
input_paths = [
Path("README.md"),
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),

1134
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -57,6 +57,7 @@ python-docx = "^1.1.2"
python-pptx = "^1.0.2"
beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}