mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Drafting Markdown backend via Marko library
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
d5460e2d1f
commit
5986213cfe
175
docling/backend/md_backend.py
Normal file
175
docling/backend/md_backend.py
Normal file
@ -0,0 +1,175 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
CoordOrigin,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
ProvenanceItem,
|
||||
Size,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
import marko
|
||||
from marko.block import Heading, List, ListItem, Paragraph, BlockQuote, FencedCode, Table, TableRow, TableCell
|
||||
from marko.inline import Image, Link, Emphasis, Strong
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownToDoclingRenderer(marko.Renderer):
    """Render a marko syntax tree into nested plain-Python structures.

    Elements with a dedicated ``render_*`` method below become dicts tagged
    with a ``"type"`` key (``heading``, ``paragraph``, ``list``,
    ``list_item``, ``image``, ``table``, ``table_row``, ``table_cell``).
    Every other element falls back to :meth:`render_children`.
    """

    def render(self, element):
        """Render ``element``; plain strings are passed through unchanged."""
        if isinstance(element, str):
            return element
        return super().render(element)

    def render_children(self, element):
        """Render the children of ``element``.

        Returns:
            A single ``str`` when every child renders to text; otherwise a
            flat list in which structured children (dicts) are kept as-is
            and runs of adjacent text are merged into one string. Elements
            without children render to ``""``.
        """
        # The stock implementation concatenates the rendered children with
        # "".join(...), which raises TypeError as soon as one child renders
        # to a dict -- hence this override.
        children = getattr(element, "children", None)
        if not children:
            return ""
        if isinstance(children, str):
            # Text-bearing elements keep their text directly in ``children``.
            return children

        pieces = []
        for child in children:
            rendered = self.render(child)
            # A nested fallback may itself return a list; flatten it so the
            # result never nests lists inside lists.
            pieces.extend(rendered if isinstance(rendered, list) else [rendered])

        if all(isinstance(piece, str) for piece in pieces):
            return "".join(pieces)

        merged = []
        for piece in pieces:
            if isinstance(piece, str):
                if not piece:
                    continue  # drop empty text produced by childless elements
                if merged and isinstance(merged[-1], str):
                    merged[-1] += piece
                    continue
            merged.append(piece)
        return merged

    def render_heading(self, element: Heading):
        """Return a ``heading`` dict with the heading level and its content."""
        return {
            "type": "heading",
            "level": element.level,
            "content": self.render_children(element),
        }

    def render_paragraph(self, element: Paragraph):
        """Return a ``paragraph`` dict wrapping the rendered children."""
        return {
            "type": "paragraph",
            "content": self.render_children(element),
        }

    def render_list(self, element: List):
        """Return a ``list`` dict with its ordering flag and rendered items."""
        return {
            "type": "list",
            "ordered": element.ordered,
            "items": [self.render(child) for child in element.children],
        }

    def render_list_item(self, element: ListItem):
        """Return a ``list_item`` dict wrapping the rendered children."""
        return {
            "type": "list_item",
            "content": self.render_children(element),
        }

    def render_image(self, element: Image):
        """Return an ``image`` dict with alt text, optional title and URL."""
        return {
            "type": "image",
            # The alt text is carried by the image's children; ``title`` is
            # the separate, optional quoted title of the image link.
            "alt": self.render_children(element),
            "title": element.title,
            "url": element.dest,
        }

    def render_table(self, element: Table):
        """Return a ``table`` dict holding the rendered rows."""
        return {
            "type": "table",
            "rows": [self.render(row) for row in element.children],
        }

    def render_table_row(self, element: TableRow):
        """Return a ``table_row`` dict holding the rendered cells."""
        return {
            "type": "table_row",
            "cells": [self.render(cell) for cell in element.children],
        }

    def render_table_cell(self, element: TableCell):
        """Return a ``table_cell`` dict wrapping the rendered children."""
        return {
            "type": "table_cell",
            "content": self.render_children(element),
        }
|
||||
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    """Document backend that reads Markdown text and parses it with marko.

    The Markdown source is loaded eagerly in ``__init__`` from either an
    in-memory ``BytesIO`` stream or a file ``Path`` (always decoded as
    UTF-8) and kept in ``self.markdown``.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Load the Markdown text from ``path_or_stream``.

        Raises:
            RuntimeError: if the source cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        # Markdown file:
        self.path_or_stream = path_or_stream
        self.markdown: Union[str, None] = None
        self.valid = False

        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.markdown = self.path_or_stream.getvalue().decode("utf-8")
            elif isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    self.markdown = f.read()
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize MD backend for file with hash {self.document_hash}."
            ) from e

        # The backend is usable only once the text has actually been loaded;
        # an unsupported source type leaves it invalid.
        self.valid = self.markdown is not None

    def page_count(self) -> int:
        """Return 0: this backend does not report any pages."""
        return 0

    def is_valid(self) -> bool:
        """Return whether the Markdown source was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not support pagination."""
        return False

    def unload(self):
        """Close an in-memory stream (if any) and drop the source reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.MD}

    def convert(self) -> DoclingDocument:
        """Parse the loaded Markdown and return a ``DoclingDocument``.

        Raises:
            RuntimeError: if the backend failed to initialise.
        """
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert md with {self.document_hash} because the backend failed to init."
            )

        # Parse and render
        parser = marko.Markdown(renderer=MarkdownToDoclingRenderer)
        parsed_object = parser.parse(self.markdown)
        # Render the parsed Markdown into a structured object
        markdown_object = parser.render(parsed_object)
        _log.debug("Rendered Markdown structure: %s", markdown_object)

        if isinstance(self.path_or_stream, Path):
            doc_name = self.path_or_stream.stem or "file"
        else:
            doc_name = "file"
        doc = DoclingDocument(name=doc_name)
        # TODO: map ``markdown_object`` onto ``doc``; for now the returned
        # document carries only its name.
        return doc
|
@ -30,6 +30,7 @@ class InputFormat(str, Enum):
|
||||
HTML = "html"
|
||||
IMAGE = "image"
|
||||
PDF = "pdf"
|
||||
MD = "md"
|
||||
|
||||
|
||||
class OutputFormat(str, Enum):
|
||||
@ -43,6 +44,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
||||
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
||||
InputFormat.PDF: ["pdf"],
|
||||
InputFormat.MD: ["md"],
|
||||
InputFormat.HTML: ["html", "htm", "xhtml"],
|
||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||
}
|
||||
@ -66,6 +68,7 @@ FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
||||
"image/bmp",
|
||||
},
|
||||
InputFormat.PDF: {"application/pdf"},
|
||||
InputFormat.MD: {"text/markdown", "text/x-markdown"},
|
||||
}
|
||||
MimeTypeToFormat = {
|
||||
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
||||
|
@ -12,6 +12,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
@ -52,6 +53,11 @@ class PowerpointFormatOption(FormatOption):
|
||||
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
||||
|
||||
|
||||
class MarkdownFormatOption(FormatOption):
    """Format option pairing ``SimplePipeline`` with the Markdown backend."""

    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
||||
|
||||
|
||||
class HTMLFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
@ -74,6 +80,9 @@ _format_to_default_options = {
|
||||
InputFormat.PPTX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
||||
),
|
||||
InputFormat.MD: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||
),
|
||||
|
@ -19,6 +19,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
def main():
|
||||
input_paths = [
|
||||
Path("README.md"),
|
||||
Path("tests/data/wiki_duck.html"),
|
||||
Path("tests/data/word_sample.docx"),
|
||||
Path("tests/data/lorem_ipsum.docx"),
|
||||
|
1134
poetry.lock
generated
1134
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -57,6 +57,7 @@ python-docx = "^1.1.2"
|
||||
python-pptx = "^1.0.2"
|
||||
beautifulsoup4 = "^4.12.3"
|
||||
pandas = "^2.1.4"
|
||||
marko = "^2.1.2"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||
|
Loading…
Reference in New Issue
Block a user