mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Drafting Markdown backend via Marko library
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
d5460e2d1f
commit
5986213cfe
175
docling/backend/md_backend.py
Normal file
175
docling/backend/md_backend.py
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
import logging
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Set, Union
|
||||||
|
|
||||||
|
from docling_core.types.doc import (
|
||||||
|
BoundingBox,
|
||||||
|
CoordOrigin,
|
||||||
|
DocItemLabel,
|
||||||
|
DoclingDocument,
|
||||||
|
DocumentOrigin,
|
||||||
|
GroupLabel,
|
||||||
|
ProvenanceItem,
|
||||||
|
Size,
|
||||||
|
TableCell,
|
||||||
|
TableData,
|
||||||
|
)
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import (
|
||||||
|
DeclarativeDocumentBackend,
|
||||||
|
PaginatedDocumentBackend,
|
||||||
|
)
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
import marko
|
||||||
|
from marko.block import Heading, List, ListItem, Paragraph, BlockQuote, FencedCode, Table, TableRow, TableCell
|
||||||
|
from marko.inline import Image, Link, Emphasis, Strong
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownToDoclingRenderer(marko.Renderer):
    """Marko renderer that turns a parsed Markdown tree into plain Python data.

    Every ``render_<element>`` method returns a ``dict`` whose ``"type"`` key
    names the node kind; the remaining keys carry the node's attributes and
    its rendered children. Elements without a dedicated method fall back to
    :meth:`render_children`.
    """

    def render_heading(self, element: Heading):
        """Return a ``heading`` node with its level and rendered content."""
        return {
            "type": "heading",
            "level": element.level,
            "content": self.render_children(element),
        }

    def render_paragraph(self, element: Paragraph):
        """Return a ``paragraph`` node with its rendered content."""
        return {
            "type": "paragraph",
            "content": self.render_children(element),
        }

    def render_list(self, element: List):
        """Return a ``list`` node; ``items`` holds one rendered entry per child."""
        return {
            "type": "list",
            "ordered": element.ordered,
            "items": [self.render(child) for child in element.children],
        }

    def render_list_item(self, element: ListItem):
        """Return a ``list_item`` node with its rendered content."""
        return {
            "type": "list_item",
            "content": self.render_children(element),
        }

    def render_image(self, element: Image):
        """Return an ``image`` node with alt text, optional title and URL."""
        return {
            "type": "image",
            # The alt text of ``![alt](url "title")`` is carried by the
            # element's children; ``element.title`` is the separate,
            # optional title and is reported under its own key.
            "alt": self.render_children(element),
            "title": element.title,
            "url": element.dest,
        }

    def render_table(self, element: Table):
        """Return a ``table`` node; ``rows`` holds one rendered entry per row."""
        return {
            "type": "table",
            "rows": [self.render(row) for row in element.children],
        }

    def render_table_row(self, element: TableRow):
        """Return a ``table_row`` node; ``cells`` holds the rendered cells."""
        return {
            "type": "table_row",
            "cells": [self.render(cell) for cell in element.children],
        }

    def render_table_cell(self, element: TableCell):
        """Return a ``table_cell`` node with its rendered content."""
        return {
            "type": "table_cell",
            "content": self.render_children(element),
        }

    def render_children(self, element):
        """Render the children of ``element``.

        Returns a single string when every child renders to text (the same
        result a plain string join would give), and a list of rendered
        children as soon as one of them is structured. Joining with
        ``"".join`` would raise ``TypeError`` on the dicts produced by the
        ``render_*`` methods of this class.

        An element without a ``children`` attribute renders as ``""``.
        """
        children = getattr(element, "children", None)
        if children is None:
            return ""
        if isinstance(children, str):
            # Leaf text: the children attribute is the text itself.
            return children
        rendered = [self.render(child) for child in children]
        if all(isinstance(item, str) for item in rendered):
            return "".join(rendered)
        return rendered

    def render(self, element):
        """Render ``element``; plain strings are returned unchanged."""
        if isinstance(element, str):
            return element
        return super().render(element)
|
||||||
|
|
||||||
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    """Document backend that loads Markdown text and parses it with Marko."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Read the Markdown source as UTF-8 text.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: An in-memory byte stream or a filesystem path
                holding the Markdown source.

        Raises:
            RuntimeError: If the source cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        # Markdown file:
        self.path_or_stream = path_or_stream
        self.markdown = ""
        # Only flipped to True once the Markdown text was actually loaded.
        self.valid = False

        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.markdown = self.path_or_stream.getvalue().decode("utf-8")
                self.valid = True
            elif isinstance(self.path_or_stream, Path):
                self.markdown = self.path_or_stream.read_text(encoding="utf-8")
                self.valid = True
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize MD backend for file with hash {self.document_hash}."
            ) from e

    def page_count(self) -> int:
        """Return 0; this backend does not report pages."""
        return 0

    def is_valid(self) -> bool:
        """Return whether the Markdown source was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False; this backend does not paginate."""
        return False

    def unload(self):
        """Close the input stream (if any) and drop the reference to the source."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.MD}

    def convert(self) -> DoclingDocument:
        """Parse the loaded Markdown and return a document object.

        Returns:
            A ``DoclingDocument`` named after the source file stem, or
            ``"file"`` when no path is available.

        Raises:
            RuntimeError: If the backend failed to initialize.
        """
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert md with {self.document_hash} because the backend failed to init."
            )

        _log.debug("Trying to convert Markdown...")

        # Parse and render
        parser = marko.Markdown(renderer=MarkdownToDoclingRenderer)
        parsed_object = parser.parse(self.markdown)
        # Render the parsed Markdown into a structured object
        markdown_object = parser.render(parsed_object)
        _log.debug("Rendered Markdown structure: %s", markdown_object)

        name = "file"
        if isinstance(self.path_or_stream, Path):
            name = self.path_or_stream.stem or name

        # NOTE(review): assumes DoclingDocument only requires ``name`` - confirm
        # against docling-core.
        doc = DoclingDocument(name=name)
        # TODO: walk ``markdown_object`` and add its headings, paragraphs,
        # lists, images and tables to ``doc``; for now the document is empty.
        return doc
|
@ -30,6 +30,7 @@ class InputFormat(str, Enum):
|
|||||||
HTML = "html"
|
HTML = "html"
|
||||||
IMAGE = "image"
|
IMAGE = "image"
|
||||||
PDF = "pdf"
|
PDF = "pdf"
|
||||||
|
MD = "md"
|
||||||
|
|
||||||
|
|
||||||
class OutputFormat(str, Enum):
|
class OutputFormat(str, Enum):
|
||||||
@ -43,6 +44,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
||||||
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
||||||
InputFormat.PDF: ["pdf"],
|
InputFormat.PDF: ["pdf"],
|
||||||
|
InputFormat.MD: ["md"],
|
||||||
InputFormat.HTML: ["html", "htm", "xhtml"],
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
||||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||||
}
|
}
|
||||||
@ -66,6 +68,7 @@ FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
|||||||
"image/bmp",
|
"image/bmp",
|
||||||
},
|
},
|
||||||
InputFormat.PDF: {"application/pdf"},
|
InputFormat.PDF: {"application/pdf"},
|
||||||
|
InputFormat.MD: {"text/markdown", "text/x-markdown"},
|
||||||
}
|
}
|
||||||
MimeTypeToFormat = {
|
MimeTypeToFormat = {
|
||||||
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
||||||
|
@ -12,6 +12,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
@ -52,6 +53,11 @@ class PowerpointFormatOption(FormatOption):
|
|||||||
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownFormatOption(FormatOption):
|
||||||
|
pipeline_cls: Type = SimplePipeline
|
||||||
|
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
class HTMLFormatOption(FormatOption):
|
class HTMLFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = SimplePipeline
|
pipeline_cls: Type = SimplePipeline
|
||||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||||
@ -74,6 +80,9 @@ _format_to_default_options = {
|
|||||||
InputFormat.PPTX: FormatOption(
|
InputFormat.PPTX: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
||||||
),
|
),
|
||||||
|
InputFormat.MD: FormatOption(
|
||||||
|
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
||||||
|
),
|
||||||
InputFormat.HTML: FormatOption(
|
InputFormat.HTML: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||||
),
|
),
|
||||||
|
@ -19,6 +19,7 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
input_paths = [
|
input_paths = [
|
||||||
|
Path("README.md"),
|
||||||
Path("tests/data/wiki_duck.html"),
|
Path("tests/data/wiki_duck.html"),
|
||||||
Path("tests/data/word_sample.docx"),
|
Path("tests/data/word_sample.docx"),
|
||||||
Path("tests/data/lorem_ipsum.docx"),
|
Path("tests/data/lorem_ipsum.docx"),
|
||||||
|
1134
poetry.lock
generated
1134
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -57,6 +57,7 @@ python-docx = "^1.1.2"
|
|||||||
python-pptx = "^1.0.2"
|
python-pptx = "^1.0.2"
|
||||||
beautifulsoup4 = "^4.12.3"
|
beautifulsoup4 = "^4.12.3"
|
||||||
pandas = "^2.1.4"
|
pandas = "^2.1.4"
|
||||||
|
marko = "^2.1.2"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||||
|
Loading…
Reference in New Issue
Block a user