mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
work in progress on MD backend
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
5986213cfe
commit
1df89f79ff
@ -24,107 +24,23 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
import marko
|
||||
from marko.block import Heading, List, ListItem, Paragraph, BlockQuote, FencedCode, Table, TableRow, TableCell
|
||||
from marko.inline import Image, Link, Emphasis, Strong
|
||||
from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.)
|
||||
from marko.block import BlockElement
|
||||
from marko.inline import InlineElement
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownToDoclingRenderer(marko.Renderer):
    """Render a marko AST into nested plain-Python structures.

    Each ``render_*`` method below turns one kind of node into a dict tagged
    with a ``"type"`` key; plain strings are passed through unchanged by
    ``render``.
    """

    def render_children(self, element):
        """Render every child of *element*.

        Returns the concatenated text when all children rendered to strings,
        otherwise the list of rendered children, so that dict results coming
        from nested nodes are kept instead of being fed to ``str.join``.

        NOTE(review): the stock marko implementation is expected to always
        ``"".join`` the rendered children - confirm against the installed
        marko version.
        """
        rendered = [self.render(child) for child in element.children]
        if all(isinstance(part, str) for part in rendered):
            return "".join(rendered)
        return rendered

    def render_heading(self, element: Heading):
        """Describe a heading by its level and rendered content."""
        return {
            "type": "heading",
            "level": element.level,
            "content": self.render_children(element),
        }

    def render_paragraph(self, element: Paragraph):
        """Describe a paragraph by its rendered content."""
        return {
            "type": "paragraph",
            "content": self.render_children(element),
        }

    def render_list(self, element: List):
        """Describe a list by its ordering flag and rendered items."""
        return {
            "type": "list",
            "ordered": element.ordered,
            "items": [self.render(child) for child in element.children],
        }

    def render_list_item(self, element: ListItem):
        """Describe a single list item by its rendered content."""
        return {
            "type": "list_item",
            "content": self.render_children(element),
        }

    def render_image(self, element: Image):
        """Describe an image by its alt text, optional title and target URL."""
        return {
            "type": "image",
            # The alt text is the image's inline content; ``title`` is the
            # separate, optional title and must not be reported as the alt.
            # NOTE(review): based on marko's Image element - confirm.
            "alt": self.render_children(element),
            "title": element.title,
            "url": element.dest,
        }

    def render_table(self, element: Table):
        """Describe a table as a list of rendered rows."""
        return {
            "type": "table",
            "rows": [self.render(row) for row in element.children],
        }

    def render_table_row(self, element: TableRow):
        """Describe a table row as a list of rendered cells."""
        return {
            "type": "table_row",
            "cells": [self.render(cell) for cell in element.children],
        }

    def render_table_cell(self, element: TableCell):
        """Describe a table cell by its rendered content."""
        return {
            "type": "table_cell",
            "content": self.render_children(element),
        }

    def render(self, element):
        """Render *element*, returning plain strings unchanged."""
        if isinstance(element, str):
            return element
        return super().render(element)
|
||||
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
_log.info("MD INIT!!!")
|
||||
|
||||
# Markdown file:
|
||||
self.path_or_stream = path_or_stream
|
||||
|
||||
self.valid = False
|
||||
self.valid = True
|
||||
self.markdown = "" # To store original Markdown string
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
@ -134,40 +50,78 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacke
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
md_content = f.read()
|
||||
self.markdown = md_content
|
||||
self.valid = True
|
||||
|
||||
_log.info(self.markdown)
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"Could not initialize MD backend for file with hash {self.document_hash}."
|
||||
) from e
|
||||
return
|
||||
|
||||
def page_count(self) -> int:
    """Return the page count, which is always 0 here."""
    pages = 0
    return pages
|
||||
# Debug helper: walk every node of the parsed Markdown AST.
def iterate_elements(self, element, depth=0):
    """Print an indented outline of *element* and all of its descendants.

    Args:
        element: Node of the parsed marko tree to describe.
        depth: Nesting level of *element*; sets the indentation width.
    """
    if isinstance(element, BlockElement):
        category = " (Block Element)"
    elif isinstance(element, InlineElement):
        category = " (Inline Element)"
    else:
        # Nodes that are neither kind still get a terminated line.
        category = ""
    print(f"{' ' * depth}- {type(element).__name__}{category}")

    children = getattr(element, "children", None)
    # A string "children" value is text, not a sequence of child nodes.
    has_child_nodes = bool(children) and not isinstance(children, str)
    # Content of the first child; guarded so an empty child list cannot
    # raise IndexError.
    lead = getattr(children[0], "children", None) if has_child_nodes else None

    # Print type-specific details for the node kinds of interest.
    if isinstance(element, marko.block.Heading):
        print(f" - Heading level {element.level}, content: {lead}")
    elif isinstance(element, marko.block.List):
        print(f" - List {'ordered' if element.ordered else 'unordered'}")
    elif isinstance(element, marko.block.ListItem):
        print(" - List item")
    elif isinstance(element, marko.block.Paragraph):
        print(f" - Paragraph: {lead}")
    elif isinstance(element, marko.inline.Image):
        print(f" - Image with alt: {element.title}, url: {element.dest}")
    elif type(element).__name__ == "Table":
        # Matched by class name because the isinstance check against
        # marko.block.Table was commented out, which left this print
        # running for every node.
        # NOTE(review): Table is presumably provided by the GFM extension
        # rather than marko.block - confirm and switch to isinstance.
        print(" - Table")

    # Recurse into real child nodes only, never into the characters of a
    # text payload.
    if has_child_nodes:
        for child in children:
            self.iterate_elements(child, depth + 1)
|
||||
|
||||
def is_valid(self) -> bool:
    """Return the ``valid`` flag stored on this backend instance."""
    valid_flag = self.valid
    return valid_flag
|
||||
|
||||
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend does not support pagination."""
    paginated = False
    return paginated
|
||||
|
||||
def unload(self):
    """Release the input source held by this backend.

    An in-memory ``BytesIO`` source is closed first; the reference is then
    cleared whatever its type.
    """
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()
    self.path_or_stream = None
|
||||
|
||||
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend does not support pagination."""
    # Open question from the author: should this be True? If so, how pages
    # would be handled is still undecided.
    paginated = False
    return paginated
|
||||
|
||||
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the input formats this backend handles: only ``InputFormat.MD``."""
    formats = {InputFormat.MD}
    return formats
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
# Parse and render
|
||||
parser = marko.Markdown(renderer=MarkdownToDoclingRenderer)
|
||||
parsed_object = parser.parse(markdown_text)
|
||||
# Render the parsed Markdown into a structured object
|
||||
markdown_object = parser.render(parsed_object)
|
||||
print("converting Markdown...")
|
||||
doc = DoclingDocument(name="Test")
|
||||
doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
|
||||
|
||||
if self.is_valid():
|
||||
# Parse the markdown into an abstract syntax tree (AST)
|
||||
parser = marko.Markdown(extensions=['gfm'])
|
||||
parsed_ast = parser.parse(self.markdown)
|
||||
# Start iterating from the root of the AST
|
||||
self.iterate_elements(parsed_ast)
|
||||
|
||||
print(marko_doc)
|
||||
# doc = self.walk(self.soup.body, doc)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
||||
|
@ -496,6 +496,8 @@ class _DocumentConversionInput(BaseModel):
|
||||
|
||||
if mime is None:
|
||||
mime = self._detect_html_xhtml(content)
|
||||
if mime is None:
|
||||
mime = "text/markdown"
|
||||
|
||||
format = MimeTypeToFormat.get(mime)
|
||||
return format
|
||||
|
58
docs/examples/run_md.py
Normal file
58
docs/examples/run_md.py
Normal file
@ -0,0 +1,58 @@
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
PdfFormatOption,
|
||||
WordFormatOption,
|
||||
)
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
import os
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main():
    """Convert each listed Markdown file and export it as Markdown, JSON and YAML.

    Outputs are written to the ``scratch`` directory, which is created when
    it does not exist yet.
    """
    input_paths = [
        Path("README.md"),
    ]

    out_path = Path("scratch")
    # Opening a file for writing does not create missing parent directories.
    out_path.mkdir(parents=True, exist_ok=True)

    for path in input_paths:
        in_doc = InputDocument(
            path_or_stream=path,
            # The input is Markdown, handled by the Markdown backend.
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        )
        mdb = MarkdownDocumentBackend(in_doc=in_doc, path_or_stream=path)
        document = mdb.convert()

        # Export the Docling document to Markdown, JSON and YAML.
        fn = path.name
        exported = document.export_to_dict()

        with (out_path / f"{fn}.md").open("w", encoding="utf-8") as fp:
            fp.write(document.export_to_markdown())

        with (out_path / f"{fn}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(exported))

        with (out_path / f"{fn}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(exported))

        # Report only after the files have actually been written.
        print(
            f"Document {path} converted."
            f"\nSaved markdown output to: {str(out_path)}"
        )


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user