mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Added support for code blocks and fenced code in MD
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
4fb803f46c
commit
186d71a057
@ -157,6 +157,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||||
doc.add_picture(parent=parent_element, caption=element.title)
|
doc.add_picture(parent=parent_element, caption=element.title)
|
||||||
|
|
||||||
|
elif isinstance(element, marko.block.Paragraph):
|
||||||
|
print("Paragraph:")
|
||||||
|
print(element)
|
||||||
|
print("")
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.RawText):
|
elif isinstance(element, marko.inline.RawText):
|
||||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||||
snippet_text = str(element.children).strip()
|
snippet_text = str(element.children).strip()
|
||||||
@ -182,12 +187,34 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.inline.CodeSpan):
|
elif isinstance(element, marko.inline.CodeSpan):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
_log.debug(f" - Paragraph (code): {element.children}")
|
_log.debug(f" - Code Span: {element.children}")
|
||||||
snippet_text = str(element.children).strip()
|
snippet_text = str(element.children).strip()
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||||
)
|
)
|
||||||
|
|
||||||
|
elif isinstance(element, marko.block.CodeBlock):
|
||||||
|
self.close_table(doc)
|
||||||
|
print("CODE BLOCK")
|
||||||
|
print(element)
|
||||||
|
print("")
|
||||||
|
_log.debug(f" - Code Block: {element.children}")
|
||||||
|
snippet_text = str(element.children[0].children).strip()
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||||
|
)
|
||||||
|
|
||||||
|
elif isinstance(element, marko.block.FencedCode):
|
||||||
|
self.close_table(doc)
|
||||||
|
print("FENCED CODE")
|
||||||
|
print(element)
|
||||||
|
print("")
|
||||||
|
_log.debug(f" - Code Block: {element.children}")
|
||||||
|
snippet_text = str(element.children[0].children).strip()
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.LineBreak):
|
elif isinstance(element, marko.inline.LineBreak):
|
||||||
if self.in_table:
|
if self.in_table:
|
||||||
_log.debug("Line break in a table")
|
_log.debug("Line break in a table")
|
||||||
@ -205,14 +232,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if not isinstance(element, str):
|
if not isinstance(element, str):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
_log.debug("Some other element: {}".format(element))
|
_log.debug("Some other element: {}".format(element))
|
||||||
|
print("SOMETHING ELSE")
|
||||||
|
print(element)
|
||||||
|
print("")
|
||||||
|
|
||||||
# Iterate through the element's children (if any)
|
# Iterate through the element's children (if any)
|
||||||
if not isinstance(element, marko.block.ListItem):
|
if not isinstance(element, marko.block.ListItem):
|
||||||
if not isinstance(element, marko.block.Heading):
|
if not isinstance(element, marko.block.Heading):
|
||||||
# if not isinstance(element, marko.block.Paragraph):
|
if not isinstance(element, marko.block.FencedCode):
|
||||||
if hasattr(element, "children"):
|
# if not isinstance(element, marko.block.Paragraph):
|
||||||
for child in element.children:
|
if hasattr(element, "children"):
|
||||||
self.iterate_elements(child, depth + 1, doc, parent_element)
|
for child in element.children:
|
||||||
|
self.iterate_elements(child, depth + 1, doc, parent_element)
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return self.valid
|
return self.valid
|
||||||
|
@ -1,13 +1,11 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from io import BytesIO
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.document_converter import (
|
from docling.document_converter import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
PdfFormatOption,
|
PdfFormatOption,
|
||||||
@ -21,24 +19,17 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
input_paths = [
|
input_paths = [
|
||||||
|
Path("README.md"),
|
||||||
Path("tests/data/wiki_duck.html"),
|
Path("tests/data/wiki_duck.html"),
|
||||||
Path("tests/data/word_sample.docx"),
|
Path("tests/data/word_sample.docx"),
|
||||||
Path("tests/data/word_nested.docx"),
|
|
||||||
Path("tests/data/lorem_ipsum.docx"),
|
Path("tests/data/lorem_ipsum.docx"),
|
||||||
Path("tests/data/powerpoint_sample.pptx"),
|
Path("tests/data/powerpoint_sample.pptx"),
|
||||||
Path("tests/data/2305.03393v1-pg9-img.png"),
|
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||||
Path("tests/data/2206.01062.pdf"),
|
Path("tests/data/2206.01062.pdf"),
|
||||||
Path("tests/data/test_01.asciidoc"),
|
Path("tests/data/test_01.asciidoc"),
|
||||||
Path("tests/data/test_02.asciidoc"),
|
Path("tests/data/test_01.asciidoc"),
|
||||||
Path("README.md"),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# To read from bytes instead:
|
|
||||||
# docs = [
|
|
||||||
# DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read()))
|
|
||||||
# for f in input_paths
|
|
||||||
# ]
|
|
||||||
|
|
||||||
## for defaults use:
|
## for defaults use:
|
||||||
# doc_converter = DocumentConverter()
|
# doc_converter = DocumentConverter()
|
||||||
|
|
||||||
@ -57,8 +48,7 @@ def main():
|
|||||||
], # whitelist formats, non-matching files are ignored.
|
], # whitelist formats, non-matching files are ignored.
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_cls=StandardPdfPipeline,
|
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
||||||
backend=DoclingParseDocumentBackend,
|
|
||||||
),
|
),
|
||||||
InputFormat.DOCX: WordFormatOption(
|
InputFormat.DOCX: WordFormatOption(
|
||||||
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||||
@ -68,7 +58,6 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
conv_results = doc_converter.convert_all(input_paths)
|
conv_results = doc_converter.convert_all(input_paths)
|
||||||
# conv_results = doc_converter.convert_all(docs)
|
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
out_path = Path("scratch")
|
out_path = Path("scratch")
|
||||||
|
Loading…
Reference in New Issue
Block a user