diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 072ced52..88e4751b 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -157,6 +157,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): _log.debug(f" - Image with alt: {element.title}, url: {element.dest}") doc.add_picture(parent=parent_element, caption=element.title) + elif isinstance(element, marko.block.Paragraph): + print("Paragraph:") + print(element) + print("") + elif isinstance(element, marko.inline.RawText): _log.debug(f" - Paragraph (raw text): {element.children}") snippet_text = str(element.children).strip() @@ -182,12 +187,34 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.inline.CodeSpan): self.close_table(doc) - _log.debug(f" - Paragraph (code): {element.children}") + _log.debug(f" - Code Span: {element.children}") snippet_text = str(element.children).strip() doc.add_text( label=DocItemLabel.CODE, parent=parent_element, text=snippet_text ) + elif isinstance(element, marko.block.CodeBlock): + self.close_table(doc) + print("CODE BLOCK") + print(element) + print("") + _log.debug(f" - Code Block: {element.children}") + snippet_text = str(element.children[0].children).strip() + doc.add_text( + label=DocItemLabel.CODE, parent=parent_element, text=snippet_text + ) + + elif isinstance(element, marko.block.FencedCode): + self.close_table(doc) + print("FENCED CODE") + print(element) + print("") + _log.debug(f" - Code Block: {element.children}") + snippet_text = str(element.children[0].children).strip() + doc.add_text( + label=DocItemLabel.CODE, parent=parent_element, text=snippet_text + ) + elif isinstance(element, marko.inline.LineBreak): if self.in_table: _log.debug("Line break in a table") @@ -205,14 +232,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): if not isinstance(element, str): self.close_table(doc) _log.debug("Some other element: {}".format(element)) + print("SOMETHING ELSE") + print(element) + print("") # Iterate through the element's children (if any) if not isinstance(element, marko.block.ListItem): if not isinstance(element, marko.block.Heading): - # if not isinstance(element, marko.block.Paragraph): - if hasattr(element, "children"): - for child in element.children: - self.iterate_elements(child, depth + 1, doc, parent_element) + if not isinstance(element, marko.block.FencedCode): + # if not isinstance(element, marko.block.Paragraph): + if hasattr(element, "children"): + for child in element.children: + self.iterate_elements(child, depth + 1, doc, parent_element) def is_valid(self) -> bool: return self.valid diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py index 80384f6d..00f649b4 100644 --- a/docs/examples/run_with_formats.py +++ b/docs/examples/run_with_formats.py @@ -1,13 +1,11 @@ import json import logging -from io import BytesIO from pathlib import Path import yaml -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import DocumentStream, InputFormat +from docling.datamodel.base_models import InputFormat from docling.document_converter import ( DocumentConverter, PdfFormatOption, @@ -21,24 +19,17 @@ _log = logging.getLogger(__name__) def main(): input_paths = [ + Path("README.md"), Path("tests/data/wiki_duck.html"), Path("tests/data/word_sample.docx"), - Path("tests/data/word_nested.docx"), Path("tests/data/lorem_ipsum.docx"), Path("tests/data/powerpoint_sample.pptx"), Path("tests/data/2305.03393v1-pg9-img.png"), Path("tests/data/2206.01062.pdf"), Path("tests/data/test_01.asciidoc"), - Path("tests/data/test_02.asciidoc"), - Path("README.md"), + Path("tests/data/test_01.asciidoc"), ] - # To read from bytes instead: - # docs = [ - # DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read())) - # for f in input_paths - # ] - ## for defaults use: # doc_converter = DocumentConverter() @@ -57,8 +48,7 @@ def main(): ], # whitelist formats, non-matching files are ignored. format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_cls=StandardPdfPipeline, - backend=DoclingParseDocumentBackend, + pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend ), InputFormat.DOCX: WordFormatOption( pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend @@ -68,7 +58,6 @@ def main(): ) conv_results = doc_converter.convert_all(input_paths) - # conv_results = doc_converter.convert_all(docs) for res in conv_results: out_path = Path("scratch")