From bef429fee306585cf7bc06d58b1f41594dc785ef Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 18 Oct 2024 16:58:22 +0200 Subject: [PATCH] Improvements in md parsing Signed-off-by: Maksym Lysak --- docling/backend/md_backend.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 6eca9fd4..e18a2498 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -129,7 +129,31 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): parent=parent_element, caption=element.title ) - + + elif isinstance(element, marko.inline.RawText): + print(f" - Paragraph (raw text): {element.children}") + snippet_text = str(element.children) + doc.add_text( + label=DocItemLabel.PARAGRAPH, + parent=parent_element, + text=snippet_text + ) + + elif isinstance(element, marko.inline.CodeSpan): + print(f" - Paragraph (code): {element.children}") + snippet_text = str(element.children) + doc.add_text( + label=DocItemLabel.PARAGRAPH, + parent=parent_element, + text=snippet_text + ) + else: + if not isinstance(element, str): + print("Something else: {}".format(element)) + # print(element) + + # elif isinstance(element, marko.block.Table): + # print(" - Table") # elif isinstance(element, marko.block.Table): # print(" - Table")