From 534b2203f66b4f2e8f45b67841239fe58b58604d Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 18 Oct 2024 16:08:41 +0200 Subject: [PATCH] md_backend produces docling document with headers, paragraphs, lists Signed-off-by: Maksym Lysak --- docling/backend/md_backend.py | 68 ++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index e4244f87..6eca9fd4 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -60,39 +60,83 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): return # Function to iterate over all elements in the AST - def iterate_elements(self, element, depth=0): + def iterate_elements(self, element, depth=0, doc=None, parent_element = None): # Print the element type and optionally its content - print(f"{' ' * depth}- {type(element).__name__}", end="") - - if isinstance(element, BlockElement): - print(" (Block Element)") - elif isinstance(element, InlineElement): - print(" (Inline Element)") + # print(f"{' ' * depth}- {type(element).__name__}", end="") + # print(f"{' ' * depth}", end="") + # if isinstance(element, BlockElement): + # print(" (Block Element)") + # elif isinstance(element, InlineElement): + # print(" (Inline Element)") + + not_a_list_item = True + # Check for different element types and print relevant details if isinstance(element, marko.block.Heading): print(f" - Heading level {element.level}, content: {element.children[0].children}") + if element.level == 1: + doc_label = DocItemLabel.TITLE + else: + doc_label = DocItemLabel.SECTION_HEADER + snippet_text = element.children[0].children + + parent_element = doc.add_text( + label=doc_label, + parent=parent_element, + text=snippet_text + ) + elif isinstance(element, marko.block.List): print(f" - List {'ordered' if element.ordered else 'unordered'}") + + list_label = GroupLabel.LIST + if element.ordered: + list_label = GroupLabel.ORDERED_LIST + parent_element = doc.add_group( + label=list_label, + name=f"list", + parent=parent_element + ) elif isinstance(element, marko.block.ListItem): print(" - List item") + not_a_list_item = False + snippet_text = str(element.children[0].children[0].children) + is_numbered = False + if parent_element.label == GroupLabel.ORDERED_LIST: + is_numbered = True + doc.add_list_item( + # marker=enum_marker, + enumerated=is_numbered, + parent=parent_element, + text=snippet_text + ) elif isinstance(element, marko.block.Paragraph): print(f" - Paragraph: {element.children[0].children}") + snippet_text = str(element.children[0].children) + doc.add_text( + label=DocItemLabel.PARAGRAPH, + parent=parent_element, + text=snippet_text + ) elif isinstance(element, marko.inline.Image): print(f" - Image with alt: {element.title}, url: {element.dest}") + doc.add_picture( + parent=parent_element, + caption=element.title + ) # elif isinstance(element, marko.block.Table): - # - print(" - Table") + # print(" - Table") # Iterate through the element's children (if any) if hasattr(element, 'children'): for child in element.children: - self.iterate_elements(child, depth + 1) + self.iterate_elements(child, depth + 1, doc, parent_element) def is_valid(self) -> bool: return self.valid @@ -113,14 +157,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: print("converting Markdown...") doc = DoclingDocument(name="Test") - doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion") + # doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion") if self.is_valid(): # Parse the markdown into an abstract syntax tree (AST) parser = marko.Markdown(extensions=['gfm']) parsed_ast = parser.parse(self.markdown) # Start iterating from the root of the AST - self.iterate_elements(parsed_ast) + self.iterate_elements(parsed_ast, 0 , doc, None) else: raise RuntimeError(