md_backend produces docling document with headers, paragraphs, lists

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-10-18 16:08:41 +02:00
parent 1df89f79ff
commit 534b2203f6

View File

@ -60,39 +60,83 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
return
# Function to iterate over all elements in the AST
def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
    """Walk a marko AST node and mirror it into ``doc``.

    Headings become TITLE (level 1) or SECTION_HEADER text items, lists
    become LIST / ORDERED_LIST groups, list items and paragraphs become
    list/text items, and inline images become pictures. The walk then
    recurses into the node's child nodes.

    Args:
        element: The marko AST node to process.
        depth: Nesting depth of ``element``; only forwarded (incremented)
            to recursive calls.
        doc: Target document; must provide ``add_text``, ``add_group``,
            ``add_list_item`` and ``add_picture``.
        parent_element: Document item that newly created items are
            attached to (``None`` for the document root).

    Raises:
        ValueError: If ``doc`` is ``None``.
    """
    if doc is None:
        # Fail with a clear message instead of an AttributeError on the
        # first heading/paragraph encountered.
        raise ValueError("iterate_elements requires a target document (doc)")

    # Leaf inline nodes (e.g. RawText) store their text as a plain string in
    # ``children``; only real node sequences are walked, never characters.
    children = getattr(element, "children", None)
    child_nodes = list(children) if isinstance(children, (list, tuple)) else []

    if isinstance(element, marko.block.Heading):
        if element.level == 1:
            doc_label = DocItemLabel.TITLE
        else:
            doc_label = DocItemLabel.SECTION_HEADER
        # Items created while walking the heading's children hang off it.
        parent_element = doc.add_text(
            label=doc_label,
            parent=parent_element,
            text=self._inline_text(element),
        )

    elif isinstance(element, marko.block.List):
        list_label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
        parent_element = doc.add_group(
            label=list_label,
            name="list",
            parent=parent_element,
        )

    elif isinstance(element, marko.block.ListItem):
        item_text = ""
        if child_nodes and isinstance(child_nodes[0], marko.block.Paragraph):
            # The leading paragraph *is* the item's text. Replace it with its
            # own inline children in the walk so it is not emitted a second
            # time as a standalone PARAGRAPH, while images inside it and any
            # nested blocks (e.g. sub-lists) are still visited.
            lead = child_nodes[0]
            item_text = self._inline_text(lead)
            lead_children = getattr(lead, "children", None)
            inline_nodes = (
                list(lead_children)
                if isinstance(lead_children, (list, tuple))
                else []
            )
            child_nodes = inline_nodes + child_nodes[1:]
        is_numbered = (
            getattr(parent_element, "label", None) == GroupLabel.ORDERED_LIST
        )
        doc.add_list_item(
            enumerated=is_numbered,
            parent=parent_element,
            text=item_text,
        )

    elif isinstance(element, marko.block.Paragraph):
        paragraph_text = self._inline_text(element)
        # A paragraph holding only an image has no text of its own; the
        # picture is added when the walk reaches the Image node.
        if paragraph_text.strip():
            doc.add_text(
                label=DocItemLabel.PARAGRAPH,
                parent=parent_element,
                text=paragraph_text,
            )

    elif isinstance(element, marko.inline.Image):
        # NOTE(review): ``element.title`` is a plain string (or None); confirm
        # that ``add_picture`` accepts that for ``caption`` rather than
        # requiring a document item.
        doc.add_picture(
            parent=parent_element,
            caption=element.title,
        )

    for child in child_nodes:
        self.iterate_elements(child, depth + 1, doc, parent_element)

def _inline_text(self, node):
    """Return the concatenated literal text found beneath ``node``.

    Handles nested inline markup (emphasis, links, code spans) by recursing
    until string-valued ``children`` are reached. Images contribute no text
    and line breaks contribute a single space.
    """
    if isinstance(node, marko.inline.Image):
        return ""
    if isinstance(node, marko.inline.LineBreak):
        return " "
    children = getattr(node, "children", None)
    if isinstance(children, str):
        return children
    if not children:
        return ""
    return "".join(self._inline_text(child) for child in children)
# Report the backend's validity flag: returns the instance's ``valid`` attribute unchanged.
def is_valid(self) -> bool:
return self.valid
@ -113,14 +157,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
print("converting Markdown...")
doc = DoclingDocument(name="Test")
doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
# doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
if self.is_valid():
# Parse the markdown into an abstract syntax tree (AST)
parser = marko.Markdown(extensions=['gfm'])
parsed_ast = parser.parse(self.markdown)
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast)
self.iterate_elements(parsed_ast, 0 , doc, None)
else:
raise RuntimeError(