mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
md_backend produces docling document with headers, paragraphs, lists
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
1df89f79ff
commit
534b2203f6
@ -60,39 +60,83 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Function to iterate over all elements in the AST
|
# Function to iterate over all elements in the AST
|
||||||
def iterate_elements(self, element, depth=0):
|
def iterate_elements(self, element, depth=0, doc=None, parent_element = None):
|
||||||
# Print the element type and optionally its content
|
# Print the element type and optionally its content
|
||||||
print(f"{' ' * depth}- {type(element).__name__}", end="")
|
# print(f"{' ' * depth}- {type(element).__name__}", end="")
|
||||||
|
# print(f"{' ' * depth}", end="")
|
||||||
|
|
||||||
if isinstance(element, BlockElement):
|
# if isinstance(element, BlockElement):
|
||||||
print(" (Block Element)")
|
# print(" (Block Element)")
|
||||||
elif isinstance(element, InlineElement):
|
# elif isinstance(element, InlineElement):
|
||||||
print(" (Inline Element)")
|
# print(" (Inline Element)")
|
||||||
|
|
||||||
|
not_a_list_item = True
|
||||||
|
|
||||||
# Check for different element types and print relevant details
|
# Check for different element types and print relevant details
|
||||||
if isinstance(element, marko.block.Heading):
|
if isinstance(element, marko.block.Heading):
|
||||||
print(f" - Heading level {element.level}, content: {element.children[0].children}")
|
print(f" - Heading level {element.level}, content: {element.children[0].children}")
|
||||||
|
if element.level == 1:
|
||||||
|
doc_label = DocItemLabel.TITLE
|
||||||
|
else:
|
||||||
|
doc_label = DocItemLabel.SECTION_HEADER
|
||||||
|
snippet_text = element.children[0].children
|
||||||
|
|
||||||
|
parent_element = doc.add_text(
|
||||||
|
label=doc_label,
|
||||||
|
parent=parent_element,
|
||||||
|
text=snippet_text
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
elif isinstance(element, marko.block.List):
|
elif isinstance(element, marko.block.List):
|
||||||
print(f" - List {'ordered' if element.ordered else 'unordered'}")
|
print(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||||
|
|
||||||
|
list_label = GroupLabel.LIST
|
||||||
|
if element.ordered:
|
||||||
|
list_label = GroupLabel.ORDERED_LIST
|
||||||
|
parent_element = doc.add_group(
|
||||||
|
label=list_label,
|
||||||
|
name=f"list",
|
||||||
|
parent=parent_element
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.block.ListItem):
|
elif isinstance(element, marko.block.ListItem):
|
||||||
print(" - List item")
|
print(" - List item")
|
||||||
|
not_a_list_item = False
|
||||||
|
snippet_text = str(element.children[0].children[0].children)
|
||||||
|
is_numbered = False
|
||||||
|
if parent_element.label == GroupLabel.ORDERED_LIST:
|
||||||
|
is_numbered = True
|
||||||
|
doc.add_list_item(
|
||||||
|
# marker=enum_marker,
|
||||||
|
enumerated=is_numbered,
|
||||||
|
parent=parent_element,
|
||||||
|
text=snippet_text
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.block.Paragraph):
|
elif isinstance(element, marko.block.Paragraph):
|
||||||
print(f" - Paragraph: {element.children[0].children}")
|
print(f" - Paragraph: {element.children[0].children}")
|
||||||
|
snippet_text = str(element.children[0].children)
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=parent_element,
|
||||||
|
text=snippet_text
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.Image):
|
elif isinstance(element, marko.inline.Image):
|
||||||
print(f" - Image with alt: {element.title}, url: {element.dest}")
|
print(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||||
|
doc.add_picture(
|
||||||
|
parent=parent_element,
|
||||||
|
caption=element.title
|
||||||
|
)
|
||||||
|
|
||||||
# elif isinstance(element, marko.block.Table):
|
# elif isinstance(element, marko.block.Table):
|
||||||
#
|
# print(" - Table")
|
||||||
print(" - Table")
|
|
||||||
|
|
||||||
# Iterate through the element's children (if any)
|
# Iterate through the element's children (if any)
|
||||||
if hasattr(element, 'children'):
|
if hasattr(element, 'children'):
|
||||||
for child in element.children:
|
for child in element.children:
|
||||||
self.iterate_elements(child, depth + 1)
|
self.iterate_elements(child, depth + 1, doc, parent_element)
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return self.valid
|
return self.valid
|
||||||
@ -113,14 +157,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
print("converting Markdown...")
|
print("converting Markdown...")
|
||||||
doc = DoclingDocument(name="Test")
|
doc = DoclingDocument(name="Test")
|
||||||
doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
|
# doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
|
||||||
|
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
# Parse the markdown into an abstract syntax tree (AST)
|
# Parse the markdown into an abstract syntax tree (AST)
|
||||||
parser = marko.Markdown(extensions=['gfm'])
|
parser = marko.Markdown(extensions=['gfm'])
|
||||||
parsed_ast = parser.parse(self.markdown)
|
parsed_ast = parser.parse(self.markdown)
|
||||||
# Start iterating from the root of the AST
|
# Start iterating from the root of the AST
|
||||||
self.iterate_elements(parsed_ast)
|
self.iterate_elements(parsed_ast, 0 , doc, None)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
Loading…
Reference in New Issue
Block a user