Detect Markdown tables and assemble their rows in a temporary buffer

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-10-21 13:07:05 +02:00
parent bef429fee3
commit fa2f8cf236

View File

@ -15,18 +15,25 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
import marko.ext
import marko.ext.gfm
import marko.inline
from docling.backend.abstract_backend import ( from docling.backend.abstract_backend import (
DeclarativeDocumentBackend, DeclarativeDocumentBackend
PaginatedDocumentBackend,
) )
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
import marko import marko
from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.) from marko import Markdown
from marko.block import BlockElement # from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.)
from marko.inline import InlineElement # from marko.ext.gfm.elements import Table
# from marko.ext.gfm.elements import TableCell
# from marko.ext.gfm.elements import TableRow
# from marko.block import BlockElement
# from marko.inline import InlineElement
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -42,6 +49,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.valid = True self.valid = True
self.markdown = "" # To store original Markdown string self.markdown = "" # To store original Markdown string
self.in_table = False
self.md_table_buffer = []
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8") text_stream = self.path_or_stream.getvalue().decode("utf-8")
@ -59,6 +69,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e ) from e
return return
def close_table(self):
    """Flush the buffered markdown table rows, if a table is open.

    When ``self.in_table`` is set, every entry of ``self.md_table_buffer``
    is written to stdout between a "TABLE START" and a "TABLE END" banner
    (with an empty line before and after), and then the table state is
    reset: ``self.in_table`` becomes ``False`` and ``self.md_table_buffer``
    is replaced by a fresh empty list.

    When no table is open the call does nothing.

    Returns:
        None.
    """
    # Guard clause: nothing is buffered unless a table is in progress.
    if not self.in_table:
        return

    # Same output order as before: blank line, banner, rows, banner, blank.
    dump_lines = [
        "",
        "====================================== TABLE START",
        *self.md_table_buffer,
        "====================================== TABLE END",
        "",
    ]
    for dump_line in dump_lines:
        print(dump_line)

    # Reset the table-tracking state for the next table.
    self.md_table_buffer = []
    self.in_table = False
# Function to iterate over all elements in the AST # Function to iterate over all elements in the AST
def iterate_elements(self, element, depth=0, doc=None, parent_element = None): def iterate_elements(self, element, depth=0, doc=None, parent_element = None):
# Print the element type and optionally its content # Print the element type and optionally its content
@ -70,11 +93,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# elif isinstance(element, InlineElement): # elif isinstance(element, InlineElement):
# print(" (Inline Element)") # print(" (Inline Element)")
not_a_list_item = True # not_a_list_item = True
# Check for different element types and print relevant details # Check for different element types and print relevant details
if isinstance(element, marko.block.Heading): if isinstance(element, marko.block.Heading):
print(f" - Heading level {element.level}, content: {element.children[0].children}") self.close_table()
# print(f" - Heading level {element.level}, content: {element.children[0].children}")
if element.level == 1: if element.level == 1:
doc_label = DocItemLabel.TITLE doc_label = DocItemLabel.TITLE
else: else:
@ -89,8 +114,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.List): elif isinstance(element, marko.block.List):
print(f" - List {'ordered' if element.ordered else 'unordered'}") self.close_table()
# print(f" - List {'ordered' if element.ordered else 'unordered'}")
list_label = GroupLabel.LIST list_label = GroupLabel.LIST
if element.ordered: if element.ordered:
list_label = GroupLabel.ORDERED_LIST list_label = GroupLabel.ORDERED_LIST
@ -101,8 +126,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) )
elif isinstance(element, marko.block.ListItem): elif isinstance(element, marko.block.ListItem):
print(" - List item") self.close_table()
not_a_list_item = False # print(" - List item")
# not_a_list_item = False
snippet_text = str(element.children[0].children[0].children) snippet_text = str(element.children[0].children[0].children)
is_numbered = False is_numbered = False
if parent_element.label == GroupLabel.ORDERED_LIST: if parent_element.label == GroupLabel.ORDERED_LIST:
@ -115,7 +141,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) )
elif isinstance(element, marko.block.Paragraph): elif isinstance(element, marko.block.Paragraph):
print(f" - Paragraph: {element.children[0].children}") self.close_table()
# print(f" - Paragraph: {element.children[0].children}")
snippet_text = str(element.children[0].children) snippet_text = str(element.children[0].children)
doc.add_text( doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
@ -124,38 +151,71 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) )
elif isinstance(element, marko.inline.Image): elif isinstance(element, marko.inline.Image):
print(f" - Image with alt: {element.title}, url: {element.dest}") self.close_table()
# print(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture( doc.add_picture(
parent=parent_element, parent=parent_element,
caption=element.title caption=element.title
) )
elif isinstance(element, marko.inline.RawText): elif isinstance(element, marko.inline.RawText):
print(f" - Paragraph (raw text): {element.children}") # print(f" - Paragraph (raw text): {element.children}")
# TODO: Detect start of the table here...
snippet_text = str(element.children) snippet_text = str(element.children)
doc.add_text( if "|" in snippet_text:
label=DocItemLabel.PARAGRAPH, # most likely table
parent=parent_element, # if in_table == False:
text=snippet_text # print("====================================== TABLE START!")
) self.in_table = True
# print(f" - TABLE: {element.children}")
if len(self.md_table_buffer) > 0:
self.md_table_buffer[len(self.md_table_buffer)-1] += str(snippet_text)
else:
self.md_table_buffer.append(snippet_text)
else:
self.close_table()
self.in_table = False
# most likely just text
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent_element,
text=snippet_text
)
elif isinstance(element, marko.inline.CodeSpan): elif isinstance(element, marko.inline.CodeSpan):
print(f" - Paragraph (code): {element.children}") self.close_table()
# print(f" - Paragraph (code): {element.children}")
snippet_text = str(element.children) snippet_text = str(element.children)
doc.add_text( doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.CODE,
parent=parent_element, parent=parent_element,
text=snippet_text text=snippet_text
) )
elif isinstance(element, marko.inline.LineBreak):
if self.in_table:
print("Line break in table")
self.md_table_buffer.append("")
# print("HTML Block else: {}".format(element))
elif isinstance(element, marko.block.HTMLBlock):
self.close_table()
print("HTML Block else: {}".format(element))
# elif isinstance(element, marko.ext.gfm.elements.Table):
# elif isinstance(element, marko.ext.gfm.elements.Table):
# print(" - Table")
# elif isinstance(element, TableRow):
# print(" - TableRow")
# elif isinstance(element, TableCell):
# print(" - TableCell")
else: else:
if not isinstance(element, str): if not isinstance(element, str):
self.close_table()
print("Something else: {}".format(element)) print("Something else: {}".format(element))
# print(element)
# elif isinstance(element, marko.block.Table): # elif isinstance(element, marko.block.Table):
# print(" - Table") # print(" - Table")
# elif isinstance(element, marko.block.Table):
# print(" - Table")
# Iterate through the element's children (if any) # Iterate through the element's children (if any)
if hasattr(element, 'children'): if hasattr(element, 'children'):
@ -185,8 +245,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if self.is_valid(): if self.is_valid():
# Parse the markdown into an abstract syntax tree (AST) # Parse the markdown into an abstract syntax tree (AST)
parser = marko.Markdown(extensions=['gfm']) # parser = marko.Markdown(extensions=['gfm'])
parsed_ast = parser.parse(self.markdown)
# gfm_parser = Markdown(extensions=['gfm'])
gfm_parser = Markdown()
# gfm_parser.use('gfm')
parsed_ast = gfm_parser.parse(self.markdown)
# parsed_ast = gfm(self.markdown)
# Start iterating from the root of the AST # Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0 , doc, None) self.iterate_elements(parsed_ast, 0 , doc, None)