mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
Detecting and assembling tables in markdown in temporary buffers
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
bef429fee3
commit
fa2f8cf236
@ -15,18 +15,25 @@ from docling_core.types.doc import (
|
|||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
|
import marko.ext
|
||||||
|
import marko.ext.gfm
|
||||||
|
import marko.inline
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docling.backend.abstract_backend import (
|
||||||
DeclarativeDocumentBackend,
|
DeclarativeDocumentBackend
|
||||||
PaginatedDocumentBackend,
|
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
import marko
|
import marko
|
||||||
from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.)
|
from marko import Markdown
|
||||||
from marko.block import BlockElement
|
# from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.)
|
||||||
from marko.inline import InlineElement
|
# from marko.ext.gfm.elements import Table
|
||||||
|
# from marko.ext.gfm.elements import TableCell
|
||||||
|
# from marko.ext.gfm.elements import TableRow
|
||||||
|
|
||||||
|
# from marko.block import BlockElement
|
||||||
|
# from marko.inline import InlineElement
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -42,6 +49,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.valid = True
|
self.valid = True
|
||||||
self.markdown = "" # To store original Markdown string
|
self.markdown = "" # To store original Markdown string
|
||||||
|
|
||||||
|
self.in_table = False
|
||||||
|
self.md_table_buffer = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||||
@ -59,6 +69,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
) from e
|
) from e
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def close_table(self):
    """Flush the buffered Markdown table rows and leave table mode.

    When a table is currently open (``self.in_table`` is true), each entry
    collected in ``self.md_table_buffer`` is emitted through the module
    logger at DEBUG level, framed by start/end marker lines. The table
    state is then reset: ``self.in_table`` becomes ``False`` and
    ``self.md_table_buffer`` is replaced by a fresh empty list.

    Returns:
        None.
    """
    if self.in_table:
        # Diagnostic dump of the assembled rows. Routed through the module
        # logger instead of print() so library users control the output.
        _log.debug("====================================== TABLE START")
        for md_table_row in self.md_table_buffer:
            _log.debug("%s", md_table_row)
        _log.debug("====================================== TABLE END")

    # Reset unconditionally so the call is idempotent and always leaves a
    # clean, empty buffer for the next table.
    self.in_table = False
    self.md_table_buffer = []
|
||||||
|
|
||||||
# Function to iterate over all elements in the AST
|
# Function to iterate over all elements in the AST
|
||||||
def iterate_elements(self, element, depth=0, doc=None, parent_element = None):
|
def iterate_elements(self, element, depth=0, doc=None, parent_element = None):
|
||||||
# Print the element type and optionally its content
|
# Print the element type and optionally its content
|
||||||
@ -70,11 +93,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# elif isinstance(element, InlineElement):
|
# elif isinstance(element, InlineElement):
|
||||||
# print(" (Inline Element)")
|
# print(" (Inline Element)")
|
||||||
|
|
||||||
not_a_list_item = True
|
# not_a_list_item = True
|
||||||
|
|
||||||
|
|
||||||
# Check for different element types and print relevant details
|
# Check for different element types and print relevant details
|
||||||
if isinstance(element, marko.block.Heading):
|
if isinstance(element, marko.block.Heading):
|
||||||
print(f" - Heading level {element.level}, content: {element.children[0].children}")
|
self.close_table()
|
||||||
|
# print(f" - Heading level {element.level}, content: {element.children[0].children}")
|
||||||
if element.level == 1:
|
if element.level == 1:
|
||||||
doc_label = DocItemLabel.TITLE
|
doc_label = DocItemLabel.TITLE
|
||||||
else:
|
else:
|
||||||
@ -89,8 +114,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
|
|
||||||
elif isinstance(element, marko.block.List):
|
elif isinstance(element, marko.block.List):
|
||||||
print(f" - List {'ordered' if element.ordered else 'unordered'}")
|
self.close_table()
|
||||||
|
# print(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||||
list_label = GroupLabel.LIST
|
list_label = GroupLabel.LIST
|
||||||
if element.ordered:
|
if element.ordered:
|
||||||
list_label = GroupLabel.ORDERED_LIST
|
list_label = GroupLabel.ORDERED_LIST
|
||||||
@ -101,8 +126,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.block.ListItem):
|
elif isinstance(element, marko.block.ListItem):
|
||||||
print(" - List item")
|
self.close_table()
|
||||||
not_a_list_item = False
|
# print(" - List item")
|
||||||
|
# not_a_list_item = False
|
||||||
snippet_text = str(element.children[0].children[0].children)
|
snippet_text = str(element.children[0].children[0].children)
|
||||||
is_numbered = False
|
is_numbered = False
|
||||||
if parent_element.label == GroupLabel.ORDERED_LIST:
|
if parent_element.label == GroupLabel.ORDERED_LIST:
|
||||||
@ -115,7 +141,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.block.Paragraph):
|
elif isinstance(element, marko.block.Paragraph):
|
||||||
print(f" - Paragraph: {element.children[0].children}")
|
self.close_table()
|
||||||
|
# print(f" - Paragraph: {element.children[0].children}")
|
||||||
snippet_text = str(element.children[0].children)
|
snippet_text = str(element.children[0].children)
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
@ -124,38 +151,71 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.Image):
|
elif isinstance(element, marko.inline.Image):
|
||||||
print(f" - Image with alt: {element.title}, url: {element.dest}")
|
self.close_table()
|
||||||
|
# print(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=parent_element,
|
parent=parent_element,
|
||||||
caption=element.title
|
caption=element.title
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.RawText):
|
elif isinstance(element, marko.inline.RawText):
|
||||||
print(f" - Paragraph (raw text): {element.children}")
|
# print(f" - Paragraph (raw text): {element.children}")
|
||||||
|
# TODO: Detect start of the table here...
|
||||||
snippet_text = str(element.children)
|
snippet_text = str(element.children)
|
||||||
doc.add_text(
|
if "|" in snippet_text:
|
||||||
label=DocItemLabel.PARAGRAPH,
|
# most likely table
|
||||||
parent=parent_element,
|
# if in_table == False:
|
||||||
text=snippet_text
|
# print("====================================== TABLE START!")
|
||||||
)
|
self.in_table = True
|
||||||
|
# print(f" - TABLE: {element.children}")
|
||||||
|
if len(self.md_table_buffer) > 0:
|
||||||
|
self.md_table_buffer[len(self.md_table_buffer)-1] += str(snippet_text)
|
||||||
|
else:
|
||||||
|
self.md_table_buffer.append(snippet_text)
|
||||||
|
else:
|
||||||
|
self.close_table()
|
||||||
|
self.in_table = False
|
||||||
|
# most likely just text
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=parent_element,
|
||||||
|
text=snippet_text
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.CodeSpan):
|
elif isinstance(element, marko.inline.CodeSpan):
|
||||||
print(f" - Paragraph (code): {element.children}")
|
self.close_table()
|
||||||
|
# print(f" - Paragraph (code): {element.children}")
|
||||||
snippet_text = str(element.children)
|
snippet_text = str(element.children)
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.CODE,
|
||||||
parent=parent_element,
|
parent=parent_element,
|
||||||
text=snippet_text
|
text=snippet_text
|
||||||
)
|
)
|
||||||
|
|
||||||
|
elif isinstance(element, marko.inline.LineBreak):
|
||||||
|
if self.in_table:
|
||||||
|
print("Line break in table")
|
||||||
|
self.md_table_buffer.append("")
|
||||||
|
# print("HTML Block else: {}".format(element))
|
||||||
|
|
||||||
|
elif isinstance(element, marko.block.HTMLBlock):
|
||||||
|
self.close_table()
|
||||||
|
print("HTML Block else: {}".format(element))
|
||||||
|
|
||||||
|
# elif isinstance(element, marko.ext.gfm.elements.Table):
|
||||||
|
# elif isinstance(element, marko.ext.gfm.elements.Table):
|
||||||
|
# print(" - Table")
|
||||||
|
# elif isinstance(element, TableRow):
|
||||||
|
# print(" - TableRow")
|
||||||
|
# elif isinstance(element, TableCell):
|
||||||
|
# print(" - TableCell")
|
||||||
else:
|
else:
|
||||||
if not isinstance(element, str):
|
if not isinstance(element, str):
|
||||||
|
self.close_table()
|
||||||
print("Something else: {}".format(element))
|
print("Something else: {}".format(element))
|
||||||
# print(element)
|
|
||||||
|
|
||||||
# elif isinstance(element, marko.block.Table):
|
# elif isinstance(element, marko.block.Table):
|
||||||
# print(" - Table")
|
# print(" - Table")
|
||||||
# elif isinstance(element, marko.block.Table):
|
|
||||||
# print(" - Table")
|
|
||||||
|
|
||||||
# Iterate through the element's children (if any)
|
# Iterate through the element's children (if any)
|
||||||
if hasattr(element, 'children'):
|
if hasattr(element, 'children'):
|
||||||
@ -185,8 +245,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
# Parse the markdown into an abstract syntax tree (AST)
|
# Parse the markdown into an abstract syntax tree (AST)
|
||||||
parser = marko.Markdown(extensions=['gfm'])
|
# parser = marko.Markdown(extensions=['gfm'])
|
||||||
parsed_ast = parser.parse(self.markdown)
|
|
||||||
|
# gfm_parser = Markdown(extensions=['gfm'])
|
||||||
|
gfm_parser = Markdown()
|
||||||
|
# gfm_parser.use('gfm')
|
||||||
|
|
||||||
|
parsed_ast = gfm_parser.parse(self.markdown)
|
||||||
|
|
||||||
|
# parsed_ast = gfm(self.markdown)
|
||||||
# Start iterating from the root of the AST
|
# Start iterating from the root of the AST
|
||||||
self.iterate_elements(parsed_ast, 0 , doc, None)
|
self.iterate_elements(parsed_ast, 0 , doc, None)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user