mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Added proper processing of in-line textual elements for MD backend
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
e8229fdd4c
commit
0f81ffda74
@ -37,6 +37,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
self.in_table = False
|
self.in_table = False
|
||||||
self.md_table_buffer: list[str] = []
|
self.md_table_buffer: list[str] = []
|
||||||
|
self.inline_text_buffer = ""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
@ -56,7 +57,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return
|
return
|
||||||
|
|
||||||
def close_table(self, doc=None):
|
def close_table(self, doc=None):
|
||||||
|
|
||||||
if self.in_table:
|
if self.in_table:
|
||||||
_log.debug("=== TABLE START ===")
|
_log.debug("=== TABLE START ===")
|
||||||
for md_table_row in self.md_table_buffer:
|
for md_table_row in self.md_table_buffer:
|
||||||
@ -111,11 +111,23 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc.add_table(data=data)
|
doc.add_table(data=data)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def process_inline_text(self, parent_element, doc=None):
    """Flush the accumulated inline text buffer into the document.

    The content of ``self.inline_text_buffer`` is stripped of surrounding
    whitespace; if anything remains, it is added to ``doc`` as a single
    paragraph text item under ``parent_element``. The buffer is then reset
    to an empty string.

    Args:
        parent_element: Passed through as ``parent`` to ``doc.add_text``.
        doc: Object whose ``add_text`` method receives the paragraph. It is
            only touched when the stripped buffer is non-empty.
    """
    pending_text = self.inline_text_buffer.strip()
    if pending_text:
        doc.add_text(
            label=DocItemLabel.PARAGRAPH,
            parent=parent_element,
            text=pending_text,
        )
    # Start the next run of inline fragments from a clean buffer.
    self.inline_text_buffer = ""
|
||||||
|
|
||||||
def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
|
def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
|
||||||
# Iterates over all elements in the AST
|
# Iterates over all elements in the AST
|
||||||
# Check for different element types and process relevant details
|
# Check for different element types and process relevant details
|
||||||
if isinstance(element, marko.block.Heading):
|
if isinstance(element, marko.block.Heading):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(
|
_log.debug(
|
||||||
f" - Heading level {element.level}, content: {element.children[0].children}"
|
f" - Heading level {element.level}, content: {element.children[0].children}"
|
||||||
)
|
)
|
||||||
@ -131,6 +143,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.block.List):
|
elif isinstance(element, marko.block.List):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||||
list_label = GroupLabel.LIST
|
list_label = GroupLabel.LIST
|
||||||
if element.ordered:
|
if element.ordered:
|
||||||
@ -141,6 +154,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.block.ListItem):
|
elif isinstance(element, marko.block.ListItem):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(" - List item")
|
_log.debug(" - List item")
|
||||||
|
|
||||||
snippet_text = str(element.children[0].children[0].children)
|
snippet_text = str(element.children[0].children[0].children)
|
||||||
@ -153,14 +167,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.inline.Image):
|
elif isinstance(element, marko.inline.Image):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||||
doc.add_picture(parent=parent_element, caption=element.title)
|
doc.add_picture(parent=parent_element, caption=element.title)
|
||||||
|
|
||||||
# elif isinstance(element, marko.block.Paragraph):
|
elif isinstance(element, marko.block.Paragraph):
|
||||||
# print("Paragraph:")
|
self.process_inline_text(parent_element, doc)
|
||||||
# print(element)
|
|
||||||
# print("")
|
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.RawText):
|
elif isinstance(element, marko.inline.RawText):
|
||||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||||
@ -178,15 +190,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
self.in_table = False
|
self.in_table = False
|
||||||
# most likely just text
|
# most likely just inline text
|
||||||
doc.add_text(
|
self.inline_text_buffer += str(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
element.children
|
||||||
parent=parent_element,
|
) # do not strip an inline text, as it may contain important spaces
|
||||||
text=snippet_text,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.CodeSpan):
|
elif isinstance(element, marko.inline.CodeSpan):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(f" - Code Span: {element.children}")
|
_log.debug(f" - Code Span: {element.children}")
|
||||||
snippet_text = str(element.children).strip()
|
snippet_text = str(element.children).strip()
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
@ -195,6 +206,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.block.CodeBlock):
|
elif isinstance(element, marko.block.CodeBlock):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(f" - Code Block: {element.children}")
|
_log.debug(f" - Code Block: {element.children}")
|
||||||
snippet_text = str(element.children[0].children).strip()
|
snippet_text = str(element.children[0].children).strip()
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
@ -203,6 +215,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.block.FencedCode):
|
elif isinstance(element, marko.block.FencedCode):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(f" - Code Block: {element.children}")
|
_log.debug(f" - Code Block: {element.children}")
|
||||||
snippet_text = str(element.children[0].children).strip()
|
snippet_text = str(element.children[0].children).strip()
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
@ -210,18 +223,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.LineBreak):
|
elif isinstance(element, marko.inline.LineBreak):
|
||||||
|
self.process_inline_text(parent_element, doc)
|
||||||
if self.in_table:
|
if self.in_table:
|
||||||
_log.debug("Line break in a table")
|
_log.debug("Line break in a table")
|
||||||
self.md_table_buffer.append("")
|
self.md_table_buffer.append("")
|
||||||
|
|
||||||
elif isinstance(element, marko.block.HTMLBlock):
|
elif isinstance(element, marko.block.HTMLBlock):
|
||||||
|
self.process_inline_text(parent_element, doc)
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
_log.debug("HTML Block: {}".format(element))
|
_log.debug("HTML Block: {}".format(element))
|
||||||
snippet_text = str(element.children).strip()
|
if (
|
||||||
doc.add_text(
|
len(element.children) > 0
|
||||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
): # If Marko doesn't return any content for HTML block, skip it
|
||||||
)
|
snippet_text = str(element.children).strip()
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
if not isinstance(element, str):
|
if not isinstance(element, str):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
|
Loading…
Reference in New Issue
Block a user