fix: MD Backend, fixes to properly handle trailing inline text and emphasis in headers (#178)

* Small fix to properly handle trailing inline text in the md backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added proper handling of headers with bold, italic or emphasis

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* removed print

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Made smarter processing of headers, with arbitrary styling

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Updated docling-core to 2.2.1

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Updated tests because of the change in Markdown export in docling-core

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2024-10-25 18:02:20 +02:00
committed by GitHub
parent 77a89c3334
commit 88c1673057
6 changed files with 279 additions and 255 deletions

View File

@@ -135,11 +135,29 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc_label = DocItemLabel.TITLE
else:
doc_label = DocItemLabel.SECTION_HEADER
snippet_text = element.children[0].children.strip()
parent_element = doc.add_text(
label=doc_label, parent=parent_element, text=snippet_text
)
# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
strings = []
# Define a recursive function to traverse the tree
def traverse(node):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
if isinstance(node.children, list):
for child in node.children:
traverse(child)
# If "children" is text, add it to header text
elif isinstance(node.children, str):
strings.append(node.children)
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
parent_element = doc.add_text(
label=doc_label, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.block.List):
self.close_table(doc)
@@ -286,6 +304,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
parsed_ast = marko_parser.parse(self.markdown)
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None)
self.process_inline_text(None, doc) # handle last hanging inline text
else:
raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init."