From a5735f4fd4771d2d9568186d2eabd7796b6bf8c3 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 25 Oct 2024 14:08:47 +0200 Subject: [PATCH] Made smarter processing of headers, with arbitrary styling Signed-off-by: Maksym Lysak --- docling/backend/md_backend.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 158d2c0b..900319c0 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -136,18 +136,24 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): else: doc_label = DocItemLabel.SECTION_HEADER - if isinstance(element.children[0].children, str): - # Straight text in the header - snippet_text = element.children[0].children.strip() - elif isinstance(element.children[0].children[0].children, str): - # Bold or italic text in the header - snippet_text = element.children[0].children[0].children.strip() - elif isinstance(element.children[0].children[0].children[0].children, str): - # Emphasized text in the header - snippet_text = ( - element.children[0].children[0].children[0].children.strip() - ) + # Header could have arbitrary inclusion of bold, italic or emphasis, + # hence we need to traverse the tree to get full text of a header + strings = [] + # Define a recursive function to traverse the tree + def traverse(node): + # Check if the node has a "children" attribute + if hasattr(node, "children"): + # If "children" is a list, continue traversal + if isinstance(node.children, list): + for child in node.children: + traverse(child) + # If "children" is text, add it to header text + elif isinstance(node.children, str): + strings.append(node.children) + + traverse(element) + snippet_text = "".join(strings) if len(snippet_text) > 0: parent_element = doc.add_text( label=doc_label, parent=parent_element, text=snippet_text