Use the new add_code API in the backends and update typing in the Markdown (MD) backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-01-21 18:19:49 +01:00
parent e707747863
commit d5b2c07295
2 changed files with 45 additions and 27 deletions

View File

@ -215,7 +215,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
label = DocItemLabel.CODE label = DocItemLabel.CODE
if len(text) == 0: if len(text) == 0:
return return
doc.add_text(parent=self.parents[self.level], label=label, text=text) doc.add_code(parent=self.parents[self.level], label=label, text=text)
def handle_paragraph(self, element, idx, doc): def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p).""" """Handles paragraph tags (p)."""

View File

@ -3,19 +3,22 @@ import re
import warnings import warnings
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import List, Optional, Set, Union
import marko import marko
import marko.ext import marko.ext
import marko.ext.gfm import marko.ext.gfm
import marko.inline import marko.inline
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
GroupLabel, GroupLabel,
NodeItem,
TableCell, TableCell,
TableData, TableData,
TextItem,
) )
from marko import Markdown from marko import Markdown
@ -27,7 +30,7 @@ _log = logging.getLogger(__name__)
class MarkdownDocumentBackend(DeclarativeDocumentBackend): class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def shorten_underscore_sequences(self, markdown_text, max_length=10): def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores # This regex will match any sequence of underscores
pattern = r"_+" pattern = r"_+"
@ -89,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e ) from e
return return
def close_table(self, doc=None): def close_table(self, doc: DoclingDocument):
if self.in_table: if self.in_table:
_log.debug("=== TABLE START ===") _log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer: for md_table_row in self.md_table_buffer:
_log.debug(md_table_row) _log.debug(md_table_row)
_log.debug("=== TABLE END ===") _log.debug("=== TABLE END ===")
tcells = [] tcells: List[TableCell] = []
result_table = [] result_table = []
for n, md_table_row in enumerate(self.md_table_buffer): for n, md_table_row in enumerate(self.md_table_buffer):
data = [] data = []
@ -136,15 +139,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False self.in_table = False
self.md_table_buffer = [] # clean table markdown buffer self.md_table_buffer = [] # clean table markdown buffer
# Initialize Docling TableData # Initialize Docling TableData
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells) table_data = TableData(
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
)
# Populate # Populate
for tcell in tcells: for tcell in tcells:
data.table_cells.append(tcell) table_data.table_cells.append(tcell)
if len(tcells) > 0: if len(tcells) > 0:
doc.add_table(data=data) doc.add_table(data=table_data)
return return
def process_inline_text(self, parent_element, doc=None): def process_inline_text(
self, parent_element: Optional[NodeItem], doc: DoclingDocument
):
# self.inline_text_buffer += str(text_in) # self.inline_text_buffer += str(text_in)
txt = self.inline_text_buffer.strip() txt = self.inline_text_buffer.strip()
if len(txt) > 0: if len(txt) > 0:
@ -155,14 +162,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) )
self.inline_text_buffer = "" self.inline_text_buffer = ""
def iterate_elements(self, element, depth=0, doc=None, parent_element=None): def iterate_elements(
self,
element: marko.block.Element,
depth: int,
doc: DoclingDocument,
parent_element: Optional[NodeItem] = None,
):
# Iterates over all elements in the AST # Iterates over all elements in the AST
# Check for different element types and process relevant details # Check for different element types and process relevant details
if isinstance(element, marko.block.Heading): if isinstance(element, marko.block.Heading):
self.close_table(doc) self.close_table(doc)
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug( _log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
) )
if element.level == 1: if element.level == 1:
doc_label = DocItemLabel.TITLE doc_label = DocItemLabel.TITLE
@ -171,10 +184,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Header could have arbitrary inclusion of bold, italic or emphasis, # Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header # hence we need to traverse the tree to get full text of a header
strings = [] strings: List[str] = []
# Define a recursive function to traverse the tree # Define a recursive function to traverse the tree
def traverse(node): def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute # Check if the node has a "children" attribute
if hasattr(node, "children"): if hasattr(node, "children"):
# If "children" is a list, continue traversal # If "children" is a list, continue traversal
@ -208,9 +221,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug(" - List item") _log.debug(" - List item")
snippet_text = str(element.children[0].children[0].children) snippet_text = str(element.children[0].children[0].children) # type: ignore
is_numbered = False is_numbered = False
if parent_element.label == GroupLabel.ORDERED_LIST: if (
parent_element is not None
and isinstance(parent_element, DocItem)
and parent_element.label == GroupLabel.ORDERED_LIST
):
is_numbered = True is_numbered = True
doc.add_list_item( doc.add_list_item(
enumerated=is_numbered, parent=parent_element, text=snippet_text enumerated=is_numbered, parent=parent_element, text=snippet_text
@ -220,7 +237,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.close_table(doc) self.close_table(doc)
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}") _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture(parent=parent_element, caption=element.title)
fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=element.title
)
doc.add_picture(parent=parent_element, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph): elif isinstance(element, marko.block.Paragraph):
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
@ -251,27 +275,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Span: {element.children}") _log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip() snippet_text = str(element.children).strip()
doc.add_text( doc.add_code(parent=parent_element, text=snippet_text)
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.block.CodeBlock): elif isinstance(element, marko.block.CodeBlock):
self.close_table(doc) self.close_table(doc)
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}") _log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip() snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_text( doc.add_code(parent=parent_element, text=snippet_text)
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.block.FencedCode): elif isinstance(element, marko.block.FencedCode):
self.close_table(doc) self.close_table(doc)
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}") _log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip() snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_text( doc.add_code(parent=parent_element, text=snippet_text)
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.inline.LineBreak): elif isinstance(element, marko.inline.LineBreak):
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)