diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index ae478885..66dd4a2c 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -215,7 +215,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
label = DocItemLabel.CODE
if len(text) == 0:
return
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
+ doc.add_code(parent=self.parents[self.level], label=label, text=text)
def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index 25bc3682..8171085c 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -3,19 +3,22 @@ import re
import warnings
from io import BytesIO
from pathlib import Path
-from typing import Set, Union
+from typing import List, Optional, Set, Union
import marko
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
+ DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
+ NodeItem,
TableCell,
TableData,
+ TextItem,
)
from marko import Markdown
@@ -27,7 +30,7 @@ _log = logging.getLogger(__name__)
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
- def shorten_underscore_sequences(self, markdown_text, max_length=10):
+ def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"
@@ -89,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e
return
- def close_table(self, doc=None):
+ def close_table(self, doc: DoclingDocument):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
_log.debug(md_table_row)
_log.debug("=== TABLE END ===")
- tcells = []
+ tcells: List[TableCell] = []
result_table = []
for n, md_table_row in enumerate(self.md_table_buffer):
data = []
@@ -136,15 +139,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False
self.md_table_buffer = [] # clean table markdown buffer
# Initialize Docling TableData
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+ table_data = TableData(
+ num_rows=num_rows, num_cols=num_cols, table_cells=tcells
+ )
# Populate
for tcell in tcells:
- data.table_cells.append(tcell)
+ table_data.table_cells.append(tcell)
if len(tcells) > 0:
- doc.add_table(data=data)
+ doc.add_table(data=table_data)
return
- def process_inline_text(self, parent_element, doc=None):
+ def process_inline_text(
+ self, parent_element: Optional[NodeItem], doc: DoclingDocument
+ ):
# self.inline_text_buffer += str(text_in)
txt = self.inline_text_buffer.strip()
if len(txt) > 0:
@@ -155,14 +162,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
)
self.inline_text_buffer = ""
- def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
+ def iterate_elements(
+ self,
+ element: marko.block.Element,
+ depth: int,
+ doc: DoclingDocument,
+ parent_element: Optional[NodeItem] = None,
+ ):
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(
- f" - Heading level {element.level}, content: {element.children[0].children}"
+ f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
if element.level == 1:
doc_label = DocItemLabel.TITLE
@@ -171,10 +184,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
- strings = []
+ strings: List[str] = []
# Define a recursive function to traverse the tree
- def traverse(node):
+ def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
@@ -208,9 +221,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.process_inline_text(parent_element, doc)
_log.debug(" - List item")
- snippet_text = str(element.children[0].children[0].children)
+ snippet_text = str(element.children[0].children[0].children) # type: ignore
is_numbered = False
- if parent_element.label == GroupLabel.ORDERED_LIST:
+ if (
+ parent_element is not None
+ and isinstance(parent_element, DocItem)
+ and parent_element.label == GroupLabel.ORDERED_LIST
+ ):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -220,7 +237,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
- doc.add_picture(parent=parent_element, caption=element.title)
+
+ fig_caption: Optional[TextItem] = None
+ if element.title is not None and element.title != "":
+ fig_caption = doc.add_text(
+ label=DocItemLabel.CAPTION, text=element.title
+ )
+
+ doc.add_picture(parent=parent_element, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph):
self.process_inline_text(parent_element, doc)
@@ -251,27 +275,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
- doc.add_text(
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
- )
+ doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.block.CodeBlock):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
- snippet_text = str(element.children[0].children).strip()
- doc.add_text(
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
- )
+ snippet_text = str(element.children[0].children).strip() # type: ignore
+ doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.block.FencedCode):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
- snippet_text = str(element.children[0].children).strip()
- doc.add_text(
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
- )
+ snippet_text = str(element.children[0].children).strip() # type: ignore
+ doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.inline.LineBreak):
self.process_inline_text(parent_element, doc)