mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 23:12:20 +00:00
use new add_code in backends and update typing in MD backend
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
e707747863
commit
d5b2c07295
@ -215,7 +215,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
label = DocItemLabel.CODE
|
||||
if len(text) == 0:
|
||||
return
|
||||
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
||||
doc.add_code(parent=self.parents[self.level], label=label, text=text)
|
||||
|
||||
def handle_paragraph(self, element, idx, doc):
|
||||
"""Handles paragraph tags (p)."""
|
||||
|
@ -3,19 +3,22 @@ import re
|
||||
import warnings
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
from typing import List, Optional, Set, Union
|
||||
|
||||
import marko
|
||||
import marko.ext
|
||||
import marko.ext.gfm
|
||||
import marko.inline
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
TextItem,
|
||||
)
|
||||
from marko import Markdown
|
||||
|
||||
@ -27,7 +30,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def shorten_underscore_sequences(self, markdown_text, max_length=10):
|
||||
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||
# This regex will match any sequence of underscores
|
||||
pattern = r"_+"
|
||||
|
||||
@ -89,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
) from e
|
||||
return
|
||||
|
||||
def close_table(self, doc=None):
|
||||
def close_table(self, doc: DoclingDocument):
|
||||
if self.in_table:
|
||||
_log.debug("=== TABLE START ===")
|
||||
for md_table_row in self.md_table_buffer:
|
||||
_log.debug(md_table_row)
|
||||
_log.debug("=== TABLE END ===")
|
||||
tcells = []
|
||||
tcells: List[TableCell] = []
|
||||
result_table = []
|
||||
for n, md_table_row in enumerate(self.md_table_buffer):
|
||||
data = []
|
||||
@ -136,15 +139,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.in_table = False
|
||||
self.md_table_buffer = [] # clean table markdown buffer
|
||||
# Initialize Docling TableData
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
|
||||
table_data = TableData(
|
||||
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
|
||||
)
|
||||
# Populate
|
||||
for tcell in tcells:
|
||||
data.table_cells.append(tcell)
|
||||
table_data.table_cells.append(tcell)
|
||||
if len(tcells) > 0:
|
||||
doc.add_table(data=data)
|
||||
doc.add_table(data=table_data)
|
||||
return
|
||||
|
||||
def process_inline_text(self, parent_element, doc=None):
|
||||
def process_inline_text(
|
||||
self, parent_element: Optional[NodeItem], doc: DoclingDocument
|
||||
):
|
||||
# self.inline_text_buffer += str(text_in)
|
||||
txt = self.inline_text_buffer.strip()
|
||||
if len(txt) > 0:
|
||||
@ -155,14 +162,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
self.inline_text_buffer = ""
|
||||
|
||||
def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
|
||||
def iterate_elements(
|
||||
self,
|
||||
element: marko.block.Element,
|
||||
depth: int,
|
||||
doc: DoclingDocument,
|
||||
parent_element: Optional[NodeItem] = None,
|
||||
):
|
||||
# Iterates over all elements in the AST
|
||||
# Check for different element types and process relevant details
|
||||
if isinstance(element, marko.block.Heading):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(
|
||||
f" - Heading level {element.level}, content: {element.children[0].children}"
|
||||
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
||||
)
|
||||
if element.level == 1:
|
||||
doc_label = DocItemLabel.TITLE
|
||||
@ -171,10 +184,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
||||
# hence we need to traverse the tree to get full text of a header
|
||||
strings = []
|
||||
strings: List[str] = []
|
||||
|
||||
# Define a recursive function to traverse the tree
|
||||
def traverse(node):
|
||||
def traverse(node: marko.block.BlockElement):
|
||||
# Check if the node has a "children" attribute
|
||||
if hasattr(node, "children"):
|
||||
# If "children" is a list, continue traversal
|
||||
@ -208,9 +221,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(" - List item")
|
||||
|
||||
snippet_text = str(element.children[0].children[0].children)
|
||||
snippet_text = str(element.children[0].children[0].children) # type: ignore
|
||||
is_numbered = False
|
||||
if parent_element.label == GroupLabel.ORDERED_LIST:
|
||||
if (
|
||||
parent_element is not None
|
||||
and isinstance(parent_element, DocItem)
|
||||
and parent_element.label == GroupLabel.ORDERED_LIST
|
||||
):
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
enumerated=is_numbered, parent=parent_element, text=snippet_text
|
||||
@ -220,7 +237,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||
doc.add_picture(parent=parent_element, caption=element.title)
|
||||
|
||||
fig_caption: Optional[TextItem] = None
|
||||
if element.title is not None and element.title != "":
|
||||
fig_caption = doc.add_text(
|
||||
label=DocItemLabel.CAPTION, text=element.title
|
||||
)
|
||||
|
||||
doc.add_picture(parent=parent_element, caption=fig_caption)
|
||||
|
||||
elif isinstance(element, marko.block.Paragraph):
|
||||
self.process_inline_text(parent_element, doc)
|
||||
@ -251,27 +275,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Code Span: {element.children}")
|
||||
snippet_text = str(element.children).strip()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||
)
|
||||
doc.add_code(parent=parent_element, text=snippet_text)
|
||||
|
||||
elif isinstance(element, marko.block.CodeBlock):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
snippet_text = str(element.children[0].children).strip()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||
)
|
||||
snippet_text = str(element.children[0].children).strip() # type: ignore
|
||||
doc.add_code(parent=parent_element, text=snippet_text)
|
||||
|
||||
elif isinstance(element, marko.block.FencedCode):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
snippet_text = str(element.children[0].children).strip()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||
)
|
||||
snippet_text = str(element.children[0].children).strip() # type: ignore
|
||||
doc.add_code(parent=parent_element, text=snippet_text)
|
||||
|
||||
elif isinstance(element, marko.inline.LineBreak):
|
||||
self.process_inline_text(parent_element, doc)
|
||||
|
Loading…
Reference in New Issue
Block a user