mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
use new add_code in backends and update typing in MD backend
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
e707747863
commit
d5b2c07295
@ -215,7 +215,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label = DocItemLabel.CODE
|
label = DocItemLabel.CODE
|
||||||
if len(text) == 0:
|
if len(text) == 0:
|
||||||
return
|
return
|
||||||
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
doc.add_code(parent=self.parents[self.level], label=label, text=text)
|
||||||
|
|
||||||
def handle_paragraph(self, element, idx, doc):
|
def handle_paragraph(self, element, idx, doc):
|
||||||
"""Handles paragraph tags (p)."""
|
"""Handles paragraph tags (p)."""
|
||||||
|
@ -3,19 +3,22 @@ import re
|
|||||||
import warnings
|
import warnings
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import List, Optional, Set, Union
|
||||||
|
|
||||||
import marko
|
import marko
|
||||||
import marko.ext
|
import marko.ext
|
||||||
import marko.ext.gfm
|
import marko.ext.gfm
|
||||||
import marko.inline
|
import marko.inline
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
|
DocItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
|
NodeItem,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
|
TextItem,
|
||||||
)
|
)
|
||||||
from marko import Markdown
|
from marko import Markdown
|
||||||
|
|
||||||
@ -27,7 +30,7 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def shorten_underscore_sequences(self, markdown_text, max_length=10):
|
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||||
# This regex will match any sequence of underscores
|
# This regex will match any sequence of underscores
|
||||||
pattern = r"_+"
|
pattern = r"_+"
|
||||||
|
|
||||||
@ -89,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
) from e
|
) from e
|
||||||
return
|
return
|
||||||
|
|
||||||
def close_table(self, doc=None):
|
def close_table(self, doc: DoclingDocument):
|
||||||
if self.in_table:
|
if self.in_table:
|
||||||
_log.debug("=== TABLE START ===")
|
_log.debug("=== TABLE START ===")
|
||||||
for md_table_row in self.md_table_buffer:
|
for md_table_row in self.md_table_buffer:
|
||||||
_log.debug(md_table_row)
|
_log.debug(md_table_row)
|
||||||
_log.debug("=== TABLE END ===")
|
_log.debug("=== TABLE END ===")
|
||||||
tcells = []
|
tcells: List[TableCell] = []
|
||||||
result_table = []
|
result_table = []
|
||||||
for n, md_table_row in enumerate(self.md_table_buffer):
|
for n, md_table_row in enumerate(self.md_table_buffer):
|
||||||
data = []
|
data = []
|
||||||
@ -136,15 +139,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.in_table = False
|
self.in_table = False
|
||||||
self.md_table_buffer = [] # clean table markdown buffer
|
self.md_table_buffer = [] # clean table markdown buffer
|
||||||
# Initialize Docling TableData
|
# Initialize Docling TableData
|
||||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
|
table_data = TableData(
|
||||||
|
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
|
||||||
|
)
|
||||||
# Populate
|
# Populate
|
||||||
for tcell in tcells:
|
for tcell in tcells:
|
||||||
data.table_cells.append(tcell)
|
table_data.table_cells.append(tcell)
|
||||||
if len(tcells) > 0:
|
if len(tcells) > 0:
|
||||||
doc.add_table(data=data)
|
doc.add_table(data=table_data)
|
||||||
return
|
return
|
||||||
|
|
||||||
def process_inline_text(self, parent_element, doc=None):
|
def process_inline_text(
|
||||||
|
self, parent_element: Optional[NodeItem], doc: DoclingDocument
|
||||||
|
):
|
||||||
# self.inline_text_buffer += str(text_in)
|
# self.inline_text_buffer += str(text_in)
|
||||||
txt = self.inline_text_buffer.strip()
|
txt = self.inline_text_buffer.strip()
|
||||||
if len(txt) > 0:
|
if len(txt) > 0:
|
||||||
@ -155,14 +162,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
self.inline_text_buffer = ""
|
self.inline_text_buffer = ""
|
||||||
|
|
||||||
def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
|
def iterate_elements(
|
||||||
|
self,
|
||||||
|
element: marko.block.Element,
|
||||||
|
depth: int,
|
||||||
|
doc: DoclingDocument,
|
||||||
|
parent_element: Optional[NodeItem] = None,
|
||||||
|
):
|
||||||
# Iterates over all elements in the AST
|
# Iterates over all elements in the AST
|
||||||
# Check for different element types and process relevant details
|
# Check for different element types and process relevant details
|
||||||
if isinstance(element, marko.block.Heading):
|
if isinstance(element, marko.block.Heading):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
self.process_inline_text(parent_element, doc)
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(
|
_log.debug(
|
||||||
f" - Heading level {element.level}, content: {element.children[0].children}"
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
||||||
)
|
)
|
||||||
if element.level == 1:
|
if element.level == 1:
|
||||||
doc_label = DocItemLabel.TITLE
|
doc_label = DocItemLabel.TITLE
|
||||||
@ -171,10 +184,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
||||||
# hence we need to traverse the tree to get full text of a header
|
# hence we need to traverse the tree to get full text of a header
|
||||||
strings = []
|
strings: List[str] = []
|
||||||
|
|
||||||
# Define a recursive function to traverse the tree
|
# Define a recursive function to traverse the tree
|
||||||
def traverse(node):
|
def traverse(node: marko.block.BlockElement):
|
||||||
# Check if the node has a "children" attribute
|
# Check if the node has a "children" attribute
|
||||||
if hasattr(node, "children"):
|
if hasattr(node, "children"):
|
||||||
# If "children" is a list, continue traversal
|
# If "children" is a list, continue traversal
|
||||||
@ -208,9 +221,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.process_inline_text(parent_element, doc)
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(" - List item")
|
_log.debug(" - List item")
|
||||||
|
|
||||||
snippet_text = str(element.children[0].children[0].children)
|
snippet_text = str(element.children[0].children[0].children) # type: ignore
|
||||||
is_numbered = False
|
is_numbered = False
|
||||||
if parent_element.label == GroupLabel.ORDERED_LIST:
|
if (
|
||||||
|
parent_element is not None
|
||||||
|
and isinstance(parent_element, DocItem)
|
||||||
|
and parent_element.label == GroupLabel.ORDERED_LIST
|
||||||
|
):
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
enumerated=is_numbered, parent=parent_element, text=snippet_text
|
enumerated=is_numbered, parent=parent_element, text=snippet_text
|
||||||
@ -220,7 +237,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
self.process_inline_text(parent_element, doc)
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||||
doc.add_picture(parent=parent_element, caption=element.title)
|
|
||||||
|
fig_caption: Optional[TextItem] = None
|
||||||
|
if element.title is not None and element.title != "":
|
||||||
|
fig_caption = doc.add_text(
|
||||||
|
label=DocItemLabel.CAPTION, text=element.title
|
||||||
|
)
|
||||||
|
|
||||||
|
doc.add_picture(parent=parent_element, caption=fig_caption)
|
||||||
|
|
||||||
elif isinstance(element, marko.block.Paragraph):
|
elif isinstance(element, marko.block.Paragraph):
|
||||||
self.process_inline_text(parent_element, doc)
|
self.process_inline_text(parent_element, doc)
|
||||||
@ -251,27 +275,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.process_inline_text(parent_element, doc)
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(f" - Code Span: {element.children}")
|
_log.debug(f" - Code Span: {element.children}")
|
||||||
snippet_text = str(element.children).strip()
|
snippet_text = str(element.children).strip()
|
||||||
doc.add_text(
|
doc.add_code(parent=parent_element, text=snippet_text)
|
||||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
|
||||||
)
|
|
||||||
|
|
||||||
elif isinstance(element, marko.block.CodeBlock):
|
elif isinstance(element, marko.block.CodeBlock):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
self.process_inline_text(parent_element, doc)
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(f" - Code Block: {element.children}")
|
_log.debug(f" - Code Block: {element.children}")
|
||||||
snippet_text = str(element.children[0].children).strip()
|
snippet_text = str(element.children[0].children).strip() # type: ignore
|
||||||
doc.add_text(
|
doc.add_code(parent=parent_element, text=snippet_text)
|
||||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
|
||||||
)
|
|
||||||
|
|
||||||
elif isinstance(element, marko.block.FencedCode):
|
elif isinstance(element, marko.block.FencedCode):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
self.process_inline_text(parent_element, doc)
|
self.process_inline_text(parent_element, doc)
|
||||||
_log.debug(f" - Code Block: {element.children}")
|
_log.debug(f" - Code Block: {element.children}")
|
||||||
snippet_text = str(element.children[0].children).strip()
|
snippet_text = str(element.children[0].children).strip() # type: ignore
|
||||||
doc.add_text(
|
doc.add_code(parent=parent_element, text=snippet_text)
|
||||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
|
||||||
)
|
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.LineBreak):
|
elif isinstance(element, marko.inline.LineBreak):
|
||||||
self.process_inline_text(parent_element, doc)
|
self.process_inline_text(parent_element, doc)
|
||||||
|
Loading…
Reference in New Issue
Block a user