mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
Cleaned code, improved logging for MD
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
ba9beb65e3
commit
dae366440c
@ -4,14 +4,9 @@ from pathlib import Path
|
|||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
BoundingBox,
|
|
||||||
CoordOrigin,
|
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
ProvenanceItem,
|
|
||||||
Size,
|
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
@ -27,13 +22,6 @@ from docling.datamodel.document import InputDocument
|
|||||||
|
|
||||||
import marko
|
import marko
|
||||||
from marko import Markdown
|
from marko import Markdown
|
||||||
# from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.)
|
|
||||||
# from marko.ext.gfm.elements import Table
|
|
||||||
# from marko.ext.gfm.elements import TableCell
|
|
||||||
# from marko.ext.gfm.elements import TableRow
|
|
||||||
|
|
||||||
# from marko.block import BlockElement
|
|
||||||
# from marko.inline import InlineElement
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -72,13 +60,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def close_table(self, doc = None):
|
def close_table(self, doc = None):
|
||||||
|
|
||||||
if self.in_table:
|
if self.in_table:
|
||||||
print("")
|
_log.debug("=== TABLE START ===")
|
||||||
print("====================================== TABLE START")
|
|
||||||
for md_table_row in self.md_table_buffer:
|
for md_table_row in self.md_table_buffer:
|
||||||
print(md_table_row)
|
_log.debug(md_table_row)
|
||||||
print("====================================== TABLE END")
|
_log.debug("=== TABLE END ===")
|
||||||
print("")
|
|
||||||
|
|
||||||
tcells = []
|
tcells = []
|
||||||
result_table = []
|
result_table = []
|
||||||
for n, md_table_row in enumerate(self.md_table_buffer):
|
for n, md_table_row in enumerate(self.md_table_buffer):
|
||||||
@ -94,9 +79,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
data.append(value)
|
data.append(value)
|
||||||
result_table.append(data)
|
result_table.append(data)
|
||||||
|
|
||||||
print(result_table)
|
|
||||||
print()
|
|
||||||
|
|
||||||
for trow_ind, trow in enumerate(result_table):
|
for trow_ind, trow in enumerate(result_table):
|
||||||
for tcol_ind, cellval in enumerate(trow):
|
for tcol_ind, cellval in enumerate(trow):
|
||||||
row_span = 1 # currently supporting just simple tables (without spans)
|
row_span = 1 # currently supporting just simple tables (without spans)
|
||||||
@ -116,10 +98,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
num_rows = len(result_table)
|
num_rows = len(result_table)
|
||||||
num_cols = len(result_table[0])
|
num_cols = len(result_table[0])
|
||||||
|
|
||||||
self.in_table = False
|
self.in_table = False
|
||||||
self.md_table_buffer = [] # clean table markdown buffer
|
self.md_table_buffer = [] # clean table markdown buffer
|
||||||
|
|
||||||
# Initialize Docling TableData
|
# Initialize Docling TableData
|
||||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
|
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
|
||||||
# Populate
|
# Populate
|
||||||
@ -127,28 +107,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
data.table_cells.append(tcell)
|
data.table_cells.append(tcell)
|
||||||
if len(tcells) > 0:
|
if len(tcells) > 0:
|
||||||
doc.add_table(data=data)
|
doc.add_table(data=data)
|
||||||
|
|
||||||
# return self.in_table, self.md_table_buffer
|
|
||||||
return
|
return
|
||||||
|
|
||||||
# Function to iterate over all elements in the AST
|
|
||||||
def iterate_elements(self, element, depth=0, doc=None, parent_element = None):
|
def iterate_elements(self, element, depth=0, doc=None, parent_element = None):
|
||||||
# Print the element type and optionally its content
|
# Iterates over all elements in the AST
|
||||||
# print(f"{' ' * depth}- {type(element).__name__}", end="")
|
# Check for different element types and process relevant details
|
||||||
# print(f"{' ' * depth}", end="")
|
|
||||||
|
|
||||||
# if isinstance(element, BlockElement):
|
|
||||||
# print(" (Block Element)")
|
|
||||||
# elif isinstance(element, InlineElement):
|
|
||||||
# print(" (Inline Element)")
|
|
||||||
|
|
||||||
# not_a_list_item = True
|
|
||||||
|
|
||||||
|
|
||||||
# Check for different element types and print relevant details
|
|
||||||
if isinstance(element, marko.block.Heading):
|
if isinstance(element, marko.block.Heading):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
# print(f" - Heading level {element.level}, content: {element.children[0].children}")
|
_log.debug(f" - Heading level {element.level}, content: {element.children[0].children}")
|
||||||
if element.level == 1:
|
if element.level == 1:
|
||||||
doc_label = DocItemLabel.TITLE
|
doc_label = DocItemLabel.TITLE
|
||||||
else:
|
else:
|
||||||
@ -164,7 +130,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.block.List):
|
elif isinstance(element, marko.block.List):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
# print(f" - List {'ordered' if element.ordered else 'unordered'}")
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||||
list_label = GroupLabel.LIST
|
list_label = GroupLabel.LIST
|
||||||
if element.ordered:
|
if element.ordered:
|
||||||
list_label = GroupLabel.ORDERED_LIST
|
list_label = GroupLabel.ORDERED_LIST
|
||||||
@ -176,14 +142,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.block.ListItem):
|
elif isinstance(element, marko.block.ListItem):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
# print(" - List item")
|
_log.debug(" - List item")
|
||||||
# not_a_list_item = False
|
|
||||||
snippet_text = str(element.children[0].children[0].children)
|
snippet_text = str(element.children[0].children[0].children)
|
||||||
is_numbered = False
|
is_numbered = False
|
||||||
if parent_element.label == GroupLabel.ORDERED_LIST:
|
if parent_element.label == GroupLabel.ORDERED_LIST:
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
# marker=enum_marker,
|
|
||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
parent=parent_element,
|
parent=parent_element,
|
||||||
text=snippet_text
|
text=snippet_text
|
||||||
@ -191,7 +155,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.block.Paragraph):
|
elif isinstance(element, marko.block.Paragraph):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
# print(f" - Paragraph: {element.children[0].children}")
|
_log.debug(f" - Paragraph: {element.children[0].children}")
|
||||||
snippet_text = str(element.children[0].children)
|
snippet_text = str(element.children[0].children)
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
@ -201,23 +165,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.inline.Image):
|
elif isinstance(element, marko.inline.Image):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
# print(f" - Image with alt: {element.title}, url: {element.dest}")
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=parent_element,
|
parent=parent_element,
|
||||||
caption=element.title
|
caption=element.title
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.RawText):
|
elif isinstance(element, marko.inline.RawText):
|
||||||
# print(f" - Paragraph (raw text): {element.children}")
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||||
# TODO: Detect start of the table here...
|
|
||||||
snippet_text = str(element.children)
|
snippet_text = str(element.children)
|
||||||
# if snippet_text.count("|") > 1:
|
|
||||||
|
# Detect start of the table:
|
||||||
if "|" in snippet_text:
|
if "|" in snippet_text:
|
||||||
# most likely table
|
# most likely part of the markdown table
|
||||||
# if in_table == False:
|
|
||||||
# print("====================================== TABLE START!")
|
|
||||||
self.in_table = True
|
self.in_table = True
|
||||||
# print(f" - TABLE: {element.children}")
|
|
||||||
if len(self.md_table_buffer) > 0:
|
if len(self.md_table_buffer) > 0:
|
||||||
self.md_table_buffer[len(self.md_table_buffer)-1] += str(snippet_text)
|
self.md_table_buffer[len(self.md_table_buffer)-1] += str(snippet_text)
|
||||||
else:
|
else:
|
||||||
@ -234,7 +195,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.inline.CodeSpan):
|
elif isinstance(element, marko.inline.CodeSpan):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
# print(f" - Paragraph (code): {element.children}")
|
_log.debug(f" - Paragraph (code): {element.children}")
|
||||||
snippet_text = str(element.children)
|
snippet_text = str(element.children)
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.CODE,
|
label=DocItemLabel.CODE,
|
||||||
@ -244,28 +205,23 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.inline.LineBreak):
|
elif isinstance(element, marko.inline.LineBreak):
|
||||||
if self.in_table:
|
if self.in_table:
|
||||||
print("Line break in table")
|
_log.debug("Line break in a table")
|
||||||
self.md_table_buffer.append("")
|
self.md_table_buffer.append("")
|
||||||
# print("HTML Block else: {}".format(element))
|
|
||||||
|
|
||||||
elif isinstance(element, marko.block.HTMLBlock):
|
elif isinstance(element, marko.block.HTMLBlock):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
print("HTML Block else: {}".format(element))
|
_log.debug("HTML Block: {}".format(element))
|
||||||
|
snippet_text = str(element.children)
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.CODE,
|
||||||
|
parent=parent_element,
|
||||||
|
text=snippet_text
|
||||||
|
)
|
||||||
|
|
||||||
# elif isinstance(element, marko.ext.gfm.elements.Table):
|
|
||||||
# elif isinstance(element, marko.ext.gfm.elements.Table):
|
|
||||||
# print(" - Table")
|
|
||||||
# elif isinstance(element, TableRow):
|
|
||||||
# print(" - TableRow")
|
|
||||||
# elif isinstance(element, TableCell):
|
|
||||||
# print(" - TableCell")
|
|
||||||
else:
|
else:
|
||||||
if not isinstance(element, str):
|
if not isinstance(element, str):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
print("Something else: {}".format(element))
|
_log.debug("Some other element: {}".format(element))
|
||||||
|
|
||||||
# elif isinstance(element, marko.block.Table):
|
|
||||||
# print(" - Table")
|
|
||||||
|
|
||||||
# Iterate through the element's children (if any)
|
# Iterate through the element's children (if any)
|
||||||
if hasattr(element, 'children'):
|
if hasattr(element, 'children'):
|
||||||
@ -282,31 +238,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supports_pagination(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
return False # True? if so, how to handle pages...
|
return False
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supported_formats(cls) -> Set[InputFormat]:
|
def supported_formats(cls) -> Set[InputFormat]:
|
||||||
return {InputFormat.MD}
|
return {InputFormat.MD}
|
||||||
|
|
||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
print("converting Markdown...")
|
_log.debug("converting Markdown...")
|
||||||
doc = DoclingDocument(name="Test")
|
doc = DoclingDocument(name="Test")
|
||||||
# doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
|
|
||||||
|
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
# Parse the markdown into an abstract syntax tree (AST)
|
# Parse the markdown into an abstract syntax tree (AST)
|
||||||
# parser = marko.Markdown(extensions=['gfm'])
|
marko_parser = Markdown()
|
||||||
|
parsed_ast = marko_parser.parse(self.markdown)
|
||||||
# gfm_parser = Markdown(extensions=['gfm'])
|
|
||||||
gfm_parser = Markdown()
|
|
||||||
# gfm_parser.use('gfm')
|
|
||||||
|
|
||||||
parsed_ast = gfm_parser.parse(self.markdown)
|
|
||||||
|
|
||||||
# parsed_ast = gfm(self.markdown)
|
|
||||||
# Start iterating from the root of the AST
|
# Start iterating from the root of the AST
|
||||||
self.iterate_elements(parsed_ast, 0 , doc, None)
|
self.iterate_elements(parsed_ast, 0 , doc, None)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
||||||
|
@ -4,15 +4,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.document_converter import (
|
|
||||||
DocumentConverter,
|
|
||||||
PdfFormatOption,
|
|
||||||
WordFormatOption,
|
|
||||||
)
|
|
||||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
|
||||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
|
||||||
import os
|
import os
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
Loading…
Reference in New Issue
Block a user