From dae366440cb6c183417e5df1b724b7d6b005a676 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 21 Oct 2024 14:50:08 +0200 Subject: [PATCH] Cleaned code, improved logging for MD Signed-off-by: Maksym Lysak --- docling/backend/md_backend.py | 109 +++++++++------------------------- docs/examples/run_md.py | 8 --- 2 files changed, 28 insertions(+), 89 deletions(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 7e9d269e..373364dc 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -4,14 +4,9 @@ from pathlib import Path from typing import Set, Union from docling_core.types.doc import ( - BoundingBox, - CoordOrigin, DocItemLabel, DoclingDocument, - DocumentOrigin, GroupLabel, - ProvenanceItem, - Size, TableCell, TableData, ) @@ -27,13 +22,6 @@ from docling.datamodel.document import InputDocument import marko from marko import Markdown -# from marko.ext.gfm import gfm # GitHub Flavored Markdown plugin (tables, task lists, etc.) -# from marko.ext.gfm.elements import Table -# from marko.ext.gfm.elements import TableCell -# from marko.ext.gfm.elements import TableRow - -# from marko.block import BlockElement -# from marko.inline import InlineElement _log = logging.getLogger(__name__) @@ -72,13 +60,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): def close_table(self, doc = None): if self.in_table: - print("") - print("====================================== TABLE START") + _log.debug("=== TABLE START ===") for md_table_row in self.md_table_buffer: - print(md_table_row) - print("====================================== TABLE END") - print("") - + _log.debug(md_table_row) + _log.debug("=== TABLE END ===") tcells = [] result_table = [] for n, md_table_row in enumerate(self.md_table_buffer): @@ -94,9 +79,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): data.append(value) result_table.append(data) - print(result_table) - print() - for trow_ind, trow in enumerate(result_table): for tcol_ind, cellval in enumerate(trow): row_span = 1 # currently supporting just simple tables (without spans) @@ -116,10 +98,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): num_rows = len(result_table) num_cols = len(result_table[0]) - self.in_table = False self.md_table_buffer = [] # clean table markdown buffer - # Initialize Docling TableData data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells) # Populate @@ -127,28 +107,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): data.table_cells.append(tcell) if len(tcells) > 0: doc.add_table(data=data) - - # return self.in_table, self.md_table_buffer return - # Function to iterate over all elements in the AST def iterate_elements(self, element, depth=0, doc=None, parent_element = None): - # Print the element type and optionally its content - # print(f"{' ' * depth}- {type(element).__name__}", end="") - # print(f"{' ' * depth}", end="") - - # if isinstance(element, BlockElement): - # print(" (Block Element)") - # elif isinstance(element, InlineElement): - # print(" (Inline Element)") - - # not_a_list_item = True - - - # Check for different element types and print relevant details + # Iterates over all elements in the AST + # Check for different element types and process relevant details if isinstance(element, marko.block.Heading): self.close_table(doc) - # print(f" - Heading level {element.level}, content: {element.children[0].children}") + _log.debug(f" - Heading level {element.level}, content: {element.children[0].children}") if element.level == 1: doc_label = DocItemLabel.TITLE else: @@ -164,7 +130,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.block.List): self.close_table(doc) - # print(f" - List {'ordered' if element.ordered else 'unordered'}") + _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") list_label = GroupLabel.LIST if element.ordered: list_label = GroupLabel.ORDERED_LIST @@ -176,14 +142,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.block.ListItem): self.close_table(doc) - # print(" - List item") - # not_a_list_item = False + _log.debug(" - List item") snippet_text = str(element.children[0].children[0].children) is_numbered = False if parent_element.label == GroupLabel.ORDERED_LIST: is_numbered = True doc.add_list_item( - # marker=enum_marker, enumerated=is_numbered, parent=parent_element, text=snippet_text @@ -191,7 +155,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.block.Paragraph): self.close_table(doc) - # print(f" - Paragraph: {element.children[0].children}") + _log.debug(f" - Paragraph: {element.children[0].children}") snippet_text = str(element.children[0].children) doc.add_text( label=DocItemLabel.PARAGRAPH, @@ -201,23 +165,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.inline.Image): self.close_table(doc) - # print(f" - Image with alt: {element.title}, url: {element.dest}") + _log.debug(f" - Image with alt: {element.title}, url: {element.dest}") doc.add_picture( parent=parent_element, caption=element.title ) elif isinstance(element, marko.inline.RawText): - # print(f" - Paragraph (raw text): {element.children}") - # TODO: Detect start of the table here... + _log.debug(f" - Paragraph (raw text): {element.children}") snippet_text = str(element.children) - # if snippet_text.count("|") > 1: + + # Detect start of the table: if "|" in snippet_text: - # most likely table - # if in_table == False: - # print("====================================== TABLE START!") + # most likely part of the markdown table self.in_table = True - # print(f" - TABLE: {element.children}") if len(self.md_table_buffer) > 0: self.md_table_buffer[len(self.md_table_buffer)-1] += str(snippet_text) else: @@ -234,7 +195,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.inline.CodeSpan): self.close_table(doc) - # print(f" - Paragraph (code): {element.children}") + _log.debug(f" - Paragraph (code): {element.children}") snippet_text = str(element.children) doc.add_text( label=DocItemLabel.CODE, @@ -244,28 +205,23 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.inline.LineBreak): if self.in_table: - print("Line break in table") + _log.debug("Line break in a table") self.md_table_buffer.append("") - # print("HTML Block else: {}".format(element)) elif isinstance(element, marko.block.HTMLBlock): self.close_table(doc) - print("HTML Block else: {}".format(element)) + _log.debug("HTML Block: {}".format(element)) + snippet_text = str(element.children) + doc.add_text( + label=DocItemLabel.CODE, + parent=parent_element, + text=snippet_text + ) - # elif isinstance(element, marko.ext.gfm.elements.Table): - # elif isinstance(element, marko.ext.gfm.elements.Table): - # print(" - Table") - # elif isinstance(element, TableRow): - # print(" - TableRow") - # elif isinstance(element, TableCell): - # print(" - TableCell") else: if not isinstance(element, str): self.close_table(doc) - print("Something else: {}".format(element)) - - # elif isinstance(element, marko.block.Table): - # print(" - Table") + _log.debug("Some other element: {}".format(element)) # Iterate through the element's children (if any) if hasattr(element, 'children'): @@ -282,31 +238,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): @classmethod def supports_pagination(cls) -> bool: - return False # True? if so, how to handle pages... + return False @classmethod def supported_formats(cls) -> Set[InputFormat]: return {InputFormat.MD} def convert(self) -> DoclingDocument: - print("converting Markdown...") + _log.debug("converting Markdown...") doc = DoclingDocument(name="Test") - # doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion") if self.is_valid(): # Parse the markdown into an abstract syntax tree (AST) - # parser = marko.Markdown(extensions=['gfm']) - - # gfm_parser = Markdown(extensions=['gfm']) - gfm_parser = Markdown() - # gfm_parser.use('gfm') - - parsed_ast = gfm_parser.parse(self.markdown) - - # parsed_ast = gfm(self.markdown) + marko_parser = Markdown() + parsed_ast = marko_parser.parse(self.markdown) # Start iterating from the root of the AST self.iterate_elements(parsed_ast, 0 , doc, None) - else: raise RuntimeError( f"Cannot convert md with {self.document_hash} because the backend failed to init." diff --git a/docs/examples/run_md.py b/docs/examples/run_md.py index 126f4ec4..5195a0f1 100644 --- a/docs/examples/run_md.py +++ b/docs/examples/run_md.py @@ -4,15 +4,7 @@ from pathlib import Path import yaml -from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.document_converter import ( - DocumentConverter, - PdfFormatOption, - WordFormatOption, -) -from docling.pipeline.simple_pipeline import SimplePipeline -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline import os from docling.backend.md_backend import MarkdownDocumentBackend from docling.datamodel.base_models import InputFormat