Clean up code and improve logging in the Markdown (MD) backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-08-02 15:32:30 +00:00 · 2024-10-21 14:50:08 +02:00 · 2024-10-21 14:50:08 +02:00 · dae366440c
commit dae366440c
parent ba9beb65e3
2 changed files with 28 additions and 89 deletions
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@ -4,14 +4,9 @@ from pathlib import Path
 from typing import Set, Union
 from docling_core.types.doc import (
    BoundingBox,
    CoordOrigin,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
    GroupLabel,
    ProvenanceItem,
    Size,
    TableCell,
    TableData,
 )
@ -27,13 +22,6 @@ from docling.datamodel.document import InputDocument
 import marko
 from marko import Markdown
 # from marko.ext.gfm import gfm  # GitHub Flavored Markdown plugin (tables, task lists, etc.)
 # from marko.ext.gfm.elements import Table
 # from marko.ext.gfm.elements import TableCell
 # from marko.ext.gfm.elements import TableRow
 # from marko.block import BlockElement
 # from marko.inline import InlineElement
 _log = logging.getLogger(__name__)
@ -72,13 +60,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
    def close_table(self, doc = None):
        if self.in_table:
-            print("")
+            _log.debug("=== TABLE START ===")
            print("====================================== TABLE START")
            for md_table_row in self.md_table_buffer:
-                print(md_table_row)
+                _log.debug(md_table_row)
-            print("====================================== TABLE END")
+            _log.debug("=== TABLE END ===")
            print("")
            tcells = []
            result_table = []
            for n, md_table_row in enumerate(self.md_table_buffer):
@ -94,9 +79,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                        data.append(value)
                    result_table.append(data)
            print(result_table)
            print()
            for trow_ind, trow in enumerate(result_table):
                for tcol_ind, cellval in enumerate(trow):
                    row_span = 1  # currently supporting just simple tables (without spans)
@ -116,10 +98,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            num_rows = len(result_table)
            num_cols = len(result_table[0])
            self.in_table = False
            self.md_table_buffer = []  # clean table markdown buffer
            # Initialize Docling TableData
            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
            # Populate
@ -127,28 +107,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                data.table_cells.append(tcell)
            if len(tcells) > 0:
                doc.add_table(data=data)
        # return self.in_table, self.md_table_buffer
        return
    # Function to iterate over all elements in the AST
    def iterate_elements(self, element, depth=0, doc=None, parent_element = None):
-        # Print the element type and optionally its content
+        # Iterates over all elements in the AST
-        # print(f"{'  ' * depth}- {type(element).__name__}", end="")
+        # Check for different element types and process relevant details
        # print(f"{'  ' * depth}", end="")
        # if isinstance(element, BlockElement):
        #     print(" (Block Element)")
        # elif isinstance(element, InlineElement):
        #     print(" (Inline Element)")
        # not_a_list_item = True
        # Check for different element types and print relevant details
        if isinstance(element, marko.block.Heading):
            self.close_table(doc)
-            # print(f" - Heading level {element.level}, content: {element.children[0].children}")
+            _log.debug(f" - Heading level {element.level}, content: {element.children[0].children}")
            if element.level == 1:
                doc_label = DocItemLabel.TITLE
            else:
@ -164,7 +130,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
        elif isinstance(element, marko.block.List):
            self.close_table(doc)
-            # print(f" - List {'ordered' if element.ordered else 'unordered'}")
+            _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
            list_label = GroupLabel.LIST
            if element.ordered:
                list_label = GroupLabel.ORDERED_LIST
@ -176,14 +142,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
        elif isinstance(element, marko.block.ListItem):
            self.close_table(doc)
-            # print(" - List item")
+            _log.debug(" - List item")
            # not_a_list_item = False
            snippet_text = str(element.children[0].children[0].children)
            is_numbered = False
            if parent_element.label == GroupLabel.ORDERED_LIST:
                is_numbered = True
            doc.add_list_item(
                # marker=enum_marker,
                enumerated=is_numbered,
                parent=parent_element,
                text=snippet_text
@ -191,7 +155,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
        elif isinstance(element, marko.block.Paragraph):
            self.close_table(doc)
-            # print(f" - Paragraph: {element.children[0].children}")
+            _log.debug(f" - Paragraph: {element.children[0].children}")
            snippet_text = str(element.children[0].children)
            doc.add_text(
                label=DocItemLabel.PARAGRAPH,
@ -201,23 +165,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
        elif isinstance(element, marko.inline.Image):
            self.close_table(doc)
-            # print(f" - Image with alt: {element.title}, url: {element.dest}")
+            _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
            doc.add_picture(
                parent=parent_element,
                caption=element.title
            )
        elif isinstance(element, marko.inline.RawText):
-            # print(f" - Paragraph (raw text): {element.children}")
+            _log.debug(f" - Paragraph (raw text): {element.children}")
            # TODO: Detect start of the table here...
            snippet_text = str(element.children)
-            # if  snippet_text.count("|") > 1:
+
            # Detect start of the table:
            if  "|" in snippet_text:
-                # most likely table
+                # most likely part of the markdown table
                # if in_table == False:
                #     print("====================================== TABLE START!")
                self.in_table = True
                # print(f" - TABLE: {element.children}")
                if len(self.md_table_buffer) > 0:
                    self.md_table_buffer[len(self.md_table_buffer)-1] += str(snippet_text)
                else:
@ -234,7 +195,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
        elif isinstance(element, marko.inline.CodeSpan):
            self.close_table(doc)
-            # print(f" - Paragraph (code): {element.children}")
+            _log.debug(f" - Paragraph (code): {element.children}")
            snippet_text = str(element.children)
            doc.add_text(
                label=DocItemLabel.CODE,
@ -244,28 +205,23 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
        elif isinstance(element, marko.inline.LineBreak):
            if self.in_table:
-                print("Line break in table")
+                _log.debug("Line break in a table")
                self.md_table_buffer.append("")
                # print("HTML Block else: {}".format(element))
        elif isinstance(element, marko.block.HTMLBlock):
            self.close_table(doc)
-            print("HTML Block else: {}".format(element))
+            _log.debug("HTML Block: {}".format(element))
            snippet_text = str(element.children)
            doc.add_text(
                label=DocItemLabel.CODE,
                parent=parent_element,
                text=snippet_text
            )
            # elif isinstance(element, marko.ext.gfm.elements.Table):
        # elif isinstance(element, marko.ext.gfm.elements.Table):
        #     print(" - Table")
        # elif isinstance(element, TableRow):
        #     print(" - TableRow")
        # elif isinstance(element, TableCell):
        #     print(" - TableCell")
        else:
            if not isinstance(element, str):
                self.close_table(doc)
-                print("Something else: {}".format(element))
+                _log.debug("Some other element: {}".format(element))
        # elif isinstance(element, marko.block.Table):
        #     print(" - Table")
        # Iterate through the element's children (if any)
        if hasattr(element, 'children'):
@ -282,31 +238,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
    @classmethod
    def supports_pagination(cls) -> bool:
-        return False  # True? if so, how to handle pages...
+        return False
    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.MD}
    def convert(self) -> DoclingDocument:
-        print("converting Markdown...")
+        _log.debug("converting Markdown...")
        doc = DoclingDocument(name="Test")
        # doc.add_text(label=DocItemLabel.PARAGRAPH, text="Markdown conversion")
        if self.is_valid():
            # Parse the markdown into an abstract syntax tree (AST)
-            # parser = marko.Markdown(extensions=['gfm'])
+            marko_parser = Markdown()            
-
+            parsed_ast = marko_parser.parse(self.markdown)
            # gfm_parser = Markdown(extensions=['gfm'])
            gfm_parser = Markdown()
            # gfm_parser.use('gfm')
            parsed_ast = gfm_parser.parse(self.markdown)
            # parsed_ast = gfm(self.markdown)
            # Start iterating from the root of the AST
            self.iterate_elements(parsed_ast, 0 , doc, None)
        else:
            raise RuntimeError(
                f"Cannot convert md with {self.document_hash} because the backend failed to init."
--- a/docs/examples/run_md.py
+++ b/docs/examples/run_md.py
@ -4,15 +4,7 @@ from pathlib import Path
 import yaml
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
 )
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 import os
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.datamodel.base_models import InputFormat