updated the asciidoc backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-18 08:28:02 +02:00 · 2024-10-18 08:28:02 +02:00 · c1d9241b39
commit c1d9241b39
parent 12033537e3
1 changed files with 102 additions and 0 deletions
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@ -1,3 +1,5 @@
 import re
 import logging
 from io import BytesIO
 from pathlib import Path
@ -68,5 +70,105 @@ class ASCIIDocDocumentBackend(DeclarativeDocumentBackend):
        return doc
    def parse(self, doc: DoclingDocument):
        """
        Parse the AsciiDoc source line by line and populate ``doc``.

        Every line is stripped, classified with the ``is_*`` helpers and
        forwarded to the matching ``doc`` method: title, section headers,
        list items, tables and plain text.  Consecutive table rows are
        buffered and emitted as a single table.

        Args:
            doc: Document object that receives the parsed elements.

        Returns:
            The same ``doc`` object after it has been populated.
        """
        source = self.path_or_stream
        if isinstance(source, BytesIO):
            # NOTE(review): assumes in-memory sources hold UTF-8 encoded
            # bytes -- confirm against the callers of this backend.
            self.lines = source.getvalue().decode("utf-8").splitlines()
        else:
            # An explicit encoding keeps the result independent of the locale.
            with open(source, "r", encoding="utf-8") as fr:
                self.lines = fr.readlines()

        # Rows of the table currently being collected (empty when no table
        # is open).
        table_data = []

        def flush_table():
            # Emit the buffered rows as one table and reset the buffer.
            if table_data:
                doc.add_table(data=self.populate_table_as_grid(table_data))
                table_data.clear()

        for raw_line in self.lines:
            line = raw_line.strip()

            # Table rows start with "|", so they can never be mistaken for
            # a title, header or list item; it is safe to test them first.
            if self.is_table_line(line):
                table_data.append(self.parse_table_line(line))
                continue

            # Any other line ends the table in progress.  Flushing here
            # (rather than in a dedicated branch) keeps the table in
            # document order and stops the terminating line being dropped.
            flush_table()

            if line.startswith("|==="):
                # Table delimiter: pure markup, carries no content.
                continue

            if self.is_title(line):
                item = self.parse_title(line)
                doc.set_title(text=item["text"])
            elif self.is_section_header(line):
                heading = self.parse_section_header(line)
                doc.add_heading(text=heading["text"], level=heading["level"])
            elif self.is_list_item(line):
                item = self.parse_list_item(line)
                doc.add_listitem(item["text"])
            elif line:
                # Blank lines are skipped; everything else is plain text.
                item = self.parse_text(line)
                doc.add_text(text=item["text"])

        # The file may end while a table is still open.
        flush_table()

        return doc
    # Title
    def is_title(self, line):
        """
        Tell whether ``line`` is a document title.

        A title is a single ``=`` followed by a space at the very start of
        the line.

        Returns:
            The ``re.Match`` object when the line is a title, else ``None``.
        """
        title_pattern = r"^= "
        found = re.match(title_pattern, line)
        return found
    def parse_title(self, line):
        """
        Extract the title text from a title line.

        The first two characters (the ``= `` marker) are dropped and the
        remainder is stripped of surrounding whitespace.

        Returns:
            A dict with ``type`` set to ``"title"`` and the title ``text``.
        """
        title_text = line[2:]
        title_text = title_text.strip()
        return dict(type="title", text=title_text)
    # Section headers
    def is_section_header(self, line):
        """
        Tell whether ``line`` is a section header.

        A section header is two or more ``=`` at the start of the line,
        followed by whitespace and some text.  Requiring the whitespace
        and text mirrors ``is_title`` and keeps marker-only lines such as
        ``====`` from being reported as empty headings.

        Returns:
            The ``re.Match`` object when the line is a section header,
            else ``None``.
        """
        return re.match(r"^==+\s+\S", line)
    def parse_section_header(self, line):
        """
        Split a section header line into its level and text.

        The level is the number of *leading* ``=`` characters.  Equals
        signs inside the heading text are not counted, so a heading such
        as ``== a = b`` keeps level 2 and its full text.

        Args:
            line: Header line, expected to start with the ``=`` marker
                (no leading whitespace).

        Returns:
            A dict with ``type`` set to ``"header"``, the integer
            ``level`` and the stripped heading ``text``.
        """
        # Length of the leading run of "=" only.
        header_level = len(line) - len(line.lstrip("="))
        return {
            "type": "header",
            "level": header_level,
            "text": line[header_level:].strip(),
        }
    # Lists
    def is_list_item(self, line):
        """
        Tell whether ``line`` starts with a list marker.

        Accepted markers are ``*``, ``-``, digits followed by a dot, or
        word characters followed by a dot; the marker must be followed by
        a space.

        Returns:
            The ``re.Match`` object (group 1 is the marker) when the line
            is a list item, else ``None``.
        """
        marker_pattern = re.compile(r"^(\*|-|\d+\.|\w+\.) ")
        return marker_pattern.match(line)
    def parse_list_item(self, line):
        """
        Extract the text of a list item.

        The leading list marker (``*``, ``-``, ``1.``, ``a.`` ...) is
        removed so that only the item content is returned, in line with
        the other ``parse_*`` helpers, which also drop their markup.  A
        line without a recognised marker is returned unchanged.

        Returns:
            A dict with ``type`` set to ``"list_item"`` and the item
            ``text``.
        """
        # Same marker pattern that is used to detect list items.
        marker = re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
        text = line[marker.end():].strip() if marker else line
        return {"type": "list_item", "text": text}
    # Tables
    def is_table_line(self, line):
        """
        Tell whether ``line`` looks like a table row.

        A row starts with ``|`` and contains at least one further ``|``.

        Returns:
            The ``re.Match`` object when the line is a table row, else
            ``None``.
        """
        row_pattern = re.compile(r"^\|.*\|")
        return row_pattern.match(line)
    def parse_table_line(self, line):
        """
        Split a table row into its cells.

        Cells are separated by ``|`` and stripped of surrounding
        whitespace.  Only the empty fragments produced by the leading
        pipe and by an optional trailing pipe are discarded; empty cells
        in the middle of a row are kept so that the remaining cells stay
        in their columns.

        Returns:
            A list with one string per cell (possibly empty strings).
        """
        cells = [cell.strip() for cell in line.strip().split("|")]
        # Fragment before the leading "|".
        if cells and cells[0] == "":
            cells = cells[1:]
        # Fragment after an optional trailing "|".
        if cells and cells[-1] == "":
            cells = cells[:-1]
        return cells
    def populate_table_as_grid(self, table_data):
        """
        Turn a list of rows into a rectangular grid.

        Shorter rows are right-padded with empty strings so that every
        row has as many cells as the longest one.  The input rows are not
        modified.

        Args:
            table_data: List of rows, each a list of cell strings.

        Returns:
            A new list of equally long rows; an empty list when
            ``table_data`` is empty.
        """
        # ``default=0`` keeps an empty table from raising ValueError.
        max_cols = max((len(row) for row in table_data), default=0)
        return [row + [""] * (max_cols - len(row)) for row in table_data]
    # Plain text
    def parse_text(self, line):
        """
        Wrap a plain text line in the common item structure.

        Returns:
            A dict with ``type`` set to ``"text"`` and the unmodified
            line as ``text``.
        """
        text_item = dict(type="text", text=line)
        return text_item