From c1d9241b39012463dc836b99996efc02a6ffd693 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Fri, 18 Oct 2024 08:28:02 +0200
Subject: [PATCH] updated the asciidoc backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling/backend/asciidoc_backend.py | 102 ++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py
index 3aad25a7..f2aa4e74 100644
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@@ -1,3 +1,5 @@
+import re
+
 import logging
 from io import BytesIO
 from pathlib import Path
@@ -68,5 +70,140 @@ class ASCIIDocDocumentBackend(DeclarativeDocumentBackend):
         return doc
 
     def parse(self, doc: DoclingDocument):
+        """Parse the AsciiDoc source line by line and populate ``doc``.
+
+        Recognizes a title, section headers, list items, table rows and plain
+        text. Consecutive table rows are buffered and added as one table when
+        the table ends (at the first non-table line or at end of input).
+
+        Args:
+            doc: Document that receives the parsed items.
+
+        Returns:
+            The same ``doc`` instance.
+        """
 
+        # The input may be an in-memory byte stream instead of a path.
+        if isinstance(self.path_or_stream, BytesIO):
+            text = self.path_or_stream.getvalue().decode("utf-8")
+        else:
+            with open(self.path_or_stream, "r", encoding="utf-8") as fr:
+                text = fr.read()
+        self.lines = text.splitlines()
+
+        table_data = []
+
+        # NOTE(review): the doc.* calls below are kept as submitted; confirm
+        # the method names against the DoclingDocument API.
+        for line in self.lines:
+            line = line.strip()
+
+            # A table ends at the first non-table line. Emit it before
+            # handling that line so document order is kept and the line
+            # that ends the table is still processed.
+            if table_data and not self.is_table_line(line):
+                doc.add_table(data=self.populate_table_as_grid(table_data))
+                table_data = []
+
+            # Title
+            if self.is_title(line):
+                item = self.parse_title(line)
+                doc.set_title(text=item["text"])
+
+            # Section headers
+            elif self.is_section_header(line):
+                heading = self.parse_section_header(line)
+                doc.add_heading(text=heading["text"], level=heading["level"])
+
+            # Lists
+            elif self.is_list_item(line):
+                item = self.parse_list_item(line)
+                doc.add_listitem(item["text"])
+
+            # Tables: rows are buffered until the table ends
+            elif self.is_table_line(line):
+                table_data.append(self.parse_table_line(line))
+
+            # Plain text
+            elif line:
+                item = self.parse_text(line)
+                doc.add_text(text=item["text"])
+
+        # Emit a table that runs up to the end of the input.
+        if table_data:
+            doc.add_table(data=self.populate_table_as_grid(table_data))
+
         return doc
+
+    # Title
+    def is_title(self, line):
+        """Return a match object if ``line`` starts with "= ", else None."""
+        return re.match(r"^= ", line)
+
+    def parse_title(self, line):
+        """Return a title item; the text drops the two-character prefix."""
+        return {"type": "title", "text": line[2:].strip()}
+
+    # Section headers
+    def is_section_header(self, line):
+        """Return a match object if ``line`` starts with ``==``, else None."""
+        return re.match(r"^==+", line)
+
+    def parse_section_header(self, line):
+        """Return a header item for ``line``.
+
+        The level is the number of leading ``=`` characters; any ``=`` that
+        appears later in the heading text is not counted.
+        """
+        header_level = len(line) - len(line.lstrip("="))
+        return {
+            "type": "header",
+            "level": header_level,
+            "text": line[header_level:].strip(),
+        }
+
+    # Lists
+    def is_list_item(self, line):
+        """Return a match object if ``line`` begins with a list marker.
+
+        Markers are ``*``, ``-``, digits followed by ``.``, or word characters
+        followed by ``.``, each followed by a space.
+        """
+        return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
+
+    def parse_list_item(self, line):
+        """Return a list item whose text is ``line`` without its marker."""
+        marker = self.is_list_item(line)
+        text = line[marker.end():].strip() if marker else line
+        return {"type": "list_item", "text": text}
+
+    # Tables
+    def is_table_line(self, line):
+        """Return a match object for a ``|``-led line with a second ``|``."""
+        return re.match(r"^\|.*\|", line)
+
+    def parse_table_line(self, line):
+        """Split a table row into stripped cells.
+
+        Empty cells are kept so the remaining cells stay in their columns;
+        only the empty pieces outside the row's outer pipes are dropped.
+        """
+        cells = line.strip().split("|")
+        if cells[0] == "":
+            cells = cells[1:]
+        if cells and cells[-1] == "":
+            cells = cells[:-1]
+        return [cell.strip() for cell in cells]
+
+    def populate_table_as_grid(self, table_data):
+        """Pad every row with empty strings to the width of the widest row.
+
+        Returns an empty list when ``table_data`` is empty.
+        """
+        max_cols = max((len(row) for row in table_data), default=0)
+        return [row + [""] * (max_cols - len(row)) for row in table_data]
+
+    # Plain text
+    def parse_text(self, line):
+        """Return a text item wrapping ``line`` unchanged."""
+        return {"type": "text", "text": line}