first working asciidoc parser

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-18 16:51:29 +02:00 · 2024-10-18 16:51:29 +02:00 · 5016daeae3
commit 5016daeae3
parent 1138cae7f1
4 changed files with 139 additions and 14 deletions
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@ -65,7 +65,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        
        doc = DoclingDocument(name=docname, origin=origin)

-        doc = self.parse_stream(doc)
+        doc = self.parse(doc)
        
        return doc

@ -77,7 +77,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):

        content=""
        with open(self.path_or_stream, "r") as fr:
-            self.lines = fr.read_lines()
+            self.lines = fr.readlines()
        
        #self.lines = file_content.splitlines()
        
@ -91,7 +91,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
            # Title
            if self.is_title(line):
                item = self.parse_title(line)
-                doc.set_title(text=item["text"])
+                doc.add_text(text=item["text"], label="title")

            # Section headers
            elif self.is_section_header(line):
@ -104,7 +104,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
                    in_list = True

                item = self.parse_list_item(line)
-                doc.add_listitem(item["text"])
+                doc.add_list_item(item["text"])
            
            elif in_list and not self.is_list_item(line):
                in_list = False
@ -113,12 +113,11 @@ class AsciidocBackend(DeclarativeDocumentBackend):
            elif self.is_table_line(line):
                in_table = True                
                table_data.append(self.parse_table_line(line))
-                continue
            
            elif in_table and not self.is_table_line(line):
                
-                grid = self.populate_table_as_grid(table_data)
-                doc.add_table(data=grid)
+                data = self.populate_table_as_grid(table_data)
+                doc.add_table(data=data)
                
                in_table = False
                table_data = []
@ -126,8 +125,15 @@ class AsciidocBackend(DeclarativeDocumentBackend):
            # Plain text
            elif line:
                item = self.parse_text(line)
-                doc.add_text(text=item["text"])
-        
+                doc.add_text(text=item["text"], label="text")
+
+        if in_table and len(table_data)>0:
+            data = self.populate_table_as_grid(table_data)
+            doc.add_table(data=data)
+            
+            in_table = False
+            table_data = []
+                
        return doc

    # Title
@ -161,13 +167,34 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        return [cell.strip() for cell in line.split('|') if cell.strip()]

    def populate_table_as_grid(self, table_data):
+        """Build a TableData holding one 1x1 TableCell per cell string found in the rows of table_data."""
+        num_rows = len(table_data)
+        
        # Adjust the table data into a grid format
-        max_cols = max(len(row) for row in table_data)
-        grid = []
-        for row in table_data:
+        num_cols = max(len(row) for row in table_data)  # the widest row sets the column count
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])        
+        for row_idx,row in enumerate(table_data):
            # Pad rows with empty strings to match column count
-            grid.append(row + [''] * (max_cols - len(row)))
-        return grid
+            #grid.append(row + [''] * (max_cols - len(row)))
+            # NOTE: the padding above is commented out, so a row shorter than num_cols just yields fewer cells.
+            for col_idx,text in enumerate(row):
+                row_span = 1  # no merged cells: every cell covers exactly one row ...
+                col_span = 1  # ... and exactly one column
+                
+                cell = TableCell(
+                    text=text,
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + row_span,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + col_span,
+                    col_header=False,
+                    row_header=False)
+                data.table_cells.append(cell)                
+            
+        return data
    
    # Plain text
    def parse_text(self, line):
--- a/tests/data/groundtruth/docling_v2/test_01.asciidoc.md
+++ b/tests/data/groundtruth/docling_v2/test_01.asciidoc.md
@ -0,0 +1,24 @@
+# Sample Document Title
+
+## Section 1
+
+This is some introductory text in section 1.
+
+## Subsection 1.1
+
+- * First list item
+
+- * Second list item
+
+This is some introductory text in section 1.1.
+
+- - A dash list item
+
+## Section 2
+
+This is some text in section 2.
+
+| Header 1   | Header 2   |
+|------------|------------|
+| Value 1    | Value 2    |
+| Value 3    | Value 4    |
--- a/tests/data/test_01.asciidoc
+++ b/tests/data/test_01.asciidoc
@ -0,0 +1,20 @@
+= Sample Document Title
+
+== Section 1
+
+This is some introductory text in section 1.
+    
+=== Subsection 1.1
+* First list item
+* Second list item
+
+This is some introductory text in section 1.1.
+
+- A dash list item
+    
+== Section 2
+This is some text in section 2.
+    
+|Header 1|Header 2|
+|Value 1|Value 2|
+|Value 3|Value 4|
--- a/tests/test_backend_asciidoc.py
+++ b/tests/test_backend_asciidoc.py
@ -0,0 +1,54 @@
+import glob
+import os
+
+from pathlib import Path
+
+import pytest
+from docling_core.types.doc import BoundingBox
+
+from docling.backend.asciidoc_backend import (
+    AsciidocBackend,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+
+def _get_backend(fname):
+    """Wrap `fname` in an ASCIIDOC InputDocument and return the backend attached to it."""
+    input_document = InputDocument(
+        path_or_stream=fname,
+        format=InputFormat.ASCIIDOC,
+        backend=AsciidocBackend,
+    )
+
+    return input_document._backend
+
+
+def test_asciidocs_examples():
+    """Convert every AsciiDoc sample and compare its Markdown export with the stored ground truth.
+
+    When a sample has no ground-truth file yet, the export is written there and printed instead.
+    """
+    for source_path in sorted(glob.glob("./tests/data/*.asciidoc")):
+        print(f"reading {source_path}")
+
+        groundtruth_path = os.path.join(
+            "./tests/data/groundtruth/docling_v2/",
+            os.path.basename(source_path)+".md",
+        )
+
+        document = _get_backend(Path(source_path)).convert()
+        predicted_md = document.export_to_markdown()
+
+        if not os.path.exists(groundtruth_path):
+            # No reference for this sample yet: store the current export as its ground truth.
+            with open(groundtruth_path, "w") as writer:
+                writer.write(predicted_md)
+            print("\n\n", document.export_to_markdown())
+            continue
+        with open(groundtruth_path, "r") as reader:
+            expected_md = reader.read()
+        assert predicted_md==expected_md, "pred_mddoc!=true_mddoc for asciidoc"
+
+            
+