adding test_02.asciidoc

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-21 06:01:56 +02:00 · 2024-10-21 06:01:56 +02:00 · c23d049270
commit c23d049270
parent e60c52586b
6 changed files with 782 additions and 499 deletions
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@ -76,7 +76,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):

        content = ""
        if isinstance(self.path_or_stream, Path):
-            with open(self.path_or_stream.name, "r") as fr:
+            with open(self.path_or_stream, "r") as fr:
                self.lines = fr.readlines()

        # self.lines = file_content.splitlines()
@ -85,30 +85,50 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        in_table = False
        table_data = []

+        parents = {}
+        for i in range(0, 10):
+            parents[i] = None
+        
        for line in self.lines:
            line = line.strip()

            # Title
            if self.is_title(line):
                item = self.parse_title(line)
-                doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
-
+                level = item["level"]
+                
+                parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
+                
            # Section headers
            elif self.is_section_header(line):
-                heading = self.parse_section_header(line)
-                doc.add_heading(text=heading["text"], level=heading["level"])
-
+                item = self.parse_section_header(line)
+                level = item["level"]
+                
+                parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1])
+                for k,v in parents.items():
+                    if k>level:
+                        parents[k] = None
+                
            # Lists
            elif self.is_list_item(line):
                if not in_list:
                    in_list = True

+                    level = self.get_current_level(parents)
+                    
+                    parents[level+1] = doc.add_group(
+                        parent=parents[level], name="list", label=GroupLabel.LIST
+                    )
+                    
                item = self.parse_list_item(line)
-                doc.add_list_item(item["text"])
+                doc.add_list_item(item["text"], parent=self.get_current_parent(parents))

            elif in_list and not self.is_list_item(line):
                in_list = False

+                level = self.get_current_level(parents)
+                parents[level]=None
+                
            # Tables
            elif self.is_table_line(line):
                in_table = True
@ -117,42 +137,61 @@ class AsciidocBackend(DeclarativeDocumentBackend):
            elif in_table and not self.is_table_line(line):

                data = self.populate_table_as_grid(table_data)
-                doc.add_table(data=data)
+                doc.add_table(data=data, parent=self.get_current_parent(parents))

                in_table = False
                table_data = []

            # Plain text
-            elif line:
+            elif len(line)>0:
                item = self.parse_text(line)
-                doc.add_text(text=item["text"], label=DocItemLabel.TEXT)
+                doc.add_text(text=item["text"], label=DocItemLabel.PARAGRAPH, parent=self.get_current_parent(parents))

        if in_table and len(table_data) > 0:
            data = self.populate_table_as_grid(table_data)
-            doc.add_table(data=data)
+            doc.add_table(data=data, parent=self.get_current_parent(parents))

            in_table = False
            table_data = []

        return doc

+    def get_current_level(self, parents):
+        """Return the level just below the first empty slot in `parents`.
+
+        `parents` maps an integer level to the item registered at that level
+        (or None). The mapping is scanned in its own iteration order; at the
+        first key `k > 0` whose value is None, `k - 1` is returned. When no
+        such key exists, 0 is returned.
+        """
+        for k, v in parents.items():
+            # Identity test: `== None` would dispatch to the stored item's
+            # __eq__, which is neither needed nor guaranteed to be cheap.
+            if v is None and k > 0:
+                return k - 1
+
+        return 0
+    
+    def get_current_parent(self, parents):
+        """Return the item stored just below the first empty slot in `parents`.
+
+        `parents` maps an integer level to the item registered at that level
+        (or None). The mapping is scanned in its own iteration order; at the
+        first key `k > 0` whose value is None, `parents[k - 1]` is returned
+        (which may itself be None). When no such key exists, None is returned.
+        """
+        for k, v in parents.items():
+            # Identity test: `== None` would dispatch to the stored item's
+            # __eq__, which is neither needed nor guaranteed to be cheap.
+            if v is None and k > 0:
+                return parents[k - 1]
+
+        return None
+            
    # Title
    def is_title(self, line):
        return re.match(r"^= ", line)

    def parse_title(self, line):
-        return {"type": "title", "text": line[2:].strip()}
+        return {"type": "title", "text": line[2:].strip(), "level":0}

    # Section headers
    def is_section_header(self, line):
        return re.match(r"^==+", line)

    def parse_section_header(self, line):
-        header_level = line.count("=")  # number of '=' represents level
+        """Split a section-header line into its level and heading text.
+
+        Returns a dict with "type" set to "header", "level" set to the
+        number of leading '=' characters minus one, and "text" set to the
+        stripped remainder of the line.
+        """
+        match = re.match(r"^(=+)\s+(.*)", line)
+        if match:
+            marker = match.group(1)  # The run of leading '=' characters
+            text = match.group(2)    # The heading text after the marker
+        else:
+            # is_section_header() only checks for a leading "==", so lines
+            # such as "====" or "==Title" get here without whitespace after
+            # the marker; split on the leading '=' run instead of failing
+            # on a None match.
+            text = line.lstrip("=")
+            marker = line[: len(line) - len(text)]
+
+        header_level = marker.count("=")  # number of '=' represents level
        return {
            "type": "header",
-            "level": header_level,
-            "text": line[header_level:].strip(),
+            "level": header_level-1,
+            "text": text.strip(),
        }

    # Lists
@ -160,8 +199,20 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)

    def parse_list_item(self, line):
-        return {"type": "list_item", "text": line}
+        """Extract the item marker (number or bullet symbol) and the text of the item.
+
+        Returns a dict with the keys "type", "marker", "text" and "numbered".
+        "numbered" is False for the bullet markers "*" and "-", and True for
+        a digit marker such as "1.".
+        """

+        match = re.match(r"^(\*|-|\d+\.)\s+(.*)", line)
+        if match:
+            item_marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
+            item_text = match.group(2)    # The actual text of the list item
+            return {
+                "type": "list_item",
+                "marker": item_marker,
+                "text": item_text.strip(),
+                "numbered": item_marker not in ("*", "-"),
+            }
+
+        # Fallback if no match (e.g. "a. item", which is_list_item() accepts
+        # via `\w+\.`). No marker was captured here, so report a plain bullet
+        # rather than reading the unassigned `item_marker`.
+        return {"type": "list_item", "marker": "-", "text": line, "numbered": False}
+    
    # Tables
    def is_table_line(self, line):
        return re.match(r"^\|.*\|", line)
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -37,7 +37,8 @@ torchvision = [
 ######################
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^2.0.1"
+#docling-core = "^2.0.1"
+docling-core = { git = "https://github.com/DS4SD/docling-core.git", rev = "334a9871fc6c666431ed6c59e5d8d69972b41f19" }
 docling-ibm-models = "^2.0.1"
 deepsearch-glm = "^0.25.0"
 filetype = "^1.2.0"
--- a/tests/data/test_01.asciidoc
+++ b/tests/data/test_01.asciidoc
@ -1,8 +1,13 @@
-= Sample Document Title
+= 1st Sample Document Title
+
+This is an abstract.

 == Section 1

 This is some introductory text in section 1.
+
+This spans multiple lines but should be treated
+as a single paragraph.
    
 === Subsection 1.1
 * First list item
--- a/tests/data/test_02.asciidoc
+++ b/tests/data/test_02.asciidoc
@ -0,0 +1,68 @@
+= 2nd Sample Document Title
+
+This is an abstract.
+
+== Section 1: Testing nestedlists
+
+* First item
+  * Nested item 1
+  * Nested item 2
+* Second item
+  1. Nested ordered item 1
+  2. Nested ordered item 2
+    * Deeper nested unordered item
+* Third item
+  1. Nested ordered item 1
+  2. Nested ordered item 2
+    * Deeper nested unordered item
+  3. Nested ordered item 2
+
+== Section 2
+
+bla bla
+
+==== SubSubSection 2.1.1
+
+bla bla bla
+
+== Section 3: test image
+
+image::images/example1.png[Example Image, width=200, height=150, align=center]
+
+.An example caption for the image
+image::images/example2.png[Example Image, width=200, height=150, align=center]
+
+== Section 4: test tables
+
+|Header 1|Header 2|
+|Value 1|Value 2|
+|Value 3|Value 4|
+
+.Caption for the table 1
+|===
+|Header 1 |Header 2
+|Value 1  |Value 2
+|Value 3  |Value 4
+|===
+
+.Caption for the table 2
+|=== 
+|Column 1 Heading |Column 2 Heading |Column 3 Heading
+|Cell 1 |Cell 2 |Cell 3
+|Cell 4 |Cell 5 colspan=2|Cell spans two columns
+|===
+
+.Caption for the table 3
+|===
+|Column 1 Heading |Column 2 Heading |Column 3 Heading
+|Rowspan=2 |Cell 2 |Cell 3
+| |Cell 5 |Cell 6
+|===
+
+.Caption for the table 4
+|===
+|Col 1 |Col 2 |Col 3 |Col 4
+|Rowspan=2.Colspan=2|Cell spanning 2 rows and 2 columns |Col 3 |Col 4
+|   |   |Col 3 |Col 4
+|Col 1 |Col 2 |Col 3 |Col 4
+|===
--- a/tests/test_backend_asciidoc.py
+++ b/tests/test_backend_asciidoc.py
@ -27,15 +27,22 @@ def test_asciidocs_examples():

    for fname in fnames:
        print(f"reading {fname}")
-
+        
        bname = os.path.basename(fname)
        gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
-
+        
        doc_backend = _get_backend(Path(fname))
        doc = doc_backend.convert()

+        pred_itdoc = doc.export_to_indented_text(max_text_len=16)
+        print("\n\n", pred_itdoc)
+        
        pred_mddoc = doc.export_to_markdown()
+        print("\n\n", pred_mddoc)

+        pred_mddocv2 = doc.export_to_markdown(version="v2")
+        print("\n\n", pred_mddocv2)        
+        
        if os.path.exists(gname):
            with open(gname, "r") as fr:
                true_mddoc = fr.read()