Fix mypy type errors in the AsciiDoc backend and reformat the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-08-01 15:02:21 +00:00 · 2024-10-22 09:44:27 +02:00 · 2024-10-22 09:44:27 +02:00 · bb3db07836
commit bb3db07836
parent b04f14ec24
2 changed files with 150 additions and 105 deletions
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@ -4,20 +4,20 @@ from io import BytesIO
 from pathlib import Path
 from typing import Set, Union

-from pydantic import (
-    AnyUrl,
-)
-
 from docling_core.types.doc import (
-    Size,    
+    DocItem,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
+    GroupItem,
    GroupLabel,
+    ImageRef,
+    NodeItem,
+    Size,
    TableCell,
    TableData,
-    ImageRef,
 )
+from pydantic import AnyUrl

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@ -90,108 +90,122 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        in_list = False
        in_table = False

-        text_data = []
-        table_data = []
-        caption_data = []
+        text_data: list[str] = []
+        table_data: list[str] = []
+        caption_data: list[str] = []
+
+        # parents: dict[int, Union[DocItem, GroupItem, None]] = {}
+        parents: dict[int, Union[GroupItem, None]] = {}
+        # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
+        indents: dict[int, Union[GroupItem, None]] = {}

-        parents = {}
-        indents = {}
-        
        for i in range(0, 10):
            parents[i] = None
            indents[i] = None
-            
+
        for line in self.lines:
-            #line = line.strip()
+            # line = line.strip()

            # Title
            if self.is_title(line):
                item = self.parse_title(line)
                level = item["level"]
-                
-                parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
-                
+
+                parents[level] = doc.add_text(
+                    text=item["text"], label=DocItemLabel.TITLE
+                )
+
            # Section headers
            elif self.is_section_header(line):
                item = self.parse_section_header(line)
                level = item["level"]
-                
-                parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1])
-                for k,v in parents.items():
-                    if k>level:
+
+                parents[level] = doc.add_heading(
+                    text=item["text"], level=item["level"], parent=parents[level - 1]
+                )
+                for k, v in parents.items():
+                    if k > level:
                        parents[k] = None
-                
+
            # Lists
            elif self.is_list_item(line):

                print("line: ", line)
                item = self.parse_list_item(line)
                print("parsed list-item: ", item)
-                
+
                level = self.get_current_level(parents)
-                
+
                if not in_list:
                    in_list = True
-                    
-                    parents[level+1] = doc.add_group(
-                        parent=parents[level], name="list", label=GroupLabel.LIST
-                    )
-                    indents[level+1] = item["indent"]
-                    
-                elif in_list and item["indent"]>indents[level]:
-                    parents[level+1] = doc.add_group(
-                        parent=parents[level], name="list", label=GroupLabel.LIST
-                    )
-                    indents[level+1] = item["indent"]

-                elif in_list and item["indent"]<indents[level]:                    
+                    parents[level + 1] = doc.add_group(
+                        parent=parents[level], name="list", label=GroupLabel.LIST
+                    )
+                    indents[level + 1] = item["indent"]
+
+                elif in_list and item["indent"] > indents[level]:
+                    parents[level + 1] = doc.add_group(
+                        parent=parents[level], name="list", label=GroupLabel.LIST
+                    )
+                    indents[level + 1] = item["indent"]
+
+                elif in_list and item["indent"] < indents[level]:

                    print(item["indent"], " => ", indents[level])
-                    while item["indent"]<indents[level]:
+                    while item["indent"] < indents[level]:
                        print(item["indent"], " => ", indents[level])
                        parents[level] = None
                        indents[level] = None
                        level -= 1
-                        
+
                doc.add_list_item(item["text"], parent=self.get_current_parent(parents))

            elif in_list and not self.is_list_item(line):
                in_list = False

                level = self.get_current_level(parents)
-                parents[level]=None
+                parents[level] = None

            # Tables
-            elif line.strip()=="|===" and not in_table: # start of table
+            elif line.strip() == "|===" and not in_table:  # start of table
                in_table = True
-                
-            elif self.is_table_line(line): # within a table
+
+            elif self.is_table_line(line):  # within a table
                in_table = True
                table_data.append(self.parse_table_line(line))

-            elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table
+            elif in_table and (
+                (not self.is_table_line(line)) or line.strip() == "|==="
+            ):  # end of table

                caption = None
-                if len(caption_data)>0:
-                    caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
+                if len(caption_data) > 0:
+                    caption = doc.add_text(
+                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
+                    )
+
+                caption_data = []

-                caption_data = []                
-                
                data = self.populate_table_as_grid(table_data)
-                doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption)
+                doc.add_table(
+                    data=data, parent=self.get_current_parent(parents), caption=caption
+                )

                in_table = False
                table_data = []
-                
+
            # Picture
            elif self.is_picture(line):

                caption = None
-                if len(caption_data)>0:
-                    caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
+                if len(caption_data) > 0:
+                    caption = doc.add_text(
+                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
+                    )
+
+                caption_data = []

-                caption_data = []                
-                
                item = self.parse_picture(line)
                print(item)

@ -200,41 +214,57 @@ class AsciidocBackend(DeclarativeDocumentBackend):
                    size = Size(width=int(item["width"]), height=int(item["height"]))

                uri = None
-                if "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("//"):
-                    uri = "file:"+item["uri"]
-                elif "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("/"):
-                    uri = "file:/"+item["uri"]
+                if (
+                    "uri" in item
+                    and not item["uri"].startswith("http")
+                    and item["uri"].startswith("//")
+                ):
+                    uri = "file:" + item["uri"]
+                elif (
+                    "uri" in item
+                    and not item["uri"].startswith("http")
+                    and item["uri"].startswith("/")
+                ):
+                    uri = "file:/" + item["uri"]
                elif "uri" in item and not item["uri"].startswith("http"):
-                    uri = "file://"+item["uri"]
+                    uri = "file://" + item["uri"]

                image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
                doc.add_picture(image=image, caption=caption)
-                
+
            # Caption
-            elif self.is_caption(line) and len(caption_data)==0:
+            elif self.is_caption(line) and len(caption_data) == 0:
                item = self.parse_caption(line)
                caption_data.append(item["text"])

-            elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions
+            elif (
+                len(line.strip()) > 0 and len(caption_data) > 0
+            ):  # allow multiline captions
                item = self.parse_text(line)
                caption_data.append(item["text"])
-                
+
            # Plain text
-            elif len(line.strip())==0 and len(text_data)>0:
-                doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
-                             parent=self.get_current_parent(parents))
+            elif len(line.strip()) == 0 and len(text_data) > 0:
+                doc.add_text(
+                    text=" ".join(text_data),
+                    label=DocItemLabel.PARAGRAPH,
+                    parent=self.get_current_parent(parents),
+                )
                text_data = []
-                
-            elif len(line.strip())>0: # allow multiline texts
-                
+
+            elif len(line.strip()) > 0:  # allow multiline texts
+
                item = self.parse_text(line)
                text_data.append(item["text"])

        if len(text_data) > 0:
-            doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
-                         parent=self.get_current_parent(parents))
+            doc.add_text(
+                text=" ".join(text_data),
+                label=DocItemLabel.PARAGRAPH,
+                parent=self.get_current_parent(parents),
+            )
            text_data = []
-                
+
        if in_table and len(table_data) > 0:
            data = self.populate_table_as_grid(table_data)
            doc.add_table(data=data, parent=self.get_current_parent(parents))
@ -245,25 +275,25 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        return doc

    def get_current_level(self, parents):
-        for k,v in parents.items():
-            if v==None and k>0:
-                return k-1
+        for k, v in parents.items():
+            if v == None and k > 0:
+                return k - 1

        return 0
-    
+
    def get_current_parent(self, parents):
-        for k,v in parents.items():
-            if v==None and k>0:
-                return parents[k-1]
+        for k, v in parents.items():
+            if v == None and k > 0:
+                return parents[k - 1]

        return None
-            
+
    #   =========   Title
    def is_title(self, line):
        return re.match(r"^= ", line)

    def parse_title(self, line):
-        return {"type": "title", "text": line[2:].strip(), "level":0}
+        return {"type": "title", "text": line[2:].strip(), "level": 0}

    #   =========   Section headers
    def is_section_header(self, line):
@ -271,14 +301,14 @@ class AsciidocBackend(DeclarativeDocumentBackend):

    def parse_section_header(self, line):
        match = re.match(r"^(=+)\s+(.*)", line)
-        
+
        marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
-        text = match.group(2)    # The actual text of the list item
-        
+        text = match.group(2)  # The actual text of the list item
+
        header_level = marker.count("=")  # number of '=' represents level
        return {
            "type": "header",
-            "level": header_level-1,
+            "level": header_level - 1,
            "text": text.strip(),
        }

@ -293,19 +323,34 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        if match:
            indent = match.group(1)
            marker = match.group(2)  # The list marker (e.g., "*", "-", "1.")
-            text = match.group(3)   # The actual text of the list item
-            
-            if marker=="*" or marker=="-":
-                return {"type": "list_item", "marker": marker, "text": text.strip(),
-                        "numbered": False, "indent": 0 if indent==None else len(indent)}
+            text = match.group(3)  # The actual text of the list item
+
+            if marker == "*" or marker == "-":
+                return {
+                    "type": "list_item",
+                    "marker": marker,
+                    "text": text.strip(),
+                    "numbered": False,
+                    "indent": 0 if indent == None else len(indent),
+                }
            else:
-                return {"type": "list_item", "marker": marker, "text": text.strip(),
-                        "numbered": True, "indent": 0 if indent==None else len(indent)}
+                return {
+                    "type": "list_item",
+                    "marker": marker,
+                    "text": text.strip(),
+                    "numbered": True,
+                    "indent": 0 if indent == None else len(indent),
+                }
        else:
            # Fallback if no match
-            return {"type": "list_item", "marker": item_marker, "text": line,
-                    "numbered": False, "indent": 0}
-    
+            return {
+                "type": "list_item",
+                "marker": item_marker,
+                "text": line,
+                "numbered": False,
+                "indent": 0,
+            }
+
    #   =========   Tables
    def is_table_line(self, line):
        return re.match(r"^\|.*\|", line)
@ -357,18 +402,18 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
        if mtch:
            picture_path = mtch.group(1).strip()
-            attributes = mtch.group(2).split(',')
+            attributes = mtch.group(2).split(",")
            picture_info = {"type": "picture", "uri": picture_path}

            # Extract optional attributes (alt text, width, height, alignment)
            if attributes:
                picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
                for attr in attributes[1:]:
-                    key, value = attr.split('=')
+                    key, value = attr.split("=")
                    picture_info[key.strip()] = value.strip()
-                    
+
            return picture_info
-        
+
        return {"type": "picture", "uri": line}

    #   =========   Captions
@ -382,7 +427,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
            return {"type": "caption", "text": text}

        return {"type": "caption", "text": ""}
-        
+
    #   =========   Plain text
    def parse_text(self, line):
        return {"type": "text", "text": line.strip()}
--- a/tests/test_backend_asciidoc.py
+++ b/tests/test_backend_asciidoc.py
@ -27,29 +27,29 @@ def test_asciidocs_examples():

    for fname in fnames:
        print(f"reading {fname}")
-        
+
        bname = os.path.basename(fname)
        gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
-        
+
        doc_backend = _get_backend(Path(fname))
        doc = doc_backend.convert()

        pred_itdoc = doc._export_to_indented_text(max_text_len=16)
        print("\n\n", pred_itdoc)
-        
+
        pred_mddoc = doc.export_to_markdown()
        print("\n\n", pred_mddoc)
-        
+
        if os.path.exists(gname):
            with open(gname, "r") as fr:
                true_mddoc = fr.read()

-            #assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
+            # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
        else:
            with open(gname, "w") as fw:
                fw.write(pred_mddoc)

-            #print("\n\n", doc.export_to_markdown())
+            # print("\n\n", doc.export_to_markdown())

        input("continue")