reformatted the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-18 16:57:26 +02:00 · 2024-10-18 16:57:26 +02:00 · 70b2ae3fab
commit 70b2ae3fab
parent 5016daeae3
3 changed files with 51 additions and 52 deletions
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@ -1,6 +1,5 @@
-import re
-    
 import logging
+import re
 from io import BytesIO
 from pathlib import Path
 from typing import Set, Union
@ -16,7 +15,8 @@ from docling_core.types.doc import (

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
-#from docling.datamodel.document import InputDocument
+
+# from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)

@ -29,7 +29,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        self.path_or_stream = path_or_stream

        self.valid = True
-        
+
    def is_valid(self) -> bool:
        return self.valid

@ -61,12 +61,12 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        if len(fname) > 0:
            docname = Path(fname).stem
        else:
-            docname = "stream"            
-        
+            docname = "stream"
+
        doc = DoclingDocument(name=docname, origin=origin)

        doc = self.parse(doc)
-        
+
        return doc

    def parse(self, doc: DoclingDocument):
@ -75,19 +75,19 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        title, section headers, text, lists, and tables.
        """

-        content=""
+        content = ""
        with open(self.path_or_stream, "r") as fr:
            self.lines = fr.readlines()
-        
-        #self.lines = file_content.splitlines()
-        
+
+        # self.lines = file_content.splitlines()
+
        in_list = False
        in_table = False
        table_data = []
-        
+
        for line in self.lines:
            line = line.strip()
-            
+
            # Title
            if self.is_title(line):
                item = self.parse_title(line)
@ -105,41 +105,41 @@ class AsciidocBackend(DeclarativeDocumentBackend):

                item = self.parse_list_item(line)
                doc.add_list_item(item["text"])
-            
+
            elif in_list and not self.is_list_item(line):
                in_list = False
-            
+
            # Tables
            elif self.is_table_line(line):
-                in_table = True                
+                in_table = True
                table_data.append(self.parse_table_line(line))
-            
+
            elif in_table and not self.is_table_line(line):
-                
+
                data = self.populate_table_as_grid(table_data)
                doc.add_table(data=data)
-                
+
                in_table = False
                table_data = []
-            
+
            # Plain text
            elif line:
                item = self.parse_text(line)
                doc.add_text(text=item["text"], label="text")

-        if in_table and len(table_data)>0:
+        if in_table and len(table_data) > 0:
            data = self.populate_table_as_grid(table_data)
            doc.add_table(data=data)
-            
+
            in_table = False
            table_data = []
-                
+
        return doc

    # Title
    def is_title(self, line):
        return re.match(r"^= ", line)
-    
+
    def parse_title(self, line):
        return {"type": "title", "text": line[2:].strip()}

@ -148,9 +148,13 @@ class AsciidocBackend(DeclarativeDocumentBackend):
        return re.match(r"^==+", line)

    def parse_section_header(self, line):
-        header_level = line.count('=')  # number of '=' represents level
-        return {"type": "header", "level": header_level, "text": line[header_level:].strip()}
-    
+        header_level = line.count("=")  # number of '=' represents level
+        return {
+            "type": "header",
+            "level": header_level,
+            "text": line[header_level:].strip(),
+        }
+
    # Lists
    def is_list_item(self, line):
        return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
@ -164,24 +168,24 @@ class AsciidocBackend(DeclarativeDocumentBackend):

    def parse_table_line(self, line):
        # Split table cells and trim extra spaces
-        return [cell.strip() for cell in line.split('|') if cell.strip()]
+        return [cell.strip() for cell in line.split("|") if cell.strip()]

    def populate_table_as_grid(self, table_data):

        num_rows = len(table_data)
-        
+
        # Adjust the table data into a grid format
        num_cols = max(len(row) for row in table_data)

-        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])        
-        for row_idx,row in enumerate(table_data):
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+        for row_idx, row in enumerate(table_data):
            # Pad rows with empty strings to match column count
-            #grid.append(row + [''] * (max_cols - len(row)))
+            # grid.append(row + [''] * (max_cols - len(row)))

-            for col_idx,text in enumerate(row):
+            for col_idx, text in enumerate(row):
                row_span = 1
                col_span = 1
-                
+
                cell = TableCell(
                    text=text,
                    row_span=row_span,
@ -191,11 +195,12 @@ class AsciidocBackend(DeclarativeDocumentBackend):
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=False,
-                    row_header=False)
-                data.table_cells.append(cell)                
-            
+                    row_header=False,
+                )
+                data.table_cells.append(cell)
+
        return data
-    
+
    # Plain text
    def parse_text(self, line):
        return {"type": "text", "text": line}
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -46,7 +46,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.PDF: ["pdf"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
-    InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],    
+    InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],
 }

 FormatToMimeType: Dict[InputFormat, Set[str]] = {
--- a/tests/test_backend_asciidoc.py
+++ b/tests/test_backend_asciidoc.py
@ -1,14 +1,11 @@
 import glob
 import os
-
 from pathlib import Path

 import pytest
 from docling_core.types.doc import BoundingBox

-from docling.backend.asciidoc_backend import (
-    AsciidocBackend,
-)
+from docling.backend.asciidoc_backend import AsciidocBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

@ -25,15 +22,15 @@ def _get_backend(fname):


 def test_asciidocs_examples():
-    
+
    fnames = sorted(glob.glob("./tests/data/*.asciidoc"))
-    
+
    for fname in fnames:
        print(f"reading {fname}")

        bname = os.path.basename(fname)
-        gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname+".md")
-        
+        gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
+
        doc_backend = _get_backend(Path(fname))
        doc = doc_backend.convert()

@ -43,12 +40,9 @@ def test_asciidocs_examples():
            with open(gname, "r") as fr:
                true_mddoc = fr.read()

-            assert pred_mddoc==true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
-        else:            
+            assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
+        else:
            with open(gname, "w") as fw:
                fw.write(pred_mddoc)

            print("\n\n", doc.export_to_markdown())
-
-            
-