From b7debe72508563c7e4e6e9e084ffdde3a1c5f1ef Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Mon, 26 Aug 2024 17:50:39 +0200
Subject: [PATCH] need to start running all tests successfully

Signed-off-by: Peter Staar
---
 tests/test_backend_docling_parse.py | 20 +++++++++----
 tests/test_backend_pdfium.py        | 16 ++++++++--
 tests/test_toplevel_functions.py    | 46 ++++++++++++++++-------------
 3 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py
index 8f42e1ba..dd98157d 100644
--- a/tests/test_backend_docling_parse.py
+++ b/tests/test_backend_docling_parse.py
@@ -1,37 +1,45 @@
 import glob
-
 from pathlib import Path
 
 import pytest
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
+from docling.backend.docling_parse_backend import (
+    DoclingParseDocumentBackend,
+    DoclingParsePageBackend,
+)
 from docling.datamodel.base_models import BoundingBox
-
 from docling.document_converter import DocumentConverter
 
+
 @pytest.fixture
 def test_doc_path():
     return Path("./data/2206.01062.pdf")
 
+
 def test_get_text_from_rect(test_doc_path):
     doc_backend = DoclingParseDocumentBackend(test_doc_path)
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
 
     # Get the title text of the DocLayNet paper
-    textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124))
+    textpiece = page_backend.get_text_in_rect(
+        bbox=BoundingBox(l=102, t=77, r=511, b=124)
+    )
     ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
 
     assert textpiece.strip() == ref
 
+
 def test_crop_page_image(test_doc_path):
     doc_backend = DoclingParseDocumentBackend(test_doc_path)
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527))
+    im = page_backend.get_page_image(
+        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
+    )
     # im.show()
 
+
 def test_num_pages(test_doc_path):
     doc_backend = DoclingParseDocumentBackend(test_doc_path)
     doc_backend.page_count() == 9
-
diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py
index 2ba239c1..4db7a376 100644
--- a/tests/test_backend_pdfium.py
+++ b/tests/test_backend_pdfium.py
@@ -2,7 +2,10 @@ from pathlib import Path
 
 import pytest
 
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
+from docling.backend.pypdfium2_backend import (
+    PyPdfiumDocumentBackend,
+    PyPdfiumPageBackend,
+)
 from docling.datamodel.base_models import BoundingBox
 
 
@@ -10,24 +13,31 @@ from docling.datamodel.base_models import BoundingBox
 def test_doc_path():
     return Path("./data/2206.01062.pdf")
 
+
 def test_get_text_from_rect(test_doc_path):
     doc_backend = PyPdfiumDocumentBackend(test_doc_path)
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
 
     # Get the title text of the DocLayNet paper
-    textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124))
+    textpiece = page_backend.get_text_in_rect(
+        bbox=BoundingBox(l=102, t=77, r=511, b=124)
+    )
     ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"
 
     assert textpiece.strip() == ref
 
+
 def test_crop_page_image(test_doc_path):
     doc_backend = PyPdfiumDocumentBackend(test_doc_path)
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527))
+    im = page_backend.get_page_image(
+        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
+    )
     # im.show()
 
+
 def test_num_pages(test_doc_path):
     doc_backend = PyPdfiumDocumentBackend(test_doc_path)
     doc_backend.page_count() == 9
diff --git a/tests/test_toplevel_functions.py b/tests/test_toplevel_functions.py
index 5d1059da..5638d884 100644
--- a/tests/test_toplevel_functions.py
+++ b/tests/test_toplevel_functions.py
@@ -1,33 +1,36 @@
 import glob
-
 from pathlib import Path
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.document_converter import DocumentConverter
 
-GENERATE=True
+GENERATE = True
 
+
 def get_pdf_paths():
     # Define the directory you want to search
-    directory = Path('./tests/data')
+    directory = Path("./tests/data")
 
     # List all PDF files in the directory and its subdirectories
-    pdf_files = sorted(directory.rglob('*.pdf'))
+    pdf_files = sorted(directory.rglob("*.pdf"))
 
     return pdf_files
 
+
 def verify_json(doc_pred_json, doc_true_json):
     return True
 
+
 def verify_md(doc_pred_md, doc_true_md):
-    return (doc_pred_md==doc_true_md)
-    
+    return doc_pred_md == doc_true_md
+
+
 def test_conversions():
-    
+
     pdf_paths = get_pdf_paths()
     print(f"#-documents: {pdf_paths}")
-    
+
     pipeline_options = PipelineOptions()
     pipeline_options.do_ocr = False
     pipeline_options.do_table_structure = True
@@ -37,39 +40,42 @@ def test_conversions():
         pipeline_options=pipeline_options,
         pdf_backend=DoclingParseDocumentBackend,
     )
-    
+
     for path in pdf_paths:
 
-        doc_pred_json=None
-        
+        doc_pred_json = None
+
         try:
             print(f"converting {path}")
-            doc_pred_json = converter.convert_single(path)            
+            doc_pred_json = converter.convert_single(path)
         except:
             continue
-        
+
         doc_pred_md = doc_pred_json.render_as_markdown()
 
         json_path = path.with_suffix(".json")
         md_path = path.with_suffix(".md")
-        
+
         if GENERATE:
-            
+
            with open(json_path, "w") as fw:
                fw.write(json.dumps(doc_pred_json, indent=2))

            with open(md_path, "w") as fw:
                fw.write(doc_pred_md)
-                
+
        else:
            with open(path, "r") as fr:
                doc_true_json = json.load(fr)

            with open(path, "r") as fr:
-                doc_true_md = json.load(fr)            
+                doc_true_md = json.load(fr)

-            assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}"
+            assert verify_json(
+                doc_pred_json, doc_true_json
+            ), f"failed json prediction for {path}"

-            assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}"
-    
+            assert verify_md(
+                doc_pred_md, doc_true_md
+            ), f"failed md prediction for {path}"
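
Note (not part of the patch): the reworked test_conversions() in tests/test_toplevel_functions.py calls json.dumps() and json.load() without importing json, and its comparison branch reopens path (the input PDF) for both ground-truth files and parses the Markdown file with json.load(). Below is a minimal sketch of how that comparison branch could read back the files written by the GENERATE branch. The helper name load_ground_truth is illustrative only; verify_json and verify_md are the stub functions defined in the patch, and the ".json"/".md" naming scheme is the one the GENERATE branch uses.

    import json
    from pathlib import Path

    def load_ground_truth(pdf_path: Path):
        # Assumption: ground-truth files sit next to the PDF, using the same
        # naming scheme the GENERATE branch writes (<name>.json / <name>.md).
        json_path = pdf_path.with_suffix(".json")
        md_path = pdf_path.with_suffix(".md")

        with open(json_path, "r") as fr:
            doc_true_json = json.load(fr)  # JSON ground truth is parsed
        with open(md_path, "r") as fr:
            doc_true_md = fr.read()  # Markdown ground truth is plain text

        return doc_true_json, doc_true_md

The differences from the hunk above are the explicit "import json", opening json_path and md_path instead of path, and reading the Markdown with fr.read() rather than json.load().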