Ran pre-commit hooks (formatting only: import ordering, whitespace, and line wrapping; no functional changes)

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-12-13 07:08:19 +00:00 · 2024-08-26 20:22:31 +02:00
parent c64489a82c
commit 24c0b9d4c9
2 changed files with 39 additions and 25 deletions
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@@ -16,9 +16,11 @@ from docling.datamodel.document import ConvertedDocument
 class GlmModel:
    def __init__(self, config):
        self.config = config
-        self.model_names = self.config.get("model_names", "") #"language;term;reference"
+        self.model_names = self.config.get(
+            "model_names", ""
+        )  # "language;term;reference"
        load_pretrained_nlp_models()
-        #model = init_nlp_model(model_names="language;term;reference")
+        # model = init_nlp_model(model_names="language;term;reference")
        model = init_nlp_model(model_names=self.model_names)
        self.model = model

--- a/tests/test_toplevel_functions.py
+++ b/tests/test_toplevel_functions.py
@@ -1,5 +1,5 @@
-import json
 import glob
+import json
 from pathlib import Path

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@@ -8,6 +8,7 @@ from docling.document_converter import DocumentConverter

 GENERATE = False

+
 def get_pdf_paths():

    # Define the directory you want to search
@@ -17,40 +18,51 @@ def get_pdf_paths():
    pdf_files = sorted(directory.rglob("*.pdf"))
    return pdf_files

+
 def verify_json(doc_pred_json, doc_true_json):

-    if doc_pred_json.keys()!=doc_true_json.keys():
+    if doc_pred_json.keys() != doc_true_json.keys():
        return False

-    if doc_pred_json["output"].keys()!=doc_true_json["output"].keys():
+    if doc_pred_json["output"].keys() != doc_true_json["output"].keys():
        return False

-    for l,true_item in enumerate(doc_true_json["output"]["main_text"]):
+    for l, true_item in enumerate(doc_true_json["output"]["main_text"]):
        if "text" in true_item:

            pred_item = doc_pred_json["output"]["main_text"][l]
-            
+
            assert "text" in pred_item, f"`text` is in {pred_item}"
-            assert true_item["text"]==pred_item["text"]
+            assert true_item["text"] == pred_item["text"]

-    for l,true_item in enumerate(doc_true_json["output"]["tables"]):
+    for l, true_item in enumerate(doc_true_json["output"]["tables"]):
        if "data" in true_item:
-            
-            pred_item = doc_pred_json["output"]["tables"][l]
-            
-            assert "data" in pred_item, f"`data` is in {pred_item}"
-            assert len(true_item["data"])==len(pred_item["data"]), "table does not have the same #-rows"
-            assert len(true_item["data"][0])==len(pred_item["data"][0]), "table does not have the same #-cols"

-            for i,row in enumerate(true_item["data"]):
-                for j,col in enumerate(true_item["data"][i]):
+            pred_item = doc_pred_json["output"]["tables"][l]
+
+            assert "data" in pred_item, f"`data` is in {pred_item}"
+            assert len(true_item["data"]) == len(
+                pred_item["data"]
+            ), "table does not have the same #-rows"
+            assert len(true_item["data"][0]) == len(
+                pred_item["data"][0]
+            ), "table does not have the same #-cols"
+
+            for i, row in enumerate(true_item["data"]):
+                for j, col in enumerate(true_item["data"][i]):

                    if "text" in true_item["data"][i][j]:
-                        assert "text" in pred_item["data"][i][j], "table-cell does not contain text"
-                        assert true_item["data"][i][j]["text"]==pred_item["data"][i][j]["text"], "table-cell does not have the same text"
-                    
+                        assert (
+                            "text" in pred_item["data"][i][j]
+                        ), "table-cell does not contain text"
+                        assert (
+                            true_item["data"][i][j]["text"]
+                            == pred_item["data"][i][j]["text"]
+                        ), "table-cell does not have the same text"
+
    return True

+
 def verify_md(doc_pred_md, doc_true_md):
    return doc_pred_md == doc_true_md

@@ -58,7 +70,7 @@ def verify_md(doc_pred_md, doc_true_md):
 def test_conversions():

    pdf_paths = get_pdf_paths()
-    #print(f"#-documents: {pdf_paths}")
+    # print(f"#-documents: {pdf_paths}")

    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
@@ -76,7 +88,7 @@ def test_conversions():
        doc_true_json = None

        try:
-            #print(f"converting {path}")
+            # print(f"converting {path}")
            doc_pred_json = converter.convert_single(path)
        except:
            continue
@@ -98,13 +110,13 @@ def test_conversions():

            with open(json_path, "r") as fr:
                doc_true_json = json.load(fr)
-                
+
            with open(md_path, "r") as fr:
                doc_true_md = "".join(fr.readlines())

            doc_ = json.loads(doc_pred_json.json())
-            #print(json.dumps(doc_, indent=2))
-                
+            # print(json.dumps(doc_, indent=2))
+
            assert verify_json(
                doc_, doc_true_json
            ), f"failed json prediction for {path}"