diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 2b8b6bb6..73a544b2 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -16,9 +16,11 @@ from docling.datamodel.document import ConvertedDocument class GlmModel: def __init__(self, config): self.config = config - self.model_names = self.config.get("model_names", "") #"language;term;reference" + self.model_names = self.config.get( + "model_names", "" + ) # "language;term;reference" load_pretrained_nlp_models() - #model = init_nlp_model(model_names="language;term;reference") + # model = init_nlp_model(model_names="language;term;reference") model = init_nlp_model(model_names=self.model_names) self.model = model diff --git a/tests/test_toplevel_functions.py b/tests/test_toplevel_functions.py index 27676bdb..b708b0b1 100644 --- a/tests/test_toplevel_functions.py +++ b/tests/test_toplevel_functions.py @@ -1,5 +1,5 @@ -import json import glob +import json from pathlib import Path from docling.backend.docling_parse_backend import DoclingParseDocumentBackend @@ -8,6 +8,7 @@ from docling.document_converter import DocumentConverter GENERATE = False + def get_pdf_paths(): # Define the directory you want to search @@ -17,40 +18,51 @@ def get_pdf_paths(): pdf_files = sorted(directory.rglob("*.pdf")) return pdf_files + def verify_json(doc_pred_json, doc_true_json): - if doc_pred_json.keys()!=doc_true_json.keys(): + if doc_pred_json.keys() != doc_true_json.keys(): return False - if doc_pred_json["output"].keys()!=doc_true_json["output"].keys(): + if doc_pred_json["output"].keys() != doc_true_json["output"].keys(): return False - for l,true_item in enumerate(doc_true_json["output"]["main_text"]): + for l, true_item in enumerate(doc_true_json["output"]["main_text"]): if "text" in true_item: pred_item = doc_pred_json["output"]["main_text"][l] - + assert "text" in pred_item, f"`text` is in {pred_item}" - assert true_item["text"]==pred_item["text"] + assert true_item["text"] == pred_item["text"] - for l,true_item in enumerate(doc_true_json["output"]["tables"]): + for l, true_item in enumerate(doc_true_json["output"]["tables"]): if "data" in true_item: - - pred_item = doc_pred_json["output"]["tables"][l] - - assert "data" in pred_item, f"`data` is in {pred_item}" - assert len(true_item["data"])==len(pred_item["data"]), "table does not have the same #-rows" - assert len(true_item["data"][0])==len(pred_item["data"][0]), "table does not have the same #-cols" - for i,row in enumerate(true_item["data"]): - for j,col in enumerate(true_item["data"][i]): + pred_item = doc_pred_json["output"]["tables"][l] + + assert "data" in pred_item, f"`data` is in {pred_item}" + assert len(true_item["data"]) == len( + pred_item["data"] + ), "table does not have the same #-rows" + assert len(true_item["data"][0]) == len( + pred_item["data"][0] + ), "table does not have the same #-cols" + + for i, row in enumerate(true_item["data"]): + for j, col in enumerate(true_item["data"][i]): if "text" in true_item["data"][i][j]: - assert "text" in pred_item["data"][i][j], "table-cell does not contain text" - assert true_item["data"][i][j]["text"]==pred_item["data"][i][j]["text"], "table-cell does not have the same text" - + assert ( + "text" in pred_item["data"][i][j] + ), "table-cell does not contain text" + assert ( + true_item["data"][i][j]["text"] + == pred_item["data"][i][j]["text"] + ), "table-cell does not have the same text" + return True + def verify_md(doc_pred_md, doc_true_md): return doc_pred_md == doc_true_md @@ -58,7 +70,7 @@ def verify_md(doc_pred_md, doc_true_md): def test_conversions(): pdf_paths = get_pdf_paths() - #print(f"#-documents: {pdf_paths}") + # print(f"#-documents: {pdf_paths}") pipeline_options = PipelineOptions() pipeline_options.do_ocr = False @@ -76,7 +88,7 @@ def test_conversions(): doc_true_json = None try: - #print(f"converting {path}") + # print(f"converting {path}") doc_pred_json = converter.convert_single(path) except: continue @@ -98,13 +110,13 @@ def test_conversions(): with open(json_path, "r") as fr: doc_true_json = json.load(fr) - + with open(md_path, "r") as fr: doc_true_md = "".join(fr.readlines()) doc_ = json.loads(doc_pred_json.json()) - #print(json.dumps(doc_, indent=2)) - + # print(json.dumps(doc_, indent=2)) + assert verify_json( doc_, doc_true_json ), f"failed json prediction for {path}"