ran pre-commit

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-08-26 20:22:31 +02:00
parent c64489a82c
commit 24c0b9d4c9
2 changed files with 39 additions and 25 deletions

View File

@ -16,9 +16,11 @@ from docling.datamodel.document import ConvertedDocument
class GlmModel: class GlmModel:
def __init__(self, config): def __init__(self, config):
self.config = config self.config = config
self.model_names = self.config.get("model_names", "") #"language;term;reference" self.model_names = self.config.get(
"model_names", ""
) # "language;term;reference"
load_pretrained_nlp_models() load_pretrained_nlp_models()
#model = init_nlp_model(model_names="language;term;reference") # model = init_nlp_model(model_names="language;term;reference")
model = init_nlp_model(model_names=self.model_names) model = init_nlp_model(model_names=self.model_names)
self.model = model self.model = model

View File

@ -1,5 +1,5 @@
import json
import glob import glob
import json
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@ -8,6 +8,7 @@ from docling.document_converter import DocumentConverter
GENERATE = False GENERATE = False
def get_pdf_paths(): def get_pdf_paths():
# Define the directory you want to search # Define the directory you want to search
@ -17,40 +18,51 @@ def get_pdf_paths():
pdf_files = sorted(directory.rglob("*.pdf")) pdf_files = sorted(directory.rglob("*.pdf"))
return pdf_files return pdf_files
def verify_json(doc_pred_json, doc_true_json): def verify_json(doc_pred_json, doc_true_json):
if doc_pred_json.keys()!=doc_true_json.keys(): if doc_pred_json.keys() != doc_true_json.keys():
return False return False
if doc_pred_json["output"].keys()!=doc_true_json["output"].keys(): if doc_pred_json["output"].keys() != doc_true_json["output"].keys():
return False return False
for l,true_item in enumerate(doc_true_json["output"]["main_text"]): for l, true_item in enumerate(doc_true_json["output"]["main_text"]):
if "text" in true_item: if "text" in true_item:
pred_item = doc_pred_json["output"]["main_text"][l] pred_item = doc_pred_json["output"]["main_text"][l]
assert "text" in pred_item, f"`text` is in {pred_item}" assert "text" in pred_item, f"`text` is in {pred_item}"
assert true_item["text"]==pred_item["text"] assert true_item["text"] == pred_item["text"]
for l,true_item in enumerate(doc_true_json["output"]["tables"]): for l, true_item in enumerate(doc_true_json["output"]["tables"]):
if "data" in true_item: if "data" in true_item:
pred_item = doc_pred_json["output"]["tables"][l] pred_item = doc_pred_json["output"]["tables"][l]
assert "data" in pred_item, f"`data` is in {pred_item}" assert "data" in pred_item, f"`data` is in {pred_item}"
assert len(true_item["data"])==len(pred_item["data"]), "table does not have the same #-rows" assert len(true_item["data"]) == len(
assert len(true_item["data"][0])==len(pred_item["data"][0]), "table does not have the same #-cols" pred_item["data"]
), "table does not have the same #-rows"
assert len(true_item["data"][0]) == len(
pred_item["data"][0]
), "table does not have the same #-cols"
for i,row in enumerate(true_item["data"]): for i, row in enumerate(true_item["data"]):
for j,col in enumerate(true_item["data"][i]): for j, col in enumerate(true_item["data"][i]):
if "text" in true_item["data"][i][j]: if "text" in true_item["data"][i][j]:
assert "text" in pred_item["data"][i][j], "table-cell does not contain text" assert (
assert true_item["data"][i][j]["text"]==pred_item["data"][i][j]["text"], "table-cell does not have the same text" "text" in pred_item["data"][i][j]
), "table-cell does not contain text"
assert (
true_item["data"][i][j]["text"]
== pred_item["data"][i][j]["text"]
), "table-cell does not have the same text"
return True return True
def verify_md(doc_pred_md, doc_true_md): def verify_md(doc_pred_md, doc_true_md):
return doc_pred_md == doc_true_md return doc_pred_md == doc_true_md
@ -58,7 +70,7 @@ def verify_md(doc_pred_md, doc_true_md):
def test_conversions(): def test_conversions():
pdf_paths = get_pdf_paths() pdf_paths = get_pdf_paths()
#print(f"#-documents: {pdf_paths}") # print(f"#-documents: {pdf_paths}")
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
@ -76,7 +88,7 @@ def test_conversions():
doc_true_json = None doc_true_json = None
try: try:
#print(f"converting {path}") # print(f"converting {path}")
doc_pred_json = converter.convert_single(path) doc_pred_json = converter.convert_single(path)
except: except:
continue continue
@ -103,7 +115,7 @@ def test_conversions():
doc_true_md = "".join(fr.readlines()) doc_true_md = "".join(fr.readlines())
doc_ = json.loads(doc_pred_json.json()) doc_ = json.loads(doc_pred_json.json())
#print(json.dumps(doc_, indent=2)) # print(json.dumps(doc_, indent=2))
assert verify_json( assert verify_json(
doc_, doc_true_json doc_, doc_true_json