mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
updated the top-level function test
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
12eea8495f
commit
2c66075390
@ -11,7 +11,7 @@ GENERATE=True
|
|||||||
def get_pdf_paths():
|
def get_pdf_paths():
|
||||||
|
|
||||||
# Define the directory you want to search
|
# Define the directory you want to search
|
||||||
directory = Path('./data')
|
directory = Path('./tests/data')
|
||||||
|
|
||||||
# List all PDF files in the directory and its subdirectories
|
# List all PDF files in the directory and its subdirectories
|
||||||
pdf_files = sorted(directory.rglob('*.pdf'))
|
pdf_files = sorted(directory.rglob('*.pdf'))
|
||||||
@ -26,22 +26,29 @@ def verify_md(doc_pred_md, doc_true_md):
|
|||||||
def test_conversions():
|
def test_conversions():
|
||||||
|
|
||||||
pdf_paths = get_pdf_paths()
|
pdf_paths = get_pdf_paths()
|
||||||
|
print(f"#-documents: {pdf_paths}")
|
||||||
|
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options = PipelineOptions()
|
||||||
pipeline_options.do_ocr = False
|
pipeline_options.do_ocr = False
|
||||||
pipeline_options.do_table_structure = True
|
pipeline_options.do_table_structure = True
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
pdf_backend=DoclingParseDocumentBackend,
|
pdf_backend=DoclingParseDocumentBackend,
|
||||||
)
|
)
|
||||||
|
|
||||||
for path in pdf_paths:
|
for path in pdf_paths:
|
||||||
|
|
||||||
doc_pred_json = converter.convert_single(path)
|
doc_pred_json=None
|
||||||
|
|
||||||
doc_pred_md = doc.render_as_markdown()
|
try:
|
||||||
|
print(f"converting {path}")
|
||||||
|
doc_pred_json = converter.convert_single(path)
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
|
||||||
|
doc_pred_md = doc_pred_json.render_as_markdown()
|
||||||
|
|
||||||
json_path = path.with_suffix(".json")
|
json_path = path.with_suffix(".json")
|
||||||
md_path = path.with_suffix(".md")
|
md_path = path.with_suffix(".md")
|
||||||
|
Loading…
Reference in New Issue
Block a user