feat: add options for choosing OCR engines (#118)

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com>
Co-authored-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2024-10-08 19:07:08 +02:00
committed by GitHub
parent d412c363d7
commit f96ea86a00
20 changed files with 699 additions and 32 deletions

View File

@@ -0,0 +1,3 @@
<document>
<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
</document>

View File

@@ -0,0 +1 @@
{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test_8.pdf", "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 509.4446716308594, 767.422119140625], "page": 1, "span": [0, 94]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": []}

View File

@@ -0,0 +1 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package

View File

@@ -0,0 +1 @@
[{"page_no": 0, "page_hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]

Binary file not shown.

View File

@@ -0,0 +1,98 @@
from pathlib import Path
from typing import List
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrOptions,
PipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter
from .verify_utils import verify_conversion_result
GENERATE = False
# Debug
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
r""" """
import json
import os
parent = pdf_path.parent
eng = "" if engine is None else f".{engine}"
dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
with open(dict_fn, "w") as fd:
json.dump(doc_result.render_as_dict(), fd)
pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
pages = [p.model_dump() for p in doc_result.pages]
with open(pages_fn, "w") as fd:
json.dump(pages, fd)
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
with open(doctags_fn, "w") as fd:
fd.write(doc_result.render_as_doctags())
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
with open(md_fn, "w") as fd:
fd.write(doc_result.render_as_markdown())
def get_pdf_paths():
# Define the directory you want to search
directory = Path("./tests/data_scanned")
# List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.pdf"))
return pdf_files
def get_converter(ocr_options: OcrOptions):
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = ocr_options
converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
return converter
def test_e2e_conversions():
pdf_paths = get_pdf_paths()
engines: List[OcrOptions] = [
EasyOcrOptions(),
TesseractOcrOptions(),
TesseractCliOcrOptions(),
]
for ocr_options in engines:
print(f"Converting with ocr_engine: {ocr_options.kind}")
converter = get_converter(ocr_options=ocr_options)
for pdf_path in pdf_paths:
print(f"converting {pdf_path}")
doc_result: ConversionResult = converter.convert_single(pdf_path)
# Save conversions
# save_output(pdf_path, doc_result, None)
# Debug
verify_conversion_result(
input_path=pdf_path,
doc_result=doc_result,
generate=GENERATE,
skip_cells=True,
)

View File

@@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):
def verify_conversion_result(
input_path: Path, doc_result: ConversionResult, generate=False
input_path: Path,
doc_result: ConversionResult,
generate: bool = False,
ocr_engine: str = None,
skip_cells: bool = False,
):
PageList = TypeAdapter(List[Page])
@@ -143,10 +147,11 @@ def verify_conversion_result(
doc_pred_md = doc_result.render_as_markdown()
doc_pred_dt = doc_result.render_as_doctags()
pages_path = input_path.with_suffix(".pages.json")
json_path = input_path.with_suffix(".json")
md_path = input_path.with_suffix(".md")
dt_path = input_path.with_suffix(".doctags.txt")
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
json_path = input_path.with_suffix(f"{engine_suffix}.json")
md_path = input_path.with_suffix(f"{engine_suffix}.md")
dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")
if generate: # only used when re-generating truth
with open(pages_path, "w") as fw:
@@ -173,9 +178,10 @@ def verify_conversion_result(
with open(dt_path, "r") as fr:
doc_true_dt = fr.read()
assert verify_cells(
doc_pred_pages, doc_true_pages
), f"Mismatch in PDF cell prediction for {input_path}"
if not skip_cells:
assert verify_cells(
doc_pred_pages, doc_true_pages
), f"Mismatch in PDF cell prediction for {input_path}"
# assert verify_output(
# doc_pred, doc_true