feat: Implement the TesserOcrModel. Introduce the test_e2e_ocr_conversion.py unit test.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-10-02 17:47:01 +02:00
parent a0e72655f7
commit c28846a866
6 changed files with 108 additions and 84 deletions

View File

@ -32,8 +32,10 @@ class TesseractOcrOptions(OcrOptions):
kind: Literal["tesseract"] = "tesseract" kind: Literal["tesseract"] = "tesseract"
lang: List[str] = ["fr", "de", "es", "en"] lang: List[str] = ["fr", "de", "es", "en"]
class TesserOcrOptions(OcrOptions): class TesserOcrOptions(OcrOptions):
kind: Literal["tesserocr"] = "tesserocr" kind: Literal["tesserocr"] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"]
class PipelineOptions(BaseModel): class PipelineOptions(BaseModel):

View File

@ -1,7 +1,6 @@
import logging
import io import io
import logging
import os import os
from subprocess import PIPE, Popen from subprocess import PIPE, Popen
from typing import Iterable, Tuple from typing import Iterable, Tuple
@ -13,6 +12,7 @@ from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class TesseractOcrModel(BaseOcrModel): class TesseractOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: TesseractOcrOptions): def __init__(self, enabled: bool, options: TesseractOcrOptions):
@ -79,7 +79,7 @@ class TesseractOcrModel(BaseOcrModel):
# _log.info(output) # _log.info(output)
# Decode the byte string to a regular string # Decode the byte string to a regular string
decoded_data = output.decode('utf-8') decoded_data = output.decode("utf-8")
# _log.info(decoded_data) # _log.info(decoded_data)
# Read the TSV file generated by Tesseract # Read the TSV file generated by Tesseract
@ -138,7 +138,7 @@ class TesseractOcrModel(BaseOcrModel):
cell = OcrCell( cell = OcrCell(
id=ix, id=ix,
text=text, text=text,
confidence=conf/100., confidence=conf / 100.0,
bbox=BoundingBox.from_tuple( bbox=BoundingBox.from_tuple(
coord=( coord=(
(l / self.scale) + ocr_rect.l, (l / self.scale) + ocr_rect.l,

View File

@ -2,6 +2,8 @@ import logging
from typing import Iterable from typing import Iterable
import numpy import numpy
import tesserocr
from tesserocr import OEM, PSM, RIL, PyTessBaseAPI
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.pipeline_options import TesseractOcrOptions
@ -16,11 +18,21 @@ class TesserOcrModel(BaseOcrModel):
self.options: TesseractOcrOptions self.options: TesseractOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi. self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
if self.enabled: if self.enabled:
import tesserocr # Initialize the tesseractAPI
lang = "+".join(self.options.lang)
_log.debug("Initializing TesserOCR: %s", tesserocr.tesseract_version())
self.reader = PyTessBaseAPI(
lang=lang, psm=PSM.AUTO, init=True, oem=OEM.DEFAULT
)
self.reader = easyocr.Reader(lang_list=self.options.lang) def __del__(self):
if self.reader is not None:
# Finalize the tesseractAPI
_log.debug("Finalize TesserOCR")
self.reader.End()
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
@ -36,29 +48,38 @@ class TesserOcrModel(BaseOcrModel):
high_res_image = page._backend.get_page_image( high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect scale=self.scale, cropbox=ocr_rect
) )
im = numpy.array(high_res_image)
result = self.reader.readtext(im)
del high_res_image # Retrieve text snippets with their bounding boxes
del im self.reader.SetImage(high_res_image)
boxes = self.reader.GetComponentImages(RIL.TEXTLINE, True)
cells = [ cells = []
for ix, (im, box, _, _) in enumerate(boxes):
# Set the area of interest. Tesseract uses Bottom-Left for the origin
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
# Extract text within the bounding box
text = self.reader.GetUTF8Text().strip()
confidence = self.reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
top = (box["y"] + box["h"]) / self.scale
cells.append(
OcrCell( OcrCell(
id=ix, id=ix,
text=line[1], text=text,
confidence=line[2], confidence=confidence,
bbox=BoundingBox.from_tuple( bbox=BoundingBox.from_tuple(
coord=( # l, b, r, t = coord[0], coord[1], coord[2], coord[3]
(line[0][0][0] / self.scale) + ocr_rect.l, coord=(left, bottom, right, top),
(line[0][0][1] / self.scale) + ocr_rect.t, origin=CoordOrigin.BOTTOMLEFT,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
), ),
) )
for ix, line in enumerate(result) )
]
# del high_res_image
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells. ## Remove OCR cells which overlap with programmatic cells.

View File

@ -11,6 +11,7 @@ from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_model import TesseractOcrModel from docling.models.tesseract_model import TesseractOcrModel
from docling.models.tesserocr_model import TesserOcrModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline from docling.pipeline.base_model_pipeline import BaseModelPipeline
@ -33,12 +34,10 @@ class StandardModelPipeline(BaseModelPipeline):
options=pipeline_options.ocr_options, options=pipeline_options.ocr_options,
) )
elif isinstance(pipeline_options.ocr_options, TesserOcrOptions): elif isinstance(pipeline_options.ocr_options, TesserOcrOptions):
raise NotImplemented() ocr_model = TesserOcrModel(
# TODO enabled=pipeline_options.do_ocr,
# ocr_model = TesseractOcrModel( options=pipeline_options.ocr_options,
# enabled=pipeline_options.do_ocr, )
# options=pipeline_options.ocr_options,
# )
else: else:
raise RuntimeError( raise RuntimeError(
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}." f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."

View File

@ -1,65 +1,62 @@
from pathlib import Path from pathlib import Path
from pydantic import Field
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
from .verify_utils import verify_conversion_result from .verify_utils import verify_conversion_result
# from tests.verify_utils import verify_conversion_result
GENERATE = False GENERATE = False
# Debug # Debug
def save_output(pdf_path: Path, doc_result: ConversionResult): def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
r""" """ r""" """
import json import json
import os import os
parent = pdf_path.parent parent = pdf_path.parent
dict_fn = os.path.join(parent, f"{pdf_path.stem}.json") dict_fn = os.path.join(parent, f"{pdf_path.stem}.{engine}.json")
with open(dict_fn, "w") as fd: with open(dict_fn, "w") as fd:
json.dump(doc_result.render_as_dict(), fd) json.dump(doc_result.render_as_dict(), fd)
pages_fn = os.path.join(parent, f"{pdf_path.stem}.pages.json") pages_fn = os.path.join(parent, f"{pdf_path.stem}.{engine}.pages.json")
pages = [p.model_dump() for p in doc_result.pages] pages = [p.model_dump() for p in doc_result.pages]
with open(pages_fn, "w") as fd: with open(pages_fn, "w") as fd:
json.dump(pages, fd) json.dump(pages, fd)
doctags_fn = os.path.join(parent, f"{pdf_path.stem}.doctags.txt") doctags_fn = os.path.join(parent, f"{pdf_path.stem}.{engine}.doctags.txt")
with open(doctags_fn, "w") as fd: with open(doctags_fn, "w") as fd:
fd.write(doc_result.render_as_doctags()) fd.write(doc_result.render_as_doctags())
md_fn = os.path.join(parent, f"{pdf_path.stem}.md") md_fn = os.path.join(parent, f"{pdf_path.stem}.{engine}.md")
with open(md_fn, "w") as fd: with open(md_fn, "w") as fd:
fd.write(doc_result.render_as_markdown()) fd.write(doc_result.render_as_markdown())
def get_pdf_paths(): def get_pdf_paths():
# TODO: Debug
# Define the directory you want to search # Define the directory you want to search
# directory = Path("./tests/data") directory = Path("./tests/data_scanned")
directory = Path("./tests/data/scanned")
# List all PDF files in the directory and its subdirectories # List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.pdf")) pdf_files = sorted(directory.rglob("*.pdf"))
return pdf_files return pdf_files
def get_easyocr_converter(): def get_converter(engine: str):
ocr_options = EasyOcrOptions(
)
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
# Debug
pipeline_options.do_ocr = True pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
if engine == "tesserocr":
pipeline_options.ocr_options = TesseractOcrOptions()
converter = DocumentConverter( converter = DocumentConverter(
pipeline_options=pipeline_options, pipeline_options=pipeline_options,
@ -68,34 +65,30 @@ def get_easyocr_converter():
return converter return converter
def get_tesseract_converter():
pipeline_options = PipelineOptions()
# Debug
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
return converter
def test_e2e_conversions(): def test_e2e_conversions():
pdf_paths = get_pdf_paths() pdf_paths = get_pdf_paths()
converter = get_converter()
for engine in ["easyocr", "tesserocr"]:
print(f"Converting with ocr_engine: {engine}")
converter = get_converter(engine)
for pdf_path in pdf_paths: for pdf_path in pdf_paths:
print(f"converting {pdf_path}") print(f"converting {pdf_path}")
doc_result: ConversionResult = converter.convert_single(pdf_path) doc_result: ConversionResult = converter.convert_single(pdf_path)
# # Save conversions
# save_output(pdf_path, doc_result, engine)
# Debug # Debug
verify_conversion_result( verify_conversion_result(
input_path=pdf_path, doc_result=doc_result, generate=GENERATE input_path=pdf_path,
doc_result=doc_result,
generate=GENERATE,
ocr_engine=engine,
) )
# if __name__ == "__main__":
# test_e2e_conversions()

View File

@ -127,7 +127,10 @@ def verify_dt(doc_pred_dt, doc_true_dt):
def verify_conversion_result( def verify_conversion_result(
input_path: Path, doc_result: ConversionResult, generate=False input_path: Path,
doc_result: ConversionResult,
generate=False,
ocr_engine=None,
): ):
PageList = TypeAdapter(List[Page]) PageList = TypeAdapter(List[Page])
@ -140,10 +143,16 @@ def verify_conversion_result(
doc_pred_md = doc_result.render_as_markdown() doc_pred_md = doc_result.render_as_markdown()
doc_pred_dt = doc_result.render_as_doctags() doc_pred_dt = doc_result.render_as_doctags()
pages_path = input_path.with_suffix(".pages.json") # pages_path = input_path.with_suffix(".pages.json")
json_path = input_path.with_suffix(".json") # json_path = input_path.with_suffix(".json")
md_path = input_path.with_suffix(".md") # md_path = input_path.with_suffix(".md")
dt_path = input_path.with_suffix(".doctags.txt") # dt_path = input_path.with_suffix(".doctags.txt")
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
json_path = input_path.with_suffix(f"{engine_suffix}.json")
md_path = input_path.with_suffix(f"{engine_suffix}.md")
dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")
if generate: # only used when re-generating truth if generate: # only used when re-generating truth
with open(pages_path, "w") as fw: with open(pages_path, "w") as fw: