chore(OCR): Rename classes to use the Tesseract prefix for the tesserocr binding and the TesseractCLI prefix for the tesseract command-line process

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-10-08 14:44:23 +02:00
parent 074acd703c
commit 70a8a2cc82
8 changed files with 261 additions and 261 deletions

View File

@ -17,8 +17,8 @@ from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
PipelineOptions, PipelineOptions,
TesseractOcrOptions, TesseractCLIOptions,
TesserOcrOptions, TesseractOptions,
) )
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@ -61,8 +61,8 @@ class Backend(str, Enum):
# Define an enum for the ocr engines # Define an enum for the ocr engines
class OcrEngine(str, Enum): class OcrEngine(str, Enum):
EASYOCR = "easyocr" EASYOCR = "easyocr"
TESSERACT_CLI = "tesseract_cli"
TESSERACT = "tesseract" TESSERACT = "tesseract"
TESSEROCR = "tesserocr"
def export_documents( def export_documents(
@ -209,10 +209,10 @@ def convert(
match ocr_engine: match ocr_engine:
case OcrEngine.EASYOCR: case OcrEngine.EASYOCR:
ocr_options = EasyOcrOptions() ocr_options = EasyOcrOptions()
case OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCLIOptions()
case OcrEngine.TESSERACT: case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions() ocr_options = TesseractOptions()
case OcrEngine.TESSEROCR:
ocr_options = TesserOcrOptions()
case _: case _:
raise RuntimeError(f"Unexpected backend type {backend}") raise RuntimeError(f"Unexpected backend type {backend}")

View File

@ -36,7 +36,7 @@ class EasyOcrOptions(OcrOptions):
) )
class TesseractOcrOptions(OcrOptions): class TesseractCLIOptions(OcrOptions):
kind: Literal["tesseract"] = "tesseract" kind: Literal["tesseract"] = "tesseract"
lang: List[str] = ["fra", "deu", "spa", "eng"] lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract" tesseract_cmd: str = "tesseract"
@ -47,7 +47,7 @@ class TesseractOcrOptions(OcrOptions):
) )
class TesserOcrOptions(OcrOptions): class TesseractOptions(OcrOptions):
kind: Literal["tesserocr"] = "tesserocr" kind: Literal["tesserocr"] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"] lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None path: Optional[str] = None
@ -62,6 +62,6 @@ class PipelineOptions(BaseModel):
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions() table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[EasyOcrOptions, TesseractOcrOptions, TesserOcrOptions] = Field( ocr_options: Union[EasyOcrOptions, TesseractCLIOptions, TesseractOptions] = Field(
EasyOcrOptions(), discriminator="kind" EasyOcrOptions(), discriminator="kind"
) )

View File

@ -0,0 +1,167 @@
import csv
import io
import logging
import os
import tempfile
from subprocess import PIPE, Popen
from typing import Iterable, Tuple

import pandas as pd

from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.pipeline_options import TesseractCLIOptions
from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__)
class TesseractCLIModel(BaseOcrModel):
    """OCR model that runs the ``tesseract`` command-line program as a subprocess.

    Every OCR rectangle of a page is rendered to a PNG file, handed to the
    binary configured in ``options.tesseract_cmd``, and the TSV output is
    turned into ``OcrCell`` objects positioned in page coordinates.
    """

    def __init__(self, enabled: bool, options: TesseractCLIOptions):
        """Store the options and, when enabled, verify the binary can be run.

        Args:
            enabled: Whether OCR should be performed at all.
            options: Settings for the command-line engine (command, languages,
                optional tessdata path).

        Raises:
            RuntimeError: If the model is enabled and querying the Tesseract
                version fails for any reason.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractCLIOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        # Cached result of `_get_name_and_version`.
        self._name = None
        self._version = None

        if self.enabled:
            try:
                self._get_name_and_version()
            except Exception as exc:
                # Chain the cause so the underlying failure stays visible.
                raise RuntimeError(
                    f"Tesseract is not available, aborting: {exc} "
                    "Install tesseract on your system and the tesseract binary is discoverable. "
                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                ) from exc

    def _get_name_and_version(self) -> Tuple[str, str]:
        """Return ``(name, version)`` as reported by ``<tesseract_cmd> --version``.

        The result is cached on the instance after the first successful call.
        """
        if self._name is not None and self._version is not None:
            return self._name, self._version

        cmd = [self.options.tesseract_cmd, "--version"]

        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        # communicate() already waits for the process to terminate.
        stdout, stderr = proc.communicate()

        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
        # to stderr, so check both.
        version_line = (
            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
            .split("\n")[0]
            .strip()
        )

        # If everything else fails...
        if not version_line:
            version_line = "tesseract XXX"

        # partition() tolerates a first line with fewer or more than two
        # space-separated tokens, where a plain two-value unpack of split()
        # would raise ValueError.
        name, _, version = version_line.partition(" ")
        version = version.strip()

        self._name = name
        self._version = version

        return name, version

    def _run_tesseract(self, ifilename: str):
        """Run Tesseract on an image file and return the non-empty TSV rows.

        Args:
            ifilename: Path of the image file to recognise.

        Returns:
            A pandas DataFrame built from Tesseract's TSV output, restricted to
            rows whose ``text`` column is present and not blank.
        """
        cmd = [self.options.tesseract_cmd]

        if self.options.lang is not None and len(self.options.lang) > 0:
            cmd.append("-l")
            cmd.append("+".join(self.options.lang))
        if self.options.path is not None:
            cmd.append("--tessdata-dir")
            cmd.append(self.options.path)

        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: %s", " ".join(cmd))

        proc = Popen(cmd, stdout=PIPE)
        output, _ = proc.communicate()

        # Decode the byte string to a regular string
        decoded_data = output.decode("utf-8")

        # Read the TSV produced by Tesseract. Quote handling is disabled so a
        # recognised token containing a double quote cannot swallow the
        # following fields/rows, and `text` is forced to str so the `.str`
        # accessor below works even when every token looks numeric.
        df = pd.read_csv(
            io.StringIO(decoded_data),
            sep="\t",
            quoting=csv.QUOTE_NONE,
            dtype={"text": str},
        )

        # Filter rows that contain actual text (ignore header or empty rows)
        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]

        return df_filtered

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on each page and yield the pages with OCR cells appended.

        When the model is disabled the pages are yielded unchanged.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Write the crop into a private temporary directory instead of
                # an open NamedTemporaryFile: re-opening such a file by name
                # while it is still open is not possible on Windows.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    fname = os.path.join(tmp_dir, "ocr_rect.png")
                    high_res_image.save(fname)

                    df = self._run_tesseract(fname)

                for ix, row in df.iterrows():
                    text = row["text"]
                    conf = row["conf"]

                    # TSV boxes are in pixels of the scaled crop, measured
                    # from its top-left corner.
                    left = float(row["left"])
                    top = float(row["top"])
                    right = left + float(row["width"])
                    bottom = top + float(row["height"])

                    cell = OcrCell(
                        id=ix,
                        text=text,
                        confidence=conf / 100.0,
                        bbox=BoundingBox.from_tuple(
                            # Undo the render scale, then shift by the crop's
                            # position to get page coordinates.
                            coord=(
                                (left / self.scale) + ocr_rect.l,
                                (top / self.scale) + ocr_rect.t,
                                (right / self.scale) + ocr_rect.l,
                                (bottom / self.scale) + ocr_rect.t,
                            ),
                            origin=CoordOrigin.TOPLEFT,
                        ),
                    )
                    all_ocr_cells.append(cell)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page

View File

@ -1,105 +1,65 @@
import io
import logging import logging
import tempfile from typing import Iterable
from subprocess import PIPE, Popen
from typing import Iterable, Tuple
import pandas as pd import numpy
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.pipeline_options import TesseractCLIOptions
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class TesseractOcrModel(BaseOcrModel): class TesseractModel(BaseOcrModel):
def __init__(self, enabled: bool, options: TesseractCLIOptions):
def __init__(self, enabled: bool, options: TesseractOcrOptions):
super().__init__(enabled=enabled, options=options) super().__init__(enabled=enabled, options=options)
self.options: TesseractOcrOptions self.options: TesseractCLIOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi. self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self._name = None
self._version = None
if self.enabled: if self.enabled:
setup_errmsg = (
"tesserocr is not correctly installed. "
"Please install it via `pip install tesserocr` to use this OCR engine. "
"Note that tesserocr might have to be manually compiled for working with"
"your Tesseract installation. The Docling documentation provides examples for it. "
"Alternatively, Docling has support for other OCR engines. See the documentation."
)
try: try:
self._get_name_and_version() import tesserocr
except ImportError:
raise ImportError(setup_errmsg)
except Exception as exc: try:
raise RuntimeError( tesseract_version = tesserocr.tesseract_version()
f"Tesseract is not available, aborting: {exc} " _log.debug("Initializing TesserOCR: %s", tesseract_version)
"Install tesseract on your system and the tesseract binary is discoverable. " except:
"The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. " raise ImportError(setup_errmsg)
"Alternatively, Docling has support for other OCR engines. See the documentation."
# Initialize the tesseractAPI
lang = "+".join(self.options.lang)
if self.options.path is not None:
self.reader = tesserocr.PyTessBaseAPI(
path=self.options.path,
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
) )
else:
self.reader = tesserocr.PyTessBaseAPI(
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
)
self.reader_RIL = tesserocr.RIL
def _get_name_and_version(self) -> Tuple[str, str]: def __del__(self):
if self.reader is not None:
if self._name != None and self._version != None: # Finalize the tesseractAPI
return self._name, self._version self.reader.End()
cmd = [self.options.tesseract_cmd, "--version"]
proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate()
proc.wait()
# HACK: Windows versions of Tesseract output the version to stdout, Linux versions
# to stderr, so check both.
version_line = (
(stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
.split("\n")[0]
.strip()
)
# If everything else fails...
if not version_line:
version_line = "tesseract XXX"
name, version = version_line.split(" ")
self._name = name
self._version = version
return name, version
def _run_tesseract(self, ifilename: str):
cmd = [self.options.tesseract_cmd]
if self.options.lang is not None and len(self.options.lang) > 0:
cmd.append("-l")
cmd.append("+".join(self.options.lang))
if self.options.path is not None:
cmd.append("--tessdata-dir")
cmd.append(self.options.path)
cmd += [ifilename, "stdout", "tsv"]
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE)
output, _ = proc.communicate()
# _log.info(output)
# Decode the byte string to a regular string
decoded_data = output.decode("utf-8")
# _log.info(decoded_data)
# Read the TSV file generated by Tesseract
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
# Display the dataframe (optional)
# _log.info("df: ", df.head())
# Filter rows that contain actual text (ignore header or empty rows)
df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
return df_filtered
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
@ -119,42 +79,37 @@ class TesseractOcrModel(BaseOcrModel):
scale=self.scale, cropbox=ocr_rect scale=self.scale, cropbox=ocr_rect
) )
with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file: # Retrieve text snippets with their bounding boxes
fname = image_file.name self.reader.SetImage(high_res_image)
high_res_image.save(fname) boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
df = self._run_tesseract(fname) cells = []
for ix, (im, box, _, _) in enumerate(boxes):
# Set the area of interest. Tesseract uses Bottom-Left for the origin
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
# _log.info(df) # Extract text within the bounding box
text = self.reader.GetUTF8Text().strip()
confidence = self.reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
top = (box["y"] + box["h"]) / self.scale
# Print relevant columns (bounding box and text) cells.append(
for ix, row in df.iterrows(): OcrCell(
text = row["text"] id=ix,
conf = row["conf"] text=text,
confidence=confidence,
l = float(row["left"]) bbox=BoundingBox.from_tuple(
b = float(row["top"]) coord=(left, top, right, bottom),
w = float(row["width"]) origin=CoordOrigin.TOPLEFT,
h = float(row["height"])
t = b + h
r = l + w
cell = OcrCell(
id=ix,
text=text,
confidence=conf / 100.0,
bbox=BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
), ),
origin=CoordOrigin.TOPLEFT, )
),
) )
all_ocr_cells.append(cell)
# del high_res_image
all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells. ## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

View File

@ -1,122 +0,0 @@
import logging
from typing import Iterable
import numpy
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__)
class TesserOcrModel(BaseOcrModel):
    """OCR model backed by the ``tesserocr`` Python binding.

    A single ``PyTessBaseAPI`` reader is created at construction time (when
    the model is enabled) and reused for every OCR rectangle of every page.
    """

    def __init__(self, enabled: bool, options: TesseractOcrOptions):
        """Store the options and, when enabled, initialise the tesserocr reader.

        Args:
            enabled: Whether OCR should be performed at all.
            options: Engine settings; ``lang`` and ``path`` are read here.

        Raises:
            ImportError: If ``tesserocr`` cannot be imported or its Tesseract
                version cannot be queried.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None

        if self.enabled:
            setup_errmsg = (
                "tesserocr is not correctly installed. "
                "Please install it via `pip install tesserocr` to use this OCR engine. "
                "Note that tesserocr might have to be manually compiled for working with "
                "your Tesseract installation. The Docling documentation provides examples for it. "
                "Alternatively, Docling has support for other OCR engines. See the documentation."
            )

            try:
                import tesserocr
            except ImportError as exc:
                raise ImportError(setup_errmsg) from exc

            try:
                tesseract_version = tesserocr.tesseract_version()
                _log.debug("Initializing TesserOCR: %s", tesseract_version)
            except Exception as exc:
                # `Exception` rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not converted into an ImportError.
                raise ImportError(setup_errmsg) from exc

            # Initialize the tesseractAPI
            lang = "+".join(self.options.lang)
            reader_kwargs = dict(
                lang=lang,
                psm=tesserocr.PSM.AUTO,
                init=True,
                oem=tesserocr.OEM.DEFAULT,
            )
            # Only pass `path` when one is configured, so tesserocr falls back
            # to its own default otherwise.
            if self.options.path is not None:
                reader_kwargs["path"] = self.options.path
            self.reader = tesserocr.PyTessBaseAPI(**reader_kwargs)

            self.reader_RIL = tesserocr.RIL

    def __del__(self):
        # `reader` may be missing if construction failed before it was set.
        reader = getattr(self, "reader", None)
        if reader is not None:
            # Finalize the tesseractAPI
            reader.End()

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on each page and yield the pages with OCR cells appended.

        When the model is disabled the pages are yielded unchanged.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Retrieve text snippets with their bounding boxes
                self.reader.SetImage(high_res_image)
                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)

                cells = []
                for ix, (_, box, _, _) in enumerate(boxes):
                    # Restrict recognition to this text line. The rectangle is
                    # given as left, top, width, height in image pixels.
                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])

                    # Extract text within the bounding box
                    text = self.reader.GetUTF8Text().strip()
                    # NOTE(review): MeanTextConf() looks like a 0-100 score,
                    # while the CLI model stores conf / 100.0 -- confirm which
                    # range OcrCell.confidence expects.
                    confidence = self.reader.MeanTextConf()

                    # Boxes are in pixels of the scaled crop: undo the render
                    # scale, then shift by the crop's position so the cell is
                    # expressed in page coordinates.
                    left = box["x"] / self.scale + ocr_rect.l
                    top = box["y"] / self.scale + ocr_rect.t
                    right = (box["x"] + box["w"]) / self.scale + ocr_rect.l
                    bottom = (box["y"] + box["h"]) / self.scale + ocr_rect.t

                    cells.append(
                        OcrCell(
                            id=ix,
                            text=text,
                            confidence=confidence,
                            bbox=BoundingBox.from_tuple(
                                coord=(left, top, right, bottom),
                                origin=CoordOrigin.TOPLEFT,
                            ),
                        )
                    )

                # del high_res_image
                all_ocr_cells.extend(cells)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page

View File

@ -3,15 +3,15 @@ from pathlib import Path
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
PipelineOptions, PipelineOptions,
TesseractOcrOptions, TesseractCLIOptions,
TesserOcrOptions, TesseractOptions,
) )
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_model import TesseractOcrModel from docling.models.tesseract_cli_model import TesseractCLIModel
from docling.models.tesserocr_model import TesserOcrModel from docling.models.tesseract_model import TesseractModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline from docling.pipeline.base_model_pipeline import BaseModelPipeline
@ -28,13 +28,13 @@ class StandardModelPipeline(BaseModelPipeline):
enabled=pipeline_options.do_ocr, enabled=pipeline_options.do_ocr,
options=pipeline_options.ocr_options, options=pipeline_options.ocr_options,
) )
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions): elif isinstance(pipeline_options.ocr_options, TesseractCLIOptions):
ocr_model = TesseractOcrModel( ocr_model = TesseractCLIModel(
enabled=pipeline_options.do_ocr, enabled=pipeline_options.do_ocr,
options=pipeline_options.ocr_options, options=pipeline_options.ocr_options,
) )
elif isinstance(pipeline_options.ocr_options, TesserOcrOptions): elif isinstance(pipeline_options.ocr_options, TesseractOptions):
ocr_model = TesserOcrModel( ocr_model = TesseractModel(
enabled=pipeline_options.do_ocr, enabled=pipeline_options.do_ocr,
options=pipeline_options.ocr_options, options=pipeline_options.ocr_options,
) )

View File

@ -8,7 +8,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import TesseractOcrOptions, TesserOcrOptions from docling.datamodel.pipeline_options import TesseractCLIOptions, TesseractOptions
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -126,7 +126,7 @@ def main():
pipeline_options.do_ocr = True pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesserOcrOptions() pipeline_options.ocr_options = TesseractOptions()
# Docling Parse with Tesseract CLI # Docling Parse with Tesseract CLI
# ---------------------- # ----------------------
@ -134,7 +134,7 @@ def main():
pipeline_options.do_ocr = True pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesseractOcrOptions() pipeline_options.ocr_options = TesseractCLIOptions()
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
pipeline_options=pipeline_options, pipeline_options=pipeline_options,

View File

@ -7,8 +7,8 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrOptions, OcrOptions,
PipelineOptions, PipelineOptions,
TesseractOcrOptions, TesseractCLIOptions,
TesserOcrOptions, TesseractOptions,
) )
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@ -74,8 +74,8 @@ def test_e2e_conversions():
engines: List[OcrOptions] = [ engines: List[OcrOptions] = [
EasyOcrOptions(), EasyOcrOptions(),
TesserOcrOptions(), TesseractOptions(),
TesseractOcrOptions(), TesseractCLIOptions(),
] ]
for ocr_options in engines: for ocr_options in engines: