tesseract is working

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-10-02 17:23:50 +02:00
parent a3e2cf5473
commit fed3323e25
3 changed files with 26 additions and 14 deletions

View File

@ -33,7 +33,7 @@ class TesseractOcrOptions(OcrOptions):
lang: List[str] = ["fr", "de", "es", "en"] lang: List[str] = ["fr", "de", "es", "en"]
class TesserOcrOptions(OcrOptions): class TesserOcrOptions(OcrOptions):
kind: Literal["tesseract"] = "tesserocr" kind: Literal["tesserocr"] = "tesserocr"
class PipelineOptions(BaseModel): class PipelineOptions(BaseModel):

View File

@ -1,4 +1,7 @@
import logging import logging
import io
import os
from subprocess import PIPE, Popen from subprocess import PIPE, Popen
from typing import Iterable, Tuple from typing import Iterable, Tuple
@ -18,12 +21,15 @@ class TesseractOcrModel(BaseOcrModel):
self.scale = 3 # multiplier for 72 dpi == 216 dpi. self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self._name = None
self._version = None
if self.enabled: if self.enabled:
try: try:
self._get_name_and_version() self._get_name_and_version()
except Exception as exc: except Exception as exc:
_log.error(f"Tesseract is not available, aborting ...") _log.error(f"Tesseract is not available, aborting: ", exc.what())
self.enabled = False self.enabled = False
def _get_name_and_version(self) -> Tuple[str, str]: def _get_name_and_version(self) -> Tuple[str, str]:
@ -70,11 +76,17 @@ class TesseractOcrModel(BaseOcrModel):
proc = Popen(cmd, stdout=PIPE) proc = Popen(cmd, stdout=PIPE)
output, _ = proc.communicate() output, _ = proc.communicate()
#_log.info(output)
# Decode the byte string to a regular string
decoded_data = output.decode('utf-8')
# _log.info(decoded_data)
# Read the TSV file generated by Tesseract # Read the TSV file generated by Tesseract
df = pd.read_csv("output_file_name.tsv", sep="\t") df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
# Display the dataframe (optional) # Display the dataframe (optional)
print(df.head()) # _log.info("df: ", df.head())
# Filter rows that contain actual text (ignore header or empty rows) # Filter rows that contain actual text (ignore header or empty rows)
df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")] df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
@ -95,37 +107,38 @@ class TesseractOcrModel(BaseOcrModel):
high_res_image = page._backend.get_page_image( high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect scale=self.scale, cropbox=ocr_rect
) )
print(high_res_image)
# FIXME: do we really need to save the image to a file # FIXME: do we really need to save the image to a file
fname = "temporary-file.png" fname = "temporary-file.png"
high_res_image.save(fname) high_res_image.save(fname)
df=None
if os.path.exists(fname): if os.path.exists(fname):
df = self._run_tesseract(fname) df = self._run_tesseract(fname)
os.remove(fname) os.remove(fname)
else: else:
_log.error(f"no image file: {fname}") _log.error(f"no image file: {fname}")
continue
# _log.info(df)
# Print relevant columns (bounding box and text) # Print relevant columns (bounding box and text)
for index, row in df_filtered.iterrows(): for ix, row in df.iterrows():
print(row)
text = row["text"] text = row["text"]
conf = row["confidence"] conf = row["conf"]
l = float(row["left"]) l = float(row["left"])
t = float(row["top"]) b = float(row["top"])
w = float(row["width"]) w = float(row["width"])
h = float(row["height"]) h = float(row["height"])
b = t - h t = b + h
r = l + w r = l + w
cell = OcrCell( cell = OcrCell(
id=ix, id=ix,
text=text, text=text,
confidence=line[2], confidence=conf/100.,
bbox=BoundingBox.from_tuple( bbox=BoundingBox.from_tuple(
coord=( coord=(
(l / self.scale) + ocr_rect.l, (l / self.scale) + ocr_rect.l,
@ -144,6 +157,6 @@ class TesseractOcrModel(BaseOcrModel):
page.cells.extend(filtered_ocr_cells) page.cells.extend(filtered_ocr_cells)
# DEBUG code: # DEBUG code:
self.draw_ocr_rects_and_cells(page, ocr_rects) # self.draw_ocr_rects_and_cells(page, ocr_rects)
yield page yield page

View File

@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
) )
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
from docling.models.tesseract_model import TesseractOCRModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_model import TesseractOcrModel from docling.models.tesseract_model import TesseractOcrModel