From fed3323e25ec84d66cc7ed15016dad9d3cfd0816 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Wed, 2 Oct 2024 17:23:50 +0200
Subject: [PATCH] tesseract is working

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling/datamodel/pipeline_options.py       |  2 +-
 docling/models/tesseract_model.py           | 37 ++++++++++++++-------
 docling/pipeline/standard_model_pipeline.py |  1 -
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 6e3867f4..cc48a461 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -33,7 +33,7 @@ class TesseractOcrOptions(OcrOptions):
     lang: List[str] = ["fr", "de", "es", "en"]
 
 class TesserOcrOptions(OcrOptions):
-    kind: Literal["tesseract"] = "tesserocr"
+    kind: Literal["tesserocr"] = "tesserocr"
 
 
 class PipelineOptions(BaseModel):
diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py
index 687e3b4f..e4e8a73a 100644
--- a/docling/models/tesseract_model.py
+++ b/docling/models/tesseract_model.py
@@ -1,4 +1,7 @@
 import logging
+import io
+import os
+
 from subprocess import PIPE, Popen
 from typing import Iterable, Tuple
 
@@ -18,12 +21,15 @@ class TesseractOcrModel(BaseOcrModel):
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
 
+        self._name = None
+        self._version = None
+        
         if self.enabled:
             try:
                 self._get_name_and_version()
 
             except Exception as exc:
-                _log.error(f"Tesseract is not available, aborting ...")
+                _log.error("Tesseract is not available, aborting: %s", exc)
                 self.enabled = False
 
     def _get_name_and_version(self) -> Tuple[str, str]:
@@ -70,11 +76,17 @@ class TesseractOcrModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE)
         output, _ = proc.communicate()
 
+        #_log.info(output)
+        
+        # Decode the byte string to a regular string
+        decoded_data = output.decode('utf-8')
+        # _log.info(decoded_data)
+        
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv("output_file_name.tsv", sep="\t")
+        df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
 
         # Display the dataframe (optional)
-        print(df.head())
+        # _log.info("df: ", df.head())
 
         # Filter rows that contain actual text (ignore header or empty rows)
         df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
@@ -95,37 +107,38 @@ class TesseractOcrModel(BaseOcrModel):
                 high_res_image = page._backend.get_page_image(
                     scale=self.scale, cropbox=ocr_rect
                 )
-                print(high_res_image)
 
                 # FIXME: do we really need to save the image to a file
                 fname = "temporary-file.png"
                 high_res_image.save(fname)
 
+                df=None
                 if os.path.exists(fname):
                     df = self._run_tesseract(fname)
                     os.remove(fname)
                 else:
                     _log.error(f"no image file: {fname}")
+                    continue
 
+                # _log.info(df)
+                
                 # Print relevant columns (bounding box and text)
-                for index, row in df_filtered.iterrows():
-                    print(row)
-
+                for ix, row in df.iterrows():
                     text = row["text"]
-                    conf = row["confidence"]
+                    conf = row["conf"]
 
                     l = float(row["left"])
-                    t = float(row["top"])
+                    b = float(row["top"])
                     w = float(row["width"])
                     h = float(row["height"])
 
-                    b = t - h
+                    t = b + h
                     r = l + w
 
                     cell = OcrCell(
                         id=ix,
                         text=text,
-                        confidence=line[2],
+                        confidence=conf/100.,
                         bbox=BoundingBox.from_tuple(
                             coord=(
                                 (l / self.scale) + ocr_rect.l,
@@ -144,6 +157,6 @@ class TesseractOcrModel(BaseOcrModel):
             page.cells.extend(filtered_ocr_cells)
 
             # DEBUG code:
-            self.draw_ocr_rects_and_cells(page, ocr_rects)
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
 
             yield page
diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py
index c39b83c3..df64fb8a 100644
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.easyocr_model import EasyOcrModel
-from docling.models.tesseract_model import TesseractOCRModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_model import TesseractOcrModel