tesseract is working

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2024-10-02 17:23:50 +02:00 · 2024-10-02 17:23:50 +02:00 · fed3323e25
commit fed3323e25
parent a3e2cf5473
3 changed files with 26 additions and 14 deletions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -33,7 +33,7 @@ class TesseractOcrOptions(OcrOptions):
    lang: List[str] = ["fr", "de", "es", "en"]

 class TesserOcrOptions(OcrOptions):
-    kind: Literal["tesseract"] = "tesserocr"
+    kind: Literal["tesserocr"] = "tesserocr"


 class PipelineOptions(BaseModel):
--- a/docling/models/tesseract_model.py
+++ b/docling/models/tesseract_model.py
@@ -1,4 +1,7 @@
 import logging
+import io
+import os
+
 from subprocess import PIPE, Popen
 from typing import Iterable, Tuple

@@ -18,12 +21,15 @@ class TesseractOcrModel(BaseOcrModel):

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

+        self._name = None
+        self._version = None
+        
        if self.enabled:
            try:
                self._get_name_and_version()

            except Exception as exc:
-                _log.error(f"Tesseract is not available, aborting ...")
+                _log.error(f"Tesseract is not available, aborting: ", exc.what())
                self.enabled = False

    def _get_name_and_version(self) -> Tuple[str, str]:
@@ -70,11 +76,17 @@ class TesseractOcrModel(BaseOcrModel):
        proc = Popen(cmd, stdout=PIPE)
        output, _ = proc.communicate()

+        #_log.info(output)
+        
+        # Decode the byte string to a regular string
+        decoded_data = output.decode('utf-8')
+        # _log.info(decoded_data)
+        
        # Read the TSV file generated by Tesseract
-        df = pd.read_csv("output_file_name.tsv", sep="\t")
+        df = pd.read_csv(io.StringIO(decoded_data), sep="\t")

        # Display the dataframe (optional)
-        print(df.head())
+        # _log.info("df: ", df.head())

        # Filter rows that contain actual text (ignore header or empty rows)
        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
@@ -95,37 +107,38 @@ class TesseractOcrModel(BaseOcrModel):
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )
-                print(high_res_image)

                # FIXME: do we really need to save the image to a file
                fname = "temporary-file.png"
                high_res_image.save(fname)

+                df=None
                if os.path.exists(fname):
                    df = self._run_tesseract(fname)
                    os.remove(fname)
                else:
                    _log.error(f"no image file: {fname}")
+                    continue

+                # _log.info(df)
+                
                # Print relevant columns (bounding box and text)
-                for index, row in df_filtered.iterrows():
-                    print(row)
-
+                for ix, row in df.iterrows():
                    text = row["text"]
-                    conf = row["confidence"]
+                    conf = row["conf"]

                    l = float(row["left"])
-                    t = float(row["top"])
+                    b = float(row["top"])
                    w = float(row["width"])
                    h = float(row["height"])

-                    b = t - h
+                    t = b + h
                    r = l + w

                    cell = OcrCell(
                        id=ix,
                        text=text,
-                        confidence=line[2],
+                        confidence=conf/100.,
                        bbox=BoundingBox.from_tuple(
                            coord=(
                                (l / self.scale) + ocr_rect.l,
@@ -144,6 +157,6 @@ class TesseractOcrModel(BaseOcrModel):
            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
-            self.draw_ocr_rects_and_cells(page, ocr_rects)
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.easyocr_model import EasyOcrModel
-from docling.models.tesseract_model import TesseractOCRModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_model import TesseractOcrModel