From fed3323e25ec84d66cc7ed15016dad9d3cfd0816 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 2 Oct 2024 17:23:50 +0200 Subject: [PATCH] tesseract is working Signed-off-by: Peter Staar --- docling/datamodel/pipeline_options.py | 2 +- docling/models/tesseract_model.py | 37 ++++++++++++++------- docling/pipeline/standard_model_pipeline.py | 1 - 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 6e3867f4..cc48a461 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -33,7 +33,7 @@ class TesseractOcrOptions(OcrOptions): lang: List[str] = ["fr", "de", "es", "en"] class TesserOcrOptions(OcrOptions): - kind: Literal["tesseract"] = "tesserocr" + kind: Literal["tesserocr"] = "tesserocr" class PipelineOptions(BaseModel): diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py index 687e3b4f..e4e8a73a 100644 --- a/docling/models/tesseract_model.py +++ b/docling/models/tesseract_model.py @@ -1,4 +1,7 @@ import logging +import io +import os + from subprocess import PIPE, Popen from typing import Iterable, Tuple @@ -18,12 +21,15 @@ class TesseractOcrModel(BaseOcrModel): self.scale = 3 # multiplier for 72 dpi == 216 dpi. + self._name = None + self._version = None + if self.enabled: try: self._get_name_and_version() except Exception as exc: - _log.error(f"Tesseract is not available, aborting ...") + _log.error(f"Tesseract is not available, aborting: ", exc.what()) self.enabled = False def _get_name_and_version(self) -> Tuple[str, str]: @@ -70,11 +76,17 @@ class TesseractOcrModel(BaseOcrModel): proc = Popen(cmd, stdout=PIPE) output, _ = proc.communicate() + #_log.info(output) + + # Decode the byte string to a regular string + decoded_data = output.decode('utf-8') + # _log.info(decoded_data) + # Read the TSV file generated by Tesseract - df = pd.read_csv("output_file_name.tsv", sep="\t") + df = pd.read_csv(io.StringIO(decoded_data), sep="\t") # Display the dataframe (optional) - print(df.head()) + # _log.info("df: ", df.head()) # Filter rows that contain actual text (ignore header or empty rows) df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")] @@ -95,37 +107,38 @@ class TesseractOcrModel(BaseOcrModel): high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect ) - print(high_res_image) # FIXME: do we really need to save the image to a file fname = "temporary-file.png" high_res_image.save(fname) + df=None if os.path.exists(fname): df = self._run_tesseract(fname) os.remove(fname) else: _log.error(f"no image file: {fname}") + continue + # _log.info(df) + # Print relevant columns (bounding box and text) - for index, row in df_filtered.iterrows(): - print(row) - + for ix, row in df.iterrows(): text = row["text"] - conf = row["confidence"] + conf = row["conf"] l = float(row["left"]) - t = float(row["top"]) + b = float(row["top"]) w = float(row["width"]) h = float(row["height"]) - b = t - h + t = b + h r = l + w cell = OcrCell( id=ix, text=text, - confidence=line[2], + confidence=conf/100., bbox=BoundingBox.from_tuple( coord=( (l / self.scale) + ocr_rect.l, @@ -144,6 +157,6 @@ class TesseractOcrModel(BaseOcrModel): page.cells.extend(filtered_ocr_cells) # DEBUG code: - self.draw_ocr_rects_and_cells(page, ocr_rects) + # self.draw_ocr_rects_and_cells(page, ocr_rects) yield page diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index c39b83c3..df64fb8a 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import ( ) from docling.models.base_ocr_model import BaseOcrModel from docling.models.easyocr_model import EasyOcrModel -from docling.models.tesseract_model import TesseractOCRModel from docling.models.layout_model import LayoutModel from docling.models.table_structure_model import TableStructureModel from docling.models.tesseract_model import TesseractOcrModel