mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
tesseract is working
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
a3e2cf5473
commit
fed3323e25
@ -33,7 +33,7 @@ class TesseractOcrOptions(OcrOptions):
|
|||||||
lang: List[str] = ["fr", "de", "es", "en"]
|
lang: List[str] = ["fr", "de", "es", "en"]
|
||||||
|
|
||||||
class TesserOcrOptions(OcrOptions):
|
class TesserOcrOptions(OcrOptions):
|
||||||
kind: Literal["tesseract"] = "tesserocr"
|
kind: Literal["tesserocr"] = "tesserocr"
|
||||||
|
|
||||||
|
|
||||||
class PipelineOptions(BaseModel):
|
class PipelineOptions(BaseModel):
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
|
||||||
from subprocess import PIPE, Popen
|
from subprocess import PIPE, Popen
|
||||||
from typing import Iterable, Tuple
|
from typing import Iterable, Tuple
|
||||||
|
|
||||||
@ -18,12 +21,15 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
|
|
||||||
|
self._name = None
|
||||||
|
self._version = None
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
try:
|
try:
|
||||||
self._get_name_and_version()
|
self._get_name_and_version()
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
_log.error(f"Tesseract is not available, aborting ...")
|
_log.error(f"Tesseract is not available, aborting: ", exc.what())
|
||||||
self.enabled = False
|
self.enabled = False
|
||||||
|
|
||||||
def _get_name_and_version(self) -> Tuple[str, str]:
|
def _get_name_and_version(self) -> Tuple[str, str]:
|
||||||
@ -70,11 +76,17 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
proc = Popen(cmd, stdout=PIPE)
|
proc = Popen(cmd, stdout=PIPE)
|
||||||
output, _ = proc.communicate()
|
output, _ = proc.communicate()
|
||||||
|
|
||||||
|
#_log.info(output)
|
||||||
|
|
||||||
|
# Decode the byte string to a regular string
|
||||||
|
decoded_data = output.decode('utf-8')
|
||||||
|
# _log.info(decoded_data)
|
||||||
|
|
||||||
# Read the TSV file generated by Tesseract
|
# Read the TSV file generated by Tesseract
|
||||||
df = pd.read_csv("output_file_name.tsv", sep="\t")
|
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
||||||
|
|
||||||
# Display the dataframe (optional)
|
# Display the dataframe (optional)
|
||||||
print(df.head())
|
# _log.info("df: ", df.head())
|
||||||
|
|
||||||
# Filter rows that contain actual text (ignore header or empty rows)
|
# Filter rows that contain actual text (ignore header or empty rows)
|
||||||
df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
|
df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
|
||||||
@ -95,37 +107,38 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
high_res_image = page._backend.get_page_image(
|
high_res_image = page._backend.get_page_image(
|
||||||
scale=self.scale, cropbox=ocr_rect
|
scale=self.scale, cropbox=ocr_rect
|
||||||
)
|
)
|
||||||
print(high_res_image)
|
|
||||||
|
|
||||||
# FIXME: do we really need to save the image to a file
|
# FIXME: do we really need to save the image to a file
|
||||||
fname = "temporary-file.png"
|
fname = "temporary-file.png"
|
||||||
high_res_image.save(fname)
|
high_res_image.save(fname)
|
||||||
|
|
||||||
|
df=None
|
||||||
if os.path.exists(fname):
|
if os.path.exists(fname):
|
||||||
df = self._run_tesseract(fname)
|
df = self._run_tesseract(fname)
|
||||||
os.remove(fname)
|
os.remove(fname)
|
||||||
else:
|
else:
|
||||||
_log.error(f"no image file: {fname}")
|
_log.error(f"no image file: {fname}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# _log.info(df)
|
||||||
|
|
||||||
# Print relevant columns (bounding box and text)
|
# Print relevant columns (bounding box and text)
|
||||||
for index, row in df_filtered.iterrows():
|
for ix, row in df.iterrows():
|
||||||
print(row)
|
|
||||||
|
|
||||||
text = row["text"]
|
text = row["text"]
|
||||||
conf = row["confidence"]
|
conf = row["conf"]
|
||||||
|
|
||||||
l = float(row["left"])
|
l = float(row["left"])
|
||||||
t = float(row["top"])
|
b = float(row["top"])
|
||||||
w = float(row["width"])
|
w = float(row["width"])
|
||||||
h = float(row["height"])
|
h = float(row["height"])
|
||||||
|
|
||||||
b = t - h
|
t = b + h
|
||||||
r = l + w
|
r = l + w
|
||||||
|
|
||||||
cell = OcrCell(
|
cell = OcrCell(
|
||||||
id=ix,
|
id=ix,
|
||||||
text=text,
|
text=text,
|
||||||
confidence=line[2],
|
confidence=conf/100.,
|
||||||
bbox=BoundingBox.from_tuple(
|
bbox=BoundingBox.from_tuple(
|
||||||
coord=(
|
coord=(
|
||||||
(l / self.scale) + ocr_rect.l,
|
(l / self.scale) + ocr_rect.l,
|
||||||
@ -144,6 +157,6 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
page.cells.extend(filtered_ocr_cells)
|
page.cells.extend(filtered_ocr_cells)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
self.draw_ocr_rects_and_cells(page, ocr_rects)
|
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
|
|||||||
)
|
)
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.tesseract_model import TesseractOCRModel
|
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
from docling.models.tesseract_model import TesseractOcrModel
|
from docling.models.tesseract_model import TesseractOcrModel
|
||||||
|
Loading…
Reference in New Issue
Block a user