fix: tesseract_ocr_cli csv parsing fails when text contains an unpaired quote character

This commit is contained in:
Gaspard Petit 2024-12-02 01:59:17 -05:00
parent 8ccb3c6db6
commit 2f6f6c1b41

View File

@@ -1,3 +1,4 @@
+import csv
import io
import logging
import tempfile
@@ -95,7 +96,7 @@ class TesseractOcrCliModel(BaseOcrModel):
# _log.info(decoded_data)
# Read the TSV file generated by Tesseract
-df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
+df = pd.read_csv(io.StringIO(decoded_data), sep="\t", quoting=csv.QUOTE_NONE)
# Display the dataframe (optional)
# _log.info("df: ", df.head())