From a52305990b0c763f7d490361c535b5e6001fbf53 Mon Sep 17 00:00:00 2001
From: guglie
Date: Fri, 29 Nov 2024 17:25:06 +0100
Subject: [PATCH] fix: ParserError EOF inside string (#470)

Signed-off-by: guglie
---
 docling/models/tesseract_ocr_cli_model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index 9a50eee0..a6b2f7fb 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -1,3 +1,4 @@
+import csv
 import io
 import logging
 import tempfile
@@ -95,7 +96,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info(decoded_data)
 
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
+        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
 
         # Display the dataframe (optional)
         # _log.info("df: ", df.head())