mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
fix: tesseract_ocr_cli csv parsing fails when text contains single quotes
This commit is contained in:
parent
8ccb3c6db6
commit
2f6f6c1b41
@@ -1,3 +1,4 @@
|
||||
+ import csv
|
||||
import io
|
||||
import logging
|
||||
import tempfile
|
||||
@@ -95,7 +96,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
# _log.info(decoded_data)
|
||||
|
||||
# Read the TSV file generated by Tesseract
|
||||
- df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
||||
+ df = pd.read_csv(io.StringIO(decoded_data), sep="\t", quoting=csv.QUOTE_NONE)
|
||||
|
||||
# Display the dataframe (optional)
|
||||
# _log.info("df: ", df.head())
|
||||
|
Loading…
Reference in New Issue
Block a user