mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
fix: tesseract_ocr_cli csv parsing fails when text contains single quotes
This commit is contained in:
parent
8ccb3c6db6
commit
2f6f6c1b41
@ -1,3 +1,4 @@
|
|||||||
|
import csv
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import tempfile
|
import tempfile
|
||||||
@ -95,7 +96,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
# _log.info(decoded_data)
|
# _log.info(decoded_data)
|
||||||
|
|
||||||
# Read the TSV file generated by Tesseract
|
# Read the TSV file generated by Tesseract
|
||||||
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
df = pd.read_csv(io.StringIO(decoded_data), sep="\t", quoting=csv.QUOTE_NONE)
|
||||||
|
|
||||||
# Display the dataframe (optional)
|
# Display the dataframe (optional)
|
||||||
# _log.info("df: ", df.head())
|
# _log.info("df: ", df.head())
|
||||||
|
Loading…
Reference in New Issue
Block a user