fix(tesseract): initialize df_osd to avoid uninitialized variable error (#1718)

* fix: initialize df_osd to avoid uninitialized variable error

  Signed-off-by: IoannisMaras <maras2002@gmail.com>

* Fix formatting

  Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>

* Satisfy mypy, regenerate OCR tests

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: IoannisMaras <maras2002@gmail.com>
Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-10 13:48:13 +00:00 · 2025-06-10 11:57:45 +03:00
parent f7f31137f1
commit e979750ce9
20 changed files with 752 additions and 751 deletions
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -99,12 +99,12 @@ class TesseractOcrCliModel(BaseOcrModel):

        return name, version

-    def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
+    def _run_tesseract(self, ifilename: str, osd: Optional[pd.DataFrame]):
        r"""
        Run tesseract CLI
        """
        cmd = [self.options.tesseract_cmd]
-        if self._is_auto:
+        if self._is_auto and osd is not None:
            lang = self._parse_language(osd)
            if lang is not None:
                cmd.append("-l")
@@ -231,6 +231,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                                fname = image_file.name
                                high_res_image.save(image_file)
                            doc_orientation = 0
+                            df_osd: Optional[pd.DataFrame] = None
                            try:
                                df_osd = self._perform_osd(fname)
                                doc_orientation = _parse_orientation(df_osd)