fix: python3.9 support (#396)

* fixes for python3.9

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* pin docling-parse with python3.9 wheels

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update deps

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2024-11-20 15:21:40 +01:00
parent 6efa96c983
commit 7b013abcf3
6 changed files with 238 additions and 247 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -254,17 +254,16 @@ def convert(
    export_txt = OutputFormat.TEXT in to_formats
    export_doctags = OutputFormat.DOCTAGS in to_formats

-    match ocr_engine:
-        case OcrEngine.EASYOCR:
-            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
-        case OcrEngine.TESSERACT_CLI:
-            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
-        case OcrEngine.TESSERACT:
-            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
-        case OcrEngine.OCRMAC:
-            ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
-        case _:
-            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+    if ocr_engine == OcrEngine.EASYOCR:
+        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
+    elif ocr_engine == OcrEngine.TESSERACT_CLI:
+        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
+    elif ocr_engine == OcrEngine.TESSERACT:
+        ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
+    elif ocr_engine == OcrEngine.OCRMAC:
+        ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
+    else:
+        raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

    ocr_lang_list = _split_list(ocr_lang)
    if ocr_lang_list is not None:
@@ -281,15 +280,14 @@ def convert(
    if artifacts_path is not None:
        pipeline_options.artifacts_path = artifacts_path

-    match pdf_backend:
-        case PdfBackend.DLPARSE_V1:
-            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-        case PdfBackend.DLPARSE_V2:
-            backend = DoclingParseV2DocumentBackend
-        case PdfBackend.PYPDFIUM2:
-            backend = PyPdfiumDocumentBackend
-        case _:
-            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+    if pdf_backend == PdfBackend.DLPARSE_V1:
+        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+    elif pdf_backend == PdfBackend.DLPARSE_V2:
+        backend = DoclingParseV2DocumentBackend
+    elif pdf_backend == PdfBackend.PYPDFIUM2:
+        backend = PyPdfiumDocumentBackend
+    else:
+        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

    format_options: Dict[InputFormat, FormatOption] = {
        InputFormat.PDF: PdfFormatOption(
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -3,7 +3,7 @@ import sys
 import time
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Type
+from typing import Dict, Iterable, Iterator, List, Optional, Type, Union

 from pydantic import BaseModel, ConfigDict, model_validator, validate_call

@@ -155,7 +155,7 @@ class DocumentConverter:
    @validate_call(config=ConfigDict(strict=True))
    def convert(
        self,
-        source: Path | str | DocumentStream,  # TODO review naming
+        source: Union[Path, str, DocumentStream],  # TODO review naming
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
@@ -172,7 +172,7 @@ class DocumentConverter:
    @validate_call(config=ConfigDict(strict=True))
    def convert_all(
        self,
-        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,