feat: add options for choosing OCR engines (#118)

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com>
Co-authored-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2024-10-08 19:07:08 +02:00
committed by GitHub
parent d412c363d7
commit f96ea86a00
20 changed files with 699 additions and 32 deletions

View File

@@ -110,7 +110,10 @@ class BoundingBox(BaseModel):
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
def area(self) -> float:
return (self.r - self.l) * (self.b - self.t)
area = (self.r - self.l) * (self.b - self.t)
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
area = -area
return area
def intersection_area_with(self, other: "BoundingBox") -> float:
# Calculate intersection coordinates

View File

@@ -1,6 +1,7 @@
from enum import Enum, auto
from typing import List, Literal, Optional, Union
from pydantic import BaseModel
from pydantic import BaseModel, ConfigDict, Field
class TableFormerMode(str, Enum):
@@ -18,8 +19,49 @@ class TableStructureOptions(BaseModel):
mode: TableFormerMode = TableFormerMode.FAST
class OcrOptions(BaseModel):
kind: str
class EasyOcrOptions(OcrOptions):
kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: bool = True # same default as easyocr.Reader
model_storage_directory: Optional[str] = None
download_enabled: bool = True # same default as easyocr.Reader
model_config = ConfigDict(
extra="forbid",
protected_namespaces=(),
)
class TesseractCliOcrOptions(OcrOptions):
kind: Literal["tesseract"] = "tesseract"
lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract"
path: Optional[str] = None
model_config = ConfigDict(
extra="forbid",
)
class TesseractOcrOptions(OcrOptions):
kind: Literal["tesserocr"] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None
model_config = ConfigDict(
extra="forbid",
)
class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
Field(EasyOcrOptions(), discriminator="kind")
)