feat(ocr): added support for RapidOCR engine (#415)

* adding rapidocr engine for ocr in docling

Signed-off-by: swayam-singhal <swayam.singhal@inito.com>

* fixing styling format

Signed-off-by: Swaymaw <swaymaw@gmail.com>

* updating pyproject.toml and poetry.lock to fix ci bugs

Signed-off-by: Swaymaw <swaymaw@gmail.com>

* help poetry pinning for python3.9

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* simplifying rapidocr options so that device can be changed using a single option for all models

Signed-off-by: Swaymaw <swaymaw@gmail.com>

* fix styling issues and small bug in rapidOcrOptions

Signed-off-by: Swaymaw <swaymaw@gmail.com>

* use default device until we enable global management

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: swayam-singhal <swayam.singhal@inito.com>
Signed-off-by: Swaymaw <swaymaw@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: swayam-singhal <swayam.singhal@inito.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Swaymaw
2024-11-27 18:27:41 +05:30
committed by GitHub
parent 767563bf8b
commit 85b29990be
9 changed files with 405 additions and 13 deletions

View File

@@ -50,6 +50,13 @@ marko = "^2.1.2"
openpyxl = "^3.1.5"
lxml = ">=4.0.0,<6.0.0"
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
# onnxruntime is declared with one constraint per Python version range;
# the environment markers select which entry applies.
onnxruntime = [
# 1.19.2 is the last onnxruntime release with Python 3.9 support,
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
# Python < 3.10: stay below 1.20.0.
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
# Python >= 3.10: no 1.20 cap; caret constraint on 1.7.0.
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -104,6 +111,7 @@ torchvision = [
# Extras: each key is an extra name, each value lists the optional
# dependencies that extra enables.
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
# rapidocr enables both rapidocr-onnxruntime and the separately
# constrained onnxruntime dependency.
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts]
docling = "docling.cli.main:app"