mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
init
Signed-off-by: felix <felixdittrich92@gmail.com>
This commit is contained in:
parent
b3d111a3cd
commit
35f185f545
@ -151,6 +151,32 @@ class RapidOcrOptions(OcrOptions):
|
||||
)
|
||||
|
||||
|
||||
class OnnxtrOcrOptions(OcrOptions):
    """Options for the Onnxtr engine.

    Unknown fields are rejected (``extra="forbid"``).
    """

    # Discriminator value identifying this options class.
    kind: ClassVar[Literal["onnxtr"]] = "onnxtr"

    # Languages; defaults to English and French.
    lang: List[str] = ["en", "fr"]
    # NOTE(review): presumably the minimum word confidence to keep -- confirm
    # against the model that consumes these options.
    confidence_score: float = 0.5

    # Text detection architecture name.
    det_arch: str = "fast_base"
    # NOTE: This can be also a hf hub model
    reco_arch: str = "crnn_vgg16_bn"
    # NOTE: Should be 1 because docling seems not to support batch processing yet
    det_bs: int = 1
    # Batch size for the recognition stage.
    reco_bs: int = 512
    # Whether the engine should try to correct page orientation.
    auto_correct_orientation: bool = False
    preserve_aspect_ratio: bool = True
    symmetric_pad: bool = True
    paragraph_break: float = 0.035
    load_in_8_bit: bool = False
    # Free-form engine configuration for the detection, recognition and
    # classification stages respectively (empty by default).
    det_engine_cfg: Dict[str, Any] = {}
    reco_engine_cfg: Dict[str, Any] = {}
    clf_engine_cfg: Dict[str, Any] = {}

    model_config = ConfigDict(extra="forbid")
|
||||
|
||||
|
||||
class EasyOcrOptions(OcrOptions):
|
||||
"""Options for the EasyOCR engine."""
|
||||
|
||||
|
174
docling/models/onnxtr_model.py
Normal file
174
docling/models/onnxtr_model.py
Normal file
@ -0,0 +1,174 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
OcrOptions,
|
||||
OnnxtrOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OnnxtrOcrModel(BaseOcrModel):
    """OCR model that runs the OnnxTR predictor on the OCR regions of a page.

    For every OCR rectangle of a page, the region is rendered at
    ``self.scale`` times the page resolution, passed to the OnnxTR predictor,
    and each recognized word is turned into a ``TextCell`` in page
    coordinates (top-left origin).
    """

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: OnnxtrOcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """Create the model and, when enabled, build the OnnxTR predictor.

        Args:
            enabled: Whether OCR should run at all.
            artifacts_path: Forwarded to the base class.
            options: OnnxTR specific options.
            accelerator_options: Forwarded to the base class.

        Raises:
            ImportError: If the model is enabled but OnnxTR is not installed.
        """
        super().__init__(
            enabled=enabled,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: OnnxtrOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        if self.enabled:
            try:
                from onnxtr.models import from_hub, ocr_predictor  # type: ignore
            except ImportError as err:
                raise ImportError(
                    "OnnxTR is not installed. Please install it via `pip install 'onnxtr[gpu]'` to use this OCR engine. "
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                ) from err

            if self.options.auto_correct_orientation:
                config = {
                    "assume_straight_pages": False,
                    "straighten_pages": True,
                    "export_as_straight_boxes": True,
                    # Disable crop orientation because we straighten the pages already
                    "disable_crop_orientation": True,
                    "disable_page_orientation": False,
                }
            else:
                config = {
                    "assume_straight_pages": True,
                    "straighten_pages": False,
                    "export_as_straight_boxes": True,
                    "disable_crop_orientation": False,
                    "disable_page_orientation": False,
                }

            # NOTE(review): `accelerator_options` and the `*_engine_cfg`
            # options are accepted but not forwarded to OnnxTR here.
            self.reader = ocr_predictor(
                det_arch=self._resolve_arch(self.options.det_arch, from_hub),
                reco_arch=self._resolve_arch(self.options.reco_arch, from_hub),
                # Forward the configured batch sizes instead of silently
                # falling back to the library defaults.
                det_bs=self.options.det_bs,
                reco_bs=self.options.reco_bs,
                preserve_aspect_ratio=self.options.preserve_aspect_ratio,
                symmetric_pad=self.options.symmetric_pad,
                paragraph_break=self.options.paragraph_break,
                load_in_8_bit=self.options.load_in_8_bit,
                **config,
            )

    @staticmethod
    def _resolve_arch(arch: str, from_hub):
        """Return a hub-loaded model for ``owner/name`` ids, else the name itself."""
        # Exactly one "/" is treated as a Hugging Face hub repository id.
        return from_hub(arch) if arch.count("/") == 1 else arch

    def _to_absolute_and_docling_format(
        self, geom: list[list[float]], img_shape: tuple[int, int]
    ) -> tuple[int, int, int, int]:
        """
        Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.

        The relative coordinates are multiplied by the image size and divided
        by ``self.scale``, so the result is expressed in unscaled units
        relative to the top-left corner of the image.

        Args:
            geom (list): Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
            img_shape (tuple[int, int]): (height, width) of the image

        Returns:
            tuple: (x1, y1, x2, y2)

        Raises:
            ValueError: If ``geom`` has neither 2 nor 4 points.
        """
        h, w = img_shape
        scale_inv = 1 / self.scale  # Precompute inverse for efficiency

        def scale_point(x: float, y: float) -> tuple[int, int]:
            """Scale and round a point to absolute coordinates."""
            return int(round(x * w * scale_inv)), int(round(y * h * scale_inv))

        if len(geom) == 2:
            (xmin, ymin), (xmax, ymax) = geom
            x1, y1 = scale_point(xmin, ymin)
            x2, y2 = scale_point(xmax, ymax)
        elif len(geom) == 4:
            # Polygon: reduce to its axis-aligned bounding box.
            abs_points = [scale_point(*point) for point in geom]
            x1, y1 = min(p[0] for p in abs_points), min(p[1] for p in abs_points)
            x2, y2 = max(p[0] for p in abs_points), max(p[1] for p in abs_points)
        else:
            raise ValueError(f"Invalid geometry format: {geom}. Expected either 2 or 4 points.")

        return x1, y1, x2, y2

    def __call__(self, conv_res: ConversionResult, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on every valid page of ``page_batch`` and yield the pages.

        Pages are yielded unchanged when the model is disabled or when the
        page backend is not valid.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "ocr"):
                ocr_rects = self.get_ocr_rects(page)
                all_ocr_cells = []

                for ocr_rect in ocr_rects:
                    if ocr_rect.area() == 0:
                        continue

                    with page._backend.get_page_image(
                        scale=self.scale, cropbox=ocr_rect
                    ) as high_res_image:
                        im_width, im_height = high_res_image.size
                        result = self.reader([numpy.array(high_res_image)])

                    if result is None:
                        continue

                    for result_page in result.pages:
                        words = (
                            word
                            for block in result_page.blocks
                            for line in block.lines
                            for word in line.words
                        )
                        for ix, word in enumerate(words):
                            # Drop words below the configured confidence.
                            if word.confidence < self.options.confidence_score:
                                continue

                            x1, y1, x2, y2 = self._to_absolute_and_docling_format(
                                word.geometry, img_shape=(im_height, im_width)
                            )
                            # The image was cropped to `ocr_rect`, so the box is
                            # relative to the crop; shift it back into page
                            # coordinates.
                            coord = (
                                x1 + ocr_rect.l,
                                y1 + ocr_rect.t,
                                x2 + ocr_rect.l,
                                y2 + ocr_rect.t,
                            )
                            all_ocr_cells.append(
                                TextCell(
                                    index=ix,
                                    text=word.value,
                                    orig=word.value,
                                    from_ocr=True,
                                    confidence=word.confidence,
                                    rect=BoundingRectangle.from_bounding_box(
                                        BoundingBox.from_tuple(
                                            coord,
                                            origin=CoordOrigin.TOPLEFT,
                                        )
                                    ),
                                )
                            )

                # Post-process the cells
                page.cells = self.post_process_cells(all_ocr_cells, page.cells)

            # DEBUG code:
            if settings.debug.visualize_ocr:
                self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

            yield page

    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        """Return the options class this model is configured with."""
        return OnnxtrOcrOptions
|
@ -3,6 +3,7 @@ from docling.models.ocr_mac_model import OcrMacModel
|
||||
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
||||
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||
from docling.models.onnxtr_model import OnnxtrOcrModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
|
||||
@ -13,6 +14,7 @@ def ocr_engines():
|
||||
EasyOcrModel,
|
||||
OcrMacModel,
|
||||
RapidOcrModel,
|
||||
OnnxtrOcrModel,
|
||||
TesseractOcrModel,
|
||||
TesseractOcrCliModel,
|
||||
]
|
||||
|
@ -72,11 +72,12 @@ openpyxl = "^3.1.5"
|
||||
lxml = ">=4.0.0,<6.0.0"
|
||||
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
|
||||
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
||||
onnxtr = { extras= ["gpu", "viz"], version = "^0.6.3", optional = true, markers = "python_version < '3.13'" }
|
||||
onnxruntime = [
  # 1.19.2 is the last version with python3.9 support,
  # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
  # so the <1.20.0 cap must apply to python < 3.10, not to newer interpreters.
  { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
]
|
||||
|
||||
transformers = [
|
||||
|
@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
OnnxtrOcrOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
@ -62,6 +63,7 @@ def test_e2e_conversions():
|
||||
TesseractOcrOptions(),
|
||||
TesseractCliOcrOptions(),
|
||||
EasyOcrOptions(force_full_page_ocr=True),
|
||||
OnnxtrOcrOptions(force_full_page_ocr=True),
|
||||
TesseractOcrOptions(force_full_page_ocr=True),
|
||||
TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
|
||||
TesseractCliOcrOptions(force_full_page_ocr=True),
|
||||
|
Loading…
Reference in New Issue
Block a user