style & quality applied

Signed-off-by: felix <felixdittrich92@gmail.com>
This commit is contained in:
felix 2025-03-21 21:49:50 +01:00
parent 7c87467ea5
commit a19cf81f98
7 changed files with 1509 additions and 47 deletions

View File

@ -161,18 +161,12 @@ class OnnxtrOcrOptions(OcrOptions):
det_arch: str = "fast_base" det_arch: str = "fast_base"
reco_arch: str = "crnn_vgg16_bn" # NOTE: This can be also a hf hub model reco_arch: str = "crnn_vgg16_bn" # NOTE: This can be also a hf hub model
det_bs: int = (
1 # NOTE: Should be 1 because docling seems not to support batch processing yet
)
reco_bs: int = 512 reco_bs: int = 512
auto_correct_orientation: bool = False auto_correct_orientation: bool = False
preserve_aspect_ratio: bool = True preserve_aspect_ratio: bool = True
symmetric_pad: bool = True symmetric_pad: bool = True
paragraph_break: float = 0.035 paragraph_break: float = 0.035
load_in_8_bit: bool = False load_in_8_bit: bool = False
det_engine_cfg: Dict[str, Any] = {}
reco_engine_cfg: Dict[str, Any] = {}
clf_engine_cfg: Dict[str, Any] = {}
model_config = ConfigDict( model_config = ConfigDict(
extra="forbid", extra="forbid",

View File

@ -1,8 +1,10 @@
import logging import logging
import os
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type from typing import Iterable, Optional, Type
import numpy import numpy
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell from docling_core.types.doc.page import BoundingRectangle, TextCell
@ -42,7 +44,15 @@ class OnnxtrOcrModel(BaseOcrModel):
if self.enabled: if self.enabled:
try: try:
from onnxtr.models import ocr_predictor, EngineConfig, from_hub # type: ignore from onnxtr.models import ( # type: ignore
EngineConfig,
from_hub,
ocr_predictor,
)
# We disable multiprocessing for OnnxTR,
# because the speed-up is minimal and it can cause memory leaks on Windows
os.environ["ONNXTR_MULTIPROCESSING_DISABLE"] = "TRUE"
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"OnnxTR is not installed. Please install it via `pip install 'onnxtr[gpu]'` to use this OCR engine. " "OnnxTR is not installed. Please install it via `pip install 'onnxtr[gpu]'` to use this OCR engine. "
@ -62,6 +72,7 @@ class OnnxtrOcrModel(BaseOcrModel):
config = { config = {
"assume_straight_pages": True, "assume_straight_pages": True,
"straighten_pages": False, "straighten_pages": False,
# This should be disabled when docling supports polygons
"export_as_straight_boxes": True, "export_as_straight_boxes": True,
"disable_crop_orientation": False, "disable_crop_orientation": False,
"disable_page_orientation": False, "disable_page_orientation": False,
@ -78,15 +89,22 @@ class OnnxtrOcrModel(BaseOcrModel):
if self.options.reco_arch.count("/") == 1 if self.options.reco_arch.count("/") == 1
else self.options.reco_arch else self.options.reco_arch
), ),
det_bs=1, # NOTE: Should be always 1, because docling handles batching
preserve_aspect_ratio=self.options.preserve_aspect_ratio, preserve_aspect_ratio=self.options.preserve_aspect_ratio,
symmetric_pad=self.options.symmetric_pad, symmetric_pad=self.options.symmetric_pad,
paragraph_break=self.options.paragraph_break, paragraph_break=self.options.paragraph_break,
load_in_8_bit=self.options.load_in_8_bit, load_in_8_bit=self.options.load_in_8_bit,
**config, **config,
# TODO: Allow specification of the engine configs in the options
det_engine_cfg=None,
reco_engine_cfg=None,
clf_engine_cfg=None,
) )
def _to_absolute_and_docling_format( def _to_absolute_and_docling_format(
self, geom: list[list[float]], img_shape: tuple[int, int] self,
geom: tuple[tuple[float, float], tuple[float, float]] | np.ndarray,
img_shape: tuple[int, int],
) -> tuple[int, int, int, int]: ) -> tuple[int, int, int, int]:
""" """
Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format. Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.
@ -109,14 +127,11 @@ class OnnxtrOcrModel(BaseOcrModel):
(xmin, ymin), (xmax, ymax) = geom (xmin, ymin), (xmax, ymax) = geom
x1, y1 = scale_point(xmin, ymin) x1, y1 = scale_point(xmin, ymin)
x2, y2 = scale_point(xmax, ymax) x2, y2 = scale_point(xmax, ymax)
elif len(geom) == 4: # 4-Point polygon
else:
abs_points = [scale_point(*point) for point in geom] abs_points = [scale_point(*point) for point in geom]
x1, y1 = min(p[0] for p in abs_points), min(p[1] for p in abs_points) x1, y1 = min(p[0] for p in abs_points), min(p[1] for p in abs_points)
x2, y2 = max(p[0] for p in abs_points), max(p[1] for p in abs_points) x2, y2 = max(p[0] for p in abs_points), max(p[1] for p in abs_points)
else:
raise ValueError(
f"Invalid geometry format: {geom}. Expected either 2 or 4 points."
)
return x1, y1, x2, y2 return x1, y1, x2, y2

View File

@ -1,9 +1,9 @@
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
from docling.models.ocr_mac_model import OcrMacModel from docling.models.ocr_mac_model import OcrMacModel
from docling.models.onnxtr_model import OnnxtrOcrModel
from docling.models.picture_description_api_model import PictureDescriptionApiModel from docling.models.picture_description_api_model import PictureDescriptionApiModel
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.rapid_ocr_model import RapidOcrModel from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.onnxtr_model import OnnxtrOcrModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.models.tesseract_ocr_model import TesseractOcrModel

View File

@ -0,0 +1,40 @@
from docling.datamodel.pipeline_options import OnnxtrOcrOptions, PdfPipelineOptions
from docling.document_converter import (
ConversionResult,
DocumentConverter,
InputFormat,
PdfFormatOption,
)
def main():
    """Convert a sample PDF using the OnnxTR OCR engine and print it as Markdown."""
    # The document that will be converted
    pdf_url = "https://arxiv.org/pdf/2408.09869v4"

    onnxtr_options = OnnxtrOcrOptions(
        det_arch="db_mobilenet_v3_large",
        # This recognition model is downloaded from the Hugging Face Hub
        reco_arch="Felix92/onnxtr-parseq-multilingual-v1",
        # Can be used to correct the orientation of the pages
        auto_correct_orientation=True,
    )

    # Wire the OCR options into the PDF pipeline used by the converter
    pdf_format = PdfFormatOption(
        pipeline_options=PdfPipelineOptions(ocr_options=onnxtr_options),
    )
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format},
    )

    # Run the conversion and emit the resulting document as Markdown
    result: ConversionResult = doc_converter.convert(source=pdf_url)
    print(result.document.export_to_markdown())


if __name__ == "__main__":
    main()

1473
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -44,7 +44,7 @@ packages = [{ include = "docling" }]
###################### ######################
# actual dependencies: # actual dependencies:
###################### ######################
python = "^3.9" python = "^3.10"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = {extras = ["chunking"], version = "^2.24.1"} docling-core = {extras = ["chunking"], version = "^2.24.1"}
docling-ibm-models = "^3.4.0" docling-ibm-models = "^3.4.0"
@ -72,7 +72,7 @@ openpyxl = "^3.1.5"
lxml = ">=4.0.0,<6.0.0" lxml = ">=4.0.0,<6.0.0"
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true } ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" } rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
onnxtr = { extras= ["gpu", "viz"], version = "^0.6.3", optional = true, markers = "python_version < '3.13'" } onnxtr = { extras= ["gpu", "viz"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
onnxruntime = [ onnxruntime = [
# 1.19.2 is the last version with python3.9 support, # 1.19.2 is the last version with python3.9 support,
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0 # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0

View File

@ -11,9 +11,9 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrMacOptions, OcrMacOptions,
OcrOptions, OcrOptions,
OnnxtrOcrOptions,
PdfPipelineOptions, PdfPipelineOptions,
RapidOcrOptions, RapidOcrOptions,
OnnxtrOcrOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )