mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
style & quality applied
Signed-off-by: felix <felixdittrich92@gmail.com>
This commit is contained in:
parent
7c87467ea5
commit
a19cf81f98
@ -161,18 +161,12 @@ class OnnxtrOcrOptions(OcrOptions):
|
|||||||
|
|
||||||
det_arch: str = "fast_base"
|
det_arch: str = "fast_base"
|
||||||
reco_arch: str = "crnn_vgg16_bn" # NOTE: This can be also a hf hub model
|
reco_arch: str = "crnn_vgg16_bn" # NOTE: This can be also a hf hub model
|
||||||
det_bs: int = (
|
|
||||||
1 # NOTE: Should be 1 because docling seems not to support batch processing yet
|
|
||||||
)
|
|
||||||
reco_bs: int = 512
|
reco_bs: int = 512
|
||||||
auto_correct_orientation: bool = False
|
auto_correct_orientation: bool = False
|
||||||
preserve_aspect_ratio: bool = True
|
preserve_aspect_ratio: bool = True
|
||||||
symmetric_pad: bool = True
|
symmetric_pad: bool = True
|
||||||
paragraph_break: float = 0.035
|
paragraph_break: float = 0.035
|
||||||
load_in_8_bit: bool = False
|
load_in_8_bit: bool = False
|
||||||
det_engine_cfg: Dict[str, Any] = {}
|
|
||||||
reco_engine_cfg: Dict[str, Any] = {}
|
|
||||||
clf_engine_cfg: Dict[str, Any] = {}
|
|
||||||
|
|
||||||
model_config = ConfigDict(
|
model_config = ConfigDict(
|
||||||
extra="forbid",
|
extra="forbid",
|
||||||
|
@ -1,8 +1,10 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Type
|
from typing import Iterable, Optional, Type
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
import numpy as np
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
@ -42,7 +44,15 @@ class OnnxtrOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
try:
|
try:
|
||||||
from onnxtr.models import ocr_predictor, EngineConfig, from_hub # type: ignore
|
from onnxtr.models import ( # type: ignore
|
||||||
|
EngineConfig,
|
||||||
|
from_hub,
|
||||||
|
ocr_predictor,
|
||||||
|
)
|
||||||
|
|
||||||
|
# We disable multiprocessing for OnnxTR,
|
||||||
|
# because the speedup is minimal and it can cause memory leaks on Windows
|
||||||
|
os.environ["ONNXTR_MULTIPROCESSING_DISABLE"] = "TRUE"
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"OnnxTR is not installed. Please install it via `pip install 'onnxtr[gpu]'` to use this OCR engine. "
|
"OnnxTR is not installed. Please install it via `pip install 'onnxtr[gpu]'` to use this OCR engine. "
|
||||||
@ -62,6 +72,7 @@ class OnnxtrOcrModel(BaseOcrModel):
|
|||||||
config = {
|
config = {
|
||||||
"assume_straight_pages": True,
|
"assume_straight_pages": True,
|
||||||
"straighten_pages": False,
|
"straighten_pages": False,
|
||||||
|
# This should be disabled when docling supports polygons
|
||||||
"export_as_straight_boxes": True,
|
"export_as_straight_boxes": True,
|
||||||
"disable_crop_orientation": False,
|
"disable_crop_orientation": False,
|
||||||
"disable_page_orientation": False,
|
"disable_page_orientation": False,
|
||||||
@ -78,15 +89,22 @@ class OnnxtrOcrModel(BaseOcrModel):
|
|||||||
if self.options.reco_arch.count("/") == 1
|
if self.options.reco_arch.count("/") == 1
|
||||||
else self.options.reco_arch
|
else self.options.reco_arch
|
||||||
),
|
),
|
||||||
|
det_bs=1, # NOTE: Should be always 1, because docling handles batching
|
||||||
preserve_aspect_ratio=self.options.preserve_aspect_ratio,
|
preserve_aspect_ratio=self.options.preserve_aspect_ratio,
|
||||||
symmetric_pad=self.options.symmetric_pad,
|
symmetric_pad=self.options.symmetric_pad,
|
||||||
paragraph_break=self.options.paragraph_break,
|
paragraph_break=self.options.paragraph_break,
|
||||||
load_in_8_bit=self.options.load_in_8_bit,
|
load_in_8_bit=self.options.load_in_8_bit,
|
||||||
**config,
|
**config,
|
||||||
|
# TODO: Allow specification of the engine configs in the options
|
||||||
|
det_engine_cfg=None,
|
||||||
|
reco_engine_cfg=None,
|
||||||
|
clf_engine_cfg=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _to_absolute_and_docling_format(
|
def _to_absolute_and_docling_format(
|
||||||
self, geom: list[list[float]], img_shape: tuple[int, int]
|
self,
|
||||||
|
geom: tuple[tuple[float, float], tuple[float, float]] | np.ndarray,
|
||||||
|
img_shape: tuple[int, int],
|
||||||
) -> tuple[int, int, int, int]:
|
) -> tuple[int, int, int, int]:
|
||||||
"""
|
"""
|
||||||
Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.
|
Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.
|
||||||
@ -109,14 +127,11 @@ class OnnxtrOcrModel(BaseOcrModel):
|
|||||||
(xmin, ymin), (xmax, ymax) = geom
|
(xmin, ymin), (xmax, ymax) = geom
|
||||||
x1, y1 = scale_point(xmin, ymin)
|
x1, y1 = scale_point(xmin, ymin)
|
||||||
x2, y2 = scale_point(xmax, ymax)
|
x2, y2 = scale_point(xmax, ymax)
|
||||||
elif len(geom) == 4:
|
# 4-Point polygon
|
||||||
|
else:
|
||||||
abs_points = [scale_point(*point) for point in geom]
|
abs_points = [scale_point(*point) for point in geom]
|
||||||
x1, y1 = min(p[0] for p in abs_points), min(p[1] for p in abs_points)
|
x1, y1 = min(p[0] for p in abs_points), min(p[1] for p in abs_points)
|
||||||
x2, y2 = max(p[0] for p in abs_points), max(p[1] for p in abs_points)
|
x2, y2 = max(p[0] for p in abs_points), max(p[1] for p in abs_points)
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
f"Invalid geometry format: {geom}. Expected either 2 or 4 points."
|
|
||||||
)
|
|
||||||
|
|
||||||
return x1, y1, x2, y2
|
return x1, y1, x2, y2
|
||||||
|
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.ocr_mac_model import OcrMacModel
|
from docling.models.ocr_mac_model import OcrMacModel
|
||||||
|
from docling.models.onnxtr_model import OnnxtrOcrModel
|
||||||
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
||||||
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
||||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||||
from docling.models.onnxtr_model import OnnxtrOcrModel
|
|
||||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||||
|
|
||||||
|
40
docs/examples/onnxtr_with_custom_models.py
Normal file
40
docs/examples/onnxtr_with_custom_models.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
from docling.datamodel.pipeline_options import OnnxtrOcrOptions, PdfPipelineOptions
|
||||||
|
from docling.document_converter import (
|
||||||
|
ConversionResult,
|
||||||
|
DocumentConverter,
|
||||||
|
InputFormat,
|
||||||
|
PdfFormatOption,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Source document to convert
|
||||||
|
source = "https://arxiv.org/pdf/2408.09869v4"
|
||||||
|
|
||||||
|
ocr_options = OnnxtrOcrOptions(
|
||||||
|
det_arch="db_mobilenet_v3_large",
|
||||||
|
reco_arch="Felix92/onnxtr-parseq-multilingual-v1", # Model will be downloaded from Hugging Face Hub
|
||||||
|
auto_correct_orientation=True, # This can be used to correct the orientation of the pages
|
||||||
|
)
|
||||||
|
|
||||||
|
pipeline_options = PdfPipelineOptions(
|
||||||
|
ocr_options=ocr_options,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Convert the document
|
||||||
|
converter = DocumentConverter(
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
conversion_result: ConversionResult = converter.convert(source=source)
|
||||||
|
doc = conversion_result.document
|
||||||
|
md = doc.export_to_markdown()
|
||||||
|
print(md)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
1473
poetry.lock
generated
1473
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -44,7 +44,7 @@ packages = [{ include = "docling" }]
|
|||||||
######################
|
######################
|
||||||
# actual dependencies:
|
# actual dependencies:
|
||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.10"
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-core = {extras = ["chunking"], version = "^2.24.1"}
|
docling-core = {extras = ["chunking"], version = "^2.24.1"}
|
||||||
docling-ibm-models = "^3.4.0"
|
docling-ibm-models = "^3.4.0"
|
||||||
@ -72,7 +72,7 @@ openpyxl = "^3.1.5"
|
|||||||
lxml = ">=4.0.0,<6.0.0"
|
lxml = ">=4.0.0,<6.0.0"
|
||||||
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
|
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
|
||||||
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
||||||
onnxtr = { extras= ["gpu", "viz"], version = "^0.6.3", optional = true, markers = "python_version < '3.13'" }
|
onnxtr = { extras= ["gpu", "viz"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
|
||||||
onnxruntime = [
|
onnxruntime = [
|
||||||
# 1.19.2 is the last version with python3.9 support,
|
# 1.19.2 is the last version with python3.9 support,
|
||||||
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
||||||
|
@ -11,9 +11,9 @@ from docling.datamodel.pipeline_options import (
|
|||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrMacOptions,
|
OcrMacOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
|
OnnxtrOcrOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
RapidOcrOptions,
|
RapidOcrOptions,
|
||||||
OnnxtrOcrOptions,
|
|
||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user