mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
feat: add options for choosing OCR engine
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
cde671cf34
commit
bbfc0617f2
@ -1,6 +1,7 @@
|
|||||||
from enum import Enum, auto
|
from enum import Enum, auto
|
||||||
|
from typing import List, Literal, Union
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
class TableFormerMode(str, Enum):
|
class TableFormerMode(str, Enum):
|
||||||
@ -18,8 +19,24 @@ class TableStructureOptions(BaseModel):
|
|||||||
mode: TableFormerMode = TableFormerMode.FAST
|
mode: TableFormerMode = TableFormerMode.FAST
|
||||||
|
|
||||||
|
|
||||||
|
class OcrOptions(BaseModel):
    """Common base for the per-engine OCR option models.

    The engine-specific subclasses in this file pin ``kind`` to a fixed
    literal naming their engine.
    """

    # Name of the OCR engine this option set belongs to.
    kind: str
|
||||||
|
|
||||||
|
|
||||||
|
class EasyOcrOptions(OcrOptions):
    """Options selecting and configuring the EasyOCR engine."""

    # Fixed tag identifying this option set as the EasyOCR one.
    kind: Literal["easyocr"] = "easyocr"
    # Language codes; EasyOcrModel passes them to easyocr.Reader as lang_list.
    lang: List[str] = ["fr", "de", "es", "en"]
|
||||||
|
|
||||||
|
|
||||||
|
class TesseractOcrOptions(OcrOptions):
    """Options selecting the Tesseract OCR engine.

    NOTE(review): the pipeline shown in this change does not construct a
    Tesseract model yet; selecting these options raises instead.
    """

    # Fixed tag identifying this option set as the Tesseract one.
    kind: Literal["tesseract"] = "tesseract"
|
||||||
|
|
||||||
|
|
||||||
class PipelineOptions(BaseModel):
    """Switches and per-stage options for the model pipeline."""

    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()
    # Tagged union of the OCR option models, told apart by their `kind`
    # field; EasyOCR is the default.
    ocr_options: Union[EasyOcrOptions, TesseractOcrOptions] = Field(
        default=EasyOcrOptions(), discriminator="kind"
    )
|
||||||
|
@ -3,21 +3,21 @@ import logging
|
|||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from typing import Iterable, List, Tuple
|
from typing import Iterable, List, Tuple
|
||||||
|
|
||||||
import numpy
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from rtree import index
|
from rtree import index
|
||||||
from scipy.ndimage import find_objects, label
|
from scipy.ndimage import find_objects, label
|
||||||
|
|
||||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||||
|
from docling.datamodel.pipeline_options import OcrOptions
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BaseOcrModel:
|
class BaseOcrModel:
|
||||||
def __init__(self, config):
|
def __init__(self, enabled: bool, options: OcrOptions):
|
||||||
self.config = config
|
self.enabled = enabled
|
||||||
self.enabled = config["enabled"]
|
self.options = options
|
||||||
|
|
||||||
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
||||||
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
||||||
|
@ -4,21 +4,23 @@ from typing import Iterable
|
|||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||||
|
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class EasyOcrModel(BaseOcrModel):
|
class EasyOcrModel(BaseOcrModel):
|
||||||
def __init__(self, config):
|
def __init__(self, enabled: bool, options: EasyOcrOptions):
|
||||||
super().__init__(config)
|
super().__init__(enabled=enabled, options=options)
|
||||||
|
self.options: EasyOcrOptions
|
||||||
|
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
import easyocr
|
import easyocr
|
||||||
|
|
||||||
self.reader = easyocr.Reader(config["lang"])
|
self.reader = easyocr.Reader(lang_list=self.options.lang)
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
|
||||||
|
@ -1,6 +1,11 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
EasyOcrOptions,
|
||||||
|
PipelineOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
)
|
||||||
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
@ -14,19 +19,35 @@ class StandardModelPipeline(BaseModelPipeline):
|
|||||||
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
||||||
super().__init__(artifacts_path, pipeline_options)
|
super().__init__(artifacts_path, pipeline_options)
|
||||||
|
|
||||||
|
# Choose the OCR model implementation that matches the configured options.
ocr_model: BaseOcrModel
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
    ocr_model = EasyOcrModel(
        enabled=pipeline_options.do_ocr,
        options=pipeline_options.ocr_options,
    )
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
    # TODO: construct the Tesseract model once it exists:
    # ocr_model = TesseractOcrModel(
    #     enabled=pipeline_options.do_ocr,
    #     options=pipeline_options.ocr_options,
    # )
    # NotImplementedError is the exception class; the NotImplemented
    # constant is not callable and would surface as a TypeError instead.
    raise NotImplementedError("The Tesseract OCR engine is not implemented yet.")
else:
    raise RuntimeError(
        f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
    )
|
||||||
|
|
||||||
self.model_pipe = [
|
self.model_pipe = [
|
||||||
EasyOcrModel(
|
# OCR
|
||||||
config={
|
ocr_model,
|
||||||
"lang": ["fr", "de", "es", "en"],
|
# Layout
|
||||||
"enabled": pipeline_options.do_ocr,
|
|
||||||
}
|
|
||||||
),
|
|
||||||
LayoutModel(
|
LayoutModel(
|
||||||
config={
|
config={
|
||||||
"artifacts_path": artifacts_path
|
"artifacts_path": artifacts_path
|
||||||
/ StandardModelPipeline._layout_model_path
|
/ StandardModelPipeline._layout_model_path
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
|
# Table structure
|
||||||
TableStructureModel(
|
TableStructureModel(
|
||||||
config={
|
config={
|
||||||
"artifacts_path": artifacts_path
|
"artifacts_path": artifacts_path
|
||||||
|
Loading…
Reference in New Issue
Block a user