mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
feat: add support for ocrmac
OCR engine on macOS
- Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo <nuridol@gmail.com>
This commit is contained in:
parent
6c22cba0a7
commit
4aaf128384
@ -28,6 +28,7 @@ from docling.datamodel.pipeline_options import (
|
||||
TableFormerMode,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
OcrMacOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
|
||||
@ -73,6 +74,7 @@ class OcrEngine(str, Enum):
|
||||
EASYOCR = "easyocr"
|
||||
TESSERACT_CLI = "tesseract_cli"
|
||||
TESSERACT = "tesseract"
|
||||
OCRMAC = "ocrmac"
|
||||
|
||||
|
||||
def export_documents(
|
||||
@ -224,6 +226,8 @@ def convert(
|
||||
ocr_options = TesseractCliOcrOptions()
|
||||
case OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions()
|
||||
case OcrEngine.OCRMAC:
|
||||
ocr_options = OcrMacOptions()
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
|
@ -60,6 +60,16 @@ class TesseractOcrOptions(OcrOptions):
|
||||
extra="forbid",
|
||||
)
|
||||
|
||||
class OcrMacOptions(OcrOptions):
|
||||
kind: Literal["ocrmac"] = "ocrmac"
|
||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||
recognition: str = "accurate"
|
||||
framework: str = "vision"
|
||||
|
||||
model_config = ConfigDict(
|
||||
extra="forbid",
|
||||
)
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
create_legacy_output: bool = (
|
||||
@ -73,7 +83,7 @@ class PdfPipelineOptions(PipelineOptions):
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
|
||||
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = (
|
||||
Field(EasyOcrOptions(), discriminator="kind")
|
||||
)
|
||||
|
||||
|
123
docling/models/ocr_mac_model.py
Normal file
123
docling/models/ocr_mac_model.py
Normal file
@ -0,0 +1,123 @@
|
||||
import logging
|
||||
import tempfile
|
||||
|
||||
from typing import Iterable, Optional, Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrMacOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OcrMacModel(BaseOcrModel):
|
||||
def __init__(self, enabled: bool, options: OcrMacOptions):
|
||||
super().__init__(enabled=enabled, options=options)
|
||||
self.options: OcrMacOptions
|
||||
|
||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||
|
||||
if self.enabled:
|
||||
setup_errmsg = (
|
||||
"ocrmac is not correctly installed. "
|
||||
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
||||
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||
)
|
||||
try:
|
||||
from ocrmac import ocrmac
|
||||
except ImportError:
|
||||
raise ImportError(setup_errmsg)
|
||||
|
||||
self.reader_RIL = ocrmac.OCR
|
||||
|
||||
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
|
||||
for page_idx, page in enumerate(page_batch):
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "ocr"):
|
||||
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".png", mode="w"
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(fname)
|
||||
|
||||
boxes = self.reader_RIL(fname,
|
||||
recognition_level=self.options.recognition,
|
||||
framework=self.options.framework,
|
||||
language_preference=self.options.lang,
|
||||
).recognize()
|
||||
|
||||
im_width, im_height = high_res_image.size
|
||||
cells = []
|
||||
for ix, (text, confidence, box) in enumerate(boxes):
|
||||
x = float(box[0])
|
||||
y = float(box[1])
|
||||
w = float(box[2])
|
||||
h = float(box[3])
|
||||
|
||||
x1 = x * im_width
|
||||
y2 = (1 - y) * im_height
|
||||
|
||||
x2 = x1 + w * im_width
|
||||
y1 = y2 - h * im_height
|
||||
|
||||
left = x1 / self.scale
|
||||
top = y1 / self.scale
|
||||
right = x2 / self.scale
|
||||
bottom = y2 / self.scale
|
||||
|
||||
cells.append(
|
||||
OcrCell(
|
||||
id=ix,
|
||||
text=text,
|
||||
confidence=confidence,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# del high_res_image
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||
|
||||
yield page
|
@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
OcrMacOptions,
|
||||
)
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
@ -26,6 +27,7 @@ from docling.models.page_preprocessing_model import (
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docling.models.ocr_mac_model import OcrMacModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
@ -118,6 +120,11 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
||||
return OcrMacModel(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
return None
|
||||
|
||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
|
@ -30,6 +30,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
|
||||
| [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
|
||||
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
|
||||
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
|
||||
| OcrMac | System dependency. See description below. | `OcrMacOptions` |
|
||||
|
||||
The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example
|
||||
|
||||
|
@ -89,6 +89,7 @@ torchvision = [
|
||||
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"},
|
||||
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"}
|
||||
]
|
||||
ocrmac = {markers = "sys_platform == 'darwin'", version = "^1.0.0"}
|
||||
|
||||
[tool.poetry.extras]
|
||||
tesserocr = ["tesserocr"]
|
||||
|
Loading…
Reference in New Issue
Block a user