mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
add examples for switching OCR engine and CLI support
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
8d1c1d6dd5
commit
0b76211eed
@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
EasyOcrOptions,
|
||||||
|
PipelineOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
TesserOcrOptions,
|
||||||
|
)
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
@ -53,6 +58,13 @@ class Backend(str, Enum):
|
|||||||
DOCLING = "docling"
|
DOCLING = "docling"
|
||||||
|
|
||||||
|
|
||||||
|
# Define an enum for the ocr engines
|
||||||
|
class OcrEngine(str, Enum):
|
||||||
|
EASYOCR = "easyocr"
|
||||||
|
TESSERACT = "tesseract"
|
||||||
|
TESSEROCR = "tesserocr"
|
||||||
|
|
||||||
|
|
||||||
def export_documents(
|
def export_documents(
|
||||||
conv_results: Iterable[ConversionResult],
|
conv_results: Iterable[ConversionResult],
|
||||||
output_dir: Path,
|
output_dir: Path,
|
||||||
@ -152,6 +164,9 @@ def convert(
|
|||||||
backend: Annotated[
|
backend: Annotated[
|
||||||
Backend, typer.Option(..., help="The PDF backend to use.")
|
Backend, typer.Option(..., help="The PDF backend to use.")
|
||||||
] = Backend.DOCLING,
|
] = Backend.DOCLING,
|
||||||
|
ocr_engine: Annotated[
|
||||||
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||||
|
] = OcrEngine.EASYOCR,
|
||||||
output: Annotated[
|
output: Annotated[
|
||||||
Path, typer.Option(..., help="Output directory where results are saved.")
|
Path, typer.Option(..., help="Output directory where results are saved.")
|
||||||
] = Path("."),
|
] = Path("."),
|
||||||
@ -191,8 +206,19 @@ def convert(
|
|||||||
case _:
|
case _:
|
||||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||||
|
|
||||||
|
match ocr_engine:
|
||||||
|
case OcrEngine.EASYOCR:
|
||||||
|
ocr_options = EasyOcrOptions()
|
||||||
|
case OcrEngine.TESSERACT:
|
||||||
|
ocr_options = TesseractOcrOptions()
|
||||||
|
case OcrEngine.TESSEROCR:
|
||||||
|
ocr_options = TesserOcrOptions()
|
||||||
|
case _:
|
||||||
|
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||||
|
|
||||||
pipeline_options = PipelineOptions(
|
pipeline_options = PipelineOptions(
|
||||||
do_ocr=ocr,
|
do_ocr=ocr,
|
||||||
|
ocr_options=ocr_options,
|
||||||
do_table_structure=True,
|
do_table_structure=True,
|
||||||
)
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||||
|
@ -32,11 +32,15 @@ class TesseractOcrOptions(OcrOptions):
|
|||||||
kind: Literal["tesseract"] = "tesseract"
|
kind: Literal["tesseract"] = "tesseract"
|
||||||
|
|
||||||
|
|
||||||
|
class TesserOcrOptions(OcrOptions):
|
||||||
|
kind: Literal["tesseract"] = "tesserocr"
|
||||||
|
|
||||||
|
|
||||||
class PipelineOptions(BaseModel):
|
class PipelineOptions(BaseModel):
|
||||||
do_table_structure: bool = True # True: perform table structure extraction
|
do_table_structure: bool = True # True: perform table structure extraction
|
||||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||||
|
|
||||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||||
ocr_options: Union[EasyOcrOptions, TesseractOcrOptions] = Field(
|
ocr_options: Union[EasyOcrOptions, TesseractOcrOptions, TesserOcrOptions] = Field(
|
||||||
EasyOcrOptions(), discriminator="kind"
|
EasyOcrOptions(), discriminator="kind"
|
||||||
)
|
)
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
from typing import Iterable
|
from subprocess import PIPE, Popen
|
||||||
|
from typing import Iterable, Tuple
|
||||||
|
|
||||||
import numpy
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||||
@ -10,7 +10,8 @@ from docling.models.base_ocr_model import BaseOcrModel
|
|||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
class TesseractModel(BaseOcrModel):
|
|
||||||
|
class TesseractOcrModel(BaseOcrModel):
|
||||||
|
|
||||||
def __init__(self, enabled: bool, options: TesseractOcrOptions):
|
def __init__(self, enabled: bool, options: TesseractOcrOptions):
|
||||||
super().__init__(enabled=enabled, options=options)
|
super().__init__(enabled=enabled, options=options)
|
||||||
@ -23,15 +24,15 @@ class TesseractModel(BaseOcrModel):
|
|||||||
self._get_name_and_version()
|
self._get_name_and_version()
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
_log.error(f"Tesseract is not supported, aborting ...")
|
_log.error(f"Tesseract is not available, aborting ...")
|
||||||
self.enabled = False
|
self.enabled = False
|
||||||
|
|
||||||
def _get_name_and_version(self) -> Tuple[str, str]:
|
def _get_name_and_version(self) -> Tuple[str, str]:
|
||||||
|
|
||||||
if self._name!=None and self._version!=None:
|
if self._name != None and self._version != None:
|
||||||
return self._name, self._version
|
return self._name, self._version
|
||||||
|
|
||||||
cmd = ['tesseract', '--version']
|
cmd = ["tesseract", "--version"]
|
||||||
|
|
||||||
proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
|
proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
|
||||||
stdout, stderr = proc.communicate()
|
stdout, stderr = proc.communicate()
|
||||||
@ -40,13 +41,17 @@ class TesseractModel(BaseOcrModel):
|
|||||||
|
|
||||||
# HACK: Windows versions of Tesseract output the version to stdout, Linux versions
|
# HACK: Windows versions of Tesseract output the version to stdout, Linux versions
|
||||||
# to stderr, so check both.
|
# to stderr, so check both.
|
||||||
version_line = (stdout.decode('utf8').strip() or stderr.decode('utf8').strip()).split('\n')[0].strip()
|
version_line = (
|
||||||
|
(stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
|
||||||
|
.split("\n")[0]
|
||||||
|
.strip()
|
||||||
|
)
|
||||||
|
|
||||||
# If everything else fails...
|
# If everything else fails...
|
||||||
if not version_line:
|
if not version_line:
|
||||||
version_line = 'tesseract XXX'
|
version_line = "tesseract XXX"
|
||||||
|
|
||||||
name, version = version_line.split(' ')
|
name, version = version_line.split(" ")
|
||||||
|
|
||||||
self._name = name
|
self._name = name
|
||||||
self._version = version
|
self._version = version
|
||||||
@ -58,26 +63,25 @@ class TesseractModel(BaseOcrModel):
|
|||||||
cmd = ["tesseract"]
|
cmd = ["tesseract"]
|
||||||
|
|
||||||
if languages:
|
if languages:
|
||||||
cmd += ['-l', '+'.join(languages)]
|
cmd += ["-l", "+".join(languages)]
|
||||||
|
|
||||||
cmd += [ifilename, 'stdout', "tsv"]
|
cmd += [ifilename, "stdout", "tsv"]
|
||||||
logger.info("command: {}".format(" ".join(cmd)))
|
_log.info("command: {}".format(" ".join(cmd)))
|
||||||
|
|
||||||
proc = Popen(cmd, stdout=PIPE)
|
proc = Popen(cmd, stdout=PIPE)
|
||||||
output, _ = proc.communicate()
|
output, _ = proc.communicate()
|
||||||
|
|
||||||
# Read the TSV file generated by Tesseract
|
# Read the TSV file generated by Tesseract
|
||||||
df = pd.read_csv('output_file_name.tsv', sep='\t')
|
df = pd.read_csv("output_file_name.tsv", sep="\t")
|
||||||
|
|
||||||
# Display the dataframe (optional)
|
# Display the dataframe (optional)
|
||||||
print(df.head())
|
print(df.head())
|
||||||
|
|
||||||
# Filter rows that contain actual text (ignore header or empty rows)
|
# Filter rows that contain actual text (ignore header or empty rows)
|
||||||
df_filtered = df[df['text'].notnull() & (df['text'].str.strip() != '')]
|
df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
|
||||||
|
|
||||||
return df_filtered
|
return df_filtered
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
|
||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
@ -111,13 +115,13 @@ class TesseractModel(BaseOcrModel):
|
|||||||
text = row["text"]
|
text = row["text"]
|
||||||
conf = row["confidence"]
|
conf = row["confidence"]
|
||||||
|
|
||||||
l = float(row['left'])
|
l = float(row["left"])
|
||||||
t = float(row['top'])
|
t = float(row["top"])
|
||||||
w = float(row['width'])
|
w = float(row["width"])
|
||||||
h = float(row['height'])
|
h = float(row["height"])
|
||||||
|
|
||||||
b = t-h
|
b = t - h
|
||||||
r = l+w
|
r = l + w
|
||||||
|
|
||||||
cell = OcrCell(
|
cell = OcrCell(
|
||||||
id=ix,
|
id=ix,
|
||||||
|
@ -4,11 +4,13 @@ from docling.datamodel.pipeline_options import (
|
|||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
PipelineOptions,
|
PipelineOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
|
TesserOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
|
from docling.models.tesseract_model import TesseractOcrModel
|
||||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||||
|
|
||||||
|
|
||||||
@ -26,6 +28,11 @@ class StandardModelPipeline(BaseModelPipeline):
|
|||||||
options=pipeline_options.ocr_options,
|
options=pipeline_options.ocr_options,
|
||||||
)
|
)
|
||||||
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
|
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
|
||||||
|
ocr_model = TesseractOcrModel(
|
||||||
|
enabled=pipeline_options.do_ocr,
|
||||||
|
options=pipeline_options.ocr_options,
|
||||||
|
)
|
||||||
|
elif isinstance(pipeline_options.ocr_options, TesserOcrOptions):
|
||||||
raise NotImplemented()
|
raise NotImplemented()
|
||||||
# TODO
|
# TODO
|
||||||
# ocr_model = TesseractOcrModel(
|
# ocr_model = TesseractOcrModel(
|
||||||
|
@ -8,6 +8,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
|
from docling.datamodel.pipeline_options import TesseractOcrOptions, TesserOcrOptions
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -115,6 +116,27 @@ def main():
|
|||||||
# pdf_backend=DoclingParseDocumentBackend,
|
# pdf_backend=DoclingParseDocumentBackend,
|
||||||
# )
|
# )
|
||||||
|
|
||||||
|
# Docling Parse with Tesseract OCR
|
||||||
|
# ----------------------
|
||||||
|
pipeline_options = PipelineOptions()
|
||||||
|
pipeline_options.do_ocr = True
|
||||||
|
pipeline_options.do_table_structure = True
|
||||||
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
pipeline_options.ocr_options = TesseractOcrOptions()
|
||||||
|
|
||||||
|
# Docling Parse with TesserOCR
|
||||||
|
# ----------------------
|
||||||
|
# pipeline_options = PipelineOptions()
|
||||||
|
# pipeline_options.do_ocr=True
|
||||||
|
# pipeline_options.do_table_structure=True
|
||||||
|
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
# pipeline_options.ocr_options = TesserOcrOptions()
|
||||||
|
|
||||||
|
doc_converter = DocumentConverter(
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
pdf_backend=DoclingParseDocumentBackend,
|
||||||
|
)
|
||||||
|
|
||||||
###########################################################################
|
###########################################################################
|
||||||
|
|
||||||
# Define input files
|
# Define input files
|
||||||
|
@ -12,8 +12,7 @@ GENERATE = False
|
|||||||
|
|
||||||
# Debug
|
# Debug
|
||||||
def save_output(pdf_path: Path, doc_result: ConversionResult):
|
def save_output(pdf_path: Path, doc_result: ConversionResult):
|
||||||
r"""
|
r""" """
|
||||||
"""
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user