fix: main: Introduce format options for Image with the same pdf pipeline_options.

Add RapidOcrOptions to the Union of ocr_options for PdfPipelineOptions

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-12-08 18:32:08 +01:00
parent c830b92b2e
commit e125b9b24d
2 changed files with 5 additions and 3 deletions

View File

@ -342,11 +342,13 @@ def convert(
else:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
pdf_format_option = PdfFormatOption(
pipeline_options=pipeline_options,
backend=backend, # pdf_backend
)
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}
doc_converter = DocumentConverter(
allowed_formats=from_formats,

View File

@ -143,7 +143,7 @@ class PdfPipelineOptions(PipelineOptions):
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[
EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions, RapidOcrOptions
] = Field(EasyOcrOptions(), discriminator="kind")
images_scale: float = 1.0