fix: main: Introduce format options for Image inputs that reuse the same PDF pipeline_options.

Add RapidOcrOptions to the ocr_options Union of PdfPipelineOptions.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-12-08 18:32:08 +01:00
parent c830b92b2e
commit e125b9b24d
2 changed files with 5 additions and 3 deletions

View File

@@ -342,11 +342,13 @@ def convert(
  else:
  raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
- format_options: Dict[InputFormat, FormatOption] = {
- InputFormat.PDF: PdfFormatOption(
+ pdf_format_option = PdfFormatOption(
  pipeline_options=pipeline_options,
  backend=backend, # pdf_backend
  )
+ format_options: Dict[InputFormat, FormatOption] = {
+ InputFormat.PDF: pdf_format_option,
+ InputFormat.IMAGE: pdf_format_option,
  }
  doc_converter = DocumentConverter(
  allowed_formats=from_formats,

View File

@@ -143,7 +143,7 @@ class PdfPipelineOptions(PipelineOptions):
  table_structure_options: TableStructureOptions = TableStructureOptions()
  ocr_options: Union[
- EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
+ EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions, RapidOcrOptions
  ] = Field(EasyOcrOptions(), discriminator="kind")
  images_scale: float = 1.0