fixed conflicts

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-10-02 17:01:34 +02:00
parent 0b76211eed
commit a3e2cf5473
4 changed files with 25 additions and 3 deletions

View File

@ -30,7 +30,7 @@ class EasyOcrOptions(OcrOptions):
# OCR options variant whose `kind` tag is pinned to "tesseract".
class TesseractOcrOptions(OcrOptions):
# Tag value for this variant; the Literal type admits only "tesseract".
kind: Literal["tesseract"] = "tesseract"
# Default list of OCR language codes.
# NOTE(review): Tesseract language data is normally named with three-letter
# codes ("fra", "deu", "spa", "eng") — confirm that whatever consumes `lang`
# accepts or maps these two-letter codes.
lang: List[str] = ["fr", "de", "es", "en"]
class TesserOcrOptions(OcrOptions):
    """OCR options variant whose ``kind`` tag is ``"tesserocr"``."""

    # The Literal annotation and the default must name the same value.
    # "tesseract" is the tag of TesseractOcrOptions, so reusing it here would
    # make the annotation contradict the default and leave the two option
    # classes indistinguishable by their declared `kind` type.
    kind: Literal["tesserocr"] = "tesserocr"

View File

@ -10,7 +10,6 @@ from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__)
class TesseractOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: TesseractOcrOptions):

View File

@ -8,6 +8,7 @@ from docling.datamodel.pipeline_options import (
)
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.easyocr_model import EasyOcrModel
from docling.models.tesseract_model import TesseractOCRModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_model import TesseractOcrModel

View File

@ -47,7 +47,28 @@ def get_pdf_paths():
return pdf_files
def get_converter():
def get_easyocr_converter():
    """Build a ``DocumentConverter`` with OCR and table structure enabled.

    Returns:
        A ``DocumentConverter`` configured with the pipeline options set
        below and ``DoclingParseDocumentBackend`` as its PDF backend.
    """
    # NOTE(review): these EasyOCR options are constructed but never passed to
    # the pipeline options or to the converter — confirm whether they were
    # meant to be wired in.
    ocr_options = EasyOcrOptions()

    opts = PipelineOptions()
    # Debug
    opts.do_ocr = True
    opts.do_table_structure = True
    opts.table_structure_options.do_cell_matching = True

    return DocumentConverter(
        pipeline_options=opts,
        pdf_backend=DoclingParseDocumentBackend,
    )
def get_tesseract_converter():
pipeline_options = PipelineOptions()
# Debug
@ -63,6 +84,7 @@ def get_converter():
return converter
def test_e2e_conversions():
pdf_paths = get_pdf_paths()