mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
fixed conflicts
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
0b76211eed
commit
a3e2cf5473
@ -30,7 +30,7 @@ class EasyOcrOptions(OcrOptions):
|
|||||||
|
|
||||||
class TesseractOcrOptions(OcrOptions):
|
class TesseractOcrOptions(OcrOptions):
|
||||||
kind: Literal["tesseract"] = "tesseract"
|
kind: Literal["tesseract"] = "tesseract"
|
||||||
|
lang: List[str] = ["fr", "de", "es", "en"]
|
||||||
|
|
||||||
class TesserOcrOptions(OcrOptions):
|
class TesserOcrOptions(OcrOptions):
|
||||||
kind: Literal["tesseract"] = "tesserocr"
|
kind: Literal["tesseract"] = "tesserocr"
|
||||||
|
@ -10,7 +10,6 @@ from docling.models.base_ocr_model import BaseOcrModel
|
|||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class TesseractOcrModel(BaseOcrModel):
|
class TesseractOcrModel(BaseOcrModel):
|
||||||
|
|
||||||
def __init__(self, enabled: bool, options: TesseractOcrOptions):
|
def __init__(self, enabled: bool, options: TesseractOcrOptions):
|
||||||
|
@ -8,6 +8,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
)
|
)
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
|
from docling.models.tesseract_model import TesseractOCRModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
from docling.models.tesseract_model import TesseractOcrModel
|
from docling.models.tesseract_model import TesseractOcrModel
|
||||||
|
@ -47,7 +47,28 @@ def get_pdf_paths():
|
|||||||
return pdf_files
|
return pdf_files
|
||||||
|
|
||||||
|
|
||||||
def get_converter():
|
def get_easyocr_converter():
|
||||||
|
|
||||||
|
ocr_options = EasyOcrOptions(
|
||||||
|
|
||||||
|
)
|
||||||
|
|
||||||
|
pipeline_options = PipelineOptions()
|
||||||
|
# Debug
|
||||||
|
pipeline_options.do_ocr = True
|
||||||
|
pipeline_options.do_table_structure = True
|
||||||
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
converter = DocumentConverter(
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
pdf_backend=DoclingParseDocumentBackend,
|
||||||
|
)
|
||||||
|
|
||||||
|
return converter
|
||||||
|
|
||||||
|
def get_tesseract_converter():
|
||||||
|
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options = PipelineOptions()
|
||||||
# Debug
|
# Debug
|
||||||
@ -63,6 +84,7 @@ def get_converter():
|
|||||||
return converter
|
return converter
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_e2e_conversions():
|
def test_e2e_conversions():
|
||||||
|
|
||||||
pdf_paths = get_pdf_paths()
|
pdf_paths = get_pdf_paths()
|
||||||
|
Loading…
Reference in New Issue
Block a user