diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 8e92e76e..8c88acc5 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -9,6 +9,11 @@ jobs:
python-version: ['3.10', '3.11', '3.12']
steps:
- uses: actions/checkout@v3
+ - name: Install tesseract
+ run: sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config
+ - name: Set TESSDATA_PREFIX
+ run: |
+ echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
- uses: ./.github/actions/setup-poetry
with:
python-version: ${{ matrix.python-version }}
@@ -32,4 +37,4 @@ jobs:
poetry run python "$file" || exit 1
done
- name: Build with poetry
- run: poetry build
\ No newline at end of file
+ run: poetry build
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94e773c2..1a8bc4fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## [v1.19.0](https://github.com/DS4SD/docling/releases/tag/v1.19.0) - 2024-10-08
+
+### Feature
+
+* Add options for choosing OCR engines ([#118](https://github.com/DS4SD/docling/issues/118)) ([`f96ea86`](https://github.com/DS4SD/docling/commit/f96ea86a00fd1aafaa57025e46b5288b43958725))
+
## [v1.18.0](https://github.com/DS4SD/docling/releases/tag/v1.18.0) - 2024-10-03
### Feature
diff --git a/README.md b/README.md
index 53990a5a..1d72b44a 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu
```
+
+ Alternative OCR engines
+
+ Docling supports multiple OCR engines for processing scanned documents. The current version provides
+ the following engines.
+
+ | Engine | Installation | Usage |
+ | ------ | ------------ | ----- |
+ | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
+ | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
+ | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
+
+ The Docling `DocumentConverter` lets you choose the OCR engine via the `ocr_options` setting. For example:
+
+ ```python
+ from docling.datamodel.base_models import ConversionStatus
+ from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
+ from docling.document_converter import DocumentConverter
+
+ pipeline_options = PipelineOptions()
+ pipeline_options.do_ocr = True
+ pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract
+
+ doc_converter = DocumentConverter(
+ pipeline_options=pipeline_options,
+ )
+ ```
+
+ #### Tesseract installation
+
+ [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
+ on most operating systems. For using this engine with Docling, Tesseract must be installed on your
+ system, using the packaging tool of your choice. Below we provide example commands.
+ After installing Tesseract you are expected to provide the path to its language files using the
+ `TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
+
+ For macOS, we recommend using [Homebrew](https://brew.sh/).
+
+ ```console
+ brew install tesseract leptonica pkg-config
+ TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
+ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+ ```
+
+ For Debian-based systems.
+
+ ```console
+ apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
+ TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
+ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+ ```
+
+ For RHEL systems.
+
+ ```console
+ dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
+ TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
+ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+ ```
+
+ #### Linking to Tesseract
+ The most efficient usage of the Tesseract library is via linking. Docling is using
+ the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.
+
+ If you run into problems installing Tesserocr, we suggest using the following
+ installation options:
+
+ ```console
+ pip uninstall tesserocr
+ pip install --no-binary :all: tesserocr
+ ```
+
+
Docling development setup
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 99452076..2a391d5c 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import (
+ EasyOcrOptions,
+ PdfPipelineOptions,
+ TesseractCliOcrOptions,
+ TesseractOcrOptions,
+)
from docling.document_converter import DocumentConverter, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -53,6 +58,13 @@ class Backend(str, Enum):
DOCLING = "docling"
+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    """OCR engines selectable through the `ocr_engine` option of `convert`."""
+
+    # Selects `EasyOcrOptions` (the default engine).
+    EASYOCR = "easyocr"
+    # Selects `TesseractCliOcrOptions`.
+    TESSERACT_CLI = "tesseract_cli"
+    # Selects `TesseractOcrOptions`.
+    TESSERACT = "tesseract"
+
+
def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
@@ -152,6 +164,9 @@ def convert(
backend: Annotated[
Backend, typer.Option(..., help="The PDF backend to use.")
] = Backend.DOCLING,
+ ocr_engine: Annotated[
+ OcrEngine, typer.Option(..., help="The OCR engine to use.")
+ ] = OcrEngine.EASYOCR,
output: Annotated[
Path, typer.Option(..., help="Output directory where results are saved.")
] = Path("."),
@@ -191,8 +206,19 @@ def convert(
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
+    # Translate the CLI choice into the matching OCR options model.
+    match ocr_engine:
+        case OcrEngine.EASYOCR:
+            ocr_options = EasyOcrOptions()
+        case OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions()
+        case OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions()
+        case _:
+            # Report the OCR engine here, not the PDF backend.
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+
pipeline_options = PdfPipelineOptions(
do_ocr=ocr,
+ ocr_options=ocr_options,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 0098288e..4be6fcec 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,9 +1,9 @@
import warnings
from enum import Enum, auto
from pathlib import Path
-from typing import Annotated, Optional, Union
+from typing import Annotated, List, Literal, Optional, Union
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field, model_validator
class TableFormerMode(str, Enum):
@@ -21,6 +21,44 @@ class TableStructureOptions(BaseModel):
mode: TableFormerMode = TableFormerMode.FAST
+class OcrOptions(BaseModel):
+    """Common base for the per-engine OCR option models."""
+
+    # Engine tag; each subclass pins it to a Literal so that it can serve as
+    # the discriminator of `PdfPipelineOptions.ocr_options`.
+    kind: str
+
+
+class EasyOcrOptions(OcrOptions):
+    """Options for the EasyOCR engine."""
+
+    kind: Literal["easyocr"] = "easyocr"
+    # Language codes handed to `easyocr.Reader` as `lang_list`.
+    lang: List[str] = ["fr", "de", "es", "en"]
+    use_gpu: bool = True  # same default as easyocr.Reader
+    # Handed to `easyocr.Reader` as `model_storage_directory`.
+    model_storage_directory: Optional[str] = None
+    download_enabled: bool = True  # same default as easyocr.Reader
+
+    model_config = ConfigDict(
+        # Unknown fields are rejected.
+        extra="forbid",
+        # NOTE(review): presumably lifts pydantic's reserved "model_" prefix so
+        # that `model_storage_directory` is accepted without a warning -- confirm.
+        protected_namespaces=(),
+    )
+
+
+class TesseractCliOcrOptions(OcrOptions):
+    """Options for running Tesseract through its command-line executable."""
+
+    kind: Literal["tesseract"] = "tesseract"
+    # Language codes; joined with "+" and passed via `-l`.
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    # Executable that gets invoked.
+    tesseract_cmd: str = "tesseract"
+    # When set, passed via `--tessdata-dir`.
+    path: Optional[str] = None
+
+    model_config = ConfigDict(
+        # Unknown fields are rejected.
+        extra="forbid",
+    )
+
+
+class TesseractOcrOptions(OcrOptions):
+    """Options for using Tesseract through the `tesserocr` bindings."""
+
+    kind: Literal["tesserocr"] = "tesserocr"
+    # Language codes; joined with "+" and passed as `lang` to `PyTessBaseAPI`.
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    # When set, passed as `path` to `PyTessBaseAPI`.
+    path: Optional[str] = None
+
+    model_config = ConfigDict(
+        # Unknown fields are rejected.
+        extra="forbid",
+    )
+
+
class PipelineOptions(BaseModel): ...
@@ -30,6 +68,9 @@ class PdfPipelineOptions(PipelineOptions):
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
+ ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+ Field(EasyOcrOptions(), discriminator="kind")
+ )
keep_page_images: Annotated[
bool,
diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
index ea0feb82..aea7755b 100644
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -10,15 +10,15 @@ from rtree import index
from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import OcrCell, Page
-from docling.models.abstract_model import AbstractPageModel
+from docling.datamodel.pipeline_options import OcrOptions
_log = logging.getLogger(__name__)
-class BaseOcrModel(AbstractPageModel):
- def __init__(self, config):
- self.config = config
- self.enabled = config["enabled"]
+class BaseOcrModel:
+ def __init__(self, enabled: bool, options: OcrOptions):
+ self.enabled = enabled
+ self.options = options
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
index 3bc1f89d..9408076b 100644
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -5,21 +5,33 @@ import numpy
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__)
class EasyOcrModel(BaseOcrModel):
- def __init__(self, config):
- super().__init__(config)
+ def __init__(self, enabled: bool, options: EasyOcrOptions):
+ super().__init__(enabled=enabled, options=options)
+ self.options: EasyOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
if self.enabled:
- import easyocr
+ try:
+ import easyocr
+ except ImportError:
+ raise ImportError(
+ "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
+ "Alternatively, Docling has support for other OCR engines. See the documentation."
+ )
- self.reader = easyocr.Reader(config["lang"])
+            self.reader = easyocr.Reader(
+                lang_list=self.options.lang,
+                # Forward the configured GPU choice; without this the option
+                # is ignored and the library default is always used.
+                gpu=self.options.use_gpu,
+                model_storage_directory=self.options.model_storage_directory,
+                download_enabled=self.options.download_enabled,
+            )
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
@@ -32,6 +44,9 @@ class EasyOcrModel(BaseOcrModel):
all_ocr_cells = []
for ocr_rect in ocr_rects:
+ # Skip zero area boxes
+ if ocr_rect.area() == 0:
+ continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
new file mode 100644
index 00000000..d0240f86
--- /dev/null
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -0,0 +1,168 @@
+import io
+import logging
+import tempfile
+from subprocess import PIPE, Popen
+from typing import Iterable, Tuple
+
+import pandas as pd
+from docling_core.types.experimental import BoundingBox, CoordOrigin
+
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesseractOcrCliModel(BaseOcrModel):
+
+    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
+        """Set up the CLI-backed OCR model and probe the Tesseract executable.
+
+        Args:
+            enabled: When False, the availability probe is skipped.
+            options: Tesseract CLI settings; `tesseract_cmd` is the command
+                that gets probed.
+
+        Raises:
+            RuntimeError: If OCR is enabled and the version probe fails.
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractCliOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        # Filled in by the first successful version probe.
+        self._name = None
+        self._version = None
+
+        if not self.enabled:
+            return
+
+        try:
+            self._get_name_and_version()
+        except Exception as exc:
+            message = (
+                f"Tesseract is not available, aborting: {exc} "
+                "Install tesseract on your system and the tesseract binary is discoverable. "
+                "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation."
+            )
+            raise RuntimeError(message)
+
+    def _get_name_and_version(self) -> Tuple[str, str]:
+        """Return the program name and version printed by `<cmd> --version`.
+
+        The result is cached on the instance, so the command runs at most
+        once per successful probe.
+
+        Returns:
+            A `(name, version)` tuple taken from the first output line.
+        """
+        if self._name is not None and self._version is not None:
+            return self._name, self._version
+
+        cmd = [self.options.tesseract_cmd, "--version"]
+
+        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
+        # communicate() already waits for the process to terminate.
+        stdout, stderr = proc.communicate()
+
+        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
+        # to stderr, so check both.
+        version_line = (
+            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
+            .split("\n")[0]
+            .strip()
+        )
+
+        # If everything else fails...
+        if not version_line:
+            version_line = "tesseract XXX"
+
+        # Split on the first space only, so a line with extra tokens (or with
+        # none) no longer breaks the tuple unpacking.
+        name, _, version = version_line.partition(" ")
+
+        self._name = name
+        self._version = version
+
+        return name, version
+
+    def _run_tesseract(self, ifilename: str) -> pd.DataFrame:
+        """Run Tesseract on an image file and return its TSV rows that carry text.
+
+        Args:
+            ifilename: Path of the image handed to the Tesseract command.
+
+        Returns:
+            The parsed TSV output, restricted to rows whose `text` column is
+            neither missing nor blank.
+        """
+        import csv  # local import: only needed for the quoting constant
+
+        cmd = [self.options.tesseract_cmd]
+
+        if self.options.lang is not None and len(self.options.lang) > 0:
+            cmd.append("-l")
+            cmd.append("+".join(self.options.lang))
+        if self.options.path is not None:
+            cmd.append("--tessdata-dir")
+            cmd.append(self.options.path)
+
+        cmd += [ifilename, "stdout", "tsv"]
+        _log.info("command: {}".format(" ".join(cmd)))
+
+        proc = Popen(cmd, stdout=PIPE)
+        output, _ = proc.communicate()
+
+        # Decode the byte string to a regular string
+        decoded_data = output.decode("utf-8")
+
+        # Read the TSV emitted by Tesseract. Quoting is disabled because a
+        # recognised '"' is ordinary text, not a CSV quote, and `text` is
+        # forced to str so numeric-only output still supports `.str`.
+        df = pd.read_csv(
+            io.StringIO(decoded_data),
+            sep="\t",
+            quoting=csv.QUOTE_NONE,
+            dtype={"text": str},
+        )
+
+        # Filter rows that contain actual text (ignore header or empty rows)
+        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
+
+        return df_filtered
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """OCR each page with the Tesseract CLI and append the resulting cells.
+
+        Args:
+            page_batch: Pages to process.
+
+        Yields:
+            Every input page. When the model is enabled, the OCR cells kept
+            by `filter_ocr_cells` have been appended to `page.cells`.
+        """
+        import os  # local import: used only for temp-file cleanup
+
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+
+                # Reserve a file name and close the handle before writing: a
+                # still-open NamedTemporaryFile cannot be reopened by name on
+                # Windows, so the file is cleaned up manually instead.
+                with tempfile.NamedTemporaryFile(
+                    suffix=".png", delete=False
+                ) as image_file:
+                    fname = image_file.name
+                try:
+                    high_res_image.save(fname)
+                    df = self._run_tesseract(fname)
+                finally:
+                    os.remove(fname)
+
+                for ix, row in df.iterrows():
+                    text = row["text"]
+                    conf = row["conf"]
+
+                    # Pixel box inside the cropped, upscaled image.
+                    left = float(row["left"])
+                    top = float(row["top"])
+                    right = left + float(row["width"])
+                    bottom = top + float(row["height"])
+
+                    cell = OcrCell(
+                        id=ix,
+                        text=text,
+                        confidence=conf / 100.0,
+                        # Undo the upscaling, then shift by the crop origin.
+                        bbox=BoundingBox.from_tuple(
+                            coord=(
+                                (left / self.scale) + ocr_rect.l,
+                                (top / self.scale) + ocr_rect.t,
+                                (right / self.scale) + ocr_rect.l,
+                                (bottom / self.scale) + ocr_rect.t,
+                            ),
+                            origin=CoordOrigin.TOPLEFT,
+                        ),
+                    )
+                    all_ocr_cells.append(cell)
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
new file mode 100644
index 00000000..5173c1bf
--- /dev/null
+++ b/docling/models/tesseract_ocr_model.py
@@ -0,0 +1,123 @@
+import logging
+from typing import Iterable
+
+import numpy
+from docling_core.types.experimental import BoundingBox, CoordOrigin
+
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.pipeline_options import (
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesseractOcrModel(BaseOcrModel):
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
+        """Set up the tesserocr-backed OCR model.
+
+        Args:
+            enabled: When False, tesserocr is neither imported nor initialised.
+            options: Languages and optional tessdata path for the Tesseract API.
+
+        Raises:
+            ImportError: If OCR is enabled and tesserocr cannot be imported
+                or cannot report its Tesseract version.
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
+
+        if self.enabled:
+            setup_errmsg = (
+                "tesserocr is not correctly installed. "
+                "Please install it via `pip install tesserocr` to use this OCR engine. "
+                "Note that tesserocr might have to be manually compiled for working with "
+                "your Tesseract installation. The Docling documentation provides examples for it. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation."
+            )
+            try:
+                import tesserocr
+            except ImportError as exc:
+                raise ImportError(setup_errmsg) from exc
+
+            try:
+                tesseract_version = tesserocr.tesseract_version()
+                _log.debug("Initializing TesserOCR: %s", tesseract_version)
+            except Exception as exc:
+                # A broken native build is reported like a missing install.
+                raise ImportError(setup_errmsg) from exc
+
+            # Initialize the tesseractAPI; `path` is passed only when
+            # configured so the library default applies otherwise.
+            api_kwargs = dict(
+                lang="+".join(self.options.lang),
+                psm=tesserocr.PSM.AUTO,
+                init=True,
+                oem=tesserocr.OEM.DEFAULT,
+            )
+            if self.options.path is not None:
+                api_kwargs["path"] = self.options.path
+            self.reader = tesserocr.PyTessBaseAPI(**api_kwargs)
+            self.reader_RIL = tesserocr.RIL
+
+    def __del__(self):
+        """Finalize the Tesseract API handle if one was created."""
+        # getattr guards against instances whose __init__ failed before
+        # `reader` was assigned; the finalizer still runs for those.
+        reader = getattr(self, "reader", None)
+        if reader is not None:
+            # Finalize the tesseractAPI
+            reader.End()
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """OCR each page through tesserocr and append the resulting cells.
+
+        Args:
+            page_batch: Pages to process.
+
+        Yields:
+            Every input page. When the model is enabled, the OCR cells kept
+            by `filter_ocr_cells` have been appended to `page.cells`.
+        """
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+
+                # Retrieve text snippets with their bounding boxes
+                self.reader.SetImage(high_res_image)
+                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
+
+                for ix, (_, box, _, _) in enumerate(boxes):
+                    # Restrict recognition to this text line.
+                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+
+                    # Extract text within the bounding box
+                    text = self.reader.GetUTF8Text().strip()
+                    # NOTE(review): MeanTextConf() looks like a 0-100 score,
+                    # while the CLI model divides its confidence by 100 --
+                    # confirm which scale OcrCell expects.
+                    confidence = self.reader.MeanTextConf()
+
+                    # Boxes are relative to the cropped, upscaled image: undo
+                    # the scaling and shift by the crop origin, as the CLI
+                    # model does, so cells land at their page position.
+                    left = box["x"] / self.scale + ocr_rect.l
+                    top = box["y"] / self.scale + ocr_rect.t
+                    right = (box["x"] + box["w"]) / self.scale + ocr_rect.l
+                    bottom = (box["y"] + box["h"]) / self.scale + ocr_rect.t
+
+                    all_ocr_cells.append(
+                        OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=confidence,
+                            bbox=BoundingBox.from_tuple(
+                                coord=(left, top, right, bottom),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                    )
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py
index d0ab94bc..659c3c93 100644
--- a/docling/pipeline/standard_pdf_model_pipeline.py
+++ b/docling/pipeline/standard_pdf_model_pipeline.py
@@ -6,13 +6,21 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import (
+ EasyOcrOptions,
+ PdfPipelineOptions,
+ TesseractCliOcrOptions,
+ TesseractOcrOptions,
+)
+from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.page_preprocessing_model import PagePreprocessingModel
from docling.models.table_structure_model import TableStructureModel
+from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
+from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
_log = logging.getLogger(__name__)
@@ -31,16 +39,32 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
self.artifacts_path = Path(artifacts_path)
self.glm_model = GlmModel(config={})
+ ocr_model: BaseOcrModel
+ if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
+ ocr_model = EasyOcrModel(
+ enabled=pipeline_options.do_ocr,
+ options=pipeline_options.ocr_options,
+ )
+ elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
+ ocr_model = TesseractOcrCliModel(
+ enabled=pipeline_options.do_ocr,
+ options=pipeline_options.ocr_options,
+ )
+ elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
+ ocr_model = TesseractOcrModel(
+ enabled=pipeline_options.do_ocr,
+ options=pipeline_options.ocr_options,
+ )
+ else:
+ raise RuntimeError(
+ f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
+ )
+
self.model_pipe = [
PagePreprocessingModel(
config={"images_scale": pipeline_options.images_scale}
),
- EasyOcrModel(
- config={
- "lang": ["fr", "de", "es", "en"],
- "enabled": pipeline_options.do_ocr,
- }
- ),
+ ocr_model,
LayoutModel(
config={
"artifacts_path": artifacts_path
diff --git a/examples/custom_convert.py b/examples/custom_convert.py
index 090e49aa..68b52797 100644
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@@ -6,7 +6,11 @@ from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import (
+ PdfPipelineOptions,
+ TesseractCliOcrOptions,
+ TesseractOcrOptions,
+)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
@@ -71,7 +75,7 @@ def main():
# and PDF Backends for various configurations.
# Uncomment one section at the time to see the differences in the output.
- # PyPdfium without OCR
+ # PyPdfium without EasyOCR
# --------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
@@ -83,7 +87,7 @@ def main():
# pdf_backend=PyPdfiumDocumentBackend,
# )
- # PyPdfium with OCR
+ # PyPdfium with EasyOCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
@@ -95,7 +99,7 @@ def main():
# pdf_backend=PyPdfiumDocumentBackend,
# )
- # Docling Parse without OCR
+ # Docling Parse without EasyOCR
# -------------------------
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
@@ -108,7 +112,7 @@ def main():
}
)
- # Docling Parse with OCR
+ # Docling Parse with EasyOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
@@ -120,6 +124,32 @@ def main():
# pdf_backend=DoclingParseDocumentBackend,
# )
+ # Docling Parse with Tesseract
+ # ----------------------
+ # pipeline_options = PipelineOptions()
+ # pipeline_options.do_ocr = True
+ # pipeline_options.do_table_structure = True
+ # pipeline_options.table_structure_options.do_cell_matching = True
+ # pipeline_options.ocr_options = TesseractOcrOptions()
+
+ # doc_converter = DocumentConverter(
+ # pipeline_options=pipeline_options,
+ # pdf_backend=DoclingParseDocumentBackend,
+ # )
+
+ # Docling Parse with Tesseract CLI
+ # ----------------------
+ # pipeline_options = PipelineOptions()
+ # pipeline_options.do_ocr = True
+ # pipeline_options.do_table_structure = True
+ # pipeline_options.table_structure_options.do_cell_matching = True
+ # pipeline_options.ocr_options = TesseractCliOcrOptions()
+
+ # doc_converter = DocumentConverter(
+ # pipeline_options=pipeline_options,
+ # pdf_backend=DoclingParseDocumentBackend,
+ # )
+
###########################################################################
# Define input files
diff --git a/poetry.lock b/poetry.lock
index 45684d25..c4a4e559 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -450,101 +450,116 @@ files = [
[[package]]
name = "charset-normalizer"
-version = "3.3.2"
+version = "3.4.0"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
optional = false
python-versions = ">=3.7.0"
files = [
- {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"},
- {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"},
+ {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"},
+ {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"},
+ {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"},
+ {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"},
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"},
+ {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"},
+ {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"},
+ {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"},
+ {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"},
]
[[package]]
@@ -2493,13 +2508,13 @@ files = [
[[package]]
name = "llama-index-core"
-version = "0.11.16"
+version = "0.11.17"
description = "Interface between LLMs and your data"
optional = false
python-versions = "<4.0,>=3.8.1"
files = [
- {file = "llama_index_core-0.11.16-py3-none-any.whl", hash = "sha256:099ba785e357506fd5a24c1a6b8fa5286366d6c71637649fab0f9126dcea842c"},
- {file = "llama_index_core-0.11.16.tar.gz", hash = "sha256:232a5cebcc73b951d9c663bd30ed59de5356dbd8f9ab88024d19c88bdd1b3254"},
+ {file = "llama_index_core-0.11.17-py3-none-any.whl", hash = "sha256:d65565b54ea55b2db12f9a1cd5c250b770d7e43d3363137cff431a6116ef069c"},
+ {file = "llama_index_core-0.11.17.tar.gz", hash = "sha256:1143baf8d819e27555bdb142abdf2833d3d37731f270f46fa1e07fc4b97116ae"},
]
[package.dependencies]
@@ -6125,6 +6140,41 @@ files = [
doc = ["reno", "sphinx"]
test = ["pytest", "tornado (>=4.5)", "typeguard"]
+[[package]]
+name = "tesserocr"
+version = "2.7.1"
+description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
+optional = true
+python-versions = "*"
+files = [
+ {file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
+ {file = "tesserocr-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bb5d336ebf2cc47cd0d117cadc8b25b2e558f54fb9a2dedaa28a14cb5a6b437"},
+ {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3ff7f6d6b5c12dd31b80842eb0892b661a41ca3edf0e6cc1e54ec2c14552ceef"},
+ {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ae794c5434373f4afa4c7f8b59f19fde810f8caf096d8bb701a4b2f3a6739460"},
+ {file = "tesserocr-2.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0a0895a4d9ff6a34f5a6f203fe0c9899f31d6f2378ae99be80605637b622687b"},
+ {file = "tesserocr-2.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c3187d14b95c866aa1d34cc374a53d583e2168742eefe33347e4790af70338e"},
+ {file = "tesserocr-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec52be3d82136430081427062ad0211a52fc38fa28fe58e216b89f840354f216"},
+ {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:44e71b3e8da36b2567760309398689ea9785ee62db3ff21140a9ea6941a233c4"},
+ {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e31a49d7784e7e52fe656719145c3a872856d67daa9bfb340c2990db00e023e9"},
+ {file = "tesserocr-2.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:37abde15c1c940d691305fd87836e4cad25a1434799729c324bbcd2277bcae44"},
+ {file = "tesserocr-2.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1b6349d35d333d420d24acf1953ad6f1d5613ffcde462c62126b68bdfca12753"},
+ {file = "tesserocr-2.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42f009cde8479f3b339da12a8e419fd9559b64b13bc08a248bd0833c6ae94331"},
+ {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e13204b3b92fac76ece6e33f55eba6335b30e379f4a7b75e285c2ad05762027"},
+ {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:65afdec0c5dc09a4a23a62e65524989cd940af41be1603e251a64ac10de9babf"},
+ {file = "tesserocr-2.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5f59fb072c90bff8aa6a365fc82b747c2668b7b48233901728b155860d1ff9"},
+ {file = "tesserocr-2.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f62d662e3002868384e14e8cd620bdedf34ab9f9fc3ebbce527cfe032a7485ee"},
+ {file = "tesserocr-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e80051812685bd521bc17cb70cf1480ffbb3e54ccc2883e90d5bcda15f8278ea"},
+ {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:2690cb2330fc9349d68ff027cbdac09693fdda36470836b196c04f16dcc99e9d"},
+ {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d01ebd094103451ecb77b6510ade2f6bb064c51413ff35b135f649f3d6067a67"},
+ {file = "tesserocr-2.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8069ae6cd9ea3c056b6a596bc99f501ee9f95d6fd2928fcaffb9777071c210d"},
+ {file = "tesserocr-2.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2d3d23223d0a448877fb91af83c46ce95ff0a497a82fa93e93068148c9712e5"},
+ {file = "tesserocr-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef8a09a44c2e96bab0f40dbf0633767d063680d86b79365b43fc4e1234219694"},
+ {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6e613213ea5b64db06f2cba0b93c3656b7e6aec2d9b2d2e929edf49da7143225"},
+ {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4a8888b765e26680a6e34b8ec09b7bb85a17e08cea76f0661eafe2a84254562a"},
+ {file = "tesserocr-2.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64f25763e56c4c29b808e59b485c930cac46b6a1ac8eadd994086dc40a29d3a1"},
+ {file = "tesserocr-2.7.1.tar.gz", hash = "sha256:3744c5c8bbabf18172849c7731be00dc2e5e44f8c556d37c850e788794ae0af4"},
+]
+
[[package]]
name = "threadpoolctl"
version = "3.5.0"
@@ -7330,7 +7380,10 @@ enabler = ["pytest-enabler (>=2.2)"]
test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
type = ["pytest-mypy"]
+[extras]
+tesserocr = ["tesserocr"]
+
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
-content-hash = "5ef87a880333213955e3ded6bcf0748f6728e4501a98bd5bf9421057de745772"
+content-hash = "71eec93c5fc347a7c0ae0d846d4c2c41ff96255aab218d7d2ba747d1ffed942e"
diff --git a/pyproject.toml b/pyproject.toml
index 41100a47..411550ae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
requests = "^2.32.3"
easyocr = "^1.7"
+tesserocr = { version = "^2.7.1", optional = true }
docling-parse = "^1.4.1"
certifi = ">=2024.7.4"
rtree = "^1.3.0"
@@ -56,6 +57,7 @@ typer = "^0.12.5"
python-docx = "^1.1.2"
python-pptx = "^1.0.2"
beautifulsoup4 = "^4.12.3"
+pandas = "^2.1.4"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -70,7 +72,7 @@ pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"
-pandas-stubs = "^2.2.2.240909"
+pandas-stubs = "^2.1.4.231227"
ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
@@ -85,6 +87,9 @@ langchain-huggingface = "^0.0.3"
langchain-milvus = "^0.1.4"
langchain-text-splitters = "^0.2.4"
+[tool.poetry.extras]
+tesserocr = ["tesserocr"]
+
[tool.poetry.scripts]
docling = "docling.cli.main:app"
diff --git a/tests/data_scanned/ocr_test.doctags.txt b/tests/data_scanned/ocr_test.doctags.txt
new file mode 100644
index 00000000..7cd53510
--- /dev/null
+++ b/tests/data_scanned/ocr_test.doctags.txt
@@ -0,0 +1,3 @@
+
+Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
+
\ No newline at end of file
diff --git a/tests/data_scanned/ocr_test.json b/tests/data_scanned/ocr_test.json
new file mode 100644
index 00000000..775ab8e7
--- /dev/null
+++ b/tests/data_scanned/ocr_test.json
@@ -0,0 +1 @@
+{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.41791534423828, 690.8074951171875, 509.4447021484375, 767.422119140625], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
\ No newline at end of file
diff --git a/tests/data_scanned/ocr_test.md b/tests/data_scanned/ocr_test.md
new file mode 100644
index 00000000..42896546
--- /dev/null
+++ b/tests/data_scanned/ocr_test.md
@@ -0,0 +1 @@
+Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
\ No newline at end of file
diff --git a/tests/data_scanned/ocr_test.pages.json b/tests/data_scanned/ocr_test.pages.json
new file mode 100644
index 00000000..8d0a6a3b
--- /dev/null
+++ b/tests/data_scanned/ocr_test.pages.json
@@ -0,0 +1 @@
+[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling", "bbox": {"l": 74.0, "t": 78.0, "r": 144.0, "b": 96.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "bundles", "bbox": {"l": 150.66666666666666, "t": 78.0, "r": 224.66666666666666, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "PDF", "bbox": {"l": 232.0, "t": 78.0, "r": 269.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "document", "bbox": {"l": 275.0, "t": 78.0, "r": 371.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "conversion", "bbox": {"l": 377.3333333333333, "t": 78.0, "r": 479.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "to", "bbox": {"l": 485.3333333333333, "t": 79.66666666666663, "r": 503.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "JSON", "bbox": {"l": 72.33333333333333, "t": 104.66666666666663, "r": 121.33333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "and", "bbox": {"l": 129.0, "t": 105.0, "r": 162.33333333333334, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Markdown", "bbox": {"l": 170.33333333333334, "t": 105.0, "r": 265.6666666666667, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "in", "bbox": {"l": 273.3333333333333, "t": 105.0, "r": 287.6666666666667, "b": 119.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "an", "bbox": {"l": 294.6666666666667, "t": 108.66666666666663, "r": 316.0, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "easy", "bbox": {"l": 323.0, "t": 108.66666666666663, "r": 365.6666666666667, "b": 123.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "self", "bbox": {"l": 371.0, "t": 105.0, "r": 404.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "contained", 
"bbox": {"l": 410.3333333333333, "t": 105.0, "r": 503.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "package", "bbox": {"l": 73.66666666666667, "t": 131.66666666666663, "r": 150.66666666666666, "b": 150.33333333333337, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 71.41791534423828, "t": 74.49958801269531, "r": 509.4447021484375, "b": 151.11419677734375, "coord_origin": "TOPLEFT"}, "confidence": 0.9238373041152954, "cells": [{"id": 0, "text": "Docling", "bbox": {"l": 74.0, "t": 78.0, "r": 144.0, "b": 96.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "bundles", "bbox": {"l": 150.66666666666666, "t": 78.0, "r": 224.66666666666666, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "PDF", "bbox": {"l": 232.0, "t": 78.0, "r": 269.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "document", "bbox": {"l": 275.0, "t": 78.0, "r": 371.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "conversion", "bbox": {"l": 377.3333333333333, "t": 78.0, "r": 479.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "to", "bbox": {"l": 485.3333333333333, "t": 79.66666666666663, "r": 503.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "JSON", "bbox": {"l": 72.33333333333333, "t": 104.66666666666663, "r": 121.33333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "and", "bbox": {"l": 129.0, "t": 105.0, "r": 162.33333333333334, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Markdown", "bbox": {"l": 170.33333333333334, "t": 105.0, "r": 265.6666666666667, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "in", "bbox": {"l": 273.3333333333333, "t": 105.0, "r": 287.6666666666667, "b": 119.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "an", 
"bbox": {"l": 294.6666666666667, "t": 108.66666666666663, "r": 316.0, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "easy", "bbox": {"l": 323.0, "t": 108.66666666666663, "r": 365.6666666666667, "b": 123.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "self", "bbox": {"l": 371.0, "t": 105.0, "r": 404.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "contained", "bbox": {"l": 410.3333333333333, "t": 105.0, "r": 503.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "package", "bbox": {"l": 73.66666666666667, "t": 131.66666666666663, "r": 150.66666666666666, "b": 150.33333333333337, "coord_origin": "TOPLEFT"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 71.41791534423828, "t": 74.49958801269531, "r": 509.4447021484375, "b": 151.11419677734375, "coord_origin": "TOPLEFT"}, "confidence": 0.9238373041152954, "cells": [{"id": 0, "text": "Docling", "bbox": {"l": 74.0, "t": 78.0, "r": 144.0, "b": 96.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "bundles", "bbox": {"l": 150.66666666666666, "t": 78.0, "r": 224.66666666666666, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "PDF", "bbox": {"l": 232.0, "t": 78.0, "r": 269.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "document", "bbox": {"l": 275.0, "t": 78.0, "r": 371.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "conversion", "bbox": {"l": 377.3333333333333, "t": 78.0, "r": 479.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "to", "bbox": {"l": 485.3333333333333, "t": 79.66666666666663, "r": 503.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "JSON", "bbox": {"l": 
72.33333333333333, "t": 104.66666666666663, "r": 121.33333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "and", "bbox": {"l": 129.0, "t": 105.0, "r": 162.33333333333334, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Markdown", "bbox": {"l": 170.33333333333334, "t": 105.0, "r": 265.6666666666667, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "in", "bbox": {"l": 273.3333333333333, "t": 105.0, "r": 287.6666666666667, "b": 119.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "an", "bbox": {"l": 294.6666666666667, "t": 108.66666666666663, "r": 316.0, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "easy", "bbox": {"l": 323.0, "t": 108.66666666666663, "r": 365.6666666666667, "b": 123.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "self", "bbox": {"l": 371.0, "t": 105.0, "r": 404.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "contained", "bbox": {"l": 410.3333333333333, "t": 105.0, "r": 503.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "package", "bbox": {"l": 73.66666666666667, "t": 131.66666666666663, "r": 150.66666666666666, "b": 150.33333333333337, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 71.41791534423828, "t": 74.49958801269531, "r": 509.4447021484375, "b": 151.11419677734375, "coord_origin": "TOPLEFT"}, "confidence": 0.9238373041152954, "cells": [{"id": 0, "text": "Docling", "bbox": {"l": 74.0, "t": 78.0, "r": 144.0, "b": 96.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "bundles", "bbox": {"l": 150.66666666666666, "t": 78.0, "r": 224.66666666666666, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "PDF", 
"bbox": {"l": 232.0, "t": 78.0, "r": 269.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "document", "bbox": {"l": 275.0, "t": 78.0, "r": 371.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "conversion", "bbox": {"l": 377.3333333333333, "t": 78.0, "r": 479.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "to", "bbox": {"l": 485.3333333333333, "t": 79.66666666666663, "r": 503.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "JSON", "bbox": {"l": 72.33333333333333, "t": 104.66666666666663, "r": 121.33333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "and", "bbox": {"l": 129.0, "t": 105.0, "r": 162.33333333333334, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Markdown", "bbox": {"l": 170.33333333333334, "t": 105.0, "r": 265.6666666666667, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "in", "bbox": {"l": 273.3333333333333, "t": 105.0, "r": 287.6666666666667, "b": 119.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "an", "bbox": {"l": 294.6666666666667, "t": 108.66666666666663, "r": 316.0, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "easy", "bbox": {"l": 323.0, "t": 108.66666666666663, "r": 365.6666666666667, "b": 123.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "self", "bbox": {"l": 371.0, "t": 105.0, "r": 404.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "contained", "bbox": {"l": 410.3333333333333, "t": 105.0, "r": 503.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "package", "bbox": {"l": 73.66666666666667, "t": 131.66666666666663, "r": 150.66666666666666, "b": 150.33333333333337, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained 
package"}], "headers": []}}]
\ No newline at end of file
diff --git a/tests/data_scanned/ocr_test.pdf b/tests/data_scanned/ocr_test.pdf
new file mode 100644
index 00000000..b79f3c28
Binary files /dev/null and b/tests/data_scanned/ocr_test.pdf differ
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
new file mode 100644
index 00000000..804ecd8d
--- /dev/null
+++ b/tests/test_e2e_ocr_conversion.py
@@ -0,0 +1,104 @@
+from pathlib import Path
+from typing import List
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+ EasyOcrOptions,
+ OcrOptions,
+ PdfPipelineOptions,
+ PipelineOptions,
+ TesseractCliOcrOptions,
+ TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+from .verify_utils import verify_conversion_result
+
+GENERATE = False  # flip to True only to regenerate the ground-truth files
+
+
+# Debug
+def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
+    """Dump the renderings of ``doc_result`` into the directory of ``pdf_path``.
+
+    Writes ``<stem>[.<engine>]`` + ``.json``, ``.pages.json``, ``.doctags.txt``
+    and ``.md``; the ``.<engine>`` part is omitted when ``engine`` is None.
+    """
+    import json
+    import os
+
+    infix = f".{engine}" if engine is not None else ""
+    base = os.path.join(pdf_path.parent, f"{pdf_path.stem}{infix}")
+
+    with open(f"{base}.json", "w") as out:
+        json.dump(doc_result.render_as_dict(), out)
+
+    page_dumps = [page.model_dump() for page in doc_result.pages]
+    with open(f"{base}.pages.json", "w") as out:
+        json.dump(page_dumps, out)
+
+    with open(f"{base}.doctags.txt", "w") as out:
+        out.write(doc_result.render_as_doctags())
+
+    with open(f"{base}.md", "w") as out:
+        out.write(doc_result.render_as_markdown())
+
+
+def get_pdf_paths():
+    """Return all PDF files below ``./tests/data_scanned`` as a sorted list."""
+    # The directory is relative, so it is resolved against the current
+    # working directory; rglob also descends into subdirectories.
+    scan_root = Path("./tests/data_scanned")
+    found = scan_root.rglob("*.pdf")
+    return sorted(found)
+
+
+def get_converter(ocr_options: OcrOptions):
+    """Create a DocumentConverter whose PDF pipeline uses ``ocr_options``.
+
+    OCR, table structure and table cell matching are all switched on, and
+    PDFs are read through DoclingParseDocumentBackend.
+    """
+    pdf_pipeline = PdfPipelineOptions()
+    pdf_pipeline.do_ocr = True
+    pdf_pipeline.do_table_structure = True
+    pdf_pipeline.table_structure_options.do_cell_matching = True
+    pdf_pipeline.ocr_options = ocr_options
+
+    pdf_format = PdfFormatOption(
+        pipeline_options=pdf_pipeline,
+        backend=DoclingParseDocumentBackend,
+    )
+    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})
+
+
+def test_e2e_conversions():
+    """Run every scanned PDF through each OCR engine and verify the result.
+
+    One converter is built per engine. GENERATE is forwarded as the
+    ``generate`` flag of verify_conversion_result, and the cell comparison
+    is skipped.
+    """
+    scanned_pdfs = get_pdf_paths()
+
+    engine_options: List[OcrOptions] = [
+        EasyOcrOptions(),
+        TesseractOcrOptions(),
+        TesseractCliOcrOptions(),
+    ]
+
+    for ocr_options in engine_options:
+        print(f"Converting with ocr_engine: {ocr_options.kind}")
+        ocr_converter = get_converter(ocr_options=ocr_options)
+
+        for pdf_path in scanned_pdfs:
+            print(f"converting {pdf_path}")
+            converted: ConversionResult = ocr_converter.convert_single(pdf_path)
+            verify_conversion_result(
+                input_path=pdf_path,
+                doc_result=converted,
+                generate=GENERATE,
+                skip_cells=True,
+            )
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index a0b0f0e6..082b7c78 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):
def verify_conversion_result(
- input_path: Path, doc_result: ConversionResult, generate=False
+ input_path: Path,
+ doc_result: ConversionResult,
+ generate: bool = False,
+ ocr_engine: str = None,
+ skip_cells: bool = False,
):
PageList = TypeAdapter(List[Page])
@@ -143,10 +147,11 @@ def verify_conversion_result(
doc_pred_md = doc_result.render_as_markdown()
doc_pred_dt = doc_result.render_as_doctags()
- pages_path = input_path.with_suffix(".pages.json")
- json_path = input_path.with_suffix(".json")
- md_path = input_path.with_suffix(".md")
- dt_path = input_path.with_suffix(".doctags.txt")
+ engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
+ pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
+ json_path = input_path.with_suffix(f"{engine_suffix}.json")
+ md_path = input_path.with_suffix(f"{engine_suffix}.md")
+ dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")
if generate: # only used when re-generating truth
with open(pages_path, "w") as fw:
@@ -173,9 +178,10 @@ def verify_conversion_result(
with open(dt_path, "r") as fr:
doc_true_dt = fr.read()
- assert verify_cells(
- doc_pred_pages, doc_true_pages
- ), f"Mismatch in PDF cell prediction for {input_path}"
+ if not skip_cells:
+ assert verify_cells(
+ doc_pred_pages, doc_true_pages
+ ), f"Mismatch in PDF cell prediction for {input_path}"
# assert verify_output(
# doc_pred, doc_true