mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Merge from main, update OCR model and test cases
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
commit
b5a27386c1
7
.github/workflows/checks.yml
vendored
7
.github/workflows/checks.yml
vendored
@ -9,6 +9,11 @@ jobs:
|
|||||||
python-version: ['3.10', '3.11', '3.12']
|
python-version: ['3.10', '3.11', '3.12']
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
- name: Install tesseract
|
||||||
|
run: sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config
|
||||||
|
- name: Set TESSDATA_PREFIX
|
||||||
|
run: |
|
||||||
|
echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
||||||
- uses: ./.github/actions/setup-poetry
|
- uses: ./.github/actions/setup-poetry
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
@ -32,4 +37,4 @@ jobs:
|
|||||||
poetry run python "$file" || exit 1
|
poetry run python "$file" || exit 1
|
||||||
done
|
done
|
||||||
- name: Build with poetry
|
- name: Build with poetry
|
||||||
run: poetry build
|
run: poetry build
|
||||||
|
@ -1,3 +1,9 @@
|
|||||||
|
## [v1.19.0](https://github.com/DS4SD/docling/releases/tag/v1.19.0) - 2024-10-08
|
||||||
|
|
||||||
|
### Feature
|
||||||
|
|
||||||
|
* Add options for choosing OCR engines ([#118](https://github.com/DS4SD/docling/issues/118)) ([`f96ea86`](https://github.com/DS4SD/docling/commit/f96ea86a00fd1aafaa57025e46b5288b43958725))
|
||||||
|
|
||||||
## [v1.18.0](https://github.com/DS4SD/docling/releases/tag/v1.18.0) - 2024-10-03
|
## [v1.18.0](https://github.com/DS4SD/docling/releases/tag/v1.18.0) - 2024-10-03
|
||||||
|
|
||||||
### Feature
|
### Feature
|
||||||
|
73
README.md
73
README.md
@ -52,6 +52,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu
|
|||||||
```
|
```
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary><b>Alternative OCR engines</b></summary>
|
||||||
|
|
||||||
|
Docling supports multiple OCR engines for processing scanned documents. The current version provides
|
||||||
|
the following engines.
|
||||||
|
|
||||||
|
| Engine | Installation | Usage |
|
||||||
|
| ------ | ------------ | ----- |
|
||||||
|
| [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
|
||||||
|
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
|
||||||
|
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
|
||||||
|
|
||||||
|
The Docling `DocumentConverter` allows you to choose the OCR engine with the `ocr_options` setting. For example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||||
|
from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
pipeline_options = PipelineOptions()
|
||||||
|
pipeline_options.do_ocr = True
|
||||||
|
pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract
|
||||||
|
|
||||||
|
doc_converter = DocumentConverter(
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Tesseract installation
|
||||||
|
|
||||||
|
[Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
|
||||||
|
on most operating systems. To use this engine with Docling, Tesseract must be installed on your
|
||||||
|
system, using the packaging tool of your choice. Below we provide example commands.
|
||||||
|
After installing Tesseract you are expected to provide the path to its language files using the
|
||||||
|
`TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
|
||||||
|
|
||||||
|
For macOS, we recommend using [Homebrew](https://brew.sh/).
|
||||||
|
|
||||||
|
```console
|
||||||
|
brew install tesseract leptonica pkg-config
|
||||||
|
TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
|
||||||
|
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
||||||
|
```
|
||||||
|
|
||||||
|
For Debian-based systems.
|
||||||
|
|
||||||
|
```console
|
||||||
|
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
|
||||||
|
TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
|
||||||
|
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
||||||
|
```
|
||||||
|
|
||||||
|
For RHEL systems.
|
||||||
|
|
||||||
|
```console
|
||||||
|
dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
|
||||||
|
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||||
|
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Linking to Tesseract
|
||||||
|
The most efficient way to use the Tesseract library is via linking. Docling uses
|
||||||
|
the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.
|
||||||
|
|
||||||
|
If you run into installation issues with Tesserocr, we suggest using the following
|
||||||
|
installation options:
|
||||||
|
|
||||||
|
```console
|
||||||
|
pip uninstall tesserocr
|
||||||
|
pip install --no-binary :all: tesserocr
|
||||||
|
```
|
||||||
|
</details>
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><b>Docling development setup</b></summary>
|
<summary><b>Docling development setup</b></summary>
|
||||||
|
|
||||||
|
@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
EasyOcrOptions,
|
||||||
|
PdfPipelineOptions,
|
||||||
|
TesseractCliOcrOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
@ -53,6 +58,13 @@ class Backend(str, Enum):
|
|||||||
DOCLING = "docling"
|
DOCLING = "docling"
|
||||||
|
|
||||||
|
|
||||||
|
# Define an enum for the ocr engines
|
||||||
|
class OcrEngine(str, Enum):
|
||||||
|
EASYOCR = "easyocr"
|
||||||
|
TESSERACT_CLI = "tesseract_cli"
|
||||||
|
TESSERACT = "tesseract"
|
||||||
|
|
||||||
|
|
||||||
def export_documents(
|
def export_documents(
|
||||||
conv_results: Iterable[ConversionResult],
|
conv_results: Iterable[ConversionResult],
|
||||||
output_dir: Path,
|
output_dir: Path,
|
||||||
@ -152,6 +164,9 @@ def convert(
|
|||||||
backend: Annotated[
|
backend: Annotated[
|
||||||
Backend, typer.Option(..., help="The PDF backend to use.")
|
Backend, typer.Option(..., help="The PDF backend to use.")
|
||||||
] = Backend.DOCLING,
|
] = Backend.DOCLING,
|
||||||
|
ocr_engine: Annotated[
|
||||||
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||||
|
] = OcrEngine.EASYOCR,
|
||||||
output: Annotated[
|
output: Annotated[
|
||||||
Path, typer.Option(..., help="Output directory where results are saved.")
|
Path, typer.Option(..., help="Output directory where results are saved.")
|
||||||
] = Path("."),
|
] = Path("."),
|
||||||
@ -191,8 +206,19 @@ def convert(
|
|||||||
case _:
|
case _:
|
||||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||||
|
|
||||||
|
match ocr_engine:
|
||||||
|
case OcrEngine.EASYOCR:
|
||||||
|
ocr_options = EasyOcrOptions()
|
||||||
|
case OcrEngine.TESSERACT_CLI:
|
||||||
|
ocr_options = TesseractCliOcrOptions()
|
||||||
|
case OcrEngine.TESSERACT:
|
||||||
|
ocr_options = TesseractOcrOptions()
|
||||||
|
case _:
|
||||||
|
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions(
|
pipeline_options = PdfPipelineOptions(
|
||||||
do_ocr=ocr,
|
do_ocr=ocr,
|
||||||
|
ocr_options=ocr_options,
|
||||||
do_table_structure=True,
|
do_table_structure=True,
|
||||||
)
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
import warnings
|
import warnings
|
||||||
from enum import Enum, auto
|
from enum import Enum, auto
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated, Optional, Union
|
from typing import Annotated, List, Literal, Optional, Union
|
||||||
|
|
||||||
from pydantic import BaseModel, Field, model_validator
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||||
|
|
||||||
|
|
||||||
class TableFormerMode(str, Enum):
|
class TableFormerMode(str, Enum):
|
||||||
@ -21,6 +21,44 @@ class TableStructureOptions(BaseModel):
|
|||||||
mode: TableFormerMode = TableFormerMode.FAST
|
mode: TableFormerMode = TableFormerMode.FAST
|
||||||
|
|
||||||
|
|
||||||
|
class OcrOptions(BaseModel):
|
||||||
|
kind: str
|
||||||
|
|
||||||
|
|
||||||
|
class EasyOcrOptions(OcrOptions):
|
||||||
|
kind: Literal["easyocr"] = "easyocr"
|
||||||
|
lang: List[str] = ["fr", "de", "es", "en"]
|
||||||
|
use_gpu: bool = True # same default as easyocr.Reader
|
||||||
|
model_storage_directory: Optional[str] = None
|
||||||
|
download_enabled: bool = True # same default as easyocr.Reader
|
||||||
|
|
||||||
|
model_config = ConfigDict(
|
||||||
|
extra="forbid",
|
||||||
|
protected_namespaces=(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TesseractCliOcrOptions(OcrOptions):
|
||||||
|
kind: Literal["tesseract"] = "tesseract"
|
||||||
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||||
|
tesseract_cmd: str = "tesseract"
|
||||||
|
path: Optional[str] = None
|
||||||
|
|
||||||
|
model_config = ConfigDict(
|
||||||
|
extra="forbid",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TesseractOcrOptions(OcrOptions):
|
||||||
|
kind: Literal["tesserocr"] = "tesserocr"
|
||||||
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||||
|
path: Optional[str] = None
|
||||||
|
|
||||||
|
model_config = ConfigDict(
|
||||||
|
extra="forbid",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class PipelineOptions(BaseModel): ...
|
class PipelineOptions(BaseModel): ...
|
||||||
|
|
||||||
|
|
||||||
@ -30,6 +68,9 @@ class PdfPipelineOptions(PipelineOptions):
|
|||||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||||
|
|
||||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||||
|
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
|
||||||
|
Field(EasyOcrOptions(), discriminator="kind")
|
||||||
|
)
|
||||||
|
|
||||||
keep_page_images: Annotated[
|
keep_page_images: Annotated[
|
||||||
bool,
|
bool,
|
||||||
|
@ -10,15 +10,15 @@ from rtree import index
|
|||||||
from scipy.ndimage import find_objects, label
|
from scipy.ndimage import find_objects, label
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import OcrCell, Page
|
||||||
from docling.models.abstract_model import AbstractPageModel
|
from docling.datamodel.pipeline_options import OcrOptions
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BaseOcrModel(AbstractPageModel):
|
class BaseOcrModel:
|
||||||
def __init__(self, config):
|
def __init__(self, enabled: bool, options: OcrOptions):
|
||||||
self.config = config
|
self.enabled = enabled
|
||||||
self.enabled = config["enabled"]
|
self.options = options
|
||||||
|
|
||||||
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
||||||
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
||||||
|
@ -5,21 +5,33 @@ import numpy
|
|||||||
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import OcrCell, Page
|
||||||
|
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class EasyOcrModel(BaseOcrModel):
|
class EasyOcrModel(BaseOcrModel):
|
||||||
def __init__(self, config):
|
def __init__(self, enabled: bool, options: EasyOcrOptions):
|
||||||
super().__init__(config)
|
super().__init__(enabled=enabled, options=options)
|
||||||
|
self.options: EasyOcrOptions
|
||||||
|
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
import easyocr
|
try:
|
||||||
|
import easyocr
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
|
||||||
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||||
|
)
|
||||||
|
|
||||||
self.reader = easyocr.Reader(config["lang"])
|
self.reader = easyocr.Reader(
|
||||||
|
lang_list=self.options.lang,
|
||||||
|
model_storage_directory=self.options.model_storage_directory,
|
||||||
|
download_enabled=self.options.download_enabled,
|
||||||
|
)
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
|
||||||
@ -32,6 +44,9 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
all_ocr_cells = []
|
all_ocr_cells = []
|
||||||
for ocr_rect in ocr_rects:
|
for ocr_rect in ocr_rects:
|
||||||
|
# Skip zero area boxes
|
||||||
|
if ocr_rect.area() == 0:
|
||||||
|
continue
|
||||||
high_res_image = page._backend.get_page_image(
|
high_res_image = page._backend.get_page_image(
|
||||||
scale=self.scale, cropbox=ocr_rect
|
scale=self.scale, cropbox=ocr_rect
|
||||||
)
|
)
|
||||||
|
168
docling/models/tesseract_ocr_cli_model.py
Normal file
168
docling/models/tesseract_ocr_cli_model.py
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import tempfile
|
||||||
|
from subprocess import PIPE, Popen
|
||||||
|
from typing import Iterable, Tuple
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import OcrCell, Page
|
||||||
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||||
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TesseractOcrCliModel(BaseOcrModel):
    """OCR model that shells out to the Tesseract command-line binary.

    For every OCR rectangle of a page, the page crop is rendered to a PNG
    file, the configured Tesseract command is run on it with TSV output, and
    each recognized text row is turned into an ``OcrCell``.
    """

    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
        """Store the options and, when enabled, verify Tesseract can be run.

        Args:
            enabled: Whether OCR should be performed at all.
            options: Tesseract CLI settings (command, languages, data path).

        Raises:
            RuntimeError: If the model is enabled and querying the Tesseract
                version fails for any reason.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractCliOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        # Filled lazily by _get_name_and_version().
        self._name = None
        self._version = None

        if self.enabled:
            try:
                self._get_name_and_version()
            except Exception as exc:
                # Chain the cause so the underlying failure stays visible.
                raise RuntimeError(
                    f"Tesseract is not available, aborting: {exc} "
                    "Install tesseract on your system and the tesseract binary is discoverable. "
                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                ) from exc

    def _get_name_and_version(self) -> Tuple[str, str]:
        """Return the ``(name, version)`` pair reported by ``<cmd> --version``.

        The result is cached on the instance after the first successful call.
        """
        if self._name is not None and self._version is not None:
            return self._name, self._version

        cmd = [self.options.tesseract_cmd, "--version"]

        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        # communicate() drains both pipes and waits for the process to exit,
        # so no separate wait() call is needed.
        stdout, stderr = proc.communicate()

        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
        # to stderr, so check both.
        version_line = (
            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
            .split("\n")[0]
            .strip()
        )

        # If everything else fails...
        if not version_line:
            version_line = "tesseract XXX"

        # Split on the first space only: a version line with fewer or more
        # than two tokens must not raise during unpacking.
        name, _, version = version_line.partition(" ")
        version = version.strip() or "XXX"

        self._name = name
        self._version = version

        return name, version

    def _run_tesseract(self, ifilename: str):
        """Run Tesseract on an image file and return the rows containing text.

        Args:
            ifilename: Path of the image file to recognize.

        Returns:
            A pandas DataFrame parsed from Tesseract's TSV output, restricted
            to rows whose ``text`` column is non-empty after stripping.
        """
        import csv  # local import: only needed for the quoting constant

        cmd = [self.options.tesseract_cmd]

        if self.options.lang is not None and len(self.options.lang) > 0:
            cmd.append("-l")
            cmd.append("+".join(self.options.lang))
        if self.options.path is not None:
            cmd.append("--tessdata-dir")
            cmd.append(self.options.path)

        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: %s", " ".join(cmd))

        proc = Popen(cmd, stdout=PIPE)
        output, _ = proc.communicate()

        # Decode the byte string to a regular string
        decoded_data = output.decode("utf-8")

        # Read the TSV data generated by Tesseract.
        #  - dtype: keep `text` as strings even when every token is numeric,
        #    otherwise the `.str` accessor below fails.
        #  - keep_default_na: do not turn words such as "NA" or "null" into NaN.
        #  - quoting: treat quote characters in the recognized text literally.
        df = pd.read_csv(
            io.StringIO(decoded_data),
            sep="\t",
            dtype={"text": str},
            keep_default_na=False,
            quoting=csv.QUOTE_NONE,
        )

        # Filter rows that contain actual text (ignore header or empty rows)
        has_text = df["text"].notnull() & (df["text"].str.strip() != "")

        return df[has_text]

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Add OCR cells to each page of the batch and yield the pages.

        When the model is disabled the pages are passed through unchanged.
        """
        import os  # local import: used to build the temporary image path

        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Write the crop into a private temporary directory instead of
                # re-opening a still-open NamedTemporaryFile by name, which is
                # not possible on every platform (notably Windows).
                with tempfile.TemporaryDirectory() as tmp_dir:
                    fname = os.path.join(tmp_dir, "ocr_input.png")
                    high_res_image.save(fname)

                    df = self._run_tesseract(fname)

                # Convert every recognized row into an OcrCell.
                for ix, row in df.iterrows():
                    text = row["text"]
                    # Scaled by 1/100 below; presumably Tesseract reports a
                    # 0-100 value here -- confirm against the Tesseract docs.
                    conf = row["conf"]

                    left = float(row["left"])
                    top = float(row["top"])
                    right = left + float(row["width"])
                    bottom = top + float(row["height"])

                    # Image pixels -> page units (divide by the render scale),
                    # then shift by the crop's offset on the page.
                    cell = OcrCell(
                        id=ix,
                        text=text,
                        confidence=conf / 100.0,
                        bbox=BoundingBox.from_tuple(
                            coord=(
                                (left / self.scale) + ocr_rect.l,
                                (top / self.scale) + ocr_rect.t,
                                (right / self.scale) + ocr_rect.l,
                                (bottom / self.scale) + ocr_rect.t,
                            ),
                            origin=CoordOrigin.TOPLEFT,
                        ),
                    )
                    all_ocr_cells.append(cell)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page
|
123
docling/models/tesseract_ocr_model.py
Normal file
123
docling/models/tesseract_ocr_model.py
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
import logging
from typing import Iterable

import numpy
from docling_core.types.experimental import BoundingBox, CoordOrigin

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.pipeline_options import (
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TesseractOcrModel(BaseOcrModel):
    """OCR model that uses Tesseract through the ``tesserocr`` bindings.

    A single ``tesserocr.PyTessBaseAPI`` instance is created at construction
    time (when enabled) and reused for every OCR rectangle of every page.
    """

    def __init__(self, enabled: bool, options: TesseractOcrOptions):
        """Store the options and, when enabled, initialize the Tesseract API.

        Args:
            enabled: Whether OCR should be performed at all.
            options: tesserocr settings (languages and optional data path).

        Raises:
            ImportError: If the model is enabled and ``tesserocr`` cannot be
                imported or cannot report the Tesseract version.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None

        if self.enabled:
            setup_errmsg = (
                "tesserocr is not correctly installed. "
                "Please install it via `pip install tesserocr` to use this OCR engine. "
                "Note that tesserocr might have to be manually compiled for working with "
                "your Tesseract installation. The Docling documentation provides examples for it. "
                "Alternatively, Docling has support for other OCR engines. See the documentation."
            )
            try:
                import tesserocr
            except ImportError as exc:
                raise ImportError(setup_errmsg) from exc

            try:
                tesseract_version = tesserocr.tesseract_version()
                _log.debug("Initializing TesserOCR: %s", tesseract_version)
            except Exception as exc:
                # Catch Exception rather than everything, so KeyboardInterrupt
                # and SystemExit are not converted into an ImportError.
                raise ImportError(setup_errmsg) from exc

            # Initialize the tesseractAPI; `path` is only passed when set so
            # that tesserocr otherwise uses its own default.
            api_kwargs = {
                "lang": "+".join(self.options.lang),
                "psm": tesserocr.PSM.AUTO,
                "init": True,
                "oem": tesserocr.OEM.DEFAULT,
            }
            if self.options.path is not None:
                api_kwargs["path"] = self.options.path

            self.reader = tesserocr.PyTessBaseAPI(**api_kwargs)
            self.reader_RIL = tesserocr.RIL

    def __del__(self):
        """Release the Tesseract API handle, if one was created."""
        # `reader` is missing when __init__ failed before assigning it.
        reader = getattr(self, "reader", None)
        if reader is not None:
            # Finalize the tesseractAPI
            reader.End()

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Add OCR cells to each page of the batch and yield the pages.

        When the model is disabled the pages are passed through unchanged.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Retrieve text snippets with their bounding boxes
                self.reader.SetImage(high_res_image)
                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)

                cells = []
                for ix, (im, box, _, _) in enumerate(boxes):
                    # Restrict recognition to this text line's box.
                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])

                    # Extract text within the bounding box
                    text = self.reader.GetUTF8Text().strip()
                    # NOTE(review): passed through unscaled, whereas the CLI
                    # model divides its confidence by 100 -- confirm which
                    # range OcrCell.confidence is expected to use.
                    confidence = self.reader.MeanTextConf()

                    # Image pixels -> page units (divide by the render scale).
                    # NOTE(review): unlike the CLI model, no ocr_rect.l /
                    # ocr_rect.t offset is added here -- confirm whether that
                    # is intended for crops that do not start at the origin.
                    left = box["x"] / self.scale
                    bottom = box["y"] / self.scale
                    right = (box["x"] + box["w"]) / self.scale
                    top = (box["y"] + box["h"]) / self.scale

                    cells.append(
                        OcrCell(
                            id=ix,
                            text=text,
                            confidence=confidence,
                            bbox=BoundingBox.from_tuple(
                                coord=(left, top, right, bottom),
                                origin=CoordOrigin.TOPLEFT,
                            ),
                        )
                    )

                # del high_res_image
                all_ocr_cells.extend(cells)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page
|
@ -6,13 +6,21 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
|
|||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page
|
from docling.datamodel.base_models import AssembledUnit, Page
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
EasyOcrOptions,
|
||||||
|
PdfPipelineOptions,
|
||||||
|
TesseractCliOcrOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
)
|
||||||
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.ds_glm_model import GlmModel
|
from docling.models.ds_glm_model import GlmModel
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.page_assemble_model import PageAssembleModel
|
from docling.models.page_assemble_model import PageAssembleModel
|
||||||
from docling.models.page_preprocessing_model import PagePreprocessingModel
|
from docling.models.page_preprocessing_model import PagePreprocessingModel
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||||
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||||
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
|
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -31,16 +39,32 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
|||||||
self.artifacts_path = Path(artifacts_path)
|
self.artifacts_path = Path(artifacts_path)
|
||||||
self.glm_model = GlmModel(config={})
|
self.glm_model = GlmModel(config={})
|
||||||
|
|
||||||
|
ocr_model: BaseOcrModel
|
||||||
|
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
|
||||||
|
ocr_model = EasyOcrModel(
|
||||||
|
enabled=pipeline_options.do_ocr,
|
||||||
|
options=pipeline_options.ocr_options,
|
||||||
|
)
|
||||||
|
elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
|
||||||
|
ocr_model = TesseractOcrCliModel(
|
||||||
|
enabled=pipeline_options.do_ocr,
|
||||||
|
options=pipeline_options.ocr_options,
|
||||||
|
)
|
||||||
|
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
|
||||||
|
ocr_model = TesseractOcrModel(
|
||||||
|
enabled=pipeline_options.do_ocr,
|
||||||
|
options=pipeline_options.ocr_options,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
||||||
|
)
|
||||||
|
|
||||||
self.model_pipe = [
|
self.model_pipe = [
|
||||||
PagePreprocessingModel(
|
PagePreprocessingModel(
|
||||||
config={"images_scale": pipeline_options.images_scale}
|
config={"images_scale": pipeline_options.images_scale}
|
||||||
),
|
),
|
||||||
EasyOcrModel(
|
ocr_model,
|
||||||
config={
|
|
||||||
"lang": ["fr", "de", "es", "en"],
|
|
||||||
"enabled": pipeline_options.do_ocr,
|
|
||||||
}
|
|
||||||
),
|
|
||||||
LayoutModel(
|
LayoutModel(
|
||||||
config={
|
config={
|
||||||
"artifacts_path": artifacts_path
|
"artifacts_path": artifacts_path
|
||||||
|
@ -6,7 +6,11 @@ from typing import Iterable
|
|||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
PdfPipelineOptions,
|
||||||
|
TesseractCliOcrOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
)
|
||||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||||
|
|
||||||
@ -71,7 +75,7 @@ def main():
|
|||||||
# and PDF Backends for various configurations.
|
# and PDF Backends for various configurations.
|
||||||
# Uncomment one section at the time to see the differences in the output.
|
# Uncomment one section at the time to see the differences in the output.
|
||||||
|
|
||||||
# PyPdfium without OCR
|
# PyPdfium without EasyOCR
|
||||||
# --------------------
|
# --------------------
|
||||||
# pipeline_options = PipelineOptions()
|
# pipeline_options = PipelineOptions()
|
||||||
# pipeline_options.do_ocr=False
|
# pipeline_options.do_ocr=False
|
||||||
@ -83,7 +87,7 @@ def main():
|
|||||||
# pdf_backend=PyPdfiumDocumentBackend,
|
# pdf_backend=PyPdfiumDocumentBackend,
|
||||||
# )
|
# )
|
||||||
|
|
||||||
# PyPdfium with OCR
|
# PyPdfium with EasyOCR
|
||||||
# -----------------
|
# -----------------
|
||||||
# pipeline_options = PipelineOptions()
|
# pipeline_options = PipelineOptions()
|
||||||
# pipeline_options.do_ocr=True
|
# pipeline_options.do_ocr=True
|
||||||
@ -95,7 +99,7 @@ def main():
|
|||||||
# pdf_backend=PyPdfiumDocumentBackend,
|
# pdf_backend=PyPdfiumDocumentBackend,
|
||||||
# )
|
# )
|
||||||
|
|
||||||
# Docling Parse without OCR
|
# Docling Parse without EasyOCR
|
||||||
# -------------------------
|
# -------------------------
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.do_ocr = False
|
pipeline_options.do_ocr = False
|
||||||
@ -108,7 +112,7 @@ def main():
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Docling Parse with OCR
|
# Docling Parse with EasyOCR
|
||||||
# ----------------------
|
# ----------------------
|
||||||
# pipeline_options = PipelineOptions()
|
# pipeline_options = PipelineOptions()
|
||||||
# pipeline_options.do_ocr=True
|
# pipeline_options.do_ocr=True
|
||||||
@ -120,6 +124,32 @@ def main():
|
|||||||
# pdf_backend=DoclingParseDocumentBackend,
|
# pdf_backend=DoclingParseDocumentBackend,
|
||||||
# )
|
# )
|
||||||
|
|
||||||
|
# Docling Parse with Tesseract
|
||||||
|
# ----------------------
|
||||||
|
# pipeline_options = PipelineOptions()
|
||||||
|
# pipeline_options.do_ocr = True
|
||||||
|
# pipeline_options.do_table_structure = True
|
||||||
|
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
# pipeline_options.ocr_options = TesseractOcrOptions()
|
||||||
|
|
||||||
|
# doc_converter = DocumentConverter(
|
||||||
|
# pipeline_options=pipeline_options,
|
||||||
|
# pdf_backend=DoclingParseDocumentBackend,
|
||||||
|
# )
|
||||||
|
|
||||||
|
# Docling Parse with Tesseract CLI
|
||||||
|
# ----------------------
|
||||||
|
# pipeline_options = PipelineOptions()
|
||||||
|
# pipeline_options.do_ocr = True
|
||||||
|
# pipeline_options.do_table_structure = True
|
||||||
|
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
# pipeline_options.ocr_options = TesseractCliOcrOptions()
|
||||||
|
|
||||||
|
# doc_converter = DocumentConverter(
|
||||||
|
# pipeline_options=pipeline_options,
|
||||||
|
# pdf_backend=DoclingParseDocumentBackend,
|
||||||
|
# )
|
||||||
|
|
||||||
###########################################################################
|
###########################################################################
|
||||||
|
|
||||||
# Define input files
|
# Define input files
|
||||||
|
243
poetry.lock
generated
243
poetry.lock
generated
@ -450,101 +450,116 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
version = "3.3.2"
|
version = "3.4.0"
|
||||||
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7.0"
|
python-versions = ">=3.7.0"
|
||||||
files = [
|
files = [
|
||||||
{file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"},
|
{file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"},
|
||||||
{file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"},
|
{file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"},
|
||||||
{file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"},
|
{file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"},
|
||||||
{file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"},
|
||||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"},
|
{file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"},
|
{file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"},
|
||||||
{file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"},
|
{file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"},
|
||||||
{file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"},
|
{file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"},
|
||||||
{file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"},
|
{file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"},
|
||||||
|
{file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"},
|
||||||
|
{file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"},
|
||||||
|
{file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -2493,13 +2508,13 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "llama-index-core"
|
name = "llama-index-core"
|
||||||
version = "0.11.16"
|
version = "0.11.17"
|
||||||
description = "Interface between LLMs and your data"
|
description = "Interface between LLMs and your data"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.8.1"
|
python-versions = "<4.0,>=3.8.1"
|
||||||
files = [
|
files = [
|
||||||
{file = "llama_index_core-0.11.16-py3-none-any.whl", hash = "sha256:099ba785e357506fd5a24c1a6b8fa5286366d6c71637649fab0f9126dcea842c"},
|
{file = "llama_index_core-0.11.17-py3-none-any.whl", hash = "sha256:d65565b54ea55b2db12f9a1cd5c250b770d7e43d3363137cff431a6116ef069c"},
|
||||||
{file = "llama_index_core-0.11.16.tar.gz", hash = "sha256:232a5cebcc73b951d9c663bd30ed59de5356dbd8f9ab88024d19c88bdd1b3254"},
|
{file = "llama_index_core-0.11.17.tar.gz", hash = "sha256:1143baf8d819e27555bdb142abdf2833d3d37731f270f46fa1e07fc4b97116ae"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -6125,6 +6140,41 @@ files = [
|
|||||||
doc = ["reno", "sphinx"]
|
doc = ["reno", "sphinx"]
|
||||||
test = ["pytest", "tornado (>=4.5)", "typeguard"]
|
test = ["pytest", "tornado (>=4.5)", "typeguard"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tesserocr"
|
||||||
|
version = "2.7.1"
|
||||||
|
description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
|
||||||
|
optional = true
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
|
||||||
|
{file = "tesserocr-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bb5d336ebf2cc47cd0d117cadc8b25b2e558f54fb9a2dedaa28a14cb5a6b437"},
|
||||||
|
{file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3ff7f6d6b5c12dd31b80842eb0892b661a41ca3edf0e6cc1e54ec2c14552ceef"},
|
||||||
|
{file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ae794c5434373f4afa4c7f8b59f19fde810f8caf096d8bb701a4b2f3a6739460"},
|
||||||
|
{file = "tesserocr-2.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0a0895a4d9ff6a34f5a6f203fe0c9899f31d6f2378ae99be80605637b622687b"},
|
||||||
|
{file = "tesserocr-2.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c3187d14b95c866aa1d34cc374a53d583e2168742eefe33347e4790af70338e"},
|
||||||
|
{file = "tesserocr-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec52be3d82136430081427062ad0211a52fc38fa28fe58e216b89f840354f216"},
|
||||||
|
{file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:44e71b3e8da36b2567760309398689ea9785ee62db3ff21140a9ea6941a233c4"},
|
||||||
|
{file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e31a49d7784e7e52fe656719145c3a872856d67daa9bfb340c2990db00e023e9"},
|
||||||
|
{file = "tesserocr-2.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:37abde15c1c940d691305fd87836e4cad25a1434799729c324bbcd2277bcae44"},
|
||||||
|
{file = "tesserocr-2.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1b6349d35d333d420d24acf1953ad6f1d5613ffcde462c62126b68bdfca12753"},
|
||||||
|
{file = "tesserocr-2.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42f009cde8479f3b339da12a8e419fd9559b64b13bc08a248bd0833c6ae94331"},
|
||||||
|
{file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e13204b3b92fac76ece6e33f55eba6335b30e379f4a7b75e285c2ad05762027"},
|
||||||
|
{file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:65afdec0c5dc09a4a23a62e65524989cd940af41be1603e251a64ac10de9babf"},
|
||||||
|
{file = "tesserocr-2.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5f59fb072c90bff8aa6a365fc82b747c2668b7b48233901728b155860d1ff9"},
|
||||||
|
{file = "tesserocr-2.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f62d662e3002868384e14e8cd620bdedf34ab9f9fc3ebbce527cfe032a7485ee"},
|
||||||
|
{file = "tesserocr-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e80051812685bd521bc17cb70cf1480ffbb3e54ccc2883e90d5bcda15f8278ea"},
|
||||||
|
{file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:2690cb2330fc9349d68ff027cbdac09693fdda36470836b196c04f16dcc99e9d"},
|
||||||
|
{file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d01ebd094103451ecb77b6510ade2f6bb064c51413ff35b135f649f3d6067a67"},
|
||||||
|
{file = "tesserocr-2.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8069ae6cd9ea3c056b6a596bc99f501ee9f95d6fd2928fcaffb9777071c210d"},
|
||||||
|
{file = "tesserocr-2.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2d3d23223d0a448877fb91af83c46ce95ff0a497a82fa93e93068148c9712e5"},
|
||||||
|
{file = "tesserocr-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef8a09a44c2e96bab0f40dbf0633767d063680d86b79365b43fc4e1234219694"},
|
||||||
|
{file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6e613213ea5b64db06f2cba0b93c3656b7e6aec2d9b2d2e929edf49da7143225"},
|
||||||
|
{file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4a8888b765e26680a6e34b8ec09b7bb85a17e08cea76f0661eafe2a84254562a"},
|
||||||
|
{file = "tesserocr-2.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64f25763e56c4c29b808e59b485c930cac46b6a1ac8eadd994086dc40a29d3a1"},
|
||||||
|
{file = "tesserocr-2.7.1.tar.gz", hash = "sha256:3744c5c8bbabf18172849c7731be00dc2e5e44f8c556d37c850e788794ae0af4"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "threadpoolctl"
|
name = "threadpoolctl"
|
||||||
version = "3.5.0"
|
version = "3.5.0"
|
||||||
@ -7330,7 +7380,10 @@ enabler = ["pytest-enabler (>=2.2)"]
|
|||||||
test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
|
test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
|
||||||
type = ["pytest-mypy"]
|
type = ["pytest-mypy"]
|
||||||
|
|
||||||
|
[extras]
|
||||||
|
tesserocr = ["tesserocr"]
|
||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "5ef87a880333213955e3ded6bcf0748f6728e4501a98bd5bf9421057de745772"
|
content-hash = "71eec93c5fc347a7c0ae0d846d4c2c41ff96255aab218d7d2ba747d1ffed942e"
|
||||||
|
@ -47,6 +47,7 @@ pydantic-settings = "^2.3.0"
|
|||||||
huggingface_hub = ">=0.23,<1"
|
huggingface_hub = ">=0.23,<1"
|
||||||
requests = "^2.32.3"
|
requests = "^2.32.3"
|
||||||
easyocr = "^1.7"
|
easyocr = "^1.7"
|
||||||
|
tesserocr = { version = "^2.7.1", optional = true }
|
||||||
docling-parse = "^1.4.1"
|
docling-parse = "^1.4.1"
|
||||||
certifi = ">=2024.7.4"
|
certifi = ">=2024.7.4"
|
||||||
rtree = "^1.3.0"
|
rtree = "^1.3.0"
|
||||||
@ -56,6 +57,7 @@ typer = "^0.12.5"
|
|||||||
python-docx = "^1.1.2"
|
python-docx = "^1.1.2"
|
||||||
python-pptx = "^1.0.2"
|
python-pptx = "^1.0.2"
|
||||||
beautifulsoup4 = "^4.12.3"
|
beautifulsoup4 = "^4.12.3"
|
||||||
|
pandas = "^2.1.4"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||||
@ -70,7 +72,7 @@ pytest-xdist = "^3.3.1"
|
|||||||
types-requests = "^2.31.0.2"
|
types-requests = "^2.31.0.2"
|
||||||
flake8-pyproject = "^1.2.3"
|
flake8-pyproject = "^1.2.3"
|
||||||
pylint = "^2.17.5"
|
pylint = "^2.17.5"
|
||||||
pandas-stubs = "^2.2.2.240909"
|
pandas-stubs = "^2.1.4.231227"
|
||||||
ipykernel = "^6.29.5"
|
ipykernel = "^6.29.5"
|
||||||
ipywidgets = "^8.1.5"
|
ipywidgets = "^8.1.5"
|
||||||
nbqa = "^1.9.0"
|
nbqa = "^1.9.0"
|
||||||
@ -85,6 +87,9 @@ langchain-huggingface = "^0.0.3"
|
|||||||
langchain-milvus = "^0.1.4"
|
langchain-milvus = "^0.1.4"
|
||||||
langchain-text-splitters = "^0.2.4"
|
langchain-text-splitters = "^0.2.4"
|
||||||
|
|
||||||
|
[tool.poetry.extras]
|
||||||
|
tesserocr = ["tesserocr"]
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
[tool.poetry.scripts]
|
||||||
docling = "docling.cli.main:app"
|
docling = "docling.cli.main:app"
|
||||||
|
|
||||||
|
3
tests/data_scanned/ocr_test.doctags.txt
Normal file
3
tests/data_scanned/ocr_test.doctags.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
<document>
|
||||||
|
<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
|
||||||
|
</document>
|
1
tests/data_scanned/ocr_test.json
Normal file
1
tests/data_scanned/ocr_test.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.41791534423828, 690.8074951171875, 509.4447021484375, 767.422119140625], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
1
tests/data_scanned/ocr_test.md
Normal file
1
tests/data_scanned/ocr_test.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
|
1
tests/data_scanned/ocr_test.pages.json
Normal file
1
tests/data_scanned/ocr_test.pages.json
Normal file
File diff suppressed because one or more lines are too long
BIN
tests/data_scanned/ocr_test.pdf
Normal file
BIN
tests/data_scanned/ocr_test.pdf
Normal file
Binary file not shown.
104
tests/test_e2e_ocr_conversion.py
Normal file
104
tests/test_e2e_ocr_conversion.py
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
|
from docling.datamodel.pipeline_options import (
|
||||||
|
EasyOcrOptions,
|
||||||
|
OcrOptions,
|
||||||
|
PdfPipelineOptions,
|
||||||
|
PipelineOptions,
|
||||||
|
TesseractCliOcrOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
)
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
from .verify_utils import verify_conversion_result
|
||||||
|
|
||||||
|
# Passed as ``generate=`` to verify_conversion_result() by the test below.
# NOTE(review): while this is True the verification helper is asked to
# (re)write the reference files for each converted PDF — presumably this
# should be False on CI so the test compares against stored truth; confirm.
GENERATE = True
|
||||||
|
|
||||||
|
|
||||||
|
# Debug
|
||||||
|
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
    """Debug helper: dump a conversion result next to its source PDF.

    Four files are written into ``pdf_path.parent``; each is named after the
    PDF stem, optionally followed by ``.<engine>``:

    * ``<stem>[.<engine>].json``         -- ``doc_result.render_as_dict()``
    * ``<stem>[.<engine>].pages.json``   -- ``model_dump()`` of every page
    * ``<stem>[.<engine>].doctags.txt``  -- ``doc_result.render_as_doctags()``
    * ``<stem>[.<engine>].md``           -- ``doc_result.render_as_markdown()``

    Args:
        pdf_path: Path of the converted PDF; determines directory and stem.
        doc_result: Conversion result to serialise.
        engine: Name inserted into the file names; ``None`` omits the infix.
    """
    import json

    out_dir = pdf_path.parent
    # Common file-name prefix: "<stem>" or "<stem>.<engine>".
    base_name = pdf_path.stem if engine is None else f"{pdf_path.stem}.{engine}"

    # Document rendered as a dictionary.
    with open(out_dir / f"{base_name}.json", "w") as fd:
        json.dump(doc_result.render_as_dict(), fd)

    # Per-page dumps; built before the target file is opened.
    page_dumps = [page.model_dump() for page in doc_result.pages]
    with open(out_dir / f"{base_name}.pages.json", "w") as fd:
        json.dump(page_dumps, fd)

    # DocTags rendering.
    with open(out_dir / f"{base_name}.doctags.txt", "w") as fd:
        fd.write(doc_result.render_as_doctags())

    # Markdown rendering.
    with open(out_dir / f"{base_name}.md", "w") as fd:
        fd.write(doc_result.render_as_markdown())
|
||||||
|
|
||||||
|
|
||||||
|
def get_pdf_paths():
    """Return all ``*.pdf`` files below ``./tests/data_scanned``, sorted.

    The directory is relative, so it is resolved against the current working
    directory; sub-directories are searched recursively.
    """
    scan_root = Path("./tests/data_scanned")
    found = list(scan_root.rglob("*.pdf"))
    found.sort()
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def get_converter(ocr_options: OcrOptions):
    """Build a ``DocumentConverter`` for PDFs that runs OCR with ``ocr_options``.

    The PDF pipeline is configured with OCR enabled, table-structure
    recognition enabled and cell matching turned on, and uses
    ``DoclingParseDocumentBackend`` as the PDF backend.

    Args:
        ocr_options: OCR engine options assigned to the pipeline.

    Returns:
        The configured ``DocumentConverter``.
    """
    pdf_pipeline = PdfPipelineOptions()
    pdf_pipeline.do_ocr = True
    pdf_pipeline.do_table_structure = True
    pdf_pipeline.table_structure_options.do_cell_matching = True
    pdf_pipeline.ocr_options = ocr_options

    pdf_format = PdfFormatOption(
        pipeline_options=pdf_pipeline,
        backend=DoclingParseDocumentBackend,
    )
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})
|
||||||
|
|
||||||
|
|
||||||
|
def test_e2e_conversions():
    """Convert each scanned sample PDF with every OCR engine and verify it.

    For each of the EasyOCR, Tesseract and Tesseract-CLI option sets a fresh
    converter is built, every PDF returned by ``get_pdf_paths()`` is
    converted, and the result is handed to ``verify_conversion_result`` with
    ``generate=GENERATE`` and ``skip_cells=True``.
    """
    scanned_pdfs = get_pdf_paths()

    engine_options: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
    ]

    for current_options in engine_options:
        print(f"Converting with ocr_engine: {current_options.kind}")
        engine_converter = get_converter(ocr_options=current_options)

        for scanned_pdf in scanned_pdfs:
            print(f"converting {scanned_pdf}")

            doc_result: ConversionResult = engine_converter.convert_single(
                scanned_pdf
            )

            # When debugging, the raw outputs can be dumped here with
            # save_output(scanned_pdf, doc_result, None).

            verify_conversion_result(
                input_path=scanned_pdf,
                doc_result=doc_result,
                generate=GENERATE,
                skip_cells=True,
            )
|
@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):
|
|||||||
|
|
||||||
|
|
||||||
def verify_conversion_result(
|
def verify_conversion_result(
|
||||||
input_path: Path, doc_result: ConversionResult, generate=False
|
input_path: Path,
|
||||||
|
doc_result: ConversionResult,
|
||||||
|
generate: bool = False,
|
||||||
|
ocr_engine: str = None,
|
||||||
|
skip_cells: bool = False,
|
||||||
):
|
):
|
||||||
PageList = TypeAdapter(List[Page])
|
PageList = TypeAdapter(List[Page])
|
||||||
|
|
||||||
@ -143,10 +147,11 @@ def verify_conversion_result(
|
|||||||
doc_pred_md = doc_result.render_as_markdown()
|
doc_pred_md = doc_result.render_as_markdown()
|
||||||
doc_pred_dt = doc_result.render_as_doctags()
|
doc_pred_dt = doc_result.render_as_doctags()
|
||||||
|
|
||||||
pages_path = input_path.with_suffix(".pages.json")
|
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||||
json_path = input_path.with_suffix(".json")
|
pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
|
||||||
md_path = input_path.with_suffix(".md")
|
json_path = input_path.with_suffix(f"{engine_suffix}.json")
|
||||||
dt_path = input_path.with_suffix(".doctags.txt")
|
md_path = input_path.with_suffix(f"{engine_suffix}.md")
|
||||||
|
dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||||
|
|
||||||
if generate: # only used when re-generating truth
|
if generate: # only used when re-generating truth
|
||||||
with open(pages_path, "w") as fw:
|
with open(pages_path, "w") as fw:
|
||||||
@ -173,9 +178,10 @@ def verify_conversion_result(
|
|||||||
with open(dt_path, "r") as fr:
|
with open(dt_path, "r") as fr:
|
||||||
doc_true_dt = fr.read()
|
doc_true_dt = fr.read()
|
||||||
|
|
||||||
assert verify_cells(
|
if not skip_cells:
|
||||||
doc_pred_pages, doc_true_pages
|
assert verify_cells(
|
||||||
), f"Mismatch in PDF cell prediction for {input_path}"
|
doc_pred_pages, doc_true_pages
|
||||||
|
), f"Mismatch in PDF cell prediction for {input_path}"
|
||||||
|
|
||||||
# assert verify_output(
|
# assert verify_output(
|
||||||
# doc_pred, doc_true
|
# doc_pred, doc_true
|
||||||
|
Loading…
Reference in New Issue
Block a user