diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 8e92e76e..8c88acc5 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -9,6 +9,11 @@ jobs: python-version: ['3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v3 + - name: Install tesseract + run: sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config + - name: Set TESSDATA_PREFIX + run: | + echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" - uses: ./.github/actions/setup-poetry with: python-version: ${{ matrix.python-version }} @@ -32,4 +37,4 @@ jobs: poetry run python "$file" || exit 1 done - name: Build with poetry - run: poetry build \ No newline at end of file + run: poetry build diff --git a/CHANGELOG.md b/CHANGELOG.md index 94e773c2..1a8bc4fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## [v1.19.0](https://github.com/DS4SD/docling/releases/tag/v1.19.0) - 2024-10-08 + +### Feature + +* Add options for choosing OCR engines ([#118](https://github.com/DS4SD/docling/issues/118)) ([`f96ea86`](https://github.com/DS4SD/docling/commit/f96ea86a00fd1aafaa57025e46b5288b43958725)) + ## [v1.18.0](https://github.com/DS4SD/docling/releases/tag/v1.18.0) - 2024-10-03 ### Feature diff --git a/README.md b/README.md index 53990a5a..1d72b44a 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu ``` +
+ Alternative OCR engines + + Docling supports multiple OCR engines for processing scanned documents. The current version provides + the following engines. + + | Engine | Installation | Usage | + | ------ | ------------ | ----- | + | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` | + | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` | + | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` | + + The Docling `DocumentConverter` lets you choose the OCR engine via the `ocr_options` setting. For example: + + ```python + from docling.datamodel.base_models import ConversionStatus, PipelineOptions + from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions + from docling.document_converter import DocumentConverter + + pipeline_options = PipelineOptions() + pipeline_options.do_ocr = True + pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract + + doc_converter = DocumentConverter( + pipeline_options=pipeline_options, + ) + ``` + + #### Tesseract installation + + [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available + on most operating systems. To use this engine with Docling, Tesseract must be installed on your + system, using the packaging tool of your choice. Below we provide example commands. + After installing Tesseract you are expected to provide the path to its language files using the + `TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`). + + For macOS, we recommend using [Homebrew](https://brew.sh/). + + ```console + brew install tesseract leptonica pkg-config + TESSDATA_PREFIX=/opt/homebrew/share/tessdata/ + echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" + ``` + + For Debian-based systems. 
+ + ```console + apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config + TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$) + echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" + ``` + + For RHEL systems. + + ```console + dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel + TESSDATA_PREFIX=/usr/share/tesseract/tessdata/ + echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" + ``` + + #### Linking to Tesseract + The most efficient usage of the Tesseract library is via linking. Docling is using + the [Tesserocr](https://github.com/sirfz/tesserocr) package for this. + + If you get into installation issues of Tesserocr, we suggest using the following + installation options: + + ```console + pip uninstall tesserocr + pip install --no-binary :all: tesserocr + ``` +
+
Docling development setup diff --git a/docling/cli/main.py b/docling/cli/main.py index 99452076..2a391d5c 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + PdfPipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) from docling.document_converter import DocumentConverter, PdfFormatOption warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") @@ -53,6 +58,13 @@ class Backend(str, Enum): DOCLING = "docling" +# Define an enum for the ocr engines +class OcrEngine(str, Enum): + EASYOCR = "easyocr" + TESSERACT_CLI = "tesseract_cli" + TESSERACT = "tesseract" + + def export_documents( conv_results: Iterable[ConversionResult], output_dir: Path, @@ -152,6 +164,9 @@ def convert( backend: Annotated[ Backend, typer.Option(..., help="The PDF backend to use.") ] = Backend.DOCLING, + ocr_engine: Annotated[ + OcrEngine, typer.Option(..., help="The OCR engine to use.") + ] = OcrEngine.EASYOCR, output: Annotated[ Path, typer.Option(..., help="Output directory where results are saved.") ] = Path("."), @@ -191,8 +206,19 @@ def convert( case _: raise RuntimeError(f"Unexpected backend type {backend}") + match ocr_engine: + case OcrEngine.EASYOCR: + ocr_options = EasyOcrOptions() + case OcrEngine.TESSERACT_CLI: + ocr_options = TesseractCliOcrOptions() + case OcrEngine.TESSERACT: + ocr_options = TesseractOcrOptions() + case _: + raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}") + pipeline_options = PdfPipelineOptions( do_ocr=ocr, + ocr_options=ocr_options, do_table_structure=True, 
) pipeline_options.table_structure_options.do_cell_matching = do_cell_matching diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 0098288e..4be6fcec 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -1,9 +1,9 @@ import warnings from enum import Enum, auto from pathlib import Path -from typing import Annotated, Optional, Union +from typing import Annotated, List, Literal, Optional, Union -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, ConfigDict, Field, model_validator class TableFormerMode(str, Enum): @@ -21,6 +21,44 @@ class TableStructureOptions(BaseModel): mode: TableFormerMode = TableFormerMode.FAST +class OcrOptions(BaseModel): + kind: str + + +class EasyOcrOptions(OcrOptions): + kind: Literal["easyocr"] = "easyocr" + lang: List[str] = ["fr", "de", "es", "en"] + use_gpu: bool = True # same default as easyocr.Reader + model_storage_directory: Optional[str] = None + download_enabled: bool = True # same default as easyocr.Reader + + model_config = ConfigDict( + extra="forbid", + protected_namespaces=(), + ) + + +class TesseractCliOcrOptions(OcrOptions): + kind: Literal["tesseract"] = "tesseract" + lang: List[str] = ["fra", "deu", "spa", "eng"] + tesseract_cmd: str = "tesseract" + path: Optional[str] = None + + model_config = ConfigDict( + extra="forbid", + ) + + +class TesseractOcrOptions(OcrOptions): + kind: Literal["tesserocr"] = "tesserocr" + lang: List[str] = ["fra", "deu", "spa", "eng"] + path: Optional[str] = None + + model_config = ConfigDict( + extra="forbid", + ) + + class PipelineOptions(BaseModel): ... 
@@ -30,6 +68,9 @@ class PdfPipelineOptions(PipelineOptions): do_ocr: bool = True # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() + ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = ( + Field(EasyOcrOptions(), discriminator="kind") + ) keep_page_images: Annotated[ bool, diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index ea0feb82..aea7755b 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -10,15 +10,15 @@ from rtree import index from scipy.ndimage import find_objects, label from docling.datamodel.base_models import OcrCell, Page -from docling.models.abstract_model import AbstractPageModel +from docling.datamodel.pipeline_options import OcrOptions _log = logging.getLogger(__name__) -class BaseOcrModel(AbstractPageModel): - def __init__(self, config): - self.config = config - self.enabled = config["enabled"] +class BaseOcrModel: + def __init__(self, enabled: bool, options: OcrOptions): + self.enabled = enabled + self.options = options # Computes the optimum amount and coordinates of rectangles to OCR on a given page def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]: diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 3bc1f89d..9408076b 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -5,21 +5,33 @@ import numpy from docling_core.types.experimental import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.pipeline_options import EasyOcrOptions from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) class EasyOcrModel(BaseOcrModel): - def __init__(self, config): - super().__init__(config) + def __init__(self, enabled: bool, options: EasyOcrOptions): + super().__init__(enabled=enabled, options=options) + self.options: 
EasyOcrOptions self.scale = 3 # multiplier for 72 dpi == 216 dpi. if self.enabled: - import easyocr + try: + import easyocr + except ImportError: + raise ImportError( + "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. " + "Alternatively, Docling has support for other OCR engines. See the documentation." + ) - self.reader = easyocr.Reader(config["lang"]) + self.reader = easyocr.Reader( + lang_list=self.options.lang, + model_storage_directory=self.options.model_storage_directory, + download_enabled=self.options.download_enabled, + ) def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: @@ -32,6 +44,9 @@ class EasyOcrModel(BaseOcrModel): all_ocr_cells = [] for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect ) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py new file mode 100644 index 00000000..d0240f86 --- /dev/null +++ b/docling/models/tesseract_ocr_cli_model.py @@ -0,0 +1,168 @@ +import io +import logging +import tempfile +from subprocess import PIPE, Popen +from typing import Iterable, Tuple + +import pandas as pd +from docling_core.types.experimental import BoundingBox, CoordOrigin + +from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.pipeline_options import TesseractCliOcrOptions +from docling.models.base_ocr_model import BaseOcrModel + +_log = logging.getLogger(__name__) + + +class TesseractOcrCliModel(BaseOcrModel): + + def __init__(self, enabled: bool, options: TesseractCliOcrOptions): + super().__init__(enabled=enabled, options=options) + self.options: TesseractCliOcrOptions + + self.scale = 3 # multiplier for 72 dpi == 216 dpi. 
+ + self._name = None + self._version = None + + if self.enabled: + try: + self._get_name_and_version() + + except Exception as exc: + raise RuntimeError( + f"Tesseract is not available, aborting: {exc} " + "Install tesseract on your system and the tesseract binary is discoverable. " + "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. " + "Alternatively, Docling has support for other OCR engines. See the documentation." + ) + + def _get_name_and_version(self) -> Tuple[str, str]: + + if self._name != None and self._version != None: + return self._name, self._version + + cmd = [self.options.tesseract_cmd, "--version"] + + proc = Popen(cmd, stdout=PIPE, stderr=PIPE) + stdout, stderr = proc.communicate() + + proc.wait() + + # HACK: Windows versions of Tesseract output the version to stdout, Linux versions + # to stderr, so check both. + version_line = ( + (stdout.decode("utf8").strip() or stderr.decode("utf8").strip()) + .split("\n")[0] + .strip() + ) + + # If everything else fails... 
+ if not version_line: + version_line = "tesseract XXX" + + name, version = version_line.split(" ", 1) + + self._name = name + self._version = version + + return name, version + + def _run_tesseract(self, ifilename: str): + + cmd = [self.options.tesseract_cmd] + + if self.options.lang is not None and len(self.options.lang) > 0: + cmd.append("-l") + cmd.append("+".join(self.options.lang)) + if self.options.path is not None: + cmd.append("--tessdata-dir") + cmd.append(self.options.path) + + cmd += [ifilename, "stdout", "tsv"] + _log.info("command: {}".format(" ".join(cmd))) + + proc = Popen(cmd, stdout=PIPE) + output, _ = proc.communicate() + + # _log.info(output) + + # Decode the byte string to a regular string + decoded_data = output.decode("utf-8") + # _log.info(decoded_data) + + # Read the TSV output; force `text` to str so empty or all-numeric columns still support `.str` + df = pd.read_csv(io.StringIO(decoded_data), sep="\t", dtype={"text": str}) + + # Display the dataframe (optional) + # _log.info("df: ", df.head()) + + # Filter rows that contain actual text (ignore header or empty rows) + df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")] + + return df_filtered + + def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + + if not self.enabled: + yield from page_batch + return + + for page in page_batch: + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) + + with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file: + fname = image_file.name + high_res_image.save(fname) + + df = self._run_tesseract(fname) + + # _log.info(df) + + # Print relevant columns (bounding box and text) + for ix, row in df.iterrows(): + text = row["text"] + conf = row["conf"] + + l = float(row["left"]) + b = float(row["top"]) + w = float(row["width"]) + h = float(row["height"]) + + t = b + h + r = l + w + + cell = OcrCell( + 
id=ix, + text=text, + confidence=conf / 100.0, + bbox=BoundingBox.from_tuple( + coord=( + (l / self.scale) + ocr_rect.l, + (b / self.scale) + ocr_rect.t, + (r / self.scale) + ocr_rect.l, + (t / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + all_ocr_cells.append(cell) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + + page.cells.extend(filtered_ocr_cells) + + # DEBUG code: + # self.draw_ocr_rects_and_cells(page, ocr_rects) + + yield page diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py new file mode 100644 index 00000000..5173c1bf --- /dev/null +++ b/docling/models/tesseract_ocr_model.py @@ -0,0 +1,123 @@ +import logging +from typing import Iterable + +import numpy +from docling_core.types.experimental import BoundingBox, CoordOrigin + +from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.pipeline_options import TesseractOcrOptions +from docling.models.base_ocr_model import BaseOcrModel + +_log = logging.getLogger(__name__) + + +class TesseractOcrModel(BaseOcrModel): + def __init__(self, enabled: bool, options: TesseractOcrOptions): + super().__init__(enabled=enabled, options=options) + self.options: TesseractOcrOptions + + self.scale = 3 # multiplier for 72 dpi == 216 dpi. + self.reader = None + + if self.enabled: + setup_errmsg = ( + "tesserocr is not correctly installed. " + "Please install it via `pip install tesserocr` to use this OCR engine. " + "Note that tesserocr might have to be manually compiled for working with " + "your Tesseract installation. The Docling documentation provides examples for it. " + "Alternatively, Docling has support for other OCR engines. See the documentation." 
+ ) + try: + import tesserocr + except ImportError: + raise ImportError(setup_errmsg) + + try: + tesseract_version = tesserocr.tesseract_version() + _log.debug("Initializing TesserOCR: %s", tesseract_version) + except Exception: + raise ImportError(setup_errmsg) + + # Initialize the tesseractAPI + lang = "+".join(self.options.lang) + if self.options.path is not None: + self.reader = tesserocr.PyTessBaseAPI( + path=self.options.path, + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, + ) + else: + self.reader = tesserocr.PyTessBaseAPI( + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, + ) + self.reader_RIL = tesserocr.RIL + + def __del__(self): + if self.reader is not None: + # Finalize the tesseractAPI + self.reader.End() + + def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + + if not self.enabled: + yield from page_batch + return + + for page in page_batch: + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) + + # Retrieve text snippets with their bounding boxes + self.reader.SetImage(high_res_image) + boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True) + + cells = [] + for ix, (im, box, _, _) in enumerate(boxes): + # Set the area of interest. 
Tesseract uses Bottom-Left for the origin + self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"]) + + # Extract text within the bounding box + text = self.reader.GetUTF8Text().strip() + confidence = self.reader.MeanTextConf() / 100.0 + left = (box["x"] / self.scale) + ocr_rect.l + bottom = (box["y"] / self.scale) + ocr_rect.t + right = ((box["x"] + box["w"]) / self.scale) + ocr_rect.l + top = ((box["y"] + box["h"]) / self.scale) + ocr_rect.t + + cells.append( + OcrCell( + id=ix, + text=text, + confidence=confidence, + bbox=BoundingBox.from_tuple( + coord=(left, top, right, bottom), + origin=CoordOrigin.TOPLEFT, + ), + ) + ) + + # del high_res_image + all_ocr_cells.extend(cells) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + + page.cells.extend(filtered_ocr_cells) + + # DEBUG code: + # self.draw_ocr_rects_and_cells(page, ocr_rects) + + yield page diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py index d0ab94bc..659c3c93 100644 --- a/docling/pipeline/standard_pdf_model_pipeline.py +++ b/docling/pipeline/standard_pdf_model_pipeline.py @@ -6,13 +6,21 @@ from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import AssembledUnit, Page from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + PdfPipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) +from docling.models.base_ocr_model import BaseOcrModel from docling.models.ds_glm_model import GlmModel from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel from docling.models.page_assemble_model import PageAssembleModel from docling.models.page_preprocessing_model import PagePreprocessingModel from 
docling.models.table_structure_model import TableStructureModel +from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel +from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.pipeline.base_model_pipeline import PaginatedModelPipeline _log = logging.getLogger(__name__) @@ -31,16 +39,32 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): self.artifacts_path = Path(artifacts_path) self.glm_model = GlmModel(config={}) + ocr_model: BaseOcrModel + if isinstance(pipeline_options.ocr_options, EasyOcrOptions): + ocr_model = EasyOcrModel( + enabled=pipeline_options.do_ocr, + options=pipeline_options.ocr_options, + ) + elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions): + ocr_model = TesseractOcrCliModel( + enabled=pipeline_options.do_ocr, + options=pipeline_options.ocr_options, + ) + elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions): + ocr_model = TesseractOcrModel( + enabled=pipeline_options.do_ocr, + options=pipeline_options.ocr_options, + ) + else: + raise RuntimeError( + f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}." 
+ ) + self.model_pipe = [ PagePreprocessingModel( config={"images_scale": pipeline_options.images_scale} ), - EasyOcrModel( - config={ - "lang": ["fr", "de", "es", "en"], - "enabled": pipeline_options.do_ocr, - } - ), + ocr_model, LayoutModel( config={ "artifacts_path": artifacts_path diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 090e49aa..68b52797 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -6,7 +6,11 @@ from typing import Iterable from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.pipeline_options import ( + PdfPipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline @@ -71,7 +75,7 @@ def main(): # and PDF Backends for various configurations. # Uncomment one section at the time to see the differences in the output. 
- # PyPdfium without OCR + # PyPdfium without EasyOCR # -------------------- # pipeline_options = PipelineOptions() # pipeline_options.do_ocr=False @@ -83,7 +87,7 @@ def main(): # pdf_backend=PyPdfiumDocumentBackend, # ) - # PyPdfium with OCR + # PyPdfium with EasyOCR # ----------------- # pipeline_options = PipelineOptions() # pipeline_options.do_ocr=True @@ -95,7 +99,7 @@ def main(): # pdf_backend=PyPdfiumDocumentBackend, # ) - # Docling Parse without OCR + # Docling Parse without EasyOCR # ------------------------- pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False @@ -108,7 +112,7 @@ def main(): } ) - # Docling Parse with OCR + # Docling Parse with EasyOCR # ---------------------- # pipeline_options = PipelineOptions() # pipeline_options.do_ocr=True @@ -120,6 +124,32 @@ def main(): # pdf_backend=DoclingParseDocumentBackend, # ) + # Docling Parse with Tesseract + # ---------------------- + # pipeline_options = PipelineOptions() + # pipeline_options.do_ocr = True + # pipeline_options.do_table_structure = True + # pipeline_options.table_structure_options.do_cell_matching = True + # pipeline_options.ocr_options = TesseractOcrOptions() + + # doc_converter = DocumentConverter( + # pipeline_options=pipeline_options, + # pdf_backend=DoclingParseDocumentBackend, + # ) + + # Docling Parse with Tesseract CLI + # ---------------------- + # pipeline_options = PipelineOptions() + # pipeline_options.do_ocr = True + # pipeline_options.do_table_structure = True + # pipeline_options.table_structure_options.do_cell_matching = True + # pipeline_options.ocr_options = TesseractCliOcrOptions() + + # doc_converter = DocumentConverter( + # pipeline_options=pipeline_options, + # pdf_backend=DoclingParseDocumentBackend, + # ) + ########################################################################### # Define input files diff --git a/poetry.lock b/poetry.lock index 45684d25..c4a4e559 100644 --- a/poetry.lock +++ b/poetry.lock @@ -450,101 +450,116 @@ files = [ 
[[package]] name = "charset-normalizer" -version = "3.3.2" +version = "3.4.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7.0" files = [ - {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, - {file = 
"charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, - {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, - {file = 
"charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = 
"sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, - {file = 
"charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, - {file = 
"charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, - {file = 
"charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, - {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, + {file = 
"charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = 
"sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash 
= "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, + {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, + {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, ] [[package]] @@ -2493,13 +2508,13 @@ files = [ [[package]] name = "llama-index-core" -version = "0.11.16" +version = "0.11.17" description = "Interface between LLMs and your data" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index_core-0.11.16-py3-none-any.whl", hash = "sha256:099ba785e357506fd5a24c1a6b8fa5286366d6c71637649fab0f9126dcea842c"}, - {file = "llama_index_core-0.11.16.tar.gz", hash = "sha256:232a5cebcc73b951d9c663bd30ed59de5356dbd8f9ab88024d19c88bdd1b3254"}, + {file = "llama_index_core-0.11.17-py3-none-any.whl", hash = "sha256:d65565b54ea55b2db12f9a1cd5c250b770d7e43d3363137cff431a6116ef069c"}, + {file = "llama_index_core-0.11.17.tar.gz", hash = "sha256:1143baf8d819e27555bdb142abdf2833d3d37731f270f46fa1e07fc4b97116ae"}, ] [package.dependencies] @@ -6125,6 +6140,41 @@ files = [ doc = ["reno", "sphinx"] test = ["pytest", "tornado (>=4.5)", "typeguard"] +[[package]] +name = "tesserocr" +version = "2.7.1" +description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython" 
+optional = true +python-versions = "*" +files = [ + {file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"}, + {file = "tesserocr-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bb5d336ebf2cc47cd0d117cadc8b25b2e558f54fb9a2dedaa28a14cb5a6b437"}, + {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3ff7f6d6b5c12dd31b80842eb0892b661a41ca3edf0e6cc1e54ec2c14552ceef"}, + {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ae794c5434373f4afa4c7f8b59f19fde810f8caf096d8bb701a4b2f3a6739460"}, + {file = "tesserocr-2.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0a0895a4d9ff6a34f5a6f203fe0c9899f31d6f2378ae99be80605637b622687b"}, + {file = "tesserocr-2.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c3187d14b95c866aa1d34cc374a53d583e2168742eefe33347e4790af70338e"}, + {file = "tesserocr-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec52be3d82136430081427062ad0211a52fc38fa28fe58e216b89f840354f216"}, + {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:44e71b3e8da36b2567760309398689ea9785ee62db3ff21140a9ea6941a233c4"}, + {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e31a49d7784e7e52fe656719145c3a872856d67daa9bfb340c2990db00e023e9"}, + {file = "tesserocr-2.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:37abde15c1c940d691305fd87836e4cad25a1434799729c324bbcd2277bcae44"}, + {file = "tesserocr-2.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1b6349d35d333d420d24acf1953ad6f1d5613ffcde462c62126b68bdfca12753"}, + {file = "tesserocr-2.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42f009cde8479f3b339da12a8e419fd9559b64b13bc08a248bd0833c6ae94331"}, + {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e13204b3b92fac76ece6e33f55eba6335b30e379f4a7b75e285c2ad05762027"}, + {file = 
"tesserocr-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:65afdec0c5dc09a4a23a62e65524989cd940af41be1603e251a64ac10de9babf"}, + {file = "tesserocr-2.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5f59fb072c90bff8aa6a365fc82b747c2668b7b48233901728b155860d1ff9"}, + {file = "tesserocr-2.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f62d662e3002868384e14e8cd620bdedf34ab9f9fc3ebbce527cfe032a7485ee"}, + {file = "tesserocr-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e80051812685bd521bc17cb70cf1480ffbb3e54ccc2883e90d5bcda15f8278ea"}, + {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:2690cb2330fc9349d68ff027cbdac09693fdda36470836b196c04f16dcc99e9d"}, + {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d01ebd094103451ecb77b6510ade2f6bb064c51413ff35b135f649f3d6067a67"}, + {file = "tesserocr-2.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8069ae6cd9ea3c056b6a596bc99f501ee9f95d6fd2928fcaffb9777071c210d"}, + {file = "tesserocr-2.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2d3d23223d0a448877fb91af83c46ce95ff0a497a82fa93e93068148c9712e5"}, + {file = "tesserocr-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef8a09a44c2e96bab0f40dbf0633767d063680d86b79365b43fc4e1234219694"}, + {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6e613213ea5b64db06f2cba0b93c3656b7e6aec2d9b2d2e929edf49da7143225"}, + {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4a8888b765e26680a6e34b8ec09b7bb85a17e08cea76f0661eafe2a84254562a"}, + {file = "tesserocr-2.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64f25763e56c4c29b808e59b485c930cac46b6a1ac8eadd994086dc40a29d3a1"}, + {file = "tesserocr-2.7.1.tar.gz", hash = "sha256:3744c5c8bbabf18172849c7731be00dc2e5e44f8c556d37c850e788794ae0af4"}, +] + [[package]] name = "threadpoolctl" version = "3.5.0" @@ -7330,7 +7380,10 @@ enabler = ["pytest-enabler (>=2.2)"] test = 
["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] +[extras] +tesserocr = ["tesserocr"] + [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "5ef87a880333213955e3ded6bcf0748f6728e4501a98bd5bf9421057de745772" +content-hash = "71eec93c5fc347a7c0ae0d846d4c2c41ff96255aab218d7d2ba747d1ffed942e" diff --git a/pyproject.toml b/pyproject.toml index 41100a47..411550ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ pydantic-settings = "^2.3.0" huggingface_hub = ">=0.23,<1" requests = "^2.32.3" easyocr = "^1.7" +tesserocr = { version = "^2.7.1", optional = true } docling-parse = "^1.4.1" certifi = ">=2024.7.4" rtree = "^1.3.0" @@ -56,6 +57,7 @@ typer = "^0.12.5" python-docx = "^1.1.2" python-pptx = "^1.0.2" beautifulsoup4 = "^4.12.3" +pandas = "^2.1.4" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"} @@ -70,7 +72,7 @@ pytest-xdist = "^3.3.1" types-requests = "^2.31.0.2" flake8-pyproject = "^1.2.3" pylint = "^2.17.5" -pandas-stubs = "^2.2.2.240909" +pandas-stubs = "^2.1.4.231227" ipykernel = "^6.29.5" ipywidgets = "^8.1.5" nbqa = "^1.9.0" @@ -85,6 +87,9 @@ langchain-huggingface = "^0.0.3" langchain-milvus = "^0.1.4" langchain-text-splitters = "^0.2.4" +[tool.poetry.extras] +tesserocr = ["tesserocr"] + [tool.poetry.scripts] docling = "docling.cli.main:app" diff --git a/tests/data_scanned/ocr_test.doctags.txt b/tests/data_scanned/ocr_test.doctags.txt new file mode 100644 index 00000000..7cd53510 --- /dev/null +++ b/tests/data_scanned/ocr_test.doctags.txt @@ -0,0 +1,3 @@ + +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package + \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.json b/tests/data_scanned/ocr_test.json new file mode 100644 index 00000000..775ab8e7 --- /dev/null +++ b/tests/data_scanned/ocr_test.json @@ 
-0,0 +1 @@ +{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.41791534423828, 690.8074951171875, 509.4447021484375, 767.422119140625], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.md b/tests/data_scanned/ocr_test.md new file mode 100644 index 00000000..42896546 --- /dev/null +++ b/tests/data_scanned/ocr_test.md @@ -0,0 +1 @@ +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.pages.json b/tests/data_scanned/ocr_test.pages.json new file mode 100644 index 00000000..8d0a6a3b --- /dev/null +++ b/tests/data_scanned/ocr_test.pages.json @@ -0,0 +1 @@ +[{"page_no": 0, "size": {"width": 595.201171875, "height": 
841.9216918945312}, "cells": [{"id": 0, "text": "Docling", "bbox": {"l": 74.0, "t": 78.0, "r": 144.0, "b": 96.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "bundles", "bbox": {"l": 150.66666666666666, "t": 78.0, "r": 224.66666666666666, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "PDF", "bbox": {"l": 232.0, "t": 78.0, "r": 269.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "document", "bbox": {"l": 275.0, "t": 78.0, "r": 371.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "conversion", "bbox": {"l": 377.3333333333333, "t": 78.0, "r": 479.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "to", "bbox": {"l": 485.3333333333333, "t": 79.66666666666663, "r": 503.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "JSON", "bbox": {"l": 72.33333333333333, "t": 104.66666666666663, "r": 121.33333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "and", "bbox": {"l": 129.0, "t": 105.0, "r": 162.33333333333334, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Markdown", "bbox": {"l": 170.33333333333334, "t": 105.0, "r": 265.6666666666667, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "in", "bbox": {"l": 273.3333333333333, "t": 105.0, "r": 287.6666666666667, "b": 119.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "an", "bbox": {"l": 294.6666666666667, "t": 108.66666666666663, "r": 316.0, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "easy", "bbox": {"l": 323.0, "t": 108.66666666666663, "r": 365.6666666666667, "b": 123.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "self", "bbox": {"l": 371.0, "t": 105.0, "r": 404.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "contained", "bbox": {"l": 410.3333333333333, "t": 105.0, "r": 
503.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "package", "bbox": {"l": 73.66666666666667, "t": 131.66666666666663, "r": 150.66666666666666, "b": 150.33333333333337, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 71.41791534423828, "t": 74.49958801269531, "r": 509.4447021484375, "b": 151.11419677734375, "coord_origin": "TOPLEFT"}, "confidence": 0.9238373041152954, "cells": [{"id": 0, "text": "Docling", "bbox": {"l": 74.0, "t": 78.0, "r": 144.0, "b": 96.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "bundles", "bbox": {"l": 150.66666666666666, "t": 78.0, "r": 224.66666666666666, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "PDF", "bbox": {"l": 232.0, "t": 78.0, "r": 269.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "document", "bbox": {"l": 275.0, "t": 78.0, "r": 371.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "conversion", "bbox": {"l": 377.3333333333333, "t": 78.0, "r": 479.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "to", "bbox": {"l": 485.3333333333333, "t": 79.66666666666663, "r": 503.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "JSON", "bbox": {"l": 72.33333333333333, "t": 104.66666666666663, "r": 121.33333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "and", "bbox": {"l": 129.0, "t": 105.0, "r": 162.33333333333334, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Markdown", "bbox": {"l": 170.33333333333334, "t": 105.0, "r": 265.6666666666667, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "in", "bbox": {"l": 273.3333333333333, "t": 105.0, "r": 287.6666666666667, "b": 119.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "an", "bbox": {"l": 294.6666666666667, "t": 
108.66666666666663, "r": 316.0, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "easy", "bbox": {"l": 323.0, "t": 108.66666666666663, "r": 365.6666666666667, "b": 123.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "self", "bbox": {"l": 371.0, "t": 105.0, "r": 404.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "contained", "bbox": {"l": 410.3333333333333, "t": 105.0, "r": 503.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "package", "bbox": {"l": 73.66666666666667, "t": 131.66666666666663, "r": 150.66666666666666, "b": 150.33333333333337, "coord_origin": "TOPLEFT"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 71.41791534423828, "t": 74.49958801269531, "r": 509.4447021484375, "b": 151.11419677734375, "coord_origin": "TOPLEFT"}, "confidence": 0.9238373041152954, "cells": [{"id": 0, "text": "Docling", "bbox": {"l": 74.0, "t": 78.0, "r": 144.0, "b": 96.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "bundles", "bbox": {"l": 150.66666666666666, "t": 78.0, "r": 224.66666666666666, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "PDF", "bbox": {"l": 232.0, "t": 78.0, "r": 269.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "document", "bbox": {"l": 275.0, "t": 78.0, "r": 371.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "conversion", "bbox": {"l": 377.3333333333333, "t": 78.0, "r": 479.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "to", "bbox": {"l": 485.3333333333333, "t": 79.66666666666663, "r": 503.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "JSON", "bbox": {"l": 72.33333333333333, "t": 
104.66666666666663, "r": 121.33333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "and", "bbox": {"l": 129.0, "t": 105.0, "r": 162.33333333333334, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Markdown", "bbox": {"l": 170.33333333333334, "t": 105.0, "r": 265.6666666666667, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "in", "bbox": {"l": 273.3333333333333, "t": 105.0, "r": 287.6666666666667, "b": 119.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "an", "bbox": {"l": 294.6666666666667, "t": 108.66666666666663, "r": 316.0, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "easy", "bbox": {"l": 323.0, "t": 108.66666666666663, "r": 365.6666666666667, "b": 123.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "self", "bbox": {"l": 371.0, "t": 105.0, "r": 404.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "contained", "bbox": {"l": 410.3333333333333, "t": 105.0, "r": 503.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "package", "bbox": {"l": 73.66666666666667, "t": 131.66666666666663, "r": 150.66666666666666, "b": 150.33333333333337, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 71.41791534423828, "t": 74.49958801269531, "r": 509.4447021484375, "b": 151.11419677734375, "coord_origin": "TOPLEFT"}, "confidence": 0.9238373041152954, "cells": [{"id": 0, "text": "Docling", "bbox": {"l": 74.0, "t": 78.0, "r": 144.0, "b": 96.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "bundles", "bbox": {"l": 150.66666666666666, "t": 78.0, "r": 224.66666666666666, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "PDF", "bbox": {"l": 232.0, 
"t": 78.0, "r": 269.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 3, "text": "document", "bbox": {"l": 275.0, "t": 78.0, "r": 371.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 4, "text": "conversion", "bbox": {"l": 377.3333333333333, "t": 78.0, "r": 479.0, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 5, "text": "to", "bbox": {"l": 485.3333333333333, "t": 79.66666666666663, "r": 503.3333333333333, "b": 92.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 6, "text": "JSON", "bbox": {"l": 72.33333333333333, "t": 104.66666666666663, "r": 121.33333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 7, "text": "and", "bbox": {"l": 129.0, "t": 105.0, "r": 162.33333333333334, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 8, "text": "Markdown", "bbox": {"l": 170.33333333333334, "t": 105.0, "r": 265.6666666666667, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 9, "text": "in", "bbox": {"l": 273.3333333333333, "t": 105.0, "r": 287.6666666666667, "b": 119.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 10, "text": "an", "bbox": {"l": 294.6666666666667, "t": 108.66666666666663, "r": 316.0, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 11, "text": "easy", "bbox": {"l": 323.0, "t": 108.66666666666663, "r": 365.6666666666667, "b": 123.33333333333337, "coord_origin": "TOPLEFT"}}, {"id": 12, "text": "self", "bbox": {"l": 371.0, "t": 105.0, "r": 404.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 13, "text": "contained", "bbox": {"l": 410.3333333333333, "t": 105.0, "r": 503.3333333333333, "b": 119.66666666666663, "coord_origin": "TOPLEFT"}}, {"id": 14, "text": "package", "bbox": {"l": 73.66666666666667, "t": 131.66666666666663, "r": 150.66666666666666, "b": 150.33333333333337, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": 
[]}}] \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.pdf b/tests/data_scanned/ocr_test.pdf new file mode 100644 index 00000000..b79f3c28 Binary files /dev/null and b/tests/data_scanned/ocr_test.pdf differ diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py new file mode 100644 index 00000000..804ecd8d --- /dev/null +++ b/tests/test_e2e_ocr_conversion.py @@ -0,0 +1,104 @@ +from pathlib import Path +from typing import List + +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + OcrOptions, + PdfPipelineOptions, + PipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) +from docling.document_converter import DocumentConverter, PdfFormatOption + +from .verify_utils import verify_conversion_result + +GENERATE = True + + +# Debug +def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str): + r""" """ + import json + import os + + parent = pdf_path.parent + eng = "" if engine is None else f".{engine}" + + dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json") + with open(dict_fn, "w") as fd: + json.dump(doc_result.render_as_dict(), fd) + + pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json") + pages = [p.model_dump() for p in doc_result.pages] + with open(pages_fn, "w") as fd: + json.dump(pages, fd) + + doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt") + with open(doctags_fn, "w") as fd: + fd.write(doc_result.render_as_doctags()) + + md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md") + with open(md_fn, "w") as fd: + fd.write(doc_result.render_as_markdown()) + + +def get_pdf_paths(): + # Define the directory you want to search + directory = Path("./tests/data_scanned") + + # List all PDF files in the directory and its subdirectories + pdf_files 
= sorted(directory.rglob("*.pdf")) + return pdf_files + + +def get_converter(ocr_options: OcrOptions): + pipeline_options = PdfPipelineOptions() + pipeline_options.do_ocr = True + pipeline_options.do_table_structure = True + pipeline_options.table_structure_options.do_cell_matching = True + pipeline_options.ocr_options = ocr_options + + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + backend=DoclingParseDocumentBackend, + ) + } + ) + + return converter + + +def test_e2e_conversions(): + + pdf_paths = get_pdf_paths() + + engines: List[OcrOptions] = [ + EasyOcrOptions(), + TesseractOcrOptions(), + TesseractCliOcrOptions(), + ] + + for ocr_options in engines: + print(f"Converting with ocr_engine: {ocr_options.kind}") + converter = get_converter(ocr_options=ocr_options) + for pdf_path in pdf_paths: + print(f"converting {pdf_path}") + + doc_result: ConversionResult = converter.convert_single(pdf_path) + + # Save conversions + # save_output(pdf_path, doc_result, None) + + # Debug + verify_conversion_result( + input_path=pdf_path, + doc_result=doc_result, + generate=GENERATE, + skip_cells=True, + ) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index a0b0f0e6..082b7c78 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt): def verify_conversion_result( - input_path: Path, doc_result: ConversionResult, generate=False + input_path: Path, + doc_result: ConversionResult, + generate: bool = False, + ocr_engine: str = None, + skip_cells: bool = False, ): PageList = TypeAdapter(List[Page]) @@ -143,10 +147,11 @@ def verify_conversion_result( doc_pred_md = doc_result.render_as_markdown() doc_pred_dt = doc_result.render_as_doctags() - pages_path = input_path.with_suffix(".pages.json") - json_path = input_path.with_suffix(".json") - md_path = input_path.with_suffix(".md") - dt_path = input_path.with_suffix(".doctags.txt") + 
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" + pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json") + json_path = input_path.with_suffix(f"{engine_suffix}.json") + md_path = input_path.with_suffix(f"{engine_suffix}.md") + dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt") if generate: # only used when re-generating truth with open(pages_path, "w") as fw: @@ -173,9 +178,10 @@ def verify_conversion_result( with open(dt_path, "r") as fr: doc_true_dt = fr.read() - assert verify_cells( - doc_pred_pages, doc_true_pages - ), f"Mismatch in PDF cell prediction for {input_path}" + if not skip_cells: + assert verify_cells( + doc_pred_pages, doc_true_pages + ), f"Mismatch in PDF cell prediction for {input_path}" # assert verify_output( # doc_pred, doc_true