From e5fd579861383d99d578b4e24399fdfb2a4ba62c Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Tue, 17 Jun 2025 16:52:34 +0200
Subject: [PATCH] added openai-whisper as a first transcription model

Signed-off-by: Peter Staar
---
 docling/backend/audio_backend.py |   2 +-
 docling/cli/main.py              |   2 +-
 docling/datamodel/base_models.py |   6 +-
 docling/datamodel/document.py    |   5 +-
 docling/document_converter.py    |   4 +-
 docling/pipeline/asr_pipeline.py | 353 ++++++-------------------
 pyproject.toml                   |   4 +-
 uv.lock                          |  67 +++++-
 8 files changed, 132 insertions(+), 311 deletions(-)

diff --git a/docling/backend/audio_backend.py b/docling/backend/audio_backend.py
index 289356ab..d0958276 100644
--- a/docling/backend/audio_backend.py
+++ b/docling/backend/audio_backend.py
@@ -53,7 +53,7 @@ class AudioBackend(DeclarativeDocumentBackend):
 
     @classmethod
     def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.AUDIO_WAV}
+        return {InputFormat.AUDIO}
 
     def convert(self) -> DoclingDocument:
         """
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 7ad810f1..54d5fc67 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -652,7 +652,7 @@ def convert(  # noqa: C901
         )
 
         format_options = {
-            InputFormat.AUDIO_WAV: audio_format_option,
+            InputFormat.AUDIO: audio_format_option,
         }
 
         if artifacts_path is not None:
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index dd6f7406..5426fb4d 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -49,7 +49,7 @@ class InputFormat(str, Enum):
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
-    AUDIO_WAV = "wav"
+    AUDIO = "wav"
 
 
 class OutputFormat(str, Enum):
@@ -74,7 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
-    InputFormat.AUDIO_WAV: ["wav"],
+    InputFormat.AUDIO: ["wav"],
 }
 
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -106,7 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.JSON_DOCLING: ["application/json"],
-    InputFormat.AUDIO_WAV: ["audio/wav", "audio/x-wav"],
+    InputFormat.AUDIO: ["audio/wav", "audio/x-wav"],
 }
 
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 73b2bfbe..9f5cf82c 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -280,12 +280,9 @@ class _DocumentConversionInput(BaseModel):
 
         if isinstance(obj, Path):
             mime = filetype.guess_mime(str(obj))
-            print(f"mime: {mime}")
             if mime is None:
                 ext = obj.suffix[1:]
-                print(f"ext: {ext}")
                 mime = _DocumentConversionInput._mime_from_extension(ext)
-                print(f"mime: {mime}")
             if mime is None:  # must guess from
                 with obj.open("rb") as f:
                     content = f.read(1024)  # Read first 1KB
@@ -321,7 +318,7 @@ class _DocumentConversionInput(BaseModel):
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
-        print(formats)
+        _log.info(f"detected formats: {formats}")
 
         if formats:
             if len(formats) == 1 and mime not in ("text/plain"):
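Reviewer note: the rename from InputFormat.AUDIO_WAV to InputFormat.AUDIO keeps the
enum value "wav", so the extension and MIME tables above still resolve a .wav input
to the same format. A quick sanity check (illustrative only, run against the patched
docling.datamodel.base_models):

    from docling.datamodel.base_models import InputFormat, MimeTypeToFormat

    assert InputFormat.AUDIO.value == "wav"
    # "audio/wav" is claimed only by the audio format, so detection stays unambiguous.
    assert MimeTypeToFormat["audio/wav"] == [InputFormat.AUDIO]
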
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 258e92d0..4336c9bf 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -163,9 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.JSON_DOCLING: FormatOption(
             pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
         ),
-        InputFormat.AUDIO_WAV: FormatOption(
-            pipeline_cls=AsrPipeline, backend=AudioBackend
-        ),
+        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=AudioBackend),
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
index 173beffc..c8c39b16 100644
--- a/docling/pipeline/asr_pipeline.py
+++ b/docling/pipeline/asr_pipeline.py
@@ -5,14 +5,16 @@ from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
 
-import librosa  # type: ignore
-import numpy as np
-import soundfile as sf  # type: ignore
+import whisper  # type: ignore
+
+# import librosa
+# import numpy as np
+# import soundfile as sf  # type: ignore
 from docling_core.types.doc.labels import DocItemLabel
 from pydantic import BaseModel, Field, validator
-from pydub import AudioSegment  # type: ignore
-from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
+
+# from pydub import AudioSegment  # type: ignore
+# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.audio_backend import AudioBackend
 from docling.datamodel.base_models import (
@@ -36,6 +38,16 @@ from docling.utils.profiling import ProfilingScope, TimeRecorder
 _log = logging.getLogger(__name__)
 
 
+class _ConversationWord(BaseModel):
+    text: str
+    start_time: Optional[float] = Field(
+        None, description="Start time in seconds from video start"
+    )
+    end_time: Optional[float] = Field(
+        None, ge=0, description="End time in seconds from video start"
+    )
+
+
 class _ConversationItem(BaseModel):
     text: str
     start_time: Optional[float] = Field(
@@ -48,6 +60,9 @@ class _ConversationItem(BaseModel):
     speaker: Optional[str] = Field(
         None, description="Speaker name, defaults to speaker-{speaker_id}"
    )
+    words: Optional[list[_ConversationWord]] = Field(
+        None, description="Individual words with time-stamps"
+    )
 
     def __lt__(self, other):
         if not isinstance(other, _ConversationItem):
@@ -72,313 +87,65 @@ class _ConversationItem(BaseModel):
         return result
 
 
-class _WhisperASR:
-    def __init__(self, model_name: str = "openai/whisper-small"):
+class _NativeWhisperModel:
+    def __init__(self, model_name: str = "medium"):
         """
-        Transcriber using Hugging Face Transformers Whisper + energy-based VAD.
+        Transcriber using native Whisper.
         """
-        print(f"Loading Whisper model: {model_name}")
-        self.device = "cpu"
+        self.model = whisper.load_model(model_name)
 
-        self.transcriber = pipeline(
-            "automatic-speech-recognition",
-            model=model_name,
-            return_timestamps=True,
-            device=self.device,
-        )
-
-    def _energy_vad(
-        self,
-        y: np.ndarray,
-        sr: int,
-        frame_length=2048,
-        hop_length=512,
-        threshold_percentile=85,
-    ):
-        """
-        Simple energy-based VAD.
-        Returns list of (start_time, end_time) tuples for speech segments.
- """ - _log.debug(f"_energy_vad {sr}: ", y.shape) - energy = np.array( - [ - np.sum(np.abs(y[i : i + frame_length] ** 2)) - for i in range(0, len(y), hop_length) - ] - ) - _log.debug(f"energy: {energy}") - - threshold = np.percentile(energy, threshold_percentile) * 0.3 - _log.debug(f"threshold: {threshold}") - - speech_frames = energy > threshold - _log.debug(f"speech_frames: {speech_frames}") - - frame_times = librosa.frames_to_time( - np.arange(len(energy)), sr=sr, hop_length=hop_length - ) - - segments = [] - start_time = None - - for i, is_speech in enumerate(speech_frames): - t = frame_times[i] - if is_speech and start_time is None: - start_time = t - elif not is_speech and start_time is not None: - segments.append((start_time, t)) - start_time = None - - if start_time is not None: - segments.append((start_time, frame_times[-1])) - - return segments - - def _merge_vad_segments(self, segments, min_duration=5.0, max_gap=0.5): - """ - Merge short/adjacent speech segments to improve transcription quality. - """ - if not segments: - return [] - - merged = [] - current_start, current_end = segments[0] - - for start, end in segments[1:]: - gap = start - current_end - if gap <= max_gap or (current_end - current_start) < min_duration: - current_end = end # merge - else: - if current_end - current_start >= 1.0: # skip ultra-short - merged.append((current_start, current_end)) - current_start, current_end = start, end - - if current_end - current_start >= 1.0: - merged.append((current_start, current_end)) - - return merged + self.verbose = True + self.word_timestamps = True def run(self, conv_res: ConversionResult) -> ConversionResult: - """ - Transcribe audio using custom VAD and Whisper, returning timestamped segments. - Returns list of {"start", "end", "text"} dictionaries. - """ - audio_path = conv_res.input.file + audio_path: Path = Path(conv_res.input.file).resolve() - _log.info(f"Loading audio and resampling: {audio_path}") - y, sr = librosa.load(audio_path, sr=16000) - - speech_segments = self._energy_vad(y=y, sr=int(sr)) - speech_segments = self._merge_vad_segments(speech_segments) - _log.info("#-speech: ", len(speech_segments)) - - _log.info("Preparing AudioSegment for chunk slicing...") - pcm = (y * 32767).astype(np.int16).tobytes() - audio_seg = AudioSegment(data=pcm, sample_width=2, frame_rate=16000, channels=1) - - result = self._create_conversation_entries_v2(speech_segments, audio_seg) - result.sort() - - for _ in result: - conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string()) - - conv_res.status = ConversionStatus.SUCCESS - return conv_res - - def _create_conversation_entries_v1( - self, speech_segments, audio_seg - ) -> list[_ConversationItem]: - """ - Chunk audio based on speech_segments, transcribe with Whisper, - and return structured _ConversationItem items. 
- """ - results = [] - chunk_id = 0 - - for start, end in speech_segments: - duration = end - start - while duration > 0: - sub_end = min(start + 30.0, end) - chunk = audio_seg[start * 1000 : sub_end * 1000] - samples = ( - np.array(chunk.get_array_of_samples()).astype(np.float32) / 32768.0 - ) - - try: - _log.debug( - f"Transcribing chunk {chunk_id}: {start:.2f}s - {sub_end:.2f}s [{sub_end - start:.2f}]" - ) - result = self.transcriber(samples, return_timestamps=True) - - # Adjust timestamps globally - for seg in result["chunks"]: - t0, t1 = seg["timestamp"] - if t0 is None or t1 is None or t1 <= t0: - _log.warning(f"skipping bad segment: {seg}") - continue - - item = _ConversationItem( - text=seg["text"].strip(), - start_time=start + t0, - end_time=start + t1, - ) - results.append(item) - - start = sub_end - duration = end - start - chunk_id += 1 - except Exception as exc: - _log.error(f"Exception: {exc}") - - return results - - def _create_conversation_entries_v2( - self, speech_segments, audio_seg - ) -> list[_ConversationItem]: - """ - Chunk audio based on speech_segments, transcribe with Whisper, - and return structured _ConversationItem items. - """ - results = [] - chunk_id = 0 - - if len(speech_segments) == 0: - return [] - - any_valid = False - last_valid_offset: float = speech_segments[0][0] - - for start, end in speech_segments: - if any_valid: - last_valid_offset = min(start, last_valid_offset) - else: - last_valid_offset = start - - duration = end - last_valid_offset - - if duration > 0.2: - sub_end = min(last_valid_offset + 30.0, end) - - chunk_i0 = int(last_valid_offset * 1000) - chunk_i1 = int(sub_end * 1000) - - chunk = audio_seg[chunk_i0:chunk_i1] - samples = ( - np.array(chunk.get_array_of_samples()).astype(np.float32) / 32768.0 - ) - chunk_id += 1 - - try: - result = self.transcriber(samples, return_timestamps=True) - - any_valid = False - - last_valid_offset_ = last_valid_offset - - for seg in result["chunks"]: - t0, t1 = seg["timestamp"] - if t0 is None or t1 is None or t1 <= t0: - _log.warning(f" => skipping bad segment: {seg}") - continue - - global_start = round(last_valid_offset_ + t0, 2) - global_end = round(last_valid_offset_ + t1, 2) - text = seg["text"].strip() - - results.append( - _ConversationItem( - start_time=global_start, end_time=global_end, text=text - ) - ) - last_valid_offset = max(global_end, last_valid_offset) - any_valid = True - - if not any_valid: - _log.warning( - "No valid transcription in chunk, nudging forward 1s." 
-                        )
-                        last_valid_offset += 1.0
-
-                except Exception as e:
-                    _log.error(f"Whisper failed: {e}")
-                    last_valid_offset += 1.0
-
-                duration = end - last_valid_offset
-            else:
-                any_valid = False
-
-        return results
-
-
-class _WhisperModel:
-    def __init__(self):
-        _log.info("initialisation `_WhisperModel`")
-
-        self.device = "cpu"
-        self.chunk_length = 30
-
-        self.batch_size = 8
-
-        # self.model_repo = "openai/whisper-tiny"
-        # self.model_repo = "openai/whisper-small"
-        self.model_repo = "openai/whisper-medium"
-        # self.model_repo = "openai/whisper-large"
-
-        self.processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-        self.model = WhisperForConditionalGeneration.from_pretrained(
-            "openai/whisper-tiny"
-        )
-
-        # FIXME
-        self.max_new_tokens = 256
-
-        _log.info(f"model is loaded: {self.model_repo}")
-
-        self.pipe = pipeline(
-            "automatic-speech-recognition",
-            model=self.model_repo,
-            chunk_length_s=self.chunk_length,
-            device=self.device,
-        )
-
-    def run(self, conv_res: ConversionResult) -> ConversionResult:
-        return self._run_pipeline(conv_res=conv_res)
-
-    def _run_pipeline(self, conv_res: ConversionResult) -> ConversionResult:
         try:
-            fpath = conv_res.input.file
+            conversation = self.transcribe(audio_path)
 
-            array, sampling_rate = librosa.load(fpath, sr=16000)
-
-            prediction = self.pipe(
-                inputs=array, batch_size=self.batch_size, return_timestamps=True
-            )  # ["chunks"]
-
-            for _ in prediction["chunks"]:
-                item = _ConversationItem(
-                    text=_["text"],
-                    start_time=_["timestamp"][0],
-                    end_time=_["timestamp"][1],
-                )
-                conv_res.document.add_text(
-                    label=DocItemLabel.TEXT, text=item.to_string()
-                )
+            for _ in conversation:
+                conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string())
 
             conv_res.status = ConversionStatus.SUCCESS
-        except Exception as exc:
-            conv_res.status = ConversionStatus.FAILURE
-            _log.error(f"Failed to convert with {self.model_repo}: {exc}")
+            return conv_res
 
+        except Exception as exc:
+            _log.error(f"Audio transcription failed: {exc}")
+
+            conv_res.status = ConversionStatus.FAILURE
             return conv_res
 
+    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
+        result = self.model.transcribe(
+            str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
+        )
+
+        convo: list[_ConversationItem] = []
+        for _ in result["segments"]:
+            item = _ConversationItem(
+                start_time=_["start"], end_time=_["end"], text=_["text"], words=[]
+            )
+            for __ in _["words"]:
+                item.words.append(
+                    _ConversationWord(
+                        start_time=__["start"],
+                        end_time=__["end"],
+                        text=__["word"],
+                    )
+                )
+            convo.append(item)
+
+        return convo
+
 
 class AsrPipeline(BasePipeline):
     def __init__(self, pipeline_options: AsrPipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = True
 
-        self.pipeline_options: AsrPipelineOptions
+        self.pipeline_options: AsrPipelineOptions = pipeline_options
 
         artifacts_path: Optional[Path] = None
         if pipeline_options.artifacts_path is not None:
@@ -393,7 +160,7 @@ class AsrPipeline(BasePipeline):
             )
 
         # self._model = _WhisperModel()
-        self._model = _WhisperASR()
+        self._model = _NativeWhisperModel()
 
     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         status = ConversionStatus.SUCCESS
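Reviewer note: _NativeWhisperModel wraps openai-whisper's high-level API rather than
the transformers pipeline used before. A minimal standalone sketch of the same call
pattern (illustrative only: "sample.wav" is a placeholder path, and word_timestamps=
requires openai-whisper >= 20230314):

    import whisper

    model = whisper.load_model("medium")  # same default size as _NativeWhisperModel
    result = model.transcribe("sample.wav", verbose=True, word_timestamps=True)

    for seg in result["segments"]:
        # Segment-level timings, in seconds from the start of the audio ...
        print(f"[{seg['start']:.2f}-{seg['end']:.2f}] {seg['text'].strip()}")
        # ... plus per-word timings when word_timestamps=True.
        for w in seg["words"]:
            print(f"  [{w['start']:.2f}-{w['end']:.2f}] {w['word'].strip()}")
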
diff --git a/pyproject.toml b/pyproject.toml
index c4c4b2b2..cf5a6aed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,6 +72,7 @@ dependencies = [
     # 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
     "pydub[asr]>=0.25.1",
     "pyannote-audio[asr]>=1.1.2",
+    "openai-whisper[asr]>=20240930",
 ]
 
 [project.urls]
@@ -102,8 +103,7 @@ rapidocr = [
     # 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
 ]
 asr = [
-    "librosa>=0.11.0",
-    "soundfile>=0.13.1",
+    "openai-whisper>=20240930",
 ]
 
 [dependency-groups]
diff --git a/uv.lock b/uv.lock
index 58e736c6..768de133 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1091,6 +1091,7 @@ dependencies = [
     { name = "huggingface-hub" },
     { name = "lxml" },
     { name = "marko" },
+    { name = "openai-whisper" },
     { name = "openpyxl" },
     { name = "pandas" },
     { name = "pillow" },
@@ -1113,8 +1114,7 @@ dependencies = [
 
 [package.optional-dependencies]
 asr = [
-    { name = "librosa" },
-    { name = "soundfile" },
+    { name = "openai-whisper" },
 ]
 ocrmac = [
     { name = "ocrmac", marker = "sys_platform == 'darwin'" },
@@ -1185,12 +1185,13 @@ requires-dist = [
     { name = "easyocr", specifier = ">=1.7,<2.0" },
     { name = "filetype", specifier = ">=1.2.0,<2.0.0" },
     { name = "huggingface-hub", specifier = ">=0.23,<1" },
-    { name = "librosa", marker = "extra == 'asr'", specifier = ">=0.11.0" },
     { name = "lxml", specifier = ">=4.0.0,<6.0.0" },
     { name = "marko", specifier = ">=2.1.2,<3.0.0" },
     { name = "mlx-vlm", marker = "python_full_version >= '3.10' and platform_machine == 'arm64' and sys_platform == 'darwin' and extra == 'vlm'", specifier = ">=0.1.22" },
     { name = "ocrmac", marker = "sys_platform == 'darwin' and extra == 'ocrmac'", specifier = ">=1.0.0,<2.0.0" },
     { name = "onnxruntime", marker = "extra == 'rapidocr'", specifier = ">=1.7.0,<2.0.0" },
+    { name = "openai-whisper", marker = "extra == 'asr'", specifier = ">=20240930" },
+    { name = "openai-whisper", extras = ["asr"], specifier = ">=20240930" },
     { name = "openpyxl", specifier = ">=3.1.5,<4.0.0" },
     { name = "pandas", specifier = ">=2.1.4,<3.0.0" },
     { name = "pillow", specifier = ">=10.0.0,<12.0.0" },
@@ -1207,7 +1208,6 @@ requires-dist = [
     { name = "requests", specifier = ">=2.32.2,<3.0.0" },
     { name = "rtree", specifier = ">=1.3.0,<2.0.0" },
     { name = "scipy", specifier = ">=1.6.0,<2.0.0" },
-    { name = "soundfile", marker = "extra == 'asr'", specifier = ">=0.13.1" },
     { name = "tesserocr", marker = "extra == 'tesserocr'", specifier = ">=2.7.1,<3.0.0" },
     { name = "tqdm", specifier = ">=4.65.0,<5.0.0" },
     { name = "transformers", marker = "extra == 'vlm'", specifier = ">=4.46.0,<5.0.0" },
@@ -4467,6 +4467,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c3/16/873b955beda7bada5b0d798d3a601b2ff210e44ad5169f6d405b93892103/onnxruntime-1.22.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:64845709f9e8a2809e8e009bc4c8f73b788cee9c6619b7d9930344eae4c9cd36", size = 16427482, upload-time = "2025-05-09T20:26:20.376Z" },
 ]
 
+[[package]]
+name = "openai-whisper"
+version = "20240930"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "more-itertools" },
+    { name = "numba", version = "0.60.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
+    { name = "numba", version = "0.61.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "tiktoken" },
+    { name = "torch" },
+    { name = "tqdm" },
+    { name = "triton", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or sys_platform == 'linux2'" },
+]
"https://files.pythonhosted.org/packages/f5/77/952ca71515f81919bd8a6a4a3f89a27b09e73880cebf90957eda8f2f8545/openai-whisper-20240930.tar.gz", hash = "sha256:b7178e9c1615576807a300024f4daa6353f7e1a815dac5e38c33f1ef055dd2d2", size = 800544, upload-time = "2024-09-30T18:21:22.596Z" } + [[package]] name = "opencv-python" version = "4.10.0.84" @@ -7341,6 +7358,48 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4d/77/7f7dfcf2d847c1c1c63a2d4157c480eb4c74e4aa56e844008795ff01f86d/tifffile-2025.6.1-py3-none-any.whl", hash = "sha256:ff7163f1aaea519b769a2ac77c43be69e7d83e5b5d5d6a676497399de50535e5", size = 230624, upload-time = "2025-06-02T01:41:42.179Z" }, ] +[[package]] +name = "tiktoken" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "regex" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ea/cf/756fedf6981e82897f2d570dd25fa597eb3f4459068ae0572d7e888cfd6f/tiktoken-0.9.0.tar.gz", hash = "sha256:d02a5ca6a938e0490e1ff957bc48c8b078c88cb83977be1625b1fd8aac792c5d", size = 35991, upload-time = "2025-02-14T06:03:01.003Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/f3/50ec5709fad61641e4411eb1b9ac55b99801d71f1993c29853f256c726c9/tiktoken-0.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:586c16358138b96ea804c034b8acf3f5d3f0258bd2bc3b0227af4af5d622e382", size = 1065770, upload-time = "2025-02-14T06:02:01.251Z" }, + { url = "https://files.pythonhosted.org/packages/d6/f8/5a9560a422cf1755b6e0a9a436e14090eeb878d8ec0f80e0cd3d45b78bf4/tiktoken-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d9c59ccc528c6c5dd51820b3474402f69d9a9e1d656226848ad68a8d5b2e5108", size = 1009314, upload-time = "2025-02-14T06:02:02.869Z" }, + { url = "https://files.pythonhosted.org/packages/bc/20/3ed4cfff8f809cb902900ae686069e029db74567ee10d017cb254df1d598/tiktoken-0.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0968d5beeafbca2a72c595e8385a1a1f8af58feaebb02b227229b69ca5357fd", size = 1143140, upload-time = "2025-02-14T06:02:04.165Z" }, + { url = "https://files.pythonhosted.org/packages/f1/95/cc2c6d79df8f113bdc6c99cdec985a878768120d87d839a34da4bd3ff90a/tiktoken-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92a5fb085a6a3b7350b8fc838baf493317ca0e17bd95e8642f95fc69ecfed1de", size = 1197860, upload-time = "2025-02-14T06:02:06.268Z" }, + { url = "https://files.pythonhosted.org/packages/c7/6c/9c1a4cc51573e8867c9381db1814223c09ebb4716779c7f845d48688b9c8/tiktoken-0.9.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:15a2752dea63d93b0332fb0ddb05dd909371ededa145fe6a3242f46724fa7990", size = 1259661, upload-time = "2025-02-14T06:02:08.889Z" }, + { url = "https://files.pythonhosted.org/packages/cd/4c/22eb8e9856a2b1808d0a002d171e534eac03f96dbe1161978d7389a59498/tiktoken-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:26113fec3bd7a352e4b33dbaf1bd8948de2507e30bd95a44e2b1156647bc01b4", size = 894026, upload-time = "2025-02-14T06:02:12.841Z" }, + { url = "https://files.pythonhosted.org/packages/4d/ae/4613a59a2a48e761c5161237fc850eb470b4bb93696db89da51b79a871f1/tiktoken-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f32cc56168eac4851109e9b5d327637f15fd662aa30dd79f964b7c39fbadd26e", size = 1065987, upload-time = "2025-02-14T06:02:14.174Z" }, + { url = "https://files.pythonhosted.org/packages/3f/86/55d9d1f5b5a7e1164d0f1538a85529b5fcba2b105f92db3622e5d7de6522/tiktoken-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:45556bc41241e5294063508caf901bf92ba52d8ef9222023f83d2483a3055348", size = 1009155, upload-time = "2025-02-14T06:02:15.384Z" }, + { url = "https://files.pythonhosted.org/packages/03/58/01fb6240df083b7c1916d1dcb024e2b761213c95d576e9f780dfb5625a76/tiktoken-0.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03935988a91d6d3216e2ec7c645afbb3d870b37bcb67ada1943ec48678e7ee33", size = 1142898, upload-time = "2025-02-14T06:02:16.666Z" }, + { url = "https://files.pythonhosted.org/packages/b1/73/41591c525680cd460a6becf56c9b17468d3711b1df242c53d2c7b2183d16/tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b3d80aad8d2c6b9238fc1a5524542087c52b860b10cbf952429ffb714bc1136", size = 1197535, upload-time = "2025-02-14T06:02:18.595Z" }, + { url = "https://files.pythonhosted.org/packages/7d/7c/1069f25521c8f01a1a182f362e5c8e0337907fae91b368b7da9c3e39b810/tiktoken-0.9.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b2a21133be05dc116b1d0372af051cd2c6aa1d2188250c9b553f9fa49301b336", size = 1259548, upload-time = "2025-02-14T06:02:20.729Z" }, + { url = "https://files.pythonhosted.org/packages/6f/07/c67ad1724b8e14e2b4c8cca04b15da158733ac60136879131db05dda7c30/tiktoken-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:11a20e67fdf58b0e2dea7b8654a288e481bb4fc0289d3ad21291f8d0849915fb", size = 893895, upload-time = "2025-02-14T06:02:22.67Z" }, + { url = "https://files.pythonhosted.org/packages/cf/e5/21ff33ecfa2101c1bb0f9b6df750553bd873b7fb532ce2cb276ff40b197f/tiktoken-0.9.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e88f121c1c22b726649ce67c089b90ddda8b9662545a8aeb03cfef15967ddd03", size = 1065073, upload-time = "2025-02-14T06:02:24.768Z" }, + { url = "https://files.pythonhosted.org/packages/8e/03/a95e7b4863ee9ceec1c55983e4cc9558bcfd8f4f80e19c4f8a99642f697d/tiktoken-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a6600660f2f72369acb13a57fb3e212434ed38b045fd8cc6cdd74947b4b5d210", size = 1008075, upload-time = "2025-02-14T06:02:26.92Z" }, + { url = "https://files.pythonhosted.org/packages/40/10/1305bb02a561595088235a513ec73e50b32e74364fef4de519da69bc8010/tiktoken-0.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:95e811743b5dfa74f4b227927ed86cbc57cad4df859cb3b643be797914e41794", size = 1140754, upload-time = "2025-02-14T06:02:28.124Z" }, + { url = "https://files.pythonhosted.org/packages/1b/40/da42522018ca496432ffd02793c3a72a739ac04c3794a4914570c9bb2925/tiktoken-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99376e1370d59bcf6935c933cb9ba64adc29033b7e73f5f7569f3aad86552b22", size = 1196678, upload-time = "2025-02-14T06:02:29.845Z" }, + { url = "https://files.pythonhosted.org/packages/5c/41/1e59dddaae270ba20187ceb8aa52c75b24ffc09f547233991d5fd822838b/tiktoken-0.9.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:badb947c32739fb6ddde173e14885fb3de4d32ab9d8c591cbd013c22b4c31dd2", size = 1259283, upload-time = "2025-02-14T06:02:33.838Z" }, + { url = "https://files.pythonhosted.org/packages/5b/64/b16003419a1d7728d0d8c0d56a4c24325e7b10a21a9dd1fc0f7115c02f0a/tiktoken-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:5a62d7a25225bafed786a524c1b9f0910a1128f4232615bf3f8257a73aaa3b16", size = 894897, upload-time = "2025-02-14T06:02:36.265Z" }, + { url = "https://files.pythonhosted.org/packages/7a/11/09d936d37f49f4f494ffe660af44acd2d99eb2429d60a57c71318af214e0/tiktoken-0.9.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:2b0e8e05a26eda1249e824156d537015480af7ae222ccb798e5234ae0285dbdb", size = 1064919, upload-time = "2025-02-14T06:02:37.494Z" }, + { url = "https://files.pythonhosted.org/packages/80/0e/f38ba35713edb8d4197ae602e80837d574244ced7fb1b6070b31c29816e0/tiktoken-0.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:27d457f096f87685195eea0165a1807fae87b97b2161fe8c9b1df5bd74ca6f63", size = 1007877, upload-time = "2025-02-14T06:02:39.516Z" }, + { url = "https://files.pythonhosted.org/packages/fe/82/9197f77421e2a01373e27a79dd36efdd99e6b4115746ecc553318ecafbf0/tiktoken-0.9.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cf8ded49cddf825390e36dd1ad35cd49589e8161fdcb52aa25f0583e90a3e01", size = 1140095, upload-time = "2025-02-14T06:02:41.791Z" }, + { url = "https://files.pythonhosted.org/packages/f2/bb/4513da71cac187383541facd0291c4572b03ec23c561de5811781bbd988f/tiktoken-0.9.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc156cb314119a8bb9748257a2eaebd5cc0753b6cb491d26694ed42fc7cb3139", size = 1195649, upload-time = "2025-02-14T06:02:43Z" }, + { url = "https://files.pythonhosted.org/packages/fa/5c/74e4c137530dd8504e97e3a41729b1103a4ac29036cbfd3250b11fd29451/tiktoken-0.9.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cd69372e8c9dd761f0ab873112aba55a0e3e506332dd9f7522ca466e817b1b7a", size = 1258465, upload-time = "2025-02-14T06:02:45.046Z" }, + { url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669, upload-time = "2025-02-14T06:02:47.341Z" }, + { url = "https://files.pythonhosted.org/packages/c4/92/4d681b5c066d417b98f22a0176358d9e606e183c6b61c337d61fb54accb4/tiktoken-0.9.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c6386ca815e7d96ef5b4ac61e0048cd32ca5a92d5781255e13b31381d28667dc", size = 1066217, upload-time = "2025-02-14T06:02:49.259Z" }, + { url = "https://files.pythonhosted.org/packages/12/dd/af27bbe186df481666de48cf0f2f4e0643ba9c78b472e7bf70144c663b22/tiktoken-0.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:75f6d5db5bc2c6274b674ceab1615c1778e6416b14705827d19b40e6355f03e0", size = 1009441, upload-time = "2025-02-14T06:02:51.347Z" }, + { url = "https://files.pythonhosted.org/packages/33/35/2792b7dcb8b150d2767322637513c73a3e80833c19212efea80b31087894/tiktoken-0.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e15b16f61e6f4625a57a36496d28dd182a8a60ec20a534c5343ba3cafa156ac7", size = 1144423, upload-time = "2025-02-14T06:02:52.547Z" }, + { url = "https://files.pythonhosted.org/packages/65/ae/4d1682510172ce3500bbed3b206ebc4efefe280f0bf1179cfb043f88cc16/tiktoken-0.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebcec91babf21297022882344c3f7d9eed855931466c3311b1ad6b64befb3df", size = 1199002, upload-time = "2025-02-14T06:02:55.72Z" }, + { url = "https://files.pythonhosted.org/packages/1c/2e/df2dc31dd161190f315829775a9652ea01d60f307af8f98e35bdd14a6a93/tiktoken-0.9.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e5fd49e7799579240f03913447c0cdfa1129625ebd5ac440787afc4345990427", size = 1260610, upload-time = "2025-02-14T06:02:56.924Z" }, + { url = "https://files.pythonhosted.org/packages/70/22/e8fc1bf9cdecc439b7ddc28a45b976a8c699a38874c070749d855696368a/tiktoken-0.9.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:26242ca9dc8b58e875ff4ca078b9a94d2f0813e6a535dcd2205df5d49d927cc7", size = 894215, upload-time = "2025-02-14T06:02:59.031Z" }, +] + [[package]] name = "tinycss2" version = "1.4.0"