remove cgi, download in chunks

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2024-07-26 16:48:42 +02:00
parent 33d5d7d787
commit 02cf8e576d
3 changed files with 20 additions and 13 deletions

View File

@ -1,4 +1,3 @@
import cgi
import functools import functools
import logging import logging
import tempfile import tempfile
@ -6,8 +5,8 @@ import time
import traceback import traceback
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type, Union from typing import Iterable, Optional, Type, Union
from urllib.request import urlopen
import requests
from docling_core.types import Document from docling_core.types import Document
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
@ -102,17 +101,23 @@ class DocumentConverter:
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
try: try:
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source) http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
with urlopen(str(source)) as resp: res = requests.get(http_url, stream=True)
cont_disp = resp.info().get("Content-Disposition") res.raise_for_status()
content = resp.read() fname = None
if cont_disp: # try to get filename from response header
_, params = cgi.parse_header(cont_disp) if cont_disp := res.headers.get("Content-Disposition"):
filename = params.get("filename", self._default_download_filename) for par in cont_disp.strip().split(";"):
else: # currently only handling directive "filename" (not "filename*")
filename = http_url.path or self._default_download_filename if (split := par.split("=")) and split[0].strip() == "filename":
local_path = Path(temp_dir) / filename fname = "=".join(split[1:]).strip().strip("'\"") or None
break
# otherwise, use name from URL:
if fname is None:
fname = Path(http_url.path).name or self._default_download_filename
local_path = Path(temp_dir) / fname
with open(local_path, "wb") as f: with open(local_path, "wb") as f:
f.write(content) for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
f.write(chunk)
except ValidationError: except ValidationError:
try: try:
local_path = TypeAdapter(Path).validate_python(source) local_path = TypeAdapter(Path).validate_python(source)

3
poetry.lock generated
View File

@ -2510,6 +2510,7 @@ description = "Nvidia JIT LTO Library"
optional = false optional = false
python-versions = ">=3" python-versions = ">=3"
files = [ files = [
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"},
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
] ]
@ -4881,4 +4882,4 @@ ocr = ["easyocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "3ffc5161fd49fe2186ee2afbb3319922964661c769c434fc7386aae40f4aab19" content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00"

View File

@ -30,6 +30,7 @@ filetype = "^1.2.0"
pypdfium2 = "^4.30.0" pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0" pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1" huggingface_hub = ">=0.23,<1"
requests = "^2.32.3"
easyocr = { version = "^1.7", optional = true } easyocr = { version = "^1.7", optional = true }
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]