mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
remove cgi, download in chunks
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
33d5d7d787
commit
02cf8e576d
@ -1,4 +1,3 @@
|
||||
import cgi
|
||||
import functools
|
||||
import logging
|
||||
import tempfile
|
||||
@ -6,8 +5,8 @@ import time
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type, Union
|
||||
from urllib.request import urlopen
|
||||
|
||||
import requests
|
||||
from docling_core.types import Document
|
||||
from PIL import ImageDraw
|
||||
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
||||
@ -102,17 +101,23 @@ class DocumentConverter:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
try:
|
||||
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
||||
with urlopen(str(source)) as resp:
|
||||
cont_disp = resp.info().get("Content-Disposition")
|
||||
content = resp.read()
|
||||
if cont_disp:
|
||||
_, params = cgi.parse_header(cont_disp)
|
||||
filename = params.get("filename", self._default_download_filename)
|
||||
else:
|
||||
filename = http_url.path or self._default_download_filename
|
||||
local_path = Path(temp_dir) / filename
|
||||
res = requests.get(http_url, stream=True)
|
||||
res.raise_for_status()
|
||||
fname = None
|
||||
# try to get filename from response header
|
||||
if cont_disp := res.headers.get("Content-Disposition"):
|
||||
for par in cont_disp.strip().split(";"):
|
||||
# currently only handling directive "filename" (not the extended "filename*")
|
||||
if (split := par.split("=")) and split[0].strip() == "filename":
|
||||
fname = "=".join(split[1:]).strip().strip("'\"") or None
|
||||
break
|
||||
# otherwise, use name from URL:
|
||||
if fname is None:
|
||||
fname = Path(http_url.path).name or self._default_download_filename
|
||||
local_path = Path(temp_dir) / fname
|
||||
with open(local_path, "wb") as f:
|
||||
f.write(content)
|
||||
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
|
||||
f.write(chunk)
|
||||
except ValidationError:
|
||||
try:
|
||||
local_path = TypeAdapter(Path).validate_python(source)
|
||||
|
3
poetry.lock
generated
3
poetry.lock
generated
@ -2510,6 +2510,7 @@ description = "Nvidia JIT LTO Library"
|
||||
optional = false
|
||||
python-versions = ">=3"
|
||||
files = [
|
||||
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"},
|
||||
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
|
||||
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
|
||||
]
|
||||
@ -4881,4 +4882,4 @@ ocr = ["easyocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "3ffc5161fd49fe2186ee2afbb3319922964661c769c434fc7386aae40f4aab19"
|
||||
content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00"
|
||||
|
@ -30,6 +30,7 @@ filetype = "^1.2.0"
|
||||
pypdfium2 = "^4.30.0"
|
||||
pydantic-settings = "^2.3.0"
|
||||
huggingface_hub = ">=0.23,<1"
|
||||
requests = "^2.32.3"
|
||||
easyocr = { version = "^1.7", optional = true }
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
Loading…
Reference in New Issue
Block a user