From 02cf8e576db5d05c994a203882dbabdf485cf331 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 26 Jul 2024 16:48:42 +0200 Subject: [PATCH] remove cgi, download in chunks Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/document_converter.py | 29 +++++++++++++++++------------ poetry.lock | 3 ++- pyproject.toml | 1 + 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/docling/document_converter.py b/docling/document_converter.py index ec9a3f29..95b30a06 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -1,4 +1,3 @@ -import cgi import functools import logging import tempfile @@ -6,8 +5,8 @@ import time import traceback from pathlib import Path from typing import Iterable, Optional, Type, Union -from urllib.request import urlopen +import requests from docling_core.types import Document from PIL import ImageDraw from pydantic import AnyHttpUrl, TypeAdapter, ValidationError @@ -102,17 +101,23 @@ class DocumentConverter: with tempfile.TemporaryDirectory() as temp_dir: try: http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source) - with urlopen(str(source)) as resp: - cont_disp = resp.info().get("Content-Disposition") - content = resp.read() - if cont_disp: - _, params = cgi.parse_header(cont_disp) - filename = params.get("filename", self._default_download_filename) - else: - filename = http_url.path or self._default_download_filename - local_path = Path(temp_dir) / filename + res = requests.get(http_url, stream=True) + res.raise_for_status() + fname = None + # try to get filename from response header + if cont_disp := res.headers.get("Content-Disposition"): + for par in cont_disp.strip().split(";"): + # currently only handling directive "filename" (not "*filename") + if (split := par.split("=")) and split[0].strip() == "filename": + fname = "=".join(split[1:]).strip().strip("'\"") or None + break + # otherwise, use name from URL: + if fname is None: + fname = Path(http_url.path).name or self._default_download_filename + local_path = Path(temp_dir) / fname with open(local_path, "wb") as f: - f.write(content) + for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks + f.write(chunk) except ValidationError: try: local_path = TypeAdapter(Path).validate_python(source) diff --git a/poetry.lock b/poetry.lock index 09f695a0..9715593d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2510,6 +2510,7 @@ description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] @@ -4881,4 +4882,4 @@ ocr = ["easyocr"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "3ffc5161fd49fe2186ee2afbb3319922964661c769c434fc7386aae40f4aab19" +content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00" diff --git a/pyproject.toml b/pyproject.toml index 544af9ae..a142eac2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ filetype = "^1.2.0" pypdfium2 = "^4.30.0" pydantic-settings = "^2.3.0" huggingface_hub = ">=0.23,<1" +requests = "^2.32.3" easyocr = { version = "^1.7", optional = true } [tool.poetry.group.dev.dependencies]