# -*- coding: utf-8 -*-

# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Downloader module for http:// and https:// URLs"""

import time
import mimetypes
from requests.exceptions import RequestException, ConnectionError, Timeout
from .common import DownloaderBase
from .. import text, util

from ssl import SSLError
try:
    from OpenSSL.SSL import Error as OpenSSLError
except ImportError:
    OpenSSLError = SSLError
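# pyOpenSSL is optional; when it is not installed, its error type is aliased
# to ssl.SSLError so the 'except' clause in _download_impl() accepts both.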


class HttpDownloader(DownloaderBase):
    scheme = "http"

    def __init__(self, job):
        DownloaderBase.__init__(self, job)
        extractor = job.extractor
        self.chunk_size = 16384
        self.downloading = False

        self.adjust_extension = self.config("adjust-extensions", True)
        self.minsize = self.config("filesize-min")
        self.maxsize = self.config("filesize-max")
        self.retries = self.config("retries", extractor._retries)
        self.timeout = self.config("timeout", extractor._timeout)
        self.verify = self.config("verify", extractor._verify)
        self.mtime = self.config("mtime", True)
        self.rate = self.config("rate")

        if self.retries < 0:
            self.retries = float("inf")
        if self.minsize:
            minsize = text.parse_bytes(self.minsize)
            if not minsize:
                self.log.warning("Invalid minimum filesize (%r)", self.minsize)
            self.minsize = minsize
        if self.maxsize:
            maxsize = text.parse_bytes(self.maxsize)
            if not maxsize:
                self.log.warning("Invalid maximum filesize (%r)", self.maxsize)
            self.maxsize = maxsize
        if self.rate:
            rate = text.parse_bytes(self.rate)
            if rate:
                if rate < self.chunk_size:
                    self.chunk_size = rate
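                    # keep a single read within the per-second budget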
                self.rate = rate
                self.receive = self._receive_rate
            else:
                self.log.warning("Invalid rate limit (%r)", self.rate)

    def download(self, url, pathfmt):
        try:
            return self._download_impl(url, pathfmt)
        except Exception:
            print()
            raise
        finally:
            # remove file from incomplete downloads
            if self.downloading and not self.part:
                util.remove_file(pathfmt.temppath)

    def _download_impl(self, url, pathfmt):
        response = None
        tries = 0
        msg = ""

        if self.part:
            pathfmt.part_enable(self.partdir)

        while True:
            if tries:
                if response:
                    response.close()
                self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
                if tries > self.retries:
                    return False
                time.sleep(tries)
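                # the delay grows by one second with every failed attempt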
            tries += 1

            headers = {}
            # check for .part file
            filesize = pathfmt.part_size()
            if filesize:
                headers["Range"] = "bytes={}-".format(filesize)
            # file-specific headers
            extra = pathfmt.kwdict.get("_http_headers")
            if extra:
                headers.update(extra)

            # connect to (remote) source
            try:
                response = self.session.request(
                    "GET", url, stream=True, headers=headers,
                    timeout=self.timeout, verify=self.verify)
            except (ConnectionError, Timeout) as exc:
                msg = str(exc)
                continue
            except Exception as exc:
                self.log.warning(exc)
                return False

            # check response
            code = response.status_code
            if code == 200:  # OK
                offset = 0
                size = response.headers.get("Content-Length")
            elif code == 206:  # Partial Content
                offset = filesize
                size = response.headers["Content-Range"].rpartition("/")[2]
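                # e.g. "Content-Range: bytes 12345-99999/100000";
                # the total size is the part after the final "/"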
            elif code == 416 and filesize:  # Requested Range Not Satisfiable
                break
            else:
                msg = "'{} {}' for '{}'".format(code, response.reason, url)
                if code == 429 or 500 <= code < 600:  # Server Error
                    continue
                self.log.warning(msg)
                return False

            # check filesize
            size = text.parse_int(size, None)
            if size is not None:
                if self.minsize and size < self.minsize:
                    self.log.warning(
                        "File size smaller than allowed minimum (%s < %s)",
                        size, self.minsize)
                    return False
                if self.maxsize and size > self.maxsize:
                    self.log.warning(
                        "File size larger than allowed maximum (%s > %s)",
                        size, self.maxsize)
                    return False

            # set missing filename extension
            if not pathfmt.extension:
                pathfmt.set_extension(self.get_extension(response))
                if pathfmt.exists():
                    pathfmt.temppath = ""
                    return True

            # set open mode
            if not offset:
                mode = "w+b"
                if filesize:
                    self.log.debug("Unable to resume partial download")
            else:
                mode = "r+b"
                self.log.debug("Resuming download at byte %d", offset)

            # start downloading
            self.out.start(pathfmt.path)
            self.downloading = True
            with pathfmt.open(mode) as file:
                if offset:
                    file.seek(offset)

                # download content
                try:
                    self.receive(response, file)
                except (RequestException, SSLError, OpenSSLError) as exc:
                    msg = str(exc)
                    print()
                    continue

                # check filesize
                if size and file.tell() < size:
                    msg = "filesize mismatch ({} < {})".format(
                        file.tell(), size)
                    print()
                    continue

                # check filename extension
                if self.adjust_extension:
                    adj_ext = self.check_extension(file, pathfmt.extension)
                    if adj_ext:
                        pathfmt.set_extension(adj_ext)

            break

        self.downloading = False
        if self.mtime:
            pathfmt.kwdict.setdefault(
                "_mtime", response.headers.get("Last-Modified"))
        else:
            pathfmt.kwdict["_mtime"] = None

        return True

    def receive(self, response, file):
        for data in response.iter_content(self.chunk_size):
            file.write(data)

    def _receive_rate(self, response, file):
        t1 = time.time()
        rt = self.rate

        for data in response.iter_content(self.chunk_size):
            file.write(data)

            t2 = time.time()           # current time
            actual = t2 - t1           # actual elapsed time
            expected = len(data) / rt  # expected elapsed time
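            # e.g. a full 16384-byte chunk with a 1048576 byte/s limit
            # gives expected = 16384 / 1048576 = 0.015625 seconds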

            if actual < expected:
                # sleep if less time elapsed than expected
                time.sleep(expected - actual)
                t1 = time.time()
            else:
                t1 = t2

    def get_extension(self, response):
        mtype = response.headers.get("Content-Type", "image/jpeg")
        mtype = mtype.partition(";")[0]

        if "/" not in mtype:
            mtype = "image/" + mtype

        if mtype in MIMETYPE_MAP:
            return MIMETYPE_MAP[mtype]

        exts = mimetypes.guess_all_extensions(mtype, strict=False)
        if exts:
            exts.sort()
            return exts[-1][1:]

        self.log.warning(
            "No filename extension found for MIME type '%s'", mtype)
        return "txt"

    @staticmethod
    def check_extension(file, extension):
        """Check filename extension against file header"""
        if extension in FILETYPE_CHECK:
            file.seek(0)
            header = file.read(8)
            if len(header) >= 8 and not FILETYPE_CHECK[extension](header):
                for ext, check in FILETYPE_CHECK.items():
                    if ext != extension and check(header):
                        return ext
        return None


FILETYPE_CHECK = {
    "jpg": lambda h: h[0:2] == b"\xff\xd8",
    "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
    "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97,
}
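# The checks above match well-known magic bytes: JPEG data starts with
# FF D8, PNG files with the fixed 8-byte signature, and GIF files with
# "GIF87a"/"GIF89a" (h[5] == 97 is the trailing "a").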


MIMETYPE_MAP = {
    "image/jpeg": "jpg",
    "image/jpg": "jpg",
    "image/png": "png",
    "image/gif": "gif",
    "image/bmp": "bmp",
    "image/x-bmp": "bmp",
    "image/x-ms-bmp": "bmp",
    "image/webp": "webp",
    "image/svg+xml": "svg",

    "image/vnd.adobe.photoshop": "psd",
    "image/x-photoshop": "psd",
    "application/x-photoshop": "psd",

    "video/webm": "webm",
    "video/ogg": "ogg",
    "video/mp4": "mp4",

    "audio/wav": "wav",
    "audio/x-wav": "wav",
    "audio/webm": "webm",
    "audio/ogg": "ogg",
    "audio/mpeg": "mp3",

    "application/zip": "zip",
    "application/x-zip": "zip",
    "application/x-zip-compressed": "zip",
    "application/rar": "rar",
    "application/x-rar": "rar",
    "application/x-rar-compressed": "rar",
    "application/x-7z-compressed": "7z",

    "application/ogg": "ogg",
    "application/octet-stream": "bin",
}


__downloader__ = HttpDownloader