129 lines
3.9 KiB
Python
129 lines
3.9 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2014-2018 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Downloader module for http:// and https:// URLs"""
|
|
|
|
import time
|
|
import mimetypes
|
|
from requests.exceptions import ConnectionError, Timeout
|
|
from .common import DownloaderBase
|
|
from .. import text, exception
|
|
|
|
|
|
class HttpDownloader(DownloaderBase):
    """Downloader for files served over http:// and https://.

    The downloader keeps the most recent response object in
    'self.response'; connect() creates it, receive() and get_extension()
    read from it, and reset() closes and discards it.
    """
    scheme = "http"

    def __init__(self, extractor, output):
        """Read the downloader options and set up rate limiting.

        'retries', 'timeout' and 'verify' fall back to the corresponding
        private attributes of 'extractor' when not configured.
        """
        DownloaderBase.__init__(self, extractor, output)
        self.response = None
        self.retries = self.config("retries", extractor._retries)
        self.timeout = self.config("timeout", extractor._timeout)
        self.verify = self.config("verify", extractor._verify)
        self.rate = self.config("rate")
        self.chunk_size = 16384

        if self.rate:
            # NOTE(review): text.parse_bytes() is assumed to return a falsy
            # value for input it cannot parse -- confirm against 'text'
            self.rate = text.parse_bytes(self.rate)
            if not self.rate:
                self.log.warning("Invalid rate limit specified")
            elif self.rate < self.chunk_size:
                # never read more per chunk than is allowed per second
                self.chunk_size = self.rate

    def connect(self, url, offset):
        """Send a streaming GET request for 'url' and evaluate the status.

        A "Range" header requesting everything from byte 'offset' onwards
        is sent when 'offset' is non-zero.

        Returns a tuple (offset, size):
          offset -- 0 if the server answered with the complete file (200),
                    otherwise the value that was passed in
          size   -- result of text.parse_int() applied to the size reported
                    by the server, or to None if no size is available

        Raises:
          exception.DownloadRetry    -- connection error, timeout,
                                        status 429 or 5xx
          exception.DownloadComplete -- status 416
          any exception raised by Response.raise_for_status() for other
          error status codes
        """
        headers = {}
        if offset:
            headers["Range"] = "bytes={}-".format(offset)

        try:
            self.response = self.session.request(
                "GET", url, stream=True, headers=headers, allow_redirects=True,
                timeout=self.timeout, verify=self.verify)
        except (ConnectionError, Timeout) as exc:
            raise exception.DownloadRetry(exc)

        # Stays None whenever the response carries no usable size, so the
        # final return can never hit an unbound local (e.g. 204 or 304,
        # for which raise_for_status() does not raise).
        size = None

        code = self.response.status_code
        if code == 200:  # OK
            # the server ignored the Range header and sends the whole file
            offset = 0
            size = self.response.headers.get("Content-Length")
        elif code == 206:  # Partial Content
            # "Content-Range: bytes <first>-<last>/<total>"
            content_range = self.response.headers.get("Content-Range")
            if content_range:
                size = content_range.rpartition("/")[2]
        elif code == 416:  # Requested Range Not Satisfiable
            raise exception.DownloadComplete()
        elif code == 429 or 500 <= code < 600:  # Server Error
            raise exception.DownloadRetry(
                "{} Server Error: {} for url: {}".format(
                    code, self.response.reason, url))
        else:
            self.response.raise_for_status()

        return offset, text.parse_int(size)

    def receive(self, file):
        """Write the body of the current response to 'file'.

        The body is read in chunks of 'self.chunk_size' bytes. If a rate
        limit is set, the loop sleeps after each chunk for as long as the
        average rate since the start of this call would otherwise exceed
        'self.rate' bytes per second.
        """
        if self.rate:
            total = 0            # total amount of bytes received
            start = time.time()  # start time

        for data in self.response.iter_content(self.chunk_size):
            file.write(data)

            if self.rate:
                total += len(data)
                expected = total / self.rate  # expected elapsed time
                delta = time.time() - start   # actual elapsed time
                if delta < expected:
                    # sleep if less time passed than expected
                    time.sleep(expected - delta)

    def reset(self):
        """Close and discard the current response, if there is one."""
        # Compare against None explicitly: a requests Response object is
        # falsy for status codes >= 400, and those responses hold an open
        # streaming connection that has to be closed as well.
        if self.response is not None:
            self.response.close()
            self.response = None

    def get_extension(self):
        """Return a filename extension for the current response.

        The extension is derived from the "Content-Type" header, with
        "image/jpeg" assumed when the header is missing. MIMETYPE_MAP is
        consulted first, then the 'mimetypes' module; "txt" is returned
        (and a warning logged) if neither knows the type.
        """
        mtype = self.response.headers.get("Content-Type", "image/jpeg")
        # Drop parameters such as "; charset=utf-8" and normalize the rest:
        # media types are case-insensitive and may be padded with spaces.
        mtype = mtype.partition(";")[0].strip().lower()

        if mtype in MIMETYPE_MAP:
            return MIMETYPE_MAP[mtype]

        exts = mimetypes.guess_all_extensions(mtype, strict=False)
        if exts:
            # sort to get a deterministic pick; [1:] removes the leading dot
            exts.sort()
            return exts[-1][1:]

        self.log.warning(
            "No filename extension found for MIME type '%s'", mtype)
        return "txt"
|
|
|
|
|
|
# Preferred filename extension (without leading dot) for common MIME
# types. HttpDownloader.get_extension() looks a type up here before it
# falls back to the 'mimetypes' module.
MIMETYPE_MAP = {
    # images
    "image/jpeg": "jpg",
    "image/jpg": "jpg",
    "image/png": "png",
    "image/gif": "gif",
    "image/bmp": "bmp",
    "image/webp": "webp",
    "image/svg+xml": "svg",

    # video
    "video/webm": "webm",
    "video/ogg": "ogg",
    "video/mp4": "mp4",

    # audio
    "audio/wav": "wav",
    "audio/x-wav": "wav",
    "audio/webm": "webm",
    "audio/ogg": "ogg",
    "audio/mpeg": "mp3",

    # generic containers / binary data
    "application/ogg": "ogg",
    "application/octet-stream": "bin",
}
|
|
|
|
|
|
# Module-level alias for the downloader class defined in this module
# (presumably looked up by the package's downloader loader -- confirm).
__downloader__ = HttpDownloader
|