[downloader:http] improve 'adjust-extensions' (#776)
Check file headers against a list of file signatures before downloading the whole file and writing it to disk. The file signature check needs some improvements (*), but it produces usable results for the most part. (*) - 'webp', 'wav', and others start with 'RFFI' - 'svg' uses the same "signature" as all XML documents - 'webm' has the same signature as 'mkv' files - only 'mp3' files in an ID3v2 container get recognized
This commit is contained in:
parent
46323ae6ff
commit
536c088462
@ -44,12 +44,14 @@ class HttpDownloader(DownloaderBase):
|
||||
if self.minsize:
|
||||
minsize = text.parse_bytes(self.minsize)
|
||||
if not minsize:
|
||||
self.log.warning("Invalid minimum filesize (%r)", self.minsize)
|
||||
self.log.warning(
|
||||
"Invalid minimum file size (%r)", self.minsize)
|
||||
self.minsize = minsize
|
||||
if self.maxsize:
|
||||
maxsize = text.parse_bytes(self.maxsize)
|
||||
if not maxsize:
|
||||
self.log.warning("Invalid maximum filesize (%r)", self.maxsize)
|
||||
self.log.warning(
|
||||
"Invalid maximum file size (%r)", self.maxsize)
|
||||
self.maxsize = maxsize
|
||||
if self.rate:
|
||||
rate = text.parse_bytes(self.rate)
|
||||
@ -84,17 +86,20 @@ class HttpDownloader(DownloaderBase):
|
||||
if tries:
|
||||
if response:
|
||||
response.close()
|
||||
response = None
|
||||
self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
|
||||
if tries > self.retries:
|
||||
return False
|
||||
time.sleep(tries)
|
||||
tries += 1
|
||||
|
||||
tries += 1
|
||||
headers = {}
|
||||
file_header = None
|
||||
|
||||
# check for .part file
|
||||
filesize = pathfmt.part_size()
|
||||
if filesize:
|
||||
headers["Range"] = "bytes={}-".format(filesize)
|
||||
file_size = pathfmt.part_size()
|
||||
if file_size:
|
||||
headers["Range"] = "bytes={}-".format(file_size)
|
||||
# file-specific headers
|
||||
extra = pathfmt.kwdict.get("_http_headers")
|
||||
if extra:
|
||||
@ -118,9 +123,9 @@ class HttpDownloader(DownloaderBase):
|
||||
offset = 0
|
||||
size = response.headers.get("Content-Length")
|
||||
elif code == 206: # Partial Content
|
||||
offset = filesize
|
||||
offset = file_size
|
||||
size = response.headers["Content-Range"].rpartition("/")[2]
|
||||
elif code == 416 and filesize: # Requested Range Not Satisfiable
|
||||
elif code == 416 and file_size: # Requested Range Not Satisfiable
|
||||
break
|
||||
else:
|
||||
msg = "'{} {}' for '{}'".format(code, response.reason, url)
|
||||
@ -129,7 +134,14 @@ class HttpDownloader(DownloaderBase):
|
||||
self.log.warning(msg)
|
||||
return False
|
||||
|
||||
# check filesize
|
||||
# set missing filename extension from MIME type
|
||||
if not pathfmt.extension:
|
||||
pathfmt.set_extension(self._find_extension(response))
|
||||
if pathfmt.exists():
|
||||
pathfmt.temppath = ""
|
||||
return True
|
||||
|
||||
# check file size
|
||||
size = text.parse_int(size, None)
|
||||
if size is not None:
|
||||
if self.minsize and size < self.minsize:
|
||||
@ -143,50 +155,55 @@ class HttpDownloader(DownloaderBase):
|
||||
size, self.maxsize)
|
||||
return False
|
||||
|
||||
# set missing filename extension
|
||||
if not pathfmt.extension:
|
||||
pathfmt.set_extension(self.get_extension(response))
|
||||
if pathfmt.exists():
|
||||
# check filename extension against file header
|
||||
if self.adjust_extension and not offset and \
|
||||
pathfmt.extension in FILE_SIGNATURES:
|
||||
try:
|
||||
file_header = next(response.iter_content(16), b"")
|
||||
except (RequestException, SSLError, OpenSSLError) as exc:
|
||||
msg = str(exc)
|
||||
print()
|
||||
continue
|
||||
if self._adjust_extension(pathfmt, file_header) and \
|
||||
pathfmt.exists():
|
||||
pathfmt.temppath = ""
|
||||
return True
|
||||
|
||||
# set open mode
|
||||
if not offset:
|
||||
mode = "w+b"
|
||||
if filesize:
|
||||
if file_size:
|
||||
self.log.debug("Unable to resume partial download")
|
||||
else:
|
||||
mode = "r+b"
|
||||
self.log.debug("Resuming download at byte %d", offset)
|
||||
|
||||
# start downloading
|
||||
self.out.start(pathfmt.path)
|
||||
# download content
|
||||
self.downloading = True
|
||||
with pathfmt.open(mode) as file:
|
||||
if offset:
|
||||
file.seek(offset)
|
||||
with pathfmt.open(mode) as fp:
|
||||
if file_header:
|
||||
fp.write(file_header)
|
||||
elif offset:
|
||||
if self.adjust_extension and \
|
||||
pathfmt.extension in FILE_SIGNATURES:
|
||||
self._adjust_extension(pathfmt, fp.read(16))
|
||||
fp.seek(offset)
|
||||
|
||||
# download content
|
||||
self.out.start(pathfmt.path)
|
||||
try:
|
||||
self.receive(response, file)
|
||||
self.receive(fp, response.iter_content(self.chunk_size))
|
||||
except (RequestException, SSLError, OpenSSLError) as exc:
|
||||
msg = str(exc)
|
||||
print()
|
||||
continue
|
||||
|
||||
# check filesize
|
||||
if size and file.tell() < size:
|
||||
msg = "filesize mismatch ({} < {})".format(
|
||||
file.tell(), size)
|
||||
# check file size
|
||||
if size and fp.tell() < size:
|
||||
msg = "file size mismatch ({} < {})".format(
|
||||
fp.tell(), size)
|
||||
print()
|
||||
continue
|
||||
|
||||
# check filename extension
|
||||
if self.adjust_extension:
|
||||
adj_ext = self.check_extension(file, pathfmt.extension)
|
||||
if adj_ext:
|
||||
pathfmt.set_extension(adj_ext)
|
||||
|
||||
break
|
||||
|
||||
self.downloading = False
|
||||
@ -198,16 +215,18 @@ class HttpDownloader(DownloaderBase):
|
||||
|
||||
return True
|
||||
|
||||
def receive(self, response, file):
|
||||
for data in response.iter_content(self.chunk_size):
|
||||
file.write(data)
|
||||
@staticmethod
|
||||
def receive(fp, content):
|
||||
write = fp.write
|
||||
for data in content:
|
||||
write(data)
|
||||
|
||||
def _receive_rate(self, response, file):
|
||||
def _receive_rate(self, fp, content):
|
||||
t1 = time.time()
|
||||
rt = self.rate
|
||||
|
||||
for data in response.iter_content(self.chunk_size):
|
||||
file.write(data)
|
||||
for data in content:
|
||||
fp.write(data)
|
||||
|
||||
t2 = time.time() # current time
|
||||
actual = t2 - t1 # actual elapsed time
|
||||
@ -220,81 +239,91 @@ class HttpDownloader(DownloaderBase):
|
||||
else:
|
||||
t1 = t2
|
||||
|
||||
def get_extension(self, response):
|
||||
def _find_extension(self, response):
|
||||
"""Get filename extension from MIME type"""
|
||||
mtype = response.headers.get("Content-Type", "image/jpeg")
|
||||
mtype = mtype.partition(";")[0]
|
||||
|
||||
if "/" not in mtype:
|
||||
mtype = "image/" + mtype
|
||||
|
||||
if mtype in MIMETYPE_MAP:
|
||||
return MIMETYPE_MAP[mtype]
|
||||
if mtype in MIME_TYPES:
|
||||
return MIME_TYPES[mtype]
|
||||
|
||||
exts = mimetypes.guess_all_extensions(mtype, strict=False)
|
||||
if exts:
|
||||
exts.sort()
|
||||
return exts[-1][1:]
|
||||
|
||||
self.log.warning(
|
||||
"No filename extension found for MIME type '%s'", mtype)
|
||||
return "txt"
|
||||
ext = mimetypes.guess_extension(mtype, strict=False)
|
||||
if ext:
|
||||
return ext[1:]
|
||||
self.log.warning("Unknown MIME type '%s'", mtype)
|
||||
return "bin"
|
||||
|
||||
@staticmethod
|
||||
def check_extension(file, extension):
|
||||
"""Check filename extension against fileheader"""
|
||||
if extension in FILETYPE_CHECK:
|
||||
file.seek(0)
|
||||
header = file.read(8)
|
||||
if len(header) >= 8 and not FILETYPE_CHECK[extension](header):
|
||||
for ext, check in FILETYPE_CHECK.items():
|
||||
if ext != extension and check(header):
|
||||
return ext
|
||||
return None
|
||||
def _adjust_extension(pathfmt, file_header):
|
||||
"""Check filename extension against file header"""
|
||||
sig = FILE_SIGNATURES[pathfmt.extension]
|
||||
if not file_header.startswith(sig):
|
||||
for ext, sig in FILE_SIGNATURES.items():
|
||||
if file_header.startswith(sig):
|
||||
pathfmt.set_extension(ext)
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
FILETYPE_CHECK = {
|
||||
"jpg": lambda h: h[0:2] == b"\xff\xd8",
|
||||
"png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
|
||||
"gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97,
|
||||
}
|
||||
|
||||
|
||||
MIMETYPE_MAP = {
|
||||
"image/jpeg": "jpg",
|
||||
"image/jpg": "jpg",
|
||||
"image/png": "png",
|
||||
"image/gif": "gif",
|
||||
"image/bmp": "bmp",
|
||||
"image/x-bmp": "bmp",
|
||||
MIME_TYPES = {
|
||||
"image/jpeg" : "jpg",
|
||||
"image/jpg" : "jpg",
|
||||
"image/png" : "png",
|
||||
"image/gif" : "gif",
|
||||
"image/bmp" : "bmp",
|
||||
"image/x-bmp" : "bmp",
|
||||
"image/x-ms-bmp": "bmp",
|
||||
"image/webp": "webp",
|
||||
"image/svg+xml": "svg",
|
||||
"image/webp" : "webp",
|
||||
"image/svg+xml" : "svg",
|
||||
|
||||
"image/x-photoshop" : "psd",
|
||||
"application/x-photoshop" : "psd",
|
||||
"image/vnd.adobe.photoshop": "psd",
|
||||
"image/x-photoshop": "psd",
|
||||
"application/x-photoshop": "psd",
|
||||
|
||||
"video/webm": "webm",
|
||||
"video/ogg": "ogg",
|
||||
"video/mp4": "mp4",
|
||||
"video/ogg" : "ogg",
|
||||
"video/mp4" : "mp4",
|
||||
|
||||
"audio/wav": "wav",
|
||||
"audio/wav" : "wav",
|
||||
"audio/x-wav": "wav",
|
||||
"audio/webm": "webm",
|
||||
"audio/ogg": "ogg",
|
||||
"audio/mpeg": "mp3",
|
||||
"audio/webm" : "webm",
|
||||
"audio/ogg" : "ogg",
|
||||
"audio/mpeg" : "mp3",
|
||||
|
||||
"application/zip": "zip",
|
||||
"application/zip" : "zip",
|
||||
"application/x-zip": "zip",
|
||||
"application/x-zip-compressed": "zip",
|
||||
"application/rar": "rar",
|
||||
"application/rar" : "rar",
|
||||
"application/x-rar": "rar",
|
||||
"application/x-rar-compressed": "rar",
|
||||
"application/x-7z-compressed": "7z",
|
||||
"application/x-7z-compressed" : "7z",
|
||||
|
||||
"application/ogg": "ogg",
|
||||
"application/octet-stream": "bin",
|
||||
}
|
||||
|
||||
# taken from https://en.wikipedia.org/wiki/List_of_file_signatures
|
||||
FILE_SIGNATURES = {
|
||||
"jpg" : b"\xFF\xD8\xFF",
|
||||
"png" : b"\x89PNG\r\n\x1A\n",
|
||||
"gif" : b"GIF8",
|
||||
"bmp" : b"\x42\x4D",
|
||||
"webp": b"RIFF",
|
||||
"svg" : b"<?xml",
|
||||
"psd" : b"8BPS",
|
||||
"webm": b"\x1A\x45\xDF\xA3",
|
||||
"ogg" : b"OggS",
|
||||
"wav" : b"RIFF",
|
||||
"mp3" : b"ID3",
|
||||
"zip" : b"\x50\x4B",
|
||||
"rar" : b"\x52\x61\x72\x21\x1A\x07",
|
||||
"7z" : b"\x37\x7A\xBC\xAF\x27\x1C",
|
||||
# check 'bin' files against all other file signatures
|
||||
"bin" : b"\x00\x00\x00\x00",
|
||||
}
|
||||
|
||||
|
||||
__downloader__ = HttpDownloader
|
||||
|
Loading…
x
Reference in New Issue
Block a user