2015-10-11 16:22:38 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2019-02-08 13:45:40 +01:00
|
|
|
# Copyright 2015-2019 Mike Fährmann
|
2015-10-11 16:22:38 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2017-05-26 22:30:09 +02:00
|
|
|
"""Extract images from https://imgur.com/"""
|
2015-10-11 16:22:38 +02:00
|
|
|
|
|
|
|
from .common import Extractor, Message
|
2016-10-07 00:13:51 +02:00
|
|
|
from .. import text, exception
|
2019-10-15 21:14:14 +02:00
|
|
|
from ..cache import cache
|
2019-09-19 15:54:26 +02:00
|
|
|
import itertools
|
2017-05-26 22:30:09 +02:00
|
|
|
import json
|
2015-10-11 16:22:38 +02:00
|
|
|
|
2017-02-01 00:53:19 +01:00
|
|
|
|
2019-09-19 15:54:26 +02:00
|
|
|
# Common URL prefix shared by all extractor patterns in this module:
# an optional http(s) scheme followed by imgur.com, optionally preceded
# by one of the "www.", "i." or "m." subdomains.
BASE_PATTERN = r"(?:https?://)?(?:www\.|[im]\.)?imgur\.com"
|
|
|
|
|
|
|
|
|
2017-05-26 22:30:09 +02:00
|
|
|
class ImgurExtractor(Extractor):
    """Base class for imgur extractors

    Provides login handling, extraction of the JSON metadata embedded
    in imgur HTML pages, download-URL construction, and pagination over
    API v3 listing endpoints.
    """
    category = "imgur"
    root = "https://imgur.com"
    api_root = "https://api.imgur.com"

    def __init__(self, match):
        """Store the first regex group as 'key' and read the 'mp4' option"""
        Extractor.__init__(self, match)
        self.key = match.group(1)
        # Truthy: use mp4 for GIFs flagged 'prefer_video';
        # the string "always": use mp4 for every GIF (see _prepare()).
        self.mp4 = self.config("mp4", True)

    def login(self):
        """Log in and install session cookies if a username is configured"""
        username, password = self._get_auth_info()
        if username:
            self._update_cookies(self._login_impl(username, password))

    # maxage is 180 days in seconds; presumably 'keyarg=1' makes the
    # cache key the username -- confirm against the cache module.
    @cache(maxage=180*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the sign-in form and return the session's cookies

        Raises exception.AuthenticationError (carrying the text of the
        page's 'class="error"' element, if any) when the response has
        an empty redirect history.
        """
        self.log.info("Logging in as %s", username)

        url = "{}/signin?invokedBy=Regular%20Sign%20In".format(self.root)
        headers = {"Referer": url}
        data = {
            "username": username,
            "password": password,
            "remember": "remember",
            "submit": "",
        }

        response = self.request(url, method="POST", headers=headers, data=data)
        # NOTE(review): an empty history is treated as a failed login;
        # presumably a successful sign-in answers with a redirect.
        if not response.history:
            error = text.extract(response.text, 'class="error">', '<')[0] or ""
            raise exception.AuthenticationError(error.strip())
        return self.session.cookies

    def _extract_data(self, path):
        """Return the JSON metadata embedded in the page at 'root + path'

        Logs an error and raises exception.StopExtraction when the
        metadata marker cannot be found in the page.
        """
        page = self.request(self.root + path, notfound=self.subcategory).text
        # NOTE(review): the marker has to match the page markup exactly,
        # including its internal whitespace -- confirm against a live page.
        data = text.extract(page, "image : ", ",\n")[0]

        if not data:
            if ">Sign in required<" in page:
                self.log.error("'Sign in required'")
            else:
                self.log.error("Unable to extract JSON data")
            raise exception.StopExtraction()
        data = json.loads(data)

        # Remove each ad-related key independently, so that a missing
        # "adConfig" does not prevent "isAd" from being removed as well.
        data.pop("adConfig", None)
        data.pop("isAd", None)
        return data

    def _prepare(self, image):
        """Normalize 'image' in place and return its download URL

        Strips any '?...' suffix from image["ext"], switches GIFs to
        '.mp4' according to the 'mp4' option, and sets
        image["extension"] to the extension without its leading dot.
        """
        image["ext"] = image["ext"].partition("?")[0]
        if image["ext"] == ".gif" and (
                (self.mp4 and image["prefer_video"]) or self.mp4 == "always"):
            image["ext"] = ".mp4"
        url = "https://i.imgur.com/" + image["hash"] + image["ext"]
        image["extension"] = image["ext"][1:]
        return url

    def _items_apiv3(self, urlfmt):
        """Yield queue messages for every item of an API v3 listing

        'urlfmt' is a format string with a single '{}' placeholder that
        receives the zero-based page number.
        """
        self.login()
        album_ex = ImgurAlbumExtractor
        image_ex = ImgurImageExtractor

        params = {
            "IMGURPLATFORM": "web",
            "album_previews": "0",
            "client_id": "546c25a59c58ad7",
        }
        headers = {
            "Origin": self.root,
            "Referer": self.root + "/",
        }

        yield Message.Version, 1

        for num in itertools.count(0):
            url = urlfmt.format(num)
            data = self.request(url, params=params, headers=headers).json()
            items = data["data"]

            for item in items:
                # albums and single images are handled by different extractors
                item["_extractor"] = album_ex if item["is_album"] else image_ex
                yield Message.Queue, item["link"], item

            # a page with fewer than 60 entries is taken to be the last one
            if len(items) < 60:
                return
|
|
|
|
|
2017-05-26 22:30:09 +02:00
|
|
|
|
|
|
|
class ImgurImageExtractor(ImgurExtractor):
    """Extractor for individual images on imgur.com"""
    subcategory = "image"
    filename_fmt = "{category}_{hash}{title:?_//}.{extension}"
    archive_fmt = "{hash}"
    # 5 or 7 character image key, optionally followed by a single
    # thumbnail-size letter and a dot (direct links); gallery URLs are
    # excluded by the negative lookahead.
    pattern = BASE_PATTERN + r"/(?!gallery)(\w{7}|\w{5})[sbtmlh]?\.?"
    test = (
        ("https://imgur.com/21yMxCS", {
            "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
            "keyword": {
                "animated": False,
                "datetime": "2016-11-10 14:24:35",
                "description": str,
                "ext": ".png",
                "extension": "png",
                "hash": "21yMxCS",
                "height": "32",
                "is_moderated": False,
                "is_safe": False,
                "is_viral": 0,
                "looping": False,
                "mimetype": "image/png",
                "name": None,
                "prefer_video": False,
                "size": 182,
                "source": "",
                "title": "Test",
                "video_host": None,
                "video_source": None,
                "width": "64",
            },
        }),
        ("http://imgur.com/0gybAXR", {  # gifv/mp4 video
            "url": "a2220eb265a55b0c95e0d3d721ec7665460e3fd7",
            "content": "a3c080e43f58f55243ab830569ba02309d59abfc",
        }),
        ("https://imgur.com/HjoXJAd", {  # url ends with '.jpg?1'
            "url": "73f361b50753ab25da64160aa50bc5d139480d45",
        }),
        ("https://imgur.com/zzzzzzz", {  # not found
            "exception": exception.NotFoundError,
        }),
        ("https://www.imgur.com/21yMxCS"),  # www
        ("https://m.imgur.com/21yMxCS"),  # mobile
        ("https://imgur.com/zxaY6"),  # 5 character key
        ("https://i.imgur.com/21yMxCS.png"),  # direct link
        ("https://i.imgur.com/21yMxCSh.png"),  # direct link thumbnail
        ("https://i.imgur.com/zxaY6.gif"),  # direct link (short)
        ("https://i.imgur.com/zxaY6s.gif"),  # direct link (short; thumb)
    )

    def items(self):
        """Yield the version, directory and URL messages for one image"""
        self.login()

        # the image page embeds the metadata used for naming and download
        metadata = self._extract_data("/" + self.key)
        file_url = self._prepare(metadata)

        yield Message.Version, 1
        yield Message.Directory, metadata
        yield Message.Url, file_url, metadata
|
|
|
|
|
|
|
|
|
|
|
|
class ImgurAlbumExtractor(ImgurExtractor):
    """Extractor for imgur albums"""
    subcategory = "album"
    directory_fmt = ("{category}", "{album[hash]}{album[title]:? - //}")
    filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}"
    archive_fmt = "{album[hash]}_{hash}"
    # album URLs live under /a/ or /t/unmuted/, with a 5 or 7 character key
    pattern = BASE_PATTERN + r"/(?:a|t/unmuted)/(\w{7}|\w{5})"
    test = (
        ("https://imgur.com/a/TcBmP", {
            "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563",
            "keyword": {
                "album": {
                    "album_cover": "693j2Kr",
                    "album_description": None,
                    "cover": "693j2Kr",
                    "datetime": "2015-10-09 10:37:50",
                    "description": None,
                    "hash": "TcBmP",
                    "id": "TcBmP",
                    "is_album": True,
                    "num_images": "19",
                    "title": "138",
                    "title_clean": "TcBmP",
                    "views": str,
                },
                "animated": bool,
                "datetime": str,
                "extension": str,
                "hash": str,
                "height": int,
                "num": int,
                "prefer_video": bool,
                "size": int,
                "title": str,
                "width": int,
            },
        }),
        ("https://imgur.com/a/eD9CT", {  # large album
            "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937",
        }),
        ("https://imgur.com/a/RhJXhVT/all", {  # 7 character album hash
            "url": "695ef0c950023362a0163ee5041796300db76674",
        }),
        ("https://imgur.com/t/unmuted/YMqBcua", {  # unmuted URL
            "url": "86b4747f8147cec7602f0214e267309af73a8655",
        }),
        ("https://imgur.com/a/TcBmQ", {
            "exception": exception.NotFoundError,
        }),
        ("https://www.imgur.com/a/TcBmP"),  # www
        ("https://m.imgur.com/a/TcBmP"),  # mobile
    )

    def items(self):
        """Yield a directory message and one URL message per album image"""
        self.login()
        album = self._extract_data("/a/" + self.key)

        # split the image list off from the album-level metadata
        files = album["album_images"]["images"]
        del album["album_images"]

        # the page lists fewer images than the album reports:
        # fetch the image list from the ajax endpoint instead
        if len(files) < int(album["num_images"]):
            ajax_url = "{}/ajaxalbums/getimages/{}/hit.json".format(
                self.root, self.key)
            files = self.request(ajax_url).json()["data"]["images"]

        yield Message.Version, 1
        yield Message.Directory, {"album": album, "count": len(files)}

        for index, entry in enumerate(files, 1):
            file_url = self._prepare(entry)
            entry["num"] = index
            entry["album"] = album
            yield Message.Url, file_url, entry
|
2019-08-14 21:20:58 +02:00
|
|
|
|
|
|
|
|
|
|
|
class ImgurGalleryExtractor(ImgurExtractor):
    """Extractor for imgur galleries"""
    subcategory = "gallery"
    pattern = BASE_PATTERN + r"/gallery/(\w{7}|\w{5})"
    test = (
        ("https://imgur.com/gallery/zf2fIms", {  # non-album gallery (#380)
            "pattern": "https://imgur.com/zf2fIms",
        }),
        ("https://imgur.com/gallery/eD9CT", {
            "pattern": "https://imgur.com/a/eD9CT",
        }),
    )

    def items(self):
        """Queue the gallery key as either an album or a single image"""
        self.login()

        # probe the album URL; a status code below 400 means the key
        # is handled as an album, anything else as a single image
        album_url = self.root + "/a/" + self.key
        with self.request(album_url, method="HEAD", fatal=False) as response:
            is_album = response.status_code < 400

        if is_album:
            target = album_url
            extractor = ImgurAlbumExtractor
        else:
            target = self.root + "/" + self.key
            extractor = ImgurImageExtractor

        yield Message.Version, 1
        yield Message.Queue, target, {"_extractor": extractor}
|
2019-09-17 22:58:18 +02:00
|
|
|
|
|
|
|
|
|
|
|
class ImgurUserExtractor(ImgurExtractor):
    """Extractor for all images posted by a user"""
    subcategory = "user"
    # user profile root, optionally with a /posts or /submitted suffix
    pattern = BASE_PATTERN + r"/user/([^/?&#]+)(?:/posts|/submitted)?/?$"
    test = (
        ("https://imgur.com/user/Miguenzo", {
            "range": "1-100",
            "count": 100,
            "pattern": r"https?://(i.imgur.com|imgur.com/a)/[\w.]+",
        }),
        ("https://imgur.com/user/Miguenzo/posts"),
        ("https://imgur.com/user/Miguenzo/submitted"),
    )

    def items(self):
        """Return the paginated queue messages for the user's submissions"""
        # the doubled braces leave one '{}' behind for the page number
        endpoint = "{}/3/account/{}/submissions/{{}}/newest"
        return self._items_apiv3(endpoint.format(self.api_root, self.key))
|
2019-09-17 22:58:18 +02:00
|
|
|
|
|
|
|
|
2019-09-19 15:54:26 +02:00
|
|
|
class ImgurFavoriteExtractor(ImgurExtractor):
    """Extractor for a user's favorites"""
    subcategory = "favorite"
    pattern = BASE_PATTERN + r"/user/([^/?&#]+)/favorites"
    test = ("https://imgur.com/user/Miguenzo/favorites", {
        "range": "1-100",
        "count": 100,
        "pattern": r"https?://(i.imgur.com|imgur.com/a)/[\w.]+",
    })

    def items(self):
        """Return the paginated queue messages for the user's favorites"""
        # the doubled braces leave one '{}' behind for the page number
        endpoint = "{}/3/account/{}/gallery_favorites/{{}}/newest"
        return self._items_apiv3(endpoint.format(self.api_root, self.key))
|