Mike Fährmann d3b04076f7
add .netrc support (#22)
Use the '--netrc' cmdline option or set the 'netrc' config option
to 'true' to enable the use of .netrc authentication data.

The 'machine' names for the .netrc info are the lowercase extractor
names (or categories): batoto, exhentai, nijie, pixiv, seiga.
2017-06-24 12:17:26 +02:00

321 lines
11 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2014-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images and ugoira from https://www.pixiv.net/"""
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
import re
class PixivExtractor(Extractor):
"""Base class for pixiv extractors"""
category = "pixiv"
directory_fmt = ["{category}", "{user[id]} {user[account]}"]
filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}"
illust_url = "https://www.pixiv.net/member_illust.php?mode=medium"
def __init__(self):
Extractor.__init__(self)
self.api = PixivAPI(self)
self.user_id = -1
self.load_ugoira = self.config("ugoira", True)
def items(self):
metadata = self.get_metadata()
yield Message.Version, 1
yield Message.Headers, self.session.headers
yield Message.Cookies, self.session.cookies
yield Message.Directory, metadata
for work in self.works():
work = self.prepare_work(work)
pos = work["extension"].rfind("?", -18)
if pos != -1:
timestamp = work["extension"][pos:]
work["extension"] = work["extension"][:pos]
else:
timestamp = ""
if work["type"] == "ugoira":
if not self.load_ugoira:
continue
url, framelist = self.parse_ugoira(work)
work["extension"] = "zip"
yield Message.Url, url, work
work["extension"] = "txt"
yield Message.Url, "text:"+framelist, work
elif work["page_count"] == 1:
yield Message.Url, work["url"], work
else:
url = work["url"]
ext = work["extension"]
off = url.rfind(".")
if url[off-2] == "p":
off -= 3
if work["id"] > 11319935 and "/img-original/" not in url:
big = "_big"
else:
big = ""
for i in range(work["page_count"]):
work["num"] = "_p{:02}".format(i)
url = "{}{}_p{}.{}{}".format(
url[:off], big, i, ext, timestamp
)
yield Message.Url, url, work
def works(self):
"""Return all work-items for a pixiv-member"""
return []
def prepare_work(self, work):
"""Prepare a work-dictionary with additional keywords"""
url = work["image_urls"]["large"]
work["num"] = ""
work["url"] = url
work["extension"] = url[url.rfind(".")+1:]
return work
def parse_ugoira(self, data):
"""Parse ugoira data"""
# get illust page
page = self.request(
self.illust_url, params={"illust_id": data["id"]},
).text
# parse page
frames = text.extract(page, ',"frames":[', ']')[0]
# build url
url = re.sub(
r"/img-original/(.+/\d+)[^/]+",
r"/img-zip-ugoira/\g<1>_ugoira1920x1080.zip",
data["url"]
)
# build framelist
framelist = re.sub(
r'\{"file":"([^"]+)","delay":(\d+)\},?',
r'\1 \2\n', frames
)
return url, framelist
def get_metadata(self, user=None):
"""Collect metadata for extractor-job"""
if not user:
user = self.api.user(self.user_id)[0]
return {"user": user}
class PixivUserExtractor(PixivExtractor):
"""Extractor for works of a pixiv-user"""
subcategory = "user"
pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/"
r"member(?:_illust)?\.php\?id=(\d+)(?:.*&tag=([^&#]+))?"]
test = [
("http://www.pixiv.net/member_illust.php?id=173530", {
"url": "852c31ad83b6840bacbce824d85f2a997889efb7",
}),
("https://www.pixiv.net/member_illust.php?id=173530&tag=HITMAN", {
"url": "3ecb4970dd91ce1de0a9449671b42db5e3fe2b08",
}),
("http://www.pixiv.net/member_illust.php?id=173531", {
"exception": exception.NotFoundError,
}),
]
def __init__(self, match):
PixivExtractor.__init__(self)
self.user_id, tag = match.groups()
self.tag = tag.lower() if tag else None
def works(self):
for work in self.api.user_works(self.user_id):
if (not self.tag or
self.tag in [tag.lower() for tag in work["tags"]]):
yield work
class PixivWorkExtractor(PixivExtractor):
"""Extractor for a single pixiv work/illustration"""
subcategory = "work"
pattern = [(r"(?:https?://)?(?:www\.)?pixiv\.net/member(?:_illust)?\.php"
r"\?(?:[^&]+&)*illust_id=(\d+)"),
(r"(?:https?://)?i(?:\d+\.pixiv|\.pximg)\.net(?:/.*)?/img-[^/]+"
r"/img/\d{4}(?:/\d\d){5}/(\d+)"),
(r"(?:https?://)?img\d+\.pixiv\.net/img/[^/]+/(\d+)")]
test = [
(("http://www.pixiv.net/member_illust.php"
"?mode=medium&illust_id=966412"), {
"url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
"content": "69a8edfb717400d1c2e146ab2b30d2c235440c5a",
}),
(("http://www.pixiv.net/member_illust.php"
"?mode=medium&illust_id=966411"), {
"exception": exception.NotFoundError,
}),
(("http://i1.pixiv.net/c/600x600/img-master/"
"img/2008/06/13/00/29/13/966412_p0_master1200.jpg"), {
"url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
}),
(("https://i.pximg.net/img-original/"
"img/2017/04/25/07/33/29/62568267_p0.png"), {
"url": "71b8bbd070d6b03a75ca4afb89f64d1445b2278d",
}),
]
def __init__(self, match):
PixivExtractor.__init__(self)
self.illust_id = match.group(1)
self.load_ugoira = True
self.work = None
def works(self):
return (self.work,)
def get_metadata(self, user=None):
self.work = self.api.work(self.illust_id)[0]
return PixivExtractor.get_metadata(self, self.work["user"])
class PixivFavoriteExtractor(PixivExtractor):
"""Extractor for all favorites/bookmarks of a pixiv-user"""
subcategory = "favorite"
directory_fmt = ["{category}", "bookmarks", "{user[id]} {user[account]}"]
pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/bookmark\.php\?id=(\d+)"]
test = [("http://www.pixiv.net/bookmark.php?id=173530", {
"url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
})]
def __init__(self, match):
PixivExtractor.__init__(self)
self.user_id = match.group(1)
def works(self):
return self.api.user_favorite_works(self.user_id)
def prepare_work(self, work):
return PixivExtractor.prepare_work(self, work["work"])
class PixivBookmarkExtractor(PixivFavoriteExtractor):
"""Extractor for all favorites/bookmarks of your own account"""
subcategory = "bookmark"
pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/bookmark\.php()$"]
test = []
def get_metadata(self, user=None):
self.api.login()
user = self.api.user_info
self.user_id = user["id"]
return PixivExtractor.get_metadata(self, user)
class PixivAPI():
"""Minimal interface for the Pixiv Public-API for mobile devices
For a better and more complete implementation, see
- https://github.com/upbit/pixivpy
For in-depth information regarding the Pixiv Public-API, see
- http://blog.imaou.com/opensource/2014/10/09/pixiv_api_for_ios_update.html
"""
def __init__(self, extractor):
self.session = extractor.session
self.log = extractor.log
self.username, self.password = extractor.auth_info()
self.user_info = None
self.session.headers.update({
"Referer": "https://www.pixiv.net/",
'App-OS': 'ios',
'App-OS-Version': '10.3.1',
'App-Version': '6.7.1',
'User-Agent': 'PixivIOSApp/6.7.1 (iOS 10.3.1; iPhone8,1)',
})
def user(self, user_id):
"""Query information about a pixiv user"""
endpoint = "users/" + user_id
return self._call(endpoint, {})["response"]
def work(self, illust_id):
"""Query information about a single pixiv work/illustration"""
endpoint = "works/" + illust_id
params = {"image_sizes": "large"}
return self._call(endpoint, params)["response"]
def user_works(self, user_id):
"""Query information about the works of a pixiv user"""
endpoint = "users/{user}/works".format(user=user_id)
params = {"image_sizes": "large"}
return self._pagination(endpoint, params)
def user_favorite_works(self, user_id):
"""Query information about the favorite works of a pixiv user"""
endpoint = "users/{user}/favorite_works".format(user=user_id)
params = {"image_sizes": "large", "include_stats": False}
return self._pagination(endpoint, params)
def login(self):
"""Login and gain a Pixiv Public-API access token"""
self.user_info, access_token = self._login_impl(
self.username, self.password)
self.session.headers["Authorization"] = access_token
@cache(maxage=50*60, keyarg=1)
def _login_impl(self, username, password):
"""Actual login implementation"""
self.log.info("Logging in as %s", username)
data = {
"username": username,
"password": password,
"grant_type": "password",
"client_id": "bYGKuGVw91e0NMfPGp44euvGt59s",
"client_secret": "HP3RmkgAmEGro0gn1x9ioawQE8WMfvLXDz3ZqxpK",
"get_secure_url": 1,
}
response = self.session.post(
"https://oauth.secure.pixiv.net/auth/token", data=data
)
if response.status_code != 200:
raise exception.AuthenticationError()
try:
response = response.json()["response"]
token = response["access_token"]
user = response["user"]
except KeyError:
raise Exception("Get token error! Response: %s" % (response))
return user, "Bearer " + token
def _call(self, endpoint, params, _empty=[None]):
url = "https://public-api.secure.pixiv.net/v1/" + endpoint + ".json"
self.login()
data = self.session.get(url, params=params).json()
status = data.get("status")
response = data.get("response", _empty)
if status == "failure" or response == _empty:
raise exception.NotFoundError()
return data
def _pagination(self, endpoint, params):
while True:
data = self._call(endpoint, params)
yield from data["response"]
pinfo = data["pagination"]
if pinfo["current"] == pinfo["pages"]:
return
params["page"] = pinfo["next"]