2016-08-09 16:36:30 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2017-03-17 09:42:59 +01:00
|
|
|
# Copyright 2016-2017 Mike Fährmann
|
2016-08-09 16:36:30 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extract images from http://seiga.nicovideo.jp"""
|
|
|
|
|
|
|
|
from .common import Extractor, Message
|
2017-04-25 17:12:48 +02:00
|
|
|
from .. import text, exception
|
2016-08-09 16:36:30 +02:00
|
|
|
from ..cache import cache
|
|
|
|
|
2017-02-01 00:53:19 +01:00
|
|
|
|
2017-01-04 14:20:37 +01:00
|
|
|
class SeigaExtractor(Extractor):
    """Shared base for all seiga.nicovideo.jp extractors

    Subclasses override get_metadata() and get_images(); this class
    supplies the message generation, the image-URL lookup and the login.
    """
    category = "seiga"
    cookiedomain = ".nicovideo.jp"

    def items(self):
        """Log in, then yield version, directory and one URL per image"""
        self.login()
        metadata = self.get_metadata()

        yield Message.Version, 1
        yield Message.Directory, metadata

        for image in self.get_images():
            # NOTE: one and the same dict object is updated in place and
            # handed out again with every Url message
            metadata.update(image)
            metadata["extension"] = None
            yield Message.Url, self.get_image_url(image["image_id"]), metadata

    def get_metadata(self):
        """Return the job-wide metadata dict (empty in the base class)"""
        return {}

    def get_images(self):
        """Return an iterable of image-info dicts (empty in the base class)

        items() expects every entry to provide an 'image_id' key.
        """
        return []

    def get_image_url(self, image_id):
        """Return the download URL of the image identified by 'image_id'

        Sends a HEAD request to the image's source page and takes the
        'Location' header of the response, with its first "/o/" replaced
        by "/priv/". Raises exception.NotFoundError on a 404 response.
        """
        source = "http://seiga.nicovideo.jp/image/source/" + image_id
        head = self.session.head(source)
        if head.status_code != 404:
            return head.headers["Location"].replace("/o/", "/priv/", 1)
        raise exception.NotFoundError("image")

    def login(self):
        """Make sure the session carries a 'user_session' cookie"""
        if self._check_cookies(("user_session",)):
            # required cookie is already there; nothing to do
            return
        username, password = self._get_auth_info()
        self.session.cookies = self._login_impl(username, password)

    # maxage is 7 days expressed in seconds; keyarg=1 presumably makes
    # 'username' the cache key -- confirm against the cache module
    @cache(maxage=7*24*60*60, keyarg=1)
    def _login_impl(self, username, password):
        """Send the login request and return the session's cookies

        Raises exception.AuthenticationError if no 'user_session' cookie
        is present after the request.
        """
        self.log.info("Logging in as %s", username)
        self.request(
            "https://account.nicovideo.jp/api/v1/login",
            method="POST",
            data={"mail_tel": username, "password": password},
        )
        if "user_session" not in self.session.cookies:
            raise exception.AuthenticationError()
        # drop the 'nicosid' cookie before the jar is returned
        del self.session.cookies["nicosid"]
        return self.session.cookies
|
2017-01-04 14:20:37 +01:00
|
|
|
|
|
|
|
|
|
|
|
class SeigaUserExtractor(SeigaExtractor):
    """Extractor for all illustrations of a seiga.nicovideo.jp user"""
    subcategory = "user"
    directory_fmt = ["{category}", "{user_id}"]
    filename_fmt = "{category}_{user_id}_{image_id}.{extension}"
    pattern = [(r"(?:https?://)?(?:www\.|seiga\.)?nicovideo\.jp/"
                r"user/illust/(\d+)")]
    test = [
        ("http://seiga.nicovideo.jp/user/illust/39537793", {
            "pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
            "count": 2,
        }),
        ("http://seiga.nicovideo.jp/user/illust/79433", {
            "url": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
            "count": 0,
        }),
    ]

    def __init__(self, match):
        """Store the user id captured by the first group of 'pattern'"""
        SeigaExtractor.__init__(self)
        self.user_id = match.group(1)

    def get_metadata(self):
        """Return the metadata dict, which holds only the user id"""
        return {"user_id": self.user_id}

    def get_images(self):
        """Yield one info dict per illustration, page after page

        Requests the user's illustration listing starting at page 1 and
        stops after the first page with fewer than 40 entries.
        """
        url = "http://seiga.nicovideo.jp/user/illust/" + self.user_id
        params = {"target": "illust_all", "page": 1}

        # (key, begin-marker, end-marker) triples passed to extract_all()
        fields = (
            ("image_id", '/seiga/im', '"'),
            ("title"   , '<li class="title">', '</li>'),
            ("views"   , '</span>', '</li>'),
            ("comments", '</span>', '</li>'),
            ("clips"   , '</span>', '</li>'),
        )

        while True:
            page = self.request(url, params=params).text
            entries = text.extract_iter(
                page, '<li class="list_item', '</a></li> ')

            found = 0
            for found, info in enumerate(entries, 1):
                yield text.extract_all(info, fields)[0]

            if found < 40:
                # fewer than 40 entries on this page: treat it as the last
                return
            params["page"] += 1
|
2017-01-04 14:20:37 +01:00
|
|
|
|
|
|
|
|
|
|
|
class SeigaImageExtractor(SeigaExtractor):
    """Extractor for one single image from seiga.nicovideo.jp"""
    subcategory = "image"
    filename_fmt = "{category}_{image_id}.{extension}"
    pattern = [(r"(?:https?://)?(?:www\.|seiga\.)?nicovideo\.jp/"
                r"(?:seiga/im|image/source/)(\d+)"),
               (r"(?:https?://)?lohas\.nicoseiga\.jp/"
                r"(?:priv|o)/[^/]+/\d+/(\d+)")]
    test = [
        ("http://seiga.nicovideo.jp/seiga/im5977527", {
            "keyword": "6ff7564b35890e333ff7413cb633ddb58339912f",
            "content": "d9202292012178374d57fb0126f6124387265297",
        }),
        ("http://seiga.nicovideo.jp/seiga/im123", {
            "exception": exception.NotFoundError,
        }),
    ]

    def __init__(self, match):
        """Store the image id captured by the first group of 'pattern'"""
        SeigaExtractor.__init__(self)
        self.image_id = match.group(1)

    def get_images(self):
        """Return a 1-tuple containing the info dict of the single image"""
        image = {"image_id": self.image_id}
        return (image,)
|