2016-08-09 16:36:30 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2019-01-30 16:18:22 +01:00
|
|
|
# Copyright 2016-2019 Mike Fährmann
|
2016-08-09 16:36:30 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2019-02-21 22:51:11 +01:00
|
|
|
"""Extract images from https://seiga.nicovideo.jp/"""
|
2016-08-09 16:36:30 +02:00
|
|
|
|
|
|
|
from .common import Extractor, Message
|
2017-11-14 21:33:17 +01:00
|
|
|
from .. import text, util, exception
|
2016-08-09 16:36:30 +02:00
|
|
|
from ..cache import cache
|
|
|
|
|
2017-02-01 00:53:19 +01:00
|
|
|
|
2017-01-04 14:20:37 +01:00
|
|
|
class SeigaExtractor(Extractor):
    """Base class for seiga extractors"""
    category = "seiga"
    archive_fmt = "{image_id}"
    cookiedomain = ".nicovideo.jp"
    # Base URL that request URLs are built from
    root = "https://seiga.nicovideo.jp"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # Number of leading images to pass to util.advance() in items();
        # subclasses raise it from their skip() methods.
        self.start_image = 0

    def items(self):
        """Log in and yield the messages describing all images

        The first element produced by get_images() is used as the
        directory metadata; every following element is merged into that
        same dict, which is then yielded together with the image URL.
        """
        self.login()
        images = iter(self.get_images())
        data = next(images)

        yield Message.Version, 1
        yield Message.Directory, data
        # presumably util.advance() skips the first 'start_image' entries
        # of the iterator -- confirm against the util module
        for image in util.advance(images, self.start_image):
            # 'data' is updated in place and re-yielded for every image
            data.update(image)
            data["extension"] = None
            yield Message.Url, self.get_image_url(data["image_id"]), data

    def get_images(self):
        """Return iterable containing metadata and images"""

    def get_image_url(self, image_id):
        """Get url for an image with id 'image_id'

        Sends a HEAD request to '<root>/image/source/<image_id>' without
        following redirects and returns the value of the 'Location'
        response header with its first "/o/" replaced by "/priv/".
        """
        url = "{}/image/source/{}".format(self.root, image_id)
        response = self.request(
            url, method="HEAD", allow_redirects=False, notfound="image")
        return response.headers["Location"].replace("/o/", "/priv/", 1)

    def login(self):
        """Login and set necessary cookies"""
        # One-element tuple: the trailing comma is required, otherwise a
        # bare string (an iterable of single characters) would be passed
        # instead of a collection of cookie names.
        if not self._check_cookies(("user_session",)):
            username, password = self._get_auth_info()
            self._update_cookies(self._login_impl(username, password))

    # maxage is 7 days' worth of seconds; keyarg=1 presumably keys the
    # cache on 'username' -- confirm against the cache module
    @cache(maxage=7*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Post the credentials to the login API and return the cookies

        Raises exception.AuthenticationError if no 'user_session' cookie
        is present in the session after the login request.
        """
        self.log.info("Logging in as %s", username)
        url = "https://account.nicovideo.jp/api/v1/login"
        data = {"mail_tel": username, "password": password}

        self.request(url, method="POST", data=data)
        if "user_session" not in self.session.cookies:
            raise exception.AuthenticationError()
        # drop the 'nicosid' cookie before handing the jar back
        del self.session.cookies["nicosid"]
        return self.session.cookies
|
2017-01-04 14:20:37 +01:00
|
|
|
|
|
|
|
|
|
|
|
class SeigaUserExtractor(SeigaExtractor):
    """Extractor for images of a user from seiga.nicovideo.jp"""
    subcategory = "user"
    directory_fmt = ("{category}", "{user[id]}")
    filename_fmt = "{category}_{user[id]}_{image_id}.{extension}"
    pattern = (r"(?:https?://)?(?:www\.|(?:sp\.)?seiga\.)?nicovideo\.jp/"
               r"user/illust/(\d+)(?:\?(?:[^&]+&)*sort=([^&#]+))?")
    test = (
        ("https://seiga.nicovideo.jp/user/illust/39537793", {
            "pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
            "count": ">= 4",
            "keyword": {
                "user": {
                    "id": 39537793,
                    "message": str,
                    "name": str,
                },
                "clips": int,
                "comments": int,
                "count": int,
                "extension": None,
                "image_id": int,
                "title": str,
                "views": int,
            },
        }),
        ("https://seiga.nicovideo.jp/user/illust/79433", {
            "exception": exception.NotFoundError,
        }),
        ("https://seiga.nicovideo.jp/user/illust/39537793"
         "?sort=image_view&target=illust_all"),
        ("https://sp.seiga.nicovideo.jp/user/illust/39537793"),
    )

    def __init__(self, match):
        SeigaExtractor.__init__(self, match)
        # group 1: numeric user id, group 2: optional 'sort' query value
        user_id, order = match.groups()
        self.user_id = user_id
        self.order = order
        self.start_page = 1

    def skip(self, num):
        """Arrange for 'num' images to be skipped and return 'num'

        Whole multiples of 40 advance the first requested listing page;
        the remainder is added to the per-image skip counter.
        """
        whole_pages, leftover = divmod(num, 40)
        self.start_page += whole_pages
        self.start_image += leftover
        return num

    def get_metadata(self, page):
        """Collect metadata from 'page'

        Returns a dict with a 'user' sub-dict (id, name, message) and the
        integer 'count'. Raises exception.NotFoundError when no user name
        could be extracted and the page carries the "user info could not
        be retrieved" notice.
        """
        rules = (
            ("name", '<img alt="', '"'),
            ("msg", '<li class="user_message">', '</li>'),
            # unnamed rule: only moves the search position forward
            (None, '<span class="target_name">すべて</span>', ''),
            ("count", '<span class="count ">', '</span>'),
        )
        found = text.extract_all(page, rules)[0]

        if not found["name"]:
            if "ユーザー情報が取得出来ませんでした" in page:
                raise exception.NotFoundError("user")

        user = {
            "id": text.parse_int(self.user_id),
            "name": found["name"],
            "message": (found["msg"] or "").strip(),
        }
        return {
            "user": user,
            "count": text.parse_int(found["count"]),
        }

    def get_images(self):
        """Yield the user metadata followed by one dict per listed image

        Listing pages are requested one after another, starting at
        'start_page', until a page contains fewer than 40 entries.
        """
        url = "{}/user/illust/{}".format(self.root, self.user_id)
        params = {
            "sort": self.order,
            "page": self.start_page,
            "target": "illust_all",
        }
        numeric_keys = ("image_id", "views", "comments", "clips")
        rules = (
            ("image_id", '/seiga/im', '"'),
            ("title", '<li class="title">', '</li>'),
            ("views", '</span>', '</li>'),
            ("comments", '</span>', '</li>'),
            ("clips", '</span>', '</li>'),
        )

        while True:
            page = self.request(url, params=params).text

            # metadata is emitted once, from the first requested page
            if params["page"] == self.start_page:
                yield self.get_metadata(page)

            seen = 0
            entries = text.extract_iter(
                page, '<li class="list_item', '</a></li> ')
            for seen, entry in enumerate(entries, 1):
                image = text.extract_all(entry, rules)[0]
                for key in numeric_keys:
                    image[key] = text.parse_int(image[key])
                yield image

            # a page with fewer than 40 entries is treated as the last one
            if seen < 40:
                return
            params["page"] += 1
|
2017-01-04 14:20:37 +01:00
|
|
|
|
|
|
|
|
|
|
|
class SeigaImageExtractor(SeigaExtractor):
    """Extractor for single images from seiga.nicovideo.jp"""
    subcategory = "image"
    filename_fmt = "{category}_{image_id}.{extension}"
    pattern = (r"(?:https?://)?(?:"
               r"(?:seiga\.|www\.)?nicovideo\.jp/(?:seiga/im|image/source/)"
               r"|sp\.seiga\.nicovideo\.jp/seiga/#!/im"
               r"|lohas\.nicoseiga\.jp/(?:thumb|(?:priv|o)/[^/]+/\d+)/)(\d+)")
    test = (
        ("https://seiga.nicovideo.jp/seiga/im5977527", {
            "keyword": "c8339781da260f7fc44894ad9ada016f53e3b12a",
            "content": "d9202292012178374d57fb0126f6124387265297",
        }),
        ("https://seiga.nicovideo.jp/seiga/im123", {
            "exception": exception.NotFoundError,
        }),
        ("https://seiga.nicovideo.jp/image/source/5977527"),
        ("https://sp.seiga.nicovideo.jp/seiga/#!/im5977527"),
        ("https://lohas.nicoseiga.jp/thumb/5977527i"),
        ("https://lohas.nicoseiga.jp/priv"
         "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),
        ("https://lohas.nicoseiga.jp/o"
         "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),
    )

    def __init__(self, match):
        SeigaExtractor.__init__(self, match)
        # the only capture group of 'pattern' is the numeric image id
        self.image_id = match.group(1)

    def skip(self, num):
        """Add 'num' to the per-image skip counter and return 'num'"""
        self.start_image = self.start_image + num
        return num

    def get_images(self):
        """Return the image's metadata dict twice, as a 2-tuple

        SeigaExtractor.items() consumes the first element as directory
        metadata and merges every following element into it, so the same
        dict is supplied for both roles.
        """
        page = self.request(
            "{}/seiga/im{}".format(self.root, self.image_id)).text

        image_rules = (
            ("date", '<li class="date"><span class="created">', '</span></li>'),
            ("title", '<h1 class="title">', '</h1>'),
            ("description", '<p class="discription">', '</p>'),
        )
        # start marker of the breadcrumb link to the uploader's page
        breadcrumb_link = (
            '<li itemscope itemtype="http://data-vocabulary.org/Breadcrumb">'
            '<a href="/user/illust/'
        )
        user_rules = (
            ("id", breadcrumb_link, '" itemprop="url">'),
            ("name", '<span itemprop="title">', '<span class="pankuzu_suffix">'),
        )

        image = text.extract_all(page, image_rules)[0]
        image["user"] = text.extract_all(page, user_rules)[0]

        image["description"] = text.remove_html(image["description"])
        # append seconds and a +09:00 UTC offset so that the "%S%z" part
        # of the format string has something to match
        timestamp = image["date"] + ":00+0900"
        image["date"] = text.parse_datetime(
            timestamp, "%Y年%m月%d日 %H:%M:%S%z")
        image["image_id"] = text.parse_int(self.image_id)

        return (image, image)
|