3fb41c34c8
The former implementation would try to use the embedded data from '/item/detail/' pages for every post, even if that wasn't really necessary. This commit also fixes some issues with posts only visible to logged in users.
189 lines
6.4 KiB
Python
189 lines
6.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2020 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Extractors for https://bcy.net/"""
|
|
|
|
from .common import Extractor, Message
|
|
from .. import text
|
|
import json
|
|
import re
|
|
|
|
|
|
class BcyExtractor(Extractor):
|
|
"""Base class for bcy extractors"""
|
|
category = "bcy"
|
|
directory_fmt = ("{category}", "{user[id]} {user[name]}")
|
|
filename_fmt = "{post[id]} {id}.{extension}"
|
|
archive_fmt = "{post[id]}_{id}"
|
|
root = "https://bcy.net"
|
|
|
|
def __init__(self, match):
|
|
Extractor.__init__(self, match)
|
|
self.item_id = match.group(1)
|
|
|
|
def items(self):
|
|
sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
|
|
iroot = "https://img-bcy-qn.pstatp.com"
|
|
noop = self.config("noop")
|
|
|
|
for post in self.posts():
|
|
if not post["image_list"]:
|
|
continue
|
|
|
|
multi = None
|
|
tags = post.get("post_tags") or ()
|
|
data = {
|
|
"user": {
|
|
"id" : post["uid"],
|
|
"name" : post["uname"],
|
|
"avatar" : sub(iroot, post["avatar"].partition("~")[0]),
|
|
},
|
|
"post": {
|
|
"id" : text.parse_int(post["item_id"]),
|
|
"tags" : [t["tag_name"] for t in tags],
|
|
"date" : text.parse_timestamp(post["ctime"]),
|
|
"parody" : post["work"],
|
|
"content": post["plain"],
|
|
"likes" : post["like_count"],
|
|
"shares" : post["share_count"],
|
|
"replies": post["reply_count"],
|
|
},
|
|
}
|
|
|
|
yield Message.Directory, data
|
|
for data["num"], image in enumerate(post["image_list"], 1):
|
|
data["id"] = image["mid"]
|
|
data["width"] = image["w"]
|
|
data["height"] = image["h"]
|
|
|
|
url = image["path"].partition("~")[0]
|
|
text.nameext_from_url(url, data)
|
|
|
|
if data["extension"]:
|
|
if not url.startswith(iroot):
|
|
url = sub(iroot, url)
|
|
data["filter"] = ""
|
|
yield Message.Url, url, data
|
|
|
|
else:
|
|
if not multi:
|
|
if len(post["multi"]) < len(post["image_list"]):
|
|
multi = self._data_from_post(post["item_id"])
|
|
multi = multi["post_data"]["multi"]
|
|
else:
|
|
multi = post["multi"]
|
|
image = multi[data["num"] - 1]
|
|
|
|
if image["origin"]:
|
|
data["filter"] = "watermark"
|
|
yield Message.Url, image["origin"], data
|
|
|
|
if noop:
|
|
data["extension"] = ""
|
|
data["filter"] = "noop"
|
|
yield Message.Url, image["original_path"], data
|
|
|
|
def posts(self):
|
|
"""Returns an iterable with all relevant 'post' objects"""
|
|
|
|
def _data_from_post(self, post_id):
|
|
url = "{}/item/detail/{}".format(self.root, post_id)
|
|
page = self.request(url).text
|
|
return json.loads(
|
|
text.extract(page, 'JSON.parse("', '");')[0]
|
|
.replace('\\\\u002F', '/')
|
|
.replace('\\"', '"')
|
|
)["detail"]
|
|
|
|
|
|
class BcyUserExtractor(BcyExtractor):
|
|
"""Extractor for user timelines"""
|
|
subcategory = "user"
|
|
pattern = r"(?:https?://)?bcy\.net/u/(\d+)"
|
|
test = (
|
|
("https://bcy.net/u/1933712", {
|
|
"pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg",
|
|
"count": ">= 25",
|
|
}),
|
|
("https://bcy.net/u/109282764041", {
|
|
"pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+"
|
|
r"~tplv-banciyuan-logo-v3:.+\.image",
|
|
"range": "1-25",
|
|
"count": 25,
|
|
}),
|
|
)
|
|
|
|
def posts(self):
|
|
url = self.root + "/apiv3/user/selfPosts"
|
|
params = {"uid": self.item_id, "since": None}
|
|
|
|
while True:
|
|
data = self.request(url, params=params).json()
|
|
|
|
item = None
|
|
for item in data["data"]["items"]:
|
|
yield item["item_detail"]
|
|
|
|
if not item:
|
|
return
|
|
params["since"] = item["since"]
|
|
|
|
|
|
class BcyPostExtractor(BcyExtractor):
|
|
"""Extractor for individual posts"""
|
|
subcategory = "post"
|
|
pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)"
|
|
test = (
|
|
("https://bcy.net/item/detail/6355835481002893070", {
|
|
"url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3",
|
|
"count": 1,
|
|
"keyword": {
|
|
"user": {
|
|
"id" : 1933712,
|
|
"name" : "wukloo",
|
|
"avatar" : "re:https://img-bcy-qn.pstatp.com/Public/",
|
|
},
|
|
"post": {
|
|
"id" : 6355835481002893070,
|
|
"tags" : list,
|
|
"date" : "dt:2016-11-22 08:47:46",
|
|
"parody" : "东方PROJECT",
|
|
"content": "re:根据微博的建议稍微做了点修改",
|
|
"likes" : int,
|
|
"shares" : int,
|
|
"replies": int,
|
|
},
|
|
"id": 8330182,
|
|
"num": 1,
|
|
"width" : 3000,
|
|
"height": 1687,
|
|
"filename": "712e0780b09011e696f973c3d1568337",
|
|
"extension": "jpg",
|
|
},
|
|
}),
|
|
# only watermarked images available
|
|
("https://bcy.net/item/detail/6780546160802143236", {
|
|
"pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+"
|
|
r"~tplv-banciyuan-logo-v3:.+\.image",
|
|
"count": 8,
|
|
"keyword": {"filter": "watermark"}
|
|
}),
|
|
# only visible to logged in users
|
|
("https://bcy.net/item/detail/6747523535150783495", {
|
|
"count": 0,
|
|
}),
|
|
)
|
|
|
|
def posts(self):
|
|
data = self._data_from_post(self.item_id)
|
|
post = data["post_data"]
|
|
post["image_list"] = post["multi"]
|
|
post["plain"] = text.parse_unicode_escapes(post["plain"])
|
|
post.update(data["detail_user"])
|
|
return (post,)
|