b1db194c14
- split 'tags' into a list - parse 'date' into a datetime object - fix webm/mp4 URLs
339 lines
11 KiB
Python
339 lines
11 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2019 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Generic extractors for *reactor sites"""
|
|
|
|
from .common import Extractor, Message, SharedConfigMixin
|
|
from .. import text
|
|
import urllib.parse
|
|
import random
|
|
import time
|
|
import json
|
|
|
|
|
|
BASE_PATTERN = r"(?:https?://)?([^/.]+\.reactor\.cc)"
|
|
|
|
|
|
class ReactorExtractor(SharedConfigMixin, Extractor):
|
|
"""Base class for *reactor.cc extractors"""
|
|
basecategory = "reactor"
|
|
filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
|
|
archive_fmt = "{post_id}_{num}"
|
|
|
|
def __init__(self, match):
|
|
Extractor.__init__(self, match)
|
|
self.root = "http://" + match.group(1)
|
|
self.session.headers["Referer"] = self.root
|
|
|
|
self.wait_min = self.config("wait-min", 3)
|
|
self.wait_max = self.config("wait-max", 6)
|
|
if self.wait_max < self.wait_min:
|
|
self.wait_max = self.wait_min
|
|
|
|
if not self.category:
|
|
# set category based on domain name
|
|
netloc = urllib.parse.urlsplit(self.root).netloc
|
|
self.category = netloc.rpartition(".")[0]
|
|
|
|
def items(self):
|
|
data = self.metadata()
|
|
yield Message.Version, 1
|
|
yield Message.Directory, data
|
|
for post in self.posts():
|
|
for image in self._parse_post(post):
|
|
url = image["url"]
|
|
image.update(data)
|
|
yield Message.Url, url, text.nameext_from_url(url, image)
|
|
|
|
def metadata(self):
|
|
"""Collect metadata for extractor-job"""
|
|
return {}
|
|
|
|
def posts(self):
|
|
"""Return all relevant post-objects"""
|
|
return self._pagination(self.url)
|
|
|
|
def _pagination(self, url):
|
|
while True:
|
|
time.sleep(random.uniform(self.wait_min, self.wait_max))
|
|
|
|
response = self.request(url)
|
|
if response.history:
|
|
# sometimes there is a redirect from
|
|
# the last page of a listing (.../tag/<tag>/1)
|
|
# to the first page (.../tag/<tag>)
|
|
# which could cause an endless loop
|
|
cnt_old = response.history[0].url.count("/")
|
|
cnt_new = response.url.count("/")
|
|
if cnt_old == 5 and cnt_new == 4:
|
|
return
|
|
page = response.text
|
|
|
|
yield from text.extract_iter(
|
|
page, '<div class="uhead">', '<div class="ufoot">')
|
|
|
|
try:
|
|
pos = page.index("class='next'")
|
|
pos = page.rindex("class='current'", 0, pos)
|
|
url = self.root + text.extract(page, "href='", "'", pos)[0]
|
|
except (ValueError, TypeError):
|
|
return
|
|
|
|
def _parse_post(self, post):
|
|
post, _, script = post.partition('<script type="application/ld+json">')
|
|
images = text.extract_iter(post, '<div class="image">', '</div>')
|
|
script = script[:script.index("</")].strip()
|
|
|
|
try:
|
|
data = json.loads(script)
|
|
except ValueError:
|
|
try:
|
|
# remove control characters and escape backslashes
|
|
mapping = dict.fromkeys(range(32))
|
|
script = script.translate(mapping).replace("\\", "\\\\")
|
|
data = json.loads(script)
|
|
except ValueError as exc:
|
|
self.log.warning("Unable to parse JSON data: %s", exc)
|
|
return
|
|
|
|
num = 0
|
|
date = text.parse_datetime(data["datePublished"])
|
|
user = data["author"]["name"]
|
|
description = text.unescape(data["description"])
|
|
title, _, tags = text.unescape(data["headline"]).partition(" / ")
|
|
post_id = text.parse_int(
|
|
data["mainEntityOfPage"]["@id"].rpartition("/")[2])
|
|
|
|
if not tags:
|
|
title, tags = tags, title
|
|
tags = tags.split(" :: ")
|
|
|
|
for image in images:
|
|
url = text.extract(image, ' src="', '"')[0]
|
|
if not url:
|
|
continue
|
|
width = text.extract(image, ' width="', '"')[0]
|
|
height = text.extract(image, ' height="', '"')[0]
|
|
image_id = url.rpartition("-")[2].partition(".")[0]
|
|
num += 1
|
|
|
|
if image.startswith("<iframe "): # embed
|
|
url = "ytdl:" + text.unescape(url)
|
|
elif "/post/webm/" not in url and "/post/mp4/" not in url:
|
|
url = url.replace("/post/", "/post/full/")
|
|
|
|
yield {
|
|
"url": url,
|
|
"post_id": post_id,
|
|
"image_id": text.parse_int(image_id),
|
|
"width": text.parse_int(width),
|
|
"height": text.parse_int(height),
|
|
"title": title,
|
|
"description": description,
|
|
"tags": tags,
|
|
"date": date,
|
|
"user": user,
|
|
"num": num,
|
|
}
|
|
|
|
|
|
class ReactorTagExtractor(ReactorExtractor):
|
|
"""Extractor for tag searches on *reactor.cc sites"""
|
|
subcategory = "tag"
|
|
directory_fmt = ("{category}", "{search_tags}")
|
|
archive_fmt = "{search_tags}_{post_id}_{num}"
|
|
pattern = BASE_PATTERN + r"/tag/([^/?&#]+)"
|
|
test = ("http://anime.reactor.cc/tag/Anime+Art",)
|
|
|
|
def __init__(self, match):
|
|
ReactorExtractor.__init__(self, match)
|
|
self.tag = match.group(2)
|
|
|
|
def metadata(self):
|
|
return {"search_tags": text.unescape(self.tag).replace("+", " ")}
|
|
|
|
|
|
class ReactorSearchExtractor(ReactorTagExtractor):
|
|
"""Extractor for search results on *reactor.cc sites"""
|
|
subcategory = "search"
|
|
directory_fmt = ("{category}", "search", "{search_tags}")
|
|
archive_fmt = "s_{search_tags}_{post_id}_{num}"
|
|
pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"
|
|
test = ("http://anime.reactor.cc/search?q=Art",)
|
|
|
|
|
|
class ReactorUserExtractor(ReactorExtractor):
|
|
"""Extractor for all posts of a user on *reactor.cc sites"""
|
|
subcategory = "user"
|
|
directory_fmt = ("{category}", "user", "{user}")
|
|
pattern = BASE_PATTERN + r"/user/([^/?&#]+)"
|
|
test = ("http://anime.reactor.cc/user/Shuster",)
|
|
|
|
def __init__(self, match):
|
|
ReactorExtractor.__init__(self, match)
|
|
self.user = match.group(2)
|
|
|
|
def metadata(self):
|
|
return {"user": text.unescape(self.user).replace("+", " ")}
|
|
|
|
|
|
class ReactorPostExtractor(ReactorExtractor):
|
|
"""Extractor for single posts on *reactor.cc sites"""
|
|
subcategory = "post"
|
|
pattern = BASE_PATTERN + r"/post/(\d+)"
|
|
test = ("http://anime.reactor.cc/post/3576250",)
|
|
|
|
def __init__(self, match):
|
|
ReactorExtractor.__init__(self, match)
|
|
self.post_id = match.group(2)
|
|
|
|
def items(self):
|
|
yield Message.Version, 1
|
|
post = self.request(self.url).text
|
|
pos = post.find('class="uhead">')
|
|
for image in self._parse_post(post[pos:]):
|
|
if image["num"] == 1:
|
|
yield Message.Directory, image
|
|
url = image["url"]
|
|
yield Message.Url, url, text.nameext_from_url(url, image)
|
|
|
|
|
|
# --------------------------------------------------------------------
|
|
# JoyReactor
|
|
|
|
JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))"
|
|
|
|
|
|
class JoyreactorTagExtractor(ReactorTagExtractor):
|
|
"""Extractor for tag searches on joyreactor.cc"""
|
|
category = "joyreactor"
|
|
pattern = JR_BASE_PATTERN + r"/tag/([^/?&#]+)"
|
|
test = (
|
|
("http://joyreactor.cc/tag/Advent+Cirno", {
|
|
"count": ">= 17",
|
|
}),
|
|
("http://joyreactor.com/tag/Cirno", {
|
|
"url": "de1e60c15bfb07a0e9603b00dc3d05f60edc7914",
|
|
}),
|
|
)
|
|
|
|
|
|
class JoyreactorSearchExtractor(ReactorSearchExtractor):
|
|
"""Extractor for search results on joyreactor.cc"""
|
|
category = "joyreactor"
|
|
pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"
|
|
test = (
|
|
("http://joyreactor.cc/search/Cirno+Gifs", {
|
|
"range": "1-25",
|
|
"count": ">= 20",
|
|
}),
|
|
("http://joyreactor.com/search?q=Cirno+Gifs", {
|
|
"count": 0, # no search results on joyreactor.com
|
|
}),
|
|
)
|
|
|
|
|
|
class JoyreactorUserExtractor(ReactorUserExtractor):
|
|
"""Extractor for all posts of a user on joyreactor.cc"""
|
|
category = "joyreactor"
|
|
pattern = JR_BASE_PATTERN + r"/user/([^/?&#]+)"
|
|
test = (
|
|
("http://joyreactor.cc/user/hemantic"),
|
|
("http://joyreactor.com/user/Tacoman123", {
|
|
"url": "452cd0fa23e2ad0e122c296ba75aa7f0b29329f6",
|
|
}),
|
|
)
|
|
|
|
|
|
class JoyreactorPostExtractor(ReactorPostExtractor):
|
|
"""Extractor for single posts on joyreactor.cc"""
|
|
category = "joyreactor"
|
|
pattern = JR_BASE_PATTERN + r"/post/(\d+)"
|
|
test = (
|
|
("http://joyreactor.com/post/3721876", { # single image
|
|
"url": "6ce09f239d8b7fdf6dd1664c2afc39618cc87663",
|
|
"keyword": "966d2acd462732a9ed823a9db5ed19f95734fd10",
|
|
}),
|
|
("http://joyreactor.com/post/3713804", { # 4 images
|
|
"url": "f08ac8493ca0619a3e3c6bedb8d8374af3eec304",
|
|
"keyword": "84e34d402342607045a65fab6d4d593d146c238a",
|
|
}),
|
|
("http://joyreactor.com/post/3726210", { # gif / video
|
|
"url": "33a48e1eca6cb2d298fbbb6536b3283799d6515b",
|
|
"keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47",
|
|
}),
|
|
("http://joyreactor.com/post/3668724", { # youtube embed
|
|
"url": "be2589e2e8f3ffcaf41b34bc28bfad850ccea34a",
|
|
"keyword": "da61b9e2887db95759950df5fb89c9d32f8e7651",
|
|
}),
|
|
("http://joyreactor.cc/post/1299", { # "malformed" JSON
|
|
"url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde",
|
|
}),
|
|
)
|
|
|
|
|
|
# --------------------------------------------------------------------
|
|
# PornReactor
|
|
|
|
PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor.com)"
|
|
|
|
|
|
class PornreactorTagExtractor(ReactorTagExtractor):
|
|
"""Extractor for tag searches on pornreactor.cc"""
|
|
category = "pornreactor"
|
|
pattern = PR_BASE_PATTERN + r"/tag/([^/?&#]+)"
|
|
test = (
|
|
("http://pornreactor.cc/tag/RiceGnat", {
|
|
"range": "1-25",
|
|
"count": ">= 25",
|
|
}),
|
|
("http://fapreactor.com/tag/RiceGnat"),
|
|
)
|
|
|
|
|
|
class PornreactorSearchExtractor(ReactorSearchExtractor):
|
|
"""Extractor for search results on pornreactor.cc"""
|
|
category = "pornreactor"
|
|
pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"
|
|
test = (
|
|
("http://pornreactor.cc/search?q=ecchi+hentai", {
|
|
"range": "1-25",
|
|
"count": ">= 25",
|
|
}),
|
|
("http://fapreactor.com/search/ecchi+hentai"),
|
|
)
|
|
|
|
|
|
class PornreactorUserExtractor(ReactorUserExtractor):
|
|
"""Extractor for all posts of a user on pornreactor.cc"""
|
|
category = "pornreactor"
|
|
pattern = PR_BASE_PATTERN + r"/user/([^/?&#]+)"
|
|
test = (
|
|
("http://pornreactor.cc/user/Disillusion", {
|
|
"range": "1-25",
|
|
"count": ">= 25",
|
|
}),
|
|
("http://fapreactor.com/user/Disillusion"),
|
|
)
|
|
|
|
|
|
class PornreactorPostExtractor(ReactorPostExtractor):
|
|
"""Extractor for single posts on pornreactor.cc"""
|
|
category = "pornreactor"
|
|
subcategory = "post"
|
|
pattern = PR_BASE_PATTERN + r"/post/(\d+)"
|
|
test = (
|
|
("http://pornreactor.cc/post/863166", {
|
|
"url": "680db1e33ca92ff70b2c0e1708c471cbe2201324",
|
|
"content": "ec6b0568bfb1803648744077da082d14de844340",
|
|
}),
|
|
("http://fapreactor.com/post/863166", {
|
|
"url": "864ecd5785e4898301aa8d054dd653b1165be158",
|
|
}),
|
|
)
|