2020-02-11 19:51:24 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
# Copyright 2020 Mike Fährmann
|
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extractors for https://www.furaffinity.net/"""
|
|
|
|
|
|
|
|
from .common import Extractor, Message
|
|
|
|
from .. import text, util
|
|
|
|
|
|
|
|
|
2020-02-15 19:12:17 +01:00
|
|
|
# Shared URL prefix for the extractor patterns in this module: an optional
# http(s) scheme and an optional "www." or "sfw." subdomain in front of
# furaffinity.net
BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net"
|
2020-02-11 19:51:24 +01:00
|
|
|
|
|
|
|
|
|
|
|
class FuraffinityExtractor(Extractor):
    """Base class for furaffinity extractors

    Subclasses decide which post IDs get processed by overriding posts()
    and may add extra fields to every post by overriding metadata().
    """
    category = "furaffinity"
    directory_fmt = ("{category}", "{user!l}")
    filename_fmt = "{id} {title}.{extension}"
    archive_fmt = "{id}"
    cookiedomain = ".furaffinity.net"
    root = "https://www.furaffinity.net"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # first capture group of the subclass' URL pattern
        self.user = match.group(1)
        # number of leading posts to pass over; increased by skip()
        self.offset = 0

    def items(self):
        """Yield a Directory and a Url message for every parsable post"""
        # metadata() runs before posts() is called
        extra = self.metadata()

        for post_id in util.advance(self.posts(), self.offset):
            post = self._parse_post(post_id)
            if not post:
                # _parse_post() has already logged a warning for this ID
                continue
            if extra:
                post.update(extra)
            yield Message.Directory, post
            yield Message.Url, post["url"], post

    def posts(self):
        """Return an iterable of post IDs (default: paginated listing)"""
        return self._pagination()

    def metadata(self):
        """Return additional fields for each post, or None for no extras"""
        return None

    def skip(self, num):
        """Raise the post offset by 'num' and report 'num' as skipped"""
        self.offset += num
        return num

    def _parse_post(self, post_id):
        """Request the view page of 'post_id' and collect its metadata

        Returns the metadata dict, or None (after logging a warning) when
        the page has no download link.
        """
        page_url = "{}/view/{}/".format(self.root, post_id)
        # NOTE(review): the callable from extract_from() appears to resume
        # where its previous call stopped (see the width/height pairs
        # below), so the order of the extr() calls must not change
        extr = text.extract_from(self.request(page_url).text)

        og_title = text.unescape(extr('property="og:title" content="', '"'))
        title, _, artist = og_title.rpartition(" by ")
        artist_url = artist.replace("_", "").lower()
        path = extr('href="//d.facdn.net/', '"')

        if not path:
            # try the new layout's system message first, then the old one
            message = text.remove_html(
                extr('System Message', '</section>') or
                extr('System Message', '</table>'))
            self.log.warning(
                "Unable to download post %s (\"%s\")", post_id, message)
            return None

        to_int = text.parse_int
        strip_html = text.remove_html

        post = text.nameext_from_url(path, {
            "id"        : to_int(post_id),
            "title"     : title,
            "artist"    : artist,
            "artist_url": artist_url,
            "user"      : self.user or artist_url,
            "url"       : "https://d.facdn.net/" + path,
        })

        tags = extr('class="tags-row">', '</section>')
        if tags:
            # new site layout
            post["tags"] = text.split_html(tags)
            post["description"] = text.unescape(strip_html(
                extr('class="section-body">', '</div>'), "", ""))
            for key, marker in (
                ("views"    , 'class="views">'),
                ("favorites", 'class="favorites">'),
                ("comments" , 'class="comments">'),
            ):
                post[key] = to_int(strip_html(extr(marker, '</span>')))
            post["rating"] = strip_html(extr('class="rating">', '</span>'))
            post["fa_category"] = strip_html(
                extr('>Category</strong>', '</span>'))
            post["theme"] = strip_html(extr('>', '<'))
            post["species"] = strip_html(
                extr('>Species</strong>', '</div>'))
            post["gender"] = strip_html(
                extr('>Gender</strong>', '</div>'))
            post["width"] = to_int(extr("<span>", "x"))
            post["height"] = to_int(extr("", "p"))
        else:
            # old site layout
            for key, marker in (
                ("fa_category", "<b>Category:</b>"),
                ("theme"      , "<b>Theme:</b>"),
                ("species"    , "<b>Species:</b>"),
                ("gender"     , "<b>Gender:</b>"),
            ):
                post[key] = extr(marker, "<").strip()
            for key, marker in (
                ("favorites", "<b>Favorites:</b>"),
                ("comments" , "<b>Comments:</b>"),
                ("views"    , "<b>Views:</b>"),
            ):
                post[key] = to_int(extr(marker, "<"))
            post["width"] = to_int(extr("<b>Resolution:</b>", "x"))
            post["height"] = to_int(extr("", "<"))
            keywords = text.split_html(extr('id="keywords">', '</div>'))
            post["tags"] = keywords[::2]
            post["rating"] = extr('<img alt="', ' ')
            post["description"] = text.unescape(strip_html(
                extr("</table>", "</table>"), "", ""))

        # the part of the filename before its first "." is parsed as the
        # upload timestamp
        stamp = post["filename"].partition(".")[0]
        post["date"] = text.parse_timestamp(stamp)

        return post

    def _pagination(self):
        """Yield post IDs from '<root>/<subcategory>/<user>/<num>/' pages

        Page numbers start at 1; iteration ends with the first page that
        provides no (non-empty) post ID.
        """
        page_num = 0

        while True:
            page_num += 1
            url = "{}/{}/{}/{}/".format(
                self.root, self.subcategory, self.user, page_num)
            page = self.request(url).text

            last_id = None
            for last_id in text.extract_iter(page, 'id="sid-', '"'):
                yield last_id

            if not last_id:
                return

    def _pagination_favorites(self):
        """Yield post IDs from a user's favorites

        Follows the link extracted after 'right" href="' on each page
        until no such link is found.
        """
        path = "/favorites/{}/".format(self.user)

        while path:
            page = self.request(self.root + path).text
            for post_id in text.extract_iter(page, 'id="sid-', '"'):
                yield post_id
            path = text.extract(page, 'right" href="', '"')[0]

    def _pagination_search(self, query):
        """Yield post IDs from search results for 'query'

        'query' is a mapping of search form fields; its entries override
        the defaults below. Results are requested via POST and the "page"
        field is incremented until a result page provides no post ID.
        """
        url = self.root + "/search/"
        params = {
            "page"           : 0,
            "next_page"      : "Next",
            "order-by"       : "relevancy",
            "order-direction": "desc",
            "range"          : "all",
            "rating-general" : "on",
            "rating-mature"  : "on",
            "rating-adult"   : "on",
            "type-art"       : "on",
            "type-music"     : "on",
            "type-flash"     : "on",
            "type-story"     : "on",
            "type-photo"     : "on",
            "type-poetry"    : "on",
            "mode"           : "extended",
        }

        params.update(query)
        if "page" in query:
            # needs to be an integer for the increment at the loop's end
            params["page"] = text.parse_int(query["page"])

        while True:
            page = self.request(url, method="POST", data=params).text

            last_id = None
            for last_id in text.extract_iter(page, 'id="sid-', '"'):
                yield last_id

            if not last_id:
                return
            params["page"] += 1
|
|
|
|
|
2020-02-11 19:51:24 +01:00
|
|
|
|
|
|
|
class FuraffinityGalleryExtractor(FuraffinityExtractor):
    """Extractor for a furaffinity user's gallery

    Defines no methods of its own; everything besides the attributes
    below is inherited from FuraffinityExtractor.
    """
    subcategory = "gallery"
    pattern = BASE_PATTERN + r"/gallery/([^/?&#]+)"
    test = (
        "https://www.furaffinity.net/gallery/mirlinthloth/",
        {
            "pattern": r"https://d.facdn.net/art/mirlinthloth/\d+/\d+.\w+\.\w+",
            "range"  : "45-50",
            "count"  : 6,
        },
    )
|
|
|
|
|
|
|
|
|
|
|
|
class FuraffinityScrapsExtractor(FuraffinityExtractor):
    """Extractor for a furaffinity user's scraps

    Defines no methods of its own; files are placed in a "Scraps"
    subdirectory of the user's directory.
    """
    subcategory = "scraps"
    directory_fmt = ("{category}", "{user!l}", "Scraps")
    pattern = BASE_PATTERN + r"/scraps/([^/?&#]+)"
    test = (
        "https://www.furaffinity.net/scraps/mirlinthloth/",
        {
            "pattern": r"https://d.facdn.net/art/[^/]+(/stories)?/\d+/\d+.\w+.\w+",
            "count"  : ">= 3",
        },
    )
|
|
|
|
|
|
|
|
|
|
|
|
class FuraffinityFavoriteExtractor(FuraffinityExtractor):
    """Extractor for a furaffinity user's favorites

    Files are placed in a "Favorites" subdirectory of the user's
    directory.
    """
    subcategory = "favorite"
    directory_fmt = ("{category}", "{user!l}", "Favorites")
    pattern = BASE_PATTERN + r"/favorites/([^/?&#]+)"
    test = (
        "https://www.furaffinity.net/favorites/mirlinthloth/",
        {
            "pattern": r"https://d.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+",
            "range"  : "45-50",
            "count"  : 6,
        },
    )

    def posts(self):
        """Return post IDs from the link-following favorites pagination"""
        favorites = self._pagination_favorites()
        return favorites
|
|
|
|
|
|
|
|
|
2020-08-18 21:26:46 +02:00
|
|
|
class FuraffinitySearchExtractor(FuraffinityExtractor):
    """Extractor for furaffinity search results

    The group captured by 'pattern' is the URL's query string, which
    the base class stores as 'self.user'.
    """
    subcategory = "search"
    directory_fmt = ("{category}", "Search", "{search}")
    pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
    test = (
        "https://www.furaffinity.net/search/?q=cute",
        {
            "pattern": r"https://d.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+",
            "range"  : "45-50",
            "count"  : 6,
        },
    )

    def metadata(self):
        """Parse the query string and expose its 'q' value as 'search'"""
        query = text.parse_query(self.user)
        # stored on the instance because posts() reads it afterwards
        self.query = query
        return {"search": query.get("q")}

    def posts(self):
        """Return post IDs for the query parsed by metadata()"""
        return self._pagination_search(self.query)
|
|
|
|
|
|
|
|
|
2020-02-11 19:51:24 +01:00
|
|
|
class FuraffinityPostExtractor(FuraffinityExtractor):
    """Extractor for individual posts on furaffinity

    The group captured by 'pattern' is a numeric post ID, which the
    base class stores as 'self.user'.
    """
    subcategory = "post"
    pattern = BASE_PATTERN + r"/(?:view|full)/(\d+)"
    test = (
        ("https://www.furaffinity.net/view/21835115/", {
            "url": "eae4ef93d99365c69b31a37561bd800c03d336ad",
            "keyword": {
                "artist"     : "mirlinthloth",
                "artist_url" : "mirlinthloth",
                "date"       : "dt:2016-11-27 17:24:06",
                "description": "A Song made playing the game Cosmic DJ.",
                "extension"  : "mp3",
                "filename"   : r"re:\d+\.\w+_dj_fennmink_-_bude_s_4_ever",
                "id"         : 21835115,
                "tags"       : list,
                "title"      : "Bude's 4 Ever",
                "url"        : "re:https://d.facdn.net/art/mirlinthloth/music",
                "user"       : "mirlinthloth",
                "views"      : int,
                "favorites"  : int,
                "comments"   : int,
                "rating"     : "General",
                "fa_category": "Music",
                "theme"      : "All",
                "species"    : "Unspecified / Any",
                "gender"     : "Any",
                "width"      : 120,
                "height"     : 120,
            },
        }),
        "https://furaffinity.net/view/21835115/",
        "https://sfw.furaffinity.net/view/21835115/",
        "https://www.furaffinity.net/full/21835115/",
    )

    def posts(self):
        """Return a 1-tuple holding the post ID taken from the URL

        'self.user' is reset to None, so that _parse_post() falls back
        to the artist's name for the "user" field.
        """
        post_id, self.user = self.user, None
        return (post_id,)
|
|
|
|
|
|
|
|
|
|
|
|
class FuraffinityUserExtractor(FuraffinityExtractor):
    """Extractor for furaffinity user profiles

    Produces no posts itself; items() hands the user's gallery, scraps
    and favorites URLs to the respective extractors.
    """
    subcategory = "user"
    cookiedomain = None
    pattern = BASE_PATTERN + r"/user/([^/?&#]+)"
    test = (
        ("https://www.furaffinity.net/user/mirlinthloth/", {
            "pattern": r"/gallery/mirlinthloth/$",
        }),
        ("https://www.furaffinity.net/user/mirlinthloth/", {
            "options": (("include", "all"),),
            "pattern": r"/(gallery|scraps|favorites)/mirlinthloth/$",
            "count": 3,
        }),
    )

    def items(self):
        """Dispatch to the gallery, scraps and favorites extractors

        Returns the result of _dispatch_extractors(), called with
        (extractor class, URL) pairs and ("gallery",) as second argument.
        """
        def section_url(section):
            # Build each URL in a single format() pass. Re-formatting a
            # template that already contains the user name raises for
            # names with "{" or "}" in them.
            return "{}/{}/{}/".format(self.root, section, self.user)

        return self._dispatch_extractors((
            (FuraffinityGalleryExtractor , section_url("gallery")),
            (FuraffinityScrapsExtractor  , section_url("scraps")),
            (FuraffinityFavoriteExtractor, section_url("favorites")),
        ), ("gallery",))
|
2020-04-17 22:18:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
class FuraffinityFollowingExtractor(FuraffinityExtractor):
    """Extractor for a furaffinity user's watched users

    Emits one Queue message per link found on the watchlist pages, each
    one tagged for FuraffinityUserExtractor.
    """
    subcategory = "following"
    pattern = BASE_PATTERN + r"/watchlist/by/([^/?&#]+)"
    test = ("https://www.furaffinity.net/watchlist/by/mirlinthloth/", {
        "pattern": FuraffinityUserExtractor.pattern,
        "range": "176-225",
        "count": 50,
    })

    def items(self):
        """Yield a Queue message for every link on each watchlist page

        The next page's path is taken from the last 'action="' attribute
        of the current page. Iteration stops when that attribute is
        missing or empty, or when it refers back to the current URL.
        """
        url = "{}/watchlist/by/{}/".format(self.root, self.user)
        data = {"_extractor": FuraffinityUserExtractor}

        while True:
            page = self.request(url).text

            for path in text.extract_iter(page, '<a href="', '"'):
                yield Message.Queue, self.root + path, data

            path = text.rextract(page, 'action="', '"')[0]
            # 'path' is presumably None when the page has no 'action="'
            # attribute; str.endswith() would raise TypeError for that,
            # so treat a missing path as the end of the list
            if not path or url.endswith(path):
                return
            url = self.root + path
|