[hentaifoundry] split 'artist' into 'user'+'artist'

and some smaller changes ...

'user' is the name of the account under which an image is listed, and
'artist' is now the name of the account that created the image.

For example, for "https://www.hentai-foundry.com/user/Tenpura/faves/pictures":
- 'user': Tenpura
- 'artist' of the only image: LewdBrush
This commit is contained in:
Mike Fährmann 2018-09-22 17:38:09 +02:00
parent 55f5c87160
commit 1c95a0173f
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

View File

@ -15,19 +15,18 @@ from .. import text, util, exception
class HentaifoundryExtractor(Extractor):
"""Base class for hentaifoundry extractors"""
category = "hentaifoundry"
directory_fmt = ["{category}", "{artist}"]
directory_fmt = ["{category}", "{user}"]
filename_fmt = "{category}_{index}_{title}.{extension}"
archive_fmt = "{index}"
root = "https://www.hentai-foundry.com"
per_page = 25
def __init__(self, artist, needle=""):
def __init__(self, user="", page=1):
Extractor.__init__(self)
self.artist = artist
self.needle = needle
self.artist_url = "{}/pictures/user/{}".format(self.root, artist)
self.start_page = 1
self.url = ""
self.user = user
self.start_post = 0
self.start_page = text.parse_int(page, 1)
def items(self):
data = self.get_job_metadata()
@ -48,18 +47,25 @@ class HentaifoundryExtractor(Extractor):
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
page = self.request(self.artist_url + "?enterAgree=1").text
needle = ' >{} ('.format(self.needle)
count = text.parse_int(text.extract(page, needle, ')')[0])
return {"artist": self.artist, "count": count}
self.request(self.root + "/?enterAgree=1")
return {"user": self.user}
def get_image_pages(self):
"""Yield urls all image pages of one artist"""
"""Yield urls of all relevant image pages"""
num = self.start_page
def get_image_metadata(self, url):
"""Collect metadata for an image"""
page = self.request(text.urljoin(self.root, url)).text
index = url.rsplit("/", 2)[1]
while True:
page = self.request("{}/page/{}".format(self.url, num)).text
yield from text.extract_iter(page, 'thumbTitle"><a href="', '"')
if 'class="pager"' not in page or 'class="last hidden"' in page:
return
num += 1
def get_image_metadata(self, page_url):
"""Collect url and metadata from an image page"""
page = self.request(text.urljoin(self.root, page_url)).text
index = page_url.rsplit("/", 2)[1]
title , pos = text.extract(page, '<title>', '</title>')
width , pos = text.extract(page, 'width="', '"', pos)
height, pos = text.extract(page, 'height="', '"', pos)
@ -110,28 +116,17 @@ class HentaifoundryExtractor(Extractor):
url = self.root + "/site/filters"
self.request(url, method="POST", data=data)
def _pagination(self, url):
num = self.start_page
while True:
page = self.request("{}/page/{}".format(url, num)).text
yield from text.extract_iter(page, 'thumbTitle"><a href="', '"')
if 'class="pager"' not in page or 'class="last hidden"' in page:
return
num += 1
class HentaifoundryUserExtractor(HentaifoundryExtractor):
"""Extractor for all images of a hentai-foundry-user"""
subcategory = "user"
pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
r"/(?:pictures/user/([^/]+)(?:/(?:page/(\d+))?)?$"
r"/(?:pictures/user/([^/]+)(?:/page/(\d+))?/?$"
r"|user/([^/]+)/profile)"]
test = [
("https://www.hentai-foundry.com/pictures/user/Tenpura", {
"url": "ebbc981a85073745e3ca64a0f2ab31fab967fc28",
"keyword": "98dc5e3856a38243ad4be3e428dc6a069243bc13",
"keyword": "d56e75566dc7dfe71d2ebd08c056a47f8832372d",
}),
("https://www.hentai-foundry.com/pictures/user/Tenpura/page/3", None),
("https://www.hentai-foundry.com/user/Tenpura/profile", None),
@ -139,34 +134,38 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(
self, match.group(1) or match.group(3), "Pictures")
self.start_page = text.parse_int(match.group(2), 1)
self, match.group(1) or match.group(3), match.group(2))
self.url = "{}/pictures/user/{}".format(self.root, self.user)
def get_image_pages(self):
return self._pagination(self.artist_url)
def get_job_metadata(self):
page = self.request(self.url + "?enterAgree=1").text
count = text.extract(page, ">Pictures (", ")")[0]
return {"user": self.user, "count": text.parse_int(count)}
class HentaifoundryScrapsExtractor(HentaifoundryExtractor):
"""Extractor for scrap images of a hentai-foundry-user"""
subcategory = "scraps"
directory_fmt = ["{category}", "{artist}", "Scraps"]
directory_fmt = ["{category}", "{user}", "Scraps"]
pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
r"/pictures/user/([^/]+)/scraps(?:/(?:page/(\d+))?)?$"]
r"/pictures/user/([^/]+)/scraps(?:/page/(\d+))?"]
test = [
("https://www.hentai-foundry.com/pictures/user/Evulchibi/scraps", {
"url": "00a11e30b73ff2b00a1fba0014f08d49da0a68ec",
"keyword": "e294a72ab7be53a716eab92d8c97f82d6e76693c",
"keyword": "8c9a2ad4bf20247bcebb7aef3cfe7016f35da4a7",
}),
(("https://www.hentai-foundry.com"
"/pictures/user/Evulchibi/scraps/page/3"), None),
]
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match.group(1), "Scraps")
self.start_page = text.parse_int(match.group(2), 1)
HentaifoundryExtractor.__init__(self, match.group(1), match.group(2))
self.url = "{}/pictures/user/{}/scraps".format(self.root, self.user)
def get_image_pages(self):
return self._pagination(self.artist_url + "/scraps")
def get_job_metadata(self):
page = self.request(self.url + "?enterAgree=1").text
count = text.extract(page, ">Scraps (", ")")[0]
return {"user": self.user, "count": text.parse_int(count)}
class HentaifoundryImageExtractor(HentaifoundryExtractor):
@ -178,7 +177,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
(("https://www.hentai-foundry.com"
"/pictures/user/Tenpura/407501/shimakaze"), {
"url": "fbf2fd74906738094e2575d2728e8dc3de18a8a3",
"keyword": "e6ae60151ae3c17a22b3d61574ff5a883e577573",
"keyword": "aa64a4cfcd9c254ee143d9a3522195d11f8c1fb8",
"content": "91bf01497c39254b6dfb234a18e8f01629c77fd1",
}),
("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/", {
@ -193,8 +192,11 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
self.index = match.group(2)
def items(self):
url, data = self.get_image_metadata(
"{}/{}/?enterAgree=1".format(self.artist_url, self.index))
post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format(
self.root, self.user, self.index)
url, data = self.get_image_metadata(post_url)
data["user"] = self.user
yield Message.Version, 1
yield Message.Directory, data
yield Message.Url, url, data