From 1c95a0173f58d562b88b52eb129924e6b187b446 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 22 Sep 2018 17:38:09 +0200 Subject: [PATCH] [hentaifoundry] split 'artist' into 'user'+'artist' and some smaller changes ... 'user' is the name of the account an image is listed at and 'artist' is now the name of the account who created the image. For example "https://www.hentai-foundry.com/user/Tenpura/faves/pictures" - 'user': Tenpura - 'artist' of the only image: LewdBrush --- gallery_dl/extractor/hentaifoundry.py | 86 ++++++++++++++------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index c6db764f..61aa7bdd 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -15,19 +15,18 @@ from .. import text, util, exception class HentaifoundryExtractor(Extractor): """Base class for hentaifoundry extractors""" category = "hentaifoundry" - directory_fmt = ["{category}", "{artist}"] + directory_fmt = ["{category}", "{user}"] filename_fmt = "{category}_{index}_{title}.{extension}" archive_fmt = "{index}" root = "https://www.hentai-foundry.com" per_page = 25 - def __init__(self, artist, needle=""): + def __init__(self, user="", page=1): Extractor.__init__(self) - self.artist = artist - self.needle = needle - self.artist_url = "{}/pictures/user/{}".format(self.root, artist) - self.start_page = 1 + self.url = "" + self.user = user self.start_post = 0 + self.start_page = text.parse_int(page, 1) def items(self): data = self.get_job_metadata() @@ -48,18 +47,25 @@ class HentaifoundryExtractor(Extractor): def get_job_metadata(self): """Collect metadata for extractor-job""" - page = self.request(self.artist_url + "?enterAgree=1").text - needle = ' >{} ('.format(self.needle) - count = text.parse_int(text.extract(page, needle, ')')[0]) - return {"artist": self.artist, "count": count} + self.request(self.root + "/?enterAgree=1") + return {"user": self.user} def get_image_pages(self): - """Yield urls all image pages of one artist""" + """Yield urls of all relevant image pages""" + num = self.start_page - def get_image_metadata(self, url): - """Collect metadata for an image""" - page = self.request(text.urljoin(self.root, url)).text - index = url.rsplit("/", 2)[1] + while True: + page = self.request("{}/page/{}".format(self.url, num)).text + yield from text.extract_iter(page, 'thumbTitle">', '') width , pos = text.extract(page, 'width="', '"', pos) height, pos = text.extract(page, 'height="', '"', pos) @@ -110,28 +116,17 @@ class HentaifoundryExtractor(Extractor): url = self.root + "/site/filters" self.request(url, method="POST", data=data) - def _pagination(self, url): - num = self.start_page - - while True: - page = self.request("{}/page/{}".format(url, num)).text - yield from text.extract_iter(page, 'thumbTitle">Pictures (", ")")[0] + return {"user": self.user, "count": text.parse_int(count)} class HentaifoundryScrapsExtractor(HentaifoundryExtractor): """Extractor for scrap images of a hentai-foundry-user""" subcategory = "scraps" - directory_fmt = ["{category}", "{artist}", "Scraps"] + directory_fmt = ["{category}", "{user}", "Scraps"] pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com" - r"/pictures/user/([^/]+)/scraps(?:/(?:page/(\d+))?)?$"] + r"/pictures/user/([^/]+)/scraps(?:/page/(\d+))?"] test = [ ("https://www.hentai-foundry.com/pictures/user/Evulchibi/scraps", { "url": "00a11e30b73ff2b00a1fba0014f08d49da0a68ec", - "keyword": "e294a72ab7be53a716eab92d8c97f82d6e76693c", + "keyword": "8c9a2ad4bf20247bcebb7aef3cfe7016f35da4a7", }), (("https://www.hentai-foundry.com" "/pictures/user/Evulchibi/scraps/page/3"), None), ] def __init__(self, match): - HentaifoundryExtractor.__init__(self, match.group(1), "Scraps") - self.start_page = text.parse_int(match.group(2), 1) + HentaifoundryExtractor.__init__(self, match.group(1), match.group(2)) + self.url = "{}/pictures/user/{}/scraps".format(self.root, self.user) - def get_image_pages(self): - return self._pagination(self.artist_url + "/scraps") + def get_job_metadata(self): + page = self.request(self.url + "?enterAgree=1").text + count = text.extract(page, ">Scraps (", ")")[0] + return {"user": self.user, "count": text.parse_int(count)} class HentaifoundryImageExtractor(HentaifoundryExtractor): @@ -178,7 +177,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): (("https://www.hentai-foundry.com" "/pictures/user/Tenpura/407501/shimakaze"), { "url": "fbf2fd74906738094e2575d2728e8dc3de18a8a3", - "keyword": "e6ae60151ae3c17a22b3d61574ff5a883e577573", + "keyword": "aa64a4cfcd9c254ee143d9a3522195d11f8c1fb8", "content": "91bf01497c39254b6dfb234a18e8f01629c77fd1", }), ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/", { @@ -193,8 +192,11 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): self.index = match.group(2) def items(self): - url, data = self.get_image_metadata( - "{}/{}/?enterAgree=1".format(self.artist_url, self.index)) + post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format( + self.root, self.user, self.index) + url, data = self.get_image_metadata(post_url) + data["user"] = self.user + yield Message.Version, 1 yield Message.Directory, data yield Message.Url, url, data