gallery-dl/gallery_dl/extractor/sankaku.py

# -*- coding: utf-8 -*-

# Copyright 2014-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://chan.sankakucomplex.com/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import time
import random


class SankakuTagExtractor(Extractor):
    """Extractor for images from chan.sankakucomplex.com by search-tags"""
    category = "sankaku"
    subcategory = "tag"
    directory_fmt = ["{category}", "{tags}"]
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    pattern = [r"(?:https?://)?chan\.sankakucomplex\.com"
               r"/\?(?:[^&#]*&)*tags=([^&#]+)"]
    test = [("https://chan.sankakucomplex.com/?tags=bonocho", {
        "count": 5,
        "pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
                    r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"),
    })]
    root = "https://chan.sankakucomplex.com"
    cookienames = ("login", "pass_hash")
    cookiedomain = "chan.sankakucomplex.com"

    def __init__(self, match):
        Extractor.__init__(self)
        self.logged_in = True
        self.pagestart = 1
        self.tags = text.unquote(match.group(1).replace("+", " "))
        self.wait_min = self.config("wait-min", 2)
        self.wait_max = self.config("wait-max", 4)
        if self.wait_max < self.wait_min:
            self.wait_max = self.wait_min

    def skip(self, num):
        pages = min(num // 20, 49)
        self.pagestart += pages
        return pages * 20

    def items(self):
        self.login()
        data = self.get_job_metadata()
        yield Message.Version, 1
        yield Message.Directory, data
        for image in self.get_images():
            image.update(data)
            yield Message.Url, image["file_url"], image

    def get_job_metadata(self):
        """Collect metadata for extractor-job"""
        return {"tags": self.tags}

    def get_images(self):
        """Yield all available images for the given tags"""
        params = {
            "tags": self.tags,
            "page": self.pagestart,
        }
        while self.logged_in or params["page"] <= 25:
            image = None
            page = self.request(self.root, params=params, retries=10).text
            pos = text.extract(page, '<div id=more-popular-posts-link>', '')[1]
            for image_id in text.extract_iter(
                    page, '<span class="thumb blacklisted" id=p', '>', pos):
                self.wait()
                image = self.get_image_metadata(image_id)
                yield image
            if not image:
                return
            params["page"] += 1
            params["next"] = image["id"] - 1
        self.log.warning(
            "Unauthenticated users may only access the first 500 images / 25 "
            "pages. (Use '--range 501-' to continue downloading from this "
            "point onwards after setting up an account.)")

    def get_image_metadata(self, image_id):
        """Collect metadata for a single image"""
        url = "https://chan.sankakucomplex.com/post/show/" + image_id
        page = self.request(url, retries=10).text
        file_url, pos = text.extract(page, '<li>Original: <a href="', '"')
        if file_url:
            width , pos = text.extract(page, '>', 'x', pos)
            height, pos = text.extract(page, '', ' ', pos)
        else:
            width , pos = text.extract(page, '<object width=', ' ', pos)
            height, pos = text.extract(page, 'height=', '>', pos)
            file_url = text.extract(page, '<embed src="', '"', pos)[0]
        data = text.nameext_from_url(file_url, {
            "id": util.safe_int(image_id),
            "file_url": "https:" + text.unescape(file_url),
            "width": util.safe_int(width),
            "height": util.safe_int(height),
        })
        data["md5"] = data["name"]
        return data

    def wait(self):
        """Wait for a randomly chosen amount of seconds"""
        time.sleep(random.uniform(self.wait_min, self.wait_max))

    def login(self):
        """Login and set necessary cookies"""
        if self._check_cookies(self.cookienames):
            return
        username, password = self._get_auth_info()
        if username:
            cookies = self._login_impl(username, password)
            for key, value in cookies.items():
                self.session.cookies.set(
                    key, value, domain=self.cookiedomain)
        else:
            self.logged_in = False

    @cache(maxage=90*24*60*60, keyarg=1)
    def _login_impl(self, username, password):
        """Actual login implementation"""
        self.log.info("Logging in as %s", username)
        params = {
            "url": "",
            "user[name]": username,
            "user[password]": password,
            "commit": "Login",
        }
        response = self.request(self.root + "/user/authenticate",
                                method="POST", params=params)
        if not response.history or response.url != self.root + "/user/home":
            raise exception.AuthenticationError()
        cookies = response.history[0].cookies
        return {c: cookies[c] for c in self.cookienames}
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`# -- coding: utf-8 --`
initial commit 2014-10-12 21:56:44 +02:00
share extractor and downloader sessions There was never any "good" reason for the strict separation between extractors and downloaders. This change allows for reduced resource usage (probably unnoticeable) and less lines of code at the "cost" of tighter coupling. 2017-06-30 19:38:14 +02:00			`# Copyright 2014-2017 Mike Fährmann`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extract images from https://chan.sankakucomplex.com/"""`

[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`from .common import Extractor, Message`
			`from .. import text, util, exception`
			`from ..cache import cache`
			`import time`
			`import random`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`class SankakuTagExtractor(Extractor):`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`"""Extractor for images from chan.sankakucomplex.com by search-tags"""`
update all other extractors 2015-11-21 04:26:30 +01:00			`category = "sankaku"`
consistent extractor naming scheme + docstrings 2016-09-12 10:20:57 +02:00			`subcategory = "tag"`
update all other extractors 2015-11-21 04:26:30 +01:00			`directory_fmt = ["{category}", "{tags}"]`
			`filename_fmt = "{category}_{id}_{md5}.{extension}"`
[sankaku] add warning for unauthenticated users also improve URL pattern and add missing options to default config file 2017-10-16 20:45:14 +02:00			`pattern = [r"(?:https?://)?chan\.sankakucomplex\.com"`
			`r"/\?(?:[^&#]&)tags=([^&#]+)"]`
add missing tests 2016-09-19 16:15:27 +02:00			`test = [("https://chan.sankakucomplex.com/?tags=bonocho", {`
[sankaku] unescape image URLs 2017-09-04 17:41:11 +02:00			`"count": 5,`
			`"pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"`
			`r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"),`
add missing tests 2016-09-19 16:15:27 +02:00			`})]`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`root = "https://chan.sankakucomplex.com"`
			`cookienames = ("login", "pass_hash")`
			`cookiedomain = "chan.sankakucomplex.com"`
initial commit 2014-10-12 21:56:44 +02:00
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`def __init__(self, match):`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`Extractor.__init__(self)`
			`self.logged_in = True`
			`self.pagestart = 1`
			`self.tags = text.unquote(match.group(1).replace("+", " "))`
			`self.wait_min = self.config("wait-min", 2)`
			`self.wait_max = self.config("wait-max", 4)`
			`if self.wait_max < self.wait_min:`
			`self.wait_max = self.wait_min`
initial commit 2014-10-12 21:56:44 +02:00
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`def skip(self, num):`
			`pages = min(num // 20, 49)`
			`self.pagestart += pages`
			`return pages * 20`

[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`def items(self):`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`self.login()`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`data = self.get_job_metadata()`
[sankaku] always use correct file-url 2015-11-10 00:55:01 +01:00			`yield Message.Version, 1`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`yield Message.Directory, data`
			`for image in self.get_images():`
[sankaku] always use correct file-url 2015-11-10 00:55:01 +01:00			`image.update(data)`
change keyword names to valid Python identifiers This commit mostly replaces all minus-signs ('-') in keyword names with underscores ('_') to allow them to be used in filter-expressions. For example 'gallery-id' got renamed to 'gallery_id'. (It is theoretically possible to access any variable, regardless of its name, with 'locals()["NAME"]', but that seems a bit too convoluted if just 'NAME' could be enough) 2017-09-10 22:20:47 +02:00			`yield Message.Url, image["file_url"], image`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00
			`def get_job_metadata(self):`
			`"""Collect metadata for extractor-job"""`
remove explicit (sub)category keywords 2016-09-25 14:22:07 +02:00			`return {"tags": self.tags}`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00
			`def get_images(self):`
[sankaku] add warning for unauthenticated users also improve URL pattern and add missing options to default config file 2017-10-16 20:45:14 +02:00			`"""Yield all available images for the given tags"""`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`params = {`
			`"tags": self.tags,`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`"page": self.pagestart,`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`}`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`while self.logged_in or params["page"] <= 25:`
			`image = None`
			`page = self.request(self.root, params=params, retries=10).text`
[sankaku] always use correct file-url 2015-11-10 00:55:01 +01:00			`pos = text.extract(page, '<div id=more-popular-posts-link>', '')[1]`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`for image_id in text.extract_iter(`
			`page, '<span class="thumb blacklisted" id=p', '>', pos):`
			`self.wait()`
[sankaku] always use correct file-url 2015-11-10 00:55:01 +01:00			`image = self.get_image_metadata(image_id)`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`yield image`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`if not image:`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`return`
initial commit 2014-10-12 21:56:44 +02:00			`params["page"] += 1`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`params["next"] = image["id"] - 1`
[sankaku] add warning for unauthenticated users also improve URL pattern and add missing options to default config file 2017-10-16 20:45:14 +02:00			`self.log.warning(`
			`"Unauthenticated users may only access the first 500 images / 25 "`
			`"pages. (Use '--range 501-' to continue downloading from this "`
			`"point onwards after setting up an account.)")`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00
[sankaku] always use correct file-url 2015-11-10 00:55:01 +01:00			`def get_image_metadata(self, image_id):`
[sankaku] add warning for unauthenticated users also improve URL pattern and add missing options to default config file 2017-10-16 20:45:14 +02:00			`"""Collect metadata for a single image"""`
[sankaku] always use correct file-url 2015-11-10 00:55:01 +01:00			`url = "https://chan.sankakucomplex.com/post/show/" + image_id`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`page = self.request(url, retries=10).text`
[sankaku] fix swf extraction (closes #52) 2017-12-07 15:45:43 +01:00			`file_url, pos = text.extract(page, '<li>Original: <a href="', '"')`
			`if file_url:`
			`width , pos = text.extract(page, '>', 'x', pos)`
			`height, pos = text.extract(page, '', ' ', pos)`
			`else:`
			`width , pos = text.extract(page, '<object width=', ' ', pos)`
			`height, pos = text.extract(page, 'height=', '>', pos)`
			`file_url = text.extract(page, '<embed src="', '"', pos)[0]`
			`data = text.nameext_from_url(file_url, {`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`"id": util.safe_int(image_id),`
[sankaku] fix swf extraction (closes #52) 2017-12-07 15:45:43 +01:00			`"file_url": "https:" + text.unescape(file_url),`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00			`"width": util.safe_int(width),`
			`"height": util.safe_int(height),`
code cleanup to use nameext_from_url 2015-11-16 17:32:26 +01:00			`})`
			`data["md5"] = data["name"]`
			`return data`
[sankaku] rewrite/improve (fixes #44) - add wait-time between HTTP requests similar to exhentai - add 'wait-min' and 'wait-max' options - increase retry-count for HTTP requests to 10 - implement user authentication (non-authenticated users can only view images up to page 25) - implement 'skip()' functionality (only works up to page 50) - implement image-retrieval for pages >= 51 - fix issue with multiple tags 2017-10-14 23:01:33 +02:00
			`def wait(self):`
			`"""Wait for a randomly chosen amount of seconds"""`
			`time.sleep(random.uniform(self.wait_min, self.wait_max))`

			`def login(self):`
			`"""Login and set necessary cookies"""`
			`if self._check_cookies(self.cookienames):`
			`return`
			`username, password = self._get_auth_info()`
			`if username:`
			`cookies = self._login_impl(username, password)`
			`for key, value in cookies.items():`
			`self.session.cookies.set(`
			`key, value, domain=self.cookiedomain)`
			`else:`
			`self.logged_in = False`

			`@cache(maxage=902460*60, keyarg=1)`
			`def _login_impl(self, username, password):`
			`"""Actual login implementation"""`
			`self.log.info("Logging in as %s", username)`
			`params = {`
			`"url": "",`
			`"user[name]": username,`
			`"user[password]": password,`
			`"commit": "Login",`
			`}`
			`response = self.request(self.root + "/user/authenticate",`
			`method="POST", params=params)`
			`if not response.history or response.url != self.root + "/user/home":`
			`raise exception.AuthenticationError()`
[sankaku] add warning for unauthenticated users also improve URL pattern and add missing options to default config file 2017-10-16 20:45:14 +02:00			`cookies = response.history[0].cookies`
			`return {c: cookies[c] for c in self.cookienames}`