# -*- coding: utf-8 -*-

# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from subreddits at https://reddit.com/"""

from .common import Extractor, Message
from .. import text, extractor, exception
from ..cache import cache
import re


class RedditExtractor(Extractor):
    """Base class for reddit extractors"""
    category = "reddit"

    def __init__(self):
        Extractor.__init__(self)
        self.api = RedditAPI(self)
        self.max_depth = int(self.config("recursion", 0))
        self._visited = set()

    def items(self):
        subre = re.compile(RedditSubmissionExtractor.pattern[0])
        submissions = self.submissions()
        depth = 0
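
        # Hand every non-submission link to other extractors via
        # Message.Queue; links to other reddit submissions are collected
        # in 'extra' and fetched in the next round, up to "recursion"
        # levels deep.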
        yield Message.Version, 1
        with extractor.blacklist("reddit"):
            while True:
                extra = []
                for url in self._urls(submissions):
                    if url[0] == "#":
                        continue
                    if url[0] == "/":
                        url = "https://www.reddit.com" + url

                    match = subre.match(url)
                    if match:
                        extra.append(match.group(1))
                    else:
                        yield Message.Queue, url

                if not extra or depth == self.max_depth:
                    return
                depth += 1
                submissions = (
                    self.api.submission(sid) for sid in extra
                    if sid not in self._visited
                )

    def submissions(self):
        """Return an iterable containing all (submission, comments) tuples"""

    def _urls(self, submissions):
        for submission, comments in submissions:
            self._visited.add(submission["id"])
            if not submission["is_self"]:
                yield submission["url"]
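            # also scan the submission's self-text and all comment
            # bodies for links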
            strings = [submission["selftext_html"] or ""]
            strings += [c["body_html"] or "" for c in comments]
            yield from text.extract_iter("".join(strings), ' href="', '"')


class RedditSubredditExtractor(RedditExtractor):
    """Extractor for images from subreddits on reddit.com"""
    subcategory = "subreddit"
    pattern = [r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/([^/?&#]+)"
               r"(/[a-z]+)?/?"
               r"(?:\?.*?(?:\bt=([a-z]+))?)?$"]

    def __init__(self, match):
        RedditExtractor.__init__(self)
        self.subreddit, self.order, self.timeframe = match.groups()

    def submissions(self):
        subreddit = self.subreddit + (self.order or "")
        params = {"t": self.timeframe} if self.timeframe else {}
        return self.api.submissions_subreddit(subreddit, params)


class RedditSubmissionExtractor(RedditExtractor):
    """Extractor for images from a submission on reddit.com"""
    subcategory = "submission"
    pattern = [(r"(?:https?://)?(?:"
                r"(?:m\.|www\.)?reddit\.com/r/[^/]+/comments|"
                r"redd\.it"
                r")/([a-z0-9]+)")]

    def __init__(self, match):
        RedditExtractor.__init__(self)
        self.submission_id = match.group(1)

    def submissions(self):
        return (self.api.submission(self.submission_id),)


class RedditAPI():
    """Minimal interface for the reddit API"""
    def __init__(self, extractor, client_id="6N9uN0krSDE-ig"):
        self.extractor = extractor
        self.client_id = extractor.config("client-id", client_id)
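        # number of comments to request per submission
        # (passed as the 'limit' parameter of the comments endpoint)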
        self.comments = extractor.config("comments", 500)
        self.session = extractor.session
        self.session.headers["User-Agent"] = ("Python:gallery-dl:0.8.4"
                                              " (by /u/mikf1)")

    def submission(self, submission_id):
        """Fetch the (submission, comments)-tuple for a submission id"""
        endpoint = "/comments/" + submission_id + "/.json"
        submission, comments = self._call(endpoint, {"limit": self.comments})
        return (submission["data"]["children"][0]["data"],
                self._unfold(comments))

    def submissions_subreddit(self, subreddit, params):
        """Collect all (submission, comments)-tuples of a subreddit"""
        endpoint = "/r/" + subreddit + "/.json"
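        # request the largest page size per call
        # (reddit's listing endpoints cap 'limit' at 100)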
        params["limit"] = 100
        return self._pagination(endpoint, params)

    def authenticate(self):
        """Authenticate the application by requesting an access token"""
        access_token = self._authenticate_impl(self.client_id)
        self.session.headers["Authorization"] = access_token
    @cache(maxage=3600, keyarg=1)
    def _authenticate_impl(self, client_id):
        """Actual authenticate implementation"""
        url = "https://www.reddit.com/api/v1/access_token"
        data = {
            "grant_type": "https://oauth.reddit.com/grants/installed_client",
            "device_id": "DO_NOT_TRACK_THIS_DEVICE",
        }
        response = self.session.post(url, data=data, auth=(client_id, ""))
        if response.status_code != 200:
            raise exception.AuthenticationError()
        return "Bearer " + response.json()["access_token"]

    def _call(self, endpoint, params):
        url = "https://oauth.reddit.com" + endpoint
        params["raw_json"] = 1
        self.authenticate()
        data = self.session.get(url, params=params).json()
        if "error" in data:
            if data["error"] == 403:
                raise exception.AuthorizationError()
            if data["error"] == 404:
                raise exception.NotFoundError()
            raise Exception(data["message"])
        return data

    def _pagination(self, endpoint, params, _empty=()):
        date_min = int(self.extractor.config("date-min", 0))
        date_max = int(self.extractor.config("date-max", 253402210800))
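        # 253402210800 is a timestamp in the year 9999,
        # i.e. effectively "no upper bound"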

        while True:
            data = self._call(endpoint, params)["data"]
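
            # yield every submission inside the configured date range;
            # fetch its comments only if it has any and a comment limit
            # is set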
            for submission in data["children"]:
                submission = submission["data"]
                if date_min <= submission["created_utc"] <= date_max:
                    if submission["num_comments"] and self.comments:
                        try:
                            yield self.submission(submission["id"])
                        except exception.AuthorizationError:
                            pass
                    else:
                        yield submission, _empty
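
            # continue with the next page via reddit's 'after' cursor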
            if not data["after"]:
                return
            params["after"] = data["after"]

    @staticmethod
    def _unfold(comments):
        # TODO: order?
        queue = comments["data"]["children"]
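        # iteratively flatten the comment tree by using the list as a
        # stack; "more" objects (placeholders for not-yet-loaded comment
        # batches) are skipped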
        while queue:
            comment = queue.pop()
            if comment["kind"] == "more":
                continue
            comment = comment["data"]
            yield comment
            if comment["replies"]:
                queue += comment["replies"]["data"]["children"]