[reddit] add ability to load more comments (#15)

The 'extractor.reddit.morecomments' option enables the use of
the '/api/morechildren' API endpoint (1) to load even more
comments than the usual submission-request provides.
Possible values are the booleans 'true' and 'false' (default).

Note: this feature comes at the cost of 1 extra API call towards
the rate limit for every 100 extra comments.

(1) https://www.reddit.com/dev/api/#GET_api_morechildren
This commit is contained in:
Mike Fährmann 2017-06-13 18:49:07 +02:00
parent 05ed95e5b0
commit 56bec79e6a
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

View File

@@ -108,6 +108,7 @@ class RedditAPI():
def __init__(self, extractor):
self.extractor = extractor
self.comments = extractor.config("comments", 500)
self.morecomments = extractor.config("morecomments", False)
self.refresh_token = extractor.config("refresh-token")
self.log = extractor.log
self.session = extractor.session
@@ -116,9 +117,10 @@ class RedditAPI():
def submission(self, submission_id):
    """Fetch the (submission, comments)-tuple for a submission id

    The first element is the data of the first child of the submission
    listing returned by the API; the second element is the iterable
    produced by self._flatten() for the comment listing.

    When the 'morecomments' option is enabled, the "t3_"-prefixed
    submission id is handed to self._flatten() as 'link_id';
    otherwise 'link_id' is None.
    """
    endpoint = "/comments/" + submission_id + "/.json"
    link_id = "t3_" + submission_id if self.morecomments else None
    submission, comments = self._call(endpoint, {"limit": self.comments})
    return (submission["data"]["children"][0]["data"],
            self._flatten(comments, link_id))
def submissions_subreddit(self, subreddit, params):
"""Collect all (submission, comments)-tuples of a subreddit"""
@@ -126,6 +128,24 @@ class RedditAPI():
params["limit"] = 100
return self._pagination(endpoint, params)
def morechildren(self, link_id, children):
"""Load additional comments from a submission"""
endpoint = "/api/morechildren"
params = {"link_id": link_id, "api_type": "json"}
index, done = 0, False
while not done:
if len(children) - index < 100:
done = True
params["children"] = ",".join(children[index:index + 100])
index += 100
data = self._call(endpoint, params)["json"]
for thing in data["data"]["things"]:
if thing["kind"] == "more":
children.extend(thing["data"]["children"])
else:
yield thing["data"]
def authenticate(self):
"""Authenticate the application by requesting an access token"""
access_token = self._authenticate_impl(self.refresh_token)
@@ -190,15 +210,18 @@ class RedditAPI():
return
params["after"] = data["after"]
def _flatten(self, comments, link_id=None):
    """Yield the data of every comment in a comment listing

    Comments are produced breadth-first: all entries of one level
    before any of their replies. "more" placeholders are never
    yielded. If 'link_id' is given, the ids they list are collected
    and, once the listing is exhausted, loaded through
    self.morechildren(); without 'link_id' they are skipped.

    comments -- listing dict with its entries in ["data"]["children"]
    link_id  -- value forwarded to self.morechildren(), or None to
                disable loading of additional comments
    """
    from collections import deque

    extra = []
    # Work on a copy in a deque: the listing passed in stays intact and
    # taking the next entry from the front is O(1).
    queue = deque(comments["data"]["children"])
    while queue:
        comment = queue.popleft()
        if comment["kind"] == "more":
            if link_id:
                extra.extend(comment["data"]["children"])
            continue
        comment = comment["data"]
        yield comment
        if comment["replies"]:
            queue.extend(comment["replies"]["data"]["children"])
    if link_id and extra:
        yield from self.morechildren(link_id, extra)