53cc498d9c
This specifically applies to all Mastodon extractors and all extractors with a 'basecategory', e.g. 'booru', 'foolslide', etc. Values inside those general config locations wouldn't be recognized when a value with the same name was set on the 'extractor' level. For example, 'extractor.mastodon.directory' should be used over 'extractor.directory' when both are set, but this was impossible with the previous implementation. (fixes #843)
242 lines
8.1 KiB
Python
242 lines
8.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2019-2020 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Extractors for mastodon instances"""
|
|
|
|
from .common import Extractor, Message
|
|
from .. import text, util, config, exception
|
|
import re
|
|
|
|
|
|
class MastodonExtractor(Extractor):
    """Common base shared by every mastodon extractor"""
    basecategory = "mastodon"
    directory_fmt = ("mastodon", "{instance}", "{account[username]}")
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"
    cookiedomain = None
    instance = None
    root = None

    def __init__(self, match):
        super().__init__(match)
        self.api = MastodonAPI(self)

    def config(self, key, default=None):
        """Return the config value for 'key', or 'default'

        Two locations below "extractor" are handed to
        config.interpolate_common(): the (category, subcategory) path and
        the (basecategory, instance, subcategory) path.
        """
        lookup_paths = (
            (self.category, self.subcategory),
            (self.basecategory, self.instance, self.subcategory),
        )
        return config.interpolate_common(
            ("extractor",), lookup_paths, key, default)

    def items(self):
        """Yield a Version message, then Directory/Url messages per status"""
        yield Message.Version, 1

        for status in self.statuses():
            attachments = status["media_attachments"]
            if not attachments:
                # statuses without media produce no messages at all
                continue

            self.prepare(status)
            yield Message.Directory, status

            for media in attachments:
                # the same status dict is reused for every attachment
                status["media"] = media
                media_url = media["url"]
                yield (Message.Url, media_url,
                       text.nameext_from_url(media_url, status))

    def statuses(self):
        """Return an iterable of all relevant Status-objects

        The base implementation returns an empty tuple.
        """
        return ()

    def prepare(self, status):
        """Adjust a status object in place before it is emitted"""
        del status["media_attachments"]
        status["instance"] = self.instance

        # reduce tag objects to just their names
        tag_names = [entry["name"] for entry in status["tags"]]
        status["tags"] = tag_names

        # only the first 19 characters (up to the seconds) are parsed
        timestamp = status["created_at"][:19]
        status["date"] = text.parse_datetime(timestamp, "%Y-%m-%dT%H:%M:%S")
|
|
|
|
|
|
class MastodonUserExtractor(MastodonExtractor):
    """Extractor for all images of an account/user"""
    subcategory = "user"

    def __init__(self, match):
        super().__init__(match)
        self.account_name = match.group(1)

    def statuses(self):
        """Return the statuses of the account named in the URL

        Raises exception.NotFoundError when the account search yields no
        entry whose username equals the requested name.
        """
        wanted = self.account_name
        handle = "@{}@{}".format(wanted, self.instance)

        candidates = (
            entry
            for entry in self.api.account_search(handle, 1)
            if entry["username"] == wanted
        )
        account = next(candidates, None)
        if account is None:
            raise exception.NotFoundError("account")

        return self.api.account_statuses(account["id"])
|
|
|
|
|
|
class MastodonStatusExtractor(MastodonExtractor):
    """Extractor for images from a status"""
    subcategory = "status"

    def __init__(self, match):
        super().__init__(match)
        self.status_id = match.group(1)

    def statuses(self):
        """Return a 1-tuple holding the status identified by the URL"""
        status = self.api.status(self.status_id)
        return (status,)
|
|
|
|
|
|
class MastodonAPI():
    """Minimal interface for the Mastodon API

    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """

    def __init__(self, extractor, access_token=None):
        """Bind the API to 'extractor' and prepare the auth header

        extractor:    object providing 'root', 'access_token', 'config()',
                      'request()' and 'wait()'
        access_token: explicit bearer token; when empty, the extractor's
                      "access-token" config value is used, falling back to
                      its 'access_token' attribute
        """
        self.root = extractor.root
        self.extractor = extractor

        if not access_token:
            access_token = extractor.config(
                "access-token", extractor.access_token)
        self.headers = {"Authorization": "Bearer {}".format(access_token)}

    def account_search(self, query, limit=40):
        """Search for accounts matching 'query'; return the decoded JSON"""
        params = {"q": query, "limit": limit}
        return self._call("accounts/search", params).json()

    def account_statuses(self, account_id):
        """Return a generator over an account's statuses (only_media=1)"""
        endpoint = "accounts/{}/statuses".format(account_id)
        params = {"only_media": "1"}
        return self._pagination(endpoint, params)

    def status(self, status_id):
        """Fetch a single Status and return the decoded JSON"""
        return self._call("statuses/" + status_id).json()

    def _call(self, endpoint, params=None):
        """Send an API request and return the response object

        endpoint: either a full URL (anything starting with "http") or a
                  path relative to '<root>/api/v1/'
        params:   optional query parameters

        Raises exception.NotFoundError on HTTP 404 and
        exception.StopExtraction on any other status >= 400 except 429.
        """
        if endpoint.startswith("http"):
            url = endpoint
        else:
            url = "{}/api/v1/{}".format(self.root, endpoint)

        while True:
            response = self.extractor.request(
                url, params=params, headers=self.headers, fatal=None)
            code = response.status_code

            if code < 400:
                return response
            if code == 404:
                raise exception.NotFoundError()
            if code == 429:
                # rate limited: wait until the time announced by the
                # 'x-ratelimit-reset' header, then retry the same request
                self.extractor.wait(until=text.parse_datetime(
                    response.headers["x-ratelimit-reset"],
                    "%Y-%m-%dT%H:%M:%S.%fZ",
                ))
                continue
            raise exception.StopExtraction(response.json().get("error"))

    def _pagination(self, endpoint, params):
        """Yield all results of 'endpoint', following "next" links

        'params' is sent with the first request only.
        """
        url = "{}/api/v1/{}".format(self.root, endpoint)
        while url:
            response = self._call(url, params)
            yield from response.json()

            url = response.links.get("next")
            if not url:
                return
            url = url["url"]
            # The "next" URL is requested verbatim; presumably it already
            # carries the query for the following page, so the initial
            # parameters must not be appended to it a second time.
            params = None
|
|
|
|
|
|
def generate_extractors():
    """Dynamically generate Extractor classes for Mastodon instances

    User-supplied entries from the ("extractor",) "mastodon" config value
    are combined into EXTRACTORS, which is then written back to the config.
    For every dict-valued entry, one user and one status extractor class
    is created and stored in this module's globals.
    """
    namespace = globals()

    user_entries = config.get(("extractor",), "mastodon")
    if user_entries:
        util.combine_dict(EXTRACTORS, user_entries)
    config.set(("extractor",), "mastodon", EXTRACTORS)

    # (base class, class name suffix, docstring prefix,
    #  URL pattern tail, key of the test data inside 'info')
    variants = (
        (MastodonUserExtractor, "UserExtractor",
         "Extractor for all images of a user on ",
         r"/@([^/?&#]+)(?:/media)?/?$", "test-user"),
        (MastodonStatusExtractor, "StatusExtractor",
         "Extractor for images from a status on ",
         r"/@[^/?&#]+/(\d+)", "test-status"),
    )

    for instance, info in EXTRACTORS.items():
        if not isinstance(info, dict):
            continue

        category = info.get("category") or instance.replace(".", "")
        root = info.get("root") or "https://" + instance
        name = (info.get("name") or category).capitalize()
        token = info.get("access-token")
        pattern = info.get("pattern") or re.escape(instance)

        for base, suffix, doc_prefix, tail, test_key in variants:
            # create an empty subclass first and fill in its attributes
            # afterwards
            cls = type(name + suffix, (base,), {})
            cls.__doc__ = doc_prefix + instance
            cls.category = category
            cls.instance = instance
            cls.pattern = r"(?:https?://)?" + pattern + tail
            cls.test = info.get(test_key)
            cls.root = root
            cls.access_token = token
            namespace[cls.__name__] = cls
|
|
|
|
|
|
# Built-in Mastodon instances, keyed by instance domain.
# generate_extractors() reads the "category", "root", "name",
# "access-token", "pattern", "test-user" and "test-status" keys.
EXTRACTORS = {
    "mastodon.social": {
        "category": "mastodon.social",
        "access-token": "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48",
        "client-id": "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo",
        "client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI",
        "test-user": (
            "https://mastodon.social/@jk",
            {
                "pattern": (
                    r"https://files.mastodon.social/media_attachments"
                    r"/files/\d+/\d+/\d+/original/\w+"
                ),
                "range": "1-60",
                "count": 60,
            },
        ),
        "test-status": (
            "https://mastodon.social/@jk/103794036899778366",
            {
                "count": 4,
            },
        ),
    },
    "pawoo.net": {
        "category": "pawoo",
        "access-token": (
            "c12c9d275050bce0dc92169a28db09d7"
            "0d62d0a75a8525953098c167eacd3668"
        ),
        "client-id": (
            "978a25f843ec01e53d09be2c290cd75c"
            "782bc3b7fdbd7ea4164b9f3c3780c8ff"
        ),
        "client-secret": (
            "9208e3d4a7997032cf4f1b0e12e5df38"
            "8428ef1fadb446dcfeb4f5ed6872d97b"
        ),
    },
    "baraag.net": {
        "category": "baraag",
        "access-token": "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0",
        "client-id": "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
        "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
    },
}
|
|
|
|
|
|
# Runs at import time: stores the generated extractor classes in this
# module's globals. Must stay below the EXTRACTORS definition it reads.
generate_extractors()
|