Mike Fährmann 53cc498d9c
improve config lookup when there are multiple possible locations
This specifically applies to all Mastodon extractors and all
extractors with a 'basecategory', i.e. 'booru', 'foolslide', etc.

Values inside those general config locations wouldn't be recognized
when a value with the same name was set on the 'extractor' level.

For example 'extractor.mastodon.directory' should be used over
'extractor.directory' when both are set, but this was impossible
with the previous implementation.

(fixes #843)
2020-06-21 00:07:10 +02:00

242 lines
8.1 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for mastodon instances"""
from .common import Extractor, Message
from .. import text, util, config, exception
import re
class MastodonExtractor(Extractor):
    """Common base for all mastodon extractors"""
    basecategory = "mastodon"
    directory_fmt = ("mastodon", "{instance}", "{account[username]}")
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"
    cookiedomain = None
    # 'instance' and 'root' are filled in by the generated subclasses
    instance = None
    root = None

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.api = MastodonAPI(self)

    def config(self, key, default=None):
        """Look up 'key' in the config locations relevant to this extractor

        Two candidate paths below 'extractor' are handed to
        config.interpolate_common(): the category-specific one and the
        general 'mastodon' one (basecategory -> instance -> subcategory).
        """
        locations = (
            (self.category, self.subcategory),
            (self.basecategory, self.instance, self.subcategory),
        )
        return config.interpolate_common(
            ("extractor",), locations, key, default)

    def items(self):
        """Yield a Url message for every media attachment of every status"""
        yield Message.Version, 1

        for status in self.statuses():
            media_list = status["media_attachments"]
            if not media_list:
                # nothing to download for this status
                continue

            self.prepare(status)
            yield Message.Directory, status

            for media in media_list:
                # the same status dict is reused as metadata for each file
                status["media"] = media
                media_url = media["url"]
                metadata = text.nameext_from_url(media_url, status)
                yield Message.Url, media_url, metadata

    def statuses(self):
        """Return an iterable containing all relevant Status-objects"""
        return ()

    def prepare(self, status):
        """Adjust a status object in-place before it is used as metadata"""
        status.pop("media_attachments")
        status["instance"] = self.instance
        status["tags"] = [tag["name"] for tag in status["tags"]]

        # only the first 19 characters ("YYYY-MM-DDTHH:MM:SS") are parsed
        timestamp = status["created_at"][:19]
        status["date"] = text.parse_datetime(timestamp, "%Y-%m-%dT%H:%M:%S")
class MastodonUserExtractor(MastodonExtractor):
    """Extractor for all images of an account/user"""
    subcategory = "user"

    def __init__(self, match):
        MastodonExtractor.__init__(self, match)
        self.account_name = match.group(1)

    def statuses(self):
        """Resolve the account by name and return its statuses

        Raises exception.NotFoundError if the account search yields no
        entry whose username equals the requested name.
        """
        handle = "@{}@{}".format(self.account_name, self.instance)
        wanted = self.account_name

        # pick the first search result with an exactly matching username
        account = next(
            (candidate
             for candidate in self.api.account_search(handle, 1)
             if candidate["username"] == wanted),
            None,
        )
        if account is None:
            raise exception.NotFoundError("account")

        return self.api.account_statuses(account["id"])
class MastodonStatusExtractor(MastodonExtractor):
    """Extractor for images from a status"""
    subcategory = "status"

    def __init__(self, match):
        MastodonExtractor.__init__(self, match)
        # first capture group of the URL pattern
        self.status_id = match.group(1)

    def statuses(self):
        """Return a 1-tuple holding the requested status object"""
        status = self.api.status(self.status_id)
        return (status,)
class MastodonAPI():
    """Minimal interface for the Mastodon API

    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """

    def __init__(self, extractor, access_token=None):
        """Set up API access for 'extractor'

        extractor:    object providing 'root', 'config()', 'request()'
                      and 'wait()'; all HTTP requests go through it
        access_token: optional token; if empty, the 'access-token' config
                      value or the extractor's own 'access_token'
                      attribute (if it has one) is used instead
        """
        self.root = extractor.root
        self.extractor = extractor

        if not access_token:
            # not every extractor class defines 'access_token'
            access_token = extractor.config(
                "access-token", getattr(extractor, "access_token", None))

        # Do not send a bogus "Bearer None" header without a token
        if access_token:
            self.headers = {
                "Authorization": "Bearer {}".format(access_token)}
        else:
            self.headers = {}

    def account_search(self, query, limit=40):
        """Search for accounts matching 'query'; returns the decoded JSON"""
        params = {"q": query, "limit": limit}
        return self._call("accounts/search", params).json()

    def account_statuses(self, account_id):
        """Get an account's statuses (media-only), following pagination"""
        endpoint = "accounts/{}/statuses".format(account_id)
        params = {"only_media": "1"}
        return self._pagination(endpoint, params)

    def status(self, status_id):
        """Fetch a Status"""
        return self._call("statuses/" + status_id).json()

    def _call(self, endpoint, params=None):
        """Send a request and return its response object

        'endpoint' is either an absolute URL or a path below '/api/v1/'.

        Raises exception.NotFoundError on HTTP 404 and
        exception.StopExtraction (with the response's 'error' value)
        on any other status code >= 400 except 429.
        """
        if endpoint.startswith("http"):
            url = endpoint
        else:
            url = "{}/api/v1/{}".format(self.root, endpoint)

        while True:
            response = self.extractor.request(
                url, params=params, headers=self.headers, fatal=None)
            code = response.status_code

            if code < 400:
                return response
            if code == 404:
                raise exception.NotFoundError()
            if code == 429:
                # rate limited: wait until the announced reset time
                # and then retry the same request
                self.extractor.wait(until=text.parse_datetime(
                    response.headers["x-ratelimit-reset"],
                    "%Y-%m-%dT%H:%M:%S.%fZ",
                ))
                continue
            raise exception.StopExtraction(response.json().get("error"))

    def _pagination(self, endpoint, params):
        """Yield all items of 'endpoint' by following 'next' links"""
        url = "{}/api/v1/{}".format(self.root, endpoint)

        while url:
            response = self._call(url, params)
            yield from response.json()

            # The 'next' URL already contains its complete query string;
            # re-sending 'params' would duplicate those arguments.
            params = None

            next_link = response.links.get("next")
            url = next_link["url"] if next_link else None
def generate_extractors():
    """Create and register Extractor classes for all Mastodon instances

    Config values below 'extractor.mastodon' are combined with the
    built-in EXTRACTORS table first; afterwards one user- and one
    status-extractor class per instance is stored in this module's
    global namespace.
    """
    namespace = globals()

    user_defined = config.get(("extractor",), "mastodon")
    if user_defined:
        util.combine_dict(EXTRACTORS, user_defined)
    config.set(("extractor",), "mastodon", EXTRACTORS)

    # (base class, class name suffix, docstring prefix,
    #  URL path pattern, key of the test data)
    variants = (
        (MastodonUserExtractor, "UserExtractor",
         "Extractor for all images of a user on ",
         r"/@([^/?&#]+)(?:/media)?/?$", "test-user"),
        (MastodonStatusExtractor, "StatusExtractor",
         "Extractor for images from a status on ",
         r"/@[^/?&#]+/(\d+)", "test-status"),
    )

    for instance, info in EXTRACTORS.items():
        # skip everything that is not an instance definition
        if not isinstance(info, dict):
            continue

        category = info.get("category") or instance.replace(".", "")
        root = info.get("root") or "https://" + instance
        name = (info.get("name") or category).capitalize()
        token = info.get("access-token")
        pattern = info.get("pattern") or re.escape(instance)

        for base, suffix, doc, path, test_key in variants:

            class Extr(base):
                pass

            Extr.__name__ = Extr.__qualname__ = name + suffix
            Extr.__doc__ = doc + instance
            Extr.category = category
            Extr.instance = instance
            Extr.pattern = r"(?:https?://)?" + pattern + path
            Extr.test = info.get(test_key)
            Extr.root = root
            Extr.access_token = token
            namespace[Extr.__name__] = Extr
# Built-in Mastodon instances, keyed by instance host name.
#
# generate_extractors() reads the following keys from each entry:
#   "category", "root", "name", "pattern", "access-token",
#   "test-user" and "test-status"
# Absent keys fall back to a value derived from the host name
# ("category", "root", "name", "pattern") or to None.
#
# "client-id" and "client-secret" are not read anywhere in this module
# (presumably used by an OAuth helper elsewhere -- TODO confirm).
EXTRACTORS = {
"mastodon.social": {
"category" : "mastodon.social",
"access-token" : "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48",
"client-id" : "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo",
"client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI",
# test data; stored as the 'test' attribute of the generated classes
"test-user" : ("https://mastodon.social/@jk", {
"pattern": r"https://files.mastodon.social/media_attachments"
r"/files/\d+/\d+/\d+/original/\w+",
"range": "1-60",
"count": 60,
}),
"test-status" : ("https://mastodon.social/@jk/103794036899778366", {
"count": 4,
}),
},
"pawoo.net": {
"category" : "pawoo",
# long values are split into adjacent string literals
"access-token" : "c12c9d275050bce0dc92169a28db09d7"
"0d62d0a75a8525953098c167eacd3668",
"client-id" : "978a25f843ec01e53d09be2c290cd75c"
"782bc3b7fdbd7ea4164b9f3c3780c8ff",
"client-secret": "9208e3d4a7997032cf4f1b0e12e5df38"
"8428ef1fadb446dcfeb4f5ed6872d97b",
},
"baraag.net": {
"category" : "baraag",
"access-token" : "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0",
"client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
"client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
},
}
# Runs at import time and has to come after the EXTRACTORS definition,
# since generate_extractors() reads and updates that table.
generate_extractors()