Mike Fährmann 968d3e8465
remove '&' from URL patterns
'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"
2020-10-22 23:31:25 +02:00

242 lines
8.1 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for mastodon instances"""
from .common import Extractor, Message
from .. import text, util, config, exception
import re
class MastodonExtractor(Extractor):
    """Base class for mastodon extractors

    Subclasses provide the status objects to process by overriding
    statuses(); items() turns their media attachments into messages.
    """
    basecategory = "mastodon"
    directory_fmt = ("mastodon", "{instance}", "{account[username]}")
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"
    cookiedomain = None
    instance = None
    root = None

    def __init__(self, match):
        super().__init__(match)
        self.api = MastodonAPI(self)

    def config(self, key, default=None):
        """Return the config value for 'key' or 'default'

        The lookup is delegated to config.interpolate_common() with two
        paths below "extractor": one built from category/subcategory and
        one built from basecategory/instance/subcategory.
        """
        search_paths = (
            (self.category, self.subcategory),
            (self.basecategory, self.instance, self.subcategory),
        )
        return config.interpolate_common(
            ("extractor",), search_paths, key, default)

    def items(self):
        """Yield a Directory and Url messages for each status with media"""
        yield Message.Version, 1

        for status in self.statuses():
            media_list = status["media_attachments"]
            if not media_list:
                # nothing to download for this status
                continue

            # prepare() removes "media_attachments" from the status,
            # which is why the list was fetched beforehand
            self.prepare(status)
            yield Message.Directory, status

            for media in media_list:
                status["media"] = media
                media_url = media["url"]
                yield (Message.Url, media_url,
                       text.nameext_from_url(media_url, status))

    def statuses(self):
        """Return an iterable containing all relevant Status-objects"""
        return ()

    def prepare(self, status):
        """Prepare a status object

        Modifies 'status' in place: drops its attachment list, stores the
        instance name, reduces tags to their names and adds a parsed
        "date" value.
        """
        del status["media_attachments"]
        status["instance"] = self.instance
        status["tags"] = [entry["name"] for entry in status["tags"]]

        # only the first 19 characters (date and time down to seconds)
        # are handed to the parser
        timestamp = status["created_at"][:19]
        status["date"] = text.parse_datetime(timestamp, "%Y-%m-%dT%H:%M:%S")
class MastodonUserExtractor(MastodonExtractor):
    """Extractor for all images of an account/user"""
    subcategory = "user"

    def __init__(self, match):
        super().__init__(match)
        self.account_name = match.group(1)

    def statuses(self):
        """Return the statuses of the account named in the URL

        Raises exception.NotFoundError if the account search returns no
        entry whose username equals the requested name.
        """
        username = self.account_name
        query = "@{}@{}".format(username, self.instance)

        for candidate in self.api.account_search(query, 1):
            if candidate["username"] == username:
                return self.api.account_statuses(candidate["id"])

        raise exception.NotFoundError("account")
class MastodonStatusExtractor(MastodonExtractor):
    """Extractor for images from a status"""
    subcategory = "status"

    def __init__(self, match):
        super().__init__(match)
        self.status_id = match.group(1)

    def statuses(self):
        """Return a 1-tuple holding the status identified by the URL"""
        status = self.api.status(self.status_id)
        return (status,)
class MastodonAPI():
    """Minimal interface for the Mastodon API

    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """

    def __init__(self, extractor, access_token=None):
        """Bind this API client to 'extractor'

        Without an explicit 'access_token', the token is looked up with
        extractor.config("access-token"), using extractor.access_token
        as default value.
        """
        self.root = extractor.root
        self.extractor = extractor

        if not access_token:
            access_token = extractor.config(
                "access-token", extractor.access_token)
        self.headers = {"Authorization": "Bearer {}".format(access_token)}

    def account_search(self, query, limit=40):
        """Search for accounts matching 'query'"""
        params = {"q": query, "limit": limit}
        return self._call("accounts/search", params).json()

    def account_statuses(self, account_id):
        """Get an account's statuses

        Returns a generator over all status objects; only statuses with
        media are requested ("only_media").
        """
        endpoint = "accounts/{}/statuses".format(account_id)
        params = {"only_media": "1"}
        return self._pagination(endpoint, params)

    def status(self, status_id):
        """Fetch a Status"""
        return self._call("statuses/" + status_id).json()

    def _call(self, endpoint, params=None):
        """Send a request to 'endpoint' and return its response object

        'endpoint' is either a complete URL (anything starting with
        "http") or a path relative to '<root>/api/v1/'.

        Raises exception.NotFoundError on HTTP 404 and
        exception.StopExtraction, with the "error" value of the JSON
        response body, on any other status code >= 400 except 429.
        """
        if endpoint.startswith("http"):
            url = endpoint
        else:
            url = "{}/api/v1/{}".format(self.root, endpoint)

        while True:
            response = self.extractor.request(
                url, params=params, headers=self.headers, fatal=None)
            code = response.status_code

            if code < 400:
                return response
            if code == 404:
                raise exception.NotFoundError()
            if code == 429:
                # rate limited: wait until the time announced by the
                # server, then repeat the same request
                self.extractor.wait(until=text.parse_datetime(
                    response.headers["x-ratelimit-reset"],
                    "%Y-%m-%dT%H:%M:%S.%fZ",
                ))
                continue
            raise exception.StopExtraction(response.json().get("error"))

    def _pagination(self, endpoint, params):
        """Yield the results of 'endpoint' from all of its pages

        Follows the "next" entry of each response's links until a
        response has none.
        """
        url = "{}/api/v1/{}".format(self.root, endpoint)

        while url:
            response = self._call(url, params)
            yield from response.json()

            next_link = response.links.get("next")
            url = next_link["url"] if next_link else None
            # The "next" URL already contains its complete query string;
            # sending 'params' again would duplicate those parameters.
            params = None
def generate_extractors():
    """Dynamically generate Extractor classes for Mastodon instances

    Merges the "mastodon" entry of the extractor config into EXTRACTORS,
    writes the result back to the config, and then stores one user and
    one status extractor class per instance in this module's globals.
    """
    namespace = globals()

    user_instances = config.get(("extractor",), "mastodon")
    if user_instances:
        util.combine_dict(EXTRACTORS, user_instances)
    config.set(("extractor",), "mastodon", EXTRACTORS)

    # (base class, class name suffix, docstring prefix,
    #  URL pattern tail, key of the test data in the instance info)
    variants = (
        (MastodonUserExtractor, "UserExtractor",
         "Extractor for all images of a user on ",
         r"/@([^/?#]+)(?:/media)?/?$", "test-user"),
        (MastodonStatusExtractor, "StatusExtractor",
         "Extractor for images from a status on ",
         r"/@[^/?#]+/(\d+)", "test-status"),
    )

    for instance, info in EXTRACTORS.items():
        if not isinstance(info, dict):
            continue

        category = info.get("category") or instance.replace(".", "")
        root = info.get("root") or "https://" + instance
        name = (info.get("name") or category).capitalize()
        token = info.get("access-token")
        pattern = info.get("pattern") or re.escape(instance)
        url_head = r"(?:https?://)?" + pattern

        for base, suffix, doc, url_tail, test_key in variants:

            class Extr(base):
                pass

            Extr.__name__ = Extr.__qualname__ = name + suffix
            Extr.__doc__ = doc + instance
            Extr.category = category
            Extr.instance = instance
            Extr.pattern = url_head + url_tail
            Extr.test = info.get(test_key)
            Extr.root = root
            Extr.access_token = token
            namespace[Extr.__name__] = Extr
# Built-in instance definitions, keyed by instance domain.
# generate_extractors() merges user-configured instances into this dict
# and reads "category", "access-token", "test-user" and "test-status"
# (as well as the optional "root", "name" and "pattern") from each entry.
EXTRACTORS = {
    "mastodon.social": {
        "category": "mastodon.social",
        "access-token": "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48",
        "client-id": "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo",
        "client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI",
        "test-user": (
            "https://mastodon.social/@jk",
            {
                "pattern": (
                    r"https://files.mastodon.social/media_attachments"
                    r"/files/(\d+/){3,}original/\w+"
                ),
                "range": "1-60",
                "count": 60,
            },
        ),
        "test-status": (
            "https://mastodon.social/@jk/103794036899778366",
            {
                "count": 4,
            },
        ),
    },
    "pawoo.net": {
        "category": "pawoo",
        "access-token": (
            "c12c9d275050bce0dc92169a28db09d7"
            "0d62d0a75a8525953098c167eacd3668"
        ),
        "client-id": (
            "978a25f843ec01e53d09be2c290cd75c"
            "782bc3b7fdbd7ea4164b9f3c3780c8ff"
        ),
        "client-secret": (
            "9208e3d4a7997032cf4f1b0e12e5df38"
            "8428ef1fadb446dcfeb4f5ed6872d97b"
        ),
    },
    "baraag.net": {
        "category": "baraag",
        "access-token": "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0",
        "client-id": "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
        "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
    },
}
# Runs when this module is imported: creates the per-instance extractor
# classes and stores them in this module's global namespace.
generate_extractors()