# -*- coding: utf-8 -*-

# Copyright 2016-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://www.tumblr.com/"""

from .common import Extractor, Message
from .. import text, oauth, extractor, exception
from datetime import datetime, timedelta
import re
import time


def _original_inline_image(url):
    """Return 'url' with tumblr image URLs rewritten to the '_1280' size.

    Every substring of the form
    'http(s)://<digits>.media.tumblr.com[/<hex>]/tumblr[_inline]_<id>_<n>.<ext>'
    is replaced by the same location over https with the numeric size
    suffix '<n>' set to 1280. Input that does not match the pattern is
    returned unchanged.
    """
    return re.sub(
        (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
         r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
        r"https://\1_1280.\2", url
    )


def _original_video(url):
    """Return 'url' with the size suffix removed from tumblr video URLs.

    Matches hosts 'vt', 'vtt' or 've' (optionally followed by '.media')
    under tumblr.com and rewrites 'tumblr_<id>_<n>.<ext>' to
    'tumblr_<id>.<ext>' over https. Input that does not match the pattern
    is returned unchanged.
    """
    return re.sub(
        (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
         r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
        r"https://\1.\2", url
    )


# Post type names known to this module.
# NOTE(review): presumably consulted by _setup_posttypes(), which is not
# visible in this copy of the file -- confirm.
POST_TYPES = frozenset((
    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))

# Regex fragment with two alternative capture groups:
#   group 1: what follows an explicit "tumblr:" prefix (optional scheme,
#            then everything up to the first "/")
#   group 2: a "<name>.tumblr.com" host name
# TumblrExtractor.__init__ uses whichever of the two groups is non-empty.
BASE_PATTERN = (
    r"(?:tumblr:(?:https?://)?([^/]+)|"
    r"(?:https?://)?([^.]+\.tumblr\.com))")


class TumblrExtractor(Extractor):
    """Base class for tumblr extractors"""
    category = "tumblr"
    # Format strings for directory, file name and archive entries.
    # NOTE(review): presumably evaluated by code outside this file against
    # the metadata yielded from items() -- confirm.
    directory_fmt = ("{category}", "{name}")
    filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}"
    archive_fmt = "{id}_{num}"

    def __init__(self, match):
        """Set up blog name, API object and configuration options.

        'match' is a regex match object; its group 1 or, if that is empty,
        group 2 is taken as the blog identifier.
        """
        Extractor.__init__(self)
        self.blog = match.group(1) or match.group(2)
        self.api = TumblrAPI(self)
        self.types = self._setup_posttypes()

        # user-configurable switches and their defaults
        self.avatar = self.config("avatar", False)
        self.inline = self.config("inline", True)
        self.reblogs = self.config("reblogs", True)
        self.external = self.config("external", False)

        if len(self.types) == 1:
            # exactly one post type selected: pass it on to the API object
            # NOTE(review): presumably so the API can filter by type on the
            # server side -- the code using 'posts_type' is not visible.
            self.api.posts_type = next(iter(self.types))
        elif not self.types:
            self.log.warning("no valid post types selected")

        if self.reblogs == "same-blog":
            # replace the reblog filter used by items() on this instance
            self._skip_reblog = self._skip_reblog_same_blog

    def items(self):
        """Generate messages for all selected posts of the blog.

        Yields a version message first. Blog info is requested once, when
        the first post with a selected type is seen, and is announced with
        a directory message (followed by the avatar if enabled). After
        that, one message per media URL found in each post is yielded.
        """
        blog = None
        yield Message.Version, 1

        for post in self.posts():
            if post["type"] not in self.types:
                continue
            if not blog:
                # fetch blog info lazily, i.e. only if there is at least
                # one post to process
                blog = self.api.info(self.blog)
                blog["uuid"] = self.blog
                yield Message.Directory, blog.copy()

                if self.avatar:
                    url = self.api.avatar(self.blog)
                    yield self._prepare_avatar(url, post.copy(), blog)

            # a post counts as reblog if it names the post it came from
            reblog = "reblogged_from_id" in post
            if reblog and self._skip_reblog(post):
                continue
            post["reblogged"] = reblog

            post["blog"] = blog
            # NOTE(review): 'num' feeds "{num}" in filename_fmt/archive_fmt;
            # presumably incremented by the _prepare*() helpers, which are
            # not visible here -- confirm.
            post["num"] = 0

            if "trail" in post:
                del post["trail"]

            if "photos" in post:  # type "photo" or "link"
                photos = post["photos"]
                del post["photos"]

                for photo in photos:
                    post["photo"] = photo
                    # lift the fields of 'original_size' (including "url")
                    # into the photo dict itself, then drop the size lists
                    photo.update(photo["original_size"])
                    del photo["original_size"]
                    del photo["alt_sizes"]
                    yield self._prepare_image(photo["url"], post)

            if "audio_url" in post:  # type: "audio"
                yield self._prepare(post["audio_url"], post)

            if "video_url" in post:  # type: "video"
                yield self._prepare(_original_video(post["video_url"]), post)

            if self.inline and "reblog" in post:  # inline media
                # only "chat" posts are missing a "reblog" key in their
                # API response, but they can't contain images/videos anyway
                body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
                # NOTE(review): SOURCE is damaged from here on. The text
                # between the opening quote of the pattern below and the
                # '= data["total_posts"]' fragment is missing (it looks as
                # if everything from a "<" up to a later ">" was stripped),
                # taking with it the rest of items(), any definitions that
                # followed, and the header of the class that owns likes(),
                # _call() and _to_time(). The residue is kept verbatim;
                # restore it from the upstream file before relying on this
                # module.
                for url in re.findall('= data["total_posts"]: return

    # NOTE(review): the methods below use self.api_key, self.request and
    # self.log; presumably they belong to the TumblrAPI class instantiated
    # in TumblrExtractor.__init__, whose header is in the missing part --
    # confirm.
    def likes(self, blog):
        """Retrieve liked posts

        Generator over the entries of 'liked_posts' returned by the
        "likes" endpoint for 'blog'. Posts are requested 50 at a time; the
        'liked_timestamp' of the last post of a page is sent as 'before'
        for the next request. Iteration ends with the first empty page.
        """
        params = {"limit": 50}
        while True:
            posts = self._call(blog, "likes", params)["liked_posts"]
            if not posts:
                return
            yield from posts
            params["before"] = posts[-1]["liked_timestamp"]

    def _call(self, blog, endpoint, params, **kwargs):
        """Request an API endpoint of 'blog' and return its "response" data.

        Sends a request to 'https://api.tumblr.com/v2/blog/<blog>/<endpoint>'
        with 'params' as query parameters; extra keyword arguments are
        handed to self.request(). If self.api_key is set, it is stored in
        'params' under "api_key" (the caller's dict is modified).

        The status that is evaluated is the one inside the JSON payload
        (data["meta"]["status"]):
          - 200-399: data["response"] is returned
          - 403: raises exception.AuthorizationError
          - 404: raises exception.NotFoundError
          - 429: daily limit -> logs an error and raises
                 exception.StopExtraction; hourly limit -> sleeps until
                 the limit resets and repeats the request
          - anything else (and a 429 without usable rate limit headers):
            logs the payload and raises exception.StopExtraction
        """
        if self.api_key:
            params["api_key"] = self.api_key
        url = "https://api.tumblr.com/v2/blog/{}/{}".format(
            blog, endpoint)

        response = self.request(url, params=params, **kwargs)
        data = response.json()
        status = data["meta"]["status"]

        if 200 <= status < 400:
            return data["response"]
        elif status == 403:
            raise exception.AuthorizationError()
        elif status == 404:
            raise exception.NotFoundError("user or post")
        elif status == 429:

            # daily rate limit
            # (header values are strings, hence the comparison with "0")
            if response.headers.get("x-ratelimit-perday-remaining") == "0":
                reset = response.headers.get("x-ratelimit-perday-reset")
                self.log.error(
                    "Daily API rate limit exceeded: aborting; "
                    "rate limit will reset at %s", self._to_time(reset),
                )
                raise exception.StopExtraction()

            # hourly rate limit
            reset = response.headers.get("x-ratelimit-perhour-reset")
            if reset:
                self.log.info(
                    "Hourly API rate limit exceeded; "
                    "waiting until %s for rate limit reset",
                    self._to_time(reset),
                )
                # 'reset' is used as a number of seconds; one extra second
                # is added before trying again
                # NOTE(review): int(reset) is not guarded here, unlike in
                # _to_time(); a non-numeric header value would raise
                # ValueError -- confirm whether that can happen.
                time.sleep(int(reset) + 1)
                # NOTE(review): the retry does not forward **kwargs, so
                # options given to the first attempt are not applied to
                # the repeated request -- confirm whether intended.
                return self._call(blog, endpoint, params)

        # unhandled status, or 429 without rate limit information
        self.log.error(data)
        raise exception.StopExtraction()

    @staticmethod
    def _to_time(reset):
        """Return the time 'reset' seconds from now as "HH:MM:SS".

        Uses datetime.now(), i.e. a naive timestamp without time zone
        information. Returns "?" if 'reset' cannot be converted to int
        (for example None or a non-numeric string).
        """
        try:
            reset_time = datetime.now() + timedelta(seconds=int(reset))
        except (ValueError, TypeError):
            return "?"
        return reset_time.strftime("%H:%M:%S")