357 lines
13 KiB
Python
Raw Normal View History

2017-05-30 17:43:02 +02:00
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://www.flickr.com/"""
from .common import Extractor, Message
from .. import text, util, exception
from . import oauth
2017-05-30 17:43:02 +02:00
class FlickrExtractor(Extractor):
    """Shared foundation for all flickr extractors"""
    category = "flickr"
    filename_fmt = "{category}_{id}.{extension}"

    def __init__(self, match):
        Extractor.__init__(self)
        self.api = FlickrAPI(self)
        # first regex group: photo ID, user name or group name,
        # depending on the subclass' pattern
        self.item_id = match.group(1)
        self.user = None
        self.load_extra = self.config("metadata", False)

    def items(self):
        """Yield the version, directory and per-photo URL messages"""
        metadata = self.data()
        yield Message.Version, 1
        yield Message.Directory, metadata

        for entry in self.photos():
            # merge the shared metadata into every photo dict
            entry.update(metadata)
            source = entry["photo"]["source"]
            yield Message.Url, source, text.nameext_from_url(source, entry)

    def data(self):
        """Look up the user for 'item_id' and return it as metadata"""
        user = self.api.urls_lookupUser(self.item_id)
        self.user = user
        return {"user": user}

    def photos(self):
        """Return an iterable of photo dicts; empty in the base class"""
        return []
2017-05-30 17:43:02 +02:00
class FlickrImageExtractor(FlickrExtractor):
    """Extractor for a single image from flickr.com"""
    subcategory = "image"
    pattern = [r"(?:https?://)?(?:www\.|m\.)?flickr\.com/photos/[^/]+/(\d+)",
               r"(?:https?://)?[^.]+\.static\.?flickr\.com/(?:\d+/)+(\d+)_",
               r"(?:https?://)?flic\.kr/(p)/([A-Za-z1-9]+)"]
    test = [
        ("https://www.flickr.com/photos/departingyyz/16089302239", {
            "url": "7f0887f5953f61c8b79a695cb102ea309c0346b0",
            "keyword": "5ecdaf0192802451b7daca9b81f393f207ff7ee9",
            "content": "6aaad7512d335ca93286fe2046e7fe3bb93d808e",
        }),
        ("http://c2.staticflickr.com/2/1475/24531000464_9a7503ae68_b.jpg", {
            "url": "40f5163488522ca5d918750ed7bd7fcf437982fe"}),
        ("https://farm2.static.flickr.com/1035/1188352415_cb139831d0.jpg", {
            "url": "ef217b4fdcb148a0cc9eae44b9342d4a65f6d697"}),
        ("https://flic.kr/p/FPVo9U", {
            "url": "92c54a00f31040c349cb2abcb1b9abe30cc508ae"}),
        ("https://www.flickr.com/photos/zzz/16089302238", {
            "exception": exception.NotFoundError}),
    ]

    def __init__(self, match):
        FlickrExtractor.__init__(self, match)
        if self.item_id != "p":
            return
        # flic.kr short URLs: group 1 is the literal "p" and group 2
        # holds the encoded photo ID, decoded with this 58-char alphabet
        charset = ("123456789abcdefghijkmnopqrstu"
                   "vwxyzABCDEFGHJKLMNPQRSTUVWXYZ")
        self.item_id = util.bdecode(match.group(2), charset)

    def items(self):
        """Yield version, directory and URL messages for the one photo"""
        # the last entry of the size list is the one that gets downloaded
        chosen = self.api.photos_getSizes(self.item_id)[-1]

        if not self.load_extra:
            metadata = {"id": self.item_id}
        else:
            metadata = self.api.photos_getInfo(self.item_id)
            self._clean(metadata)

        metadata["photo"] = chosen
        source = chosen["source"]
        text.nameext_from_url(source, metadata)

        yield Message.Version, 1
        yield Message.Directory, metadata
        yield Message.Url, source, metadata

    @staticmethod
    def _clean(photo):
        """Drop and flatten parts of a 'photos.getInfo' result in place"""
        for unwanted in ("comments", "views"):
            del photo[unwanted]

        photo["title"] = photo["title"]["_content"]
        photo["tags"] = [tag["raw"] for tag in photo["tags"]["tag"]]

        if "location" not in photo:
            return
        place = photo["location"]
        # replace nested {"_content": ...} dicts by their content
        for name, entry in place.items():
            if isinstance(entry, dict):
                place[name] = entry["_content"]
2017-05-30 17:43:02 +02:00
2017-05-31 17:31:51 +02:00
class FlickrAlbumExtractor(FlickrExtractor):
    """Extractor for albums (photosets) from flickr.com"""
    subcategory = "album"
    directory_fmt = ["{category}", "{subcategory}s",
                     "{album[id]} - {album[title]}"]
    pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/"
               r"photos/([^/]+)/(?:album|set)s/(\d+)"]
    test = [("https://www.flickr.com/photos/flickr/albums/72157656845052880", {
        "url": "517db3faa55e88686f1d00a379f8f0daf4c7b837",
        "keyword": "001fb1c99a6331cf69d72392af3badf95e8fe51e",
    })]

    def __init__(self, match):
        FlickrExtractor.__init__(self, match)
        self.album_id = match.group(2)

    def data(self):
        """Fetch the first result page and return the album metadata"""
        pages = self.api.photosets_getPhotos(self.album_id)
        self._generator = pages
        self._first = next(pages)

        # album metadata is the first page without its photo list
        album = dict(self._first)
        album.pop("photo")
        return {"album": album}

    def photos(self):
        """Yield every photo of the album with its 'photo' entry set"""
        extract = self.api._extract_format
        for item in self._photos():
            extract(item)
            yield item

    def _photos(self):
        """Yield raw photo dicts from the first and all following pages"""
        page = self._first
        while page is not None:
            yield from page["photo"]
            # the next page is only requested once this one is used up
            page = next(self._generator, None)
class FlickrGalleryExtractor(FlickrExtractor):
    """Extractor for galleries from flickr.com"""
    subcategory = "gallery"
    directory_fmt = ["{category}", "galleries",
                     "{user[username]} {gallery[id]}"]
    pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/"
               r"photos/([^/]+)/galleries/(\d+)"]
    test = [(("https://www.flickr.com/photos/flickr/"
              "galleries/72157681572514792/"), {
        "url": "97dd9640b09384f313845b784046da410f70aee6",
        "keyword": "8b11026066ec86290ae18833859623ee5b52d363",
    })]

    def __init__(self, match):
        FlickrExtractor.__init__(self, match)
        self.gallery_id = match.group(2)

    def data(self):
        """Return user metadata extended by a 'gallery' entry"""
        metadata = FlickrExtractor.data(self)
        # full gallery info costs an extra API call, so it is only
        # requested when the 'metadata' option is enabled
        metadata["gallery"] = (
            self.api.galleries_getInfo(self.gallery_id)
            if self.load_extra else
            {"id": self.gallery_id}
        )
        return metadata

    def photos(self):
        """Return an iterable over all photos of the gallery"""
        gallery_id = self.gallery_id
        return self.api.galleries_getPhotos(gallery_id)
2017-05-31 17:31:51 +02:00
2017-06-06 16:22:30 +02:00
class FlickrGroupExtractor(FlickrExtractor):
    """Extractor for the photo pool of a flickr.com group"""
    subcategory = "group"
    directory_fmt = ["{category}", "{subcategory}s", "{group[groupname]}"]
    pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"]
    test = [("https://www.flickr.com/groups/bird_headshots/", {
        "url": "40b5586fa0cd1578c3b8cc874fc6e3ae7af70786",
        "keyword": "24e721cc510c1f74bc3d8f7bd6130773ba3faef4",
    })]

    def data(self):
        """Look up the group for 'item_id' and return it as metadata"""
        group = self.api.urls_lookupGroup(self.item_id)
        self.group = group
        return {"group": group}

    def photos(self):
        """Return an iterable over the photos in the group's pool"""
        nsid = self.group["nsid"]
        return self.api.groups_pools_getPhotos(nsid)
2017-06-02 17:15:05 +02:00
class FlickrUserExtractor(FlickrExtractor):
    """Extractor for a flickr user's photostream"""
    subcategory = "user"
    directory_fmt = ["{category}", "{user[username]}"]
    pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"]
    test = [("https://www.flickr.com/photos/shona_s/", {
        "url": "d125b536cd8c4229363276b6c84579c394eec3a2",
        "keyword": "2cdeae22cd9c3ff19ce905215f3782a7494d8264",
    })]

    def photos(self):
        """Return an iterable over the user's public photos"""
        # 'self.user' is filled in by FlickrExtractor.data()
        nsid = self.user["nsid"]
        return self.api.people_getPublicPhotos(nsid)
2017-06-02 17:15:05 +02:00
2017-06-06 14:22:36 +02:00
class FlickrFavoriteExtractor(FlickrExtractor):
    """Extractor for the photos a flickr user marked as favorite"""
    subcategory = "favorite"
    directory_fmt = ["{category}", "{subcategory}s", "{user[username]}"]
    pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"]
    test = [("https://www.flickr.com/photos/shona_s/favorites", {
        "url": "5129b3f5bfa83cc25bdae3ce476036de1488dad2",
        "keyword": "0e1c9521b6051411b585c9b41a4dc0bcde20e616",
    })]

    def photos(self):
        """Return an iterable over the user's public favorites"""
        # 'self.user' is filled in by FlickrExtractor.data()
        nsid = self.user["nsid"]
        return self.api.favorites_getPublicList(nsid)
2017-06-02 16:35:04 +02:00
2017-05-30 17:43:02 +02:00
class FlickrAPI:
    """Minimal interface for the flickr API

    Requests go through the session of the extractor passed to the
    constructor.  If that extractor is configured with both
    'access-token' and 'access-token-secret', the session is wrapped in
    an OAuth session and the API key is no longer added to the request
    parameters.
    """
    API_URL = "https://api.flickr.com/services/rest/"
    API_KEY = "ac4fd7aa98585b9eee1ba761c209de68"
    API_SECRET = "3adb0f568dc68393"
    # (suffix, label) pairs in the order in which _extract_format()
    # looks for a matching 'url_<suffix>' entry
    FORMATS = [("o", "Original"), ("k", "Large 2048"),
               ("h", "Large 1600"), ("l", "Large")]

    def __init__(self, extractor):
        token = extractor.config("access-token")
        token_secret = extractor.config("access-token-secret")
        if token and token_secret:
            self.session = oauth.OAuthSession(
                extractor.session,
                self.API_KEY, self.API_SECRET, token, token_secret)
            # shadow the class attribute so _call() stops sending 'api_key'
            self.API_KEY = None
        else:
            self.session = extractor.session
        # only used as argument for NotFoundError in _call()
        self.subcategory = extractor.subcategory

    def favorites_getPublicList(self, user_id):
        """Returns a list of favorite public photos for the given user."""
        params = {"user_id": user_id}
        return self._listing("favorites.getPublicList", params)

    def galleries_getInfo(self, gallery_id):
        """Gets information about a gallery."""
        params = {"gallery_id": gallery_id}
        gallery = self._call("galleries.getInfo", params)["gallery"]

        del gallery["count_views"]
        del gallery["count_comments"]
        # flatten {"_content": ...} wrappers into plain values
        gallery["title"] = gallery["title"]["_content"]
        gallery["description"] = gallery["description"]["_content"]
        return gallery

    def galleries_getPhotos(self, gallery_id):
        """Return the list of photos for a gallery."""
        params = {"gallery_id": gallery_id}
        return self._listing("galleries.getPhotos", params)

    def groups_pools_getPhotos(self, group_id):
        """Returns a list of pool photos for a given group."""
        params = {"group_id": group_id}
        return self._listing("groups.pools.getPhotos", params)

    def people_getPublicPhotos(self, user_id):
        """Get a list of public photos for the given user."""
        params = {"user_id": user_id}
        return self._listing("people.getPublicPhotos", params)

    def photos_getInfo(self, photo_id):
        """Get information about a photo."""
        params = {"photo_id": photo_id}
        return self._call("photos.getInfo", params)["photo"]

    def photos_getSizes(self, photo_id):
        """Returns the available sizes for a photo."""
        params = {"photo_id": photo_id}
        return self._call("photos.getSizes", params)["sizes"]["size"]

    def photosets_getPhotos(self, photoset_id):
        """Get the list of photos in a set.

        Unlike the other listing methods this yields whole result pages
        (see _pagination()) and not single photos.
        """
        params = {"photoset_id": photoset_id}
        return self._pagination("photosets.getPhotos", params)

    def urls_lookupGroup(self, groupname):
        """Returns a group NSID, given the url to a group's page."""
        params = {"url": "https://www.flickr.com/groups/" + groupname}
        group = self._call("urls.lookupGroup", params)["group"]
        return {"nsid": group["id"],
                "path_alias": groupname,
                "groupname": group["groupname"]["_content"]}

    def urls_lookupUser(self, username):
        """Returns a user NSID, given the url to a user's photos or profile."""
        params = {"url": "https://www.flickr.com/photos/" + username}
        user = self._call("urls.lookupUser", params)["user"]
        return {"nsid": user["id"],
                "path_alias": username,
                "username": user["username"]["_content"]}

    def _call(self, method, params):
        """Send an API request and return the decoded JSON response.

        'params' is modified in place: 'method', 'format',
        'nojsoncallback' and, as long as API_KEY is set, 'api_key' are
        added to it.

        Raises exception.NotFoundError if the response has a 'code'
        value of 1.
        """
        params["method"] = "flickr." + method
        params["format"] = "json"
        params["nojsoncallback"] = "1"
        if self.API_KEY:
            params["api_key"] = self.API_KEY
        data = self.session.get(self.API_URL, params=params).json()
        if "code" in data and data["code"] == 1:
            raise exception.NotFoundError(self.subcategory)
        return data

    def _pagination(self, method, params):
        """Yield the payload object of each result page of 'method'.

        Requests the 'url_*' extras needed by _extract_format() and
        increments 'page' in 'params' until the page count reported by
        the API is reached.  The paging fields 'page', 'perpage' and
        'per_page' are removed from each payload before it is yielded.
        """
        params["extras"] = "url_o,url_k,url_h,url_l"
        params["page"] = 1

        while True:
            data = self._call(method, params)

            # The response wraps the payload in an envelope next to a
            # "stat" status field; take the first entry that is not that
            # field, independent of the order of the keys.
            for key, obj in data.items():
                if key != "stat":
                    break

            del obj["page"]
            del obj["perpage"]
            if "per_page" in obj:
                del obj["per_page"]
            yield obj

            if params["page"] >= obj["pages"]:
                break
            params["page"] += 1

    def _listing(self, method, params):
        """Yield all photos of a paginated 'method', one by one.

        Each photo gets its 'photo' entry set by _extract_format().
        """
        for photos in self._pagination(method, params):
            for photo in photos["photo"]:
                self._extract_format(photo)
                yield photo

    def _extract_format(self, photo):
        """Set photo["photo"] to the info of the preferred image format.

        The first format from FORMATS for which a 'url_<suffix>' entry
        exists is used and all 'url_*', 'width_*' and 'height_*' entries
        are removed afterwards.  If none is present, the last entry
        returned by photos_getSizes() is used instead.
        """
        for fmt, fmtname in self.FORMATS:
            url_key = "url_" + fmt
            if url_key not in photo:
                continue

            # generate photo info
            photo["photo"] = {
                "source": photo[url_key],
                "width" : photo["width_" + fmt],
                "height": photo["height_" + fmt],
                "label" : fmtname,
                "media" : "photo",
            }

            # remove excess data; collect the names first, since a dict
            # must not change size while it is being iterated
            excess = [
                name for name in photo
                if name.startswith(("url_", "width_", "height_"))
            ]
            for name in excess:
                del photo[name]
            return

        # extra API call to get photo url and size
        photo["photo"] = self.photos_getSizes(photo["id"])[-1]