https://im.vsco.co/ URLs redirect to the appropriate CDN server and occasionally insert a '/1200x1600/' into the image path, limiting image dimensions. This commit constructs redirect targets out of the given im.vsco.co URLs without sending extra HTTP requests and without any "builtin" resolution restrictions.
209 lines
7.2 KiB
209 lines
7.2 KiB
# -*- coding: utf-8 -*-
# Copyright 2019 Mike Fährmann
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://vsco.co/"""
from .common import Extractor, Message
from .. import text
import json
# Common URL prefix shared by the extractor patterns in this module:
# optional scheme and "www.", then "vsco.co/" followed by one capture
# group holding the first path segment (read as the user name through
# match.group(1) in VscoExtractor.__init__).
BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co/([^/]+)"
class VscoExtractor(Extractor):
    """Base class for vsco extractors"""
    category = "vsco"
    root = "https://vsco.co"
    directory_fmt = ("{category}", "{user}")
    filename_fmt = "{id}.{extension}"
    archive_fmt = "{id}"

    def __init__(self, match):
        """Remember the lower-cased user name captured by the URL pattern"""
        Extractor.__init__(self, match)
        self.user = match.group(1).lower()

    def items(self):
        """Yield version and directory messages, then one URL per media

        Every object returned by images() is turned into a
        (Message.Url, url, metadata) tuple.
        """
        yield Message.Version, 1
        yield Message.Directory, {"user": self.user}
        for img in self.images():

            if img["is_video"]:
                url = "https://" + img["video_url"]
            else:
                # Build the CDN address directly from the path that follows
                # the host part of 'responsive_url', instead of requesting
                # the given URL and following its redirect (which may add a
                # size-limiting path segment).
                base = img["responsive_url"].partition("/")[2]
                cdn, _, path = base.partition("/")
                if cdn.startswith("aws"):
                    url = "https://image-{}.vsco.co/{}".format(cdn, path)
                elif cdn.isdecimal():
                    url = "https://image.vsco.co/" + base
                else:
                    # unrecognized first path segment: use the URL as given
                    url = "https://" + img["responsive_url"]

            data = text.nameext_from_url(url, {
                "id"    : img["_id"],
                "user"  : self.user,
                "grid"  : img["grid_name"],
                "meta"  : img.get("image_meta") or {},
                "tags"  : [tag["text"] for tag in img.get("tags") or ()],
                # presumably 'upload_date' is in milliseconds -- TODO confirm
                "date"  : text.parse_timestamp(img["upload_date"] // 1000),
                "video" : img["is_video"],
                "width" : img["width"],
                "height": img["height"],
                "description": img["description"],
            })
            yield Message.Url, url, data

    def images(self):
        """Return an iterable with all relevant image objects"""

    def _extract_preload_state(self, url):
        """Request 'url' and return its '__PRELOADED_STATE__' data

        The text that text.extract() returns for the span between
        "__PRELOADED_STATE__ = " and the next "<" is parsed as JSON.
        """
        page = self.request(url, notfound=self.subcategory).text
        return json.loads(text.extract(page, "__PRELOADED_STATE__ = ", "<")[0])

    def _pagination(self, url, params, token, key, extra):
        """Yield media objects from 'extra' and then from the API at 'url'

        url:    API endpoint that gets requested repeatedly
        params: query parameters; updated in place between requests
                ("cursor" is replaced if present, otherwise "page" is
                incremented)
        token:  value sent as 'Bearer' token in the Authorization header
        key:    name of the response field holding the list of media
        extra:  iterable of media objects that are passed through
                _transform_media() and yielded before any API results
        """
        headers = {
            "Referer"          : "{}/{}".format(self.root, self.user),
            "Authorization"    : "Bearer " + token,
            "X-Client-Platform": "web",
            "X-Client-Build"   : "1",
        }

        yield from map(self._transform_media, extra)

        while True:
            data = self.request(url, params=params, headers=headers).json()
            medias = data.get(key)
            if not medias:
                return

            if "cursor" in params:
                # cursor-based results wrap each object in a dict whose
                # "type" value names the key holding the object itself
                for media in medias:
                    yield media[media["type"]]
                cursor = data.get("next_cursor")
                if not cursor:
                    return
                params["cursor"] = cursor
            else:
                yield from medias
                params["page"] += 1

    @staticmethod
    def _transform_media(media):
        """Add snake_case aliases for the camelCase keys of 'media'

        The dict is modified in place and returned. 'videoUrl' and
        'imageMeta' are optional and default to None.
        """
        media["_id"] = media["id"]
        media["is_video"] = media["isVideo"]
        media["grid_name"] = media["gridName"]
        media["upload_date"] = media["uploadDate"]
        media["responsive_url"] = media["responsiveUrl"]
        media["video_url"] = media.get("videoUrl")
        media["image_meta"] = media.get("imageMeta")
        return media
class VscoUserExtractor(VscoExtractor):
    """Extractor for images from a user on vsco.co"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"(?:/images(?:/\d+)?)?/?(?:$|[?#])"
    test = (
        ("https://vsco.co/missuri/images/1", {
            "pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/vsco\w+",
            "range": "1-80",
            "count": 80,
        }),
    )

    def images(self):
        """Return an iterator over the media objects of a user

        Media referenced by the preloaded state of the user's first
        'images' page come first; further results are fetched from the
        'medias/profile' API endpoint, starting at the preloaded cursor.
        """
        url = "{}/{}/images/1".format(self.root, self.user)
        data = self._extract_preload_state(url)

        tkn = data["users"]["currentUser"]["tkn"]
        sid = str(data["sites"]["siteByUsername"][self.user]["site"]["id"])
        site = data["medias"]["bySiteId"][sid]

        url = "{}/api/3.0/medias/profile".format(self.root)
        params = {
            "site_id"  : sid,
            "limit"    : "14",
            "show_only": "0",
            "cursor"   : site["nextCursor"],
        }

        # NOTE(review): the element expression of this generator was not
        # present in the reviewed text. It is reconstructed on the
        # assumption that entries of site["medias"] use the same
        # {"type": T, T: ...} layout as the API results and that
        # data["medias"]["byId"] maps IDs to {"media": ...} -- confirm.
        return self._pagination(url, params, tkn, "media", (
            data["medias"]["byId"][media[media["type"]]]["media"]
            for media in site["medias"]
        ))
class VscoCollectionExtractor(VscoExtractor):
    """Extractor for images from a collection on vsco.co"""
    subcategory = "collection"
    directory_fmt = ("{category}", "{user}", "collection")
    archive_fmt = "c_{user}_{id}"
    pattern = BASE_PATTERN + r"/collection/"
    test = ("https://vsco.co/vsco/collection/1", {
        "pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/vsco\w+\.\w+",
        "range": "1-80",
        "count": 80,
    })

    def images(self):
        """Return an iterator over the media objects of a user's collection

        Media referenced by the preloaded state of the first collection
        page come first; further results are fetched page by page
        (starting with page 2) from the 'collections' API endpoint.
        """
        url = "{}/{}/collection/1".format(self.root, self.user)
        data = self._extract_preload_state(url)

        tkn = data["users"]["currentUser"]["tkn"]
        # NOTE(review): the continuation of this expression was not present
        # in the reviewed text; the key path after [self.user] is
        # reconstructed and must be confirmed.
        cid = (data["sites"]["siteByUsername"][self.user]
               ["site"]["siteCollectionId"])

        url = "{}/api/2.0/collections/{}/medias".format(self.root, cid)
        params = {"page": 2, "size": "20"}
        # NOTE(review): the element expression and the key path following
        # 'data' were not present in the reviewed text; both are
        # reconstructed (assuming a list of media IDs) -- confirm.
        return self._pagination(url, params, tkn, "medias", (
            data["medias"]["byId"][mid]["media"]
            for mid in data
            ["collections"]["byCollectionId"][cid]["collection"]
        ))
class VscoImageExtractor(VscoExtractor):
    """Extractor for individual images on vsco.co"""
    subcategory = "image"
    pattern = BASE_PATTERN + r"/media/([0-9a-fA-F]+)"
    test = (
        ("https://vsco.co/erenyildiz/media/5d34b93ef632433030707ce2", {
            "url": "a45f9712325b42742324b330c348b72477996031",
            "content": "1394d070828d82078035f19a92f404557b56b83f",
            "keyword": {
                "id"    : "5d34b93ef632433030707ce2",
                "user"  : "erenyildiz",
                "grid"  : "erenyildiz",
                "meta"  : dict,
                "tags"  : list,
                "date"  : "type:datetime",
                "video" : False,
                "width" : 1537,
                "height": 1537,
                "description": "re:Ni seviyorum. #vsco #vscox #vscochallenges",
            },
        }),
        ("https://vsco.co/jimenalazof/media/5b4feec558f6c45c18c040fd", {
            "url": "08e7eef3301756ce81206c0b47c1e9373756a74a",
            "content": "e739f058d726ee42c51c180a505747972a7dfa47",
            "keyword": {"video" : True},
        }),
    )

    def __init__(self, match):
        """Store the media ID captured by the second pattern group"""
        VscoExtractor.__init__(self, match)
        self.media_id = match.group(2)

    def images(self):
        """Return a 1-tuple with the media object of the requested page"""
        url = "{}/{}/media/{}".format(self.root, self.user, self.media_id)
        data = self._extract_preload_state(url)
        # popitem() takes an arbitrary (id, entry) pair; this presumably
        # relies on the page's state holding exactly one media entry
        media = data["medias"]["byId"].popitem()[1]["media"]
        return (self._transform_media(media),)