# -*- coding: utf-8 -*- # Copyright 2015 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extract images from http://www.deviantart.com/""" from .common import Extractor, AsynchronousExtractor, Message from .. import text import re class DeviantartUserExtractor(AsynchronousExtractor): """Extractor for all works from an artist on deviantart.com""" category = "deviantart" subcategory = "user" directory_fmt = ["{category}", "{artist}"] filename_fmt = "{category}_{index}_{title}.{extension}" pattern = [r"(?:https?://)?([^\.]+)\.deviantart\.com(?:/gallery)?/?$"] test = [("http://shimoda7.deviantart.com/gallery/", { "url": "63bfa8efba199e27181943c9060f6770f91a8441", "keyword": "741bbea4891a23335bb5d119c4a42aeb54702c50", })] def __init__(self, match): AsynchronousExtractor.__init__(self) self.session.cookies["agegate_state"] = "1" self.artist = match.group(1) def items(self): metadata = self.get_job_metadata() yield Message.Version, 1 yield Message.Directory, metadata for url, data in self.get_works(): data.update(metadata) yield Message.Url, url, data def get_works(self): """Yield all work-items for a deviantart-artist""" url = "http://{}.deviantart.com/gallery/".format(self.artist) params = {"catpath": "/", "offset": 0} while True: num = 0 page = self.request(url, params=params).text _, pos = text.extract(page, '
', ''), ('date' , 'title="', '"'), ), values={"index": self.index})[0] def get_image(self, page): """Find image-url and -dimensions""" # try preview data, pos = text.extract_all(page, ( ('image' , '"og:image" content="', '"'), ('width' , '"og:image:width" content="', '"'), ('height', '"og:image:height" content="', '"'), )) if data["image"].startswith("https://orig"): return data # try main image data, pos = text.extract_all(page, ( (None , 'class="dev-content-normal "', ''), ('image' , ' src="', '"'), ('width' , ' width="', '"'), ('height', ' height="', '"'), ), pos) if data["image"].startswith("https://orig"): return data # try download test, pos = text.extract(page, 'dev-page-download', '', pos) if test is not None: data, pos = text.extract_all(page, ( ('image' , 'href="', '"'), (None , '', ' '), ('width' , '', ' '), ('height', ' ', '<'), ), pos) response = self.session.head(text.unescape(data["image"])) data["image"] = response.headers["Location"] return data