Mike Fährmann 58e95a7487
share extractor and downloader sessions
There was never any "good" reason for the strict separation
between extractors and downloaders. This change allows for
reduced resource usage (probably unnoticeable) and fewer lines
of code at the "cost" of tighter coupling.
2017-06-30 19:38:14 +02:00

100 lines
3.5 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2014-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://imgchili.net/"""
from .common import Extractor, Message
from .. import text
class ImgchiliExtractor(Extractor):
    """Base class for imgchili extractors

    Subclasses provide get_job_metadata() and get_images(); items()
    fetches the matched URL once and feeds the page text to both.
    """
    category = "imgchili"
    url_base = "https://imgchili.net/"

    def __init__(self, match):
        """Remember the regex match and its full URL for later use"""
        super().__init__()
        self.match = match
        self.url = match.group(0)
        # use the site root as the Referer header on this session
        self.session.headers["Referer"] = self.url_base

    def items(self):
        """Yield the message sequence for the matched URL

        Emits Message.Version, then Message.Directory with the job
        metadata, then one Message.Url per pair from get_images().
        """
        content = self.request(self.url, encoding="utf-8").text
        metadata = self.get_job_metadata(content)
        yield Message.Version, 1
        yield Message.Directory, metadata
        for image_url, image_info in self.get_images(content):
            # the job-metadata dict itself is updated in place and the
            # same object is yielded again for every image
            metadata.update(image_info)
            yield Message.Url, image_url, metadata

    def get_job_metadata(self, page):
        """Return a dict with metadata for the whole extractor-job"""
        return {}

    def get_images(self, page):
        """Return an iterable of (image-url, metadata-dict) pairs"""
        return []
class ImgchiliImageExtractor(ImgchiliExtractor):
    """Extractor for single images from imgchili.net"""
    subcategory = "image"
    pattern = [r"(?:https?://)?(?:www\.)?imgchili\.net/show/\d+/(\d+)_[^/]+"]
    test = [(
        "http://imgchili.net/show/89427/89427136_test___quot;___gt;.png",
        {
            "url": "b93d92a6b58eb30a7ff6f9729cb748d25fea0c86",
            "keyword": "376c4584dfae7d7d2e88687d4ee9618bbfd0a35c",
        },
    )]

    def get_job_metadata(self, page):
        """Parse filename and gallery title from a single-image page

        As a side effect the image URL found on the page is stored in
        self.imgurl, which get_images() hands out afterwards.
        """
        extract = text.extract
        desc_name, pos = extract(page, '="description" content="', '. An ')
        called, pos = extract(page, 'image called ', '" />\n', pos)
        # empty end marker: presumably this only moves the search
        # position behind the start of the image_src <link> tag
        pos = extract(page, '<link rel="image_src"', '', pos)[1]
        self.imgurl = extract(page, ' href="', '"', pos)[0]

        # NOTE(review): if the 'image called ' section is not found and
        # extract() yields None for it, the split below raises -- confirm
        # whether that is the intended failure mode
        parts = called.split("in the gallery ")
        if parts[0].endswith("..."):
            # name ends with "..."; use the one from the description
            name = desc_name
        else:
            name = parts[0]

        metadata = {
            "image-id": self.match.group(1),
            "title": text.unescape(parts[-1]) if len(parts) > 1 else "",
        }
        return text.nameext_from_url(name, metadata)

    def get_images(self, page):
        """Return the one image URL stored by get_job_metadata()"""
        entry = (self.imgurl, {})
        return [entry]
class ImgchiliAlbumExtractor(ImgchiliExtractor):
    """Extractor for image-albums from imgchili.net"""
    subcategory = "album"
    directory_fmt = ["{category}", "{title} - {key}"]
    filename_fmt = "{num:>03} {filename}"
    pattern = [r"(?:https?://)?(?:www\.)?imgchili\.net/album/([^/]+)"]
    test = [(
        "http://imgchili.net/album/7a3824c59f77c8d39b260f9168d4b49b",
        {
            "url": "995e32b62c36d48b02ef4c7a7a19463924391e2a",
            "keyword": "2d065bd7f822de4c0b7598679f2730e0082a617e",
        },
    )]

    def get_job_metadata(self, page):
        """Return the album title (from the <h1> element) and album key"""
        heading = text.extract(page, "<h1>", "</h1>")[0]
        album_key = self.match.group(1)
        return {
            "title": text.unescape(heading),
            "key": album_key,
        }

    def get_images(self, page):
        """Yield (image-url, metadata) for each thumbnail on the page

        Images are numbered from 1 in the order they are found; the
        scan stops at the first thumbnail lookup that yields nothing.
        """
        extract = text.extract
        pos = 0
        num = 1
        while True:
            thumb, pos = extract(page, '<img src="http://t', 'jpg"', pos)
            if not thumb:
                break
            imgid, pos = extract(page, ' alt="', '_', pos)
            name, pos = extract(page, '<strong>', '</strong>', pos)
            data = text.nameext_from_url(
                name, {"image-id": imgid, "num": num})
            # swap the "http://t" prefix for "http://i" and append the
            # "extension" value from data in place of the cut-off "jpg"
            image_url = "http://i" + thumb + data["extension"]
            yield image_url, data
            num += 1