rewrite extractors to use config-module

This commit is contained in:
Mike Fährmann 2015-10-05 15:35:48 +02:00
parent 608d3193a9
commit 3c13548f29
19 changed files with 54 additions and 56 deletions

View File

@ -112,13 +112,11 @@ class DownloadJob():
scheme = url[:pos] if pos != -1 else "http"
if scheme == "https":
scheme = "http"
downloader = self.downloaders.get(scheme)
if downloader is None:
module = self.mngr.get_downloader_module(scheme)
downloader = module.Downloader()
self.downloaders[scheme] = downloader
return downloader
@staticmethod
@ -148,7 +146,7 @@ class ExtractorFinder():
if match:
module = importlib.import_module(".extractor." + name, __package__)
klass = getattr(module, module.info["extractor"])
return klass(match, {}), module.info
return klass(match), module.info
else:
print("no suitable extractor found")
return None, None
@ -158,9 +156,9 @@ class ExtractorFinder():
for category in config.get(("extractor",)):
patterns = config.get(("extractor", category, "pattern"), default=[])
for pattern in patterns:
match = re.match(pattern, url)
if match:
return category, match
match = re.match(pattern, url)
if match:
return category, match
for category, info in self.extractor_metadata():
for pattern in info["pattern"]:
match = re.match(pattern, url)

View File

@ -22,8 +22,8 @@ info = {
class ThreeDeeBooruExtractor(JSONBooruExtractor):
def __init__(self, match, config):
JSONBooruExtractor.__init__(self, match, config, info)
def __init__(self, match):
JSONBooruExtractor.__init__(self, match, info)
self.api_url = "http://behoimi.org/post/index.json"
self.headers = {
"Referer": "http://behoimi.org/post/show/",

View File

@ -25,8 +25,8 @@ class FourChanExtractor(ChanExtractor):
api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
file_url = "https://i.4cdn.org/{board}/{tim}{ext}"
def __init__(self, match, config):
def __init__(self, match):
ChanExtractor.__init__(
self, config, info["category"],
self, info["category"],
match.group(1), match.group(2)
)

View File

@ -25,8 +25,8 @@ class InfinityChanExtractor(ChanExtractor):
api_url = "https://8ch.net/{board}/res/{thread}.json"
file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"
def __init__(self, match, config):
def __init__(self, match):
ChanExtractor.__init__(
self, config, info["category"],
self, info["category"],
match.group(1), match.group(2)
)

View File

@ -27,8 +27,8 @@ class BatotoExtractor(AsynchronousExtractor):
url_base = "http://bato.to/read/_/"
def __init__(self, match, config):
AsynchronousExtractor.__init__(self, config)
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.chapter_id = match.group(1)
def items(self):

View File

@ -19,8 +19,8 @@ class BooruExtractor(SequentialExtractor):
api_url = ""
def __init__(self, match, config, info):
SequentialExtractor.__init__(self, config)
def __init__(self, match, info):
SequentialExtractor.__init__(self)
self.info = info
self.tags = text.unquote(match.group(1))
self.page = "page"

View File

@ -10,15 +10,14 @@
from .common import SequentialExtractor, Message
from .. import text
import re
class ChanExtractor(SequentialExtractor):
api_url = ""
file_url = ""
def __init__(self, config, category, board, thread):
SequentialExtractor.__init__(self, config)
def __init__(self, category, board, thread):
SequentialExtractor.__init__(self)
self.metadata = {
"category": category,
"board": board,

View File

@ -12,7 +12,7 @@ import time
import queue
import requests
import threading
import html.parser
from .. import config
class Message():
@ -47,15 +47,15 @@ class Extractor():
class SequentialExtractor(Extractor):
def __init__(self, _):
def __init__(self):
Extractor.__init__(self)
class AsynchronousExtractor(Extractor):
def __init__(self, config):
def __init__(self):
Extractor.__init__(self)
queue_size = int(config.get("general", "queue-size", fallback=5))
queue_size = int(config.get(("queue-size",), default=5))
self.__queue = queue.Queue(maxsize=queue_size)
self.__thread = threading.Thread(target=self.async_items, daemon=True)

View File

@ -22,6 +22,6 @@ info = {
class DanbooruExtractor(JSONBooruExtractor):
def __init__(self, match, config):
JSONBooruExtractor.__init__(self, match, config, info)
def __init__(self, match):
JSONBooruExtractor.__init__(self, match, info)
self.api_url = "https://danbooru.donmai.us/posts.json"

View File

@ -23,6 +23,6 @@ info = {
class E621Extractor(JSONBooruExtractor):
def __init__(self, match, config):
JSONBooruExtractor.__init__(self, match, config, info)
def __init__(self, match):
JSONBooruExtractor.__init__(self, match, info)
self.api_url = "https://e621.net/post/index.json"

View File

@ -22,8 +22,8 @@ info = {
class GelbooruExtractor(XMLBooruExtractor):
def __init__(self, match, config):
XMLBooruExtractor.__init__(self, match, config, info)
def __init__(self, match):
XMLBooruExtractor.__init__(self, match, info)
self.api_url = "http://gelbooru.com/"
self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}

View File

@ -25,8 +25,8 @@ class ImagebamExtractor(AsynchronousExtractor):
url_base = "http://www.imagebam.com"
def __init__(self, match, config):
AsynchronousExtractor.__init__(self, config)
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.match = match
self.num = 0
self.metadata = {}

View File

@ -26,8 +26,8 @@ class ImgboxExtractor(AsynchronousExtractor):
url_base = "http://imgbox.com"
def __init__(self, match, config):
AsynchronousExtractor.__init__(self, config)
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.key = match.group(1)
self.metadata = {}

View File

@ -24,8 +24,8 @@ info = {
class ImgchiliExtractor(SequentialExtractor):
def __init__(self, match, config):
SequentialExtractor.__init__(self, config)
def __init__(self, match):
SequentialExtractor.__init__(self)
self.match = match
self.num = 0

View File

@ -28,8 +28,8 @@ class MangaReaderExtractor(AsynchronousExtractor):
url_base = "http://www.mangareader.net"
def __init__(self, match, config):
AsynchronousExtractor.__init__(self, config)
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.part = match.group(1)
def items(self):

View File

@ -9,7 +9,7 @@
"""Extract images from https://nijie.info/"""
from .common import AsynchronousExtractor, Message
from ..text import filename_from_url
from .. import config, text
import re
info = {
@ -26,8 +26,8 @@ class NijieExtractor(AsynchronousExtractor):
popup_url = "https://nijie.info/view_popup.php?id="
def __init__(self, match, config):
AsynchronousExtractor.__init__(self, config)
def __init__(self, match):
AsynchronousExtractor.__init__(self)
self.artist_id = match.group(1)
self.artist_url = (
"https://nijie.info/members_illust.php?id="
@ -36,7 +36,9 @@ class NijieExtractor(AsynchronousExtractor):
self.session.headers["Referer"] = self.artist_url
self.session.cookies["R18"] = "1"
self.session.cookies["nijie_referer"] = "nijie.info"
self.session.cookies.update(config["nijie-cookies"])
self.session.cookies.update(
config.get(("extractor", info["category"], "cookies"))
)
def items(self):
data = self.get_job_metadata()
@ -56,19 +58,19 @@ class NijieExtractor(AsynchronousExtractor):
def get_image_ids(self):
"""Collect all image-ids for a specific artist"""
text = self.request(self.artist_url).text
page = self.request(self.artist_url).text
regex = r'<a href="/view\.php\?id=(\d+)"'
return [m.group(1) for m in re.finditer(regex, text)]
return [m.group(1) for m in re.finditer(regex, page)]
def get_image_data(self, image_id):
"""Get URL and metadata for images specified by 'image_id'"""
text = self.request(self.popup_url + image_id).text
matches = re.findall('<img src="([^"]+)"', text)
page = self.request(self.popup_url + image_id).text
matches = re.findall('<img src="([^"]+)"', page)
for index, url in enumerate(matches):
yield "https:" + url, {
"count": len(matches),
"index": index,
"image-id": image_id,
"name" : filename_from_url(url),
"name" : text.filename_from_url(url),
"extension": url[url.rfind(".")+1:],
}

View File

@ -9,7 +9,7 @@
"""Extract images and ugoira from http://www.pixiv.net/"""
from .common import SequentialExtractor, Message
from .. import text
from .. import config, text
import re
import json
@ -29,16 +29,15 @@ class PixivExtractor(SequentialExtractor):
member_url = "http://www.pixiv.net/member_illust.php"
illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"
def __init__(self, match, config):
SequentialExtractor.__init__(self, config)
self.config = config
def __init__(self, match):
SequentialExtractor.__init__(self)
self.artist_id = match.group(1)
self.api = PixivAPI(self.session)
def items(self):
self.api.login(
self.config.get("pixiv", "username"),
self.config.get("pixiv", "password"),
config.get(("extractor", "pixiv", "username")),
config.get(("extractor", "pixiv", "password")),
)
metadata = self.get_job_metadata()

View File

@ -28,8 +28,8 @@ class RedHawkScansExtractor(SequentialExtractor):
url_base = "https://manga.redhawkscans.com/reader/read/"
def __init__(self, match, config):
SequentialExtractor.__init__(self, config)
def __init__(self, match):
SequentialExtractor.__init__(self)
self.part = match.group(1)
def items(self):

View File

@ -22,6 +22,6 @@ info = {
class YandereExtractor(JSONBooruExtractor):
def __init__(self, match, config):
JSONBooruExtractor.__init__(self, match, config, info)
def __init__(self, match):
JSONBooruExtractor.__init__(self, match, info)
self.api_url = "https://yande.re/post.json"