rewrite extractors to use config-module
This commit is contained in:
parent
608d3193a9
commit
3c13548f29
@ -112,13 +112,11 @@ class DownloadJob():
|
||||
scheme = url[:pos] if pos != -1 else "http"
|
||||
if scheme == "https":
|
||||
scheme = "http"
|
||||
|
||||
downloader = self.downloaders.get(scheme)
|
||||
if downloader is None:
|
||||
module = self.mngr.get_downloader_module(scheme)
|
||||
downloader = module.Downloader()
|
||||
self.downloaders[scheme] = downloader
|
||||
|
||||
return downloader
|
||||
|
||||
@staticmethod
|
||||
@ -148,7 +146,7 @@ class ExtractorFinder():
|
||||
if match:
|
||||
module = importlib.import_module(".extractor." + name, __package__)
|
||||
klass = getattr(module, module.info["extractor"])
|
||||
return klass(match, {}), module.info
|
||||
return klass(match), module.info
|
||||
else:
|
||||
print("no suitable extractor found")
|
||||
return None, None
|
||||
@ -158,9 +156,9 @@ class ExtractorFinder():
|
||||
for category in config.get(("extractor",)):
|
||||
patterns = config.get(("extractor", category, "pattern"), default=[])
|
||||
for pattern in patterns:
|
||||
match = re.match(pattern, url)
|
||||
if match:
|
||||
return category, match
|
||||
match = re.match(pattern, url)
|
||||
if match:
|
||||
return category, match
|
||||
for category, info in self.extractor_metadata():
|
||||
for pattern in info["pattern"]:
|
||||
match = re.match(pattern, url)
|
||||
|
@ -22,8 +22,8 @@ info = {
|
||||
|
||||
class ThreeDeeBooruExtractor(JSONBooruExtractor):
|
||||
|
||||
def __init__(self, match, config):
|
||||
JSONBooruExtractor.__init__(self, match, config, info)
|
||||
def __init__(self, match):
|
||||
JSONBooruExtractor.__init__(self, match, info)
|
||||
self.api_url = "http://behoimi.org/post/index.json"
|
||||
self.headers = {
|
||||
"Referer": "http://behoimi.org/post/show/",
|
||||
|
@ -25,8 +25,8 @@ class FourChanExtractor(ChanExtractor):
|
||||
api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
|
||||
file_url = "https://i.4cdn.org/{board}/{tim}{ext}"
|
||||
|
||||
def __init__(self, match, config):
|
||||
def __init__(self, match):
|
||||
ChanExtractor.__init__(
|
||||
self, config, info["category"],
|
||||
self, info["category"],
|
||||
match.group(1), match.group(2)
|
||||
)
|
||||
|
@ -25,8 +25,8 @@ class InfinityChanExtractor(ChanExtractor):
|
||||
api_url = "https://8ch.net/{board}/res/{thread}.json"
|
||||
file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"
|
||||
|
||||
def __init__(self, match, config):
|
||||
def __init__(self, match):
|
||||
ChanExtractor.__init__(
|
||||
self, config, info["category"],
|
||||
self, info["category"],
|
||||
match.group(1), match.group(2)
|
||||
)
|
||||
|
@ -27,8 +27,8 @@ class BatotoExtractor(AsynchronousExtractor):
|
||||
|
||||
url_base = "http://bato.to/read/_/"
|
||||
|
||||
def __init__(self, match, config):
|
||||
AsynchronousExtractor.__init__(self, config)
|
||||
def __init__(self, match):
|
||||
AsynchronousExtractor.__init__(self)
|
||||
self.chapter_id = match.group(1)
|
||||
|
||||
def items(self):
|
||||
|
@ -19,8 +19,8 @@ class BooruExtractor(SequentialExtractor):
|
||||
|
||||
api_url = ""
|
||||
|
||||
def __init__(self, match, config, info):
|
||||
SequentialExtractor.__init__(self, config)
|
||||
def __init__(self, match, info):
|
||||
SequentialExtractor.__init__(self)
|
||||
self.info = info
|
||||
self.tags = text.unquote(match.group(1))
|
||||
self.page = "page"
|
||||
|
@ -10,15 +10,14 @@
|
||||
|
||||
from .common import SequentialExtractor, Message
|
||||
from .. import text
|
||||
import re
|
||||
|
||||
class ChanExtractor(SequentialExtractor):
|
||||
|
||||
api_url = ""
|
||||
file_url = ""
|
||||
|
||||
def __init__(self, config, category, board, thread):
|
||||
SequentialExtractor.__init__(self, config)
|
||||
def __init__(self, category, board, thread):
|
||||
SequentialExtractor.__init__(self)
|
||||
self.metadata = {
|
||||
"category": category,
|
||||
"board": board,
|
||||
|
@ -12,7 +12,7 @@ import time
|
||||
import queue
|
||||
import requests
|
||||
import threading
|
||||
import html.parser
|
||||
from .. import config
|
||||
|
||||
|
||||
class Message():
|
||||
@ -47,15 +47,15 @@ class Extractor():
|
||||
|
||||
class SequentialExtractor(Extractor):
|
||||
|
||||
def __init__(self, _):
|
||||
def __init__(self):
|
||||
Extractor.__init__(self)
|
||||
|
||||
|
||||
class AsynchronousExtractor(Extractor):
|
||||
|
||||
def __init__(self, config):
|
||||
def __init__(self):
|
||||
Extractor.__init__(self)
|
||||
queue_size = int(config.get("general", "queue-size", fallback=5))
|
||||
queue_size = int(config.get(("queue-size",), default=5))
|
||||
self.__queue = queue.Queue(maxsize=queue_size)
|
||||
self.__thread = threading.Thread(target=self.async_items, daemon=True)
|
||||
|
||||
|
@ -22,6 +22,6 @@ info = {
|
||||
|
||||
class DanbooruExtractor(JSONBooruExtractor):
|
||||
|
||||
def __init__(self, match, config):
|
||||
JSONBooruExtractor.__init__(self, match, config, info)
|
||||
def __init__(self, match):
|
||||
JSONBooruExtractor.__init__(self, match, info)
|
||||
self.api_url = "https://danbooru.donmai.us/posts.json"
|
||||
|
@ -23,6 +23,6 @@ info = {
|
||||
|
||||
class E621Extractor(JSONBooruExtractor):
|
||||
|
||||
def __init__(self, match, config):
|
||||
JSONBooruExtractor.__init__(self, match, config, info)
|
||||
def __init__(self, match):
|
||||
JSONBooruExtractor.__init__(self, match, info)
|
||||
self.api_url = "https://e621.net/post/index.json"
|
||||
|
@ -22,8 +22,8 @@ info = {
|
||||
|
||||
class GelbooruExtractor(XMLBooruExtractor):
|
||||
|
||||
def __init__(self, match, config):
|
||||
XMLBooruExtractor.__init__(self, match, config, info)
|
||||
def __init__(self, match):
|
||||
XMLBooruExtractor.__init__(self, match, info)
|
||||
self.api_url = "http://gelbooru.com/"
|
||||
self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}
|
||||
|
||||
|
@ -25,8 +25,8 @@ class ImagebamExtractor(AsynchronousExtractor):
|
||||
|
||||
url_base = "http://www.imagebam.com"
|
||||
|
||||
def __init__(self, match, config):
|
||||
AsynchronousExtractor.__init__(self, config)
|
||||
def __init__(self, match):
|
||||
AsynchronousExtractor.__init__(self)
|
||||
self.match = match
|
||||
self.num = 0
|
||||
self.metadata = {}
|
||||
|
@ -26,8 +26,8 @@ class ImgboxExtractor(AsynchronousExtractor):
|
||||
|
||||
url_base = "http://imgbox.com"
|
||||
|
||||
def __init__(self, match, config):
|
||||
AsynchronousExtractor.__init__(self, config)
|
||||
def __init__(self, match):
|
||||
AsynchronousExtractor.__init__(self)
|
||||
self.key = match.group(1)
|
||||
self.metadata = {}
|
||||
|
||||
|
@ -24,8 +24,8 @@ info = {
|
||||
|
||||
class ImgchiliExtractor(SequentialExtractor):
|
||||
|
||||
def __init__(self, match, config):
|
||||
SequentialExtractor.__init__(self, config)
|
||||
def __init__(self, match):
|
||||
SequentialExtractor.__init__(self)
|
||||
self.match = match
|
||||
self.num = 0
|
||||
|
||||
|
@ -28,8 +28,8 @@ class MangaReaderExtractor(AsynchronousExtractor):
|
||||
|
||||
url_base = "http://www.mangareader.net"
|
||||
|
||||
def __init__(self, match, config):
|
||||
AsynchronousExtractor.__init__(self, config)
|
||||
def __init__(self, match):
|
||||
AsynchronousExtractor.__init__(self)
|
||||
self.part = match.group(1)
|
||||
|
||||
def items(self):
|
||||
|
@ -9,7 +9,7 @@
|
||||
"""Extract images from https://nijie.info/"""
|
||||
|
||||
from .common import AsynchronousExtractor, Message
|
||||
from ..text import filename_from_url
|
||||
from .. import config, text
|
||||
import re
|
||||
|
||||
info = {
|
||||
@ -26,8 +26,8 @@ class NijieExtractor(AsynchronousExtractor):
|
||||
|
||||
popup_url = "https://nijie.info/view_popup.php?id="
|
||||
|
||||
def __init__(self, match, config):
|
||||
AsynchronousExtractor.__init__(self, config)
|
||||
def __init__(self, match):
|
||||
AsynchronousExtractor.__init__(self)
|
||||
self.artist_id = match.group(1)
|
||||
self.artist_url = (
|
||||
"https://nijie.info/members_illust.php?id="
|
||||
@ -36,7 +36,9 @@ class NijieExtractor(AsynchronousExtractor):
|
||||
self.session.headers["Referer"] = self.artist_url
|
||||
self.session.cookies["R18"] = "1"
|
||||
self.session.cookies["nijie_referer"] = "nijie.info"
|
||||
self.session.cookies.update(config["nijie-cookies"])
|
||||
self.session.cookies.update(
|
||||
config.get(("extractor", info["category"], "cookies"))
|
||||
)
|
||||
|
||||
def items(self):
|
||||
data = self.get_job_metadata()
|
||||
@ -56,19 +58,19 @@ class NijieExtractor(AsynchronousExtractor):
|
||||
|
||||
def get_image_ids(self):
|
||||
"""Collect all image-ids for a specific artist"""
|
||||
text = self.request(self.artist_url).text
|
||||
page = self.request(self.artist_url).text
|
||||
regex = r'<a href="/view\.php\?id=(\d+)"'
|
||||
return [m.group(1) for m in re.finditer(regex, text)]
|
||||
return [m.group(1) for m in re.finditer(regex, page)]
|
||||
|
||||
def get_image_data(self, image_id):
|
||||
"""Get URL and metadata for images specified by 'image_id'"""
|
||||
text = self.request(self.popup_url + image_id).text
|
||||
matches = re.findall('<img src="([^"]+)"', text)
|
||||
page = self.request(self.popup_url + image_id).text
|
||||
matches = re.findall('<img src="([^"]+)"', page)
|
||||
for index, url in enumerate(matches):
|
||||
yield "https:" + url, {
|
||||
"count": len(matches),
|
||||
"index": index,
|
||||
"image-id": image_id,
|
||||
"name" : filename_from_url(url),
|
||||
"name" : text.filename_from_url(url),
|
||||
"extension": url[url.rfind(".")+1:],
|
||||
}
|
||||
|
@ -9,7 +9,7 @@
|
||||
"""Extract images and ugoira from http://www.pixiv.net/"""
|
||||
|
||||
from .common import SequentialExtractor, Message
|
||||
from .. import text
|
||||
from .. import config, text
|
||||
import re
|
||||
import json
|
||||
|
||||
@ -29,16 +29,15 @@ class PixivExtractor(SequentialExtractor):
|
||||
member_url = "http://www.pixiv.net/member_illust.php"
|
||||
illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"
|
||||
|
||||
def __init__(self, match, config):
|
||||
SequentialExtractor.__init__(self, config)
|
||||
self.config = config
|
||||
def __init__(self, match):
|
||||
SequentialExtractor.__init__(self)
|
||||
self.artist_id = match.group(1)
|
||||
self.api = PixivAPI(self.session)
|
||||
|
||||
def items(self):
|
||||
self.api.login(
|
||||
self.config.get("pixiv", "username"),
|
||||
self.config.get("pixiv", "password"),
|
||||
config.get(("extractor", "pixiv", "username")),
|
||||
config.get(("extractor", "pixiv", "password")),
|
||||
)
|
||||
metadata = self.get_job_metadata()
|
||||
|
||||
|
@ -28,8 +28,8 @@ class RedHawkScansExtractor(SequentialExtractor):
|
||||
|
||||
url_base = "https://manga.redhawkscans.com/reader/read/"
|
||||
|
||||
def __init__(self, match, config):
|
||||
SequentialExtractor.__init__(self, config)
|
||||
def __init__(self, match):
|
||||
SequentialExtractor.__init__(self)
|
||||
self.part = match.group(1)
|
||||
|
||||
def items(self):
|
||||
|
@ -22,6 +22,6 @@ info = {
|
||||
|
||||
class YandereExtractor(JSONBooruExtractor):
|
||||
|
||||
def __init__(self, match, config):
|
||||
JSONBooruExtractor.__init__(self, match, config, info)
|
||||
def __init__(self, match):
|
||||
JSONBooruExtractor.__init__(self, match, info)
|
||||
self.api_url = "https://yande.re/post.json"
|
||||
|
Loading…
x
Reference in New Issue
Block a user