update download-infrastructure

This commit is contained in:
Mike Fährmann 2015-04-08 01:51:48 +02:00
parent 513808d156
commit 0abbee3710

View File

@ -6,14 +6,13 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
""" """
import os import os
import sys import sys
import re import re
import sqlite3 import sqlite3
import importlib import importlib
from extractor.common import Message
class DownloadManager(): class DownloadManager():
@ -21,35 +20,94 @@ class DownloadManager():
self.opts = opts self.opts = opts
self.conf = conf self.conf = conf
self.downloaders = {} self.downloaders = {}
self.extractors = ExtractorFinder(conf)
def add(self, extr): def add(self, url):
job = DownloadJob(self, url)
job.run()
def get_downloader_module(self, scheme):
    """Return a downloader module suitable for 'scheme'

    Modules are imported on first use from the ".downloader" sub-package
    and remembered in self.downloaders, so later requests for the same
    scheme return the already imported module.
    """
    cached = self.downloaders.get(scheme)
    if cached is not None:
        return cached

    # first request for this scheme: import and remember the module
    loaded = importlib.import_module(".downloader." + scheme, __package__)
    self.downloaders[scheme] = loaded
    return loaded
def get_base_directory(self):
if self.opts.dest: if self.opts.dest:
dest = self.opts.dest return self.opts.dest
elif extr.category in self.conf:
dest = self.conf[extr.category].get("destination", "/tmp/")
else: else:
dest = self.conf["general"].get("destination", "/tmp/") return self.conf["general"].get("destination", "/tmp/")
dest = os.path.join(dest, extr.category, extr.directory)
os.makedirs(dest, exist_ok=True)
for url, filename in extr:
path = os.path.join(dest, filename)
if os.path.exists(path):
self.print_skip(path)
continue
dl = self.get_downloader(extr, url)
self.print_start(path)
tries = dl.download(url, path)
self.print_success(path, tries)
def get_downloader(self, extr, url): class DownloadJob():
end = url.find("://")
proto = url[:end] if end != -1 else "http" def __init__(self, mngr, url):
if proto not in self.downloaders: self.mngr = mngr
# import downloader self.extractor, self.info = mngr.extractors.get_for_url(url)
module = importlib.import_module("."+proto, __package__) self.directory = mngr.get_base_directory()
self.downloaders[proto] = module.Downloader self.downloaders = {}
return self.downloaders[proto](extr)
def run(self):
    """Execute/Run the download job

    Iterates over the messages produced by the extractor and dispatches
    on the first element of each message:
    - Message.Url       -> download()
    - Message.Directory -> set_directory()
    - Message.Version   -> checked against the only supported version (1)

    Returns None; does nothing if no extractor is set.

    Raises ValueError if a Message.Version message announces a version
    other than 1.
    """
    if self.extractor is None:
        return # TODO: error msg

    for msg in self.extractor:
        # NOTE(review): these two prints look like leftover debug output
        # -- confirm and remove
        print(msg)
        print(type(msg))

        if msg[0] == Message.Url:
            self.download(msg)
        elif msg[0] == Message.Directory:
            self.set_directory(msg)
        elif msg[0] == Message.Version:
            if msg[1] != 1:
                # Only BaseException instances can be raised in Python 3;
                # raising a plain string fails with an unrelated TypeError.
                # 'info' is subscripted like a mapping everywhere else in
                # this class, so it is read as one here as well.
                # assumes it carries a "category" key -- TODO confirm
                raise ValueError(
                    "unsupported message-version ({}, {})".format(
                        self.info.get("category"), msg[1]
                    )
                )
            # TODO: support for multiple message versions
def download(self, msg):
    """Fetch the file described by the URL-message 'msg'

    'msg' is unpacked into (message-type, url, metadata). The filename is
    built by formatting self.info["filename"] with the metadata and is
    placed inside self.directory. A file that already exists is reported
    via print_skip() and not downloaded again.
    """
    _kind, url, keywords = msg
    name = self.info["filename"].format(**keywords)
    target = os.path.join(self.directory, name)

    if not os.path.exists(target):
        downloader = self.get_downloader(url)
        self.print_start(target)
        attempts = downloader.download(url, target)
        self.print_success(target, attempts)
    else:
        self.print_skip(target)
def set_directory(self, msg):
    """Set and create the target directory for downloads

    Every entry of self.info["directory"] is formatted with the metadata
    found in msg[1]; the resulting segments are appended to the manager's
    base directory and the whole path is created if it does not exist.
    """
    segments = [
        template.format(**msg[1])
        for template in self.info["directory"]
    ]
    base = self.mngr.get_base_directory()
    self.directory = os.path.join(base, *segments)
    os.makedirs(self.directory, exist_ok=True)
def get_downloader(self, url):
    """Return, and possibly construct, a downloader suitable for 'url'

    The scheme is everything in front of the first ":" of 'url'; URLs
    without a ":" are treated as "http", and "https" shares the "http"
    downloader. Downloader objects are created once per scheme and kept
    in self.downloaders.
    """
    scheme, colon, _rest = url.partition(":")
    if not colon or scheme == "https":
        scheme = "http"

    cached = self.downloaders.get(scheme)
    if cached is not None:
        return cached

    # nothing cached for this scheme yet: build a new downloader object
    instance = self.mngr.get_downloader_module(scheme).Downloader(self.extractor)
    self.downloaders[scheme] = instance
    return instance
@staticmethod @staticmethod
def print_start(path): def print_start(path):
@ -78,6 +136,17 @@ class ExtractorFinder():
self.load_from_database(conn) self.load_from_database(conn)
self.load_from_config(config) self.load_from_config(config)
def get_for_url(self, url):
"""Return an extractor instance and its module info for 'url'

Imports the hard-coded ".extractor.8chan" module and tries the regular
expressions in its info["pattern"] against 'url' with re.match(). On a
match, the class named by info["extractor"] is looked up in the module,
instantiated with the match object and self.config, and returned together
with module.info as a tuple.

On a mismatch "pattern mismatch" is printed and the process is ended with
sys.exit().
"""
# TODO: implement general case
module = importlib.import_module(".extractor.8chan", __package__)
for pattern in module.info["pattern"]:
match = re.match(pattern, url)
if match:
# the extractor class is referenced by name in the module's info
klass = getattr(module, module.info["extractor"])
return klass(match, self.config), module.info
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether the two lines below run after the first non-matching pattern or
# only after all patterns failed -- confirm against the real file.
# sys.exit() is called without an argument.
print("pattern mismatch")
sys.exit()
def match(self, url): def match(self, url):
for category, regex in self.match_list: for category, regex in self.match_list:
match = regex.match(url) match = regex.match(url)