diff --git a/README.md b/README.md index 2cc5c07..e09fdfd 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,13 @@ Usage ``` ./grab-site URL -./grab-site URL --ignore-sets=blogs,forums -./grab-site URL --ignore-sets=blogs,forums --no-offsite-links +./grab-site URL --igsets=blogs,forums +./grab-site URL --igsets=blogs,forums --no-offsite-links ``` Note: `URL` must come before the options. -Note: `--ignore-sets=` must have the `=`. +Note: `--igsets=` means "ignore sets" and must have the `=`. `forums` and `blogs` are some frequently-used ignore sets. See [the full list of available ignore sets](https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns). diff --git a/dashboard.html b/dashboard.html index cbebbee..d10933b 100644 --- a/dashboard.html +++ b/dashboard.html @@ -1483,38 +1483,6 @@ var Dashboard = function() { var batchTimeWhenHidden = 5000; - var xhr = new XMLHttpRequest(); - xhr.onload = function() { - var recentLines = JSON.parse(xhr.responseText); - for(var i=0; i < recentLines.length; i++) { - this.handleData(recentLines[i]); - } - - this.queue = new BatchingQueue(function(queue) { - //console.log("Queue has ", queue.length, "items"); - for(var i=0; i < queue.length; i++) { - this.handleData(JSON.parse(queue[i])); - } - }.bind(this), batchTimeWhenVisible); - - this.decayer = new Decayer(1000, 1.5, 60000); - this.connectWebSocket(); - - document.addEventListener("visibilitychange", function() { - if(document.hidden) { - //console.log("Page has become hidden"); - this.queue.setMinInterval(batchTimeWhenHidden); - } else { - //console.log("Page has become visible"); - this.queue.setMinInterval(batchTimeWhenVisible); - this.queue.callNow(); - } - }.bind(this), false); - }.bind(this); - xhr.open("GET", "http://" + this.host + "/logs/recent?cb=" + Date.now() + Math.random()); - xhr.setRequestHeader('Accept', 'application/json'); - xhr.send(""); - document.onkeypress = this.keyPress.bind(this); // Adjust help text based on URL @@ -1527,6 +1495,27 @@ var Dashboard = function() { if(!showNicks) { document.write(''); } + + this.queue = new BatchingQueue(function(queue) { + //console.log("Queue has ", queue.length, "items"); + for(var i=0; i < queue.length; i++) { + this.handleData(JSON.parse(queue[i])); + } + }.bind(this), batchTimeWhenVisible); + + this.decayer = new Decayer(1000, 1.5, 60000); + this.connectWebSocket(); + + document.addEventListener("visibilitychange", function() { + if(document.hidden) { + //console.log("Page has become hidden"); + this.queue.setMinInterval(batchTimeWhenHidden); + } else { + //console.log("Page has become visible"); + this.queue.setMinInterval(batchTimeWhenVisible); + this.queue.callNow(); + } + }.bind(this), false); }; Dashboard.prototype.keyPress = function(ev) { diff --git a/grab-site b/grab-site index 949bb1e..895d837 100755 --- a/grab-site +++ b/grab-site @@ -15,7 +15,10 @@ span_hosts_allow="page-requisites,linked-pages" for arg in "$@"; do case $arg in --ignore-sets=*) - ignore_sets="${arg#*=}" + igsets="${arg#*=}" + ;; + --igsets=*) + igsets="${arg#*=}" ;; --no-offsite-links) span_hosts_allow="page-requisites" @@ -29,13 +32,13 @@ done echo echo "$id" > "$dir/id" -echo "global,$ignore_sets" > "$dir/ignore_sets" +echo "global,$igsets" > "$dir/igsets" touch "$dir/ignores" # Note: we use the default html5lib parser instead of the lxml that ArchiveBot uses # html5lib is slower, but is better at parsing and doesn't (rarely) corrupt the heap like lxml -HOOK_SETTINGS_DIR="$dir" PYTHONPATH="$self" ~/.local/bin/wpull3 \ +GRAB_SITE_WORKING_DIR="$dir" PYTHONPATH="$self" ~/.local/bin/wpull3 \ -U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0" \ --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \ --header="Accept-Language: en-US,en;q=0.5" \ diff --git a/server.py b/server.py index 39a6daf..f40686e 100644 --- a/server.py +++ b/server.py @@ -6,14 +6,53 @@ import aiohttp.web from autobahn.asyncio.websocket import WebSocketServerFactory, WebSocketServerProtocol class MyServerProtocol(WebSocketServerProtocol): + def __init__(self): + super().__init__() + def onConnect(self, request): - print("Client connecting: {}".format(request.peer)) + self.peer = request.peer + print("{} connected to WebSocket server".format(self.peer)) + self.factory.clients.add(self) + + def onClose(self, wasClean, code, reason): + print("{} disconnected from WebSocket server".format(self.peer)) + self.factory.clients.remove(self) + + def onMessage(self, payload, isBinary): + print(payload) + obj = json.loads(payload.decode('utf-8')) + type = obj["type"] + if type == "hello" and obj.get("mode"): + mode = obj['mode'] + if mode in ('dashboard', 'grabber'): + print("{} set mode {}".format(self.peer, mode)) + self.mode = mode + elif type == "download" or type == "stdout": + for client in self.factory.clients: + if client.mode == "dashboard": + client.sendMessage(json.dumps({ + "job_data": { + "ident": obj["ident"] + }, + "url": obj["url"], + "response_code": obj["response_code"], + "wget_code": obj["response_message"], + "type": type + })) def onMessage(self, payload, isBinary): print(payload) #self.sendMessage(payload, isBinary) +class MyServerFactory(WebSocketServerFactory): + protocol = MyServerProtocol + + def __init__(self): + super().__init__() + self.clients = set() + + dashboardHtml = open(os.path.join(os.path.dirname(__file__), "dashboard.html"), "rb").read() @asyncio.coroutine @@ -41,12 +80,11 @@ def main(): httpCoro = httpServer(loop, httpInterface, httpPort) loop.run_until_complete(httpCoro) - wsFactory = WebSocketServerFactory() - wsFactory.protocol = MyServerProtocol + wsFactory = MyServerFactory() wsCoro = loop.create_server(wsFactory, wsInterface, wsPort) loop.run_until_complete(wsCoro) - print("HTTP server started on {}:{}".format(httpInterface, httpPort)) + print(" HTTP server started on {}:{}".format(httpInterface, httpPort)) print("WebSocket server started on {}:{}".format(wsInterface, wsPort)) loop.run_forever() diff --git a/wpull_hooks.py b/wpull_hooks.py index bf7aa77..fa24595 100644 --- a/wpull_hooks.py +++ b/wpull_hooks.py @@ -8,17 +8,33 @@ from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientPr from ignoracle import Ignoracle, parameterize_record_info +wsClient = None + class MyClientProtocol(WebSocketClientProtocol): - def onConnect(self, response): - print("Connected to server: {}".format(response.peer)) - self.factory.client = self + def onOpen(self): + global wsClient + print("{} connected to WebSocket server".format(self.__class__.__name__)) + wsClient = self - def report(self, url): - self.sendMessage(json.dumps({"url": url}).encode('utf8')) + def onClose(self, reason, code): + # TODO: exponentially increasing delay (copy Decayer from dashboard) + connectToServer() + + def report(self, url, response_code, response_message): + self.sendMessage(json.dumps({ + "ident": grabId, + "type": "download", + "url": url, + "response_code": response_code, + "response_message": response_message, + }).encode('utf8')) -wsFactory = WebSocketClientFactory() -wsFactory.protocol = MyClientProtocol +class MyClientFactory(WebSocketClientFactory): + protocol = MyClientProtocol + + +wsFactory = MyClientFactory() def connectToServer(): loop = asyncio.get_event_loop() @@ -29,22 +45,24 @@ def connectToServer(): connectToServer() -cache = {} +igsetCache = {} def getPatternsForIgnoreSet(name): assert name != "", name - if name in cache: - return cache[name] + if name in igsetCache: + return igsetCache[name] print("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name) - cache[name] = json.loads(urlopen("https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/db/ignore_patterns/%s.json" % name).read().decode("utf-8"))["patterns"] - return cache[name] + igsetCache[name] = json.loads(urlopen( + "https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/" + + "master/db/ignore_patterns/%s.json" % name).read().decode("utf-8") + )["patterns"] + return igsetCache[name] -hook_settings_dir = os.environ['HOOK_SETTINGS_DIR'] - -ignoracle = Ignoracle() +workingDir = os.environ['GRAB_SITE_WORKING_DIR'] def mtime(f): return os.stat(f).st_mtime + class FileChangedWatcher(object): def __init__(self, fname): self.fname = fname @@ -57,17 +75,20 @@ class FileChangedWatcher(object): return changed -ignore_sets_w = FileChangedWatcher(os.path.join(hook_settings_dir, "ignore_sets")) -ignores_w = FileChangedWatcher(os.path.join(hook_settings_dir, "ignores")) +grabId = open(os.path.join(workingDir, "id")).read().strip() +igsetsWatcher = FileChangedWatcher(os.path.join(workingDir, "igsets")) +ignoresWatcher = FileChangedWatcher(os.path.join(workingDir, "ignores")) -def update_ignoracle(): - with open(os.path.join(hook_settings_dir, "ignore_sets"), "r") as f: - ignore_sets = f.read().strip("\r\n\t ,").split(',') +ignoracle = Ignoracle() - with open(os.path.join(hook_settings_dir, "ignores"), "r") as f: +def updateIgnoracle(): + with open(os.path.join(workingDir, "igsets"), "r") as f: + igsets = f.read().strip("\r\n\t ,").split(',') + + with open(os.path.join(workingDir, "ignores"), "r") as f: ignores = set(ig for ig in f.read().strip("\r\n").split('\n') if ig != "") - for igset in ignore_sets: + for igset in igsets: ignores.update(getPatternsForIgnoreSet(igset)) print("Using these %d ignores:" % len(ignores)) @@ -75,20 +96,20 @@ def update_ignoracle(): ignoracle.set_patterns(ignores) -update_ignoracle() +updateIgnoracle() -def ignore_url_p(url, record_info): - ''' +def shouldIgnoreURL(url, record_info): + """ Returns whether a URL should be ignored. - ''' + """ parameters = parameterize_record_info(record_info) return ignoracle.ignores(url, **parameters) -def accept_url(url_info, record_info, verdict, reasons): - if ignore_sets_w.has_changed() or ignores_w.has_changed(): - update_ignoracle() +def acceptUrl(url_info, record_info, verdict, reasons): + if igsetsWatcher.has_changed() or ignoresWatcher.has_changed(): + updateIgnoracle() url = url_info['url'] @@ -97,9 +118,9 @@ def accept_url(url_info, record_info, verdict, reasons): # checking and ignore logging. return False - pattern = ignore_url_p(url, record_info) + pattern = shouldIgnoreURL(url, record_info) if pattern: - if not os.path.exists(os.path.join(hook_settings_dir, "igoff")): + if not os.path.exists(os.path.join(workingDir, "igoff")): print("IGNOR %s by %s" % (url, pattern)) return False @@ -107,27 +128,27 @@ def accept_url(url_info, record_info, verdict, reasons): return verdict -def handle_result(url_info, record_info, error_info=None, http_info=None): - print("url_info", url_info) - print("record_info", record_info) - print("error_info", error_info) - print("http_info", http_info) - wsFactory.client.report(url_info['url']) +def handleResult(url_info, record_info, error_info={}, http_info={}): + #print("url_info", url_info) + #print("record_info", record_info) + #print("error_info", error_info) + #print("http_info", http_info) + wsClient.report(url_info['url'], http_info.get("response_code"), http_info.get("response_message")) -def handle_response(url_info, record_info, http_info): - return handle_result(url_info, record_info, http_info=http_info) +def handleResponse(url_info, record_info, http_info): + return handleResult(url_info, record_info, http_info=http_info) -def handle_error(url_info, record_info, error_info): - return handle_result(url_info, record_info, error_info=error_info) +def handleError(url_info, record_info, error_info): + return handleResult(url_info, record_info, error_info=error_info) # Regular expressions for server headers go here ICY_FIELD_PATTERN = re.compile('Icy-|Ice-|X-Audiocast-') ICY_VALUE_PATTERN = re.compile('icecast', re.IGNORECASE) -def handle_pre_response(url_info, url_record, response_info): +def handlePreResponse(url_info, url_record, response_info): url = url_info['url'] # Check if server version starts with ICY @@ -155,7 +176,7 @@ def handle_pre_response(url_info, url_record, response_info): assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS wpull_hook.callbacks.version = 2 -wpull_hook.callbacks.accept_url = accept_url -wpull_hook.callbacks.handle_response = handle_response -wpull_hook.callbacks.handle_error = handle_error -wpull_hook.callbacks.handle_pre_response = handle_pre_response +wpull_hook.callbacks.accept_url = acceptUrl +wpull_hook.callbacks.handle_response = handleResponse +wpull_hook.callbacks.handle_error = handleError +wpull_hook.callbacks.handle_pre_response = handlePreResponse