Make WebSocket client/server sort of work; rename ignore_sets to igsets
This commit is contained in:
parent
db21e530e2
commit
18a192739b
@ -28,13 +28,13 @@ Usage
|
||||
|
||||
```
|
||||
./grab-site URL
|
||||
./grab-site URL --ignore-sets=blogs,forums
|
||||
./grab-site URL --ignore-sets=blogs,forums --no-offsite-links
|
||||
./grab-site URL --igsets=blogs,forums
|
||||
./grab-site URL --igsets=blogs,forums --no-offsite-links
|
||||
```
|
||||
|
||||
Note: `URL` must come before the options.
|
||||
|
||||
Note: `--ignore-sets=` must have the `=`.
|
||||
Note: `--igsets=` means "ignore sets" and must have the `=`.
|
||||
|
||||
`forums` and `blogs` are some frequently-used ignore sets.
|
||||
See [the full list of available ignore sets](https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns).
|
||||
|
@ -1483,38 +1483,6 @@ var Dashboard = function() {
|
||||
|
||||
var batchTimeWhenHidden = 5000;
|
||||
|
||||
var xhr = new XMLHttpRequest();
|
||||
xhr.onload = function() {
|
||||
var recentLines = JSON.parse(xhr.responseText);
|
||||
for(var i=0; i < recentLines.length; i++) {
|
||||
this.handleData(recentLines[i]);
|
||||
}
|
||||
|
||||
this.queue = new BatchingQueue(function(queue) {
|
||||
//console.log("Queue has ", queue.length, "items");
|
||||
for(var i=0; i < queue.length; i++) {
|
||||
this.handleData(JSON.parse(queue[i]));
|
||||
}
|
||||
}.bind(this), batchTimeWhenVisible);
|
||||
|
||||
this.decayer = new Decayer(1000, 1.5, 60000);
|
||||
this.connectWebSocket();
|
||||
|
||||
document.addEventListener("visibilitychange", function() {
|
||||
if(document.hidden) {
|
||||
//console.log("Page has become hidden");
|
||||
this.queue.setMinInterval(batchTimeWhenHidden);
|
||||
} else {
|
||||
//console.log("Page has become visible");
|
||||
this.queue.setMinInterval(batchTimeWhenVisible);
|
||||
this.queue.callNow();
|
||||
}
|
||||
}.bind(this), false);
|
||||
}.bind(this);
|
||||
xhr.open("GET", "http://" + this.host + "/logs/recent?cb=" + Date.now() + Math.random());
|
||||
xhr.setRequestHeader('Accept', 'application/json');
|
||||
xhr.send("");
|
||||
|
||||
document.onkeypress = this.keyPress.bind(this);
|
||||
|
||||
// Adjust help text based on URL
|
||||
@ -1527,6 +1495,27 @@ var Dashboard = function() {
|
||||
if(!showNicks) {
|
||||
document.write('<style>.job-nick-aligned { width: 0; }</style>');
|
||||
}
|
||||
|
||||
this.queue = new BatchingQueue(function(queue) {
|
||||
//console.log("Queue has ", queue.length, "items");
|
||||
for(var i=0; i < queue.length; i++) {
|
||||
this.handleData(JSON.parse(queue[i]));
|
||||
}
|
||||
}.bind(this), batchTimeWhenVisible);
|
||||
|
||||
this.decayer = new Decayer(1000, 1.5, 60000);
|
||||
this.connectWebSocket();
|
||||
|
||||
document.addEventListener("visibilitychange", function() {
|
||||
if(document.hidden) {
|
||||
//console.log("Page has become hidden");
|
||||
this.queue.setMinInterval(batchTimeWhenHidden);
|
||||
} else {
|
||||
//console.log("Page has become visible");
|
||||
this.queue.setMinInterval(batchTimeWhenVisible);
|
||||
this.queue.callNow();
|
||||
}
|
||||
}.bind(this), false);
|
||||
};
|
||||
|
||||
Dashboard.prototype.keyPress = function(ev) {
|
||||
|
@ -15,7 +15,10 @@ span_hosts_allow="page-requisites,linked-pages"
|
||||
for arg in "$@"; do
|
||||
case $arg in
|
||||
--ignore-sets=*)
|
||||
ignore_sets="${arg#*=}"
|
||||
igsets="${arg#*=}"
|
||||
;;
|
||||
--igsets=*)
|
||||
igsets="${arg#*=}"
|
||||
;;
|
||||
--no-offsite-links)
|
||||
span_hosts_allow="page-requisites"
|
||||
@ -29,13 +32,13 @@ done
|
||||
|
||||
echo
|
||||
echo "$id" > "$dir/id"
|
||||
echo "global,$ignore_sets" > "$dir/ignore_sets"
|
||||
echo "global,$igsets" > "$dir/igsets"
|
||||
touch "$dir/ignores"
|
||||
|
||||
# Note: we use the default html5lib parser instead of the lxml that ArchiveBot uses
|
||||
# html5lib is slower, but is better at parsing and doesn't (rarely) corrupt the heap like lxml
|
||||
|
||||
HOOK_SETTINGS_DIR="$dir" PYTHONPATH="$self" ~/.local/bin/wpull3 \
|
||||
GRAB_SITE_WORKING_DIR="$dir" PYTHONPATH="$self" ~/.local/bin/wpull3 \
|
||||
-U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0" \
|
||||
--header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
|
||||
--header="Accept-Language: en-US,en;q=0.5" \
|
||||
|
46
server.py
46
server.py
@ -6,14 +6,53 @@ import aiohttp.web
|
||||
from autobahn.asyncio.websocket import WebSocketServerFactory, WebSocketServerProtocol
|
||||
|
||||
class MyServerProtocol(WebSocketServerProtocol):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def onConnect(self, request):
|
||||
print("Client connecting: {}".format(request.peer))
|
||||
self.peer = request.peer
|
||||
print("{} connected to WebSocket server".format(self.peer))
|
||||
self.factory.clients.add(self)
|
||||
|
||||
def onClose(self, wasClean, code, reason):
|
||||
print("{} disconnected from WebSocket server".format(self.peer))
|
||||
self.factory.clients.remove(self)
|
||||
|
||||
def onMessage(self, payload, isBinary):
|
||||
print(payload)
|
||||
obj = json.loads(payload.decode('utf-8'))
|
||||
type = obj["type"]
|
||||
if type == "hello" and obj.get("mode"):
|
||||
mode = obj['mode']
|
||||
if mode in ('dashboard', 'grabber'):
|
||||
print("{} set mode {}".format(self.peer, mode))
|
||||
self.mode = mode
|
||||
elif type == "download" or type == "stdout":
|
||||
for client in self.factory.clients:
|
||||
if client.mode == "dashboard":
|
||||
client.sendMessage(json.dumps({
|
||||
"job_data": {
|
||||
"ident": obj["ident"]
|
||||
},
|
||||
"url": obj["url"],
|
||||
"response_code": obj["response_code"],
|
||||
"wget_code": obj["response_message"],
|
||||
"type": type
|
||||
}))
|
||||
|
||||
def onMessage(self, payload, isBinary):
|
||||
print(payload)
|
||||
#self.sendMessage(payload, isBinary)
|
||||
|
||||
|
||||
class MyServerFactory(WebSocketServerFactory):
|
||||
protocol = MyServerProtocol
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.clients = set()
|
||||
|
||||
|
||||
dashboardHtml = open(os.path.join(os.path.dirname(__file__), "dashboard.html"), "rb").read()
|
||||
|
||||
@asyncio.coroutine
|
||||
@ -41,12 +80,11 @@ def main():
|
||||
httpCoro = httpServer(loop, httpInterface, httpPort)
|
||||
loop.run_until_complete(httpCoro)
|
||||
|
||||
wsFactory = WebSocketServerFactory()
|
||||
wsFactory.protocol = MyServerProtocol
|
||||
wsFactory = MyServerFactory()
|
||||
wsCoro = loop.create_server(wsFactory, wsInterface, wsPort)
|
||||
loop.run_until_complete(wsCoro)
|
||||
|
||||
print("HTTP server started on {}:{}".format(httpInterface, httpPort))
|
||||
print(" HTTP server started on {}:{}".format(httpInterface, httpPort))
|
||||
print("WebSocket server started on {}:{}".format(wsInterface, wsPort))
|
||||
|
||||
loop.run_forever()
|
||||
|
113
wpull_hooks.py
113
wpull_hooks.py
@ -8,17 +8,33 @@ from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientPr
|
||||
from ignoracle import Ignoracle, parameterize_record_info
|
||||
|
||||
|
||||
wsClient = None
|
||||
|
||||
class MyClientProtocol(WebSocketClientProtocol):
|
||||
def onConnect(self, response):
|
||||
print("Connected to server: {}".format(response.peer))
|
||||
self.factory.client = self
|
||||
def onOpen(self):
|
||||
global wsClient
|
||||
print("{} connected to WebSocket server".format(self.__class__.__name__))
|
||||
wsClient = self
|
||||
|
||||
def report(self, url):
|
||||
self.sendMessage(json.dumps({"url": url}).encode('utf8'))
|
||||
def onClose(self, reason, code):
|
||||
# TODO: exponentially increasing delay (copy Decayer from dashboard)
|
||||
connectToServer()
|
||||
|
||||
def report(self, url, response_code, response_message):
|
||||
self.sendMessage(json.dumps({
|
||||
"ident": grabId,
|
||||
"type": "download",
|
||||
"url": url,
|
||||
"response_code": response_code,
|
||||
"response_message": response_message,
|
||||
}).encode('utf8'))
|
||||
|
||||
|
||||
wsFactory = WebSocketClientFactory()
|
||||
wsFactory.protocol = MyClientProtocol
|
||||
class MyClientFactory(WebSocketClientFactory):
|
||||
protocol = MyClientProtocol
|
||||
|
||||
|
||||
wsFactory = MyClientFactory()
|
||||
|
||||
def connectToServer():
|
||||
loop = asyncio.get_event_loop()
|
||||
@ -29,22 +45,24 @@ def connectToServer():
|
||||
connectToServer()
|
||||
|
||||
|
||||
cache = {}
|
||||
igsetCache = {}
|
||||
def getPatternsForIgnoreSet(name):
|
||||
assert name != "", name
|
||||
if name in cache:
|
||||
return cache[name]
|
||||
if name in igsetCache:
|
||||
return igsetCache[name]
|
||||
print("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name)
|
||||
cache[name] = json.loads(urlopen("https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/db/ignore_patterns/%s.json" % name).read().decode("utf-8"))["patterns"]
|
||||
return cache[name]
|
||||
igsetCache[name] = json.loads(urlopen(
|
||||
"https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/" +
|
||||
"master/db/ignore_patterns/%s.json" % name).read().decode("utf-8")
|
||||
)["patterns"]
|
||||
return igsetCache[name]
|
||||
|
||||
hook_settings_dir = os.environ['HOOK_SETTINGS_DIR']
|
||||
|
||||
ignoracle = Ignoracle()
|
||||
workingDir = os.environ['GRAB_SITE_WORKING_DIR']
|
||||
|
||||
def mtime(f):
|
||||
return os.stat(f).st_mtime
|
||||
|
||||
|
||||
class FileChangedWatcher(object):
|
||||
def __init__(self, fname):
|
||||
self.fname = fname
|
||||
@ -57,17 +75,20 @@ class FileChangedWatcher(object):
|
||||
return changed
|
||||
|
||||
|
||||
ignore_sets_w = FileChangedWatcher(os.path.join(hook_settings_dir, "ignore_sets"))
|
||||
ignores_w = FileChangedWatcher(os.path.join(hook_settings_dir, "ignores"))
|
||||
grabId = open(os.path.join(workingDir, "id")).read().strip()
|
||||
igsetsWatcher = FileChangedWatcher(os.path.join(workingDir, "igsets"))
|
||||
ignoresWatcher = FileChangedWatcher(os.path.join(workingDir, "ignores"))
|
||||
|
||||
def update_ignoracle():
|
||||
with open(os.path.join(hook_settings_dir, "ignore_sets"), "r") as f:
|
||||
ignore_sets = f.read().strip("\r\n\t ,").split(',')
|
||||
ignoracle = Ignoracle()
|
||||
|
||||
with open(os.path.join(hook_settings_dir, "ignores"), "r") as f:
|
||||
def updateIgnoracle():
|
||||
with open(os.path.join(workingDir, "igsets"), "r") as f:
|
||||
igsets = f.read().strip("\r\n\t ,").split(',')
|
||||
|
||||
with open(os.path.join(workingDir, "ignores"), "r") as f:
|
||||
ignores = set(ig for ig in f.read().strip("\r\n").split('\n') if ig != "")
|
||||
|
||||
for igset in ignore_sets:
|
||||
for igset in igsets:
|
||||
ignores.update(getPatternsForIgnoreSet(igset))
|
||||
|
||||
print("Using these %d ignores:" % len(ignores))
|
||||
@ -75,20 +96,20 @@ def update_ignoracle():
|
||||
|
||||
ignoracle.set_patterns(ignores)
|
||||
|
||||
update_ignoracle()
|
||||
updateIgnoracle()
|
||||
|
||||
|
||||
def ignore_url_p(url, record_info):
|
||||
'''
|
||||
def shouldIgnoreURL(url, record_info):
|
||||
"""
|
||||
Returns whether a URL should be ignored.
|
||||
'''
|
||||
"""
|
||||
parameters = parameterize_record_info(record_info)
|
||||
return ignoracle.ignores(url, **parameters)
|
||||
|
||||
|
||||
def accept_url(url_info, record_info, verdict, reasons):
|
||||
if ignore_sets_w.has_changed() or ignores_w.has_changed():
|
||||
update_ignoracle()
|
||||
def acceptUrl(url_info, record_info, verdict, reasons):
|
||||
if igsetsWatcher.has_changed() or ignoresWatcher.has_changed():
|
||||
updateIgnoracle()
|
||||
|
||||
url = url_info['url']
|
||||
|
||||
@ -97,9 +118,9 @@ def accept_url(url_info, record_info, verdict, reasons):
|
||||
# checking and ignore logging.
|
||||
return False
|
||||
|
||||
pattern = ignore_url_p(url, record_info)
|
||||
pattern = shouldIgnoreURL(url, record_info)
|
||||
if pattern:
|
||||
if not os.path.exists(os.path.join(hook_settings_dir, "igoff")):
|
||||
if not os.path.exists(os.path.join(workingDir, "igoff")):
|
||||
print("IGNOR %s by %s" % (url, pattern))
|
||||
return False
|
||||
|
||||
@ -107,27 +128,27 @@ def accept_url(url_info, record_info, verdict, reasons):
|
||||
return verdict
|
||||
|
||||
|
||||
def handle_result(url_info, record_info, error_info=None, http_info=None):
|
||||
print("url_info", url_info)
|
||||
print("record_info", record_info)
|
||||
print("error_info", error_info)
|
||||
print("http_info", http_info)
|
||||
wsFactory.client.report(url_info['url'])
|
||||
def handleResult(url_info, record_info, error_info={}, http_info={}):
|
||||
#print("url_info", url_info)
|
||||
#print("record_info", record_info)
|
||||
#print("error_info", error_info)
|
||||
#print("http_info", http_info)
|
||||
wsClient.report(url_info['url'], http_info.get("response_code"), http_info.get("response_message"))
|
||||
|
||||
|
||||
def handle_response(url_info, record_info, http_info):
|
||||
return handle_result(url_info, record_info, http_info=http_info)
|
||||
def handleResponse(url_info, record_info, http_info):
|
||||
return handleResult(url_info, record_info, http_info=http_info)
|
||||
|
||||
|
||||
def handle_error(url_info, record_info, error_info):
|
||||
return handle_result(url_info, record_info, error_info=error_info)
|
||||
def handleError(url_info, record_info, error_info):
|
||||
return handleResult(url_info, record_info, error_info=error_info)
|
||||
|
||||
|
||||
# Regular expressions for server headers go here
|
||||
ICY_FIELD_PATTERN = re.compile('Icy-|Ice-|X-Audiocast-')
|
||||
ICY_VALUE_PATTERN = re.compile('icecast', re.IGNORECASE)
|
||||
|
||||
def handle_pre_response(url_info, url_record, response_info):
|
||||
def handlePreResponse(url_info, url_record, response_info):
|
||||
url = url_info['url']
|
||||
|
||||
# Check if server version starts with ICY
|
||||
@ -155,7 +176,7 @@ def handle_pre_response(url_info, url_record, response_info):
|
||||
assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS
|
||||
|
||||
wpull_hook.callbacks.version = 2
|
||||
wpull_hook.callbacks.accept_url = accept_url
|
||||
wpull_hook.callbacks.handle_response = handle_response
|
||||
wpull_hook.callbacks.handle_error = handle_error
|
||||
wpull_hook.callbacks.handle_pre_response = handle_pre_response
|
||||
wpull_hook.callbacks.accept_url = acceptUrl
|
||||
wpull_hook.callbacks.handle_response = handleResponse
|
||||
wpull_hook.callbacks.handle_error = handleError
|
||||
wpull_hook.callbacks.handle_pre_response = handlePreResponse
|
||||
|
Loading…
x
Reference in New Issue
Block a user