Make WebSocket client/server sort of work; rename ignore_sets to igsets

This commit is contained in:
Ivan Kozik 2015-07-18 02:11:18 +00:00
parent db21e530e2
commit 18a192739b
5 changed files with 139 additions and 88 deletions

View File

@ -28,13 +28,13 @@ Usage
```
./grab-site URL
./grab-site URL --ignore-sets=blogs,forums
./grab-site URL --ignore-sets=blogs,forums --no-offsite-links
./grab-site URL --igsets=blogs,forums
./grab-site URL --igsets=blogs,forums --no-offsite-links
```
Note: `URL` must come before the options.
Note: `--ignore-sets=` must have the `=`.
Note: `--igsets=` means "ignore sets" and must have the `=`.
`forums` and `blogs` are some frequently-used ignore sets.
See [the full list of available ignore sets](https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns).

View File

@ -1483,11 +1483,17 @@ var Dashboard = function() {
var batchTimeWhenHidden = 5000;
var xhr = new XMLHttpRequest();
xhr.onload = function() {
var recentLines = JSON.parse(xhr.responseText);
for(var i=0; i < recentLines.length; i++) {
this.handleData(recentLines[i]);
document.onkeypress = this.keyPress.bind(this);
// Adjust help text based on URL
Array.prototype.slice.call(document.querySelectorAll('.url-q-or-amp')).map(function(elem) {
if(window.location.search.indexOf("?") != -1) {
elem.textContent = "&";
}
});
if(!showNicks) {
document.write('<style>.job-nick-aligned { width: 0; }</style>');
}
this.queue = new BatchingQueue(function(queue) {
@ -1510,23 +1516,6 @@ var Dashboard = function() {
this.queue.callNow();
}
}.bind(this), false);
}.bind(this);
xhr.open("GET", "http://" + this.host + "/logs/recent?cb=" + Date.now() + Math.random());
xhr.setRequestHeader('Accept', 'application/json');
xhr.send("");
document.onkeypress = this.keyPress.bind(this);
// Adjust help text based on URL
Array.prototype.slice.call(document.querySelectorAll('.url-q-or-amp')).map(function(elem) {
if(window.location.search.indexOf("?") != -1) {
elem.textContent = "&";
}
});
if(!showNicks) {
document.write('<style>.job-nick-aligned { width: 0; }</style>');
}
};
Dashboard.prototype.keyPress = function(ev) {

View File

@ -15,7 +15,10 @@ span_hosts_allow="page-requisites,linked-pages"
for arg in "$@"; do
case $arg in
--ignore-sets=*)
ignore_sets="${arg#*=}"
igsets="${arg#*=}"
;;
--igsets=*)
igsets="${arg#*=}"
;;
--no-offsite-links)
span_hosts_allow="page-requisites"
@ -29,13 +32,13 @@ done
echo
echo "$id" > "$dir/id"
echo "global,$ignore_sets" > "$dir/ignore_sets"
echo "global,$igsets" > "$dir/igsets"
touch "$dir/ignores"
# Note: we use the default html5lib parser instead of the lxml that ArchiveBot uses
# html5lib is slower, but is better at parsing and doesn't (rarely) corrupt the heap like lxml
HOOK_SETTINGS_DIR="$dir" PYTHONPATH="$self" ~/.local/bin/wpull3 \
GRAB_SITE_WORKING_DIR="$dir" PYTHONPATH="$self" ~/.local/bin/wpull3 \
-U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0" \
--header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
--header="Accept-Language: en-US,en;q=0.5" \

View File

@ -6,14 +6,53 @@ import aiohttp.web
from autobahn.asyncio.websocket import WebSocketServerFactory, WebSocketServerProtocol
class MyServerProtocol(WebSocketServerProtocol):
def __init__(self):
super().__init__()
def onConnect(self, request):
print("Client connecting: {}".format(request.peer))
self.peer = request.peer
print("{} connected to WebSocket server".format(self.peer))
self.factory.clients.add(self)
def onClose(self, wasClean, code, reason):
print("{} disconnected from WebSocket server".format(self.peer))
self.factory.clients.remove(self)
def onMessage(self, payload, isBinary):
print(payload)
obj = json.loads(payload.decode('utf-8'))
type = obj["type"]
if type == "hello" and obj.get("mode"):
mode = obj['mode']
if mode in ('dashboard', 'grabber'):
print("{} set mode {}".format(self.peer, mode))
self.mode = mode
elif type == "download" or type == "stdout":
for client in self.factory.clients:
if client.mode == "dashboard":
client.sendMessage(json.dumps({
"job_data": {
"ident": obj["ident"]
},
"url": obj["url"],
"response_code": obj["response_code"],
"wget_code": obj["response_message"],
"type": type
}))
def onMessage(self, payload, isBinary):
print(payload)
#self.sendMessage(payload, isBinary)
class MyServerFactory(WebSocketServerFactory):
protocol = MyServerProtocol
def __init__(self):
super().__init__()
self.clients = set()
dashboardHtml = open(os.path.join(os.path.dirname(__file__), "dashboard.html"), "rb").read()
@asyncio.coroutine
@ -41,8 +80,7 @@ def main():
httpCoro = httpServer(loop, httpInterface, httpPort)
loop.run_until_complete(httpCoro)
wsFactory = WebSocketServerFactory()
wsFactory.protocol = MyServerProtocol
wsFactory = MyServerFactory()
wsCoro = loop.create_server(wsFactory, wsInterface, wsPort)
loop.run_until_complete(wsCoro)

View File

@ -8,17 +8,33 @@ from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientPr
from ignoracle import Ignoracle, parameterize_record_info
wsClient = None
class MyClientProtocol(WebSocketClientProtocol):
def onConnect(self, response):
print("Connected to server: {}".format(response.peer))
self.factory.client = self
def onOpen(self):
global wsClient
print("{} connected to WebSocket server".format(self.__class__.__name__))
wsClient = self
def report(self, url):
self.sendMessage(json.dumps({"url": url}).encode('utf8'))
def onClose(self, reason, code):
# TODO: exponentially increasing delay (copy Decayer from dashboard)
connectToServer()
def report(self, url, response_code, response_message):
self.sendMessage(json.dumps({
"ident": grabId,
"type": "download",
"url": url,
"response_code": response_code,
"response_message": response_message,
}).encode('utf8'))
wsFactory = WebSocketClientFactory()
wsFactory.protocol = MyClientProtocol
class MyClientFactory(WebSocketClientFactory):
protocol = MyClientProtocol
wsFactory = MyClientFactory()
def connectToServer():
loop = asyncio.get_event_loop()
@ -29,22 +45,24 @@ def connectToServer():
connectToServer()
cache = {}
igsetCache = {}
def getPatternsForIgnoreSet(name):
assert name != "", name
if name in cache:
return cache[name]
if name in igsetCache:
return igsetCache[name]
print("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name)
cache[name] = json.loads(urlopen("https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/db/ignore_patterns/%s.json" % name).read().decode("utf-8"))["patterns"]
return cache[name]
igsetCache[name] = json.loads(urlopen(
"https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/" +
"master/db/ignore_patterns/%s.json" % name).read().decode("utf-8")
)["patterns"]
return igsetCache[name]
hook_settings_dir = os.environ['HOOK_SETTINGS_DIR']
ignoracle = Ignoracle()
workingDir = os.environ['GRAB_SITE_WORKING_DIR']
def mtime(f):
return os.stat(f).st_mtime
class FileChangedWatcher(object):
def __init__(self, fname):
self.fname = fname
@ -57,17 +75,20 @@ class FileChangedWatcher(object):
return changed
ignore_sets_w = FileChangedWatcher(os.path.join(hook_settings_dir, "ignore_sets"))
ignores_w = FileChangedWatcher(os.path.join(hook_settings_dir, "ignores"))
grabId = open(os.path.join(workingDir, "id")).read().strip()
igsetsWatcher = FileChangedWatcher(os.path.join(workingDir, "igsets"))
ignoresWatcher = FileChangedWatcher(os.path.join(workingDir, "ignores"))
def update_ignoracle():
with open(os.path.join(hook_settings_dir, "ignore_sets"), "r") as f:
ignore_sets = f.read().strip("\r\n\t ,").split(',')
ignoracle = Ignoracle()
with open(os.path.join(hook_settings_dir, "ignores"), "r") as f:
def updateIgnoracle():
with open(os.path.join(workingDir, "igsets"), "r") as f:
igsets = f.read().strip("\r\n\t ,").split(',')
with open(os.path.join(workingDir, "ignores"), "r") as f:
ignores = set(ig for ig in f.read().strip("\r\n").split('\n') if ig != "")
for igset in ignore_sets:
for igset in igsets:
ignores.update(getPatternsForIgnoreSet(igset))
print("Using these %d ignores:" % len(ignores))
@ -75,20 +96,20 @@ def update_ignoracle():
ignoracle.set_patterns(ignores)
update_ignoracle()
updateIgnoracle()
def ignore_url_p(url, record_info):
'''
def shouldIgnoreURL(url, record_info):
"""
Returns whether a URL should be ignored.
'''
"""
parameters = parameterize_record_info(record_info)
return ignoracle.ignores(url, **parameters)
def accept_url(url_info, record_info, verdict, reasons):
if ignore_sets_w.has_changed() or ignores_w.has_changed():
update_ignoracle()
def acceptUrl(url_info, record_info, verdict, reasons):
if igsetsWatcher.has_changed() or ignoresWatcher.has_changed():
updateIgnoracle()
url = url_info['url']
@ -97,9 +118,9 @@ def accept_url(url_info, record_info, verdict, reasons):
# checking and ignore logging.
return False
pattern = ignore_url_p(url, record_info)
pattern = shouldIgnoreURL(url, record_info)
if pattern:
if not os.path.exists(os.path.join(hook_settings_dir, "igoff")):
if not os.path.exists(os.path.join(workingDir, "igoff")):
print("IGNOR %s by %s" % (url, pattern))
return False
@ -107,27 +128,27 @@ def accept_url(url_info, record_info, verdict, reasons):
return verdict
def handle_result(url_info, record_info, error_info=None, http_info=None):
print("url_info", url_info)
print("record_info", record_info)
print("error_info", error_info)
print("http_info", http_info)
wsFactory.client.report(url_info['url'])
def handleResult(url_info, record_info, error_info={}, http_info={}):
#print("url_info", url_info)
#print("record_info", record_info)
#print("error_info", error_info)
#print("http_info", http_info)
wsClient.report(url_info['url'], http_info.get("response_code"), http_info.get("response_message"))
def handle_response(url_info, record_info, http_info):
return handle_result(url_info, record_info, http_info=http_info)
def handleResponse(url_info, record_info, http_info):
return handleResult(url_info, record_info, http_info=http_info)
def handle_error(url_info, record_info, error_info):
return handle_result(url_info, record_info, error_info=error_info)
def handleError(url_info, record_info, error_info):
return handleResult(url_info, record_info, error_info=error_info)
# Regular expressions for server headers go here
ICY_FIELD_PATTERN = re.compile('Icy-|Ice-|X-Audiocast-')
ICY_VALUE_PATTERN = re.compile('icecast', re.IGNORECASE)
def handle_pre_response(url_info, url_record, response_info):
def handlePreResponse(url_info, url_record, response_info):
url = url_info['url']
# Check if server version starts with ICY
@ -155,7 +176,7 @@ def handle_pre_response(url_info, url_record, response_info):
assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS
wpull_hook.callbacks.version = 2
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.handle_response = handle_response
wpull_hook.callbacks.handle_error = handle_error
wpull_hook.callbacks.handle_pre_response = handle_pre_response
wpull_hook.callbacks.accept_url = acceptUrl
wpull_hook.callbacks.handle_response = handleResponse
wpull_hook.callbacks.handle_error = handleError
wpull_hook.callbacks.handle_pre_response = handlePreResponse