148 lines
4.1 KiB
Python
148 lines
4.1 KiB
Python
import re
|
|
import os
|
|
import json
|
|
import pprint
|
|
import asyncio
|
|
from urllib.request import urlopen
|
|
from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientProtocol
|
|
from ignoracle import Ignoracle, parameterize_record_info
|
|
|
|
# Connected WebSocket protocol instances. MyClientProtocol.onConnect appends
# to this list and handle_response reports through its first entry.
clients = []
|
|
|
|
class MyClientProtocol(WebSocketClientProtocol):
    """WebSocket client protocol used to report crawled URLs to the server."""

    def onConnect(self, response):
        """Announce the peer and register this connection in ``clients``."""
        peer = response.peer
        print("Connected to server: {}".format(peer))
        clients.append(self)

    def report(self, url):
        """Send ``url`` to the server as a UTF-8 encoded JSON object."""
        payload = json.dumps({"url": url})
        self.sendMessage(payload.encode('utf8'))
|
|
|
|
|
|
# Memo of ignore-set name -> pattern list, so each set is downloaded at most
# once per process.
cache = {}


def getPatternsForIgnoreSet(name):
    """Return the ``"patterns"`` entry of the named ArchiveBot ignore set.

    The JSON document is fetched from the ArchiveBot repository on GitHub the
    first time a name is requested and served from ``cache`` afterwards.

    name: ignore-set name (the file name without ``.json``); must be non-empty.

    Raises AssertionError for an empty name. Errors raised by ``urlopen`` or
    by JSON decoding propagate to the caller.
    """
    assert name != "", name
    if name in cache:
        return cache[name]
    print("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name)
    url = "https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/db/ignore_patterns/%s.json" % name
    # Close the HTTP response deterministically instead of leaving the socket
    # open until the garbage collector gets to it.
    with urlopen(url) as response:
        body = response.read().decode("utf-8")
    cache[name] = json.loads(body)["patterns"]
    return cache[name]
|
|
|
|
# Directory containing the "ignore_sets", "ignores" and "igoff" control files
# read by this hook. Raises KeyError at import time if HOOK_SETTINGS_DIR is
# not set in the environment.
hook_settings_dir = os.environ['HOOK_SETTINGS_DIR']
|
|
|
|
# Shared Ignoracle instance: update_ignoracle installs patterns into it and
# ignore_url_p queries it.
ignoracle = Ignoracle()
|
|
|
|
def mtime(f):
    """Return the modification time of path ``f`` (``os.stat(f).st_mtime``)."""
    return os.stat(f).st_mtime


class FileChangedWatcher(object):
    """Detect modifications of a file by comparing its mtime between polls."""

    def __init__(self, fname):
        """Remember ``fname`` and record its current mtime as the baseline."""
        self.fname = fname
        self.last_mtime = mtime(fname)

    def has_changed(self):
        """Return True if the mtime differs from the previously recorded one.

        The recorded mtime is updated on every call, so a change is reported
        exactly once.
        """
        # Stat the file once and use that single reading for both the
        # comparison and the new baseline. Comparing a second, later reading
        # would report a write that lands between the two stats twice.
        now_mtime = mtime(self.fname)
        changed = now_mtime != self.last_mtime
        self.last_mtime = now_mtime
        return changed
|
|
|
|
|
|
# Watches the "ignore_sets" settings file; polled by accept_url.
ignore_sets_w = FileChangedWatcher(os.path.join(hook_settings_dir, "ignore_sets"))
|
|
# Watches the "ignores" settings file; polled by accept_url.
ignores_w = FileChangedWatcher(os.path.join(hook_settings_dir, "ignores"))
|
|
|
|
def update_ignoracle():
    """Rebuild the ignore pattern set from the settings files and install it.

    Reads ``ignore_sets`` (comma-separated ignore-set names) and ``ignores``
    (one pattern per line) from ``hook_settings_dir``, merges the per-line
    patterns with the patterns of every named set, prints the result and
    passes it to ``ignoracle.set_patterns``.
    """
    with open(os.path.join(hook_settings_dir, "ignore_sets"), "r") as f:
        # Strip each name and drop blank entries: an empty file or a doubled
        # comma would otherwise pass "" to getPatternsForIgnoreSet, which
        # asserts on it, and padded names would end up in the fetched URL.
        ignore_sets = [name.strip() for name in f.read().split(',') if name.strip()]

    with open(os.path.join(hook_settings_dir, "ignores"), "r") as f:
        ignores = set(ig for ig in f.read().strip("\r\n").split('\n') if ig != "")

    for igset in ignore_sets:
        ignores.update(getPatternsForIgnoreSet(igset))

    print("Using these %d ignores:" % len(ignores))
    pprint.pprint(ignores)

    ignoracle.set_patterns(ignores)
|
|
|
|
# Load the initial pattern set when the hook script is first executed.
update_ignoracle()
|
|
|
|
|
|
def ignore_url_p(url, record_info):
    '''
    Ask the shared ignoracle whether ``url`` should be ignored.

    ``record_info`` is converted to keyword arguments with
    ``parameterize_record_info`` and the result of ``ignoracle.ignores`` is
    returned unchanged. accept_url treats a truthy result as the matching
    pattern.
    '''
    return ignoracle.ignores(url, **parameterize_record_info(record_info))
|
|
|
|
|
|
def accept_url(url_info, record_info, verdict, reasons):
    """Decide whether the URL in ``url_info`` should be fetched.

    Reloads the ignore patterns first if either settings file has changed.
    Returns False for ``data:`` URLs and for URLs matched by an ignore
    pattern; otherwise returns ``verdict`` unchanged. ``reasons`` is unused.
    """
    if ignore_sets_w.has_changed() or ignores_w.has_changed():
        update_ignoracle()

    url = url_info['url']

    # data: URLs aren't something you can grab, so drop them before any
    # ignore checking or ignore logging happens.
    if url.startswith('data:'):
        return False

    matched = ignore_url_p(url, record_info)
    if not matched:
        # None of our ignores apply, so keep the original verdict.
        return verdict

    # The "igoff" file only silences the log line; the URL is still rejected.
    igoff_path = os.path.join(hook_settings_dir, "igoff")
    if not os.path.exists(igoff_path):
        print("IGNOR %s by %s" % (url, matched))
    return False
|
|
|
|
|
|
def handle_response(url_info, record_info, error_info=None, http_info=None):
    """Report the URL in ``url_info`` through the first connected client.

    Does nothing when no client has connected yet. The remaining arguments
    are accepted but not used.
    """
    if not clients:
        return
    first_client = clients[0]
    first_client.report(url_info['url'])
|
|
|
|
|
|
# Regular expressions for server headers go here. handle_pre_response applies
# both with .match(), so they are anchored at the start of the string.

# Header names that begin with one of these prefixes (case-sensitive).
ICY_FIELD_PATTERN = re.compile('Icy-|Ice-|X-Audiocast-')

# Header values that begin with "icecast" in any letter case.
ICY_VALUE_PATTERN = re.compile('icecast', flags=re.I)
|
|
|
|
def maybe_log_ignore(url, pattern):
    """Print an ``IGNOR`` line for ``url`` unless the ``igoff`` file exists.

    Uses the same check and message format as the ignore logging in
    ``accept_url``. It is defined here because ``handle_pre_response`` calls
    it and no other definition is visible in this file.
    """
    if not os.path.exists(os.path.join(hook_settings_dir, "igoff")):
        print("IGNOR %s by %s" % (url, pattern))


def handle_pre_response(url_info, url_record, response_info):
    """Stop processing responses that look like they come from an ICY server.

    A response is treated as ICY when its ``version`` is ``'ICY'``, when any
    header name matches ``ICY_FIELD_PATTERN``, or when a ``Server`` header
    value matches ``ICY_VALUE_PATTERN``.

    Returns ``wpull_hook.actions.FINISH`` for such responses and
    ``wpull_hook.actions.NORMAL`` otherwise. ``url_record`` is unused.
    """
    url = url_info['url']

    # Check if the response version is exactly ICY
    if response_info.get('version', '') == 'ICY':
        maybe_log_ignore(url, '[icy version]')
        return wpull_hook.actions.FINISH

    # Loop through all the server headers for matches
    for field, value in response_info.get('fields', []):
        if ICY_FIELD_PATTERN.match(field):
            maybe_log_ignore(url, '[icy field]')
            return wpull_hook.actions.FINISH

        if field == 'Server' and ICY_VALUE_PATTERN.match(value):
            maybe_log_ignore(url, '[icy server]')
            return wpull_hook.actions.FINISH

    # Nothing matched, allow download
    return wpull_hook.actions.NORMAL
|
|
|
|
|
|
# Register this script's callbacks with wpull, using version 2 of the hook
# API. ``wpull_hook`` is not imported in this file; it is expected to already
# be present in the script's globals.
assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS

wpull_hook.callbacks.version = 2
for _hook in (accept_url, handle_response, handle_pre_response):
    # Each callback is installed under the attribute named after the function.
    setattr(wpull_hook.callbacks, _hook.__name__, _hook)
del _hook
|
|
|
|
|
|
# Connect to the local WebSocket server at import time so that
# handle_response has a client to report through. The port comes from
# GRAB_SITE_WS_PORT and defaults to 29001.
port = int(os.environ.get('GRAB_SITE_WS_PORT', 29001))

factory = WebSocketClientFactory()
factory.protocol = MyClientProtocol

loop = asyncio.get_event_loop()
coro = loop.create_connection(factory, '127.0.0.1', port)
# Blocks until the connection attempt has completed.
loop.run_until_complete(coro)
|