72 lines
2.0 KiB
Python
72 lines
2.0 KiB
Python
import re
|
|
import json
|
|
from urllib.request import urlopen
|
|
from ignoracle import Ignoracle, parameterize_record_info
|
|
|
|
def getPatternsForIgnoreSet(name):
    '''
    Download one of ArchiveBot's ignore sets and return its patterns.

    name is interpolated into the URL of db/ignore_patterns/<name>.json on
    the master branch of the ArchiveTeam/ArchiveBot GitHub repository. The
    response body is decoded as UTF-8, parsed as JSON, and the value stored
    under its "patterns" key is returned.

    Errors raised by urlopen(), the UTF-8 decode, json.loads() or the
    "patterns" lookup are not caught here and propagate to the caller.
    '''
    url = "https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/db/ignore_patterns/%s.json" % name
    # Use the response as a context manager so the connection is closed as
    # soon as the body has been read, instead of whenever the object happens
    # to be garbage collected.
    with urlopen(url) as response:
        body = response.read()
    return json.loads(body.decode("utf-8"))["patterns"]
|
|
|
|
# Module-level matcher consulted by ignore_url_p(). Its patterns are fetched
# over the network once, while this script is being loaded, from the
# 'global' ignore set.
ignoracle = Ignoracle()
ignoracle.set_patterns(getPatternsForIgnoreSet('global'))
|
|
|
|
|
|
def ignore_url_p(url, record_info):
    '''
    Ask the module-level ignoracle whether a URL is ignored.

    record_info is expanded by parameterize_record_info() into the keyword
    arguments passed to ignoracle.ignores(); that call's result is returned
    unchanged. accept_url() treats a truthy result as the pattern that
    matched and a falsy result as "not ignored".
    '''
    return ignoracle.ignores(url, **parameterize_record_info(record_info))
|
|
|
|
|
|
def accept_url(url_info, record_info, verdict, reasons):
    '''
    Decide whether a URL should be accepted.

    Returns False for data: URLs and for URLs matched by an ignore pattern
    (the latter are also reported on stdout). Otherwise the incoming verdict
    is returned unchanged. reasons is accepted but not used.
    '''
    target = url_info['url']

    # A data: URL carries its payload inline, so there is nothing to grab.
    # Reject it before the ignore check to skip both the pattern matching
    # and the ignore logging.
    if target.startswith('data:'):
        return False

    matched = ignore_url_p(target, record_info)
    if not matched:
        # No ignore pattern applies; keep the verdict we were given.
        return verdict

    print("IGNOR %s by %s" % (target, matched))
    return False
|
|
|
|
|
|
# Patterns used by handle_pre_response() to recognise Icecast-style servers.
# Both are applied with .match(), i.e. anchored at the start of the string.

# Header field names beginning with one of these prefixes (case-sensitive).
ICY_FIELD_PATTERN = re.compile(r'Icy-|Ice-|X-Audiocast-')

# "Server" header values beginning with "icecast", in any letter case.
ICY_VALUE_PATTERN = re.compile(r'icecast', flags=re.IGNORECASE)
|
|
|
|
def handle_pre_response(url_info, url_record, response_info):
    '''
    Inspect a response's status line and headers before the body is fetched.

    Returns wpull_hook.actions.FINISH, after reporting the URL on stdout,
    when the response looks like it comes from an Icecast-style server:

    * the response 'version' is exactly 'ICY', or
    * a header field name matches ICY_FIELD_PATTERN, or
    * the 'Server' header value matches ICY_VALUE_PATTERN.

    Otherwise returns wpull_hook.actions.NORMAL so the download proceeds.
    url_record is accepted but not used.

    NOTE(review): the header comparisons are case-sensitive on the field
    name; this presumably relies on wpull normalising header names --
    confirm against the wpull version in use.
    '''
    url = url_info['url']

    def finish(reason):
        # Report in the same format accept_url() uses. The previous code
        # called maybe_log_ignore(), which is not defined or imported in
        # this file and so raised NameError on every detected ICY response.
        print("IGNOR %s by %s" % (url, reason))
        return wpull_hook.actions.FINISH

    # The protocol version reported for the response is exactly "ICY".
    if response_info.get('version', '') == 'ICY':
        return finish('[icy version]')

    # Look through all the response headers for Icecast markers.
    for field, value in response_info.get('fields', []):
        if ICY_FIELD_PATTERN.match(field):
            return finish('[icy field]')

        if field == 'Server' and ICY_VALUE_PATTERN.match(value):
            return finish('[icy server]')

    # Nothing matched, allow download
    return wpull_hook.actions.NORMAL
|
|
|
|
|
|
# wpull_hook is not defined or imported in this file -- presumably wpull
# supplies it when it loads the script; confirm against the wpull docs.
# Fail at load time if version 2 of the hook API is not on offer.
assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS

# Select hook API version 2, then register the callbacks defined above.
wpull_hook.callbacks.version = 2
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.handle_pre_response = handle_pre_response
|