master
Ivan Kozik 2015-02-05 04:39:52 +00:00
parent 429b2032ff
commit 5f7593fda2
2 changed files with 20 additions and 7 deletions

View File

@ -26,7 +26,9 @@ done
# Note: we use the default html5lib parser instead of lxml (as ArchiveBot does)
IGNORE_SETS=$ignore_sets PYTHONPATH="$self" ~/.local/bin/wpull3 \
echo "global,$ignore_sets" > "$dir/ignore_sets"
HOOK_SETTINGS_DIR="$dir" PYTHONPATH="$self" ~/.local/bin/wpull3 \
-U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0" \
--header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
-o "$dir/wpull.log" \

View File

@ -4,18 +4,29 @@ import json
from urllib.request import urlopen
from ignoracle import Ignoracle, parameterize_record_info
# Pattern payloads already downloaded by this process, keyed by ignore-set
# name, so each set is fetched at most once.
cache = {}


def getPatternsForIgnoreSet(name):
    """Return the "patterns" entry of ArchiveBot's ignore set *name*.

    The set is downloaded from
    ArchiveBot/master/db/ignore_patterns/<name>.json on raw.githubusercontent.com
    the first time it is requested; later requests for the same name are
    answered from the module-level ``cache``.

    name -- ignore-set name; must not be the empty string.

    Raises AssertionError for an empty name, and whatever ``urlopen`` /
    ``json.loads`` raise when the download or the parse fails (nothing is
    cached in that case).
    """
    assert name != "", name
    if name in cache:
        return cache[name]
    print("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name)
    url = "https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/db/ignore_patterns/%s.json" % name
    # Close the HTTP response deterministically instead of leaving it to
    # the garbage collector.
    with urlopen(url) as response:
        payload = response.read()
    # Store before returning: returning the parsed value directly would skip
    # the cache and force a new download on every call.
    cache[name] = json.loads(payload.decode("utf-8"))["patterns"]
    return cache[name]
# Directory holding this crawl's hook settings.  Indexing os.environ (rather
# than using .get) makes a missing HOOK_SETTINGS_DIR fail right away with
# KeyError.
hook_settings_dir = os.environ['HOOK_SETTINGS_DIR']

# Ignoracle instance whose patterns are installed by update_ignoracle(),
# which reads the ignore-set names from the "ignore_sets" file in
# hook_settings_dir.  The names are deliberately not taken from the
# IGNORE_SETS environment variable here: a pattern set built at this point
# would never reach the ignoracle, and an unset variable would produce an
# empty set name.
ignoracle = Ignoracle()
def update_ignoracle():
    """Load the ignore patterns named in the settings file into ``ignoracle``.

    Reads the comma-separated ignore-set names from the "ignore_sets" file
    inside ``hook_settings_dir``, gathers the patterns of every named set via
    ``getPatternsForIgnoreSet`` and installs their union with
    ``ignoracle.set_patterns``.

    Raises OSError if the settings file cannot be opened, plus anything
    ``getPatternsForIgnoreSet`` raises for a set it cannot fetch.
    """
    with open(os.path.join(hook_settings_dir, "ignore_sets"), "r") as f:
        raw_names = f.read().strip("\r\n\t ,").split(',')

    # An empty file or a doubled comma ("a,,b") yields "" entries, which
    # getPatternsForIgnoreSet rejects with an assertion; padding around a
    # name would end up inside the fetched URL.  Normalize both away.
    set_names = [entry.strip() for entry in raw_names if entry.strip()]

    ignores = set()
    for igset in set_names:
        ignores.update(getPatternsForIgnoreSet(igset))

    ignoracle.set_patterns(ignores)


# Populate the ignoracle once up front.
update_ignoracle()
def ignore_url_p(url, record_info):