Allow specifying --ignore-sets NAME1,NAME2,...

This commit is contained in:
Ivan Kozik 2015-02-05 04:24:05 +00:00
parent a61ed949ca
commit eea440422d
2 changed files with 25 additions and 2 deletions

View File

@ -9,9 +9,24 @@ self=$(dirname "$0")
mkdir -p "$dir"
# Parse command-line options. An option is only examined while at least one
# more argument follows it, so a recognised flag always has a value in $2.
# -gt compares numerically; ">" inside [[ ]] is a string comparison.
while [[ $# -gt 1 ]]; do
	key="$1"
	case "$key" in
		-i|--ignore-sets)
			# Comma-separated ignore set names: NAME1,NAME2,...
			ignore_sets="$2"
			# Consume the value here; the flag itself is shifted below.
			shift
			;;
		*)
			# unknown option
			;;
	esac
	shift
done
# Note: we use the default html5lib parser instead of lxml (as ArchiveBot does)
PYTHONPATH="$self" ~/.local/bin/wpull3 \
IGNORE_SETS=$ignore_sets PYTHONPATH="$self" ~/.local/bin/wpull3 \
-U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0" \
--header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
-o "$dir/wpull.log" \

View File

@ -1,13 +1,21 @@
import re
import os
import json
from urllib.request import urlopen
from ignoracle import Ignoracle, parameterize_record_info
def getPatternsForIgnoreSet(name):
    """Download one ArchiveBot ignore set and return its patterns.

    name is substituted into the URL of a JSON file under
    ArchiveBot/master/db/ignore_patterns/ on raw.githubusercontent.com
    (without the .json extension).  Returns the value stored under the
    "patterns" key of that JSON document.

    An empty name fails the assertion; network and HTTP errors propagate
    from urlopen.
    """
    assert name != "", name
    print("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name)
    url = "https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/db/ignore_patterns/%s.json" % name
    # Close the HTTP response deterministically rather than leaving it to
    # the garbage collector.
    with urlopen(url) as response:
        body = response.read()
    return json.loads(body.decode("utf-8"))["patterns"]
# Names of the ignore sets to load, taken from the comma-separated
# IGNORE_SETS environment variable.  Blank entries are dropped: splitting an
# unset or empty variable yields [''], and an empty name would trip the
# assertion in getPatternsForIgnoreSet.  Whitespace around names is removed.
ignore_sets = {
    name.strip()
    for name in os.environ.get('IGNORE_SETS', '').split(',')
    if name.strip()
}
# The 'global' set is always included, whether or not it was requested.
ignore_sets.add('global')
# Create the Ignoracle and give it the patterns of the 'global' ignore set.
ignoracle = Ignoracle()
global_patterns = getPatternsForIgnoreSet('global')
ignoracle.set_patterns(global_patterns)

# Collect the patterns of every selected ignore set into a single set, so a
# pattern that appears in several ignore sets is kept only once.
ignores = set()
for igset in ignore_sets:
    ignores |= set(getPatternsForIgnoreSet(igset))
def ignore_url_p(url, record_info):