Allow specifying --ignore-sets NAME1,NAME2,...
This commit is contained in:
parent
a61ed949ca
commit
eea440422d
17
grab-site.sh
17
grab-site.sh
@@ -9,9 +9,24 @@ self=$(dirname "$0")
|
||||
|
||||
mkdir -p "$dir"
|
||||
|
||||
# Parse leading command-line options.
#
# An argument is only inspected while at least one more argument follows it,
# so every option examined here is guaranteed to have a "$2" available.
# Recognised options:
#   -i, --ignore-sets NAME1,NAME2,...   stored verbatim in $ignore_sets
# Anything else is silently skipped.
#
# The comparison is arithmetic: inside [[ ]] the > operator compares strings
# lexicographically, which is not what an argument count needs.
while (( $# > 1 )); do
  key="$1"

  case "$key" in
    -i|--ignore-sets)
      ignore_sets="$2"
      # Drop the option name here; the shared shift below drops its value.
      shift
      ;;
    *)
      # unknown option
      ;;
  esac
  shift
done
|
||||
|
||||
# Note: we use the default html5lib parser instead of lxml (as ArchiveBot does)
|
||||
|
||||
PYTHONPATH="$self" ~/.local/bin/wpull3 \
|
||||
IGNORE_SETS=$ignore_sets PYTHONPATH="$self" ~/.local/bin/wpull3 \
|
||||
-U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0" \
|
||||
--header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
|
||||
-o "$dir/wpull.log" \
|
||||
|
@@ -1,13 +1,21 @@
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
from urllib.request import urlopen
|
||||
from ignoracle import Ignoracle, parameterize_record_info
|
||||
|
||||
def getPatternsForIgnoreSet(name):
    """Download one ArchiveBot ignore set and return its patterns.

    Fetches ``db/ignore_patterns/<name>.json`` from the master branch of the
    ArchiveTeam/ArchiveBot repository on raw.githubusercontent.com, decodes
    it as UTF-8 JSON and returns the value stored under its ``"patterns"``
    key.

    :param name: ignore set name, without the ``.json`` suffix; must not be
        the empty string.
    :raises AssertionError: if ``name`` is empty.
    :raises KeyError: if the document has no ``"patterns"`` key.

    Errors raised by ``urlopen`` or ``json.loads`` propagate unchanged.
    """
    assert name != "", name
    print("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name)
    url = "https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/master/db/ignore_patterns/%s.json" % name
    # Use the response as a context manager so the connection is closed even
    # when reading or decoding fails.
    with urlopen(url) as response:
        body = response.read().decode("utf-8")
    return json.loads(body)["patterns"]
|
||||
|
||||
# Names of the ignore sets to load, read from the comma-separated IGNORE_SETS
# environment variable. Blank entries are dropped: splitting an unset or empty
# value yields [''], and an empty name would trip the assertion in
# getPatternsForIgnoreSet. Surrounding whitespace on each name is trimmed.
ignore_sets = set(
    name.strip()
    for name in os.environ.get('IGNORE_SETS', '').split(',')
    if name.strip()
)
# The 'global' set is always loaded, whatever the caller asked for.
ignore_sets.add('global')

ignoracle = Ignoracle()
# NOTE(review): ignoracle is given only the 'global' patterns here, while the
# union of every requested set is accumulated in `ignores` below - confirm
# that the combined set is applied wherever `ignores` is consumed.
ignoracle.set_patterns(getPatternsForIgnoreSet('global'))
ignores = set()
for igset in ignore_sets:
    ignores.update(getPatternsForIgnoreSet(igset))
|
||||
|
||||
|
||||
def ignore_url_p(url, record_info):
|
||||
|
Loading…
x
Reference in New Issue
Block a user