#!/bin/bash
#
# Start a recursive wpull crawl of a single start URL.
#
# Usage: <this script> URL [OPTION]...
#
# This opening section validates the start URL, derives a unique working
# directory name from it, creates that directory, and sets the default crawl
# settings (which the option parsing further down may override).

set -e

# The start URL must be the first positional argument. Fail early with a
# usage message rather than letting mkdir choke on a name beginning with "-".
url="${1:-}"
if [[ -z "$url" ]]; then
  printf 'Usage: %s URL [OPTION]...\n' "${0##*/}" >&2
  exit 2
fi

# 16 random bytes rendered as 32 hex characters; identifies this crawl.
id="$(python3 -c "import os, binascii; print(binascii.hexlify(os.urandom(16)).decode('utf-8'))")"

# Working directory name: the URL with its ftp/http/https scheme and trailing
# slashes removed, then every "/", "?" and "&" (and "\", which is literal
# inside a bracket expression) replaced by "-", followed by today's date
# (YYYY-MM-DD) and the first 8 hex digits of the id.
dir_stem="$(
  printf '%s' "$url" \
    | sed -E -e 's,^(ftp|https?)://,,' -e 's,/+$,,' -e 's,[/\?&],-,g'
)"
dir="${dir_stem}-$(date +%F)-${id:0:8}"

# Directory containing this script; patched-wpull is run from there.
self=$(dirname "$0")

mkdir -p -- "$dir"

# Default crawl settings.
level="inf"                                        # passed to wpull --level
concurrency="2"                                    # passed to wpull --concurrent
page_requisites_level="5"                          # passed to wpull --page-requisites-level
span_hosts_allow="page-requisites,linked-pages"    # passed to wpull --span-hosts-allow
# Start empty so a same-named environment variable cannot leak into the
# igsets file.
igsets=""
#######################################
# Apply command-line overrides to the crawl settings.
#
# Recognized arguments:
#   --ignore-sets=LIST | --igsets=LIST     sets igsets
#   --level=N                              sets level
#   --page-requisites-level=N              sets page_requisites_level
#   --concurrency=N | --concurrent=N       sets concurrency
#   --no-offsite-links                     sets span_hosts_allow to
#                                          "page-requisites"
# For --name=value forms the value is everything after the first "=".
# Unrecognized arguments that start with "-" produce a warning on stderr and
# are otherwise ignored; all other arguments (such as the start URL) are
# skipped silently.
#
# Globals:   igsets, level, page_requisites_level, concurrency,
#            span_hosts_allow (written)
# Arguments: the script's command-line arguments
# Returns:   0
#######################################
parse_options() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --ignore-sets=* | --igsets=*)
        igsets="${arg#*=}"
        ;;
      --level=*)
        level="${arg#*=}"
        ;;
      --page-requisites-level=*)
        page_requisites_level="${arg#*=}"
        ;;
      --concurrency=* | --concurrent=*)
        concurrency="${arg#*=}"
        ;;
      --no-offsite-links)
        span_hosts_allow="page-requisites"
        ;;
      -*)
        # Surface typos instead of silently dropping them.
        printf 'warning: ignoring unknown option: %s\n' "$arg" >&2
        ;;
      *)
        # Positional argument (e.g. the start URL); nothing to do here.
        ;;
    esac
  done
  return 0
}

# The positional parameters are only read, never shifted away.
parse_options "$@"
#######################################
# Record this crawl's parameters as small text files in the working
# directory:
#   id           the crawl id
#   start_url    the start URL
#   concurrency  the concurrency setting
#   igsets       "global," followed by the requested ignore sets
#   igoff        created empty if absent
#   ignores      created empty if absent
# NOTE(review): these files are presumably read by the libgrabsite
# plugin/hooks at run time - confirm against that code.
#
# printf is used instead of echo so that values such as "-n" or "-e" are
# written literally rather than being taken as echo options.
#
# Globals:   dir, id, url, concurrency, igsets (read)
# Returns:   1 if dir is empty (which would otherwise write into "/"),
#            otherwise the status of the last write
#######################################
write_job_metadata() {
  if [[ -z "${dir:-}" ]]; then
    printf 'error: working directory is not set\n' >&2
    return 1
  fi

  printf '%s\n' "$id" > "$dir/id"
  printf '%s\n' "$url" > "$dir/start_url"
  printf '%s\n' "$concurrency" > "$dir/concurrency"
  printf 'global,%s\n' "$igsets" > "$dir/igsets"
  touch -- "$dir/igoff" "$dir/ignores"
}

# Blank line on stdout before the crawl starts.
echo
write_job_metadata
# Ask Python where the libgrabsite package is installed; its plugin.py and
# wpull_hooks.py are handed to wpull below.
LIBGRABSITE="$(python3 -c 'import os, libgrabsite; print(os.path.dirname(libgrabsite.__file__))')"

# Parser note: wpull is left on its default html5lib parser rather than the
# lxml parser that ArchiveBot uses. html5lib is slower, but it parses better
# and does not (or only rarely) corrupt the heap the way lxml does.
wpull_args=(
  # Request headers.
  -U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0"
  --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  --header="Accept-Language: en-US,en;q=0.5"

  # Log, database, scripts and cookies; all state lives in the working dir.
  -o "$dir/wpull.log"
  --database "$dir/wpull.db"
  --plugin-script "$LIBGRABSITE/plugin.py"
  --python-script "$LIBGRABSITE/wpull_hooks.py"
  --plugin-args " --dupes-db $dir/dupes_db"
  --save-cookies "$dir/cookies.txt"

  # Fetch behaviour.
  --no-check-certificate
  --delete-after
  --no-robots
  --page-requisites
  --no-parent
  --sitemaps
  --inet4-only
  --timeout 20
  --tries 3
  --concurrent "$concurrency"
  --waitretry 5

  # WARC output; 5368709120 bytes = 5 GiB per file.
  --warc-file "$dir/$dir"
  --warc-max-size 5368709120

  --debug-manhole
  --strip-session-id
  --escaped-fragment
  --monitor-disk 400m
  --monitor-memory 10k
  --max-redirect 8

  # Recursion limits and host spanning.
  --recursive
  --level "$level"
  --page-requisites-level "$page_requisites_level"
  --span-hosts-allow "$span_hosts_allow"

  --quiet
  "$url"
)

# GRAB_SITE_WORKING_DIR is exported to the wpull process only.
GRAB_SITE_WORKING_DIR="$dir" "$self/patched-wpull" "${wpull_args[@]}"