import faulthandler
faulthandler.enable()

import re
import os
import sys
import urllib.request
import shutil
import binascii
import datetime
import shlex
import click
import libgrabsite


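# Click eager callback for --version: print the grab-site version and exit
# without running a crawl.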
def print_version(ctx, param, value):
	if not value or ctx.resilient_parsing:
		return
	click.echo(libgrabsite.__version__)
	ctx.exit()


def replace_2arg(args, arg, replacement):
	# Replace an option and the value that follows it in args with the given
	# replacement arguments (pass an empty list to drop the option entirely).
	# list.index() raises ValueError when the option is absent, so guard with
	# try/except instead of testing for -1.
	try:
		idx = args.index(arg)
	except ValueError:
		return
	args.pop(idx)
	args.pop(idx)
	for r in reversed(replacement):
		args.insert(idx, r)


def patch_dns_inet_is_multicast():
	"""
	Patch dnspython's dns.inet.is_multicast to not raise ValueError:
	https://github.com/ArchiveTeam/grab-site/issues/111
	"""
	import dns.inet
	is_multicast_dnspython = dns.inet.is_multicast
	def is_multicast(text):
		try:
			return is_multicast_dnspython(text)
		except Exception:
			return False
	dns.inet.is_multicast = is_multicast


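# Command-line interface.  Example invocations (the ignore-set name below is
# illustrative; see README.md#usage for the available sets):
#   grab-site http://example.com/
#   grab-site --igsets=forums --no-video http://example.com/
#   grab-site --1 --input-file=urls.txt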
@click.command()

@click.option('--concurrency', default=2, metavar='NUM',
	help='Use this many connections to fetch in parallel (default: 2).')

@click.option('--concurrent', default=-1, metavar='NUM',
	help='Alias for --concurrency.')

@click.option('--delay', default="0", metavar='DELAY',
	help=
		'Time to wait between requests, in milliseconds (default: 0). '
		'Can be "NUM", or "MIN-MAX" to use a random delay between MIN and MAX '
		'for each request. Delay applies to each concurrent fetcher, not globally.')

@click.option('--recursive/--1', default=True,
	help=
		'--recursive (default: true) to crawl under last /path/ component '
		'recursively, or --1 to get just START_URL.')

@click.option('--offsite-links/--no-offsite-links', default=True,
	help=
		'--offsite-links (default: true) to grab all links to a depth of 1 '
		'on other domains, or --no-offsite-links to disable.')

@click.option('--igsets', default="", metavar='LIST',
	help='Comma-separated list of ignore sets to use in addition to "global".')

@click.option('--ignore-sets', default="", metavar='LIST',
	help='Alias for --igsets.')

@click.option('--import-ignores', default=None, metavar='FILE',
	help='Copy this file to DIR/ignores before the crawl begins.')

@click.option('--igon/--igoff', default=False,
	help=
		'--igon (default: false) to print all URLs being ignored to the terminal '
		'and dashboard.')

@click.option('--debug', is_flag=True, help='Print a lot of debugging information.')

@click.option('--video/--no-video', default=True,
	help=
		'--no-video (default: false) to skip the download of videos by both '
		'mime type and file extension. Skipped videos are logged to '
		'DIR/skipped_videos')

@click.option('-i', '--input-file', default=None, type=str,
	help=
		'Load list of URLs-to-grab from a local file or from a URL; like wget -i. '
		'File must be a newline-delimited list of URLs. '
		'Combine with --1 to avoid a recursive crawl on each URL.')

@click.option('--max-content-length', default=-1, metavar='N',
	help=
		"Skip the download of any response that claims a Content-Length "
		"larger than N (default: -1, don't skip anything).")

@click.option('--level', default="inf", metavar='NUM',
	help='Recurse this many levels (default: inf).')

@click.option('--page-requisites-level', default="5", metavar='NUM',
	help='Recurse this many levels for page requisites (default: 5).')

@click.option('--warc-max-size', default=5368709120, metavar='BYTES',
	help=
		'Try to limit each WARC file to around BYTES bytes before rolling over '
		'to a new WARC file (default: 5368709120, which is 5GiB).')

@click.option('--ua', default="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
	metavar='STRING', help='Send User-Agent: STRING instead of pretending to be Firefox on Windows.')

@click.option('--wpull-args', default="",
	metavar='ARGS', help=
		r'String containing additional arguments to pass to wpull; '
		r'see ~/.local/bin/wpull --help. ARGS is split with shlex.split '
		r'and individual arguments can contain spaces if quoted, e.g. '
		r'--wpull-args="--youtube-dl \"--youtube-dl-exe=/My Documents/youtube-dl\""')

@click.option('--sitemaps/--no-sitemaps', default=True,
	help=
		'--sitemaps (default: true) to queue URLs from sitemap.xml '
		'at the root of the site, or --no-sitemaps to disable.')

@click.option('--dupespotter/--no-dupespotter', default=True,
	help=
		'--dupespotter (default: true) to skip the extraction of links '
		'from pages that look like duplicates of earlier pages, or '
		'--no-dupespotter to disable. Disable this for sites that are '
		'directory listings.')

@click.option('--id', default=None, type=str, metavar='ID',
	help=
		'Use id ID for the crawl instead of a random 128-bit id. '
		'This must be unique for every crawl.')

@click.option('--dir', default=None, type=str, metavar='DIR', help=
	'Put control files, temporary files, and unfinished WARCs in DIR '
	'(default: a directory name based on the URL, date, and first 8 '
	'characters of the id).')

@click.option('--finished-warc-dir', default=None, type=str, metavar='FINISHED_WARC_DIR',
	help=
		'Absolute path to a directory into which finished .warc.gz and .cdx '
		'files will be moved.')

@click.option('--permanent-error-status-codes', default='401,403,404,405,410', type=str,
	metavar='STATUS_CODES',
	help=
		'A comma-separated list of HTTP status codes to treat as a permanent '
		'error and therefore *not* retry (default: 401,403,404,405,410)')

@click.option('--which-wpull-args-partial', is_flag=True,
	help=
		'Print a partial list of wpull arguments that would be used and exit. '
		'Excludes grab-site-specific features, and removes DIR/ from paths. '
		'Useful for reporting bugs on wpull without grab-site involvement.')

@click.option('--which-wpull-command', is_flag=True,
	help=
		"Populate DIR/ but don't start wpull; instead print the command that would "
		"have been used to start wpull with all of the grab-site functionality.")

@click.option('--version', is_flag=True, callback=print_version,
	expose_value=False, is_eager=True, help='Print version and exit.')

@click.argument('start_url', nargs=-1, required=False)
def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
		ignore_sets, import_ignores, igon, debug, video, level, page_requisites_level,
		max_content_length, sitemaps, dupespotter, warc_max_size, ua, input_file,
		wpull_args, start_url, id, dir, finished_warc_dir, permanent_error_status_codes,
		which_wpull_args_partial, which_wpull_command):
	"""
	Runs a crawl on one or more URLs. For additional help, see

	https://github.com/ArchiveTeam/grab-site/blob/master/README.md#usage
	"""
	if not (input_file or start_url):
		print("Neither a START_URL nor --input-file= was specified; see --help", file=sys.stderr)
		sys.exit(1)
	elif input_file and start_url:
		print("Can't specify both START_URL and --input-file=; see --help", file=sys.stderr)
		sys.exit(1)

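	# Tell wpull which kinds of URLs are allowed to span to other hosts:
	# page requisites always; linked pages only when offsite links are enabled.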
	span_hosts_allow = "page-requisites,linked-pages"
	if not offsite_links:
		span_hosts_allow = "page-requisites"

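	# --concurrent and --ignore-sets are accepted as aliases for --concurrency
	# and --igsets; the alias wins if both are given.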
	if concurrent != -1:
		concurrency = concurrent

	if ignore_sets != "":
		igsets = ignore_sets

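	# claim_start_url is recorded in DIR/start_url and used to derive the crawl
	# name; for --input-file crawls it is the input file's URL (or a file:// URL
	# pointing at the local file).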
	if start_url:
		claim_start_url = start_url[0]
	else:
		input_file_is_remote = bool(re.match("^(ftp|https?)://", input_file))
		if input_file_is_remote:
			claim_start_url = input_file
		else:
			claim_start_url = 'file://' + os.path.abspath(input_file)

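	# Name the crawl after the sanitized start URL, the UTC date, and the first
	# 8 hex characters of the (possibly random 128-bit) crawl id.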
	if not id:
		id = binascii.hexlify(os.urandom(16)).decode('utf-8')
	ymd = datetime.datetime.utcnow().isoformat()[:10]
	no_proto_no_trailing = claim_start_url.split('://', 1)[1].rstrip('/')[:100]
	unwanted_chars_re = r'[^-_a-zA-Z0-9%\.,;@+=]'
	warc_name = "{}-{}-{}".format(re.sub(unwanted_chars_re, '-', no_proto_no_trailing).lstrip('-'), ymd, id[:8])

	# make absolute because wpull will start in temp/
	if not dir:
		working_dir = os.path.abspath(warc_name)
	else:
		working_dir = os.path.abspath(dir)

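	# Base wpull arguments used for every crawl; grab-site-specific behavior is
	# provided by the wpull_hooks.py plugin script and the control files written
	# to DIR/ below.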
	LIBGRABSITE = os.path.dirname(libgrabsite.__file__)
	args = [
		"--debug" if debug else "--quiet",
		"-U", ua,
		"--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
		"--header", "Accept-Language: en-US,en;q=0.5",
		"--no-check-certificate",
		"--no-robots",
		"--inet4-only",
		"--dns-timeout", "20",
		"--connect-timeout", "20",
		"--read-timeout", "900",
		"--session-timeout", str(86400 * 2),
		"--tries", "3",
		"--waitretry", "5",
		"--max-redirect", "8",
		"--output-file", "{}/wpull.log".format(working_dir),
		"--database", "{}/wpull.db".format(working_dir),
		"--plugin-script", "{}/wpull_hooks.py".format(LIBGRABSITE),
		"--save-cookies", "{}/cookies.txt".format(working_dir),
		"--delete-after",
		"--page-requisites",
		"--no-parent",
		"--concurrent", str(concurrency),
		"--warc-file", "{}/{}".format(working_dir, warc_name),
		"--warc-max-size", str(warc_max_size),
		"--warc-cdx",
		"--strip-session-id",
		"--escaped-fragment",
		"--level", level,
		"--page-requisites-level", page_requisites_level,
		"--span-hosts-allow", span_hosts_allow,
		"--load-cookies", "{}/default_cookies.txt".format(LIBGRABSITE),
	]

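	# wpull's debug manhole is only enabled on non-Windows platforms.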
	if os.name != "nt" and sys.platform != "cygwin":
		args += [
			"--debug-manhole"
		]

	if finished_warc_dir is not None:
		args += ["--warc-move", finished_warc_dir]

	if sitemaps:
		args += ["--sitemaps"]

	if recursive:
		args += ["--recursive"]

	if wpull_args:
		args += shlex.split(wpull_args)

	DIR_input_file = os.path.join(working_dir, "input_file")
	if start_url:
		args.extend(start_url)
	else:
		args += ["--input-file", DIR_input_file]

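	# For --which-wpull-args-partial, strip the grab-site-specific options and
	# the DIR/ prefixes from paths, print the remaining wpull arguments, and exit.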
	if which_wpull_args_partial:
		replace_2arg(args, "--output-file", ["--output-file", "wpull.log"])
		replace_2arg(args, "--database", ["--database", "wpull.db"])
		replace_2arg(args, "--plugin-script", [])
		replace_2arg(args, "--save-cookies", ["--save-cookies", "cookies.txt"])
		replace_2arg(args, "--load-cookies", [])
		replace_2arg(args, "--warc-file", ["--warc-file", warc_name])
		try:
			args.remove("--quiet")
		except ValueError:
			pass
		print(" ".join(shlex.quote(a) for a in args))
		return

	# Create DIR and DIR files only after which_wpull_args_* checks
	os.makedirs(working_dir)
	temp_dir = os.path.join(working_dir, "temp")
	os.makedirs(temp_dir)

	if input_file is not None:
		# wpull -i doesn't support URLs, so download the input file ourselves if necessary
		if input_file_is_remote:
			# TODO: use wpull with correct user agent instead of urllib.request
			# wpull -O fails: https://github.com/chfoo/wpull/issues/275
			u = urllib.request.urlopen(input_file)
			with open(DIR_input_file, "wb") as f:
				while True:
					s = u.read(1024 * 1024)
					if not s:
						break
					f.write(s)
		else:
			shutil.copyfile(input_file, DIR_input_file)

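	# Write the control files that wpull_hooks.py reads while the crawl runs.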
with open("{}/id".format(working_dir), "w") as f:
|
|
f.write(id)
|
|
|
|
with open("{}/start_url".format(working_dir), "w") as f:
|
|
f.write(claim_start_url)
|
|
|
|
with open("{}/all_start_urls".format(working_dir), "w") as f:
|
|
for u in start_url:
|
|
f.write(u + "\n")
|
|
|
|
with open("{}/concurrency".format(working_dir), "w") as f:
|
|
f.write(str(concurrency))
|
|
|
|
with open("{}/max_content_length".format(working_dir), "w") as f:
|
|
f.write(str(max_content_length))
|
|
|
|
with open("{}/igsets".format(working_dir), "w") as f:
|
|
f.write("global,{}".format(igsets))
|
|
|
|
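	# DIR/video and DIR/igoff act as flag files: they are created empty and
	# only their presence matters.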
	if video:
		with open("{}/video".format(working_dir), "w") as f:
			pass

	if not igon:
		with open("{}/igoff".format(working_dir), "w") as f:
			pass

with open("{}/ignores".format(working_dir), "w") as f:
|
|
if import_ignores is not None:
|
|
f.write(open(import_ignores, "r").read())
|
|
|
|
with open("{}/delay".format(working_dir), "w") as f:
|
|
f.write(delay)
|
|
|
|
with open("{}/scrape".format(working_dir), "w") as f:
|
|
pass
|
|
|
|
	# We don't actually need to write control files for this mode to work, but the
	# only reason to use this is if you're starting wpull manually with modified
	# arguments, and wpull_hooks.py requires the control files.
	if which_wpull_command:
		bin = sys.argv[0].replace("/grab-site", "/wpull") # TODO
		print("GRAB_SITE_WORKING_DIR={} DUPESPOTTER_ENABLED={} {} {}".format(
			working_dir, int(dupespotter), bin, " ".join(shlex.quote(a) for a in args)))
		return

	patch_dns_inet_is_multicast()

	# Mutate argv, environ, cwd before we turn into wpull
	sys.argv[1:] = args
	os.environ["GRAB_SITE_WORKING_DIR"] = working_dir
	os.environ["DUPESPOTTER_ENABLED"] = "1" if dupespotter else "0"
	# We can use --warc-tempdir= to put WARC-related temporary files in a temp
	# directory, but wpull also creates non-WARC-related "resp_cb" temporary
	# files in the cwd, so we must start wpull in temp/ anyway.
	os.chdir(temp_dir)

	# Modify NO_DOCUMENT_STATUS_CODES
	# https://github.com/chfoo/wpull/issues/143
	from wpull.processor.web import WebProcessor
	WebProcessor.NO_DOCUMENT_STATUS_CODES = \
		tuple(int(code) for code in permanent_error_status_codes.split(","))

	import wpull.application.main
	# Don't let wpull install a handler for SIGINT or SIGTERM,
	# because we install our own in wpull_hooks.py.
	wpull.application.main.main(use_signals=False)


if __name__ == '__main__':
	main()