import faulthandler faulthandler.enable() import re import os import sys import urllib.request import shutil import binascii import datetime import shlex import click import libgrabsite def print_version(ctx, param, value): if not value or ctx.resilient_parsing: return click.echo(libgrabsite.__version__) ctx.exit() def replace_2arg(args, arg, replacement): idx = args.index(arg) if idx == -1: return args.pop(idx) args.pop(idx) for r in reversed(replacement): args.insert(idx, r) @click.command() @click.option('--concurrency', default=2, metavar='NUM', help='Use this many connections to fetch in parallel (default: 2).') @click.option('--concurrent', default=-1, metavar='NUM', help='Alias for --concurrency.') @click.option('--delay', default="0", metavar='DELAY', help= 'Time to wait between requests, in milliseconds (default: 0). ' 'Can be "NUM", or "MIN-MAX" to use a random delay between MIN and MAX ' 'for each request. Delay applies to each concurrent fetcher, not globally.') @click.option('--recursive/--1', default=True, help= '--recursive (default: true) to crawl under last /path/ component ' 'recursively, or --1 to get just START_URL.') @click.option('--offsite-links/--no-offsite-links', default=True, help= '--offsite-links (default: true) to grab all links to a depth of 1 ' 'on other domains, or --no-offsite-links to disable.') @click.option('--igsets', default="", metavar='LIST', help='Comma-separated list of ignore sets to use in addition to "global".') @click.option('--ignore-sets', default="", metavar='LIST', help='Alias for --igsets.') @click.option('--igon/--igoff', default=False, help= '--igon (default: false) to print all URLs being ignored to the terminal ' 'and dashboard.') @click.option('--video/--no-video', default=True, help= '--no-video (default: false) to skip the download of videos by both ' 'mime type and file extension. Skipped videos are logged to ' 'DIR/skipped_videos') @click.option('-i', '--input-file', default=None, type=str, help= 'Load list of URLs-to-grab from a local file or from a URL; like wget -i. ' 'File must be a newline-delimited list of URLs. ' 'Combine with --1 to avoid a recursive crawl on each URL.') @click.option('--max-content-length', default=-1, metavar='N', help= "Skip the download of any response that claims a Content-Length " "larger than N (default: -1, don't skip anything).") @click.option('--level', default="inf", metavar='NUM', help='Recurse this many levels (default: inf).') @click.option('--page-requisites-level', default="5", metavar='NUM', help='Recursive this many levels for page requisites (default: 5).') @click.option('--warc-max-size', default=5368709120, metavar='BYTES', help= 'Try to limit each WARC file to around BYTES bytes before rolling over ' 'to a new WARC file (default: 5368709120, which is 5GiB).') @click.option('--ua', default="Mozilla/5.0 (Windows NT 6.3; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0", metavar='STRING', help='Send User-Agent: STRING instead of pretending to be Firefox on Windows.') @click.option('--wpull-args', default="", metavar='ARGS', help= r'String containing additional arguments to pass to wpull; ' r'see ~/.local/bin/wpull --help. ARGS is split with shlex.split ' r'and individual arguments can contain spaces if quoted, e.g. ' r'--wpull-args="--youtube-dl \"--youtube-dl-exe=/My Documents/youtube-dl\""') @click.option('--sitemaps/--no-sitemaps', default=True, help= '--sitemaps (default: true) to queue URLs from sitemap.xml ' 'at the root of the site, or --no-sitemaps to disable.') @click.option('--dupespotter/--no-dupespotter', default=True, help= '--dupespotter (default: true) to skip the extraction of links ' 'from pages that look like duplicates of earlier pages, or ' '--no-dupespotter to disable. Disable this for sites that are ' 'directory listings.') @click.option('--id', default=None, type=str, metavar='ID', help= 'Use id ID for the crawl instead of a random 128-bit id. ' 'This must be unique for every crawl.') @click.option('--dir', default=None, type=str, metavar='DIR', help= 'Put control files, temporary files, and unfinished WARCs in DIR ' '(default: a directory name based on the URL, date, and first 8 ' 'characters of the id).') @click.option('--finished-warc-dir', default=None, type=str, metavar='FINISHED_WARC_DIR', help='Move finished .warc.gz and .cdx files to this directory.') @click.option('--custom-hooks', default=None, type=str, metavar='PY_SCRIPT', help= 'Copy PY_SCRIPT to DIR/custom_hooks.py, then exec DIR/custom_hooks.py ' 'on startup and every time it changes. The script gets a `wpull_hook` ' 'global that can be used to change crawl behavior. ' 'See libgrabsite/wpull_hooks.py and extra_docs/custom_hooks_sample.py.') @click.option('--which-wpull-args-partial', is_flag=True, help= 'Print a partial list of wpull arguments that would be used and exit. ' 'Excludes grab-site-specific features, and removes DIR/ from paths. ' 'Useful for reporting bugs on wpull without grab-site involvement.') @click.option('--which-wpull-command', is_flag=True, help= "Populate DIR/ but don't start wpull; instead print the command that would " "have been used to start wpull with all of the grab-site functionality.") @click.option('--version', is_flag=True, callback=print_version, expose_value=False, is_eager=True, help='Print version and exit.') @click.argument('start_url', nargs=-1, required=False) def main(concurrency, concurrent, delay, recursive, offsite_links, igsets, ignore_sets, igon, video, level, page_requisites_level, max_content_length, sitemaps, dupespotter, warc_max_size, ua, input_file, wpull_args, start_url, id, dir, finished_warc_dir, custom_hooks, which_wpull_args_partial, which_wpull_command): if not (input_file or start_url): print("Neither a START_URL or --input-file= was specified; see --help", file=sys.stderr) sys.exit(1) elif input_file and start_url: print("Can't specify both START_URL and --input-file=; see --help", file=sys.stderr) sys.exit(1) span_hosts_allow = "page-requisites,linked-pages" if not offsite_links: span_hosts_allow = "page-requisites" if concurrent != -1: concurrency = concurrent if ignore_sets != "": igsets = ignore_sets if start_url: claim_start_url = start_url[0] else: input_file_is_remote = bool(re.match("^(ftp|https?)://", input_file)) if input_file_is_remote: claim_start_url = input_file else: claim_start_url = 'file://' + os.path.abspath(input_file) if not id: id = binascii.hexlify(os.urandom(16)).decode('utf-8') ymd = datetime.datetime.utcnow().isoformat()[:10] no_proto_no_trailing = claim_start_url.split('://', 1)[1].rstrip('/')[:100] warc_name = "{}-{}-{}".format(re.sub('[^-_a-zA-Z0-9%\.,;@+=]', '-', no_proto_no_trailing).lstrip('-'), ymd, id[:8]) def get_base_wpull_args(): return ["-U", ua, "--header=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "--header=Accept-Language: en-US,en;q=0.5", "--no-check-certificate", "--no-robots", "--inet4-only", "--dns-timeout", "20", "--connect-timeout", "20", "--read-timeout", "900", "--session-timeout", str(86400 * 2), "--tries", "3", "--waitretry", "5", "--max-redirect", "8", "--quiet" ] # make absolute because wpull will start in temp/ if not dir: working_dir = os.path.abspath(warc_name) else: working_dir = os.path.abspath(dir) LIBGRABSITE = os.path.dirname(libgrabsite.__file__) args = get_base_wpull_args() + [ "--output-file", "{}/wpull.log".format(working_dir), "--database", "{}/wpull.db".format(working_dir), "--plugin-script", "{}/plugin.py".format(LIBGRABSITE), "--python-script", "{}/wpull_hooks.py".format(LIBGRABSITE), "--save-cookies", "{}/cookies.txt".format(working_dir), "--delete-after", "--page-requisites", "--no-parent", "--concurrent", str(concurrency), "--warc-file", "{}/{}".format(working_dir, warc_name), "--warc-max-size", str(warc_max_size), "--warc-cdx", "--strip-session-id", "--escaped-fragment", "--level", level, "--page-requisites-level", page_requisites_level, "--span-hosts-allow", span_hosts_allow, "--load-cookies", "{}/default_cookies.txt".format(LIBGRABSITE) ] # psutil is not available on Windows and therefore wpull's --monitor-* # options are also not available. if os.name != "nt" and sys.platform != "cygwin": # psutil may also just be not installed try: import psutil except ImportError: psutil = None if psutil is not None: args += [ "--monitor-disk", "400m", "--monitor-memory", "10k", ] args += [ "--debug-manhole" ] if finished_warc_dir is not None: args += ["--warc-move", finished_warc_dir] if sitemaps: args += ["--sitemaps"] if recursive: args += ["--recursive"] if wpull_args: args += shlex.split(wpull_args) DIR_input_file = os.path.join(working_dir, "input_file") if start_url: args.extend(start_url) else: args += ["--input-file", DIR_input_file] if which_wpull_args_partial: replace_2arg(args, "--output-file", ["--output-file", "wpull.log"]) replace_2arg(args, "--database", ["--database", "wpull.db"]) replace_2arg(args, "--plugin-script", []) replace_2arg(args, "--python-script", []) replace_2arg(args, "--save-cookies", ["--save-cookies", "cookies.txt"]) replace_2arg(args, "--load-cookies", []) replace_2arg(args, "--warc-file", ["--warc-file", warc_name]) try: args.remove("--quiet") except ValueError: pass print(" ".join(shlex.quote(a) for a in args)) return # Create DIR and DIR files only after which_wpull_args_* checks os.makedirs(working_dir) temp_dir = os.path.join(working_dir, "temp") os.makedirs(temp_dir) DIR_custom_hooks = os.path.join(working_dir, "custom_hooks.py") if custom_hooks: shutil.copyfile(custom_hooks, DIR_custom_hooks) else: with open(DIR_custom_hooks, "wb") as _: pass if input_file is not None: # wpull -i doesn't support URLs, so download the input file ourselves if necessary if input_file_is_remote: # TODO: use wpull with correct user agent instead of urllib.request # wpull -O fails: https://github.com/chfoo/wpull/issues/275 u = urllib.request.urlopen(input_file) with open(DIR_input_file, "wb") as f: while True: s = u.read(1024*1024) if not s: break f.write(s) else: shutil.copyfile(input_file, DIR_input_file) with open("{}/id".format(working_dir), "w") as f: f.write(id) with open("{}/start_url".format(working_dir), "w") as f: f.write(claim_start_url) with open("{}/all_start_urls".format(working_dir), "w") as f: for u in start_url: f.write(u + "\n") with open("{}/concurrency".format(working_dir), "w") as f: f.write(str(concurrency)) with open("{}/max_content_length".format(working_dir), "w") as f: f.write(str(max_content_length)) with open("{}/igsets".format(working_dir), "w") as f: f.write("global,{}".format(igsets)) if video: with open("{}/video".format(working_dir), "w") as f: pass if not igon: with open("{}/igoff".format(working_dir), "w") as f: pass with open("{}/ignores".format(working_dir), "w") as f: pass with open("{}/delay".format(working_dir), "w") as f: f.write(delay) # We don't actually need to write control files for this mode to work, but the # only reason to use this is if you're starting wpull manually with modified # arguments, and wpull_hooks.py requires the control files. if which_wpull_command: bin = sys.argv[0].replace("/grab-site", "/wpull") # TODO print("GRAB_SITE_WORKING_DIR={} DUPESPOTTER_ENABLED={} {} {}".format( working_dir, int(dupespotter), bin, " ".join(shlex.quote(a) for a in args))) return # Mutate argv, environ, cwd before we turn into wpull sys.argv[1:] = args os.environ["GRAB_SITE_WORKING_DIR"] = working_dir os.environ["DUPESPOTTER_ENABLED"] = "1" if dupespotter else "0" # We can use --warc-tempdir= to put WARC-related temporary files in a temp # directory, but wpull also creates non-WARC-related "resp_cb" temporary # files in the cwd, so we must start wpull in temp/ anyway. os.chdir(temp_dir) from wpull.app import Application def noop_setup_signal_handlers(self): pass # Don't let wpull install a handler for SIGINT or SIGTERM, # because we install our own in wpull_hooks.py. Application.setup_signal_handlers = noop_setup_signal_handlers import wpull.__main__ wpull.__main__.main() if __name__ == '__main__': main()