382 lines
13 KiB
Python
Raw Normal View History

import faulthandler
faulthandler.enable()
import re
import os
import sys
import urllib.request
import shutil
import binascii
import datetime
import shlex
import click
import libgrabsite
2015-07-27 07:07:18 +00:00
def print_version(ctx, param, value):
    """
    Eager click callback for --version: print the libgrabsite version and exit.

    ctx is the click context, param is the option being processed (unused),
    and value is the flag's value.  Nothing happens when the flag was not
    given or when click is doing resilient parsing.
    """
    version_requested = bool(value) and not ctx.resilient_parsing
    if version_requested:
        click.echo(libgrabsite.__version__)
        ctx.exit()
def replace_2arg(args, arg, replacement):
    """
    Replace a two-token option (flag + value) in an argument list, in place.

    args is the list of command-line tokens to mutate.  arg is the flag to
    look for; it and the token that follows it are removed.  replacement is
    an iterable of tokens spliced in at the same position, in order (pass an
    empty list to simply drop the flag and its value).

    If arg is not present, args is left untouched.  Returns None.
    """
    # list.index signals "not found" by raising ValueError, not by
    # returning -1, so the absence check has to be an exception handler.
    try:
        idx = args.index(arg)
    except ValueError:
        return
    # Slice assignment removes the flag and its value (or just the flag if
    # it happens to be the last token) and inserts the replacement in order.
    args[idx:idx + 2] = list(replacement)
2015-07-27 07:07:18 +00:00
def patch_dns_inet_is_multicast():
    """
    Monkey-patch dnspython's dns.inet.is_multicast so that it never raises.

    The replacement delegates to the original function and reports False
    whenever the original raises (e.g. ValueError), see
    https://github.com/ludios/grab-site/issues/111
    """
    from dns import inet as dns_inet

    # Capture the original before it is overwritten below.
    unpatched_is_multicast = dns_inet.is_multicast

    def is_multicast(text):
        try:
            answer = unpatched_is_multicast(text)
        except Exception:
            return False
        return answer

    dns_inet.is_multicast = is_multicast
def _download_input_file(url, dest_path):
    """Stream the document at url into dest_path, closing the response afterwards."""
    with urllib.request.urlopen(url) as response, open(dest_path, "wb") as out:
        shutil.copyfileobj(response, out, 1024 * 1024)


def _write_control_file(working_dir, name, content=""):
    """Create (or truncate) working_dir/name and write content to it; empty by default."""
    with open("{}/{}".format(working_dir, name), "w") as f:
        f.write(content)


@click.command()

@click.option('--concurrency', default=2, metavar='NUM',
    help='Use this many connections to fetch in parallel (default: 2).')

@click.option('--concurrent', default=-1, metavar='NUM',
    help='Alias for --concurrency.')

@click.option('--delay', default="0", metavar='DELAY',
    help=
        'Time to wait between requests, in milliseconds (default: 0).  '
        'Can be "NUM", or "MIN-MAX" to use a random delay between MIN and MAX '
        'for each request.  Delay applies to each concurrent fetcher, not globally.')

@click.option('--recursive/--1', default=True,
    help=
        '--recursive (default: true) to crawl under last /path/ component '
        'recursively, or --1 to get just START_URL.')

@click.option('--offsite-links/--no-offsite-links', default=True,
    help=
        '--offsite-links (default: true) to grab all links to a depth of 1 '
        'on other domains, or --no-offsite-links to disable.')

@click.option('--igsets', default="", metavar='LIST',
    help='Comma-separated list of ignore sets to use in addition to "global".')

@click.option('--ignore-sets', default="", metavar='LIST',
    help='Alias for --igsets.')

@click.option('--import-ignores', default=None, metavar='FILE',
    help='Copy this file to DIR/ignores before the crawl begins.')

@click.option('--igon/--igoff', default=False,
    help=
        '--igon (default: false) to print all URLs being ignored to the terminal '
        'and dashboard.')

@click.option('--debug', is_flag=True, help='Print a lot of debugging information.')

@click.option('--video/--no-video', default=True,
    help=
        '--no-video (default: false) to skip the download of videos by both '
        'mime type and file extension.  Skipped videos are logged to '
        'DIR/skipped_videos')

@click.option('-i', '--input-file', default=None, type=str,
    help=
        'Load list of URLs-to-grab from a local file or from a URL; like wget -i.  '
        'File must be a newline-delimited list of URLs.  '
        'Combine with --1 to avoid a recursive crawl on each URL.')

@click.option('--max-content-length', default=-1, metavar='N',
    help=
        "Skip the download of any response that claims a Content-Length "
        "larger than N (default: -1, don't skip anything).")

@click.option('--level', default="inf", metavar='NUM',
    help='Recurse this many levels (default: inf).')

@click.option('--page-requisites-level', default="5", metavar='NUM',
    help='Recursive this many levels for page requisites (default: 5).')

@click.option('--warc-max-size', default=5368709120, metavar='BYTES',
    help=
        'Try to limit each WARC file to around BYTES bytes before rolling over '
        'to a new WARC file (default: 5368709120, which is 5GiB).')

@click.option('--ua', default="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0",
    metavar='STRING', help='Send User-Agent: STRING instead of pretending to be Firefox on Windows.')

@click.option('--wpull-args', default="",
    metavar='ARGS', help=
        r'String containing additional arguments to pass to wpull; '
        r'see ~/.local/bin/wpull --help.  ARGS is split with shlex.split '
        r'and individual arguments can contain spaces if quoted, e.g. '
        r'--wpull-args="--youtube-dl \"--youtube-dl-exe=/My Documents/youtube-dl\""')

@click.option('--sitemaps/--no-sitemaps', default=True,
    help=
        '--sitemaps (default: true) to queue URLs from sitemap.xml '
        'at the root of the site, or --no-sitemaps to disable.')

@click.option('--dupespotter/--no-dupespotter', default=True,
    help=
        '--dupespotter (default: true) to skip the extraction of links '
        'from pages that look like duplicates of earlier pages, or '
        '--no-dupespotter to disable.  Disable this for sites that are '
        'directory listings.')

@click.option('--id', default=None, type=str, metavar='ID',
    help=
        'Use id ID for the crawl instead of a random 128-bit id. '
        'This must be unique for every crawl.')

@click.option('--dir', default=None, type=str, metavar='DIR', help=
    'Put control files, temporary files, and unfinished WARCs in DIR '
    '(default: a directory name based on the URL, date, and first 8 '
    'characters of the id).')

@click.option('--finished-warc-dir', default=None, type=str, metavar='FINISHED_WARC_DIR',
    help='Move finished .warc.gz and .cdx files to this directory.')

@click.option('--permanent-error-status-codes', default='401,403,404,405,410', type=str,
    metavar='STATUS_CODES',
    help=
        'A comma-separated list of HTTP status codes to treat as a permanent '
        'error and therefore *not* retry (default: 401,403,404,405,410)')

@click.option('--which-wpull-args-partial', is_flag=True,
    help=
        'Print a partial list of wpull arguments that would be used and exit.  '
        'Excludes grab-site-specific features, and removes DIR/ from paths.  '
        'Useful for reporting bugs on wpull without grab-site involvement.')

@click.option('--which-wpull-command', is_flag=True,
    help=
        "Populate DIR/ but don't start wpull; instead print the command that would "
        "have been used to start wpull with all of the grab-site functionality.")

@click.option('--version', is_flag=True, callback=print_version,
    expose_value=False, is_eager=True, help='Print version and exit.')

@click.argument('start_url', nargs=-1, required=False)

def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
         ignore_sets, import_ignores, igon, debug, video, level, page_requisites_level,
         max_content_length, sitemaps, dupespotter, warc_max_size, ua, input_file,
         wpull_args, start_url, id, dir, finished_warc_dir, permanent_error_status_codes,
         which_wpull_args_partial, which_wpull_command):
    """
    Runs a crawl on one or more URLs.  For additional help, see

    https://github.com/ludios/grab-site/blob/master/README.md#usage
    """
    # NOTE: the docstring above is what click prints for --help, so developer
    # documentation lives in comments instead.
    #
    # Overview:
    #   1. Validate that exactly one of START_URL / --input-file was given.
    #   2. Derive the crawl id, WARC name and working directory.
    #   3. Build the wpull argument list.
    #   4. --which-wpull-args-partial: print a trimmed argument list and return
    #      without touching the filesystem.
    #   5. Create the working directory and its control files.
    #   6. --which-wpull-command: print the full command and return.
    #   7. Otherwise rewrite sys.argv / environ / cwd and call wpull's main().
    #
    # The parameter names `id` and `dir` shadow builtins, but they are fixed
    # by the --id and --dir option names that click maps onto them.

    if not (input_file or start_url):
        print("Neither a START_URL or --input-file= was specified; see --help", file=sys.stderr)
        sys.exit(1)
    elif input_file and start_url:
        print("Can't specify both START_URL and --input-file=; see --help", file=sys.stderr)
        sys.exit(1)

    span_hosts_allow = "page-requisites,linked-pages"
    if not offsite_links:
        span_hosts_allow = "page-requisites"

    # --concurrent and --ignore-sets are aliases; when given they win.
    if concurrent != -1:
        concurrency = concurrent

    if ignore_sets != "":
        igsets = ignore_sets

    # Always bound, so the later input-file handling cannot hit a NameError.
    input_file_is_remote = False
    if start_url:
        claim_start_url = start_url[0]
    else:
        input_file_is_remote = bool(re.match("^(ftp|https?)://", input_file))
        if input_file_is_remote:
            claim_start_url = input_file
        else:
            claim_start_url = 'file://' + os.path.abspath(input_file)

    if not id:
        id = binascii.hexlify(os.urandom(16)).decode('utf-8')
    # First 10 characters of the ISO timestamp are the UTC date, YYYY-MM-DD.
    ymd = datetime.datetime.now(datetime.timezone.utc).isoformat()[:10]
    # [-1] rather than [1]: a start URL without "://" has nothing to strip
    # and must not raise IndexError here.
    no_proto_no_trailing = claim_start_url.split('://', 1)[-1].rstrip('/')[:100]
    unwanted_chars_re = r'[^-_a-zA-Z0-9%\.,;@+=]'
    warc_name = "{}-{}-{}".format(re.sub(unwanted_chars_re, '-', no_proto_no_trailing).lstrip('-'), ymd, id[:8])

    # make absolute because wpull will start in temp/
    if not dir:
        working_dir = os.path.abspath(warc_name)
    else:
        working_dir = os.path.abspath(dir)

    LIBGRABSITE = os.path.dirname(libgrabsite.__file__)
    args = [
        "--debug" if debug else "--quiet",
        "-U", ua,
        "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "--header", "Accept-Language: en-US,en;q=0.5",
        "--no-check-certificate",
        "--no-robots",
        "--inet4-only",
        "--dns-timeout", "20",
        "--connect-timeout", "20",
        "--read-timeout", "900",
        "--session-timeout", str(86400 * 2),
        "--tries", "3",
        "--waitretry", "5",
        "--max-redirect", "8",
        "--output-file", "{}/wpull.log".format(working_dir),
        "--database", "{}/wpull.db".format(working_dir),
        "--plugin-script", "{}/wpull_hooks.py".format(LIBGRABSITE),
        "--save-cookies", "{}/cookies.txt".format(working_dir),
        "--delete-after",
        "--page-requisites",
        "--no-parent",
        "--concurrent", str(concurrency),
        "--warc-file", "{}/{}".format(working_dir, warc_name),
        "--warc-max-size", str(warc_max_size),
        "--warc-cdx",
        "--strip-session-id",
        "--escaped-fragment",
        "--level", level,
        "--page-requisites-level", page_requisites_level,
        "--span-hosts-allow", span_hosts_allow,
        "--load-cookies", "{}/default_cookies.txt".format(LIBGRABSITE),
    ]

    if os.name != "nt" and sys.platform != "cygwin":
        args += [
            "--debug-manhole"
        ]

    if finished_warc_dir is not None:
        args += ["--warc-move", finished_warc_dir]

    if sitemaps:
        args += ["--sitemaps"]

    if recursive:
        args += ["--recursive"]

    if wpull_args:
        args += shlex.split(wpull_args)

    DIR_input_file = os.path.join(working_dir, "input_file")
    if start_url:
        args.extend(start_url)
    else:
        args += ["--input-file", DIR_input_file]

    if which_wpull_args_partial:
        # (flag, replacement tokens) pairs: strip DIR/ from paths and drop the
        # grab-site-specific options entirely.
        partial_rewrites = [
            ("--output-file", ["--output-file", "wpull.log"]),
            ("--database", ["--database", "wpull.db"]),
            ("--plugin-script", []),
            ("--python-script", []),
            ("--save-cookies", ["--save-cookies", "cookies.txt"]),
            ("--load-cookies", []),
            ("--warc-file", ["--warc-file", warc_name]),
        ]
        for flag, replacement in partial_rewrites:
            # Some of these flags (e.g. --python-script) are normally absent;
            # only rewrite the ones actually present.
            if flag in args:
                replace_2arg(args, flag, replacement)
        try:
            args.remove("--quiet")
        except ValueError:
            pass
        print(" ".join(shlex.quote(a) for a in args))
        return

    # Create DIR and DIR files only after which_wpull_args_* checks
    os.makedirs(working_dir)
    temp_dir = os.path.join(working_dir, "temp")
    os.makedirs(temp_dir)

    if input_file:
        # wpull -i doesn't support URLs, so download the input file ourselves if necessary
        if input_file_is_remote:
            # TODO: use wpull with correct user agent instead of urllib.request
            # wpull -O fails: https://github.com/chfoo/wpull/issues/275
            _download_input_file(input_file, DIR_input_file)
        else:
            shutil.copyfile(input_file, DIR_input_file)

    _write_control_file(working_dir, "id", id)
    _write_control_file(working_dir, "start_url", claim_start_url)
    _write_control_file(working_dir, "all_start_urls", "".join(u + "\n" for u in start_url))
    _write_control_file(working_dir, "concurrency", str(concurrency))
    _write_control_file(working_dir, "max_content_length", str(max_content_length))
    _write_control_file(working_dir, "igsets", "global,{}".format(igsets))

    # "video" and "igoff" are flag files: only their existence is written here.
    if video:
        _write_control_file(working_dir, "video")

    if not igon:
        _write_control_file(working_dir, "igoff")

    ignores_text = ""
    if import_ignores is not None:
        with open(import_ignores, "r") as f:
            ignores_text = f.read()
    _write_control_file(working_dir, "ignores", ignores_text)

    _write_control_file(working_dir, "delay", delay)
    _write_control_file(working_dir, "scrape")

    # We don't actually need to write control files for this mode to work, but the
    # only reason to use this is if you're starting wpull manually with modified
    # arguments, and wpull_hooks.py requires the control files.
    if which_wpull_command:
        wpull_bin = sys.argv[0].replace("/grab-site", "/wpull") # TODO
        print("GRAB_SITE_WORKING_DIR={} DUPESPOTTER_ENABLED={} {} {}".format(
            working_dir, int(dupespotter), wpull_bin, " ".join(shlex.quote(a) for a in args)))
        return

    patch_dns_inet_is_multicast()

    # Mutate argv, environ, cwd before we turn into wpull
    sys.argv[1:] = args
    os.environ["GRAB_SITE_WORKING_DIR"] = working_dir
    os.environ["DUPESPOTTER_ENABLED"] = "1" if dupespotter else "0"
    # We can use --warc-tempdir= to put WARC-related temporary files in a temp
    # directory, but wpull also creates non-WARC-related "resp_cb" temporary
    # files in the cwd, so we must start wpull in temp/ anyway.
    os.chdir(temp_dir)

    # Modify NO_DOCUMENT_STATUS_CODES
    # https://github.com/chfoo/wpull/issues/143
    from wpull.processor.web import WebProcessor
    WebProcessor.NO_DOCUMENT_STATUS_CODES = \
        tuple(int(code) for code in permanent_error_status_codes.split(","))

    import wpull.application.main
    # Don't let wpull install a handler for SIGINT or SIGTERM,
    # because we install our own in wpull_hooks.py.
    wpull.application.main.main(use_signals=False)
# Script entry point: main is a click command, so calling it with no
# arguments makes click parse the command line and supply the parameters.
if __name__ == '__main__':
    main()