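# grab-site CLI entry point: parses options with click, prepares a per-crawl
# working directory, then replaces itself with wpull (see the end of main()).
# faulthandler is enabled before anything else so that a hard crash still
# dumps Python tracebacks to stderr.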
import faulthandler
faulthandler.enable()
import re
import os
import sys
import urllib.request
import shutil
import binascii
import datetime
import shlex
import click
import libgrabsite

def print_version(ctx, param, value):
    if not value or ctx.resilient_parsing:
        return
    click.echo(libgrabsite.__version__)
    ctx.exit()
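
# Illustrative invocations (URLs are placeholders; "forums" assumes an ignore
# set of that name ships with grab-site):
#   grab-site http://example.com/blog/
#   grab-site --concurrency=4 --igsets=forums http://example.com/forum/
#   grab-site --1 --input-file=urls.txt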
@click.command()
@click.option('--concurrency', default=2, metavar='NUM',
    help='Use this many connections to fetch in parallel (default: 2).')
@click.option('--concurrent', default=-1, metavar='NUM',
    help='Alias for --concurrency.')
@click.option('--delay', default="0", metavar='DELAY',
    help=
        'Time to wait between requests, in milliseconds (default: 0). '
        'Can be "NUM", or "MIN-MAX" to use a random delay between MIN and MAX '
        'for each request. Delay applies to each concurrent fetcher, not globally.')
@click.option('--recursive/--1', default=True,
    help=
        '--recursive (default: true) to crawl under last /path/ component '
        'recursively, or --1 to get just START_URL.')
@click.option('--offsite-links/--no-offsite-links', default=True,
    help=
        '--offsite-links (default: true) to grab all links to a depth of 1 '
        'on other domains, or --no-offsite-links to disable.')
@click.option('--igsets', default="", metavar='LIST',
    help='Comma-separated list of ignore sets to use in addition to "global".')
@click.option('--ignore-sets', default="", metavar='LIST',
    help='Alias for --igsets.')
@click.option('--igon/--igoff', default=False,
    help=
        '--igon (default: false) to print all URLs being ignored to the terminal '
        'and dashboard.')
@click.option('--video/--no-video', default=True,
    help=
        '--no-video (default: false) to skip the download of videos by both '
        'mime type and file extension. Skipped videos are logged to '
        'DIR/skipped_videos')
@click.option('-i', '--input-file', default=None, type=str,
    help=
        'Load list of URLs-to-grab from a local file or from a URL; like wget -i. '
        'File must be a newline-delimited list of URLs. '
        'Combine with --1 to avoid a recursive crawl on each URL.')
@click.option('--max-content-length', default=-1, metavar='N',
    help=
        "Skip the download of any response that claims a Content-Length "
        "larger than N (default: -1, don't skip anything).")
@click.option('--level', default="inf", metavar='NUM',
    help='Recurse this many levels (default: inf).')
@click.option('--page-requisites-level', default="5", metavar='NUM',
    help='Recurse this many levels for page requisites (default: 5).')
@click.option('--warc-max-size', default=5368709120, metavar='BYTES',
    help=
        'Try to limit each WARC file to around BYTES bytes before rolling over '
        'to a new WARC file (default: 5368709120, which is 5GiB).')
@click.option('--ua', default="Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0",
    metavar='STRING', help='Send User-Agent: STRING instead of pretending to be Firefox on Windows.')
@click.option('--wpull-args', default="",
    metavar='ARGS', help=
        r'String containing additional arguments to pass to wpull; '
        r'see ~/.local/bin/wpull --help. ARGS is split with shlex.split '
        r'and individual arguments can contain spaces if quoted, e.g. '
        r'--wpull-args="--youtube-dl \"--youtube-dl-exe=/My Documents/youtube-dl\""')
@click.option('--sitemaps/--no-sitemaps', default=True,
    help=
        '--sitemaps (default: true) to queue URLs from sitemap.xml '
        'at the root of the site, or --no-sitemaps to disable.')
@click.option('--dupespotter/--no-dupespotter', default=True,
    help=
        '--dupespotter (default: true) to skip the extraction of links '
        'from pages that look like duplicates of earlier pages, or '
        '--no-dupespotter to disable. Disable this for sites that are '
        'directory listings.')
@click.option('--version', is_flag=True, callback=print_version,
    expose_value=False, is_eager=True, help='Print version and exit.')
@click.argument('start_url', nargs=-1, required=False)
def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
        ignore_sets, igon, video, level, page_requisites_level, max_content_length,
        sitemaps, dupespotter, warc_max_size, ua, input_file, wpull_args, start_url):
    if not (input_file or start_url):
        print("Neither a START_URL nor --input-file= was specified; see --help", file=sys.stderr)
        sys.exit(1)
    elif input_file and start_url:
        print("Can't specify both START_URL and --input-file=; see --help", file=sys.stderr)
        sys.exit(1)
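
    # wpull's --span-hosts-allow controls which offsite URLs may be fetched:
    # "page-requisites" allows embedded resources (images/CSS/JS) from any
    # host; "linked-pages" additionally allows offsite pages linked from the
    # crawl, fetched to a depth of 1 (see --offsite-links above).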
    span_hosts_allow = "page-requisites,linked-pages"
    if not offsite_links:
        span_hosts_allow = "page-requisites"

    if concurrent != -1:
        concurrency = concurrent
    if ignore_sets != "":
        igsets = ignore_sets

    if start_url:
        claim_start_url = start_url[0]
    else:
        input_file_is_remote = bool(re.match("^(ftp|https?)://", input_file))
        if input_file_is_remote:
            claim_start_url = input_file
        else:
            claim_start_url = 'file://' + os.path.abspath(input_file)
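
    # Name the working directory and WARC files after the sanitized start URL
    # (truncated to 100 chars), the UTC date, and the first 8 hex digits of a
    # random 128-bit crawl id, e.g. www.example.com-2015-07-28-1234abcd.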
    id = binascii.hexlify(os.urandom(16)).decode('utf-8')
    ymd = datetime.datetime.utcnow().isoformat()[:10]
    no_proto_no_trailing = claim_start_url.split('://', 1)[1].rstrip('/')[:100]
    warc_name = "{}-{}-{}".format(re.sub(r'[^-_a-zA-Z0-9%\.,;@+=]', '-', no_proto_no_trailing), ymd, id[:8])

    # make absolute because wpull will start in temp/
    working_dir = os.path.abspath(warc_name)
    os.makedirs(working_dir)
    temp_dir = os.path.join(working_dir, "temp")
    os.makedirs(temp_dir)
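
    # Arguments passed to wpull on every crawl: a browser-like User-Agent and
    # Accept headers, sane timeout/retry/redirect limits, and --quiet to keep
    # wpull's own progress output off the terminal.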
    def get_base_wpull_args():
        return ["-U", ua,
            "--header=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "--header=Accept-Language: en-US,en;q=0.5",
            "--no-check-certificate",
            "--no-robots",
            "--inet4-only",
            "--timeout", "20",
            "--tries", "3",
            "--waitretry", "5",
            "--max-redirect", "8",
            "--quiet"
        ]

    if input_file is not None:
        # wpull -i doesn't support URLs, so download the input file ourselves if necessary
        DIR_input_file = os.path.join(working_dir, "input_file")
        if input_file_is_remote:
            # TODO: use wpull with correct user agent instead of urllib.request
            # wpull -O fails: https://github.com/chfoo/wpull/issues/275
            u = urllib.request.urlopen(input_file)
            with open(DIR_input_file, "wb") as f:
                while True:
                    s = u.read(1024 * 1024)
                    if not s:
                        break
                    f.write(s)
        else:
            shutil.copyfile(input_file, DIR_input_file)
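
    # Record per-crawl state as plain files in the working directory; the
    # wpull_hooks.py plugin reads these while the crawl runs (some, like
    # ignores/igsets/delay/concurrency, can be edited mid-crawl).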
with open("{}/id".format(working_dir), "w") as f:
f.write(id)
with open("{}/start_url".format(working_dir), "w") as f:
f.write(claim_start_url)
with open("{}/all_start_urls".format(working_dir), "w") as f:
for u in start_url:
f.write(u + "\n")
with open("{}/concurrency".format(working_dir), "w") as f:
f.write(str(concurrency))
with open("{}/max_content_length".format(working_dir), "w") as f:
f.write(str(max_content_length))
with open("{}/igsets".format(working_dir), "w") as f:
f.write("global,{}".format(igsets))
if video:
with open("{}/video".format(working_dir), "w") as f:
pass
2015-08-10 13:23:43 +00:00
if not igon:
with open("{}/igoff".format(working_dir), "w") as f:
pass
with open("{}/ignores".format(working_dir), "w") as f:
pass
with open("{}/delay".format(working_dir), "w") as f:
2015-07-28 13:57:42 +00:00
f.write(delay)
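
    # Assemble the full wpull command line: log and URL database inside the
    # working directory, grab-site's plugin/hook scripts, and WARC output
    # named after the crawl. --delete-after discards response bodies once
    # they have been written to the WARC.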
    LIBGRABSITE = os.path.dirname(libgrabsite.__file__)
    args = get_base_wpull_args() + [
        "-o", "{}/wpull.log".format(working_dir),
        "--database", "{}/wpull.db".format(working_dir),
        "--plugin-script", "{}/plugin.py".format(LIBGRABSITE),
        "--python-script", "{}/wpull_hooks.py".format(LIBGRABSITE),
        "--save-cookies", "{}/cookies.txt".format(working_dir),
        "--delete-after",
        "--page-requisites",
        "--no-parent",
        "--concurrent", str(concurrency),
        "--warc-file", "{}/{}".format(working_dir, warc_name),
        "--warc-max-size", str(warc_max_size),
        "--warc-cdx",
        "--strip-session-id",
        "--escaped-fragment",
        "--level", level,
        "--page-requisites-level", page_requisites_level,
        "--span-hosts-allow", span_hosts_allow,
    ]

    # psutil is not available on Windows and therefore wpull's --monitor-*
    # options are also not available.
    if os.name != "nt" and sys.platform != "cygwin":
        # psutil may also just be not installed
        try:
            import psutil
        except ImportError:
            psutil = None
        if psutil is not None:
            args += [
                "--monitor-disk", "400m",
                "--monitor-memory", "10k",
            ]
        args += [
            "--debug-manhole"
        ]
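
    # Remaining feature flags map straight from the CLI options; any extra
    # --wpull-args are appended last, after grab-site's own arguments.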
    if sitemaps:
        args += ["--sitemaps"]
    if recursive:
        args += ["--recursive"]
    if wpull_args:
        args += shlex.split(wpull_args)

    if start_url:
        args.extend(start_url)
    else:
        args += ["--input-file", DIR_input_file]

    # Mutate argv, environ, cwd before we turn into wpull
    sys.argv[1:] = args
    os.environ["GRAB_SITE_WORKING_DIR"] = working_dir
    os.environ["DUPESPOTTER_ENABLED"] = "1" if dupespotter else "0"

    # We can use --warc-tempdir= to put WARC-related temporary files in a temp
    # directory, but wpull also creates non-WARC-related "resp_cb" temporary
    # files in the cwd, so we must start wpull in temp/ anyway.
    os.chdir(temp_dir)

    from wpull.app import Application

    def noop_setup_signal_handlers(self):
        pass

    # Don't let wpull install a handler for SIGINT or SIGTERM,
    # because we install our own in wpull_hooks.py.
    Application.setup_signal_handlers = noop_setup_signal_handlers

    import wpull.__main__
    wpull.__main__.main()

if __name__ == '__main__':
    main()