Add --delay option

Ivan Kozik 2015-07-28 13:57:42 +00:00
parent 3c28b53620
commit 7ac5b07a99
2 changed files with 29 additions and 17 deletions


@@ -19,44 +19,50 @@ def print_version(ctx, param, value):
 @click.command()
 @click.option('--concurrency', default=2, metavar='NUM',
-	help='Use this many connections to fetch in parallel')
+	help='Use this many connections to fetch in parallel (default: 2).')
 @click.option('--concurrent', default=-1, metavar='NUM',
-	help='Alias for --concurrency')
+	help='Alias for --concurrency.')
+@click.option('--delay', default="0", metavar='DELAY',
+	help=
+		'Time to wait between requests, in milliseconds (default: 0). '
+		'Can be "NUM", or "MIN-MAX" to use a random delay between MIN and MAX '
+		'for each request. Delay applies to each concurrent fetcher, not globally.')
 @click.option('--recursive/--1', default=True,
 	help=
 		'--recursive (default: true) to crawl under last /path/ component '
-		'recursively, or --1 to get just START_URL')
+		'recursively, or --1 to get just START_URL.')
 @click.option('--offsite-links/--no-offsite-links', default=False,
 	help=
 		'--offsite-links (default: true) to grab all links to a depth of 1 '
-		'on other domains, or --no-offsite-links to disable')
+		'on other domains, or --no-offsite-links to disable.')
 @click.option('--igsets', default="", metavar='LIST',
-	help='Comma-separated list of ignore sets to use in addition to "global"')
+	help='Comma-separated list of ignore sets to use in addition to "global".')
 @click.option('--ignore-sets', default="", metavar='LIST',
-	help='Alias for --igsets')
+	help='Alias for --igsets.')
 @click.option('--level', default="inf", metavar='NUM',
-	help='Recurse this many levels (default: inf)')
+	help='Recurse this many levels (default: inf).')
 @click.option('--page-requisites-level', default="5", metavar='NUM',
-	help='Recursive this many levels for page requisites (default: 5)')
+	help='Recursive this many levels for page requisites (default: 5).')
 @click.option('--sitemaps/--no-sitemaps', default=True,
 	help=
 		'--sitemaps (default: true) to queue URLs from sitemap.xml '
-		'at the root of the site, or --no-sitemaps to disable')
+		'at the root of the site, or --no-sitemaps to disable.')
 @click.option('--version', is_flag=True, callback=print_version,
 	expose_value=False, is_eager=True, help='Print version and exit.')
 @click.argument('start_url')
-def main(concurrency, concurrent, recursive, offsite_links, igsets,
+def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
 		ignore_sets, level, page_requisites_level, sitemaps, start_url):
 	span_hosts_allow = "page-requisites,linked-pages"
 	if not offsite_links:
@@ -98,7 +104,7 @@ ignore_sets, level, page_requisites_level, sitemaps, start_url):
 		pass
 
 	with open("{}/delay".format(working_dir), "w") as f:
-		f.write("0")
+		f.write(delay)
 
 	LIBGRABSITE = os.path.dirname(libgrabsite.__file__)
 	args = [
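
Taken together with the hook change below, the flow is: the CLI writes the --delay string verbatim into <working_dir>/delay (e.g. after running grab-site --delay 250-750 START_URL), and the crawl process watches that file, so the delay can also be changed mid-crawl by editing it. A minimal sketch of that mechanism, assuming a simple mtime-based watcher; FileChangedWatcher and write_delay here are illustrative stand-ins, not grab-site's actual API:

import os

class FileChangedWatcher:
	# Illustrative stand-in for the delay_watcher used by the hook below:
	# has_changed() reports True whenever the file's mtime differs from the
	# value seen on the previous check.
	def __init__(self, fname):
		self.fname = fname
		self.last_mtime = os.stat(fname).st_mtime

	def has_changed(self):
		mtime = os.stat(self.fname).st_mtime
		changed = mtime != self.last_mtime
		self.last_mtime = mtime
		return changed

def write_delay(working_dir, delay):
	# Mirrors the CLI change above: the user-supplied string ("0", "500",
	# "250-750", ...) is stored as-is and parsed later by the crawl process.
	with open("{}/delay".format(working_dir), "w") as f:
		f.write(delay)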


@@ -390,15 +390,21 @@ def exit_status(code):
 	return code
 
+def update_delay_in_job_data():
+	with open(delay_watcher.fname, "r") as f:
+		content = f.read().strip()
+	if "-" in content:
+		job_data["delay_min"], job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
+	else:
+		job_data["delay_min"] = job_data["delay_max"] = int(content)
+
+update_delay_in_job_data()
+
 def wait_time(_):
 	try:
 		if delay_watcher.has_changed():
-			with open(delay_watcher.fname, "r") as f:
-				content = f.read().strip()
-			if "-" in content:
-				job_data["delay_min"], job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
-			else:
-				job_data["delay_min"] = job_data["delay_max"] = int(content)
+			update_delay_in_job_data()
 	except Exception:
 		traceback.print_exc()
 
 	return random.uniform(job_data["delay_min"], job_data["delay_max"]) / 1000
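
For reference, a self-contained sketch of the delay handling this hunk introduces: the "NUM" / "MIN-MAX" parsing and the uniform sampling mirror the code above, while parse_delay, the simplified wait_time signature, and the demo values are illustrative additions.

import random

job_data = {}

def parse_delay(content):
	# "500" -> (500, 500); "250-750" -> (250, 750), both in milliseconds.
	content = content.strip()
	if "-" in content:
		delay_min, delay_max = (int(s) for s in content.split("-", 1))
	else:
		delay_min = delay_max = int(content)
	return delay_min, delay_max

def wait_time():
	# Called per request by each concurrent fetcher, so the delay applies
	# per connection rather than globally; dividing by 1000 converts the
	# millisecond values into the seconds the crawler actually waits.
	return random.uniform(job_data["delay_min"], job_data["delay_max"]) / 1000

job_data["delay_min"], job_data["delay_max"] = parse_delay("250-750")
print(wait_time())  # e.g. 0.493... seconds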