Add --delay option
This commit is contained in:
parent
3c28b53620
commit
7ac5b07a99
@ -19,44 +19,50 @@ def print_version(ctx, param, value):
|
||||
@click.command()
|
||||
|
||||
@click.option('--concurrency', default=2, metavar='NUM',
|
||||
help='Use this many connections to fetch in parallel')
|
||||
help='Use this many connections to fetch in parallel (default: 2).')
|
||||
|
||||
@click.option('--concurrent', default=-1, metavar='NUM',
|
||||
help='Alias for --concurrency')
|
||||
help='Alias for --concurrency.')
|
||||
|
||||
@click.option('--delay', default="0", metavar='DELAY',
|
||||
help=
|
||||
'Time to wait between requests, in milliseconds (default: 0). '
|
||||
'Can be "NUM", or "MIN-MAX" to use a random delay between MIN and MAX '
|
||||
'for each request. Delay applies to each concurrent fetcher, not globally.')
|
||||
|
||||
@click.option('--recursive/--1', default=True,
|
||||
help=
|
||||
'--recursive (default: true) to crawl under last /path/ component '
|
||||
'recursively, or --1 to get just START_URL')
|
||||
'recursively, or --1 to get just START_URL.')
|
||||
|
||||
@click.option('--offsite-links/--no-offsite-links', default=False,
|
||||
help=
|
||||
'--offsite-links (default: true) to grab all links to a depth of 1 '
|
||||
'on other domains, or --no-offsite-links to disable')
|
||||
'on other domains, or --no-offsite-links to disable.')
|
||||
|
||||
@click.option('--igsets', default="", metavar='LIST',
|
||||
help='Comma-separated list of ignore sets to use in addition to "global"')
|
||||
help='Comma-separated list of ignore sets to use in addition to "global".')
|
||||
|
||||
@click.option('--ignore-sets', default="", metavar='LIST',
|
||||
help='Alias for --igsets')
|
||||
help='Alias for --igsets.')
|
||||
|
||||
@click.option('--level', default="inf", metavar='NUM',
|
||||
help='Recurse this many levels (default: inf)')
|
||||
help='Recurse this many levels (default: inf).')
|
||||
|
||||
@click.option('--page-requisites-level', default="5", metavar='NUM',
|
||||
help='Recursive this many levels for page requisites (default: 5)')
|
||||
help='Recursive this many levels for page requisites (default: 5).')
|
||||
|
||||
@click.option('--sitemaps/--no-sitemaps', default=True,
|
||||
help=
|
||||
'--sitemaps (default: true) to queue URLs from sitemap.xml '
|
||||
'at the root of the site, or --no-sitemaps to disable')
|
||||
'at the root of the site, or --no-sitemaps to disable.')
|
||||
|
||||
@click.option('--version', is_flag=True, callback=print_version,
|
||||
expose_value=False, is_eager=True, help='Print version and exit.')
|
||||
|
||||
@click.argument('start_url')
|
||||
|
||||
def main(concurrency, concurrent, recursive, offsite_links, igsets,
|
||||
def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
|
||||
ignore_sets, level, page_requisites_level, sitemaps, start_url):
|
||||
span_hosts_allow = "page-requisites,linked-pages"
|
||||
if not offsite_links:
|
||||
@ -98,7 +104,7 @@ ignore_sets, level, page_requisites_level, sitemaps, start_url):
|
||||
pass
|
||||
|
||||
with open("{}/delay".format(working_dir), "w") as f:
|
||||
f.write("0")
|
||||
f.write(delay)
|
||||
|
||||
LIBGRABSITE = os.path.dirname(libgrabsite.__file__)
|
||||
args = [
|
||||
|
@ -390,15 +390,21 @@ def exit_status(code):
|
||||
return code
|
||||
|
||||
|
||||
def update_delay_in_job_data():
    """Reload the delay range from the delay file into ``job_data``.

    The file named by ``delay_watcher.fname`` holds either a single
    integer ("NUM") or two integers joined by a dash ("MIN-MAX").  A
    single number sets both ``job_data["delay_min"]`` and
    ``job_data["delay_max"]`` to that value.

    Raises:
        ValueError: if any part of the file content is not an integer
            (or a "MIN-MAX" spec has an empty side).
    """
    with open(delay_watcher.fname, "r") as delay_file:
        spec = delay_file.read().strip()

    if "-" not in spec:
        fixed_delay = int(spec)
        job_data["delay_min"] = fixed_delay
        job_data["delay_max"] = fixed_delay
        return

    # Split only on the first dash; convert both halves before storing
    # either, so a malformed spec leaves the previous range untouched.
    low_text, high_text = spec.split("-", 1)
    low, high = int(low_text), int(high_text)
    job_data["delay_min"] = low
    job_data["delay_max"] = high
|
||||
|
||||
update_delay_in_job_data()
|
||||
|
||||
|
||||
def wait_time(_):
    """Return how long to wait before the next request.

    When ``delay_watcher`` reports that the delay file has changed, the
    range stored in ``job_data`` is refreshed through
    ``update_delay_in_job_data()`` before a value is drawn.  Any error
    during that refresh is printed and the previously loaded range stays
    in effect.

    Args:
        _: Ignored.

    Returns:
        A float drawn uniformly between ``job_data["delay_min"]`` and
        ``job_data["delay_max"]``, divided by 1000 (the ``--delay`` help
        text gives the stored values in milliseconds, so presumably
        this is seconds).
    """
    try:
        if delay_watcher.has_changed():
            # The helper re-reads and parses the delay file itself, so the
            # parsing logic is not repeated inline here.
            update_delay_in_job_data()
    except Exception:
        # Best effort: a bad or unreadable delay file must not abort the
        # crawl; report it and keep using the last good range.
        traceback.print_exc()
    return random.uniform(job_data["delay_min"], job_data["delay_max"]) / 1000
|
||||
|
Loading…
x
Reference in New Issue
Block a user