Add --delay option
parent 3c28b53620
commit 7ac5b07a99
@@ -19,44 +19,50 @@ def print_version(ctx, param, value):
 @click.command()
 
 @click.option('--concurrency', default=2, metavar='NUM',
-	help='Use this many connections to fetch in parallel')
+	help='Use this many connections to fetch in parallel (default: 2).')
 
 @click.option('--concurrent', default=-1, metavar='NUM',
-	help='Alias for --concurrency')
+	help='Alias for --concurrency.')
 
+@click.option('--delay', default="0", metavar='DELAY',
+	help=
+		'Time to wait between requests, in milliseconds (default: 0). '
+		'Can be "NUM", or "MIN-MAX" to use a random delay between MIN and MAX '
+		'for each request. Delay applies to each concurrent fetcher, not globally.')
+
 @click.option('--recursive/--1', default=True,
 	help=
 		'--recursive (default: true) to crawl under last /path/ component '
-		'recursively, or --1 to get just START_URL')
+		'recursively, or --1 to get just START_URL.')
 
 @click.option('--offsite-links/--no-offsite-links', default=False,
 	help=
 		'--offsite-links (default: true) to grab all links to a depth of 1 '
-		'on other domains, or --no-offsite-links to disable')
+		'on other domains, or --no-offsite-links to disable.')
 
 @click.option('--igsets', default="", metavar='LIST',
-	help='Comma-separated list of ignore sets to use in addition to "global"')
+	help='Comma-separated list of ignore sets to use in addition to "global".')
 
 @click.option('--ignore-sets', default="", metavar='LIST',
-	help='Alias for --igsets')
+	help='Alias for --igsets.')
 
 @click.option('--level', default="inf", metavar='NUM',
-	help='Recurse this many levels (default: inf)')
+	help='Recurse this many levels (default: inf).')
 
 @click.option('--page-requisites-level', default="5", metavar='NUM',
-	help='Recursive this many levels for page requisites (default: 5)')
+	help='Recursive this many levels for page requisites (default: 5).')
 
 @click.option('--sitemaps/--no-sitemaps', default=True,
 	help=
 		'--sitemaps (default: true) to queue URLs from sitemap.xml '
-		'at the root of the site, or --no-sitemaps to disable')
+		'at the root of the site, or --no-sitemaps to disable.')
 
 @click.option('--version', is_flag=True, callback=print_version,
 	expose_value=False, is_eager=True, help='Print version and exit.')
 
 @click.argument('start_url')
 
-def main(concurrency, concurrent, recursive, offsite_links, igsets,
+def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
 		ignore_sets, level, page_requisites_level, sitemaps, start_url):
 	span_hosts_allow = "page-requisites,linked-pages"
 	if not offsite_links:
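For reference, the DELAY grammar introduced above maps a string to a (min, max) millisecond pair, which wait_time() later samples uniformly (third hunk below). A standalone sketch of that mapping, with illustrative names that are not part of the diff:

import random

def parse_delay(spec):
	# "NUM" -> (NUM, NUM); "MIN-MAX" -> (MIN, MAX), in milliseconds.
	# Mirrors update_delay_in_job_data() in the third hunk.
	if "-" in spec:
		delay_min, delay_max = (int(s) for s in spec.split("-", 1))
	else:
		delay_min = delay_max = int(spec)
	return delay_min, delay_max

def sample_wait_seconds(spec):
	# Uniform sample in [min, max] ms, converted to seconds as in wait_time().
	delay_min, delay_max = parse_delay(spec)
	return random.uniform(delay_min, delay_max) / 1000

print(parse_delay("0"))         # (0, 0)
print(parse_delay("500-1500"))  # (500, 1500)

Note that split("-", 1) makes only the first hyphen the MIN/MAX separator, and per the help text the delay applies to each concurrent fetcher, so the total request rate still scales with --concurrency.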
@@ -98,7 +104,7 @@ 		ignore_sets, level, page_requisites_level, sitemaps, start_url):
 		pass
 
 	with open("{}/delay".format(working_dir), "w") as f:
-		f.write("0")
+		f.write(delay)
 
 	LIBGRABSITE = os.path.dirname(libgrabsite.__file__)
 	args = [
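The delay value is persisted to a `delay` file in the job's working directory rather than held only in memory, which is what makes it editable while a crawl runs. The diff does not show how delay_watcher is constructed; a hypothetical stand-in with the same fname/has_changed() interface, assuming a simple mtime check, might look like:

import os

class FileChangedWatcher:
	# Hypothetical stand-in for delay_watcher (not from this diff):
	# remembers the file's mtime and reports whether it has changed
	# since the last check.
	def __init__(self, fname):
		self.fname = fname
		self.last_mtime = os.stat(fname).st_mtime

	def has_changed(self):
		mtime = os.stat(self.fname).st_mtime
		changed = mtime != self.last_mtime
		self.last_mtime = mtime
		return changed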
@@ -390,15 +390,21 @@ def exit_status(code):
 	return code
 
 
+def update_delay_in_job_data():
+	with open(delay_watcher.fname, "r") as f:
+		content = f.read().strip()
+	if "-" in content:
+		job_data["delay_min"], job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
+	else:
+		job_data["delay_min"] = job_data["delay_max"] = int(content)
+
+update_delay_in_job_data()
+
+
 def wait_time(_):
 	try:
 		if delay_watcher.has_changed():
-			with open(delay_watcher.fname, "r") as f:
-				content = f.read().strip()
-			if "-" in content:
-				job_data["delay_min"], job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
-			else:
-				job_data["delay_min"] = job_data["delay_max"] = int(content)
+			update_delay_in_job_data()
 	except Exception:
 		traceback.print_exc()
 	return random.uniform(job_data["delay_min"], job_data["delay_max"]) / 1000
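Taken together, the second and third hunks mean the delay can be retuned mid-crawl: wait_time() consults the watcher on every request and re-reads the file only when it has changed. Assuming a job's working directory (the path below is hypothetical), retuning a live crawl is just a file write:

# Hypothetical working directory; slows a running crawl to a random
# 500-1500 ms pause per fetcher, picked up on the next request.
with open("/path/to/job-working-dir/delay", "w") as f:
	f.write("500-1500")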