Implement --igon / --igoff

This commit is contained in:
Ivan Kozik 2015-08-10 13:23:43 +00:00
parent 76ba117d34
commit ee4dbe162e
3 changed files with 14 additions and 3 deletions

View File

@ -131,6 +131,9 @@ Options can come before or after the URL.
Content-Length larger than N. (default: -1, don't skip anything). Can be changed during
the crawl by editing the `DIR/max_content_length` file.
* `--igon`: Print all URLs being ignored to the terminal and dashboard. Can be
changed during the crawl: `touch DIR/igoff` to turn this printing off, or `rm DIR/igoff` to turn it back on.
* `--level=N`: recurse `N` levels instead of `inf` levels.
* `--page-requisites-level=N`: recurse page requisites `N` levels instead of `5` levels.

View File

@ -46,6 +46,11 @@ def print_version(ctx, param, value):
@click.option('--ignore-sets', default="", metavar='LIST', @click.option('--ignore-sets', default="", metavar='LIST',
help='Alias for --igsets.') help='Alias for --igsets.')
@click.option('--igon/--igoff', default=False,
help=
'--igon (default: false) to print all URLs being ignored to the terminal '
'and dashboard.')
@click.option('--max-content-length', default=-1, metavar='N', @click.option('--max-content-length', default=-1, metavar='N',
help= help=
"Skip the download of any response that claims a Content-Length " "Skip the download of any response that claims a Content-Length "
@ -68,7 +73,7 @@ def print_version(ctx, param, value):
@click.argument('start_url') @click.argument('start_url')
def main(concurrency, concurrent, delay, recursive, offsite_links, igsets, def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
ignore_sets, level, page_requisites_level, max_content_length, sitemaps, ignore_sets, igon, level, page_requisites_level, max_content_length, sitemaps,
start_url): start_url):
span_hosts_allow = "page-requisites,linked-pages" span_hosts_allow = "page-requisites,linked-pages"
if not offsite_links: if not offsite_links:
@ -106,8 +111,9 @@ start_url):
with open("{}/igsets".format(working_dir), "w") as f: with open("{}/igsets".format(working_dir), "w") as f:
f.write("global,{}".format(igsets)) f.write("global,{}".format(igsets))
with open("{}/igoff".format(working_dir), "w") as f: if not igon:
pass with open("{}/igoff".format(working_dir), "w") as f:
pass
with open("{}/ignores".format(working_dir), "w") as f: with open("{}/ignores".format(working_dir), "w") as f:
pass pass

View File

@ -330,6 +330,8 @@ def update_igoff():
job_data["suppress_ignore_reports"] = igoff job_data["suppress_ignore_reports"] = igoff
return igoff return igoff
update_igoff()
def maybe_log_ignore(url, pattern): def maybe_log_ignore(url, pattern):
if not update_igoff(): if not update_igoff():