diff --git a/README.md b/README.md index 9fb7837..9b8939f 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,9 @@ Options can come before or after the URL. Content-Length larger than N. (default: -1, don't skip anything). Can be changed during the crawl by editing the `DIR/max_content_length` file. +* `--igon`: Print all URLs being ignored to the terminal and dashboard. Can be + changed during the crawl by `touch`ing or `rm`ing the `DIR/igoff` file. + * `--level=N`: recurse `N` levels instead of `inf` levels. * `--page-requisites-level=N`: recurse page requisites `N` levels instead of `5` levels. diff --git a/libgrabsite/main.py b/libgrabsite/main.py index 09c05c5..99c57a9 100644 --- a/libgrabsite/main.py +++ b/libgrabsite/main.py @@ -46,6 +46,11 @@ def print_version(ctx, param, value): @click.option('--ignore-sets', default="", metavar='LIST', help='Alias for --igsets.') +@click.option('--igon/--igoff', default=False, + help= + '--igon (default: false) to print all URLs being ignored to the terminal ' + 'and dashboard.') + @click.option('--max-content-length', default=-1, metavar='N', help= "Skip the download of any response that claims a Content-Length " @@ -68,7 +73,7 @@ def print_version(ctx, param, value): @click.argument('start_url') def main(concurrency, concurrent, delay, recursive, offsite_links, igsets, -ignore_sets, level, page_requisites_level, max_content_length, sitemaps, +ignore_sets, igon, level, page_requisites_level, max_content_length, sitemaps, start_url): span_hosts_allow = "page-requisites,linked-pages" if not offsite_links: @@ -106,8 +111,9 @@ start_url): with open("{}/igsets".format(working_dir), "w") as f: f.write("global,{}".format(igsets)) - with open("{}/igoff".format(working_dir), "w") as f: - pass + if not igon: + with open("{}/igoff".format(working_dir), "w") as f: + pass with open("{}/ignores".format(working_dir), "w") as f: pass diff --git a/libgrabsite/wpull_hooks.py b/libgrabsite/wpull_hooks.py index d71f0dd..ff05e1c 100644 --- a/libgrabsite/wpull_hooks.py +++ b/libgrabsite/wpull_hooks.py @@ -330,6 +330,8 @@ def update_igoff(): job_data["suppress_ignore_reports"] = igoff return igoff +update_igoff() + def maybe_log_ignore(url, pattern): if not update_igoff():