From b7c2f1d1bd81fa383c557534de633f38e5e4f5f4 Mon Sep 17 00:00:00 2001 From: Ivan Kozik Date: Mon, 27 Jul 2015 06:55:20 +0000 Subject: [PATCH] Add --sitemaps/--no-sitemaps --- README.md | 2 ++ libgrabsite/main.py | 21 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e58e74c..f26fddc 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,8 @@ Options can come before or after the URL. * `--page-requisites-level=N`: recurse page requisites `N` levels instead of `5` levels. +* `--no-sitemaps`: don't queue URLs from `sitemap.xml` at the root of the site. + * `--help`: print help text. diff --git a/libgrabsite/main.py b/libgrabsite/main.py index c2b81c0..d0978ae 100644 --- a/libgrabsite/main.py +++ b/libgrabsite/main.py @@ -8,28 +8,43 @@ import click import libgrabsite @click.command() + @click.option('--concurrency', default=2, metavar='NUM', help='Use this many connections to fetch in parallel') + @click.option('--concurrent', default=None, metavar='NUM', type=int, help='Alias for --concurrency') + @click.option('--recursive/--1', default=True, help= '--recursive (default: true) to crawl under last /path/ component ' 'recursively, or --1 to get just START_URL') + @click.option('--offsite-links/--no-offsite-links', default=False, help= '--offsite-links (default: true) to grab all links to a depth of 1 ' 'on other domains, or --no-offsite-links to disable') + @click.option('--igsets', default="", metavar='LIST', help='Comma-separated list of ignore sets to use in addition to "global"') + @click.option('--ignore-sets', default="", metavar='LIST', help='Alias for --igsets') + @click.option('--level', default="inf", metavar='NUM', help='Recurse this many levels (default: inf)') + @click.option('--page-requisites-level', default="5", metavar='NUM', help='Recursive this many levels for page requisites (default: 5)') + +@click.option('--sitemaps/--no-sitemaps', default=True, + help= + '--sitemaps (default: true) to queue URLs from sitemap.xml ' + 'at the root of the site, or --no-sitemaps to disable') + @click.argument('start_url') -def main(concurrency, concurrent, recursive, offsite_links, igsets, ignore_sets, level, page_requisites_level, start_url): + +def main(concurrency, concurrent, recursive, offsite_links, igsets, ignore_sets, level, page_requisites_level, sitemaps, start_url): span_hosts_allow = "page-requisites,linked-pages" if not offsite_links: span_hosts_allow = "page-requisites" @@ -82,7 +97,6 @@ def main(concurrency, concurrent, recursive, offsite_links, igsets, ignore_sets, "--no-robots", "--page-requisites", "--no-parent", - "--sitemaps", "--inet4-only", "--timeout", "20", "--tries", "3", @@ -102,6 +116,9 @@ def main(concurrency, concurrent, recursive, offsite_links, igsets, ignore_sets, "--quiet", ] + if sitemaps: + args += ["--sitemaps"] + if recursive: args += ["--recursive"]