Add --sitemaps/--no-sitemaps
This commit is contained in:
parent
2e7d928614
commit
b7c2f1d1bd
@ -119,6 +119,8 @@ Options can come before or after the URL.
|
||||
|
||||
* `--page-requisites-level=N`: recurse page requisites `N` levels instead of `5` levels.
|
||||
|
||||
* `--no-sitemaps`: don't queue URLs from `sitemap.xml` at the root of the site.
|
||||
|
||||
* `--help`: print help text.
|
||||
|
||||
|
||||
|
@ -8,28 +8,43 @@ import click
|
||||
import libgrabsite
|
||||
|
||||
@click.command()
|
||||
|
||||
@click.option('--concurrency', default=2, metavar='NUM',
|
||||
help='Use this many connections to fetch in parallel')
|
||||
|
||||
@click.option('--concurrent', default=None, metavar='NUM', type=int,
|
||||
help='Alias for --concurrency')
|
||||
|
||||
@click.option('--recursive/--1', default=True,
|
||||
help=
|
||||
'--recursive (default: true) to crawl under last /path/ component '
|
||||
'recursively, or --1 to get just START_URL')
|
||||
|
||||
@click.option('--offsite-links/--no-offsite-links', default=False,
|
||||
help=
|
||||
'--offsite-links (default: false) to grab all links to a depth of 1 '
|
||||
'on other domains, or --no-offsite-links to disable')
|
||||
|
||||
@click.option('--igsets', default="", metavar='LIST',
|
||||
help='Comma-separated list of ignore sets to use in addition to "global"')
|
||||
|
||||
@click.option('--ignore-sets', default="", metavar='LIST',
|
||||
help='Alias for --igsets')
|
||||
|
||||
@click.option('--level', default="inf", metavar='NUM',
|
||||
help='Recurse this many levels (default: inf)')
|
||||
|
||||
@click.option('--page-requisites-level', default="5", metavar='NUM',
|
||||
help='Recurse this many levels for page requisites (default: 5)')
|
||||
|
||||
@click.option('--sitemaps/--no-sitemaps', default=True,
|
||||
help=
|
||||
'--sitemaps (default: true) to queue URLs from sitemap.xml '
|
||||
'at the root of the site, or --no-sitemaps to disable')
|
||||
|
||||
@click.argument('start_url')
|
||||
def main(concurrency, concurrent, recursive, offsite_links, igsets, ignore_sets, level, page_requisites_level, start_url):
|
||||
|
||||
def main(concurrency, concurrent, recursive, offsite_links, igsets, ignore_sets, level, page_requisites_level, sitemaps, start_url):
|
||||
span_hosts_allow = "page-requisites,linked-pages"
|
||||
if not offsite_links:
|
||||
span_hosts_allow = "page-requisites"
|
||||
@ -82,7 +97,6 @@ def main(concurrency, concurrent, recursive, offsite_links, igsets, ignore_sets,
|
||||
"--no-robots",
|
||||
"--page-requisites",
|
||||
"--no-parent",
|
||||
"--sitemaps",
|
||||
"--inet4-only",
|
||||
"--timeout", "20",
|
||||
"--tries", "3",
|
||||
@ -102,6 +116,9 @@ def main(concurrency, concurrent, recursive, offsite_links, igsets, ignore_sets,
|
||||
"--quiet",
|
||||
]
|
||||
|
||||
if sitemaps:
|
||||
args += ["--sitemaps"]
|
||||
|
||||
if recursive:
|
||||
args += ["--recursive"]
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user