Add --1 option for turning off recursion; document options

This commit is contained in:
Ivan Kozik 2015-07-20 08:23:35 +00:00
parent e83375382d
commit 1fce3af4a0
3 changed files with 21 additions and 6 deletions

View File

@ -83,15 +83,26 @@ Then, start as many crawls as you want with:
```
~/.local/bin/grab-site URL
~/.local/bin/grab-site URL --igsets=blogs,forums
~/.local/bin/grab-site URL --igsets=blogs,forums --no-offsite-links
```
Run these inside tmux unless the crawls are very short.
Note: `URL` must come before the options.
Options:
Note: `--igsets=` means "ignore sets" and must have the `=`.
* `--igsets=blogs,forums`: use ignore sets `blogs` and `forums`.
Example: `~/.local/bin/grab-site URL --igsets=blogs,forums`
Note: `--igsets` must be followed by `=`, not by a space.
* `--no-offsite-links`: avoid following links to a depth of 1 on other domains.
* `--1`: grab just `URL` and page requisites without recursing.
* `--level=N`: recurse `N` levels deep instead of the default `inf` (no limit).
```
Note: `URL` must always come **before** the options.
`forums` and `blogs` are some frequently-used ignore sets.
See [the full list of available ignore sets](https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns).

View File

@ -14,6 +14,7 @@ level="inf"
# Default crawl settings; the argument loop below may override some of them.
# NOTE(review): where $concurrency is consumed is not visible here - confirm.
concurrency='2'
# Forwarded to patched-wpull as --page-requisites-level.
page_requisites_level='5'
# Forwarded to patched-wpull as --span-hosts-allow.
span_hosts_allow='page-requisites,linked-pages'
# Expanded unquoted on the patched-wpull command line, so when --1 sets it
# to "" the flag vanishes instead of being passed as an empty argument.
recursive='--recursive'
for arg in "$@"; do
case $arg in
@ -23,6 +24,9 @@ for arg in "$@"; do
--igsets=*)
igsets="${arg#*=}"
;;
--1)
recursive=""
;;
--level=*)
level="${arg#*=}"
;;
@ -85,7 +89,7 @@ GRAB_SITE_WORKING_DIR="$dir" "$self/patched-wpull" \
--monitor-disk 400m \
--monitor-memory 10k \
--max-redirect 8 \
--recursive \
$recursive \
--level "$level" \
--page-requisites-level "$page_requisites_level" \
--span-hosts-allow "$span_hosts_allow" \

View File

@ -1 +1 @@
__version__ = '0.3.0'
__version__ = '0.4.0'