From 1fce3af4a0a9a53c00a011ea0612dfae7c00623d Mon Sep 17 00:00:00 2001 From: Ivan Kozik Date: Mon, 20 Jul 2015 08:23:35 +0000 Subject: [PATCH] Add --1 option for turning off recursion; document options --- README.md | 19 +++++++++++++++---- grab-site | 6 +++++- libgrabsite/__init__.py | 2 +- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0392f2b..f49b69d 100644 --- a/README.md +++ b/README.md @@ -83,15 +83,26 @@ Then, start as many crawls as you want with: ``` ~/.local/bin/grab-site URL -~/.local/bin/grab-site URL --igsets=blogs,forums -~/.local/bin/grab-site URL --igsets=blogs,forums --no-offsite-links ``` Do this inside tmux unless they're very short crawls. -Note: `URL` must come before the options. +Options: -Note: `--igsets=` means "ignore sets" and must have the `=`. +* `--igsets=blogs,forums`: use ignore sets `blogs` and `forums`. + + Example: `~/.local/bin/grab-site URL --igsets=blogs,forums` + + Note: `--igsets` must be followed by `=`, not a space. + +* `--no-offsite-links`: avoid following links to a depth of 1 on other domains. + +* `--1`: grab just `URL` and page requisites without recursing. + +* `--level=N`: recurse `N` levels instead of `inf` levels. + + +Note: `URL` must always come **before** the options. `forums` and `blogs` are some frequently-used ignore sets. See [the full list of available ignore sets](https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns). 
diff --git a/grab-site b/grab-site index b5757a9..dbfe3d4 100755 --- a/grab-site +++ b/grab-site @@ -14,6 +14,7 @@ level="inf" concurrency="2" page_requisites_level="5" span_hosts_allow="page-requisites,linked-pages" +recursive="--recursive" for arg in "$@"; do case $arg in @@ -23,6 +24,9 @@ for arg in "$@"; do --igsets=*) igsets="${arg#*=}" ;; + --1) + recursive="" + ;; --level=*) level="${arg#*=}" ;; @@ -85,7 +89,7 @@ GRAB_SITE_WORKING_DIR="$dir" "$self/patched-wpull" \ --monitor-disk 400m \ --monitor-memory 10k \ --max-redirect 8 \ - --recursive \ + $recursive \ --level "$level" \ --page-requisites-level "$page_requisites_level" \ --span-hosts-allow "$span_hosts_allow" \ diff --git a/libgrabsite/__init__.py b/libgrabsite/__init__.py index 0404d81..abeeedb 100644 --- a/libgrabsite/__init__.py +++ b/libgrabsite/__init__.py @@ -1 +1 @@ -__version__ = '0.3.0' +__version__ = '0.4.0'