OnionSpider/ONION_SEARCH.php.example

<?php namespace localhost;
# OnionSearch configuration file
# Copyright (C) 2015 y.st. <mailto:copyright@y.st>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
const ONION_SEARCH = array(
    // Main spider settings:
    'MAIN' => array(
        // (string) The name of the table containing hashes of blacklisted onion addresses
        'BLACKLIST' => '',
        // (boolean) Should we print to the command line as the script runs?
        'DEBUG' => true,
        // (string) The name of the table used to store uncrawlable sites
        'NOCRAWL' => '',
        // (string) The name of the table used to store crawlable sites
        'TOCRAWL' => '',
        // (integer) The number of seconds to wait before re-crawling a page
        'REFRESHDELAY' => 30*24*60*60,
    ),
    // (array) Settings for \DBO instantiation (database-specific):
    // This array is passed to \DBO::__construct() using the "..."
    // argument-unpacking operator; a hedged usage sketch follows this array.
    'DBO' => array(),
    // Settings for \st\y\curl_limit use:
    'LIMIT' => array(
        // (boolean) Should curl_limit dump way too much information to the command line?
        'DEBUG' => false,
        // (integer) The maximum file size, in bytes, that the spider will download
        'LIMIT' => 1024*1024,
    ),
    // Settings for \st\y\remote_files use:
    'REMOTE' => array(
        // (string) The user agent that the spider should identify as
        'USERAGENT' => 'I did not set my OnionSpider User-Agent string.',
        // (integer) The number of seconds to wait before giving up on a Web page
        'TIMEOUT' => 60*60,
    ),
);
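
// A minimal, commented-out sketch of how these settings might be
// consumed. It is illustrative only: \DBO is assumed to take PDO-style
// constructor arguments, and the cURL options below are a stand-in for
// whatever \st\y\curl_limit and \st\y\remote_files do internally.
// $config, $db, $curl and the example URI are hypothetical names.
/*
$config = ONION_SEARCH;

// The 'DBO' array is unpacked with the "..." operator, so its entries
// must match the constructor's parameters in order, e.g.
// array('mysql:host=localhost;dbname=onions', 'user', 'password').
$db = new \DBO(...$config['DBO']);

// Applying the 'REMOTE' and 'LIMIT' settings via stock cURL options:
$curl = \curl_init('http://example32example.onion/');
\curl_setopt_array($curl, array(
    \CURLOPT_RETURNTRANSFER => true,
    \CURLOPT_USERAGENT => $config['REMOTE']['USERAGENT'],
    \CURLOPT_TIMEOUT => $config['REMOTE']['TIMEOUT'],
    // Refuse oversized downloads; note that CURLOPT_MAXFILESIZE only
    // takes effect when the server sends a Content-Length header.
    \CURLOPT_MAXFILESIZE => $config['LIMIT']['LIMIT'],
));
$page = \curl_exec($curl);
\curl_close($curl);
*/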
/**
* Databases are formatted as follows:
*
* ONIONSEARCH_BLACKLIST is a table containing a single column: `hash`.
* Every row in this table is a 40-character string found by taking the
* hexadecimal (base-16) SHA-1 hash of a blacklisted onion address.
* This allows you to
* blacklist onion addresses without needing to store the onion address
* in a retrievable way. Entries in this table are found as
* \sha1('<sixteen characters>.onion.') and blacklisting any onion
* address also blacklists all of its subdomains. Blacklisting only a
* subdomain will not work, as hashes are compared only against the
* main domain. For example, if you add
* \sha1('example.example32example.onion.') to this table, no domain at
* all will be blacklisted, but if you add
* \sha1('example32example.onion.') to this table,
* 'example.example32example.onion' and 'example32example.onion' would
* both be blacklisted.
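*
* A hedged sketch of that check (the variable names are hypothetical;
* the table name is whatever the 'BLACKLIST' setting above names):
*
*     $labels = \explode('.', \rtrim($host, '.'));
*     $main = \implode('.', \array_slice($labels, -2)) . '.';
*     $hash = \sha1($main); // e.g. \sha1('example32example.onion.')
*     // Skip $host entirely if $hash appears in the blacklist table.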
*
* ONIONSEARCH_NOCRAWL is a table with two fields: `uri` and
* `primary_key`. `uri` stores the URI of a site that we want to know
* about even though we believe that attempting to crawl it will not help
* us find other sites. `primary_key` is not used by OnionSpider, but
* should be an auto-incrementing, unsigned integer.
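*
* A possible MySQL definition, offered as a sketch rather than the
* project's canonical schema ($db is a hypothetical PDO-style handle
* and `nocrawl` stands in for whatever the 'NOCRAWL' setting names):
*
*     $db->exec('CREATE TABLE `nocrawl` (
*         `primary_key` INT UNSIGNED NOT NULL AUTO_INCREMENT,
*         `uri` TEXT NOT NULL,
*         PRIMARY KEY (`primary_key`)
*     )');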
*
* ONIONSEARCH_TOCRAWL is a table containing four columns: `uri`,
* `title`, `timestamp` and `primary_key`. If a <title/> exists on a
* site's main index page, it will be stored in the `title` column. If
* not, the URI of the page will be stored as the `title`. The `uri`
* column always stores the URI of the site's main index page. The
* `timestamp` column represents the last time that the site was
* crawled, to allow us to avoid recrawling a site too soon.
* `primary_key` is not used by OnionSpider, but exists because MySQL
* does not allow variable-width fields, such as `uri`,
* to be used as a key. It should be an auto-incrementing unsigned
* integer.
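*
* Again as a hedged sketch, with `tocrawl` standing in for the
* configured table name and `timestamp` assumed to be stored as a
* Unix timestamp in seconds:
*
*     $db->exec('CREATE TABLE `tocrawl` (
*         `primary_key` INT UNSIGNED NOT NULL AUTO_INCREMENT,
*         `uri` TEXT NOT NULL,
*         `title` TEXT NOT NULL,
*         `timestamp` INT UNSIGNED NOT NULL,
*         PRIMARY KEY (`primary_key`)
*     )');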
**/