<?php
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

const ONION_SEARCH = array(
    // Main spider settings:
    'MAIN' => array(
        // (string) The name of the table containing hashes of blacklisted onion addresses
        'BLACKLIST' => '',
        // (boolean) Should we print to the command line as the script runs?
        'DEBUG' => true,
        // (string) The name of the table used to store uncrawlable sites
        'NOCRAWL' => '',
        // (string) The name of the table used to store crawlable sites
        'TOCRAWL' => '',
        // (integer) The number of seconds to wait before re-crawling a page
        'REFRESHDELAY' => 30*24*60*60,
    ),
    // (array) Settings for \DBO instantiation (database-specific):
    // This array will be passed into the \DBO::__construct() function
    // using the "..." operator.
    'DBO' => array(),
    // Settings for \st\y\curl_limit use:
    'LIMIT' => array(
        // (boolean) Should curl_limit dump way too much information to the command line?
        'DEBUG' => false,
        // (integer) The maximum file size that the spider will download
        'LIMIT' => 1024*1024,
    ),
    // Settings for \st\y\remote_files use:
    'REMOTE' => array(
        // (string) The user agent that the spider should identify as
        'USERAGENT' => 'I did not set my OnionSpider User-Agent string.',
        // (integer) The number of seconds to wait before giving up on a Web page
        'TIMEOUT' => 60*60,
    ),
);

/**
 * Databases are formatted as follows:
 *
 * ONIONSEARCH_BLACKLIST is a table containing a single column: `hash`.
 * Every row in this table is a 40-character string found by taking the
 * base-16 SHA-1 hash of a blacklisted onion address. This allows you to
 * blacklist onion addresses without needing to store the onion address
 * in a retrievable way. Entries in this table are found as
 * \sha1('<domain>.onion.'), and blacklisting any onion address also
 * blacklists all of its subdomains. Blacklisting only a subdomain will
 * not work, as hashes are compared only against the main domain. For
 * example, if you add \sha1('example.example32example.onion.') to this
 * table, no domain at all will be blacklisted, but if you add
 * \sha1('example32example.onion.') to this table,
 * 'example.example32example.onion' and 'example32example.onion' would
 * both be blacklisted. (A hashing sketch follows this comment block.)
 *
 * ONIONSEARCH_NOCRAWL is a table with two columns: `uri` and
 * `primary_key`. `uri` stores the URI of a site that we want to know
 * about, but which we believe attempting to crawl will not help us
 * find other sites. `primary_key` is not used by OnionSpider, but
 * should be an auto-incrementing, unsigned integer.
 *
 * ONIONSEARCH_TOCRAWL is a table containing four columns: `uri`,
 * `title`, `timestamp` and `primary_key`. If a <title> exists on a
 * site's main index page, it will be stored in the `title` column. If
 * not, the URI of the page will be stored as the `title`. The `uri`
 * column always stores the URI of the site's main index page. The
 * `timestamp` column represents the last time that the site was
 * crawled, to allow us to avoid recrawling a site too soon.
 * `primary_key` is not used by OnionSpider, but is used by MySQL
 * because MySQL does not allow variable-width fields, such as `uri`,
 * to be used as a key. It should be an auto-incrementing unsigned
 * integer.
 **/
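
// The following is a minimal sketch of the blacklist hashing described
// in the docblock above. The helper name onionspider_blacklist_hash()
// is hypothetical and not part of OnionSpider, but the hash format
// follows the docblock: the SHA-1 of the main onion domain followed by
// a trailing dot.
function onionspider_blacklist_hash(string $host): string
{
    // Drop any trailing dot, then keep only the last two labels so that
    // 'example.example32example.onion' and 'example32example.onion'
    // reduce to the same main domain (subdomains share one hash).
    $labels = explode('.', rtrim($host, '.'));
    $main   = implode('.', array_slice($labels, -2));

    // 40-character base-16 digest, e.g. sha1('example32example.onion.'),
    // suitable for comparison against the `hash` column of the table
    // named by ONION_SEARCH['MAIN']['BLACKLIST'].
    return sha1($main . '.');
}

// Usage: both calls below yield the same hash, so blacklisting a main
// domain also blacklists every one of its subdomains.
// onionspider_blacklist_hash('example.example32example.onion');
// onionspider_blacklist_hash('example32example.onion');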
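
// A hedged sketch of MySQL table definitions matching the layouts the
// docblock describes. The table names here are placeholders for
// whatever you configure in ONION_SEARCH['MAIN'], and the column types
// are assumptions; only the column names, the 40-character hash width,
// and the auto-incrementing unsigned `primary_key` come from the
// docblock.
const ONION_SEARCH_EXAMPLE_DDL = array(
    "CREATE TABLE `onionsearch_blacklist` (
        `hash` CHAR(40) NOT NULL,           -- base-16 SHA-1 of '<domain>.onion.'
        PRIMARY KEY (`hash`)
    )",
    "CREATE TABLE `onionsearch_nocrawl` (
        `uri` TEXT NOT NULL,                -- site worth knowing about, not worth crawling
        `primary_key` INT UNSIGNED NOT NULL AUTO_INCREMENT,
        PRIMARY KEY (`primary_key`)
    )",
    "CREATE TABLE `onionsearch_tocrawl` (
        `uri` TEXT NOT NULL,                -- URI of the site's main index page
        `title` TEXT NOT NULL,              -- <title> of the index page, or the URI
        `timestamp` INT UNSIGNED NOT NULL,  -- last crawl time, compared against REFRESHDELAY
        `primary_key` INT UNSIGNED NOT NULL AUTO_INCREMENT,
        PRIMARY KEY (`primary_key`)
    )",
);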