#!/usr/bin/php
<?php

/**
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 **/

/**
 * REMEMBER TO ADD AT LEAST ONE URI TO YOUR INITIAL DATABASE.
 * IF YOU DO NOT, THE SPIDER WILL NOT GO ANYWHERE.
 * After the first crawl, or rather, after the first URI you add,
 * manually adding URIs will no longer be necessary.
 **/

use st\y\curl_limit, st\y\remote_files, st\y\robots_txt, st\y\uri;
use function st\y\gopher, st\y\hash_uri_onion, st\y\parse_seconds;
use const st\y\CURLOPT_TOR, st\y\CURLOPT_TLS_ENCRYPTONLY, st\y\VERSION, localhost\ONION_SEARCH;

spl_autoload_register();

require 'st/y/const/CURLOPT_TOR.php';
require 'st/y/const/CURLOPT_TLS_ENCRYPTONLY.php';
require 'st/y/const/VERSION.php';
require 'st/y/function/error_handler.php';
require 'st/y/function/gopher.php';
require 'st/y/function/hash_uri_onion.php';
require 'st/y/function/parse_seconds.php';
require 'localhost/const/ONION_SEARCH.php';

set_error_handler('\\st\\y\\error_handler');

/**
 * We need to be sure that the functions and classes used by this
 * spider are present. If the library version is too old to contain
 * them, we need to tell the user why the needed constructs are
 * missing and where the needed library can be downloaded.
 **/
if(version_compare(VERSION['include.d'], '0.0.1.1', '<')):
	throw new LogicException('Your include.d is too old. You need version 0.0.1.1 or above. You may find an update to download at .');
endif;

// The file-downloading object
$cURL = new remote_files(array(
	CURLOPT_FOLLOWLOCATION => false,
	/**
	 * The PHP documentation for curl_setopt() says that the
	 * \CURLOPT_NOPROGRESS setting should only be set to false for
	 * debugging. *THIS IS A LIE.* When this is set to false, the function
	 * specified by the \CURLOPT_PROGRESSFUNCTION setting is actually
	 * called. However, if the \CURLOPT_NOPROGRESS setting is *not* set to
	 * false, the function specified by the \CURLOPT_PROGRESSFUNCTION
	 * setting is *not* called, meaning that we have no way to terminate a
	 * runaway download.
	 **/
	CURLOPT_NOPROGRESS => false,
	CURLOPT_PROGRESSFUNCTION => new curl_limit(ONION_SEARCH['LIMIT']['LIMIT'], ONION_SEARCH['LIMIT']['DEBUG']),
	CURLOPT_USERAGENT => ONION_SEARCH['REMOTE']['USERAGENT'],
	CURLOPT_TIMEOUT => ONION_SEARCH['REMOTE']['TIMEOUT'],
	CURLOPT_PROTOCOLS => CURLPROTO_HTTPS|CURLPROTO_HTTP|CURLPROTO_GOPHER,
));

// We need to use Tor to retrieve files from onion space.
$cURL->setopt_array(CURLOPT_TOR);

// Use TLS for encryption only, not for identity verification.
$cURL->setopt_array(CURLOPT_TLS_ENCRYPTONLY);

// These four constants are easier to use in strings if they are variables.
$blacklist = ONION_SEARCH['MAIN']['BLACKLIST'];
$nocrawl = ONION_SEARCH['MAIN']['NOCRAWL'];
$tocrawl = ONION_SEARCH['MAIN']['TOCRAWL'];
$useragent = ONION_SEARCH['REMOTE']['USERAGENT'];

// The database reading/writing object
$PDO = new PDO(...ONION_SEARCH['DBO']);

/**
 * The documentation insists that using \PDO::prepare() is better than
 * using \PDO::query(). I suppose that we can comply. Honestly, the
 * documentation is probably right; I am just frustrated that I had to
 * rewrite my code when moving away from the \mysqli class. The \mysqli
 * class also has this functionality, but its documentation does not
 * seem to dissuade people from using regular-style queries.
 **/
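/**
 * A minimal usage sketch (illustrative only, not part of the original
 * code): each prepared statement below is run by passing an array of
 * named parameters to execute(), read with fetchAll(), and released
 * with closeCursor() so that it can be reused on the next iteration.
 * The URI value here is a made-up placeholder.
 *
 *     $query['in']['tocrawl']->execute(array(':URI' => 'http://example.onion/'));
 *     $rows = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
 *     $exists = $rows[0]->{'COUNT(*)'} != '0';
 *     $query['in']['tocrawl']->closeCursor();
 **/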
$query = array(
	'in' => array(
		'tocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = :URI"),
		'nocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = :URI"),
		'blacklist' => $PDO->prepare("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = :hash"),
	),
	'add' => array(
		'tocrawl' => $PDO->prepare("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES (:URI,:title,'0')"),
		'nocrawl' => $PDO->prepare("INSERT INTO `$nocrawl` (`uri`) VALUES (:URI)"),
	),
	'list' => array(
		'crawl' => $PDO->prepare("SELECT `uri` FROM `$tocrawl` WHERE `timestamp` < :time ORDER BY `timestamp` ASC"),
	),
	'remove' => array(
		'tocrawl' => $PDO->prepare("DELETE FROM `$tocrawl` WHERE `uri` = :URI"),
	),
	'update' => array(
		'title' => $PDO->prepare("UPDATE `$tocrawl` SET `title` = :title WHERE `uri` = :URI"),
		'timestamp' => $PDO->prepare("UPDATE `$tocrawl` SET `timestamp` = :time WHERE `uri` = :URI"),
	),
);

// Set up the document parser.
$DOMDocument = new DOMDocument();

/**
 * Prevent malformed Web pages from killing the spider.
 * We will need to clear the errors as they come up to prevent them
 * from filling the buffer though.
 **/
$DOMDocument->recover = true;
libxml_use_internal_errors(true);

// This allows us to loop until we are done, then stop.
$continue_loop = true;

while($continue_loop):
	/**
	 * If we have URIs to process, this will be set to true immediately in
	 * the second nested loop. If this loop finishes without executing the
	 * contained loop, it means that we must not have had any URIs to
	 * process.
	 **/
	$continue_loop = false;

	/**
	 * For debugging purposes, it can be useful to start this script on a
	 * particular URI. When this script finds URIs on a Web page, it checks
	 * them for validity and properly discards them if invalid. However,
	 * when given a URI via the command line, it assumes that this URI is
	 * valid and will abort otherwise. This is a feature, not a bug. If you
	 * are trying to debug a particular page and you accidentally specify a
	 * bad URI, continuing despite the mistake is the incorrect behavior.
	 *
	 * Furthermore, it is assumed that the URI specified this way is part
	 * of a site that already exists in the database. This tool is not a
	 * way to add URIs, it is a way to test bugs in the spider. If a
	 * particular page crashed the spider, this feature allows you to start
	 * with the crashing page so that you don't have to wait for the spider
	 * to find that page again in order to test a fix in the spider code.
	 **/
	if(isset($argv[1])):
		$DEBUG_URI = new uri($argv[1]);
		$URIlist = array(
			(object) array(
				'uri' => new uri('/', $DEBUG_URI),
			),
		);
		// We don't want to run this code a second time, so let's remove the trigger.
		unset($argv[1]);
	else:
		$query['list']['crawl']->execute(array(
			':time' => time()-ONION_SEARCH['MAIN']['REFRESHDELAY'],
		));
		$URIlist = $query['list']['crawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
		$query['list']['crawl']->closeCursor();
	endif;
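	/**
	 * Illustrative invocation only; the script name and the onion address
	 * below are made-up placeholders, not values taken from this project:
	 *
	 *     ./spider.php 'http://exampleonionaddress.onion/some/page.html'
	 *
	 * Run the script with no arguments for a normal crawl of every site
	 * already recorded in the database.
	 **/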
	foreach($URIlist as $uri):
		/**
		 * This time stamp is used to estimate how much time is remaining in
		 * the processing of a given site. For a more accurate estimation, we
		 * should probably move this closer to the loop below, but I would
		 * rather have the estimate be a little high than have it be a little
		 * low.
		 **/
		$starttime = time();
		$site_URI = new uri($uri->uri);

		// We have URIs to process. Set this to true.
		$continue_loop = true;

		// If the onion address has been blacklisted, we should ignore it.
		$hash = array(
			':hash' => hash_uri_onion($site_URI),
		);
		$query['in']['blacklist']->execute($hash);
		$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
		$query['in']['blacklist']->closeCursor();
		$blacklisted = $result[0]->{'COUNT(*)'} != '0';

		// If the onion address has been blacklisted, we should remove it from the database.
		if($blacklisted):
			$query['remove']['tocrawl']->execute($hash);
			$query['remove']['tocrawl']->closeCursor();
			continue;
		endif;

		// Different types of URI need to be handled differently.
		switch($site_URI->scheme):
			case 'https':
			case 'http':
				$robotstxt_URI = new uri('/robots.txt', $site_URI);
				break;
			case 'gopher':
				$robotstxt_URI = new uri('/0/robots.txt', $site_URI);
				break;
			default:
				throw new RuntimeException("$useragent does not know how to crawl \"$site_URI->scheme\" URIs, URI \"$site_URI\" found in table \"$tocrawl\".");
		endswitch;

		// Information about the current site will be held in a pair of arrays.
		$done = array();
		$queue = array($site_URI);

		/**
		 * If we are debugging with a particular URI, we should make sure that
		 * that URI is at the front of the queue.
		 **/
		if(isset($DEBUG_URI)):
			$queue[] = $DEBUG_URI;
			// We don't want to run this code a second time, so let's remove the trigger.
			unset($DEBUG_URI);
		endif;

		// The spider should respect the wishes of the webmaster.
		if(ONION_SEARCH['MAIN']['DEBUG']):
			echo "Checking \"robots.txt\" file at <$robotstxt_URI>\n";
		endif;
		$page = $cURL->get($robotstxt_URI);
		if($page === false):
			// Using blank data when no data is available allows us to assume that
			// we actually have data.
			$robotstxt = new robots_txt('');
		else:
			$robotstxt = new robots_txt($page);
		endif;
		foreach($robotstxt->sitemaps as $sitemap):
			$queue[] = new uri($sitemap, $site_URI);
		endforeach;

		while(null !== ($URI = array_shift($queue))):
			$done[] = $URI;

			// The spider should respect the wishes of the webmaster.
			if($robotstxt->disallows(ONION_SEARCH['REMOTE']['USERAGENT'], $URI)):
				continue;
			endif;

			// Retrieve the page.
			if(ONION_SEARCH['MAIN']['DEBUG']):
				echo "Working with page at URI <$URI>\n";
			endif;
			$page = $cURL->get($URI);

			// If the page is empty, there is no point in parsing it.
			if(!empty($page)):
				$DOMDocument->loadXML($page);
				// Those lazy fools wrote bad markup and put errors in our buffer.
				// We should clear them.
				libxml_clear_errors();

				/**
				 * If we set the document URI, the base URI resolver is able to handle
				 * <base> tags that specify a relative base.
				 **/
				$DOMDocument->documentURI = $URI;

				/**
				 * This is just a little witchery to find the base URI only once
				 * instead of having to find it again for each <a> tag. If we used the
				 * "baseURI" property of every <a> tag, we would need to instantiate
				 * the \st\y\uri class for each hyperlink if a <base> tag were
				 * present, as well as check to see if a "baseURI" property were set
				 * for each <a> tag. It is much more efficient to check once per page
				 * and perform the needed setup once per page.
				 **/
				$base_tag = $DOMDocument->getElementsByTagName('base');
				if($base_tag->length and ($baseURI = $base_tag->item(0)->baseURI)):
					$base = new uri($baseURI);
				else:
					$base = $URI;
				endif;
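				/**
				 * <loc> elements come from any sitemap files that were queued from
				 * the "robots.txt" file above; <loc> is the sitemap protocol's
				 * location element, not an HTML tag, so ordinary Web pages are not
				 * expected to contain it.
				 **/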
				foreach($DOMDocument->getElementsByTagName('loc') as $LOC):
					// We need to make relative URIs absolute.
					$new_URI = new uri($LOC->textContent, $base);
					// We also need to strip away the fragment, if any.
					unset($new_URI->fragment);
					$new_URI_site = new uri('/', $new_URI);
					if($new_URI_site == $site_URI):
						if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
							$queue[] = $new_URI;
						endif;
					else:
						// We do not want to keep track of URIs using blacklisted onion addresses.
						$hash = array(
							':hash' => hash_uri_onion($new_URI_site),
						);
						$query['in']['blacklist']->execute($hash);
						$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
						$query['in']['blacklist']->closeCursor();
						$not_blacklisted = $result[0]->{'COUNT(*)'} == '0';

						// If we do not check for the presence of an existing anchor in our
						// database, the database will grow needlessly when recrawling pages.
						switch($new_URI_site->scheme):
							case 'https':
							case 'http':
							case 'gopher':
								$query['in']['tocrawl']->execute(array(
									':URI' => $new_URI_site,
								));
								$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
								$query['in']['tocrawl']->closeCursor();
								break;
							default:
								$query['in']['nocrawl']->execute(array(
									':URI' => $new_URI_site,
								));
								$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
								$query['in']['nocrawl']->closeCursor();
						endswitch;
						$is_new_site = $result[0]->{'COUNT(*)'} == '0';

						if(isset($new_URI->host) and (substr($new_URI->host, -6) == '.onion') and $not_blacklisted and $is_new_site):
							switch($new_URI_site->scheme):
								case 'https':
								case 'http':
								case 'gopher':
									$query['add']['tocrawl']->execute(array(
										':URI' => $new_URI_site,
										':title' => $new_URI_site,
									));
									$query['add']['tocrawl']->closeCursor();
									break;
								default:
									$query['add']['nocrawl']->execute(array(
										':URI' => $new_URI_site,
									));
									$query['add']['nocrawl']->closeCursor();
							endswitch;
						endif;
					endif;
				endforeach;

				foreach($DOMDocument->getElementsByTagName('a') as $A):
					$href = $A->attributes->getNamedItem('href');
					if($href):
						// If we do not wrap this in a try block, malformed URIs in hyperlinks will kill the spider.
						try {
							// We need to make relative URIs absolute.
							$new_URI = new uri($href->textContent, $base);
							// We also need to strip away the fragment, if any.
							unset($new_URI->fragment);
							$new_URI_site = new uri('/', $new_URI);
							if($new_URI_site == $site_URI):
								if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
									$queue[] = $new_URI;
								endif;
							else:
								// We do not want to keep track of URIs using blacklisted onion addresses.
								$hash = array(
									':hash' => hash_uri_onion($new_URI_site),
								);
								$query['in']['blacklist']->execute($hash);
								$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
								$query['in']['blacklist']->closeCursor();
								$not_blacklisted = $result[0]->{'COUNT(*)'} == '0';

								// If we do not check for the presence of an existing anchor in our
								// database, the database will grow needlessly when recrawling pages.
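								// Only schemes that this spider can actually retrieve (the cURL
								// handle above is limited to HTTPS, HTTP, and Gopher) go into the
								// crawl table; links with any other scheme, such as mailto:, are
								// recorded in the no-crawl table so that they are not re-checked.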
								switch($new_URI_site->scheme):
									case 'https':
									case 'http':
									case 'gopher':
										$query['in']['tocrawl']->execute(array(
											':URI' => $new_URI_site,
										));
										$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
										$query['in']['tocrawl']->closeCursor();
										break;
									default:
										$query['in']['nocrawl']->execute(array(
											':URI' => $new_URI_site,
										));
										$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
										$query['in']['nocrawl']->closeCursor();
								endswitch;
								$is_new_site = $result[0]->{'COUNT(*)'} == '0';

								if(isset($new_URI_site->host) and substr($new_URI_site->host, -6) == '.onion' and $not_blacklisted and $is_new_site):
									switch($new_URI_site->scheme):
										case 'https':
										case 'http':
										case 'gopher':
											if(isset($A->textContent)):
												$prepared_text = $A->textContent;
											else:
												$prepared_text = '';
											endif;
											$query['add']['tocrawl']->execute(array(
												':URI' => $new_URI_site,
												':title' => $prepared_text,
											));
											$query['add']['tocrawl']->closeCursor();
											break;
										default:
											$query['add']['nocrawl']->execute(array(
												':URI' => $new_URI_site,
											));
											$query['add']['nocrawl']->closeCursor();
									endswitch;
								endif;
							endif;
						// Some people just do not know how to form valid URIs ...
						} catch(DomainException $e) {
							if($e->getFile() == uri::FILE):
								if(ONION_SEARCH['MAIN']['DEBUG']):
									echo "Malformed URI found on page <$URI>, ignoring: <$href->textContent>\n";
								endif;
							else:
								throw $e;
							endif;
						}
					endif;
				endforeach;
			endif;

			foreach(gopher($page) as $line):
				$new_URI = $line['uri'];
				$new_URI_site = new uri('/', $new_URI);
				if($new_URI_site == $site_URI):
					switch($line['type']):
						/**
						 * We might as well use Gopher's file type hints to our advantage.
						 * If the server says that it is something that we know we do not want,
						 * we will ignore it. If the server is lying, we will simply crawl the
						 * site in a misguided fashion. If site administrators want their sites
						 * crawled properly, they should configure their servers correctly.
						 *
						 * Please note that Gopher directory responses are accepted by this
						 * spider even if this is an HTTPS/HTTP server and not a Gopher server.
						 * We do not care what protocol was used to retrieve the file, we only
						 * care that the syntax looks like a Gopher directory file.
						 **/
						// Item types followed here: '1' = Gopher menu, '2' = CSO phone book,
						// '7' = search server, 'h' = HTML file.
						case '1':
						case '2':
						case '7':
						case 'h':
							if(!in_array($line['uri'], $queue) and !in_array($line['uri'], $done)):
								$queue[] = $line['uri'];
							endif;
							break;
					endswitch;
				else:
					/**
					 * Gopher directories are only capable of linking to Gopher servers,
					 * Telnet servers, and TN3270 servers. We cannot crawl Telnet or TN3270
					 * servers, so by checking whether this URI uses the 'gopher' scheme, we
					 * effectively check whether we can crawl this server.
					 **/
					if($new_URI_site->scheme == 'gopher'):
						$query['in']['tocrawl']->execute(array(
							':URI' => $new_URI_site,
						));
						$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
						$query['in']['tocrawl']->closeCursor();
					else:
						$query['in']['nocrawl']->execute(array(
							':URI' => $new_URI_site,
						));
						$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
						$query['in']['nocrawl']->closeCursor();
					endif;
					$is_new_site = $result[0]->{'COUNT(*)'} == '0';
					if($is_new_site):
						if($new_URI_site->scheme == 'gopher'):
							// Use the URI itself as a placeholder title until the site is crawled.
							$query['add']['tocrawl']->execute(array(
								':URI' => $new_URI_site,
								':title' => $new_URI_site,
							));
							$query['add']['tocrawl']->closeCursor();
						else:
							$query['add']['nocrawl']->execute(array(
								':URI' => $new_URI_site,
							));
							$query['add']['nocrawl']->closeCursor();
						endif;
					endif;
				endif;
			endforeach;

			// Save information about the index page:
			if($URI->path == '/'):
				$title = $DOMDocument->getElementsByTagName('title');
				// If the page has no <title>, we can just use the URI instead.
				if($title->length):
					$prepared_title = $title->item(0)->textContent;
				else:
					$prepared_title = $site_URI;
				endif;
				$query['update']['title']->execute(array(
					':URI' => $site_URI,
					':title' => $prepared_title,
				));
				$query['update']['title']->closeCursor();
			endif;

			if(ONION_SEARCH['MAIN']['DEBUG']):
				$queue_count = count($queue);
				$done_count = count($done);
				$percent = (int) ($done_count / ($queue_count + $done_count) * 100);
				$seconds_passed = time() - $starttime;
				$seconds_left = ($seconds_passed / $done_count) * $queue_count;
				$time_passed = parse_seconds($seconds_passed);
				$time_left = parse_seconds($seconds_left);
				echo "Queue: $queue_count; Done: $done_count; Estimated percentage complete: %$percent\n";
				echo "Time spent: $time_passed[d] day(s), $time_passed[h] hour(s), $time_passed[m] minute(s), and $time_passed[s] second(s)\n";
				echo "Estimated time remaining: $time_left[d] day(s), $time_left[h] hour(s), $time_left[m] minute(s), and $time_left[s] second(s)\n";
			endif;
		endwhile;

		// Mark the site as freshly crawled so that it is not selected again until the refresh delay passes.
		$query['update']['timestamp']->execute(array(
			':URI' => $site_URI,
			':time' => time(),
		));
		$query['update']['timestamp']->closeCursor();
	endforeach;
endwhile;

/**
 * TO DO LIST:
 *
 * add check to see if crawlable site is present; if not present, remove from active list
 * add check to see if non-crawlable site is present; if not present, remove from active list
 **/
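/**
 * For reference, a rough sketch of the three tables that the prepared
 * statements above expect. The column names follow the queries in this
 * file (the real table names come from ONION_SEARCH['MAIN']); the data
 * types, sizes, and key choices below are assumptions made for
 * illustration, not part of the original project:
 *
 *     CREATE TABLE `tocrawl` (
 *       `uri` VARCHAR(255) NOT NULL PRIMARY KEY,
 *       `title` TEXT NOT NULL,
 *       `timestamp` INT UNSIGNED NOT NULL DEFAULT 0
 *     );
 *
 *     CREATE TABLE `nocrawl` (
 *       `uri` VARCHAR(255) NOT NULL PRIMARY KEY
 *     );
 *
 *     CREATE TABLE `blacklist` (
 *       `hash` CHAR(64) NOT NULL PRIMARY KEY
 *     );
 **/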