#!/usr/bin/php
<?php
/**
 * Onion Search
 * Copyright (C) 2015-2016 y.st. <mailto:copyright@y.st>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
**/

/**
 * REMEMBER TO ADD AT LEAST ONE URI TO YOUR INITIAL DATABASE.
 * IF YOU DO NOT, THE SPIDER WILL NOT GO ANYWHERE.
 * After the first crawl, or rather, after the first URI you add,
 * manually adding URIs will no longer be necessary.
**/
use st\y\curl_limit, st\y\remote_files, st\y\robots_txt, st\y\uri;
use function st\y\gopher, st\y\hash_uri_onion, st\y\parse_seconds;
use const st\y\CURLOPT_TOR, st\y\CURLOPT_TLS_ENCRYPTONLY, st\y\VERSION, localhost\ONION_SEARCH;
spl_autoload_register();
require 'st/y/const/CURLOPT_TOR.php';
require 'st/y/const/CURLOPT_TLS_ENCRYPTONLY.php';
require 'st/y/const/VERSION.php';
require 'st/y/function/error_handler.php';
require 'st/y/function/gopher.php';
require 'st/y/function/hash_uri_onion.php';
require 'st/y/function/parse_seconds.php';
require 'localhost/const/ONION_SEARCH.php';
set_error_handler('\\st\\y\\error_handler');
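
/**
 * The ONION_SEARCH constant is defined in localhost/const/ONION_SEARCH.php
 * and is site-specific. Based on the keys used in this script, that file is
 * expected to define something roughly like the following (illustrative
 * values only; the real file may differ):
 *
 *   namespace localhost;
 *   const ONION_SEARCH = array(
 *       'DBO' => array('mysql:host=localhost;dbname=spider', 'user', 'password'),
 *       'MAIN' => array(
 *           'TOCRAWL' => 'tocrawl',      // table of crawlable site roots
 *           'NOCRAWL' => 'nocrawl',      // table of uncrawlable site roots
 *           'BLACKLIST' => 'blacklist',  // table of blacklisted onion hashes
 *           'REFRESHDELAY' => 604800,    // seconds before a site is recrawled
 *           'DEBUG' => true,             // print progress information
 *       ),
 *       'REMOTE' => array(
 *           'USERAGENT' => 'example-spider/0.1',
 *           'TIMEOUT' => 300,            // cURL timeout in seconds
 *       ),
 *       'LIMIT' => array(
 *           'LIMIT' => 10485760,         // download cap passed to curl_limit
 *           'DEBUG' => false,
 *       ),
 *   );
**/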

/**
 * We need to be sure that the functions and classes used by this
 * spider are present. If the library version is too old to contain
 * these, we need to alert the user as to why the needed constructs are
 * missing and alert him/her as to where the needed library can be
 * downloaded.
**/
if(version_compare(VERSION['include.d'], '0.0.1.1', '<')):
    throw new LogicException('Your include.d is too old. You need version 0.0.1.1 or above.
You may find an update to download at <https://notabug.org/y.st./include.d>.');
endif;

// The file-downloading object
$cURL = new remote_files(array(
    CURLOPT_FOLLOWLOCATION => false,
    /**
     * The PHP documentation at
     * <https://secure.php.net/manual/en/function.curl-setopt.php> says
     * that the \CURLOPT_NOPROGRESS setting should only be set to false for
     * debugging. *THIS IS A LIE.* When this is set to false, the function
     * specified by the \CURLOPT_PROGRESSFUNCTION setting is actually
     * called. However, if the \CURLOPT_NOPROGRESS setting is *not* set to
     * false, the function specified by the \CURLOPT_PROGRESSFUNCTION
     * setting is *not* called, meaning that we have no way to terminate a
     * runaway download.
    **/
    CURLOPT_NOPROGRESS => false,
    CURLOPT_PROGRESSFUNCTION => new curl_limit(ONION_SEARCH['LIMIT']['LIMIT'], ONION_SEARCH['LIMIT']['DEBUG']),
    CURLOPT_USERAGENT => ONION_SEARCH['REMOTE']['USERAGENT'],
    CURLOPT_TIMEOUT => ONION_SEARCH['REMOTE']['TIMEOUT'],
    CURLOPT_PROTOCOLS => CURLPROTO_HTTPS|CURLPROTO_HTTP|CURLPROTO_GOPHER,
));

// We need to use Tor to retrieve files from onion space.
$cURL->setopt_array(CURLOPT_TOR);

// Use TLS for encryption only, not for identity verification.
$cURL->setopt_array(CURLOPT_TLS_ENCRYPTONLY);
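
/**
 * Note: CURLOPT_TOR and CURLOPT_TLS_ENCRYPTONLY are option arrays shipped
 * with include.d (required above from st/y/const/), not stock cURL
 * constants. Presumably the former points cURL at a local Tor SOCKS proxy
 * so that .onion hostnames are resolved through Tor rather than the system
 * resolver, and the latter disables certificate verification, since onion
 * services rarely present certificates matching their hostnames. See the
 * required files for the actual values.
**/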

// These four constants are easier to use in strings if they are variables.
$blacklist = ONION_SEARCH['MAIN']['BLACKLIST'];
$nocrawl = ONION_SEARCH['MAIN']['NOCRAWL'];
$tocrawl = ONION_SEARCH['MAIN']['TOCRAWL'];
$useragent = ONION_SEARCH['REMOTE']['USERAGENT'];

// The database reading/writing object
$PDO = new PDO(...ONION_SEARCH['DBO']);

/**
 * The documentation insists that using \PDO::prepare() is better than
 * using \PDO::query(). I suppose that we can comply. Honestly, the
 * documentation is probably right; I am just frustrated that I had to
 * rewrite my code when moving away from the \mysqli class. The \mysqli
 * class also has this functionality, but the documentation does not seem
 * to dissuade people from using regular-style queries.
**/
$query = array(
    'in' => array(
        'tocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = :URI"),
        'nocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = :URI"),
        'blacklist' => $PDO->prepare("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = :hash"),
    ),
    'add' => array(
        'tocrawl' => $PDO->prepare("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES (:URI,:title,'0')"),
        'nocrawl' => $PDO->prepare("INSERT INTO `$nocrawl` (`uri`) VALUES (:URI)"),
    ),
    'list' => array(
        'crawl' => $PDO->prepare("SELECT `uri` FROM `$tocrawl` WHERE `timestamp` < :time ORDER BY `timestamp` ASC"),
    ),
    'remove' => array(
        'tocrawl' => $PDO->prepare("DELETE FROM `$tocrawl` WHERE `uri` = :URI"),
    ),
    'update' => array(
        'title' => $PDO->prepare("UPDATE `$tocrawl` SET `title` = :title WHERE `uri` = :URI"),
        'timestamp' => $PDO->prepare("UPDATE `$tocrawl` SET `timestamp` = :time WHERE `uri` = :URI"),
    ),
);
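
/**
 * The prepared statements above imply roughly the following schema. This is
 * a sketch only: the real table names come from ONION_SEARCH['MAIN'], and
 * the column types in an actual installation may differ.
 *
 *   CREATE TABLE tocrawl (uri VARCHAR(255) PRIMARY KEY, title TEXT, timestamp INT);
 *   CREATE TABLE nocrawl (uri VARCHAR(255) PRIMARY KEY);
 *   CREATE TABLE blacklist (hash CHAR(64) PRIMARY KEY);
 *
 * `tocrawl` holds crawlable site roots plus a display title and the Unix
 * time of the last crawl; `nocrawl` records site roots that were found but
 * cannot be crawled; `blacklist` holds hashes of banned onion addresses.
**/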

// Set up the document parser.
$DOMDocument = new DOMDocument();

/**
 * Prevent malformed Web pages from killing the spider.
 * We will need to clear the errors as they come up to prevent them
 * from filling the buffer though.
**/
$DOMDocument->recover = true;
libxml_use_internal_errors(true);

// This allows us to loop until we are done, then stop.
$continue_loop = true;

while($continue_loop):
    /**
     * If we have URIs to process, this will be set to true immediately in
     * the second nested loop. If this loop finishes without executing the
     * contained loop, it means that we must not have had any URIs to
     * process.
    **/
    $continue_loop = false;
    /**
     * For debugging purposes, it can be useful to start this script on a
     * particular URI. When this script finds URIs on a Web page, it checks
     * them for validity and properly discards any that are invalid. However,
     * when given a URI via the command line, it assumes that the URI is
     * valid and will abort otherwise. This is a feature, not a bug. If you
     * are trying to debug a particular page and you accidentally specify a
     * bad URI, continuing despite the mistake is the incorrect behavior.
     *
     * Furthermore, it is assumed that the URI specified this way is part
     * of a site that already exists in the database. This tool is not a
     * way to add URIs; it is a way to test bugs in the spider. If a
     * particular page crashed the spider, this feature allows you to start
     * with the crashing page so that you don't have to wait for the spider
     * to find that page again in order to test a fix in the spider code.
    **/
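    // Example invocation (script name and URI are hypothetical):
    //     $ ./onion-search.php 'http://exampleonionaddress.onion/page/that/crashed.html'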
    if(isset($argv[1])):
        $DEBUG_URI = new uri($argv[1]);
        $URIlist = array(
            (object) array(
                'uri' => new uri('/', $DEBUG_URI),
            ),
        );
        // We don't want to run this code a second time, so let's remove the trigger.
        unset($argv[1]);
    else:
        $query['list']['crawl']->execute(array(
            ':time' => time()-ONION_SEARCH['MAIN']['REFRESHDELAY'],
        ));
        $URIlist = $query['list']['crawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
        $query['list']['crawl']->closeCursor();
    endif;
    foreach($URIlist as $uri):
        /**
         * This time stamp is used to estimate how much time is remaining in
         * the processing of a given site. For a more accurate estimation, we
         * should probably move this closer to the loop below, but I would
         * rather have the estimate be a little high than have it be a little
         * low.
        **/
        $starttime = time();
        $site_URI = new uri($uri->uri);
        // We have URIs to process. Set this to true.
        $continue_loop = true;
        // If the onion address has been blacklisted, we should ignore it.
        $hash = array(
            ':hash' => hash_uri_onion($site_URI),
        );
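        // hash_uri_onion() comes from include.d; it is assumed here to reduce
        // the URI to a hash of its .onion hostname, which is what the
        // blacklist table stores rather than the addresses themselves.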
        $query['in']['blacklist']->execute($hash);
        $result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
        $query['in']['blacklist']->closeCursor();
        $blacklisted = $result[0]->{'COUNT(*)'} != '0';
        // If the onion address has been blacklisted, we should remove it from the database.
        if($blacklisted):
            // The removal query is keyed on the stored site URI, not on the hash.
            $query['remove']['tocrawl']->execute(array(
                ':URI' => $site_URI,
            ));
            $query['remove']['tocrawl']->closeCursor();
            continue;
        endif;
        // Different types of URI need to be handled differently.
        switch($site_URI->scheme):
            case 'https':
            case 'http':
                $robotstxt_URI = new uri('/robots.txt', $site_URI);
                break;
            case 'gopher':
                $robotstxt_URI = new uri('/0/robots.txt', $site_URI);
                break;
            default:
                throw new RuntimeException("$useragent does not know how to crawl \"$site_URI->scheme\" URIs, URI \"$site_URI\" found in table \"$tocrawl\".");
        endswitch;
        // Information about the current site will be held in a pair of arrays.
        $done = array();
        $queue = array($site_URI);
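        // $queue holds URIs on this site that still need to be fetched;
        // $done holds URIs that have already been fetched. The in_array()
        // checks below consult both so that no page is queued twice.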
        /**
         * If we are debugging with a particular URI, we should make sure that
         * that URI is in the queue from the start.
        **/
        if(isset($DEBUG_URI)):
            $queue[] = $DEBUG_URI;
            // We don't want to run this code a second time, so let's remove the trigger.
            unset($DEBUG_URI);
        endif;
        // The spider should respect the wishes of the webmaster.
        if(ONION_SEARCH['MAIN']['DEBUG']):
            echo "Checking \"robots.txt\" file at <$robotstxt_URI>\n";
        endif;
        $page = $cURL->get($robotstxt_URI);
        if($page === false):
            // Using blank data when no data is available allows us to assume that
            // we actually have data.
            $robotstxt = new robots_txt('');
        else:
            $robotstxt = new robots_txt($page);
        endif;
        foreach($robotstxt->sitemaps as $sitemap):
            $queue[] = new uri($sitemap, $site_URI);
        endforeach;
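        /**
         * The robots_txt class comes from include.d. As used here, it is
         * expected to expose "Sitemap:" lines through ->sitemaps and to
         * answer "Disallow:" rules through ->disallows(). A typical file
         * that this spider would honour looks something like:
         *
         *   User-agent: *
         *   Disallow: /private/
         *   Sitemap: http://exampleonionaddress.onion/sitemap.xml
        **/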
        while(null !== ($URI = array_shift($queue))):
            $done[] = $URI;
            // The spider should respect the wishes of the webmaster.
            if($robotstxt->disallows(ONION_SEARCH['REMOTE']['USERAGENT'], $URI)):
                continue;
            endif;
            // Retrieve the page.
            if(ONION_SEARCH['MAIN']['DEBUG']):
                echo "Working with page at URI <$URI>\n";
            endif;
            $page = $cURL->get($URI);
            // If the page is empty, there is no point in parsing it.
            if(!empty($page)):
                $DOMDocument->loadXML($page);
                // Those lazy fools wrote bad markup and put errors in our buffer.
                // We should clear them.
                libxml_clear_errors();
                /**
                 * If we set the document URI, the base URI resolver is able to handle
                 * <base/> tags that specify a relative base.
                **/
                $DOMDocument->documentURI = $URI;
                /**
                 * This is just a little witchery to find the base URI only once
                 * instead of having to find it again for each <a/> tag. If we used the
                 * "baseURI" property of every <a/> tag, we would need to instantiate
                 * the \st\y\uri class for each hyperlink if a <base/> tag were
                 * present, as well as check to see if a "baseURI" property were set
                 * for each <a/> tag. It is much more efficient to check once per page
                 * and perform the needed setup once per page.
                **/
                $base_tag = $DOMDocument->getElementsByTagName('base');
                if($base_tag->length and ($baseURI = $base_tag->item(0)->baseURI)):
                    $base = new uri($baseURI);
                else:
                    $base = $URI;
                endif;
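                // For example, on a page at <http://exampleonionaddress.onion/dir/page.html>
                // with <base href="/other/"/>, the relative href "x.html" resolves to
                // <http://exampleonionaddress.onion/other/x.html>; without a <base/> tag,
                // it would resolve against the page URI itself instead.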
                foreach($DOMDocument->getElementsByTagName('loc') as $LOC):
                    // We need to make relative URIs absolute.
                    $new_URI = new uri($LOC->textContent, $base);
                    // We also need to strip away the fragment, if any.
                    unset($new_URI->fragment);
                    $new_URI_site = new uri('/', $new_URI);
                    if($new_URI_site == $site_URI):
                        if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
                            $queue[] = $new_URI;
                        endif;
                    else:
                        // We do not want to keep track of URIs using blacklisted onion addresses.
                        $hash = array(
                            ':hash' => hash_uri_onion($new_URI_site),
                        );
                        $query['in']['blacklist']->execute($hash);
                        $result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                        $query['in']['blacklist']->closeCursor();
                        $not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
                        // If we do not check for the presence of an existing anchor in our
                        // database, the database will grow needlessly when recrawling pages.
                        switch($new_URI_site->scheme):
                            case 'https':
                            case 'http':
                            case 'gopher':
                                $query['in']['tocrawl']->execute(array(
                                    ':URI' => $new_URI_site,
                                ));
                                $result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                $query['in']['tocrawl']->closeCursor();
                                break;
                            default:
                                $query['in']['nocrawl']->execute(array(
                                    ':URI' => $new_URI_site,
                                ));
                                $result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                $query['in']['nocrawl']->closeCursor();
                        endswitch;
                        $is_new_site = $result[0]->{'COUNT(*)'} == '0';
                        if(isset($new_URI->host) and (substr($new_URI->host, -6) == '.onion') and
                        $not_blacklisted and $is_new_site):
                            switch($new_URI_site->scheme):
                                case 'https':
                                case 'http':
                                case 'gopher':
                                    $query['add']['tocrawl']->execute(array(
                                        ':URI' => $new_URI_site,
                                        ':title' => $new_URI_site,
                                    ));
                                    $query['add']['tocrawl']->closeCursor();
                                    break;
                                default:
                                    $query['add']['nocrawl']->execute(array(
                                        ':URI' => $new_URI_site,
                                    ));
                                    $query['add']['nocrawl']->closeCursor();
                            endswitch;
                        endif;
                    endif;
                endforeach;
                foreach($DOMDocument->getElementsByTagName('a') as $A):
                    $href = $A->attributes->getNamedItem('href');
                    if($href):
                        // If we do not wrap this in a try block, malformed URIs in hyperlinks will kill the spider.
                        try {
                            // We need to make relative URIs absolute.
                            $new_URI = new uri($href->textContent, $base);
                            // We also need to strip away the fragment, if any.
                            unset($new_URI->fragment);
                            $new_URI_site = new uri('/', $new_URI);
                            if($new_URI_site == $site_URI):
                                if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
                                    $queue[] = $new_URI;
                                endif;
                            else:
                                // We do not want to keep track of URIs using blacklisted onion addresses.
                                $hash = array(
                                    ':hash' => hash_uri_onion($new_URI_site),
                                );
                                $query['in']['blacklist']->execute($hash);
                                $result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                $query['in']['blacklist']->closeCursor();
                                $not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
                                // If we do not check for the presence of an existing anchor in our
                                // database, the database will grow needlessly when recrawling pages.
                                switch($new_URI_site->scheme):
                                    case 'https':
                                    case 'http':
                                    case 'gopher':
                                        $query['in']['tocrawl']->execute(array(
                                            ':URI' => $new_URI_site,
                                        ));
                                        $result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                        $query['in']['tocrawl']->closeCursor();
                                        break;
                                    default:
                                        $query['in']['nocrawl']->execute(array(
                                            ':URI' => $new_URI_site,
                                        ));
                                        $result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                        $query['in']['nocrawl']->closeCursor();
                                endswitch;
                                $is_new_site = $result[0]->{'COUNT(*)'} == '0';
                                if(isset($new_URI_site->host) and substr($new_URI_site->host, -6) == '.onion' and
                                $not_blacklisted and $is_new_site):
                                    switch($new_URI_site->scheme):
                                        case 'https':
                                        case 'http':
                                        case 'gopher':
                                            if(isset($A->textContent)):
                                                $prepared_text = $A->textContent;
                                            else:
                                                $prepared_text = '';
                                            endif;
                                            $query['add']['tocrawl']->execute(array(
                                                ':URI' => $new_URI_site,
                                                ':title' => $prepared_text,
                                            ));
                                            $query['add']['tocrawl']->closeCursor();
                                            break;
                                        default:
                                            $query['add']['nocrawl']->execute(array(
                                                ':URI' => $new_URI_site,
                                            ));
                                            $query['add']['nocrawl']->closeCursor();
                                    endswitch;
                                endif;
                            endif;
                            // Some people just do not know how to form valid URIs ...
                        } catch(DomainException $e) {
                            if($e->getFile() == uri::FILE):
                                if(ONION_SEARCH['MAIN']['DEBUG']):
                                    echo "Malformed URI found on page <$URI>, ignoring: <$href->textContent>\n";
                                endif;
                            else:
                                throw $e;
                            endif;
                        }
                    endif;
                endforeach;
            endif;
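            /**
             * gopher() comes from include.d. As used below, it is assumed to
             * parse the page as a Gopher menu and yield one entry per menu
             * line, each with at least a 'type' key (the one-character Gopher
             * item type) and a 'uri' key holding the target URI. A raw menu
             * line is tab-separated in the usual RFC 1436 form, e.g.:
             *
             *   1Example section<TAB>/section<TAB>exampleonionaddress.onion<TAB>70
            **/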
            foreach(gopher($page) as $line):
                $new_URI = $line['uri'];
                $new_URI_site = new uri('/', $new_URI);
                if($new_URI_site == $site_URI):
                    switch($line['type']):
                        /**
                         * We might as well use Gopher's file type hints to our advantage.
                         * If the server says that it is something that we know we do not want,
                         * we will ignore it. If the server is lying, we simply will crawl the
                         * site in a misguided fashion. If the site administrator wants their
                         * site crawled properly, they should configure their server correctly.
                         *
                         * Please note that Gopher directory responses are accepted by this
                         * spider even if this is an HTTPS/HTTP server and not a Gopher server.
                         * We do not care what protocol was used to retrieve the file, we only
                         * care that the syntax looks like a gopher directory file.
                        **/
                        case '1':
                        case '2':
                        case '7':
                        case 'h':
                            // These types can be fetched and parsed for further links, so
                            // they are worth queueing; any other type is simply ignored.
                            if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
                                $queue[] = $new_URI;
                            endif;
                            break;
                    endswitch;
                else:
                    /**
                     * Gopher directories are only capable of linking to Gopher servers,
                     * Telnet servers, and TN3270 servers. We cannot crawl Telnet or TN3270
                     * servers, so by checking whether this URI uses the 'gopher' scheme, we
                     * effectively check whether we can crawl this server.
                    **/
                    if($new_URI_site->scheme == 'gopher'):
                        $query['in']['tocrawl']->execute(array(
                            ':URI' => $new_URI_site,
                        ));
                        $result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                        $query['in']['tocrawl']->closeCursor();
                    else:
                        $query['in']['nocrawl']->execute(array(
                            ':URI' => $new_URI_site,
                        ));
                        $result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                        $query['in']['nocrawl']->closeCursor();
                    endif;
                    $is_new_site = $result[0]->{'COUNT(*)'} == '0';
                    if($is_new_site):
                        if($new_URI_site->scheme == 'gopher'):
                            // Use the site URI as a placeholder title; the real title is
                            // filled in when the site's own index page is crawled.
                            $query['add']['tocrawl']->execute(array(
                                ':URI' => $new_URI_site,
                                ':title' => $new_URI_site,
                            ));
                            $query['add']['tocrawl']->closeCursor();
                        else:
                            $query['add']['nocrawl']->execute(array(
                                ':URI' => $new_URI_site,
                            ));
                            $query['add']['nocrawl']->closeCursor();
                        endif;
                    endif;
                endif;
            endforeach;
            // Save information about the index page:
            if($URI->path == '/'):
                $title = $DOMDocument->getElementsByTagName('title');
                // If the page has no <title/>, we can just use the URI instead.
                if($title->length):
                    $prepared_title = $title->item(0)->textContent;
                else:
                    $prepared_title = $site_URI;
                endif;
                $query['update']['title']->execute(array(
                    ':URI' => $site_URI,
                    ':title' => $prepared_title,
                ));
                $query['update']['title']->closeCursor();
            endif;
            if(ONION_SEARCH['MAIN']['DEBUG']):
                $queue_count = count($queue);
                $done_count = count($done);
                $percent = (int) ($done_count / ($queue_count + $done_count) * 100);
                $seconds_passed = time() - $starttime;
                $seconds_left = ($seconds_passed / $done_count) * $queue_count;
                $time_passed = parse_seconds($seconds_passed);
                $time_left = parse_seconds($seconds_left);
                echo "Queue: $queue_count; Done: $done_count; Estimated percentage complete: $percent%\n";
                echo "Time spent: $time_passed[d] day(s), $time_passed[h] hour(s), $time_passed[m] minute(s), and $time_passed[s] second(s)\n";
                echo "Estimated time remaining: $time_left[d] day(s), $time_left[h] hour(s), $time_left[m] minute(s), and $time_left[s] second(s)\n";
            endif;
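            // The estimate simply scales elapsed time by the queue/done ratio:
            // e.g. 30 pages done in 60 seconds with 90 pages still queued gives
            // (60 / 30) * 90 = 180 seconds remaining and 30 / (90 + 30) = 25% complete.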
        endwhile;
        // Record when this site was crawled so that it is not recrawled again
        // before REFRESHDELAY seconds have passed.
        $query['update']['timestamp']->execute(array(
            ':URI' => $site_URI,
            ':time' => time(),
        ));
        $query['update']['timestamp']->closeCursor();
    endforeach;
endwhile;

/**
 * TO DO LIST:
 *
 * add check to see if crawlable site is present; if not present, remove from active list
 * add check to see if non-crawlable site is present; if not present, remove from active list
**/