OnionSpider/spider.php

<?php
# Onion Search
# Copyright (C) 2015 y.st. <mailto:copyright@y.st>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
/**
* REMEMBER TO ADD AT LEAST ONE URI TO YOUR INITIAL DATABASE.
* IF YOU DO NOT, THE SPIDER WILL NOT GO ANYWHERE.
* Once the first URI has been added and crawled, manually adding URIs
* will no longer be necessary.
**/
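/**
* For example, assuming the titles table keeps the (`uri`, `title`,
* `timestamp`) layout used by the queries below, a single seed row with
* a zero timestamp will be picked up on the first pass:
*
* INSERT INTO `titles` (`uri`, `title`, `timestamp`)
* VALUES ('http://exampleonionaddress.onion/', 'seed', 0);
*
* The table name and URI here are placeholders; the real table name
* comes from ONIONSEARCH_TITLES in const.php.
**/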
use st\y\curl_limit, st\y\remote_files, st\y\robots_txt, st\y\wrapper\xml;
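// Calling spl_autoload_register() with no arguments registers PHP's
// default autoloader, which maps namespaced class names such as
// st\y\remote_files to include-path files such as st/y/remote_files.php.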
spl_autoload_register();
require 'st/y/const.php';
require 'st/y/function/error_handler.php';
require 'st/y/function/hash_uri_onion.php';
require 'st/y/function/merge_uris.php';
require 'localhost/ONION_SEARCH/const.php';
set_error_handler('\\st\\y\\error_handler');
/**
* We need to be sure that the functions and classes used by this
* spider are present. If the library version is too old to contain
* them, we should explain why the needed constructs are missing and
* tell the user where an updated library can be downloaded.
**/
if(version_compare(\st\y\VERSION, '0.0.16.0.0', '<')):
throw new LogicException('Your y.st. PHP library is too old. You need version 0.0.16.0.0 or above.
You may find an update to download at <https://notabug.org/y.st./php>.');
endif;
// The file-downloading object
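// Note: CURLPROXY_SOCKS5_HOSTNAME makes cURL pass hostnames to the Tor
// proxy for resolution instead of resolving them locally, which is what
// lets ".onion" addresses work at all and avoids leaking DNS lookups.
// The curl_limit progress callback presumably aborts transfers larger
// than CURL_LIMIT bytes; cURL cancels a transfer whenever its progress
// callback returns a nonzero value.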
$cURL = new remote_files(array(
CURLOPT_PROXY => 'localhost',
CURLOPT_PROXYPORT => 9050,
CURLOPT_PROXYTYPE => CURLPROXY_SOCKS5_HOSTNAME,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => false,
CURLOPT_NOPROGRESS => false,
CURLOPT_PROGRESSFUNCTION => new curl_limit(\localhost\ONION_SEARCH\CURL_LIMIT),
CURLOPT_USERAGENT => \localhost\ONION_SEARCH\REMOTE_USERAGENT,
CURLOPT_TIMEOUT => \localhost\ONION_SEARCH\REMOTE_TIMEOUT,
));
// The database reading/writing object
$MySQLi = new mysqli(
\localhost\ONION_SEARCH\MYSQLI_HOST,
\localhost\ONION_SEARCH\MYSQLI_USERNAME,
\localhost\ONION_SEARCH\MYSQLI_PASSWORD,
\localhost\ONION_SEARCH\MYSQLI_DATABASE,
\localhost\ONION_SEARCH\MYSQLI_PORT,
\localhost\ONION_SEARCH\MYSQLI_SOCKET
);
/** Enabling autocommit makes each query commit to the database as
* soon as it runs. The advantage for us is that if the spider is
* terminated prematurely, we do not lose any data that we have
* already obtained.
**/
$MySQLi->autocommit(true);
// For now, we only want to deal with HTTP and HTTPS.
$approved_protocols = array(
'http' => true,
'https' => true,
);
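// This list is consulted by the link filter near the bottom of the loop.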
// These three constants are easier to use in strings if they are variables.
$anchors = \localhost\ONION_SEARCH\ONIONSEARCH_ANCHORS;
$blacklist = \localhost\ONION_SEARCH\ONIONSEARCH_BLACKLIST;
$titles = \localhost\ONION_SEARCH\ONIONSEARCH_TITLES;
// Some strange issue prevents us from using this object multiple times.
#$XML = new xml();
// Initialize the "/robots.txt" support variables so we can just assume that they exist later.
$robotstxt_uri = '';
$robotstxt = null;
/**
* When a new URI has been found on several pages in one iteration,
* several copies of that URI will end up in the queue for the next
* iteration. This variable allows us to make note of what the last URI
* we processed was, so we can avoid hitting that URI again right away
* without needing to ask the MySQL database when each and every URI in
* our queue was last used.
**/
$last_uri_processed = '';
// This allows us to loop until we are done, then stop.
$continue_loop = true;
while($continue_loop):
/**
* If we have URIs to process, this will be set to true immediately in
* the inner foreach loop below. If the outer foreach finishes without
* that loop ever running, we must not have had any URIs to process.
**/
$continue_loop = false;
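/**
* Two queues drive the crawl: first, URIs that appear as anchor targets
* but have no row in the titles table yet (pages never fetched), and
* second, pages whose stored timestamp is older than the configured
* refresh delay (pages due for a recrawl).
**/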
foreach(array(
array(
'index' => 'to',
'query' => "SELECT `$anchors`.`to` FROM `$anchors` LEFT JOIN `$titles` ON `$anchors`.`to` = `$titles`.`uri` WHERE `$titles`.`uri` IS NULL ORDER BY `$anchors`.`to`",
),
array(
'index' => 'uri',
'query' => "SELECT `uri` FROM `$titles` WHERE `timestamp` < ".(time()-\localhost\ONION_SEARCH\ONIONSEARCH_REFRESHDELAY)." ORDER BY `uri`",
),
) as $query):
$URIlist = $MySQLi->query($query['query']);
foreach($URIlist as $uri):
// We have URIs to process. Set this to true.
$continue_loop = true;
$URI = $uri[$query['index']];
// If this URI is the same as the last one we worked with, we should skip it.
if($URI == $last_uri_processed):
continue;
// Otherwise, this URI becomes the last URI, for use in the next iteration.
else:
$last_uri_processed = $URI;
endif;
// If the onion address has been blacklisted, we should ignore it.
$hash = \st\y\hash_uri_onion($URI);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$blacklisted = $result->fetch_array()['COUNT(*)'] != '0';
$result->free();
if($blacklisted):
continue;
endif;
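// Because both queries sort their results, URIs from the same site come
// through consecutively, so caching a single parsed "/robots.txt" at a
// time avoids refetching it for every page.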
$new_robotstxt_uri = \st\y\merge_uris($URI, '/robots.txt');
if($robotstxt_uri != $new_robotstxt_uri):
$robotstxt_uri = $new_robotstxt_uri;
echo "Checking \"/robots.txt\" file at \"$robotstxt_uri\"\n";
$page = $cURL->get($robotstxt_uri);
if($page === false):
// Parsing blank data when no file is available means that the code
// below can always assume a robots_txt object exists.
$robotstxt = new robots_txt('');
else:
$robotstxt = new robots_txt($page);
endif;
endif;
// The spider should respect the wishes of the webmaster.
if($robotstxt->disallows(\localhost\ONION_SEARCH\REMOTE_USERAGENT, $URI)):
continue;
endif;
echo "Working with page at URI \"$URI\"\n";
$page = $cURL->get($URI);
// If $values and $index are not reset, they keep the values from the
// last iteration, which pollutes the parse results for the current page.
$values = null;
$index = null;
// We have not yet figured out why a new \xml object is needed for every
// iteration; strangely, reusing one yields empty or null parse results.
$XML = new xml();
$XML->parse_into_struct($page, $values, $index);
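// Honor a <base href/> element when its URI parses with a scheme and a
// path; otherwise relative links resolve against the page's own URI.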
if(isset($index['BASE'][0], $values[$index['BASE'][0]]['attributes']['HREF'])):
$base_verify = parse_url($values[$index['BASE'][0]]['attributes']['HREF']);
if(isset($base_verify['scheme'], $base_verify['path'])):
$base = $values[$index['BASE'][0]]['attributes']['HREF'];
else:
$base = $URI;
endif;
else:
$base = $URI;
endif;
// If the page has no <title/>, we can just use the URI instead.
if(isset($index['TITLE'][0], $values[$index['TITLE'][0]]['value'])):
$page_title = $values[$index['TITLE'][0]]['value'];
else:
$page_title = $URI;
endif;
$prepared_title = $MySQLi->escape_string($page_title);
$prepared_from = $MySQLi->escape_string($URI);
$timestamp = time();
$result = $MySQLi->query("SELECT COUNT(*) FROM `$titles` WHERE `uri` = '$prepared_from'");
$is_known_page = $result->fetch_array()['COUNT(*)'] != '0';
$result->free();
// If this page is in our database, we want to update it.
if($is_known_page):
$MySQLi->query("UPDATE `$titles` SET `title` = '$prepared_title', `timestamp` = '$timestamp' WHERE `uri` = '$prepared_from'");
// If this page is not in our database, we want to add it.
else:
$MySQLi->query("INSERT INTO `$titles` (`uri`,`title`, `timestamp`) VALUES('$prepared_from','$prepared_title','$timestamp')");
endif;
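// (If `uri` is a unique key, a single "INSERT ... ON DUPLICATE KEY
// UPDATE" statement could replace this SELECT-then-UPDATE/INSERT pair.)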
if(isset($index['A'])):
foreach($index['A'] as $A):
if(isset($values[$A]['attributes']['HREF'])):
// This line should both make relative URIs absolute and strip off the fragment, if any.
$new_URI = explode('#', \st\y\merge_uris($base, $values[$A]['attributes']['HREF']))[0];
$new_URI_domain = parse_url($new_URI, PHP_URL_HOST);
$new_URI_protocol = parse_url($new_URI, PHP_URL_SCHEME);
// We do not want to keep track of blacklisted URIs.
$hash = \st\y\hash_uri_onion($new_URI);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$not_blacklisted = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
// If we do not check for the presence of an existing anchor in our
// database, the database will grow needlessly when recrawling pages.
$prepared_to = $MySQLi->escape_string($new_URI);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$anchors` WHERE `to` = '$prepared_to' AND `from` = '$prepared_from'");
$is_new_anchor = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
// Only record links that use an approved protocol and point at an
// onion address.
if($new_URI_domain and $new_URI_protocol
and isset($approved_protocols[strtolower($new_URI_protocol)])
and (substr($new_URI_domain, -6) == '.onion'
or substr($new_URI_domain, -7) == '.onion.')
and $not_blacklisted and $is_new_anchor):
if(isset($values[$A]['value'])):
$prepared_text = $MySQLi->escape_string($values[$A]['value']);
else:
$prepared_text = '';
endif;
$MySQLi->query("INSERT INTO `$anchors` (`to`,`from`,`text`) VALUES('$prepared_to','$prepared_from','$prepared_text')");
endif;
endif;
endforeach;
endif;
endforeach;
$URIlist->free();
endforeach;
endwhile;
$MySQLi->close();
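/**
* The spider exits once both queues are empty. One way to keep the
* index fresh is a periodic cron job (a hypothetical crontab entry;
* adjust the path to match your installation):
*
* 0 3 * * * php /path/to/OnionSpider/spider.php
**/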