OnionSpider/spider.php

<?php
# Onion Search
# Copyright (C) 2015 y.st. <mailto:copyright@y.st>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
/**
* REMEMBER TO ADD AT LEAST ONE URI TO YOUR INITIAL DATABASE.
* IF YOU DO NOT, THE SPIDER WILL NOT GO ANYWHERE.
* Once the first URI has been added and crawled, manually adding URIs
* will no longer be necessary.
**/
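/**
* For example, assuming the titles table keeps the (`uri`, `title`,
* `timestamp`) layout used by the queries below, a single seed row with
* a zero timestamp will be picked up on the first pass:
*
* INSERT INTO `titles` (`uri`, `title`, `timestamp`)
* VALUES ('http://exampleonionaddress.onion/', 'seed', 0);
*
* The table name and URI here are placeholders; the real table name
* comes from ONIONSEARCH_TITLES in const.php.
**/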
use st\y\curl_limit, st\y\remote_files, st\y\robots_txt, st\y\wrapper\xml;
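// Calling spl_autoload_register() with no arguments registers PHP's
// default autoloader, which maps namespaced class names such as
// st\y\remote_files to include-path files such as st/y/remote_files.php.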
spl_autoload_register();
require 'st/y/const.php';
require 'st/y/function/error_handler.php';
require 'st/y/function/hash_uri_onion.php';
require 'st/y/function/merge_uris.php';
require 'localhost/ONION_SEARCH/const.php';
set_error_handler('\\st\\y\\error_handler');
/**
* We need to be sure that the functions and classes used by this
* spider are present. If the library version is too old to contain
* them, we should explain why the needed constructs are missing and
* tell the user where an updated library can be downloaded.
**/
if(version_compare(\st\y\VERSION, '0.0.16.0.0', '<')):
throw new LogicException('Your y.st. PHP library is too old. You need version 0.0.16.0.0 or above.
You may find an update to download at <https://notabug.org/y.st./php>.');
endif;
// The file-downloading object
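// Note: CURLPROXY_SOCKS5_HOSTNAME makes cURL pass hostnames to the Tor
// proxy for resolution instead of resolving them locally, which is what
// lets ".onion" addresses work at all and avoids leaking DNS lookups.
// The curl_limit progress callback presumably aborts transfers larger
// than CURL_LIMIT bytes; cURL cancels a transfer whenever its progress
// callback returns a nonzero value.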
$cURL = new remote_files(array(
CURLOPT_PROXY => 'localhost',
CURLOPT_PROXYPORT => 9050,
CURLOPT_PROXYTYPE => CURLPROXY_SOCKS5_HOSTNAME,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => false,
CURLOPT_NOPROGRESS => false,
CURLOPT_PROGRESSFUNCTION => new curl_limit(\localhost\ONION_SEARCH\CURL_LIMIT),
CURLOPT_USERAGENT => \localhost\ONION_SEARCH\REMOTE_USERAGENT,
CURLOPT_TIMEOUT => \localhost\ONION_SEARCH\REMOTE_TIMEOUT,
));
// The database reading/writing object
$MySQLi = new mysqli(
\localhost\ONION_SEARCH\MYSQLI_HOST,
\localhost\ONION_SEARCH\MYSQLI_USERNAME,
\localhost\ONION_SEARCH\MYSQLI_PASSWORD,
\localhost\ONION_SEARCH\MYSQLI_DATABASE,
\localhost\ONION_SEARCH\MYSQLI_PORT,
\localhost\ONION_SEARCH\MYSQLI_SOCKET
);
/** Enabling autocommit makes each query commit to the database as
* soon as it runs. The advantage for us is that if the spider is
* terminated prematurely, we do not lose any data that we have
* already obtained.
**/
$MySQLi->autocommit(true);
// For now, we only want to deal with HTTP and HTTPS.
$approved_protocols = array(
'http' => true,
'https' => true,
);
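// This list is consulted by the link filter near the bottom of the loop.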
// These three constants are easier to use in strings if they are variables.
$anchors = \localhost\ONION_SEARCH\ONIONSEARCH_ANCHORS;
$blacklist = \localhost\ONION_SEARCH\ONIONSEARCH_BLACKLIST;
$titles = \localhost\ONION_SEARCH\ONIONSEARCH_TITLES;
// Some strange issue prevents us from using this object multiple times.
#$XML = new xml();
// Initialize the "/robots.txt" support variables so we can just assume that they exist later.
$robotstxt_uri = '';
$robotstxt = null;
/**
* When a new URI has been found on several pages in one iteration,
* several copies of that URI will end up in the queue for the next
* iteration. This variable allows us to make note of what the last URI
* we processed was, so we can avoid hitting that URI again right away
* without needing to ask the MySQL database when each and every URI in
* our queue was last used.
**/
$last_uri_processed = '';
// This allows us to loop until we are done, then stop.
$continue_loop = true;
while($continue_loop):
/**
* If we have URIs to process, this will be set to true immediately in
* the inner foreach loop below. If the outer foreach finishes without
* that loop ever running, we must not have had any URIs to process.
**/
$continue_loop = false;
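/**
* Two queues drive the crawl: first, URIs that appear as anchor targets
* but have no row in the titles table yet (pages never fetched), and
* second, pages whose stored timestamp is older than the configured
* refresh delay (pages due for a recrawl).
**/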
foreach(array(
array(
'index' => 'to',
'query' => "SELECT `$anchors`.`to` FROM `$anchors` LEFT JOIN `$titles` ON `$anchors`.`to` = `$titles`.`uri` WHERE `$titles`.`uri` IS NULL ORDER BY `$anchors`.`to`",
),
array(
'index' => 'uri',
'query' => "SELECT `uri` FROM `$titles` WHERE `timestamp` < ".(time()-\localhost\ONION_SEARCH\ONIONSEARCH_REFRESHDELAY)." ORDER BY `uri`",
),
) as $query):
$URIlist = $MySQLi->query($query['query']);
foreach($URIlist as $uri):
// We have URIs to process. Set this to true.
$continue_loop = true;
$URI = $uri[$query['index']];
// If this URI is the same as the last one we worked with, we should skip it.
if($URI == $last_uri_processed):
continue;
// Otherwise, this URI becomes the last URI, for use in the next iteration.
else:
$last_uri_processed = $URI;
endif;
// If the onion address has been blacklisted, we should ignore it.
$hash = \st\y\hash_uri_onion($URI);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$blacklisted = $result->fetch_array()['COUNT(*)'] != '0';
$result->free();
if($blacklisted):
continue;
endif;
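// Because both queries sort their results, URIs from the same site come
// through consecutively, so caching a single parsed "/robots.txt" at a
// time avoids refetching it for every page.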
$new_robotstxt_uri = \st\y\merge_uris($URI, '/robots.txt');
if($robotstxt_uri != $new_robotstxt_uri):
$robotstxt_uri = $new_robotstxt_uri;
echo "Checking \"/robots.txt\" file at \"$robotstxt_uri\"\n";
$page = $cURL->get($robotstxt_uri);
if($page === false):
// Parsing blank data when no file is available means that the code
// below can always assume a robots_txt object exists.
$robotstxt = new robots_txt('');
else:
$robotstxt = new robots_txt($page);
endif;
endif;
// The spider should respect the wishes of the webmaster.
if($robotstxt->disallows(\localhost\ONION_SEARCH\REMOTE_USERAGENT, $URI)):
continue;
endif;
echo "Working with page at URI \"$URI\"\n";
$page = $cURL->get($URI);
// If $values and $index are not reset, they keep the values from the
// last iteration, which pollutes the parse results for the current page.
$values = null;
$index = null;
// We have not yet figured out why a new \xml object is needed for every
// iteration; strangely, reusing one yields empty or null parse results.
$XML = new xml();
$XML->parse_into_struct($page, $values, $index);
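// Honor a <base href/> element when its URI parses with a scheme and a
// path; otherwise relative links resolve against the page's own URI.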
if(isset($index['BASE'][0], $values[$index['BASE'][0]]['attributes']['HREF'])):
$base_verify = parse_url($values[$index['BASE'][0]]['attributes']['HREF']);
if(isset($base_verify['scheme'], $base_verify['path'])):
$base = $values[$index['BASE'][0]]['attributes']['HREF'];
else:
$base = $URI;
endif;
else:
$base = $URI;
endif;
// If the page has no <title/>, we can just use the URI instead.
if(isset($index['TITLE'][0], $values[$index['TITLE'][0]]['value'])):
$page_title = $values[$index['TITLE'][0]]['value'];
else:
$page_title = $URI;
endif;
$prepared_title = $MySQLi->escape_string($page_title);
$prepared_from = $MySQLi->escape_string($URI);
$timestamp = time();
$result = $MySQLi->query("SELECT COUNT(*) FROM `$titles` WHERE `uri` = '$prepared_from'");
$is_known_page = $result->fetch_array()['COUNT(*)'] != '0';
$result->free();
// If this page is in our database, we want to update it.
if($is_known_page):
$MySQLi->query("UPDATE `$titles` SET `title` = '$prepared_title', `timestamp` = '$timestamp' WHERE `uri` = '$prepared_from'");
// If this page is not in our database, we want to add it.
else:
$MySQLi->query("INSERT INTO `$titles` (`uri`,`title`, `timestamp`) VALUES('$prepared_from','$prepared_title','$timestamp')");
endif;
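// (If `uri` is a unique key, a single "INSERT ... ON DUPLICATE KEY
// UPDATE" statement could replace this SELECT-then-UPDATE/INSERT pair.)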
if(isset($index['A'])):
foreach($index['A'] as $A):
if(isset($values[$A]['attributes']['HREF'])):
// This line should both make relative URIs absolute and strip off the fragment, if any.
$new_URI = explode('#', \st\y\merge_uris($base, $values[$A]['attributes']['HREF']))[0];
$new_URI_domain = parse_url($new_URI, PHP_URL_HOST);
$new_URI_protocol = parse_url($new_URI, PHP_URL_SCHEME);
// We do not want to keep track of blacklisted URIs.
$hash = \st\y\hash_uri_onion($new_URI);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$not_blacklisted = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
// If we do not check for the presence of an existing anchor in our
// database, the database will grow needlessly when recrawling pages.
$prepared_to = $MySQLi->escape_string($new_URI);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$anchors` WHERE `to` = '$prepared_to' AND `from` = '$prepared_from'");
$is_new_anchor = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
// Only record links that use an approved protocol and point at an
// onion address.
if($new_URI_domain and $new_URI_protocol
and isset($approved_protocols[strtolower($new_URI_protocol)])
and (substr($new_URI_domain, -6) == '.onion'
or substr($new_URI_domain, -7) == '.onion.')
and $not_blacklisted and $is_new_anchor):
if(isset($values[$A]['value'])):
$prepared_text = $MySQLi->escape_string($values[$A]['value']);
else:
$prepared_text = '';
endif;
$MySQLi->query("INSERT INTO `$anchors` (`to`,`from`,`text`) VALUES('$prepared_to','$prepared_from','$prepared_text')");
endif;
endif;
endforeach;
endif;
endforeach;
$URIlist->free();
endforeach;
endwhile;
$MySQLi->close();
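/**
* The spider exits once both queues are empty. One way to keep the
* index fresh is a periodic cron job (a hypothetical crontab entry;
* adjust the path to match your installation):
*
* 0 3 * * * php /path/to/OnionSpider/spider.php
**/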