#!/usr/bin/php
<?php
/**
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
**/
/**
* REMEMBER TO ADD AT LEAST ONE URI TO YOUR INITIAL DATABASE.
* IF YOU DO NOT, THE SPIDER WILL NOT GO ANYWHERE.
* After the first crawl, or rather, after the first URI you add,
* manually adding URIs will no longer be necessary.
**/
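/**
* As a rough sketch only (assuming the crawl table is the one named by
* ONION_SEARCH['MAIN']['TOCRAWL'] and uses the `uri`, `title`, and
* `timestamp` columns referenced below), a seed row could be added with
* something like:
*
* INSERT INTO `tocrawl` (`uri`, `title`, `timestamp`)
* VALUES ('http://exampleonionaddress.onion/', 'Example onion site', '0');
*
* The table name and the address above are placeholders; substitute your own.
**/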
use st\y\curl_limit, st\y\remote_files, st\y\robots_txt, st\y\uri;
use function st\y\gopher, st\y\hash_uri_onion, st\y\parse_seconds;
use const st\y\CURLOPT_TOR, st\y\CURLOPT_TLS_ENCRYPTONLY, st\y\VERSION, localhost\ONION_SEARCH;
spl_autoload_register();
require 'st/y/const/CURLOPT_TOR.php';
require 'st/y/const/CURLOPT_TLS_ENCRYPTONLY.php';
require 'st/y/const/VERSION.php';
require 'st/y/function/error_handler.php';
require 'st/y/function/gopher.php';
require 'st/y/function/hash_uri_onion.php';
require 'st/y/function/parse_seconds.php';
require 'localhost/const/ONION_SEARCH.php';
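/**
* A minimal sketch of the shape ONION_SEARCH is expected to have, inferred
* from the keys used throughout this script; the real constant lives in
* localhost/const/ONION_SEARCH.php and every value below is a placeholder:
*
* const ONION_SEARCH = array(
* 'DBO' => array('mysql:host=localhost;dbname=spider', 'user', 'password'),
* 'MAIN' => array('TOCRAWL' => 'tocrawl', 'NOCRAWL' => 'nocrawl',
* 'BLACKLIST' => 'blacklist', 'REFRESHDELAY' => 86400, 'DEBUG' => true),
* 'REMOTE' => array('USERAGENT' => 'examplespider', 'TIMEOUT' => 30),
* 'LIMIT' => array('LIMIT' => 10485760, 'DEBUG' => false),
* );
**/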
set_error_handler('\\st\\y\\error_handler');
/**
* We need to be sure that the functions and classes used by this
* spider are present. If the library version is too old to contain
* these, we need to alert the user as to why the needed constructs are
* missing and alert him/her as to where the needed library can be
* downloaded.
**/
if(version_compare(VERSION['include.d'], '0.0.1.1', '<')):
throw new LogicException('Your include.d is too old. You need version 0.0.1.1 or above.
You may find an update to download at .');
endif;
// The file-downloading object
$cURL = new remote_files(array(
CURLOPT_FOLLOWLOCATION => false,
/**
* The PHP documentation for \curl_setopt() says
* that the \CURLOPT_NOPROGRESS setting should only be set to false for
* debugging. *THIS IS A LIE.* When this is set to false, the function
* specified by the \CURLOPT_PROGRESSFUNCTION setting is actually
* called. However, if the \CURLOPT_NOPROGRESS setting is *not* set to
* false, the function specified by the \CURLOPT_PROGRESSFUNCTION
* setting is *not* called, meaning that we have no way to terminate a
* runaway download.
**/
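/**
* Presumably, the curl_limit object below is invokable with the standard
* cURL progress-callback signature, roughly
* function($handle, $download_total, $downloaded, $upload_total, $uploaded),
* and aborts a runaway transfer by returning a non-zero value once
* $downloaded exceeds the configured limit. This is an assumption about the
* \st\y\curl_limit class, not something documented here.
**/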
CURLOPT_NOPROGRESS => false,
CURLOPT_PROGRESSFUNCTION => new curl_limit(ONION_SEARCH['LIMIT']['LIMIT'], ONION_SEARCH['LIMIT']['DEBUG']),
CURLOPT_USERAGENT => ONION_SEARCH['REMOTE']['USERAGENT'],
CURLOPT_TIMEOUT => ONION_SEARCH['REMOTE']['TIMEOUT'],
CURLOPT_PROTOCOLS => CURLPROTO_HTTPS|CURLPROTO_HTTP|CURLPROTO_GOPHER,
));
// We need to use Tor to retrieve files from onion space.
$cURL->setopt_array(CURLOPT_TOR);
// Use TLS for encryption only, not for identity verification.
$cURL->setopt_array(CURLOPT_TLS_ENCRYPTONLY);
// These four constants are easier to use in strings if they are variables.
$blacklist = ONION_SEARCH['MAIN']['BLACKLIST'];
$nocrawl = ONION_SEARCH['MAIN']['NOCRAWL'];
$tocrawl = ONION_SEARCH['MAIN']['TOCRAWL'];
$useragent = ONION_SEARCH['REMOTE']['USERAGENT'];
// The database reading/writing object
$PDO = new PDO(...ONION_SEARCH['DBO']);
/**
* The documentation insists that using \PDO::prepare() is better than
* using \PDO::query(). I suppose that we can comply. Honestly, the
* documentation is probably right, I am just frustrated that I had to
* rewrite my code when moving away from the \mysqli class. The \mysqli class
* also has this functionality, but the documentation does not seem to
* dissuade people from using regular-style queries.
**/
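// To be fair, prepared statements also give us safe parameter binding for
// values such as :URI and let each statement be reused on every iteration of
// the loops below.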
$query = array(
'in' => array(
'tocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = :URI"),
'nocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = :URI"),
'blacklist' => $PDO->prepare("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = :hash"),
),
'add' => array(
'tocrawl' => $PDO->prepare("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES (:URI,:title,'0')"),
'nocrawl' => $PDO->prepare("INSERT INTO `$nocrawl` (`uri`) VALUES (:URI)"),
),
'list' => array(
'crawl' => $PDO->prepare("SELECT `uri` FROM `$tocrawl` WHERE `timestamp` < :time ORDER BY `timestamp` ASC"),
),
'remove' => array(
'tocrawl' => $PDO->prepare("DELETE FROM `$tocrawl` WHERE `uri` = :URI"),
),
'update' => array(
'title' => $PDO->prepare("UPDATE `$tocrawl` SET `title` = :title WHERE `uri` = :URI"),
'timestamp' => $PDO->prepare("UPDATE `$tocrawl` SET `timestamp` = :time WHERE `uri` = :URI"),
),
);
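/**
* For reference, the prepared statements above imply roughly the following
* table layout. This is a sketch only; the real schema is whatever was
* created for the tables named in ONION_SEARCH['MAIN'], and the column types
* here are guesses:
*
* CREATE TABLE `tocrawl` (`uri` VARCHAR(255) NOT NULL, `title` TEXT NOT NULL, `timestamp` INT NOT NULL);
* CREATE TABLE `nocrawl` (`uri` VARCHAR(255) NOT NULL);
* CREATE TABLE `blacklist` (`hash` VARCHAR(255) NOT NULL);
**/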
// Set up the document parser.
$DOMDocument = new DOMDocument();
/**
* Prevent malformed Web pages from killing the spider.
* We will need to clear the errors as they come up to prevent them
* from filling the buffer though.
**/
$DOMDocument->recover = true;
libxml_use_internal_errors(true);
// This allows us to loop until we are done, then stop.
$continue_loop = true;
while($continue_loop):
/**
* If we have URIs to process, this will be set to true immediately in
* the second nested loop. If this loop finishes without executing the
* contained loop, it means that we must not have had any URIs to
* process.
**/
$continue_loop = false;
/**
* For debugging purposes, it can be useful to start this script on a
* particular URI. When this script finds URIs on a Web page, it checks
* them for validity and properly discards any that are invalid. However, when
* given a URI via the command line, it assumes that this URI is valid
* and will abort otherwise. This is a feature, not a bug. If you are
* trying to debug a particular page and you accidentally specify a bad
* URI, continuing despite the mistake is the incorrect behavior.
*
* Furthermore, it is assumed that the URI specified this way is part
* of a site that already exists in the database. This tool is not a
* way to add URIs, it is a way to test bugs in the spider. If a
* particular page crashed the spider, this feature allows you to start
* with the crashing page so that you don't have to wait for the spider
* to find that page again in order to test a fix in the spider code.
**/
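/**
* For example, assuming this script is saved as spider.php, a crashing page
* could be retested with something like:
*
* ./spider.php 'http://exampleonionaddress.onion/problem-page.html'
*
* Both the file name and the URI above are placeholders.
**/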
if(isset($argv[1])):
$DEBUG_URI = new uri($argv[1]);
$URIlist = array(
(object) array(
'uri' => new uri('/', $DEBUG_URI),
),
);
// We don't want to run this code a second time, so let's remove the trigger.
unset($argv[1]);
else:
$query['list']['crawl']->execute(array(
':time' => time()-ONION_SEARCH['MAIN']['REFRESHDELAY'],
));
$URIlist = $query['list']['crawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['list']['crawl']->closeCursor();
endif;
foreach($URIlist as $uri):
/**
* This time stamp is used to estimate how much time is remaining in
* the processing of a given site. For a more accurate estimation, we
* should probably move this closer to the loop below, but I would
* rather have the estimate be a little high than have it be a little
* low.
**/
$starttime = time();
$site_URI = new uri($uri->uri);
// We have URIs to process. Set this to true.
$continue_loop = true;
// If the onion address has been blacklisted, we should ignore it.
$hash = array(
':hash' => hash_uri_onion($site_URI),
);
$query['in']['blacklist']->execute($hash);
$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['blacklist']->closeCursor();
$blacklisted = $result[0]->{'COUNT(*)'} != '0';
// If the onion address has been blacklisted, we should remove it from the database.
if($blacklisted):
$query['remove']['tocrawl']->execute(array(
':URI' => $site_URI,
));
$query['remove']['tocrawl']->closeCursor();
continue;
endif;
// Different types of URI need to be handled differently.
switch($site_URI->scheme):
case 'https':
case 'http':
$robotstxt_URI = new uri('/robots.txt', $site_URI);
break;
case 'gopher':
$robotstxt_URI = new uri('/0/robots.txt', $site_URI);
break;
default:
throw new RuntimeException("$useragent does not know how to crawl \"$site_URI->scheme\" URIs, URI \"$site_URI\" found in table \"$tocrawl\".");
endswitch;
// Information about the current site will be held in a pair of arrays.
$done = array();
$queue = array($site_URI);
/**
* If we are debugging with a particular URI, we should make sure that
* that URI is at the front of the queue.
**/
if(isset($DEBUG_URI)):
$queue[] = $DEBUG_URI;
// We don't want to run this code a second time, so let's remove the trigger.
unset($DEBUG_URI);
endif;
// The spider should respect the wishes of the webmaster.
if(ONION_SEARCH['MAIN']['DEBUG']):
echo "Checking \"robots.txt\" file at <$robotstxt_URI>\n";
endif;
$page = $cURL->get($robotstxt_URI);
if($page === false):
// Using blank data when no data is available allows us to assume that
// we actually have data.
$robotstxt = new robots_txt('');
else:
$robotstxt = new robots_txt($page);
endif;
foreach($robotstxt->sitemaps as $sitemap):
$queue[] = new uri($sitemap, $site_URI);
endforeach;
while(null !== ($URI = array_shift($queue))):
$done[] = $URI;
// The spider should respect the wishes of the webmaster.
if($robotstxt->disallows(ONION_SEARCH['REMOTE']['USERAGENT'], $URI)):
continue;
endif;
// Retrieve the page.
if(ONION_SEARCH['MAIN']['DEBUG']):
echo "Working with page at URI <$URI>\n";
endif;
$page = $cURL->get($URI);
// If the page is empty, there is no point in parsing it.
if(!empty($page)):
$DOMDocument->loadXML($page);
// Those lazy fools wrote bad markup and put errors in our buffer.
// We should clear them.
libxml_clear_errors();
/**
* If we set the document URI, the base URI resolver is able to handle
* <base> tags that specify a relative base.
**/
$DOMDocument->documentURI = $URI;
/**
* This is just a little witchery to find the base URI only once
* instead of having to find it again for each <a> tag. If we used the
* "baseURI" property of every <a> tag, we would need to instantiate
* the \st\y\uri class for each hyperlink if a <base> tag were
* present, as well as check to see if a "baseURI" property were set
* for each <a> tag. It is much more efficient to check once per page
* and perform the needed setup once per page.
**/
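// For example, <base href="http://exampleonionaddress.onion/docs/"> makes a
// relative href such as "page.html" resolve to
// http://exampleonionaddress.onion/docs/page.html.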
$base_tag = $DOMDocument->getElementsByTagName('base');
if($base_tag->length and ($baseURI = $base_tag->item(0)->baseURI)):
$base = new uri($baseURI);
else:
$base = $URI;
endif;
foreach($DOMDocument->getElementsByTagName('loc') as $LOC):
// We need to make relative URIs absolute.
$new_URI = new uri($LOC->textContent, $base);
// We also need to strip away the fragment, if any.
unset($new_URI->fragment);
$new_URI_site = new uri('/', $new_URI);
if($new_URI_site == $site_URI):
if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
$queue[] = $new_URI;
endif;
else:
// We do not want to keep track of URIs using blacklisted onion addresses.
$hash = array(
':hash' => hash_uri_onion($new_URI_site),
);
$query['in']['blacklist']->execute($hash);
$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['blacklist']->closeCursor();
$not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
// If we do not check for the presence of an existing anchor in our
// database, the database will grow needlessly when recrawling pages.
switch($new_URI_site->scheme):
case 'https':
case 'http':
case 'gopher':
$query['in']['tocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['tocrawl']->closeCursor();
break;
default:
$query['in']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['nocrawl']->closeCursor();
endswitch;
$is_new_site = $result[0]->{'COUNT(*)'} == '0';
if(isset($new_URI->host) and (substr($new_URI->host, -6) == '.onion') and
$not_blacklisted and $is_new_site):
switch($new_URI_site->scheme):
case 'https':
case 'http':
case 'gopher':
$query['add']['tocrawl']->execute(array(
':URI' => $new_URI_site,
':title' => $new_URI_site,
));
$query['add']['tocrawl']->closeCursor();
break;
default:
$query['add']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$query['add']['nocrawl']->closeCursor();
endswitch;
endif;
endif;
endforeach;
foreach($DOMDocument->getElementsByTagName('a') as $A):
$href = $A->attributes->getNamedItem('href');
if($href):
// If we do not wrap this in a try block, malformed URIs in hyperlinks will kill the spider.
try {
// We need to make relative URIs absolute.
$new_URI = new uri($href->textContent, $base);
// We also need to strip away the fragment, if any.
unset($new_URI->fragment);
$new_URI_site = new uri('/', $new_URI);
if($new_URI_site == $site_URI):
if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
$queue[] = $new_URI;
endif;
else:
// We do not want to keep track of URIs using blacklisted onion addresses.
$hash = array(
':hash' => hash_uri_onion($new_URI_site),
);
$query['in']['blacklist']->execute($hash);
$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['blacklist']->closeCursor();
$not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
// If we do not check for the presence of an existing anchor in our
// database, the database will grow needlessly when recrawling pages.
switch($new_URI_site->scheme):
case 'https':
case 'http':
case 'gopher':
$query['in']['tocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['tocrawl']->closeCursor();
break;
default:
$query['in']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['nocrawl']->closeCursor();
endswitch;
$is_new_site = $result[0]->{'COUNT(*)'} == '0';
if(isset($new_URI_site->host) and substr($new_URI_site->host, -6) == '.onion' and
$not_blacklisted and $is_new_site):
switch($new_URI_site->scheme):
case 'https':
case 'http':
case 'gopher':
if(isset($A->textContent)):
$prepared_text = $A->textContent;
else:
$prepared_text = '';
endif;
$query['add']['tocrawl']->execute(array(
':URI' => $new_URI_site,
':title' => $prepared_text,
));
$query['add']['tocrawl']->closeCursor();
break;
default:
$query['add']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$query['add']['nocrawl']->closeCursor();
endswitch;
endif;
endif;
// Some people just do not know how to form valid URIs ...
} catch(DomainException $e) {
if($e->getFile() == uri::FILE):
if(ONION_SEARCH['MAIN']['DEBUG']):
echo "Malformed URI found on page <$URI>, ignoring: <$href->textContent>\n";
endif;
else:
throw $e;
endif;
}
endif;
endforeach;
endif;
foreach(gopher($page) as $line):
$new_URI = $line['uri'];
$new_URI_site = new uri('/', $new_URI);
if($new_URI_site == $site_URI):
switch($line['type']):
/**
* We might as well use Gopher's file type hints to our advantage.
* If the server says that it is something that we know we do not want,
* we will ignore it. If the server is lying, we simply will crawl the
* site in a misguided fashion. If the site administrator wants their
* site crawled properly, they should configure their server correctly.
*
* Please note that Gopher directory responses are accepted by this
* spider even if this is an HTTPS/HTTP server and not a Gopher server.
* We do not care what protocol was used to retrieve the file, we only
* care that the syntax looks like a gopher directory file.
**/
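/**
* For reference, these item-type characters come from the gopher menu format
* (RFC 1436, plus the common 'h' extension): '1' is a submenu, '2' is a CSO
* phone-book server, '7' is a search server, and 'h' is an HTML file.
**/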
case '1':
case '2':
case '7':
case 'h':
if(!in_array($line['uri'], $queue) and !in_array($line['uri'], $done)):
$queue[] = $line['uri'];
endif;
break;
endswitch;
else:
/**
* Gopher directories are only capable of linking to Gopher servers,
* Telnet servers, and TN3270 servers. We cannot crawl Telnet or TN3270
* servers, so by checking whether this URI uses the 'gopher' scheme, we
* effectively check whether we can crawl this server.
**/
if($new_URI_site->scheme == 'gopher'):
$query['in']['tocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['tocrawl']->closeCursor();
else:
$query['in']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['nocrawl']->closeCursor();
endif;
$is_new_site = $result[0]->{'COUNT(*)'} == '0';
if($is_new_site):
if($new_URI_site->scheme == 'gopher'):
$query['add']['tocrawl']->execute(array(
':URI' => $new_URI_site,
// Use the site URI as a placeholder title here; gopher display text is not tracked.
':title' => $new_URI_site,
));
$query['add']['tocrawl']->closeCursor();
else:
$query['add']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$query['add']['nocrawl']->closeCursor();
endif;
endif;
endif;
endforeach;
// Save information about the index page:
if($URI->path == '/'):
$title = $DOMDocument->getElementsByTagName('title');
// If the page has no <title>, we can just use the URI instead.
if($title->length):
$prepared_title = $title->item(0)->textContent;
else:
$prepared_title = $site_URI;
endif;
$query['update']['title']->execute(array(
':URI' => $site_URI,
':title' => $prepared_title,
));
$query['update']['title']->closeCursor();
endif;
if(ONION_SEARCH['MAIN']['DEBUG']):
$queue_count = count($queue);
$done_count = count($done);
$percent = (int) ($done_count / ($queue_count + $done_count) * 100);
$seconds_passed = time() - $starttime;
$seconds_left = ($seconds_passed / $done_count) * $queue_count;
$time_passed = parse_seconds($seconds_passed);
$time_left = parse_seconds($seconds_left);
echo "Queue: $queue_count; Done: $done_count; Estimated percentage complete: %$percent\n";
echo "Time spent: $time_passed[d] day(s), $time_passed[h] hour(s), $time_passed[m] minute(s), and $time_passed[s] second(s)\n";
echo "Estimated time remaining: $time_left[d] day(s), $time_left[h] hour(s), $time_left[m] minute(s), and $time_left[s] second(s)\n";
endif;
endwhile;
$query['update']['timestamp']->execute(array(
':URI' => $site_URI,
':time' => time(),
));
$query['update']['timestamp']->closeCursor();
endforeach;
endwhile;
/**
* TO DO LIST:
*
* add check to see if crawlable site is present; if not present, remove from active list
* add check to see if non-crawlable site is present; if not present, remove from active list
**/