Use \PDO class instead of \mysqli class
parent 6a24c899f2
commit 38c5554cb5
@@ -29,21 +29,10 @@ const ONION_SEARCH = array(
'REFRESHDELAY' => 30*24*60*60,
),

// Settings for \mysqli use:
'MYSQLI' => array(
// (string) The host of the MySQL database
'HOST' => 'localhost',
// (string) The user name to use when connecting to the MySQL database
'USERNAME' => 'onionspider',
// (string) The password to use when connecting to the MySQL database
'PASSWORD' => 'password',// Please use a better password than this.
// (string) The name of the database that OnionSpider should use
'DATABASE' => 'onionspider',
// (integer) The port to use when connecting to the MySQL database
'PORT' => 3306,// The default MySQL port
// (string) The socket to use when connecting to the MySQL database
'SOCKET' => null,// I ... do not actually know what this does.
),
// (array) Settings for \PDO instantiation (database-specific):
// This array will be passed into \PDO::__construct() using
// the "..." operator.
'DBO' => array(),

// Settings for \st\y\curl_limit use:
'LIMIT' => array(
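Note: the new 'DBO' array above is spread into \PDO::__construct() with the "..." operator, so it should hold that constructor's arguments. A minimal sketch of what it might contain for the MySQL setup above (the values are hypothetical; adjust the DSN, credentials, and options to your installation):

'DBO' => array(
'mysql:host=localhost;dbname=onionspider;charset=utf8mb4', // DSN
'onionspider',                                             // username
'password',                                                // password; please use a better one
array(PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION),        // driver options
),

With a positional array like this, new PDO(...ONION_SEARCH['DBO']) expands to PDO::__construct($dsn, $username, $password, $options).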
214 spider.php
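The spider.php changes below swap interpolated \mysqli queries (built with escape_string()) for \PDO prepared statements with named placeholders. A minimal sketch of that pattern, assuming a connected $PDO handle and the `tocrawl` table used below:

// Placeholders such as :URI are bound at execute() time, so no manual escaping is needed.
$statement = $PDO->prepare("SELECT COUNT(*) FROM `tocrawl` WHERE `uri` = :URI");
$statement->execute(array(':URI' => 'http://example.onion/'));
$count = $statement->fetchColumn(); // reads COUNT(*) directly, as an alternative to fetchAll()
$statement->closeCursor();          // frees the result set so the statement can be re-executed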
@@ -73,21 +73,44 @@ $cURL = new remote_files(array(
// We need to use Tor to retrieve files from onion space
$cURL->setopt_array(CURLOPT_TOR);

// These four constants are easier to use in strings if they are variables.
$blacklist = ONION_SEARCH['MAIN']['BLACKLIST'];
$nocrawl = ONION_SEARCH['MAIN']['NOCRAWL'];
$tocrawl = ONION_SEARCH['MAIN']['TOCRAWL'];
$useragent = ONION_SEARCH['REMOTE']['USERAGENT'];

// The database reading/writing object
$MySQLi = new mysqli(
ONION_SEARCH['MYSQLI']['HOST'],
ONION_SEARCH['MYSQLI']['USERNAME'],
ONION_SEARCH['MYSQLI']['PASSWORD'],
ONION_SEARCH['MYSQLI']['DATABASE'],
ONION_SEARCH['MYSQLI']['PORT'],
ONION_SEARCH['MYSQLI']['SOCKET']
);
/** I think this makes it so our spider writes to the database
* automatically as data is acquired. The advantage for us is that if
* the spider is terminated prematurely, we do not lose any data that
* we already obtained.
$PDO = new PDO(...ONION_SEARCH['DBO']);

/**
* The documentation insists that using \PDO::prepare() is better than
* using \PDO::query(). I suppose that we can comply. Honestly, the
* documentation is probably right, I am just frustrated that I had to
* rewrite my code when moving away from the \mysqli class. The \mysqli class
* also has this functionality, but the documentation does not seem to
* dissuade people from using regular-style queries.
**/
$MySQLi->autocommit(true);
$query = array(
'in' => array(
'tocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = :URI"),
'nocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = :URI"),
'blacklist' => $PDO->prepare("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = :hash"),
),
'add' => array(
'tocrawl' => $PDO->prepare("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES (:URI,:title,'0')"),
'nocrawl' => $PDO->prepare("INSERT INTO `$nocrawl` (`uri`) VALUES (:URI)"),
),
'list' => array(
'crawl' => $PDO->prepare("SELECT `uri` FROM `$tocrawl` WHERE `timestamp` < :time ORDER BY `uri`"),
),
'remove' => array(
'tocrawl' => $PDO->prepare("DELETE FROM `$tocrawl` WHERE `uri` = :URI"),
),
'update' => array(
'title' => $PDO->prepare("UPDATE `$tocrawl` SET `title` = :title WHERE `uri` = :URI"),
'timestamp' => $PDO->prepare("UPDATE `$tocrawl` SET `timestamp` = :time WHERE `uri` = :URI"),
),
);

// For now, we only want to deal with HTTP and HTTPS.
$approved_protocols = array(
@@ -95,12 +118,6 @@ $approved_protocols = array(
'https' => true,
);

// These four constants are easier to use in strings if they are variables.
$blacklist = ONION_SEARCH['MAIN']['BLACKLIST'];
$nocrawl = ONION_SEARCH['MAIN']['NOCRAWL'];
$tocrawl = ONION_SEARCH['MAIN']['TOCRAWL'];
$useragent = ONION_SEARCH['REMOTE']['USERAGENT'];

// Set up the document parser.
$DOMDocument = new DOMDocument();

@@ -123,7 +140,11 @@ while($continue_loop):
* process.
**/
$continue_loop = false;
$URIlist = $MySQLi->query("SELECT `uri` FROM `$tocrawl` WHERE `timestamp` < ".(time()-ONION_SEARCH['MAIN']['REFRESHDELAY'])." ORDER BY `uri`");
$query['list']['crawl']->execute(array(
':time' => time()-ONION_SEARCH['MAIN']['REFRESHDELAY'],
));
$URIlist = $query['list']['crawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['list']['crawl']->closeCursor();
foreach($URIlist as $uri):
/**
* This time stamp is used to estimate how much time is remaining in
@@ -133,17 +154,21 @@ while($continue_loop):
* low.
**/
$starttime = time();
$site_URI = new uri($uri['uri']);
$prepared_site_URI = $MySQLi->escape_string($site_URI);
$site_URI = new uri($uri->uri);
// We have URIs to process. Set this to true.
$continue_loop = true;
// If the onion address has been blacklisted, we should ignore it.
$hash = hash_uri_onion($site_URI);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$blacklisted = $result->fetch_array()['COUNT(*)'] != '0';
$result->free();
$hash = array(
':hash' => hash_uri_onion($site_URI),
);
$query['in']['blacklist']->execute($hash);
$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['blacklist']->closeCursor();
$blacklisted = $result[0]->{'COUNT(*)'} != '0';
// If the onion address has been blacklisted, we should remove it from the database.
if($blacklisted):
$MySQLi->query("DELETE FROM `$tocrawl` WHERE `url` = '$URL'");
$query['remove']['tocrawl']->execute(array(':URI' => $site_URI));
$query['remove']['tocrawl']->closeCursor();
continue;
endif;
// Different types of URI need to be handled differently.
@@ -174,7 +199,7 @@ while($continue_loop):
$robotstxt = new robots_txt($page);
endif;
foreach($robotstxt->sitemaps as $sitemap):
$URI = new uri($sitemap, $site_URI);
$queue[] = new uri($sitemap, $site_URI);
endforeach;
while(null !== ($URI = array_shift($queue))):
$done[] = $URI;
@@ -219,25 +244,34 @@ while($continue_loop):
$queue[] = $new_URI;
endif;
else:
// We do not want to keep track of blacklisted URIs.
$hash = hash_uri_onion($new_URI_site);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$not_blacklisted = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
// We do not want to keep track of URIs using blacklisted onion addresses.
$hash = array(
':hash' => hash_uri_onion($new_URI_site),
);
$query['in']['blacklist']->execute($hash);
$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['blacklist']->closeCursor();
$not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
// If we do not check for the presence of an existing anchor in our
// database, the database will grow needlessly when recrawling pages.
$prepared_new_URI_site = $MySQLi->escape_string($new_URI_site);
switch($new_URI_site->scheme):
case 'https':
case 'http':
case 'gopher':
$result = $MySQLi->query("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['tocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['tocrawl']->closeCursor();
break;
default:
$result = $MySQLi->query("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['nocrawl']->closeCursor();
endswitch;
$is_new_site = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
$is_new_site = $result[0]->{'COUNT(*)'} == '0';
if(isset($new_URI->host) and (substr($new_URI->host, -6) == '.onion'
or substr($new_URI->host, -7) == '.onion.') and
$not_blacklisted and $is_new_site):
@@ -245,10 +279,17 @@ while($continue_loop):
case 'https':
case 'http':
case 'gopher':
$MySQLi->query("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES('$prepared_new_URI_site','$prepared_new_URI_site','0')");
$query['add']['tocrawl']->execute(array(
':URI' => $new_URI_site,
':title' => $new_URI_site,
));
$query['add']['tocrawl']->closeCursor();
break;
default:
$MySQLi->query("INSERT INTO `$nocrawl` (`uri`) VALUES('$prepared_new_URI_site')");
$query['add']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$query['add']['nocrawl']->closeCursor();
endswitch;
endif;
endif;
@@ -268,25 +309,34 @@ while($continue_loop):
$queue[] = $new_URI;
endif;
else:
// We do not want to keep track of blacklisted URIs.
$hash = hash_uri_onion($new_URI_site);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$not_blacklisted = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
// We do not want to keep track of URIs using blacklisted onion addresses.
$hash = array(
':hash' => hash_uri_onion($new_URI_site),
);
$query['in']['blacklist']->execute($hash);
$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['blacklist']->closeCursor();
$not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
// If we do not check for the presence of an existing anchor in our
// database, the database will grow needlessly when recrawling pages.
$prepared_new_URI_site = $MySQLi->escape_string($new_URI_site);
switch($new_URI_site->scheme):
case 'https':
case 'http':
case 'gopher':
$result = $MySQLi->query("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['tocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['tocrawl']->closeCursor();
break;
default:
$result = $MySQLi->query("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['nocrawl']->closeCursor();
endswitch;
$is_new_site = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
$is_new_site = $result[0]->{'COUNT(*)'} == '0';
if(isset($new_URI_site->host) and (substr($new_URI_site->host, -6) == '.onion'
or substr($new_URI_site->host, -7) == '.onion.') and
$not_blacklisted and $is_new_site):
@@ -295,14 +345,21 @@ while($continue_loop):
case 'http':
case 'gopher':
if(isset($A->textContent)):
$prepared_text = $MySQLi->escape_string($A->textContent);
$prepared_text = $A->textContent;
else:
$prepared_text = '';
endif;
$MySQLi->query("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES('$prepared_new_URI_site','$prepared_text','0')");
$query['add']['tocrawl']->execute(array(
':URI' => $new_URI_site,
':title' => $prepared_text,
));
$query['add']['tocrawl']->closeCursor();
break;
default:
$MySQLi->query("INSERT INTO `$nocrawl` (`uri`) VALUES('$prepared_new_URI_site')");
$query['add']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$query['add']['nocrawl']->closeCursor();
endswitch;
endif;
endif;
@@ -346,7 +403,6 @@ while($continue_loop):
endif;
endswitch;
else:
$prepared_new_URI_site = $MySQLi->escape_string($new_URI_site);
/**
* Gopher directories are only capable of linking to Gopher servers,
* Telnet servers, and TN3270 servers. We cannot crawl Telnet or TN3270
@@ -354,31 +410,49 @@ while($continue_loop):
* effectively check to see if we can crawl this server.
**/
if($new_URI_site->scheme == 'gopher'):
$result = $MySQLi->query("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['tocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['tocrawl']->closeCursor();
else:
$result = $MySQLi->query("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['nocrawl']->closeCursor();
endif;
$is_new_site = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
$is_new_site = $result[0]->{'COUNT(*)'} == '0';
if($is_new_site):
$prepared_text = $line['title'];
if($new_URI_site->scheme == 'gopher'):
$MySQLi->query("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES('$prepared_new_URI_site','$prepared_text','0')");
$query['add']['tocrawl']->execute(array(
':URI' => $new_URI_site,
':title' => $prepared_text,
));
$query['add']['tocrawl']->closeCursor();
else:
$MySQLi->query("INSERT INTO `$nocrawl` (`uri`) VALUES('$prepared_new_URI_site')");
$query['add']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$query['add']['nocrawl']->closeCursor();
endif;
endif;
endif;
endforeach;
// Save information about the index page:
if($URI->path == '/'):
$title = $DOMDocument->getElementsByTagName('title');
// If the page has no <title/>, we can just use the URI instead.
if(isset($index['TITLE'][0]) and isset($values[$index['TITLE'][0]]['value'])):
$prepared_title = $MySQLi->escape_string($values[$index['TITLE'][0]]['value']);
if($title->length):
$prepared_title = $title->item(0)->textContent;
else:
$prepared_title = $prepared_site_URI;
$prepared_title = $site_URI;
endif;
$MySQLi->query("UPDATE `$tocrawl` SET `title` = '$prepared_title' WHERE `uri` = '$prepared_site_URI'");
$query['update']['title']->execute(array(
':URI' => $site_URI,
':title' => $prepared_title,
));
$query['update']['title']->closeCursor();
endif;
if(ONION_SEARCH['MAIN']['DEBUG']):
$queue_count = count($queue);
@@ -393,17 +467,17 @@ while($continue_loop):
echo "Estimated time remaining: $time_left[d] day(s), $time_left[h] hour(s), $time_left[m] minute(s), and $time_left[s] second(s)\n";
endif;
endwhile;
$timestamp = time();
$MySQLi->query("UPDATE `$tocrawl` SET `timestamp` = '$timestamp' WHERE `uri` = '$prepared_site_URI'");
$query['update']['timestamp']->execute(array(
':URI' => $site_URI,
':time' => time(),
));
$query['update']['timestamp']->closeCursor();
endforeach;
$URIlist->free();
endwhile;
$MySQLi->close();

/**
* TO DO LIST:
*
* add text file database support without removing MySQL database support
* add check to see if crawlable site is present; if not present, remove from active list
* add check to see if non-crawlable site is present; if not present, remove from active list
**/
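One loose end worth noting: the removed $MySQLi->autocommit(true) call has no direct replacement above, because PDO's MySQL driver starts in autocommit mode by default, so each execute() is still written out immediately. If batching all the writes for one crawled page ever becomes desirable, a hedged sketch of the explicit-transaction alternative:

$PDO->beginTransaction();
try {
// ... run the INSERT/UPDATE statements for one page here ...
$PDO->commit();    // persist everything for the page at once
} catch (Exception $e) {
$PDO->rollBack();  // discard the partial page on failure
throw $e;
}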