Use \PDO class instead of \mysqli class

This commit is contained in:
y.st 2016-02-10 00:01:28 -08:00
parent 6a24c899f2
commit 38c5554cb5
2 changed files with 148 additions and 85 deletions

View File

@ -29,21 +29,10 @@ const ONION_SEARCH = array(
'REFRESHDELAY' => 30*24*60*60,
),
// Settings for \mysqli use:
'MYSQLI' => array(
// (string) The host of the MySQL database
'HOST' => 'localhost',
// (string) The user name to use when connecting to the MySQL database
'USERNAME' => 'onionspider',
// (string) The password to use when connecting to the MySQL database
'PASSWORD' => 'password',// Please use a better password than this.
// (string) The name of the database that OnionSpider should use
'DATABASE' => 'onionspider',
// (integer) The port to use when connecting to the MySQL database
'PORT' => 3306,// The default MySQL port
// (string) The socket to use when connecting to the MySQL database
'SOCKET' => null,// I ... do not actually know what this does.
),
// (array) Settings for \PDO instantiation (database-specific):
// This array will be passed into the \PDO::__construct() function using
// the "..." operator.
'DBO' => array(),
// Settings for \st\y\curl_limit use:
'LIMIT' => array(

View File

@ -73,21 +73,44 @@ $cURL = new remote_files(array(
// We need to use Tor to retrieve files from onion space
$cURL->setopt_array(CURLOPT_TOR);
// These four constants are easier to use in strings if they are variables.
$blacklist = ONION_SEARCH['MAIN']['BLACKLIST'];
$nocrawl = ONION_SEARCH['MAIN']['NOCRAWL'];
$tocrawl = ONION_SEARCH['MAIN']['TOCRAWL'];
$useragent = ONION_SEARCH['REMOTE']['USERAGENT'];
// The database reading/writing object
$MySQLi = new mysqli(
ONION_SEARCH['MYSQLI']['HOST'],
ONION_SEARCH['MYSQLI']['USERNAME'],
ONION_SEARCH['MYSQLI']['PASSWORD'],
ONION_SEARCH['MYSQLI']['DATABASE'],
ONION_SEARCH['MYSQLI']['PORT'],
ONION_SEARCH['MYSQLI']['SOCKET']
);
/** I think this makes it so our spider writes to the database
* automatically as data is acquired. The advantage for us is that if
* the spider is terminated prematurely, we do not lose any data that
* we already obtained.
$PDO = new PDO(...ONION_SEARCH['DBO']);
/**
 * The documentation insists that using \PDO::prepare() is better than
 * using \PDO::query(). I suppose that we can comply. Honestly, the
 * documentation is probably right; I am just frustrated that I had to
 * rewrite my code when moving away from the \mysqli class. The \mysqli
 * class also has this functionality, but the documentation does not
 * seem to dissuade people from using regular-style queries.
 **/
$MySQLi->autocommit(true);
$query = array(
'in' => array(
'tocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = :URI"),
'nocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = :URI"),
'blacklist' => $PDO->prepare("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = :hash"),
),
'add' => array(
'tocrawl' => $PDO->prepare("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES (:URI,:title,'0')"),
'nocrawl' => $PDO->prepare("INSERT INTO `$nocrawl` (`uri`) VALUES (:URI)"),
),
'list' => array(
'crawl' => $PDO->prepare("SELECT `uri` FROM `$tocrawl` WHERE `timestamp` < :time ORDER BY `uri`"),
),
'remove' => array(
'tocrawl' => $PDO->prepare("DELETE FROM `$tocrawl` WHERE `uri` = :URI"),
),
'update' => array(
'title' => $PDO->prepare("UPDATE `$tocrawl` SET `title` = :title WHERE `uri` = :URI"),
'timestamp' => $PDO->prepare("UPDATE `$tocrawl` SET `timestamp` = :time WHERE `uri` = :URI"),
),
);
// For now, we only want to deal with HTTP and HTTPS.
$approved_protocols = array(
@ -95,12 +118,6 @@ $approved_protocols = array(
'https' => true,
);
// These four constants are easier to use in strings if they are variables.
$blacklist = ONION_SEARCH['MAIN']['BLACKLIST'];
$nocrawl = ONION_SEARCH['MAIN']['NOCRAWL'];
$tocrawl = ONION_SEARCH['MAIN']['TOCRAWL'];
$useragent = ONION_SEARCH['REMOTE']['USERAGENT'];
// Set up the document parser.
$DOMDocument = new DOMDocument();
@ -123,7 +140,11 @@ while($continue_loop):
* process.
**/
$continue_loop = false;
$URIlist = $MySQLi->query("SELECT `uri` FROM `$tocrawl` WHERE `timestamp` < ".(time()-ONION_SEARCH['MAIN']['REFRESHDELAY'])." ORDER BY `uri`");
$query['list']['crawl']->execute(array(
':time' => time()-ONION_SEARCH['MAIN']['REFRESHDELAY'],
));
$URIlist = $query['list']['crawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['list']['crawl']->closeCursor();
foreach($URIlist as $uri):
/**
* This time stamp is used to estimate how much time is remaining in
@ -133,17 +154,21 @@ while($continue_loop):
* low.
**/
$starttime = time();
$site_URI = new uri($uri['uri']);
$prepared_site_URI = $MySQLi->escape_string($site_URI);
$site_URI = new uri($uri->uri);
// We have URIs to process. Set this to true.
$continue_loop = true;
// If the onion address has been blacklisted, we should ignore it.
$hash = hash_uri_onion($site_URI);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$blacklisted = $result->fetch_array()['COUNT(*)'] != '0';
$result->free();
$hash = array(
':hash' => hash_uri_onion($site_URI),
);
$query['in']['blacklist']->execute($hash);
$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['list']['crawl']->closeCursor();
$blacklisted = $result[0]->{'COUNT(*)'} != '0';
// If the onion address has been blacklisted, we should remove it from the database.
if($blacklisted):
$MySQLi->query("DELETE FROM `$tocrawl` WHERE `url` = '$URL'");
$query['remove']['tocrawl']->execute($hash);
$query['remove']['tocrawl']->closeCursor();
continue;
endif;
// Different types of URI need to be handled differently.
@ -174,7 +199,7 @@ while($continue_loop):
$robotstxt = new robots_txt($page);
endif;
foreach($robotstxt->sitemaps as $sitemap):
$URI = new uri($sitemap, $site_URI);
$queue[] = new uri($sitemap, $site_URI);
endforeach;
while(null !== ($URI = array_shift($queue))):
$done[] = $URI;
@ -219,25 +244,34 @@ while($continue_loop):
$queue[] = $new_URI;
endif;
else:
// We do not want to keep track of blacklisted URIs.
$hash = hash_uri_onion($new_URI_site);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$not_blacklisted = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
// We do not want to keep track of URIs using blacklisted onion addresses.
$hash = array(
':hash' => hash_uri_onion($new_URI_site),
);
$query['in']['blacklist']->execute($hash);
$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['blacklist']->closeCursor();
$not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
// If we do not check for the presence of an existing anchor in our
// database, the database will grow needlessly when recrawling pages.
$prepared_new_URI_site = $MySQLi->escape_string($new_URI_site);
switch($new_URI_site->scheme):
case 'https':
case 'http':
case 'gopher':
$result = $MySQLi->query("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['tocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['tocrawl']->closeCursor();
break;
default:
$result = $MySQLi->query("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['nocrawl']->closeCursor();
endswitch;
$is_new_site = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
$is_new_site = $result[0]->{'COUNT(*)'} == '0';
if(isset($new_URI->host) and (substr($new_URI->host, -6) == '.onion'
or substr($new_URI->host, -7) == '.onion.') and
$not_blacklisted and $is_new_site):
@ -245,10 +279,17 @@ while($continue_loop):
case 'https':
case 'http':
case 'gopher':
$MySQLi->query("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES('$prepared_new_URI_site','$prepared_new_URI_site','0')");
$query['add']['tocrawl']->execute(array(
':URI' => $new_URI_site,
':title' => $new_URI_site,
));
$query['add']['tocrawl']->closeCursor();
break;
default:
$MySQLi->query("INSERT INTO `$nocrawl` (`uri`) VALUES('$prepared_new_URI_site')");
$query['add']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$query['add']['nocrawl']->closeCursor();
endswitch;
endif;
endif;
@ -268,25 +309,34 @@ while($continue_loop):
$queue[] = $new_URI;
endif;
else:
// We do not want to keep track of blacklisted URIs.
$hash = hash_uri_onion($new_URI_site);
$result = $MySQLi->query("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = '$hash'");
$not_blacklisted = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
// We do not want to keep track of URIs using blacklisted onion addresses.
$hash = array(
':hash' => hash_uri_onion($new_URI_site),
);
$query['in']['blacklist']->execute($hash);
$result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['blacklist']->closeCursor();
$not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
// If we do not check for the presence of an existing anchor in our
// database, the database will grow needlessly when recrawling pages.
$prepared_new_URI_site = $MySQLi->escape_string($new_URI_site);
switch($new_URI_site->scheme):
case 'https':
case 'http':
case 'gopher':
$result = $MySQLi->query("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['tocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['tocrawl']->closeCursor();
break;
default:
$result = $MySQLi->query("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['nocrawl']->closeCursor();
endswitch;
$is_new_site = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
$is_new_site = $result[0]->{'COUNT(*)'} == '0';
if(isset($new_URI_site->host) and (substr($new_URI_site->host, -6) == '.onion'
or substr($new_URI_site->host, -7) == '.onion.') and
$not_blacklisted and $is_new_site):
@ -295,14 +345,21 @@ while($continue_loop):
case 'http':
case 'gopher':
if(isset($A->textContent)):
$prepared_text = $MySQLi->escape_string($A->textContent);
$prepared_text = $A->textContent;
else:
$prepared_text = '';
endif;
$MySQLi->query("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES('$prepared_new_URI_site','$prepared_text','0')");
$query['add']['tocrawl']->execute(array(
':URI' => $new_URI_site,
':title' => $prepared_text,
));
$query['add']['tocrawl']->closeCursor();
break;
default:
$MySQLi->query("INSERT INTO `$nocrawl` (`uri`) VALUES('$prepared_new_URI_site')");
$query['add']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$query['add']['nocrawl']->closeCursor();
endswitch;
endif;
endif;
@ -346,7 +403,6 @@ while($continue_loop):
endif;
endswitch;
else:
$prepared_new_URI_site = $MySQLi->escape_string($new_URI_site);
/**
* Gopher directories are only capable of linking to Gopher servers,
* Telnet servers, and TN3270 servers. We cannot crawl Telnet or TN3270
@ -354,31 +410,49 @@ while($continue_loop):
* effectively check to see if we can crawl this server.
**/
if($new_URI_site->scheme == 'gopher'):
$result = $MySQLi->query("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['tocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['tocrawl']->closeCursor();
else:
$result = $MySQLi->query("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = '$prepared_new_URI_site'");
$query['in']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
$query['in']['nocrawl']->closeCursor();
endif;
$is_new_site = $result->fetch_array()['COUNT(*)'] == '0';
$result->free();
$is_new_site = $result[0]->{'COUNT(*)'} == '0';
if($is_new_site):
$prepared_text = $MySQLi->escape_string($line['title']);
if($new_URI_site->scheme == 'gopher'):
$MySQLi->query("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES('$prepared_new_URI_site','$prepared_text','0')");
$query['add']['tocrawl']->execute(array(
':URI' => $new_URI_site,
':title' => $prepared_text,
));
$query['add']['tocrawl']->closeCursor();
else:
$MySQLi->query("INSERT INTO `$nocrawl` (`uri`) VALUES('$prepared_new_URI_site')");
$query['add']['nocrawl']->execute(array(
':URI' => $new_URI_site,
));
$query['add']['nocrawl']->closeCursor();
endif;
endif;
endif;
endforeach;
// Save information about the index page:
if($URI->path == '/'):
$title = $DOMDocument->getElementsByTagName('title');
// If the page has no <title/>, we can just use the URI instead.
if(isset($index['TITLE'][0]) and isset($values[$index['TITLE'][0]]['value'])):
$prepared_title = $MySQLi->escape_string($values[$index['TITLE'][0]]['value']);
if($title->length):
$prepared_title = $title->item(0)->textContent;
else:
$prepared_title = $prepared_site_URI;
$prepared_title = $site_URI;
endif;
$MySQLi->query("UPDATE `$tocrawl` SET `title` = '$prepared_title' WHERE `uri` = '$prepared_site_URI'");
$query['update']['title']->execute(array(
':URI' => $new_URI_site,
':title' => $prepared_title,
));
$query['update']['title']->closeCursor();
endif;
if(ONION_SEARCH['MAIN']['DEBUG']):
$queue_count = count($queue);
@ -393,17 +467,17 @@ while($continue_loop):
echo "Estimated time remaining: $time_left[d] day(s), $time_left[h] hour(s), $time_left[m] minute(s), and $time_left[s] second(s)\n";
endif;
endwhile;
$timestamp = time();
$MySQLi->query("UPDATE `$tocrawl` SET `timestamp` = '$timestamp' WHERE `uri` = '$prepared_site_URI'");
$query['update']['timestamp']->execute(array(
':URI' => $new_URI_site,
':time' => time(),
));
$query['update']['timestamp']->closeCursor();
endforeach;
$URIlist->free();
endwhile;
$MySQLi->close();
/**
* TO DO LIST:
*
* add text file database support without removing MySQL database support
* add check to see if crawlable site is present; if not present, remove from active list
* add check to see if non-crawlable site is present; if not present, remove from active list
**/