#!/usr/bin/php
<?php
/**
 * Onion Search
 * Copyright (C) 2015-2016 y.st. <mailto:copyright@y.st>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
**/

/**
 * REMEMBER TO ADD AT LEAST ONE URI TO YOUR INITIAL DATABASE.
 * IF YOU DO NOT, THE SPIDER WILL NOT GO ANYWHERE.
 * After the first crawl, or rather, after the first URI you add,
 * manually adding URIs will no longer be necessary.
**/
use st\y\curl_limit, st\y\remote_files, st\y\robots_txt, st\y\uri;
use function st\y\gopher, st\y\hash_uri_onion, st\y\parse_seconds;
use const st\y\CURLOPT_TOR, st\y\CURLOPT_TLS_ENCRYPTONLY, st\y\VERSION, localhost\ONION_SEARCH;
spl_autoload_register();
require 'st/y/const/CURLOPT_TOR.php';
require 'st/y/const/CURLOPT_TLS_ENCRYPTONLY.php';
require 'st/y/const/VERSION.php';
require 'st/y/function/error_handler.php';
require 'st/y/function/gopher.php';
require 'st/y/function/hash_uri_onion.php';
require 'st/y/function/parse_seconds.php';
require 'localhost/const/ONION_SEARCH.php';
set_error_handler('\\st\\y\\error_handler');
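
/**
 * The ONION_SEARCH constant is defined in localhost/const/ONION_SEARCH.php
 * and is site-specific. Based on the keys used in this script, that file is
 * expected to define something roughly like the following (illustrative
 * values only; the real file may differ):
 *
 *   namespace localhost;
 *   const ONION_SEARCH = array(
 *       'DBO' => array('mysql:host=localhost;dbname=spider', 'user', 'password'),
 *       'MAIN' => array(
 *           'TOCRAWL' => 'tocrawl',      // table of crawlable site roots
 *           'NOCRAWL' => 'nocrawl',      // table of uncrawlable site roots
 *           'BLACKLIST' => 'blacklist',  // table of blacklisted onion hashes
 *           'REFRESHDELAY' => 604800,    // seconds before a site is recrawled
 *           'DEBUG' => true,             // print progress information
 *       ),
 *       'REMOTE' => array(
 *           'USERAGENT' => 'example-spider/0.1',
 *           'TIMEOUT' => 300,            // cURL timeout in seconds
 *       ),
 *       'LIMIT' => array(
 *           'LIMIT' => 10485760,         // download cap passed to curl_limit
 *           'DEBUG' => false,
 *       ),
 *   );
**/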

/**
 * We need to be sure that the functions and classes used by this
 * spider are present. If the library version is too old to contain
 * these, we need to alert the user as to why the needed constructs are
 * missing and alert him/her as to where the needed library can be
 * downloaded.
**/
if(version_compare(VERSION['include.d'], '0.0.1.1', '<')):
    throw new LogicException('Your include.d is too old. You need version 0.0.1.1 or above.
You may find an update to download at <https://notabug.org/y.st./include.d>.');
endif;

// The file-downloading object
$cURL = new remote_files(array(
    CURLOPT_FOLLOWLOCATION => false,
    /**
     * The PHP documentation at
     * <https://secure.php.net/manual/en/function.curl-setopt.php> says
     * that the \CURLOPT_NOPROGRESS setting should only be set to false for
     * debugging. *THIS IS A LIE.* When this is set to false, the function
     * specified by the \CURLOPT_PROGRESSFUNCTION setting is actually
     * called. However, if the \CURLOPT_NOPROGRESS setting is *not* set to
     * false, the function specified by the \CURLOPT_PROGRESSFUNCTION
     * setting is *not* called, meaning that we have no way to terminate a
     * runaway download.
    **/
    CURLOPT_NOPROGRESS => false,
    CURLOPT_PROGRESSFUNCTION => new curl_limit(ONION_SEARCH['LIMIT']['LIMIT'], ONION_SEARCH['LIMIT']['DEBUG']),
    CURLOPT_USERAGENT => ONION_SEARCH['REMOTE']['USERAGENT'],
    CURLOPT_TIMEOUT => ONION_SEARCH['REMOTE']['TIMEOUT'],
    CURLOPT_PROTOCOLS => CURLPROTO_HTTPS|CURLPROTO_HTTP|CURLPROTO_GOPHER,
));

// We need to use Tor to retrieve files from onion space.
$cURL->setopt_array(CURLOPT_TOR);

// Use TLS for encryption only, not for identity verification.
$cURL->setopt_array(CURLOPT_TLS_ENCRYPTONLY);
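
/**
 * Note: CURLOPT_TOR and CURLOPT_TLS_ENCRYPTONLY are option arrays shipped
 * with include.d (required above from st/y/const/), not stock cURL
 * constants. Presumably the former points cURL at a local Tor SOCKS proxy
 * so that .onion hostnames are resolved through Tor rather than the system
 * resolver, and the latter disables certificate verification, since onion
 * services rarely present certificates matching their hostnames. See the
 * required files for the actual values.
**/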

// These four constants are easier to use in strings if they are variables.
$blacklist = ONION_SEARCH['MAIN']['BLACKLIST'];
$nocrawl = ONION_SEARCH['MAIN']['NOCRAWL'];
$tocrawl = ONION_SEARCH['MAIN']['TOCRAWL'];
$useragent = ONION_SEARCH['REMOTE']['USERAGENT'];

// The database reading/writing object
$PDO = new PDO(...ONION_SEARCH['DBO']);

/**
 * The documentation insists that using \PDO::prepare() is better than
 * using \PDO::query(). I suppose that we can comply. Honestly, the
 * documentation is probably right; I am just frustrated that I had to
 * rewrite my code when moving away from the \mysqli class. The \mysqli
 * class also has this functionality, but the documentation does not seem
 * to dissuade people from using regular-style queries.
**/
$query = array(
    'in' => array(
        'tocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$tocrawl` WHERE `uri` = :URI"),
        'nocrawl' => $PDO->prepare("SELECT COUNT(*) FROM `$nocrawl` WHERE `uri` = :URI"),
        'blacklist' => $PDO->prepare("SELECT COUNT(*) FROM `$blacklist` WHERE `hash` = :hash"),
    ),
    'add' => array(
        'tocrawl' => $PDO->prepare("INSERT INTO `$tocrawl` (`uri`,`title`,`timestamp`) VALUES (:URI,:title,'0')"),
        'nocrawl' => $PDO->prepare("INSERT INTO `$nocrawl` (`uri`) VALUES (:URI)"),
    ),
    'list' => array(
        'crawl' => $PDO->prepare("SELECT `uri` FROM `$tocrawl` WHERE `timestamp` < :time ORDER BY `timestamp` ASC"),
    ),
    'remove' => array(
        'tocrawl' => $PDO->prepare("DELETE FROM `$tocrawl` WHERE `uri` = :URI"),
    ),
    'update' => array(
        'title' => $PDO->prepare("UPDATE `$tocrawl` SET `title` = :title WHERE `uri` = :URI"),
        'timestamp' => $PDO->prepare("UPDATE `$tocrawl` SET `timestamp` = :time WHERE `uri` = :URI"),
    ),
);
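
/**
 * The prepared statements above imply roughly the following schema. This is
 * a sketch only: the real table names come from ONION_SEARCH['MAIN'], and
 * the column types in an actual installation may differ.
 *
 *   CREATE TABLE tocrawl (uri VARCHAR(255) PRIMARY KEY, title TEXT, timestamp INT);
 *   CREATE TABLE nocrawl (uri VARCHAR(255) PRIMARY KEY);
 *   CREATE TABLE blacklist (hash CHAR(64) PRIMARY KEY);
 *
 * `tocrawl` holds crawlable site roots plus a display title and the Unix
 * time of the last crawl; `nocrawl` records site roots that were found but
 * cannot be crawled; `blacklist` holds hashes of banned onion addresses.
**/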

// Set up the document parser.
$DOMDocument = new DOMDocument();

/**
 * Prevent malformed Web pages from killing the spider.
 * We will need to clear the errors as they come up to prevent them
 * from filling the buffer though.
**/
$DOMDocument->recover = true;
libxml_use_internal_errors(true);

// This allows us to loop until we are done, then stop.
$continue_loop = true;

while($continue_loop):
    /**
     * If we have URIs to process, this will be set to true immediately in
     * the second nested loop. If this loop finishes without executing the
     * contained loop, it means that we must not have had any URIs to
     * process.
    **/
    $continue_loop = false;
    /**
     * For debugging purposes, it can be useful to start this script on a
     * particular URI. When this script finds URIs on a Web page, it checks
     * them for validity and properly discards any that are invalid. However,
     * when given a URI via the command line, it assumes that the URI is
     * valid and will abort otherwise. This is a feature, not a bug. If you
     * are trying to debug a particular page and you accidentally specify a
     * bad URI, continuing despite the mistake is the incorrect behavior.
     *
     * Furthermore, it is assumed that the URI specified this way is part
     * of a site that already exists in the database. This tool is not a
     * way to add URIs; it is a way to test bugs in the spider. If a
     * particular page crashed the spider, this feature allows you to start
     * with the crashing page so that you don't have to wait for the spider
     * to find that page again in order to test a fix in the spider code.
    **/
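    // Example invocation (script name and URI are hypothetical):
    //     $ ./onion-search.php 'http://exampleonionaddress.onion/page/that/crashed.html'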
    if(isset($argv[1])):
        $DEBUG_URI = new uri($argv[1]);
        $URIlist = array(
            (object) array(
                'uri' => new uri('/', $DEBUG_URI),
            ),
        );
        // We don't want to run this code a second time, so let's remove the trigger.
        unset($argv[1]);
    else:
        $query['list']['crawl']->execute(array(
            ':time' => time()-ONION_SEARCH['MAIN']['REFRESHDELAY'],
        ));
        $URIlist = $query['list']['crawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
        $query['list']['crawl']->closeCursor();
    endif;
    foreach($URIlist as $uri):
        /**
         * This time stamp is used to estimate how much time is remaining in
         * the processing of a given site. For a more accurate estimation, we
         * should probably move this closer to the loop below, but I would
         * rather have the estimate be a little high than have it be a little
         * low.
        **/
        $starttime = time();
        $site_URI = new uri($uri->uri);
        // We have URIs to process. Set this to true.
        $continue_loop = true;
        // If the onion address has been blacklisted, we should ignore it.
        $hash = array(
            ':hash' => hash_uri_onion($site_URI),
        );
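        // hash_uri_onion() comes from include.d; it is assumed here to reduce
        // the URI to a hash of its .onion hostname, which is what the
        // blacklist table stores rather than the addresses themselves.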
        $query['in']['blacklist']->execute($hash);
        $result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
        $query['in']['blacklist']->closeCursor();
        $blacklisted = $result[0]->{'COUNT(*)'} != '0';
        // If the onion address has been blacklisted, we should remove it from the database.
        if($blacklisted):
            // The removal query is keyed on the stored site URI, not on the hash.
            $query['remove']['tocrawl']->execute(array(
                ':URI' => $site_URI,
            ));
            $query['remove']['tocrawl']->closeCursor();
            continue;
        endif;
        // Different types of URI need to be handled differently.
        switch($site_URI->scheme):
            case 'https':
            case 'http':
                $robotstxt_URI = new uri('/robots.txt', $site_URI);
                break;
            case 'gopher':
                $robotstxt_URI = new uri('/0/robots.txt', $site_URI);
                break;
            default:
                throw new RuntimeException("$useragent does not know how to crawl \"$site_URI->scheme\" URIs, URI \"$site_URI\" found in table \"$tocrawl\".");
        endswitch;
        // Information about the current site will be held in a pair of arrays.
        $done = array();
        $queue = array($site_URI);
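        // $queue holds URIs on this site that still need to be fetched;
        // $done holds URIs that have already been fetched. The in_array()
        // checks below consult both so that no page is queued twice.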
        /**
         * If we are debugging with a particular URI, we should make sure that
         * that URI is in the queue from the start.
        **/
        if(isset($DEBUG_URI)):
            $queue[] = $DEBUG_URI;
            // We don't want to run this code a second time, so let's remove the trigger.
            unset($DEBUG_URI);
        endif;
        // The spider should respect the wishes of the webmaster.
        if(ONION_SEARCH['MAIN']['DEBUG']):
            echo "Checking \"robots.txt\" file at <$robotstxt_URI>\n";
        endif;
        $page = $cURL->get($robotstxt_URI);
        if($page === false):
            // Using blank data when no data is available allows us to assume that
            // we actually have data.
            $robotstxt = new robots_txt('');
        else:
            $robotstxt = new robots_txt($page);
        endif;
        foreach($robotstxt->sitemaps as $sitemap):
            $queue[] = new uri($sitemap, $site_URI);
        endforeach;
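        /**
         * The robots_txt class comes from include.d. As used here, it is
         * expected to expose "Sitemap:" lines through ->sitemaps and to
         * answer "Disallow:" rules through ->disallows(). A typical file
         * that this spider would honour looks something like:
         *
         *   User-agent: *
         *   Disallow: /private/
         *   Sitemap: http://exampleonionaddress.onion/sitemap.xml
        **/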
        while(null !== ($URI = array_shift($queue))):
            $done[] = $URI;
            // The spider should respect the wishes of the webmaster.
            if($robotstxt->disallows(ONION_SEARCH['REMOTE']['USERAGENT'], $URI)):
                continue;
            endif;
            // Retrieve the page.
            if(ONION_SEARCH['MAIN']['DEBUG']):
                echo "Working with page at URI <$URI>\n";
            endif;
            $page = $cURL->get($URI);
            // If the page is empty, there is no point in parsing it.
            if(!empty($page)):
                $DOMDocument->loadXML($page);
                // Those lazy fools wrote bad markup and put errors in our buffer.
                // We should clear them.
                libxml_clear_errors();
                /**
                 * If we set the document URI, the base URI resolver is able to handle
                 * <base/> tags that specify a relative base.
                **/
                $DOMDocument->documentURI = $URI;
                /**
                 * This is just a little witchery to find the base URI only once
                 * instead of having to find it again for each <a/> tag. If we used the
                 * "baseURI" property of every <a/> tag, we would need to instantiate
                 * the \st\y\uri class for each hyperlink if a <base/> tag were
                 * present, as well as check to see if a "baseURI" property were set
                 * for each <a/> tag. It is much more efficient to check once per page
                 * and perform the needed setup once per page.
                **/
                $base_tag = $DOMDocument->getElementsByTagName('base');
                if($base_tag->length and ($baseURI = $base_tag->item(0)->baseURI)):
                    $base = new uri($baseURI);
                else:
                    $base = $URI;
                endif;
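                // For example, on a page at <http://exampleonionaddress.onion/dir/page.html>
                // with <base href="/other/"/>, the relative href "x.html" resolves to
                // <http://exampleonionaddress.onion/other/x.html>; without a <base/> tag,
                // it would resolve against the page URI itself instead.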
                foreach($DOMDocument->getElementsByTagName('loc') as $LOC):
                    // We need to make relative URIs absolute.
                    $new_URI = new uri($LOC->textContent, $base);
                    // We also need to strip away the fragment, if any.
                    unset($new_URI->fragment);
                    $new_URI_site = new uri('/', $new_URI);
                    if($new_URI_site == $site_URI):
                        if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
                            $queue[] = $new_URI;
                        endif;
                    else:
                        // We do not want to keep track of URIs using blacklisted onion addresses.
                        $hash = array(
                            ':hash' => hash_uri_onion($new_URI_site),
                        );
                        $query['in']['blacklist']->execute($hash);
                        $result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                        $query['in']['blacklist']->closeCursor();
                        $not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
                        // If we do not check for the presence of an existing anchor in our
                        // database, the database will grow needlessly when recrawling pages.
                        switch($new_URI_site->scheme):
                            case 'https':
                            case 'http':
                            case 'gopher':
                                $query['in']['tocrawl']->execute(array(
                                    ':URI' => $new_URI_site,
                                ));
                                $result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                $query['in']['tocrawl']->closeCursor();
                                break;
                            default:
                                $query['in']['nocrawl']->execute(array(
                                    ':URI' => $new_URI_site,
                                ));
                                $result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                $query['in']['nocrawl']->closeCursor();
                        endswitch;
                        $is_new_site = $result[0]->{'COUNT(*)'} == '0';
                        if(isset($new_URI->host) and (substr($new_URI->host, -6) == '.onion') and
                        $not_blacklisted and $is_new_site):
                            switch($new_URI_site->scheme):
                                case 'https':
                                case 'http':
                                case 'gopher':
                                    $query['add']['tocrawl']->execute(array(
                                        ':URI' => $new_URI_site,
                                        ':title' => $new_URI_site,
                                    ));
                                    $query['add']['tocrawl']->closeCursor();
                                    break;
                                default:
                                    $query['add']['nocrawl']->execute(array(
                                        ':URI' => $new_URI_site,
                                    ));
                                    $query['add']['nocrawl']->closeCursor();
                            endswitch;
                        endif;
                    endif;
                endforeach;
                foreach($DOMDocument->getElementsByTagName('a') as $A):
                    $href = $A->attributes->getNamedItem('href');
                    if($href):
                        // If we do not wrap this in a try block, malformed URIs in hyperlinks will kill the spider.
                        try {
                            // We need to make relative URIs absolute.
                            $new_URI = new uri($href->textContent, $base);
                            // We also need to strip away the fragment, if any.
                            unset($new_URI->fragment);
                            $new_URI_site = new uri('/', $new_URI);
                            if($new_URI_site == $site_URI):
                                if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
                                    $queue[] = $new_URI;
                                endif;
                            else:
                                // We do not want to keep track of URIs using blacklisted onion addresses.
                                $hash = array(
                                    ':hash' => hash_uri_onion($new_URI_site),
                                );
                                $query['in']['blacklist']->execute($hash);
                                $result = $query['in']['blacklist']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                $query['in']['blacklist']->closeCursor();
                                $not_blacklisted = $result[0]->{'COUNT(*)'} == '0';
                                // If we do not check for the presence of an existing anchor in our
                                // database, the database will grow needlessly when recrawling pages.
                                switch($new_URI_site->scheme):
                                    case 'https':
                                    case 'http':
                                    case 'gopher':
                                        $query['in']['tocrawl']->execute(array(
                                            ':URI' => $new_URI_site,
                                        ));
                                        $result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                        $query['in']['tocrawl']->closeCursor();
                                        break;
                                    default:
                                        $query['in']['nocrawl']->execute(array(
                                            ':URI' => $new_URI_site,
                                        ));
                                        $result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                                        $query['in']['nocrawl']->closeCursor();
                                endswitch;
                                $is_new_site = $result[0]->{'COUNT(*)'} == '0';
                                if(isset($new_URI_site->host) and substr($new_URI_site->host, -6) == '.onion' and
                                $not_blacklisted and $is_new_site):
                                    switch($new_URI_site->scheme):
                                        case 'https':
                                        case 'http':
                                        case 'gopher':
                                            if(isset($A->textContent)):
                                                $prepared_text = $A->textContent;
                                            else:
                                                $prepared_text = '';
                                            endif;
                                            $query['add']['tocrawl']->execute(array(
                                                ':URI' => $new_URI_site,
                                                ':title' => $prepared_text,
                                            ));
                                            $query['add']['tocrawl']->closeCursor();
                                            break;
                                        default:
                                            $query['add']['nocrawl']->execute(array(
                                                ':URI' => $new_URI_site,
                                            ));
                                            $query['add']['nocrawl']->closeCursor();
                                    endswitch;
                                endif;
                            endif;
                            // Some people just do not know how to form valid URIs ...
                        } catch(DomainException $e) {
                            if($e->getFile() == uri::FILE):
                                if(ONION_SEARCH['MAIN']['DEBUG']):
                                    echo "Malformed URI found on page <$URI>, ignoring: <$href->textContent>\n";
                                endif;
                            else:
                                throw $e;
                            endif;
                        }
                    endif;
                endforeach;
            endif;
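            /**
             * gopher() comes from include.d. As used below, it is assumed to
             * parse the page as a Gopher menu and yield one entry per menu
             * line, each with at least a 'type' key (the one-character Gopher
             * item type) and a 'uri' key holding the target URI. A raw menu
             * line is tab-separated in the usual RFC 1436 form, e.g.:
             *
             *   1Example section<TAB>/section<TAB>exampleonionaddress.onion<TAB>70
            **/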
            foreach(gopher($page) as $line):
                $new_URI = $line['uri'];
                $new_URI_site = new uri('/', $new_URI);
                if($new_URI_site == $site_URI):
                    switch($line['type']):
                        /**
                         * We might as well use Gopher's file type hints to our advantage.
                         * If the server says that it is something that we know we do not want,
                         * we will ignore it. If the server is lying, we simply will crawl the
                         * site in a misguided fashion. If the site administrator wants their
                         * site crawled properly, they should configure their server correctly.
                         *
                         * Please note that Gopher directory responses are accepted by this
                         * spider even if this is an HTTPS/HTTP server and not a Gopher server.
                         * We do not care what protocol was used to retrieve the file, we only
                         * care that the syntax looks like a gopher directory file.
                        **/
                        case '1':
                        case '2':
                        case '7':
                        case 'h':
                            // These types can be fetched and parsed for further links, so
                            // they are worth queueing; any other type is simply ignored.
                            if(!in_array($new_URI, $queue) and !in_array($new_URI, $done)):
                                $queue[] = $new_URI;
                            endif;
                            break;
                    endswitch;
                else:
                    /**
                     * Gopher directories are only capable of linking to Gopher servers,
                     * Telnet servers, and TN3270 servers. We cannot crawl Telnet or TN3270
                     * servers, so by checking whether this URI uses the 'gopher' scheme, we
                     * effectively check whether we can crawl this server.
                    **/
                    if($new_URI_site->scheme == 'gopher'):
                        $query['in']['tocrawl']->execute(array(
                            ':URI' => $new_URI_site,
                        ));
                        $result = $query['in']['tocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                        $query['in']['tocrawl']->closeCursor();
                    else:
                        $query['in']['nocrawl']->execute(array(
                            ':URI' => $new_URI_site,
                        ));
                        $result = $query['in']['nocrawl']->fetchAll(PDO::FETCH_CLASS, 'stdClass');
                        $query['in']['nocrawl']->closeCursor();
                    endif;
                    $is_new_site = $result[0]->{'COUNT(*)'} == '0';
                    if($is_new_site):
                        if($new_URI_site->scheme == 'gopher'):
                            // Use the site URI as a placeholder title; the real title is
                            // filled in when the site's own index page is crawled.
                            $query['add']['tocrawl']->execute(array(
                                ':URI' => $new_URI_site,
                                ':title' => $new_URI_site,
                            ));
                            $query['add']['tocrawl']->closeCursor();
                        else:
                            $query['add']['nocrawl']->execute(array(
                                ':URI' => $new_URI_site,
                            ));
                            $query['add']['nocrawl']->closeCursor();
                        endif;
                    endif;
                endif;
            endforeach;
            // Save information about the index page:
            if($URI->path == '/'):
                $title = $DOMDocument->getElementsByTagName('title');
                // If the page has no <title/>, we can just use the URI instead.
                if($title->length):
                    $prepared_title = $title->item(0)->textContent;
                else:
                    $prepared_title = $site_URI;
                endif;
                $query['update']['title']->execute(array(
                    ':URI' => $site_URI,
                    ':title' => $prepared_title,
                ));
                $query['update']['title']->closeCursor();
            endif;
            if(ONION_SEARCH['MAIN']['DEBUG']):
                $queue_count = count($queue);
                $done_count = count($done);
                $percent = (int) ($done_count / ($queue_count + $done_count) * 100);
                $seconds_passed = time() - $starttime;
                $seconds_left = ($seconds_passed / $done_count) * $queue_count;
                $time_passed = parse_seconds($seconds_passed);
                $time_left = parse_seconds($seconds_left);
                echo "Queue: $queue_count; Done: $done_count; Estimated percentage complete: $percent%\n";
                echo "Time spent: $time_passed[d] day(s), $time_passed[h] hour(s), $time_passed[m] minute(s), and $time_passed[s] second(s)\n";
                echo "Estimated time remaining: $time_left[d] day(s), $time_left[h] hour(s), $time_left[m] minute(s), and $time_left[s] second(s)\n";
            endif;
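            // The estimate simply scales elapsed time by the queue/done ratio:
            // e.g. 30 pages done in 60 seconds with 90 pages still queued gives
            // (60 / 30) * 90 = 180 seconds remaining and 30 / (90 + 30) = 25% complete.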
        endwhile;
        // Record when this site was crawled so that it is not recrawled again
        // before REFRESHDELAY seconds have passed.
        $query['update']['timestamp']->execute(array(
            ':URI' => $site_URI,
            ':time' => time(),
        ));
        $query['update']['timestamp']->closeCursor();
    endforeach;
endwhile;

/**
 * TO DO LIST:
 *
 * add check to see if crawlable site is present; if not present, remove from active list
 * add check to see if non-crawlable site is present; if not present, remove from active list
**/