110 lines
3.0 KiB
PHP
Executable File
110 lines
3.0 KiB
PHP
Executable File
<?php
|
|
|
|
include("config.php");
|
|
|
|
/**
 * Resolve an anchor's href against the URL of the page it was found on.
 *
 * Only http(s) targets are considered crawlable. Absolute http(s) links are
 * returned unchanged; protocol-relative, root-relative, query-only and
 * document-relative links are resolved against $base. Dot segments ("../")
 * are not collapsed.
 *
 * @param string $href Raw value of the href attribute.
 * @param string $base URL of the page that contains the link.
 *
 * @return string|null Absolute URL to crawl, or null when the link is empty,
 *                     a same-page fragment, uses a non-http scheme
 *                     (mailto:, javascript:, tel:, ...) or $base has no
 *                     scheme/host to resolve against.
 */
function crawl_resolve_href($href, $base)
{
    $href = trim($href);

    // Nothing to follow: empty links and "#fragment" links stay on the same page.
    if ($href === '' || $href[0] === '#') {
        return null;
    }

    // Already absolute. Requiring "://" keeps relative files such as
    // "http-notes.html" from being mistaken for absolute URLs.
    if (preg_match('#^https?://#i', $href) === 1) {
        return $href;
    }

    // Any other explicit scheme cannot be fetched as a web page.
    if (preg_match('#^[a-z][a-z0-9+.-]*:#i', $href) === 1) {
        return null;
    }

    $parts = parse_url($base);
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        return null;
    }

    // Protocol-relative link ("//host/path"): inherit only the scheme.
    if (strpos($href, '//') === 0) {
        return $parts['scheme'] . ':' . $href;
    }

    $origin = $parts['scheme'] . '://';
    if (isset($parts['user']) && isset($parts['pass'])) {
        $origin .= $parts['user'] . ':' . $parts['pass'] . '@';
    }
    $origin .= $parts['host'];
    if (isset($parts['port'])) {
        $origin .= ':' . $parts['port'];
    }

    // Root-relative link: replaces the whole path of the base URL.
    if ($href[0] === '/') {
        return $origin . $href;
    }

    $basePath = isset($parts['path']) ? $parts['path'] : '/';

    // Query-only link: same document, different query string.
    if ($href[0] === '?') {
        return $origin . $basePath . $href;
    }

    // Document-relative link: resolve against the directory of the base path,
    // i.e. everything up to and including its last slash.
    $lastSlash = strrpos($basePath, '/');
    $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

    return $origin . $directory . $href;
}

/**
 * Crawl a page, recurse into the links it contains, and record its metadata.
 *
 * The page is fetched and parsed as HTML. Up to $GLOBALS['maxpagecrawl'] of
 * its crawlable links are followed recursively (with $depth - 1) before the
 * page's own title, description and keywords are echoed as HTML and appended
 * to the file named by $GLOBALS['database']. URLs already visited during this
 * process, or already present in the database file, are not recorded again.
 *
 * @param string $url   URL (or path) of the page to fetch.
 * @param int    $depth Remaining recursion depth; nothing is fetched at 0 or below.
 *
 * @return int|null 1 when the page could not be fetched,
 *                  2 when the page is empty or has no <title>,
 *                  3 when the URL was already seen, the depth is exhausted,
 *                    or the URL is already in the database,
 *                  null when a new record was written.
 */
function crawl_page($url, $depth = 5)
{
    static $seen = array();

    // "<= 0" (after an int cast) also stops on "0" strings and negative depths.
    if (isset($seen[$url]) || (int) $depth <= 0) {
        return 3;
    }

    $seen[$url] = true;

    // NOTE(review): TLS peer verification is switched off, so HTTPS responses
    // are not authenticated. Kept as-is; confirm this is intentional.
    $context = stream_context_create(array(
        'ssl' => array('verify_peer' => false, 'verify_peer_name' => false),
    ));
    $source = file_get_contents($url, false, $context);

    // Check the fetch result BEFORE parsing: handing false/"" to loadHTML()
    // throws a ValueError on PHP 8, which "@" cannot suppress.
    if ($source === false) {
        return 1;
    }
    if (trim($source) === '') {
        // An empty document cannot contain a <title>.
        return 2;
    }

    $dom = new DOMDocument('1.0');

    // Real-world HTML is rarely valid; collect parser warnings instead of
    // emitting them, then restore the previous libxml setting.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($source);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);

    // An unset limit behaves like 0: no links are followed.
    $limit = isset($GLOBALS['maxpagecrawl']) ? (int) $GLOBALS['maxpagecrawl'] : 0;
    $followed = 0;

    foreach ($dom->getElementsByTagName('a') as $anchor) {
        $target = crawl_resolve_href($anchor->getAttribute('href'), $url);
        if ($target === null) {
            // Uncrawlable links do not count towards the per-page limit.
            continue;
        }

        $followed++;
        if ($followed > $limit) {
            break;
        }

        crawl_page($target, $depth - 1);
    }

    // Records are line-oriented, so collapse whitespace (including newlines)
    // to keep every field on a single line.
    $flatten = function ($value) {
        return trim(preg_replace('/\s+/', ' ', (string) $value));
    };

    // Text taken from crawled pages is untrusted; escape it for HTML output.
    $escape = function ($value) {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };

    $description = 'No description...';
    $keywords = 'No keywords...';

    foreach ($dom->getElementsByTagName('meta') as $meta) {
        // Meta names are matched case-insensitively ("Description", "KEYWORDS").
        $metaName = strtolower(trim($meta->getAttribute('name')));

        if ($metaName === 'description') {
            $description = $flatten($meta->getAttribute('content'));
        } elseif ($metaName === 'keywords') {
            $keywords = $flatten($meta->getAttribute('content'));
        }
    }

    $titles = $dom->getElementsByTagName('title');
    if ($titles->length === 0) {
        return 2;
    }
    $title = $flatten($titles->item(0)->nodeValue);

    $database = $GLOBALS['database'];

    // A database file that does not exist yet simply has no records.
    $existing = is_file($database) ? file_get_contents($database) : '';
    if ($existing === false) {
        $existing = '';
    }

    // Match the complete "URL: ..." line so that a stored URL which merely
    // starts with $url is not mistaken for a duplicate.
    $haystack = "\n" . str_replace("\r\n", "\n", $existing);
    if (strpos($haystack, "\nURL: " . $url . "\n") !== false) {
        echo $escape($url) . " exists in " . $database;

        return 3;
    }

    echo "URL: " . $escape($url) . "<br />\n"
        . "Title: " . $escape($title) . "<br />\n"
        . "Description: " . $escape($description) . "<br />\n"
        . "Keywords: " . $escape($keywords) . "<br /><br />\n";

    file_put_contents(
        $database,
        "URL: " . $url . "\n"
        . "Title: " . $title . "\n"
        . "Description: " . $description . "\n"
        . "Keywords: " . $keywords . "\n\n",
        FILE_APPEND
    );
}
|
|
|
|
|
|
// Driver: read the seed list named by $urls line by line and crawl every
// entry that is a syntactically valid URL, using $crawl_depth as the depth.
$urlhandler = fopen($urls, 'r');

// A seed list that cannot be opened is skipped on purpose; there is no fallback.
if ($urlhandler !== false) {
    while (true) {
        $urlline = fgets($urlhandler);
        if ($urlline === false) {
            // End of file (or read error): stop reading.
            break;
        }

        // Strip the trailing newline and surrounding whitespace once.
        $seedurl = trim($urlline);
        if (filter_var($seedurl, FILTER_VALIDATE_URL) === false) {
            continue;
        }

        crawl_page($seedurl, $crawl_depth);
    }

    fclose($urlhandler);
}
|
|
|