various/crawler/crawl.php

119 lines
3.4 KiB
PHP
Executable File

<?php
include("config.php");
/**
 * Resolve the href of an anchor against the URL of the page it was found on.
 *
 * Private helper of crawl_page().
 *
 * @param string $base URL of the page that contains the link.
 * @param string $href Raw value of the anchor's href attribute.
 *
 * @return string|null URL to fetch, or null when the link is not crawlable
 *                     (empty, fragment-only, non-http(s) scheme such as
 *                     mailto:/javascript:, or the base URL has no scheme).
 */
function crawl_resolve_href($base, $href)
{
    // A fragment addresses a position inside a document, not another document.
    $hashPos = strpos($href, '#');
    if ($hashPos !== false) {
        $href = substr($href, 0, $hashPos);
    }
    $href = trim($href);
    if ($href === '') {
        return null;
    }

    // An explicit scheme makes the reference absolute; only http(s) is fetched.
    if (preg_match('~^([a-z][a-z0-9+.\-]*):~i', $href, $match)) {
        $scheme = strtolower($match[1]);
        return ($scheme === 'http' || $scheme === 'https') ? $href : null;
    }

    $parts = parse_url($base);
    if ($parts === false || !isset($parts['scheme'])) {
        return null;
    }

    // Protocol-relative reference: reuse only the scheme of the base URL.
    if (strncmp($href, '//', 2) === 0) {
        return $parts['scheme'] . ':' . $href;
    }

    $authority = '';
    if (isset($parts['user']) && isset($parts['pass'])) {
        $authority .= $parts['user'] . ':' . $parts['pass'] . '@';
    }
    if (isset($parts['host'])) {
        $authority .= $parts['host'];
    }
    if (isset($parts['port'])) {
        $authority .= ':' . $parts['port'];
    }

    $basePath = $parts['path'] ?? '/';
    if ($href[0] === '/') {
        // Root-relative: replaces the whole path of the base URL.
        $path = $href;
    } elseif ($href[0] === '?') {
        // Query-only: same document path, different query string.
        $path = $basePath . $href;
    } else {
        // Path-relative: resolved against the directory part of the base path,
        // i.e. everything up to and including its last slash.
        $lastSlash = strrpos($basePath, '/');
        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);
        $path = $directory . $href;
    }

    return $parts['scheme'] . '://' . $authority . $path;
}

/**
 * Crawl a page and the pages it links to, appending one record per page
 * (URL, title, meta description, meta keywords) to a text file.
 *
 * Linked pages are crawled before the page itself is recorded. At most
 * $GLOBALS['maxpagecrawl'] links are followed per page. URLs visited during
 * this process are remembered in a static list and never fetched twice.
 *
 * NOTE(review): PHP 8 reports a deprecation for an optional parameter
 * ($depth) declared before a required one ($filename); the default can never
 * take effect. The signature is left untouched for existing callers.
 *
 * @param string $url      Absolute URL of the page to crawl.
 * @param int    $depth    Remaining recursion depth; nothing is fetched at 0 or below.
 * @param string $filename File the records are appended to.
 *
 * @return int|null 1 if the page could not be fetched, 2 if it has no usable
 *                  title/description, 3 if it was skipped (already visited,
 *                  depth exhausted, or already present in $filename), null
 *                  once a record has been written.
 */
function crawl_page($url, $depth = 5, $filename)
{
    static $seen = [];
    // "<= 0" rather than "=== 0" so a string "0" or a negative depth also stops.
    if (isset($seen[$url]) || $depth <= 0) {
        return 3;
    }
    $seen[$url] = true;

    // NOTE(review): TLS peer verification is switched off here, so HTTPS
    // responses are accepted from any certificate. Kept as in the original.
    $context = stream_context_create([
        'ssl' => ['verify_peer' => false, 'verify_peer_name' => false],
    ]);
    $source = file_get_contents($url, false, $context);

    // Bail out BEFORE parsing: DOMDocument::loadHTML() throws a ValueError on
    // an empty source in PHP 8, which would abort the whole crawl.
    if ($source === false) {
        return 1;
    }
    if ($source === '') {
        // An empty body cannot contain a title.
        return 2;
    }

    $dom = new DOMDocument('1.0');
    // Collect markup errors internally instead of emitting warnings.
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom->loadHTML($source);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    // Follow the links first, up to the configured per-page limit.
    $maxLinks = isset($GLOBALS['maxpagecrawl']) ? $GLOBALS['maxpagecrawl'] : 0;
    $crawlcount = 0;
    foreach ($dom->getElementsByTagName('a') as $element) {
        $href = crawl_resolve_href($url, $element->getAttribute('href'));
        if ($href === null) {
            continue;
        }
        $crawlcount++;
        if ($crawlcount > $maxLinks) {
            break;
        }
        crawl_page($href, $depth - 1, $filename);
    }

    // Pull description and keywords from the meta tags; the last match wins.
    $description = "No description...";
    $keywords = "No keywords...";
    foreach ($dom->getElementsByTagName('meta') as $meta) {
        $name = strtolower($meta->getAttribute('name'));
        if ($name === 'description') {
            $description = $meta->getAttribute('content');
        } elseif ($name === 'keywords') {
            $keywords = $meta->getAttribute('content');
        }
    }

    $titles = $dom->getElementsByTagName('title');
    if ($titles->length === 0) {
        return 2;
    }
    $title = $titles->item(0)->nodeValue;
    if (trim($title) === '' || trim($description) === '') {
        return 2;
    }

    // The output file may not exist yet on the first run.
    $existing = is_file($filename) ? file_get_contents($filename) : '';
    if ($existing === false) {
        $existing = '';
    }
    // Match the complete "URL: ..." line; a bare substring test would treat
    // this URL as stored whenever a longer URL with the same prefix is stored.
    if (strpos("\n" . $existing, "\nURL: $url\n") !== false) {
        echo "$url exists in " . $filename . PHP_EOL;
        return 3;
    }

    echo "URL: " . $url . "<br />\n"
        . "Title: " . $title . "<br />\n"
        . "Description: " . $description . "<br />\n"
        . "Keywords: " . $keywords . "<br /><br />\n";
    file_put_contents($filename, "URL: " . $url . "\n"
        . "Title: " . $title . "\n"
        . "Description: " . $description . "\n"
        . "Keywords: " . $keywords . "\n\n", FILE_APPEND);
}
// Command-line entry point.
//   crawl.php <url>         -> records go to the file named by $GLOBALS['database']
//   crawl.php <url> <file>  -> records go to <file>
// $crawl_depth is not defined in this file; presumably it is set by the
// included config.php (TODO confirm).
if (isset($argc)) {
    $argCount = (int) $argc;
    if ($argCount === 2 || $argCount === 3) {
        $urlline = $argv[1];
        $url = trim($urlline);
        $fileout = $argCount === 3 ? $argv[2] : null;

        echo ($fileout === null ? "One argument" : "Two arguments") . "\r\n";

        if (filter_var($url, FILTER_VALIDATE_URL) !== false) {
            if ($fileout === null) {
                echo "Crawling $urlline" . PHP_EOL;
                crawl_page($url, $crawl_depth, $GLOBALS['database']);
            } else {
                echo "Crawling $urlline and saving to $fileout" . PHP_EOL;
                crawl_page($url, $crawl_depth, $fileout);
            }
        } else {
            // Say why nothing is being crawled instead of exiting silently.
            echo "Invalid URL: $urlline" . PHP_EOL;
        }
    }
}