array('verify_peer' => false, 'verify_peer_name' => false)))); @$dom->loadHTML($source); // If error on pull, skip! if($source === FALSE) { return 1; } $crawlcount = "0"; $anchors = $dom->getElementsByTagName('a'); foreach ($anchors as $element) { $href = $element->getAttribute('href'); if (strpos($href, 'http') !== 0) { $path = '/' . ltrim($href, '/'); $parts = parse_url($url); $href = $parts['scheme'] . '://'; if (isset($parts['user']) && isset($parts['pass'])) { $href .= $parts['user'] . ':' . $parts['pass'] . '@'; } $href .= $parts['host']; if (isset($parts['port'])) { $href .= ':' . $parts['port']; } if (isset($parts['path'])) { $href .= dirname($parts['path'], 1).$path; } else { $href .= $path; } } $crawlcount++; if($crawlcount>$GLOBALS['maxpagecrawl']) { break; } crawl_page($href, $depth - 1, $filename); } $metas = $dom->getElementsByTagName('meta'); for ($ii = 0; $ii < $metas->length; $ii++) { $meta = $metas->item($ii); if($meta->getAttribute('name') == 'description') { $description = $meta->getAttribute('content'); } if($meta->getAttribute('name') == 'keywords') { $keywords = $meta->getAttribute('content'); } } if(!isset($description)) { $description = "No description..."; } if(!isset($keywords)) { $keywords = "No keywords..."; } $title = $dom->getElementsByTagName('title'); if ($title->length) { $title = $title->item(0)->nodeValue; if(trim($title)=="" || trim($description)=="") { return 2; } } else { return 2; } if(strpos(file_get_contents($filename), "URL: $url") === false) { echo "URL: " . $url . "
\n" . "Title: " . $title . "
\n" . "Description: " . $description . "
\n" . "Keywords: " . $keywords . "

\n";
        // Append the same record that was just echoed to the output file.
        file_put_contents($filename, "URL: " . $url . "\n" . "Title: " . $title . "\n" . "Description: " . $description . "\n" . "Keywords: " . $keywords . "\n\n", FILE_APPEND);
    } else {
        // "URL: $url" was already found in $filename: report it and signal with code 3.
        echo "$url exists in " . $filename . PHP_EOL;
        return 3;
    }
}

/*
 * Command-line entry point.
 *
 *   One argument:  <url>                crawl <url>, storing results in $GLOBALS['database']
 *   Two arguments: <url> <output-file>  crawl <url>, storing results in <output-file>
 *
 * $argc counts the script name itself, so 2 means one user-supplied argument
 * and 3 means two. Any other argument count is left untouched here.
 *
 * NOTE(review): $crawl_depth and $GLOBALS['database'] are not defined in this
 * part of the file; they are presumably set earlier -- confirm.
 */
if (isset($argc) && ((int) $argc === 2 || (int) $argc === 3)) {
    $urlline = $argv[1];
    $saveToCustomFile = ((int) $argc === 3);

    if ($saveToCustomFile) {
        $fileout = $argv[2];
        echo "Two arguments\r\n";
    } else {
        echo "One argument\r\n";
    }

    if (filter_var(trim($urlline), FILTER_VALIDATE_URL) === false) {
        // An invalid URL used to be skipped without any message, leaving the
        // user with no hint why nothing was crawled. Report it on stderr so
        // that regular stdout output stays unchanged.
        file_put_contents('php://stderr', "Invalid URL: " . trim($urlline) . PHP_EOL);
    } elseif ($saveToCustomFile) {
        echo "Crawling $urlline and saving to $fileout" . PHP_EOL;
        crawl_page(trim($urlline), $crawl_depth, $fileout);
    } else {
        echo "Crawling $urlline" . PHP_EOL;
        crawl_page(trim($urlline), $crawl_depth, $GLOBALS['database']);
    }
}