<?php
// It may take a while to spider a website ...
set_time_limit(10000);
// Include the phpcrawl main class
include_once('../PHPCrawl_083/PHPCrawl_083/libs/PHPCrawler.class.php');
include ('2.php');
//include ('crawl3.php');
// Extend the class and override the handleDocumentInfo()-method
class MyCrawler extends PHPCrawler
{
    /**
     * Reports broken links that point away from the host of the given document.
     *
     * Every <a href> in the document whose host differs from the host of the
     * document URL is probed with is_valid_url(); links for which that probe
     * returns false are printed. In a web SAPI the report is HTML (escaped
     * URL plus a clickable anchor); on the command line it is plain text
     * (URL, tab, link text).
     *
     * @param PHPCrawlerDocumentInfo $DocInfo Document descriptor; `url` is read,
     *                                        and `content` is used when it is set.
     * @return void
     */
    public function handleDocumentInfo(PHPCrawlerDocumentInfo $DocInfo)
    {
        $isCli = (PHP_SAPI == "cli");

        // Reuse the body the crawler already downloaded when it is available,
        // and only fall back to fetching the page a second time otherwise.
        $html = '';
        if (isset($DocInfo->content) && is_string($DocInfo->content)) {
            $html = $DocInfo->content;
        }
        if (trim($html) === '') {
            $html = @file_get_contents($DocInfo->url);
        }

        // Nothing to parse: the download failed or the page is empty.
        // DOMDocument::loadHTML() rejects an empty string, so stop here.
        if ($html === false || trim($html) === '') {
            return;
        }

        $home_host = parse_url($DocInfo->url, PHP_URL_HOST);
        $home_host = is_string($home_host) ? strtolower($home_host) : '';

        // Silence libxml warnings about sloppy real-world markup, then put
        // the previous error-handling mode back so other code is unaffected.
        $previousLibxmlMode = libxml_use_internal_errors(true);
        $doc = new DOMDocument;
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousLibxmlMode);

        foreach ($doc->getElementsByTagName('a') as $link) {
            $href = trim($link->getAttribute('href'));
            $parts = parse_url($href);

            // Relative links, anchors, mailto:, javascript: etc. carry no
            // host; they can never be external HTTP links, so do not spend
            // a network request on them.
            if (!is_array($parts) || !isset($parts['host']) || $parts['host'] === '') {
                continue;
            }

            // is_valid_url() only understands HTTP status codes.
            if (isset($parts['scheme'])
                && !in_array(strtolower($parts['scheme']), array('http', 'https'), true)) {
                continue;
            }

            // Host names are case-insensitive; skip links to the crawled site.
            if (strtolower($parts['host']) === $home_host) {
                continue;
            }

            if (is_valid_url($href) !== false) {
                continue;
            }

            if ($isCli) {
                echo $href, "\t", trim($link->nodeValue), "\n";
                continue;
            }

            // Both values come from a crawled page, i.e. untrusted input:
            // escape them before writing them into our own HTML output.
            $safeHref = htmlspecialchars($href, ENT_QUOTES, 'UTF-8');
            $safeLabel = htmlspecialchars($link->nodeValue, ENT_QUOTES, 'UTF-8');

            echo $safeHref, PHP_EOL."<br/>";
            echo '<a href="'.$safeHref.'" target="_blank">'.$safeLabel.'</a>'.'<br/>';
        }
    }
}
// Crawl settings, kept together so they are easy to find and change.
$crawlStartUrl = "http://www.tunisie-web.org/";
// Matches URLs ending in an image, PDF, stylesheet or script extension.
$crawlUrlFilter = "#\.(jpg|gif|png|pdf|jpeg|css|js)$#i";
$crawlWorkingDir = "C:/Users/mayss/Documents/travailcrawl/";

// Build the crawler, apply the settings in the original order, then run it.
$crawler = new MyCrawler();
$crawler->setURL($crawlStartUrl);
$crawler->addURLFilterRule($crawlUrlFilter);
$crawler->setWorkingDirectory($crawlWorkingDir);
$crawler->go();
//httpwww.annuaire-ag.com
//
?>
<?php
/**
 * Requests a URL and reports whether the answer is anything other than HTTP 404.
 *
 * Despite its name this is a "not 404" probe: transport failures, timeouts,
 * redirects and every other status code all count as valid. Only a 404
 * response yields false.
 *
 * @param string $url URL to request.
 * @return bool false when the server answered 404, true in every other case
 *              (including an empty URL or a failed request).
 */
function is_valid_url($url) {
    // An empty URL cannot produce a 404; skip the network round trip.
    if (!is_string($url) || trim($url) === '') {
        return true;
    }

    $resURL = curl_init();
    if ($resURL === false) {
        // No handle means no status code, which is "not 404" by this contract.
        return true;
    }

    curl_setopt($resURL, CURLOPT_URL, $url);
    // Without this, curl_exec() prints the response body straight into the
    // caller's output for every successful request.
    curl_setopt($resURL, CURLOPT_RETURNTRANSFER, true);
    // The header callback is not defined in this file; register it only when
    // it really exists so a missing function cannot break the transfer.
    if (is_callable('curlHeaderCallback')) {
        curl_setopt($resURL, CURLOPT_HEADERFUNCTION, 'curlHeaderCallback');
    }
    curl_setopt($resURL, CURLOPT_FAILONERROR, true);
    // Keep one unresponsive host from stalling the whole crawl.
    curl_setopt($resURL, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($resURL, CURLOPT_TIMEOUT, 30);

    curl_exec($resURL);
    $intReturnCode = curl_getinfo($resURL, CURLINFO_HTTP_CODE);
    curl_close($resURL);

    return $intReturnCode != 404;
}
?>
➡️ Offre MyRankingMetrics ⬅️
pré-audit SEO gratuit avec RM Tech (+ avis d'expert)
coaching offert aux clients (avec Olivier Duffez ou Fabien Faceries)
Voir les détails ici