PHP Web Crawler
Aug 25, 2009
Author: vvaswani
This is an illustrative bot script that indexes a Web site. It uses PHP and JavaScript (the MooTools AJAX framework, version 1.2+).
<?php
/**
 * Read one page and collect the href targets found in it.
 *
 * The function works on two globals: $GLOBALS['removeText'] is a list of
 * substrings; any link containing one of them is discarded.
 * $GLOBALS['totalLinks'] is the list of links known so far; every new,
 * non-filtered link is appended to it (duplicates are skipped).
 *
 * If the page cannot be opened the link list is left untouched.
 *
 * @param string $url Address or path of the page to read (passed to fopen()).
 * @return void
 */
function crawl($url)
{
    $links = (isset($GLOBALS['totalLinks']) && is_array($GLOBALS['totalLinks']))
        ? $GLOBALS['totalLinks']
        : array();
    $filter = (isset($GLOBALS['removeText']) && is_array($GLOBALS['removeText']))
        ? $GLOBALS['removeText']
        : array();

    // fopen() raises a ValueError for an empty path on PHP 8, which "@" does
    // not silence, so reject unusable input before trying to open it.
    if (!is_string($url) || $url === "") {
        return;
    }

    // Best effort: an unreachable page is simply skipped.
    $fp = @fopen($url, "rb");
    if (!$fp) {
        return;
    }

    $content = "";
    while (!feof($fp)) {
        $chunk = fread($fp, 8192);
        if ($chunk === false) {
            // A read error would otherwise keep this loop spinning.
            break;
        }
        $content .= $chunk;
    }
    fclose($fp);

    // Capture the value of every double-quoted href attribute that is followed
    // by whitespace, "|" or ">".
    if (!preg_match_all('/href="(.*?)"[\s|>]/', $content, $matches, PREG_PATTERN_ORDER)) {
        return;
    }

    foreach ($matches[1] as $link) {
        if ($link === "") {
            continue;
        }

        $rejected = false;
        foreach ($filter as $needle) {
            $needle = (string) $needle;
            // strpos() !== false is a true "contains" test; strstr() used as a
            // boolean misfires when the returned tail is the string "0".
            if ($needle !== "" && strpos($link, $needle) !== false) {
                $rejected = true;
                break;
            }
        }

        if (!$rejected && !in_array($link, $links, true)) {
            $links[] = $link;
        }
    }

    // $links is a copy of the global array; write it back so the caller
    // actually sees the links that were discovered.
    $GLOBALS['totalLinks'] = $links;
}
// Driver for one crawl step. Each request carries the link list collected so
// far ("links", joined with ';') and a step counter ("position"). The script
// crawls one page, then either asks the browser to issue the next step or,
// once every known link has been visited, writes sitemap.xml.

// Start page of the site; it is crawled on the first step and written as the
// first entry of the sitemap.
$url = "address of the site, which will index";
$totalLinks = array();
$removeText = array("mailto:","javascript:",".css","http://","#");

// The first request sends links=0, which is treated as "no links yet".
if (isset($_POST['links']) && is_string($_POST['links']) && !empty($_POST['links'])) {
    $totalLinks = explode(";", $_POST['links']);
}

$i = isset($_POST['position']) ? (int) $_POST['position'] : 0;
if ($i < 0) {
    $i = 0;
}

// Step 0 crawls the start page; step N (N >= 1) crawls the N-th collected
// link, i.e. index N-1. Indexing with N itself would skip the first link and
// read past the end of the list on the final step.
if ($i === 0) {
    $cUrl = $url;
} elseif (isset($totalLinks[$i - 1])) {
    $cUrl = $totalLinks[$i - 1];
} else {
    $cUrl = "";
}

// NOTE(review): the "http://" filter drops absolute links, so the collected
// entries look like relative paths; they are handed to crawl() and written to
// the sitemap unchanged - confirm whether they should be resolved against $url.
if ($cUrl !== "") {
    crawl($cUrl);
}

$data = implode(";", $totalLinks);

echo "<script>$('status').setText('in process...". (count($totalLinks)+1)."')</script>";

if ($i < count($totalLinks)) {
    // The link list comes from scraped pages, so encode it as a JavaScript
    // string literal instead of pasting it between quotes; the HEX flags keep
    // it from closing the surrounding script element.
    $jsLinks = json_encode(
        $data,
        JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT | JSON_INVALID_UTF8_SUBSTITUTE
    );
    echo "<script>crawl(" . $jsLinks . ",'" . ($i + 1) . "')</script>";
} else {
    $fp = @fopen("sitemap.xml", "wb+");
    if ($fp) {
        $data = '<' . '?xml version="1.0" encoding="UTF-8"?' . '>';
        $data .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\n"
            . 'xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">' . "\n";
        $data .= "<url>\n";
        // htmlspecialchars() with ENT_XML1 emits only entities that are valid
        // in XML; htmlentities() would produce HTML-only named entities.
        $data .= " <loc>" . htmlspecialchars($url, ENT_QUOTES | ENT_XML1, "UTF-8") . "</loc>\n";
        $data .= " <priority>1.00</priority>\n";
        $data .= " <changefreq>daily</changefreq>\n";
        $data .= "</url>";
        foreach ($totalLinks as $value) {
            $data .= "\n<url>\n";
            $data .= " <loc>" . htmlspecialchars($value, ENT_QUOTES | ENT_XML1, "UTF-8") . "</loc>\n";
            $data .= " <priority>0.80</priority>\n";
            $data .= " <changefreq>daily</changefreq>\n";
            $data .= "</url>";
        }
        $data .= "\n</urlset>";
        if (@fwrite($fp, $data) === FALSE) {
            echo "<script>alert('Site map can not be saved!')</script>";
        }
        fclose($fp);
    } else {
        // Report an unwritable target instead of silently claiming success.
        echo "<script>alert('Site map can not be saved!')</script>";
    }
    echo "<script>$('status').setText('Completed[" .count($totalLinks)."]') </script>";
    unset($data, $i, $totalLinks, $url, $cUrl, $removeText);
}
?>And of course, HTML + JavaScript
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head lang="bg">
<title>Web Crawler</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<script type="text/javascript" src="ajax.js"></script>
<script type="text/javascript" src="ajax-more.js"></script>
</head>
<body>
<div align="center">
<input type="button" value="Crawl" onclick="crawl(0,0)" />
</div>
<div id="status"></div>
<script type="text/javascript">
/**
 * Send one crawl step to the server.
 *
 * Posts the given link list and position to crawl.php through a MooTools
 * Request.HTML object and shows an alert if the request fails.
 *
 * @param links    Value sent as the "links" field.
 * @param position Value sent as the "position" field.
 */
function crawl(links, position)
{
    var payload = {links: links, position: position};

    var reportFailure = function()
    {
        alert('The application could not be executed!');
    };

    var request = new Request.HTML({
        url: "crawl.php",
        data: payload,
        onFailure: reportFailure
    });

    request.post();
}
</script>
</body>
</html>
About Mootools
views 4225



