PHP Web Crawler

Aug 25, 2009 Author: vvaswani

This is an example of a bot script that indexes a Web site. It uses PHP and JavaScript (the MooTools AJAX framework, version 1.2+).

<?php

/**
 * Reads one page and appends every previously unseen link in it to the
 * global link list.
 *
 * Inputs and outputs that live outside the parameter list:
 *  - reads  $GLOBALS['removeText']: substrings that disqualify a link;
 *  - writes $GLOBALS['totalLinks']: the accumulated list of links.
 *
 * Only double-quoted href attributes that are followed by whitespace or '>'
 * are recognised.
 *
 * @param string $url Address (or local path) of the page to scan.
 * @return bool True when the page could be read (even if it held no links),
 *              false when $url is empty or the page could not be fetched.
 */
function crawl($url)
{
    // Bind the globals themselves: working on a copy would silently discard
    // every link found by this call.
    global $totalLinks, $removeText;

    if (!is_array($totalLinks))
        $totalLinks = array();
    $filter = is_array($removeText) ? $removeText : array();

    if (!is_string($url) || $url === '')
        return false;

    // Best effort: an unreachable page is reported through the return value
    // instead of a warning that would corrupt the AJAX response.
    $content = @file_get_contents($url);
    if ($content === false)
        return false;

    if (!preg_match_all('/href="(.*?)"[\s>]/', $content, $matches, PREG_PATTERN_ORDER))
        return true;

    foreach ($matches[1] as $link)
    {
        if ($link === '')
            continue;

        $rejected = false;
        foreach ($filter as $value)
        {
            // An empty needle would match (or warn on) every link; skip it.
            if ($value !== '' && strpos($link, $value) !== false)
            {
                $rejected = true;
                break;
            }
        }

        if (!$rejected && !in_array($link, $totalLinks, true))
            $totalLinks[] = $link;
    }

    return true;
}

/**
 * Turns a harvested href into an absolute URL so it can be fetched and
 * written to the sitemap (the sitemap protocol requires absolute <loc> values).
 *
 * Simplification: a link without a scheme is treated as relative to the
 * site root $base, not to the page it was found on.
 *
 * @param string $base Root URL of the site being indexed.
 * @param string $link Link exactly as it appeared in an href attribute.
 * @return string Absolute URL.
 */
function crawlerAbsoluteUrl($base, $link)
{
    if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $link))
        return $link;

    return rtrim($base, '/') . '/' . ltrim($link, '/');
}

// Root URL of the site to index; replace the placeholder before running.
$url = "address of the site, which will index";
$totalLinks = array();
$removeText = array("mailto:","javascript:",".css","http://","#");

// The links found so far travel back and forth as one ';'-separated string.
// The very first request sends 0, which means "nothing collected yet".
$postedLinks = (isset($_POST['links']) && is_string($_POST['links'])) ? $_POST['links'] : '';
if ($postedLinks !== '' && $postedLinks !== '0')
    $totalLinks = explode(";", $postedLinks);

// Position 0 means "crawl the root page"; position p > 0 means "crawl the
// p-th collected link", i.e. index p - 1, so no entry is skipped and the
// last request does not read past the end of the list.
$i = (isset($_POST['position']) && is_numeric($_POST['position'])) ? max(0, (int) $_POST['position']) : 0;

$cUrl = null;
if ($i === 0)
    $cUrl = $url;
elseif (isset($totalLinks[$i - 1]))
    $cUrl = crawlerAbsoluteUrl($url, $totalLinks[$i - 1]);

if ($cUrl !== null)
    crawl($cUrl);

$data = implode(";", $totalLinks);

echo "<script>$('status').setText('in process...". (count($totalLinks)+1)."')</script>";

// Encode the list as a JavaScript string literal; the HEX flags keep quotes,
// '<', '>' and '&' from terminating the literal or the <script> element.
$jsLinks = json_encode($data, JSON_HEX_TAG | JSON_HEX_APOS | JSON_HEX_QUOT | JSON_HEX_AMP);

if($i < count($totalLinks) && $jsLinks !== false)
    echo "<script>crawl(" . $jsLinks . ",'" . ($i+1) . "')</script>";
else
{
    // Either every collected link has been visited, or the list cannot be
    // handed back to the browser (json_encode failed, e.g. on invalid UTF-8):
    // in both cases write out what has been collected.
    $data  = '<?xml version="1.0" encoding="UTF-8"?>';
    $data .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
           . ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
           . ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9'
           . ' http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">' . "\n";

    $data .= "<url>\n";
    // htmlspecialchars emits only entities that are also valid in XML.
    $data .= "    <loc>" . htmlspecialchars($url, ENT_QUOTES, "UTF-8") . "</loc>\n";
    $data .= "    <priority>1.00</priority>\n";
    $data .= "    <changefreq>daily</changefreq>\n";
    $data .= "</url>";

    foreach($totalLinks as $value)
    {
        $data .= "\n<url>\n";
        $data .= "    <loc>" . htmlspecialchars(crawlerAbsoluteUrl($url, $value), ENT_QUOTES, "UTF-8") . "</loc>\n";
        $data .= "    <priority>0.80</priority>\n";
        $data .= "    <changefreq>daily</changefreq>\n";
        $data .= "</url>";
    }
    $data .= "\n</urlset>";

    // Report a failed open as well as a failed write.
    if(@file_put_contents("sitemap.xml", $data) === FALSE)
        echo "<script>alert('Site map can not be saved!')</script>";

    echo "<script>$('status').setText('Completed[" .count($totalLinks)."]') </script>";

    unset($data, $i, $totalLinks, $url, $cUrl, $removeText);
}

?>

And of course, HTML + JavaScript

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head lang="bg">
    <title>Web Crawler</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- Presumably the MooTools core and "more" builds (Request.HTML is used below); confirm the file contents. -->
    <script type="text/javascript" src="ajax.js"></script>
    <script type="text/javascript" src="ajax-more.js"></script>
</head>

<body>
<div align="center">
    <!-- Starts a crawl from scratch: no links collected yet, position 0. -->
    <input type="button" value="Crawl" onclick="crawl(0,0)" />
</div>
<!-- Progress text; the scripts echoed by crawl.php target this element by id. -->
<div id="status"></div>
<script type="text/javascript">
/**
 * Posts the current crawl state to crawl.php.
 *
 * links    - the links collected so far (crawl.php echoes them back as one
 *            ';'-separated string), or 0 on the first call.
 * position - the step number crawl.php should process next.
 *
 * crawl.php answers with <script> snippets that call crawl() again, so the
 * loop continues only if the response scripts are executed (assumed to be
 * Request.HTML's behaviour here; confirm against the MooTools build in use).
 */
function crawl(links, position)
{
    var payload = {links: links, position: position};

    var request = new Request.HTML({
        url: "crawl.php",
        data: payload,
        onFailure: function()
        {
            alert('The application could not be executed!');
        }
    });

    request.post();
}
</script>
</body>
</html>

About Mootools


views 7263
  1. Add New Comment