spider.func.php 2.08 KB
<?php
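// PHP XML sitemap generator: crawls a site starting from the configured base URL and writes the collected pages to the sitemap file.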


/**
 * Crawls the site starting at $CONFIG["baseurl"] and writes the XML sitemap.
 *
 * Pending URLs live in $SPIDER["temp"]. They are taken off the end of that
 * queue in batches of up to four and handed to multiGetURL(), which (through
 * getAnchors() and handleHref()) pushes newly discovered links back onto the
 * queue. Once the queue is empty, the string returned by genXmlSitemap() is
 * written to $CONFIG["sitemap_file"].
 *
 * Globals read:    $CONFIG["baseurl"], $CONFIG["sitemap_file"]
 * Globals written: $SPIDER["temp"], $SPIDER["baseurl"]
 *
 * @return void
 */
function spider() {
        global $CONFIG, $SPIDER;
        $batch_size = 4;

        $SPIDER["temp"][0] = $CONFIG["baseurl"];
        $SPIDER["baseurl"] = $CONFIG["baseurl"];

        while(sizeof($SPIDER["temp"]) > 0) {
                // Start every round with an empty batch; otherwise URLs from
                // earlier rounds would be fetched (and recorded) again.
                $urls = array();
                // Fill the batch by size rather than by a counter compared
                // against the shrinking queue, so a full batch is really taken.
                while(sizeof($urls) < $batch_size && sizeof($SPIDER["temp"]) > 0) {
                        $urls[] = array_pop($SPIDER["temp"]);
                }
                multiGetURL($urls);
        }

        $fp = fopen($CONFIG["sitemap_file"], "w+");
        if($fp === false) {
                // fopen() has already raised a warning; there is no handle to
                // write to or close, so stop here.
                return;
        }
        fputs($fp, genXmlSitemap());
        fclose($fp);
}

/**
 * Decides whether a link found on a page should be queued for crawling.
 *
 * A link is appended to $SPIDER["temp"] only when its scheme is "http", it has
 * not been crawled yet (urlInSpider), it is not external relative to
 * $SPIDER["baseurl"] (isLinkExternal), and it is not already queued (urlInTemp).
 *
 * NOTE(review): links with any other scheme (including "https") are ignored,
 * as in the original code; confirm whether https pages should be crawled.
 *
 * @param string $html Markup of the page the link was found on (unused here).
 * @param string $href Link target to examine.
 * @param string $url  URL of the page the link was found on (unused here).
 * @return false|null  false for "javascript:" links, null in every other case.
 */
function handleHref($html, $href, $url) {
        global $SPIDER;
        $url_info = parse_url($href);

        // parse_url() returns false for seriously malformed URLs and leaves out
        // the "scheme" key for relative ones; treat both as "no scheme" instead
        // of indexing blindly. Schemes are case-insensitive, so normalise.
        $scheme = "";
        if(is_array($url_info) && isset($url_info["scheme"])) {
                $scheme = strtolower($url_info["scheme"]);
        }

        if($scheme == "javascript") {
                return false;
        }
        if($scheme != "http") {
                return null;
        }

        // Same check order as before: crawled, external, already queued.
        if(urlInSpider($href) || isLinkExternal($href, $SPIDER["baseurl"]) || urlInTemp($href)) {
                return null;
        }

        $SPIDER["temp"][] = $href;
        return null;
}

/**
 * Extracts the anchor tags from a page and passes each link to handleHref().
 *
 * Every tag returned by getTags($html, '<a', '>') has its "href=" field read
 * with getTagField(), is run through correctUrl() together with
 * $SPIDER["baseurl"], and is then handed to handleHref().
 *
 * @param string $url  URL of the page being processed.
 * @param string $html Markup of that page.
 * @return void
 */
function getAnchors($url, $html) {
        global $SPIDER;
        $tags = getTags($html, '<a', '>');
        $total = sizeof($tags);
        $idx = 0;
        while($idx < $total) {
                $link = correctUrl(getTagField($tags[$idx], "href="), $SPIDER["baseurl"]);
                handleHref($html, $link, $url);
                $idx++;
        }
}

/**
 * Fetches a batch of URLs and processes every page that returned content.
 *
 * The pages are downloaded with curlMultiGetPage(). Each URL whose response is
 * non-empty is appended to $SPIDER["spider"] and its markup is scanned for
 * further links by getAnchors(). A URL that is already recorded is skipped, so
 * a page fetched twice is neither listed twice nor parsed twice.
 *
 * @param array $urls URLs to fetch; results are matched to them by index.
 * @return void
 */
function multiGetURL($urls) {
        global $SPIDER;
        // Ensure the list of crawled pages exists before it is searched.
        if(!isset($SPIDER["spider"])) {
                $SPIDER["spider"] = array();
        }
        $htmls = curlMultiGetPage($urls);
        for($i = 0; $i < sizeof($urls); $i++) {
                echo "Checking ".$urls[$i]." ...\n";
                // A missing or empty result means the page could not be fetched.
                if(!isset($htmls[$i]) || $htmls[$i] == "") {
                        continue;
                }
                // Already recorded, e.g. because the URL was re-queued while
                // it was part of an in-flight batch.
                if(in_array($urls[$i], $SPIDER["spider"], true)) {
                        continue;
                }
                $SPIDER["spider"][] = $urls[$i];
                getAnchors($urls[$i], $htmls[$i]);
        }
}

/**
 * Tells whether a URL is already in the list of crawled pages, $SPIDER["spider"].
 *
 * @param string $url URL to look up.
 * @return bool true when the exact URL string is present; false otherwise,
 *              including when the list does not exist yet.
 */
function urlInSpider($url) {
        global $SPIDER;
        // The list is only created once the first page has been recorded.
        if(!isset($SPIDER["spider"]) || !is_array($SPIDER["spider"])) {
                return false;
        }
        // Strict comparison: loose matching treats numeric-looking strings
        // such as "1e3" and "1000" as equal.
        return in_array($url, $SPIDER["spider"], true);
}

/**
 * Tells whether a URL is already waiting in the crawl queue, $SPIDER["temp"].
 *
 * @param string $url URL to look up.
 * @return bool true when the exact URL string is queued; false otherwise,
 *              including when the queue does not exist yet.
 */
function urlInTemp($url) {
        global $SPIDER;
        // Guard against a missing queue instead of handing null to in_array().
        if(!isset($SPIDER["temp"]) || !is_array($SPIDER["temp"])) {
                return false;
        }
        // Strict comparison: loose matching treats numeric-looking strings
        // such as "1e3" and "1000" as equal.
        return in_array($url, $SPIDER["temp"], true);
}

?>