spider.func.php
2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
<?php
// PHP xml sitemap generator
/**
 * Crawls the site starting at $CONFIG["baseurl"] and writes the XML sitemap.
 *
 * Seeds the pending queue $SPIDER["temp"] with the base URL, then repeatedly
 * pops a batch of pending URLs and passes it to multiGetURL() until the queue
 * is empty. Afterwards the string returned by genXmlSitemap() is written to
 * $CONFIG["sitemap_file"].
 *
 * Reads the globals $CONFIG ("baseurl", "sitemap_file") and $SPIDER; writes
 * $SPIDER["temp"], $SPIDER["baseurl"] and, when missing, $SPIDER["spider"].
 *
 * @return void
 */
function spider() {
        global $CONFIG, $SPIDER;
        // Upper bound on the number of URLs handed to multiGetURL() per call.
        $batch_size = 4;
        $SPIDER["temp"][0] = $CONFIG["baseurl"];
        $SPIDER["baseurl"] = $CONFIG["baseurl"];
        // Make sure the list of recorded URLs exists even if no page is ever fetched.
        if(!isset($SPIDER["spider"]) || !is_array($SPIDER["spider"])) {
                $SPIDER["spider"] = array();
        }
        while(sizeof($SPIDER["temp"]) > 0) {
                // Start every batch empty; otherwise earlier batches would be
                // fetched and recorded again on each pass.
                $urls = array();
                // Count the batch itself rather than comparing a growing index
                // against the shrinking queue, which would cut batches short.
                while(sizeof($urls) < $batch_size && sizeof($SPIDER["temp"]) > 0) {
                        $urls[] = array_pop($SPIDER["temp"]);
                }
                multiGetURL($urls);
        }
        $fp = fopen($CONFIG["sitemap_file"], "w+");
        if($fp === false) {
                // fopen() has already raised its own warning; do not pass false to fputs()/fclose().
                trigger_error("Unable to open sitemap file for writing: ".$CONFIG["sitemap_file"], E_USER_WARNING);
                return;
        }
        $xml_sitemap = genXmlSitemap();
        fputs($fp, $xml_sitemap);
        fclose($fp);
}
/**
 * Queues a link for crawling when it is a new, internal http URL.
 *
 * The link is appended to $SPIDER["temp"] only if its scheme is "http", it is
 * not reported by urlInSpider(), isLinkExternal() does not flag it against
 * $SPIDER["baseurl"], and it is not reported by urlInTemp().
 *
 * @param string $html Markup of the page the link came from (not used here).
 * @param string $href Link target to examine.
 * @param string $url  URL of the page the link came from (not used here).
 * @return false|null  false for javascript: links, null in every other case.
 */
function handleHref($html, $href, $url) {
        global $SPIDER;
        $url_info = parse_url($href);
        // parse_url() returns false for seriously malformed URLs and leaves out
        // the "scheme" key for relative ones; neither can be queued.
        if(!is_array($url_info) || !isset($url_info["scheme"])) {
                return;
        }
        // URI schemes are case-insensitive, so normalise before comparing.
        $scheme = strtolower($url_info["scheme"]);
        if($scheme === "javascript") {
                return false;
        }
        if($scheme !== "http") {
                return;
        }
        // Same checks, in the same order, as the original nested conditions.
        if(urlInSpider($href) || isLinkExternal($href, $SPIDER["baseurl"]) || urlInTemp($href)) {
                return;
        }
        $SPIDER["temp"][] = $href;
}
/**
 * Passes the href of every anchor tag found in a page to handleHref().
 *
 * Tags are obtained from getTags() using the delimiters '<a' and '>'; the
 * "href=" field of each tag is read with getTagField() and run through
 * correctUrl() together with $SPIDER["baseurl"] before being handed on.
 *
 * @param string $url  URL of the page being processed.
 * @param string $html Markup of that page.
 * @return void
 */
function getAnchors($url, $html) {
        global $SPIDER;
        $tags = getTags($html, '<a', '>');
        $total = sizeof($tags);
        $n = 0;
        while($n < $total) {
                $raw_href = getTagField($tags[$n], "href=");
                handleHref($html, correctUrl($raw_href, $SPIDER["baseurl"]), $url);
                $n++;
        }
}
/**
 * Fetches a batch of URLs and processes every page that came back non-empty.
 *
 * Each URL is echoed as it is checked. A URL whose fetched content is not
 * empty is appended to $SPIDER["spider"] and its links are extracted with
 * getAnchors(). A URL that is already in $SPIDER["spider"] is skipped, so the
 * list never holds the same URL twice.
 *
 * @param array $urls URLs to fetch; presumably curlMultiGetPage() returns the
 *                    page contents under the same indexes - confirm against
 *                    that helper.
 * @return void
 */
function multiGetURL($urls) {
        global $SPIDER;
        if(!isset($SPIDER["spider"]) || !is_array($SPIDER["spider"])) {
                $SPIDER["spider"] = array();
        }
        $htmls = curlMultiGetPage($urls);
        for($i = 0; $i < sizeof($urls); $i++) {
                echo "Checking ".$urls[$i]." ...\n";
                // A missing or empty result means there is nothing to record or parse.
                if(!isset($htmls[$i]) || $htmls[$i] == "") {
                        continue;
                }
                // A URL can be queued again while its own batch is still being
                // processed; recording it a second time would duplicate it.
                if(in_array($urls[$i], $SPIDER["spider"], true)) {
                        continue;
                }
                $SPIDER["spider"][] = $urls[$i];
                getAnchors($urls[$i], $htmls[$i]);
        }
}
/**
 * Tells whether a URL is already in the list of recorded URLs.
 *
 * @param string $url URL to look up.
 * @return bool true when $url is an element of $SPIDER["spider"]; false
 *              otherwise, including when that list does not exist yet.
 */
function urlInSpider($url) {
        global $SPIDER;
        // in_array() rejects a non-array haystack, and the list may not exist yet.
        if(!isset($SPIDER["spider"]) || !is_array($SPIDER["spider"])) {
                return false;
        }
        // Strict comparison so that only an identical string counts as a match.
        return in_array($url, $SPIDER["spider"], true);
}
/**
 * Tells whether a URL is already waiting in the pending queue.
 *
 * @param string $url URL to look up.
 * @return bool true when $url is an element of $SPIDER["temp"]; false
 *              otherwise, including when that queue does not exist yet.
 */
function urlInTemp($url) {
        global $SPIDER;
        // in_array() rejects a non-array haystack, and the queue may not exist yet.
        if(!isset($SPIDER["temp"]) || !is_array($SPIDER["temp"])) {
                return false;
        }
        // Strict comparison so that only an identical string counts as a match.
        return in_array($url, $SPIDER["temp"], true);
}
?>