spider.func.php
2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
<?php
// PHP XML sitemap generator
/**
 * Crawl the site starting at $CONFIG["baseurl"] and write the XML sitemap.
 *
 * Seeds the pending queue $SPIDER["temp"] with the base URL, then repeatedly
 * pops up to four queued URLs and hands them to multiGetURL() until the queue
 * is empty. Finally the output of genXmlSitemap() is written to
 * $CONFIG["sitemap_file"], replacing any previous content.
 *
 * Reads the globals $CONFIG ("baseurl", "sitemap_file") and writes the globals
 * $SPIDER["temp"] and $SPIDER["baseurl"].
 *
 * @return void
 */
function spider() {
    global $CONFIG, $SPIDER;

    // Number of URLs handed to multiGetURL() per call.
    $batchSize = 4;

    $SPIDER["temp"][0] = $CONFIG["baseurl"];
    $SPIDER["baseurl"] = $CONFIG["baseurl"];

    while (count($SPIDER["temp"]) > 0) {
        // Start every pass with a fresh batch; otherwise URLs from earlier
        // passes would be fetched again on each iteration.
        $urls = array();
        // Bound the batch by its own size rather than by a counter compared
        // against the shrinking queue, so a full batch is taken when available.
        while (count($urls) < $batchSize && count($SPIDER["temp"]) > 0) {
            $urls[] = array_pop($SPIDER["temp"]);
        }
        multiGetURL($urls);
    }

    $xml_sitemap = genXmlSitemap();
    if (file_put_contents($CONFIG["sitemap_file"], $xml_sitemap) === false) {
        // Report the failure instead of passing an invalid handle to fputs()/fclose().
        trigger_error("Unable to write sitemap file: ".$CONFIG["sitemap_file"], E_USER_WARNING);
    }
}
/**
 * Queue a link for crawling when it is an uncrawled, internal http URL.
 *
 * The href is appended to $SPIDER["temp"] only if its scheme is "http", it is
 * not reported by urlInSpider() or urlInTemp(), and isLinkExternal() does not
 * flag it relative to $SPIDER["baseurl"].
 *
 * @param string $html Markup of the page the link came from (not used here).
 * @param string $href Link target to evaluate.
 * @param string $url  URL of the page the link came from (not used here).
 * @return false|null  False for "javascript:" links, null in every other case.
 */
function handleHref($html, $href, $url) {
    global $SPIDER;

    $url_info = parse_url($href);
    // parse_url() returns false for seriously malformed URLs and omits the
    // "scheme" key for relative links; neither can be queued, so stop here
    // instead of reading a missing array key.
    if (!is_array($url_info) || !isset($url_info["scheme"])) {
        return;
    }

    if ($url_info["scheme"] === "javascript") {
        return false;
    }

    // NOTE(review): only plain "http" links are followed; "https" links are
    // skipped. Confirm whether that is intended before widening this check.
    if ($url_info["scheme"] !== "http") {
        return;
    }

    // Checks run in this order and short-circuit: already crawled, external,
    // already queued.
    if (urlInSpider($href) || isLinkExternal($href, $SPIDER["baseurl"]) || urlInTemp($href)) {
        return;
    }

    $SPIDER["temp"][] = $href;
}
/**
 * Extract every anchor tag from a page and pass its link to handleHref().
 *
 * Each tag returned by getTags() has its "href=" field read with
 * getTagField(), is run through correctUrl() together with
 * $SPIDER["baseurl"], and the result is handed to handleHref().
 *
 * @param string $url  URL of the page being processed.
 * @param string $html Markup of that page.
 * @return void
 */
function getAnchors($url, $html) {
    global $SPIDER;

    $tags = getTags($html, '<a', '>');
    $total = count($tags);
    $idx = 0;
    while ($idx < $total) {
        $link = correctUrl(getTagField($tags[$idx], "href="), $SPIDER["baseurl"]);
        handleHref($html, $link, $url);
        $idx++;
    }
}
/**
 * Fetch a batch of URLs and process every page that returned content.
 *
 * Prints a "Checking ..." line for each URL. A URL whose fetched body is
 * non-empty is appended to $SPIDER["spider"] and its markup is passed to
 * getAnchors(). URLs already present in $SPIDER["spider"] are not recorded or
 * parsed a second time.
 *
 * @param array $urls URLs to fetch, indexed 0..n-1.
 * @return void
 */
function multiGetURL($urls) {
    global $SPIDER;

    // Presumably returns the page bodies indexed like $urls; verify against
    // curlMultiGetPage().
    $htmls = curlMultiGetPage($urls);
    $total = count($urls);

    for ($i = 0; $i < $total; $i++) {
        echo "Checking ".$urls[$i]." ...\n";

        // Skip URLs with no result or an empty body without reading a missing index.
        if (!isset($htmls[$i]) || $htmls[$i] == "") {
            continue;
        }

        // A URL can be queued again by a page processed earlier in the same
        // batch; recording it twice would duplicate it in the crawled list.
        if (isset($SPIDER["spider"]) && in_array($urls[$i], $SPIDER["spider"])) {
            continue;
        }

        $SPIDER["spider"][] = $urls[$i];
        getAnchors($urls[$i], $htmls[$i]);
    }
}
/**
 * Tell whether a URL is already in the crawled list $SPIDER["spider"].
 *
 * @param string $url URL to look up.
 * @return bool True when $url is in the list; false otherwise, including when
 *              the list has not been created yet.
 */
function urlInSpider($url) {
    global $SPIDER;

    // The list only exists once a first page has been recorded; in_array()
    // does not accept a missing (null) haystack.
    if (!isset($SPIDER["spider"]) || !is_array($SPIDER["spider"])) {
        return false;
    }

    // Strict comparison: URLs must match as identical strings.
    return in_array($url, $SPIDER["spider"], true);
}
/**
 * Tell whether a URL is already in the pending queue $SPIDER["temp"].
 *
 * @param string $url URL to look up.
 * @return bool True when $url is in the queue; false otherwise, including when
 *              the queue has not been created yet.
 */
function urlInTemp($url) {
    global $SPIDER;

    // in_array() does not accept a missing (null) haystack, so treat an
    // absent queue as empty.
    if (!isset($SPIDER["temp"]) || !is_array($SPIDER["temp"])) {
        return false;
    }

    // Strict comparison: URLs must match as identical strings.
    return in_array($url, $SPIDER["temp"], true);
}
?>