因为工作需要,用nodejs写了个简单的爬虫例子,之前也没用过nodejs,连搭环境加写大概用了5天左右,so。。。要多简陋有多简陋,放这里给以后的自己看~~
整体需求是:给一个有效的URL地址,返回该网页上所有无效链接的百分比(坏链率)
第一个文件:计算坏链率 urlSpider.js
/*================================================
@author MissUU
Link-crawling approach:

1. Fetch the page content
2. Collect every <a> tag with a regex
3. Extract the href value: drop a stray leading quote, and
   prefix the host when the value does not start with "http"
   (values starting with "javascript" are left alone)
4. Validate the result against a common-URL regex
================================================*/
var http = require('http');
var async = require('async');
var dbHandle = require('./dbHandle.js');
/**
 * Main worker: download one page, then check all of its links.
 *
 * @param {Object} obj        record with {id, url} describing the page
 * @param {Function} callback invoked once this page is fully processed
 *                            (signature fits async.eachSeries)
 */
var runUrlSpider = function(obj, callback){
    // Abort after 10s: http.get can otherwise wait forever and stall
    // the whole series (see the author's note at the end of the post).
    var request_timer = setTimeout(function() {
        req.abort();
        console.log('Request Timeout.');
    }, 10000);

    var urlBadLink = new UrlBadLink();
    var html = '';
    var req = http.get(obj.url, function(res) {

        clearTimeout(request_timer);

        res.setEncoding('utf8');
        res.on('data', function (chunk) {
            html += chunk;
        }).on('end', function(){
            console.log('*******开始提取有效链接地址******');
            console.log(new Date().toLocaleString());
            console.log(obj.url);
            urlBadLink.host = obj.url;
            urlBadLink.id = obj.id;
            matchURL(html, urlBadLink, function(){
                callback();
            });
        });
    });

    // NOTE(review): an 'error' fired after 'end' (e.g. mid-stream reset)
    // could invoke callback a second time — confirm against async's contract.
    req.on('error', function(e) {
        console.log('problem with request: ' + e.message);
        callback();
    });
};
48
// Entry point: load the URL list of group 1 from the DB and crawl
// each page strictly in series (one outstanding request at a time).
var main = function(){
    // showUrls delivers its rows via callback; it has no return value,
    // so the original `var urlArray = ...` assignment was dropped.
    dbHandle.showUrls(1, function(result){
        async.eachSeries(result, runUrlSpider, function(err){
            console.log('******this is the end, haha*******');
        });
    });
};

main();
63
/*
 * Check every link found on a page and persist the bad-link stats.
 *
 * @param {string} content        raw HTML of the page
 * @param {UrlBadLink} urlBadLink stats holder; .host carries the page URL
 * @param {Function} callend      invoked once every link has been checked
 */
function matchURL(content, urlBadLink, callend){
    var host = urlBadLink.host;
    var anchor = /<a\s[^>]*>/g;
    var matches = content.match(anchor);
    var badLink = 0;
    var flag = 0;

    // GET one link; a request error or a status outside 200-399 counts as bad.
    var HttpGet = function(url, callback){
        // 10s timeout, same reasoning as in runUrlSpider
        var request_timer = setTimeout(function() {
            req.abort();
            console.log('Request Timeout.');
        }, 10000);

        var req = http.get(url, function(res) {
            clearTimeout(request_timer);

            // Drain the body; only the status code matters here.
            res.on('data', function () {
            }).on('end', function(){
                console.log(++flag + ": " + url + ' response status: ' + res.statusCode);

                if(!(res.statusCode >= 200 && res.statusCode < 400)){
                    console.log('-----------------------');
                    badLink++;
                }

                callback();
            });
        });
        req.on('error', function(err){
            console.log(++flag + ": " + 'problem with request: ' + err.message);
            badLink++;
            callback();
        });
    };

    var urls = filterUrl(matches, host);

    // An empty array must take the fallback too: with total = 0,
    // getRate() would otherwise compute 0/0 and store "NaN%".
    if(urls !== null && urls.length > 0){
        var totalLink = urls.length;
        async.eachSeries(urls, HttpGet, function(err){
            urlBadLink.total = totalLink;
            urlBadLink.badCounts = badLink;
            // data store happens here
            dbHandle.updateBadLink(urlBadLink);
            callend();
        });
    }else{
        console.log('no links found');
        // 0/10 keeps the stored rate at a harmless 0.00%
        urlBadLink.total = 10;
        urlBadLink.badCounts = 0;
        dbHandle.updateBadLink(urlBadLink);
        callend();
    }
}
128
/**
 * Extract the href value from one <a ...> tag and normalise it:
 * - a stray leading double quote is stripped
 * - a leading "/" is replaced by the host prefix
 * - a value containing neither "http" nor "javascript" is treated as
 *   relative and prefixed with the host
 *
 * @param {string} strUrl one <a ...> tag
 * @param {string} host   prefix for relative links (the page URL)
 * @returns {string|null} normalised URL, or null when no href is present
 */
function URLFommat(strUrl, host){

    var urlPatten = /href=[\'\"]?([^\'\"]*)[\'\"]?/i;
    var temp = urlPatten.exec(strUrl);

    if(temp != null){
        // NOTE(review): substring(6, len-1) assumes the href value is quoted;
        // an unquoted href loses its first and last characters — confirm inputs.
        var url = temp[0].substring(6, temp[0].length - 1).trim();

        if(url.indexOf("\"") != -1){
            url = url.slice(url.indexOf("\"") + 1);
        }

        if(url.charAt(0) == "/"){
            url = url.slice(1);
            return host + url;
        }else if((url.indexOf("http") == -1) &&
                 (url.indexOf("javascript") == -1)){
            return host + url;
        }else{
            return url;
        }
    }else{
        return null;
    }
}
153
/**
 * Variant of URLFommat for a bare domain host (no scheme): "http://"
 * is prepended when building absolute URLs. Currently unused.
 *
 * @param {string} strUrl one <a ...> tag
 * @param {string} host   bare domain, e.g. "www.sina.com.cn"
 * @returns {string|null} normalised URL, or null when no href is present
 */
function URLFommat1(strUrl, host){

    var urlPatten = /href=[\'\"]?([^\'\"]*)[\'\"]?/i;
    var temp = urlPatten.exec(strUrl);

    if(temp != null){
        var url = temp[0].substring(6, temp[0].length - 1).trim();

        if(url.indexOf("\"") != -1){
            url = url.slice(url.indexOf("\"") + 1);
        }

        if(url.charAt(0) == "/"){
            return "http://" + host + url;
        }else if((url.indexOf("http") == -1) &&
                 (url.indexOf("javascript") == -1)){
            return "http://" + host + "/" + url;
        }else{
            return url;
        }
    }else{
        return null;
    }
}
176 //test URLFommat
177 //var test = "http://baidu.com";
178 // var test1 = " \"http://baidu.com";
179 //var test2 = "/wenhao";
180 //console.log(URLFommat(test,"www.sina.com.cn"));
181 //console.log(URLFommat(test1,"www.sina.com.cn"));
182 //console.log(URLFommat(test2,"www.sina.com.cn"));
183
184
/**
 * Check whether a string looks like a common URL: optional
 * http/https/ftp scheme, a domain ending in a known TLD, a two-letter
 * country code or a numeric octet, and an optional path.
 *
 * @param {string} strUrl candidate URL
 * @returns {boolean}
 */
function IsURL(strUrl) {
    if(strUrl != null){
        // "https?" fixed: the original "(http?|ftp)" matched "htt"/"http"
        // but never "https", so every https link was reported broken.
        var regular = /^\b(((https?|ftp):\/\/)?[-a-z0-9]+(\.[-a-z0-9]+)*\.(?:com|edu|gov|int|mil|net|org|biz|info|name|museum|asia|coop|aero|[a-z][a-z]|((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d))\b(\/[-a-z0-9_:\@&?=+,.!\/~%\$]*)?)$/i;
        return regular.test(strUrl);
    }else{
        return false;
    }
}
198
199
/**
 * Value object holding the link statistics of one crawled page.
 *
 * @param {number} id        DB id of the site record
 * @param {string} host      page URL
 * @param {number} total     number of links checked
 * @param {number} badCounts number of broken links
 */
function UrlBadLink(id, host, total, badCounts){
    this.id = id;
    this.host = host;
    this.total = total;
    this.badCounts = badCounts;
}

/**
 * Broken-link percentage, e.g. "1.50%".
 * Defined once on the prototype (the original recreated it inside the
 * constructor behind a typeof guard — same shared method, more code).
 *
 * @returns {string}
 */
UrlBadLink.prototype.getRate = function(){
    // Guard: without it, total === 0 yields "NaN%".
    if(!this.total){
        return '0.00%';
    }
    return Number(Math.round(this.badCounts / this.total * 10000) / 100).toFixed(2) + '%';
};
214
/**
 * Turn raw <a ...> tag matches into a list of validated URLs.
 *
 * @param {Array|null} arr result of content.match() — may be null
 * @param {string} host    prefix for relative links
 * @returns {Array|null} valid URLs, or null when arr itself was null
 */
function filterUrl(arr, host){

    if(arr === null){
        return null;
    }

    var valid = [];
    for(var i = 0; i < arr.length; i++){
        var candidate = URLFommat(arr[i], host);
        if(IsURL(candidate)){
            valid.push(candidate);
        }
    }
    return valid;
}
第二个文件:将数据存库,dbHandle.js
/**
* @author MissUU
* @des MySql基本操作
* API: https://github.com/felixge/node-mysql
*/var mysql = require(‘mysql‘);
mysql.createConnection(‘mysql://root:[email protected]/test?debug=false‘);
var pool = mysql.createPool({
host : ‘10.102.1.00‘,
user : ‘root‘,
password : ‘root‘,
database : ‘test‘,
connectionLimit: 15
});//读取urls
exports.showUrls = function (groupId, callback){console.log(‘this is showUrl()‘);
pool.getConnection(function(err, conn){if (err) {
console.log("connection error!");
console.log(err);
}conn.query(‘SELECT id,realurl as url FROM t_site WHERE siteGroupId = ?‘,groupId, function(err, result){
if(err){
console.log(err.message);
}conn.release();
if(result.length){
// console.log(result instanceof Array);
callback(result);
return true;
}else{
callback(‘‘);
return false;
}
});
});
};exports.updateBadLink = function (urlBadLink){
//若不含数据则不插入
if (!!urlBadLink) {pool.getConnection(function(err, conn){
if (err) {
console.log("connection error!");
console.log(err);
}var updateSql = "UPDATE a_qualityinfo SET brokenRate = ‘"+ urlBadLink.getRate() +"‘ WHERE siteId = " + urlBadLink.id;
console.log(updateSql);
conn.query(updateSql, function(err, result){
if(err){
console.log(err.message);
console.log(‘update fail‘);
}conn.release();
console.log(‘update success‘);
});// conn.query
});//pool.getConnection
}
};
代码后期还会改动,这里有几点需要注意的:
1、http.get有时会一直等待响应,所以一定要判断下,超时则认为出错,要不程序就卡住了。。。= =!
2、注意callback的使用,要不然很难规范执行顺序的,用过nodejs的都懂得。。。