/*
 * 链接: http://www.1point3acres.com/bbs/thread-91000-1-1.html
 *
 * 如果只是抓取一些结构比较简单、规则的网页,可以用SAS(纯属娱乐)。
 * SAS入门的话,推荐SAS Base和Advanced认证的教材:这两个认证本身其实没什么用,
 * 但教材的内容对于一个专业的SAS Programmer已经足够了。
 * 学会 SAS data step、SQL、macro 之后,日常处理应该都没问题。
 */
/*
 * webScholar
 *
 * Fetches Google Scholar result pages for the query "python" and collects
 * the title and the "Cited by" count of every result into
 * mywork.results_web (columns: titles, citenumber; both character).
 *
 * Takes no parameters. Side effects: assigns the MYWORK libref and
 * (re)creates mywork.results_web, mywork.web and mywork.extracted.
 *
 * To search for something else, change the q=python part of the URL below.
 */
%macro webScholar;
    /* Keep helper macro variables out of the global symbol table. */
    %local pageno url;

    /* MEMLIB holds the data sets of this library in memory rather than on */
    /* D:\ to speed up access. Nothing in this macro writes the results to */
    /* disk, so copy mywork.results_web to a permanent library afterwards  */
    /* if it must be kept.                                                  */
    libname mywork "D:\" memlib;

    /* Empty table that accumulates the scraped titles and citation counts. */
    proc sql;
        create table mywork.results_web (titles char(500), citenumber char(500));
    quit;

    /* PAGENO is the value of the start= URL parameter: 0, 10 and 20, i.e. */
    /* three result pages. Raise the upper bound to fetch more pages.       */
    %do pageno = 0 %to 20 %by 10;

        /* Build the page URL. The literal is single-quoted so that the     */
        /* ampersands in the query string are not treated as macro triggers. */
        data _null_;
            length url $ 256;
            url = 'http://scholar.google.com/scholar?start=0&q=python&hl=en&as_sdt=0,5';
            url = prxchange("s/start=0/start=&pageno/", 1, url);
            /* SYMPUTX strips the trailing blanks of the fixed-length value. */
            call symputx("url", url);
        run;

        /* RECFM=N reads the response as a byte stream instead of line      */
        /* records; a single line of HTML can be longer than a SAS          */
        /* character variable can hold.                                      */
        filename web url "%superq(url)" recfm=n debug;

        /* Store the page as consecutive chunks of up to 256 bytes.         */
        /* $VARYING reads as many bytes as LEN reports for the chunk.       */
        data mywork.web;
            length webtext $ 256;
            infile web length=len;
            input webtext $varying256. len;
            textlength = len;
        run;

        /* The page is fully buffered in mywork.web; release the fileref. */
        filename web clear;

        /* Re-assemble the chunks into a sliding buffer and pull out every */
        /* title / citation pair as soon as it is completely buffered.     */
        data mywork.extracted;
            length s $ 32767; /* maximum length of a SAS character variable */
            length r $ 500;
            length cite $ 500;
            /* s carries the unconsumed page text from one chunk to the next. */
            retain s;
            retain re_entry re_title re_cite;

            /* Compile the three patterns once instead of on every pass. */
            if _n_ = 1 then do;
                /* a whole result: <h3>...</h3> followed by its Cite link */
                re_entry = prxparse('/<h3(\w|\W)*?<\/h3>(\w|\W)*?>Cite(d by )??\d*<\/a>/i');
                /* the title element only */
                re_title = prxparse('/<h3(\w|\W)*?<\/h3>/i');
                /* the citation link only */
                re_cite = prxparse('/>Cite(d by )??\d*<\/a>/i');
            end;

            set mywork.web;

            /* NOTE(review): CATS strips leading and trailing blanks of both */
            /* arguments, so a blank that falls on a chunk boundary is lost. */
            /* That can fuse two words or break a pattern match - confirm    */
            /* whether this matters for the pages being scraped.             */
            s = cats(s, webtext);

            position = .;
            do until (position = 0);
                /* Only extract when a complete title + citation pair is buffered. */
                call prxsubstr(re_entry, s, position, matchlen);
                if position ^= 0 then do;
                    /* Title: take the <h3> element ... */
                    call prxsubstr(re_title, s, position, matchlen);
                    r = substr(s, position, matchlen);
                    /* ... and remove tags, [bracketed] labels and HTML entities. */
                    r = prxchange('s/(<[^>]*?>)|(\[[^\]]*?\])|(&[^;]*?;s?)//', -1, r);
                    /* Drop everything up to the end of the title from the buffer. */
                    s = substrn(s, position + matchlen);

                    /* Citation link that follows the title. */
                    call prxsubstr(re_cite, s, position, matchlen);
                    cite = substr(s, position, matchlen);
                    /* Keep only the digits of the citation count. */
                    cite = prxchange('s/(\D*)(\d*)(\D*)/$2/', 1, cite);
                    s = substrn(s, position + matchlen);

                    output;
                end;
            end;

            /* Keep the buffer below the 32767 limit: once it grows past   */
            /* 29000 characters, discard its oldest 256 characters.        */
            if length(s) > 29000 then s = substrn(s, 257);
        run;

        /* Append the results of this page. Because of MEMLIB the table is */
        /* held in memory only.                                             */
        proc sql;
            insert into mywork.results_web (titles, citenumber)
            select r, cite from mywork.extracted;
        quit;
    %end;
%mend webScholar;

%webScholar