#coding:utf-8
"""Fetch one hard-coded web page and print the first tag with target="_blank".

Written for Python 2 (urllib2) and BeautifulSoup 3.
"""
#---------------------------------import---------------------------------------
import urllib2

from BeautifulSoup import BeautifulSoup

#------------------------------------------------------------------------------
def main():
    """Download the page, dump its HTML, then print the first matching tag.

    Output goes to stdout in three steps: the raw response body, the first
    element whose ``target`` attribute equals ``_blank`` (``None`` if there
    is no such element), and that element's ``.string`` when it was found.
    """
    # Fetch step. Change this URL to scrape a different page.
    userMainUrl = "http://tieba.baidu.com/home/main?id=38b94c4ed8add8bcccabd7d31b22&fr=userbar"
    req = urllib2.Request(userMainUrl)
    resp = urllib2.urlopen(req)
    try:
        respHtml = resp.read()
    finally:
        # Python 2 urllib2 responses are not context managers, so close
        # explicitly to release the socket even if read() raises.
        resp.close()
    # Dumps the complete HTML source that was downloaded.
    print("respHtml= %s" % (respHtml,))

    # Extract step. Change this to match the charset declared by the page.
    htmlEncoding = "GBK"
    soup = BeautifulSoup(respHtml, fromEncoding=htmlEncoding)
    # Change the attrs filter to select different content.
    foundClassH1user = soup.find(attrs={"target": "_blank"})
    # The original passed the value as a second print item, so the "%s"
    # placeholder was printed literally instead of being substituted.
    print("foundClassH1user= %s" % (foundClassH1user,))
    if foundClassH1user:
        h1userStr = foundClassH1user.string
        print("h1userStr= %s" % (h1userStr,))

###############################################################################
if __name__ == "__main__":
    main()
抓取1类标签
#eg:siteUrls=soup.findAll('a')
抓取2类标签
#eg:foundClassH1user = soup.find(attrs={"target":"_blank"});
抓取2类标签
#foundClassH1user = soup.find(attrs={"class":"h1user"});
时间: 2024-10-19 10:44:32