药大贴吧用户数据资料爬取与简单分析 / 憋错料

 使用python爬虫连接到药大贴吧的首页，然后爬取每个话题的链接，将链接记录到一个列表中。打开列表中的链接，读取第一页的用户的主页链接和话题下的帖子页数，将用户的主页链接记录到一个集合中。如果发现有多页，就记录每一页的链接，再从这些链接中读取用户的主页链接记录到集合中。这样可爬取首页下所有用户的主页url。
 依次从集合中取出URL，打开主页，记录用户名称，性别，粉丝数，关注者的信息，发帖量等资料。

    #coding:utf-8
import urllib2
import re
from bs4 import BeautifulSoup
from numpy import unique
import time
from pandas import *
#从主页上提取用户的昵称，关注者数量，粉丝数量，粉丝连接，关注者的连接,吧年龄，帖子数量，性别
import socket
timeout =20
socket.setdefaulttimeout(timeout)                                      #设置20秒超时断开连接

class userinfo(object):
    """Container for the profile fields collected for one user.

    Only ``url`` is known at construction time; every other attribute
    starts as ``None`` and is meant to be filled in by the code that
    parses the user's home page.
    """

    def __init__(self, url):
        # The home-page URL is the only value supplied by the caller.
        self.url = url
        # All remaining fields are placeholders until parsing succeeds.
        self.username = self.sex = None
        self.age = self.tie = None
        self.concern_num = self.fans_num = None
        self.concern_link = self.fans_link = None

    def get_concern(self):
        """Return the stored ``concern_link`` (``None`` if never set)."""
        return self.concern_link

    def get_fans(self):
        """Return the stored ``fans_link`` (``None`` if never set)."""
        return self.fans_link

def urlget(url):
    """Download a user's home page and collect the profile into a ``userinfo``.

    Each group of fields is extracted inside its own ``try`` block, so a
    missing or malformed page element only loses that group: an
    ``*_error`` marker and the URL are printed and extraction continues.
    The ``name= value`` lines printed here are the progress log of the
    crawl; their text is kept stable.

    Args:
        url: Absolute URL of the user's home page.

    Returns:
        A ``userinfo`` for ``url``. Attributes that could not be parsed
        remain ``None``.
    """
    print(url)
    user = userinfo(url)
    prefix = "http://tieba.baidu.com"
    web_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    link_attrs = {'href': True, 'target': '_blank'}

    # User name shown on the page.
    try:
        username = soup.find_all('span', attrs={'class': 'userinfo_username '})[0].get_text()
        print("username= %s" % username)
        user.username = username
    except IndexError:
        print("username_error")
        print(url)

    # The first "concern_num" span is read as the followed-users entry and
    # the second one as the fans entry.
    concernlst = soup.find_all('span', attrs={'class': "concern_num"})

    try:
        concern_anchor = concernlst[0].find_all('a', attrs=link_attrs)[0]
        user.concern_link = prefix + concern_anchor['href']
        # Convert before printing so that a non-numeric count is reported
        # as an error instead of aborting the whole crawl.
        concern_num = float(concern_anchor.get_text())
        print("concern_num= %s" % concern_num)
        user.concern_num = concern_num
    except (IndexError, ValueError):
        print("concern_error----")
        print(url)

    try:
        fans_anchor = concernlst[1].find_all('a', attrs=link_attrs)[0]
        user.fans_link = prefix + fans_anchor['href']
        fans_num = int(fans_anchor.get_text())
        print("fans_num= %s" % fans_num)
        user.fans_num = fans_num
    except (IndexError, ValueError):
        print("fans_error-----------------------")
        print(url)

    try:
        infor = soup.find_all('div', attrs={'class': 'userinfo_userdata'})[0]
        # Class-less spans: the first holds the forum age, the second the
        # number of posts.
        agetie = infor.find_all('span', attrs={'class': False})
        # Drop the 3 leading characters and the last character of the age
        # text (presumably a label and a unit); the stored value now
        # matches the printed one.
        age = agetie[0].get_text()[3:-1]
        print("age= %s" % age)
        user.age = age
        tie = agetie[1].get_text()[3:]
        print("tie= %s" % tie)
        user.tie = tie
        p_sex = re.compile(r'userinfo_sex.*')
        sex_span = infor.find_all('span', attrs={'class': p_sex})[0]
        print(sex_span)
        # The second CSS class of the span carries the gender marker.
        sexstr = sex_span['class'][1]
        print("the sex of the user is : ")
        # "female" must be tested first because it contains "male".
        if "female" in sexstr:
            user.sex = "female"
        elif "male" in sexstr:
            user.sex = "male"
        else:
            user.sex = "no sex"
        print(user.sex)
    except IndexError:
        print("infor_error")
        print(url)
    return user

def getconcern(url):
    """Extract the home-page links of the users listed on a "following" page.

    Delegates to ``getfans``, which applies the same link extraction to
    the page at ``url``.

    Args:
        url: URL of the page listing the followed users.

    Returns:
        The list of home-page URLs returned by ``getfans(url)``.
    """
    return getfans(url)

def getfans(url):
    """Extract the home-page links of the users listed on a fans page.

    Every ``<span class="name">`` on the page is searched for an anchor
    whose ``href`` matches ``/home/main`` and whose target is ``_blank``.
    Spans without such an anchor are skipped.

    Args:
        url: URL of the page to download.

    Returns:
        A list of absolute home-page URLs, in page order.
    """
    prefix = "http://tieba.baidu.com"
    print(url)
    web_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    p_href = re.compile('/home/main.*')
    home_lst = []
    for s in soup.find_all('span', attrs={'class': 'name'}):
        anchors = s.find_all('a', attrs={'href': p_href, 'target': '_blank'})
        if not anchors:
            # A name span without a home link must not abort the whole page.
            continue
        homelink = anchors[0]['href']
        print(homelink)
        home_lst.append(prefix + homelink)
    return home_lst

def homeget(url):
    """Collect user home-page links from the first page of a topic.

    Also inspects the pager of the topic to work out how many pages it
    has and builds the URLs of pages 2..N.

    Args:
        url: URL of the topic (its first page).

    Returns:
        A tuple ``(links, pagelst)``: ``links`` is the ``numpy.unique``
        array of absolute home-page URLs found on this page, ``pagelst``
        is the list of URLs of the remaining pages (empty when there is
        no pager or the page count cannot be read).
    """
    web_doc = urllib2.urlopen(url).read()
    time.sleep(1)
    print("homeset,sleeping...")
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    p_home = re.compile(r'/home/main?.*')
    homenode = soup.find_all('a', attrs={'href': p_home, 'target': '_blank', 'class': 'p_author_face '})
    prefix = "http://tieba.baidu.com"
    # Join with exactly one slash so these URLs match the ones built by
    # the other functions instead of containing "//home/...".
    linklst = [prefix + "/" + home['href'].lstrip("/") for home in homenode]
    try:
        locate = soup.find_all('li', attrs={'class': 'l_pager pager_theme_5 pb_list_pager'})[0]
    except IndexError:
        # No pager element on the page: treat it as a single-page topic.
        print(url)
        return unique(linklst), []
    alst = locate.find_all('a', attrs={"href": True})
    if not alst:
        return unique(linklst), []
    # The last pager anchor is taken as the link to the final page; the
    # page count is the text after the last "=" in its href.
    href = alst[-1]['href']
    base, _, last = href.rpartition('=')
    try:
        pagenum = int(last)
    except ValueError:
        print(url)
        return unique(linklst), []
    pagelst = [prefix + "/" + base.lstrip("/") + "=" + str(i)
               for i in range(2, pagenum + 1)]
    return unique(linklst), pagelst

def pagesget(page_lst):
    """Collect user home-page links from every page in ``page_lst``.

    Args:
        page_lst: Iterable of page URLs to download, one request per
            page with a one-second pause after each.

    Returns:
        A set with the absolute home-page URLs found on all pages; an
        empty set when ``page_lst`` is empty.
    """
    prefix = "http://tieba.baidu.com"
    p_home = re.compile(r'/home/main?.*')
    totalset = set()
    for page in page_lst:
        web_doc = urllib2.urlopen(page).read()
        time.sleep(1)
        print("pagesget,sleeping...")
        soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
        homenode = soup.find_all('a', attrs={'href': p_home, 'target': '_blank'})
        # Merge inside the loop so that every page contributes, not only
        # the last one; join with exactly one slash.
        totalset.update(prefix + "/" + home['href'].lstrip("/") for home in homenode)
    return totalset

def topicenter(url="http://tieba.baidu.com/f?kw=%D6%D0%B9%FA%D2%A9%BF%C6%B4%F3%D1%A7"):
    """Extract the topic links from the first page of a forum.

    Args:
        url: URL of the forum's topic list. The default selects one
            specific forum through its ``kw`` query parameter.

    Returns:
        A tuple ``(url_lst, theme_page_links)``: ``url_lst`` holds the
        absolute URLs of the topics found on the page, and
        ``theme_page_links`` holds the ``href`` values of the pagination
        anchors exactly as they appear in the page.
    """
    web_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    p = re.compile(r'/p/\d{10}')
    theme_url = soup.find_all('a', attrs={'href': p, 'title': True, 'target': '_blank', 'class': 'j_th_tit'})
    prefix = "http://tieba.baidu.com"
    url_lst = []
    # set() drops duplicate anchors before the links are completed.
    for node in set(theme_url):
        link = prefix + node['href']
        url_lst.append(link)
        print("current theme is:")
        print(link)
    theme_page = soup.find_all('a', attrs={'class': ' pagination-item '})
    theme_page_links = [node['href'] for node in theme_page]
    return url_lst, theme_page_links

def get_themes(theme_page_links):
    """Extract topic links from the further pages of the topic list.

    Args:
        theme_page_links: URLs of the topic-list pages to download (the
            pages after the first one).

    Returns:
        A list with the absolute URLs of the topics found on those
        pages. An empty input yields an empty list, so the result can
        always be passed to ``set()``.
    """
    if not theme_page_links:
        return []
    p = re.compile(r'/p/\d{10}')
    prefix = "http://tieba.baidu.com"
    url_lst = []
    for page_url in theme_page_links:
        web_doc = urllib2.urlopen(page_url).read()
        print("sleeping......")
        print("theme_url= %s" % page_url)
        time.sleep(1)
        soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
        theme_nodes = soup.find_all('a', attrs={'href': p, 'title': True, 'target': '_blank', 'class': 'j_th_tit'})
        # set() drops duplicate anchors before the links are completed.
        url_lst.extend(prefix + node['href'] for node in set(theme_nodes))
    return url_lst

# --- Crawl driver -----------------------------------------------------------
# Step 1: topic links from the first topic-list page plus the pages that
# its pagination points to.
themes, theme_pages = topicenter()
otherthemes = get_themes(theme_pages)
# "or []" keeps this working if get_themes reports "nothing" as None.
pages = set(themes) | set(otherthemes or [])
user_url = set()                        # home-page links of all users found
k = 0

# Step 2: visit every topic and gather the home-page links of its posters.
for pg in pages:
    # Users on the topic's first page and the URLs of its remaining pages.
    curruser, pagetalk = homeget(pg)
    try:
        nextuser = pagesget(pagetalk)   # users on the remaining pages
    except Exception as err:
        # Stop collecting on the first failure, but say why.
        print("pagesget_error: %s" % err)
        break
    themeuser = set(curruser) | nextuser    # all users of this topic
    user_url = user_url | themeuser         # merge into the global set
    print("current number is  %s" % len(user_url))
    # Stop once more than 902 links are collected.
    if len(user_url) > 902:
        break

print("the number of active username in baidutieba is:  %s" % len(user_url))

# Step 3: fetch every home page and keep one record per user name.
user_dic = {}
for i in user_url:
    user = urlget(i)
    if user.username not in user_dic:
        user_dic[user.username] = {
            'url': user.url,
            'concern_num': user.concern_num,
            'fans_num': user.fans_num,
            'age': user.age,
            'tie': user.tie,
            'sex': user.sex,
            'concern_link': user.concern_link,
            'fans_link': user.fans_link,
        }

运行爬虫抓取信息要花十几分钟的时间，每次都记录主页连接集合的数量增长，发现在增长至905人后就停止了。而且贴吧的话题页的下一页连接虽然被获取，但是在打开后会跳转到第一页，这个会导致信息的重复爬取，浪费时间。因此，在905个连接之后就停止了爬取。打开主页爬取各个信息存放在一个对象中。由于中文支持不是很给力（之前是写入到记事本中，这次写入csv失败了），把输出结果复制在记事本中。写一个程序整理一下输出结果：

#coding:utf-8
from pandas import *

def _parse_tie(text):
    """Convert a post-count text to a float, or return None if impossible.

    A plain number is returned as is. Any other text is assumed to be a
    number followed by a ten-thousand suffix: its leading numeric part
    is multiplied by 10000.
    """
    import re  # local import: this script has no file-level "import re"

    try:
        return float(text)
    except ValueError:
        pass
    match = re.match(r'[0-9]+(?:\.[0-9]+)?', text)
    if match is None:
        return None
    return float(match.group()) * 10000.0


def loadfile(filename):
    """Parse a saved crawl log into one record per user.

    The log consists of ``name= value`` lines. A line starting with
    ``username`` opens a new user; the following ``concern_num=``,
    ``fans_num=``, ``age=`` and ``tie=`` lines and a bare ``male`` or
    ``female`` line fill in that user's record. All other lines (URLs,
    error markers, printed tags, headings) are ignored.

    Args:
        filename: Path of the log file.

    Returns:
        A dict keyed by the user number as a string. Key ``'0'`` holds
        whatever was read before the first user (normally ``{}``); keys
        ``'1'``..``'N'`` hold dicts with ``userno``, ``concernum``,
        ``age``, ``tie``, ``sex`` and ``fans_num``. The dict is empty
        when the log contains no user at all.
    """
    userinfor = {}
    dic = {}
    order = 0
    with open(filename) as f:
        for line in f:
            i = line.strip()
            # Dispatch on the line prefix, so that a user name containing
            # "age", "tie", "male", ... cannot be mistaken for a data line.
            if i.startswith('username'):
                # A new user starts: store the finished record first.
                userinfor[str(order)] = dic
                order += 1
                dic = {'userno': order, 'concernum': 0, 'age': 0, 'tie': 0,
                       'sex': 'unknown', 'fans_num': 0}
            elif i.startswith('concern_num='):
                dic['concernum'] = float(i.split('=', 1)[1].strip())
            elif i.startswith('fans_num='):
                dic['fans_num'] = float(i.split('=', 1)[1].strip())
            elif i.startswith('age='):
                try:
                    dic['age'] = float(i.split('=', 1)[1].strip())
                except ValueError:
                    # Unparseable age: report the line, keep the default.
                    print(i)
            elif i.startswith('tie='):
                print(i)
                tie = _parse_tie(i.split('=', 1)[1].strip())
                if tie is not None:
                    dic['tie'] = tie
            elif i in ('male', 'female'):
                dic['sex'] = i
    if order:
        # The last user is not followed by another "username" line, so
        # its record has to be stored here.
        userinfor[str(order)] = dic
    return userinfor

# Turn the saved crawl log into a CSV file with one row per user.
filename = 'E://information.txt'
dic = loadfile(filename)
# Key '0' is the placeholder stored before the first user; the default
# avoids a KeyError when the log contained no user at all.
dic.pop('0', None)
# Records are the columns of the frame; transpose to get one row per user.
df = DataFrame(dic).T
df.to_csv("E://infor.csv")

最后整理出来的信息是一个csv文件，一行是一个人的记录，使用编号表示。属性是吧龄，关注人的数量，粉丝数量，性别和帖子数量。使用R语言读取该csv文件进行分析。主要是分析男性和女性在一些信息上是否存在差异。使用帖子数量除以吧龄的值表示发帖的频率。

贴吧上的男女比例: