1. Modify the request header
Put the user-agent strings in a standalone .py file, getheaders, which returns a randomly chosen header on each call.
The getheaders file:
import random

headerstr = """Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"""


def headers():
    # Split the list above on newlines and return one entry at random
    header = headerstr.split("\n")
    length = len(header)
    return header[random.randint(0, length - 1)]
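A quick sanity check of the module (a hypothetical snippet, assuming getheaders.py is on the import path):

# check_headers.py -- print a few randomly picked user-agent strings
from getheaders import headers

for _ in range(3):
    print headers()

random.choice(headerstr.split("\n")) would pick a line just as well; indexing with randint simply spells the same choice out.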
2. Main file

# coding:utf-8
from bs4 import BeautifulSoup
import urllib2
from getheaders import headers
from json import loads
import re
import os


def reqobject():
    # 1. Instantiate a request object; nothing has been fetched yet
    req = urllib2.Request("https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8")
    # 2. Decorate the request object: add the user-agent header
    req.add_header('user-agent', headers())
    return req


def getUrlList():
    # Fetch all the user records on the page
    req = reqobject()
    # Decorate the object again: add the form data
    req.add_data('q&viewFlag=A&sortType=default&searchStyle=&searchRegion=city%3A&searchFansNum=&currentPage=1&pageSize=100')
    # 3. Send the request, then decode + re-encode the body:
    # decode('gbk') decodes the GBK byte string into Unicode. The codec must match
    # the response header Content-Type: text/html;charset=GBK; if that header is
    # missing, check the page source for <meta charset="gbk" />.
    # encode('utf-8') encodes the Unicode string as UTF-8; only Unicode can be
    # encoded. (A standalone round-trip demo of this appears after the listing.)
    html = urllib2.urlopen(req).read().decode('gbk').encode('utf-8')
    # 4. html is a JSON document; load it into a dict for easy access
    json_dict = loads(html)
    # 5. Return the list of users
    return json_dict['data']['searchDOList']


def getInfo(userid):
    # Fetch the user's "她的爱秀" (aiShow) profile page
    req = urllib2.Request("https://mm.taobao.com/self/aiShow.htm?&userId=%s" % userid)
    req.add_header('user-agent', headers())
    html = urllib2.urlopen(req).read().decode('gbk').encode('utf-8')
    return html


def getNeedInf(html, userid):
    # Extract the fields we need from the profile page
    soup = BeautifulSoup(html, 'html.parser')
    name = soup.select('dl > dd > a')[0].text.encode('utf-8')
    follow = soup.select('dl > dt > a')[1].text.encode('utf-8')
    fens = soup.select('dl > dt > a')[2].text.encode('utf-8')
    detail = soup.find('div', attrs={'id': 'J_ScaleImg'}).get_text().strip().encode('utf-8')
    content = "姓名:{} 关注:{} 粉丝:{}\n{}".format(name, follow, fens, detail)
    if not os.path.exists("images\\" + str(userid)):
        os.mkdir("images\\" + str(userid))
    print 'Start downloading...'
    print 'getInf:{}'.format(str(userid))
    with open("images\\{}\\{}.txt".format(str(userid), str(userid)), 'wb') as f:
        f.write(content)


def getAlbumList(userid):
    # Fetch the user's albums and each album's cover-photo link
    req = urllib2.Request("https://mm.taobao.com/self/album/open_album_list.htm?&user_id%20=" + str(userid))  # album-list link
    req.add_header('user-agent', headers())
    html = urllib2.urlopen(req).read().decode('gbk').encode('utf-8')
    # Link of every album
    rel = r'class="mm-first" href="//(.*?)"'
    AlbumListurl = re.findall(rel, html)
    # Cover link of every album, used to download the cover image.
    # Link as scraped: img.alicdn.com/imgextra/i1/176817195/TB1jFcMKFXXXXblXFXXXXXXXXXX_!!0-tstar.jpg_240x240xz.jpg
    # Link we need:    img.alicdn.com/imgextra/i1/176817195/TB1jFcMKFXXXXblXFXXXXXXXXXX_!!0-tstar.jpg_620x10000.jpg
    # They differ only in the suffix (jpg_240x240xz.jpg vs. jpg_620x10000.jpg), so the
    # suffix is kept outside the capture group: the group matches up to
    # "...!!0-tstar." and "jpg_620x10000.jpg" is appended below. (This step is
    # shown on its own after the listing.)
    rel = r'<img src="//(.*?)jpg_240x240xz.jpg" width="125" height="125">'
    AlbumListCoverurl = re.findall(rel, html)
    getAlbumListCoverurl = []
    for url in AlbumListCoverurl:
        url += "jpg_620x10000.jpg"
        url = "http://" + url
        getAlbumListCoverurl.append(url)
    return getAlbumListCoverurl


def getimages(userid, urls):
    # Download every image through its link, e.g.
    # http://img.alicdn.com/imgextra/i3/865838484/TB1_n_XKVXXXXb5XXXXXXXXXXXX_!!865838484-0-tstar.jpg_620x10000
    i = 1
    for url in urls:
        req = urllib2.Request(url)
        req.add_header('user-agent', headers())
        html = urllib2.urlopen(req).read()
        with open('images\\{}\\{}.jpg'.format(str(userid), str(i)), 'wb') as f:
            f.write(html)
        print "getImage:", url
        i += 1
    print "End of download..."
if not os.path.exists("images"):
    os.mkdir("images")

for user in getUrlList():
    try:
        userid = user['userId']
        html = getInfo(userid)
        getNeedInf(html, userid)
        urls = getAlbumList(userid)
        getimages(userid, urls)
    except urllib2.URLError, e:
        print e.reason
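The decode/encode step in getUrlList can be verified in isolation. A minimal round-trip sketch under Python 2 semantics (the sample character is my own, not from the post):

# coding:utf-8
u = u'淘'                                              # a Unicode string
gbk_bytes = u.encode('gbk')                            # Unicode -> GBK bytes, like the raw page body
utf8_bytes = gbk_bytes.decode('gbk').encode('utf-8')   # GBK bytes -> Unicode -> UTF-8 bytes
print utf8_bytes == u.encode('utf-8')                  # True: the round trip preserves the text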
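Likewise, the suffix swap in getAlbumList on its own, fed a made-up HTML fragment built around the example link from the comments:

import re

# hypothetical fragment in the shape open_album_list.htm returns
html = ('<img src="//img.alicdn.com/imgextra/i1/176817195/'
        'TB1jFcMKFXXXXblXFXXXXXXXXXX_!!0-tstar.jpg_240x240xz.jpg" width="125" height="125">')

rel = r'<img src="//(.*?)jpg_240x240xz.jpg" width="125" height="125">'
for prefix in re.findall(rel, html):
    # prefix ends at "...!!0-tstar."; append the full-size suffix
    print "http://" + prefix + "jpg_620x10000.jpg"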
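Note that urllib2 and the except urllib2.URLError, e syntax exist only under Python 2. A sketch of the same getUrlList() fetch ported to Python 3 (my adaptation, not part of the original) would use urllib.request:

# Python 3 port of the getUrlList() fetch; headers() is the same getheaders helper
import json
import urllib.request

from getheaders import headers

url = "https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8"
data = b"q&viewFlag=A&sortType=default&searchStyle=&searchRegion=city%3A&searchFansNum=&currentPage=1&pageSize=100"
req = urllib.request.Request(url, data=data, headers={"user-agent": headers()})
html = urllib.request.urlopen(req).read().decode("gbk")  # Python 3 str is Unicode; no re-encode needed
json_dict = json.loads(html)
users = json_dict["data"]["searchDOList"]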