import requests
import re      # regular expressions
import time
import pandas  # for saving the results to CSV
#header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
          'Cookie': 'JSESSIONID=ABAAABAAADEAAFI1E0F9E93B802B158B671ED843BED6DE5; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511754333; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511765381; _ga=GA1.2.1259821048.1511754333; user_trace_token=20171127114539-6f956704-d325-11e7-ac7d-525400f775ce; LGRID=20171127144946-28372596-d33f-11e7-9a81-5254005c3644; LGUID=20171127114539-6f956cbc-d325-11e7-ac7d-525400f775ce; _gid=GA1.2.644825101.1511754336; X_HTTP_TOKEN=2eb2d7bfeb14d998ae1bc4ce0efdc0f8; _putrc=59B1D3CEDBE5250A; login=true; unick=%E6%9C%B1%E4%B8%9C%E5%8D%8E; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; index_location_city=%E5%B9%BF%E5%B7%9E; TG-TRACK-CODE=search_code; SEARCH_ID=826f4d81a0324508892895d9400bffab',
          'Host': 'www.lagou.com'}
# request headers that make the request look like a normal browser visit
url = 'https://www.lagou.com/zhaopin/4/?filterOption=4'
html = requests.request('GET', url, headers=header).text
# request the Lagou listing URL and grab its response text
ren = re.compile(r'data-salary="(.*?)" data-company="(.*?)" data-positionname="(.*?)" href="(.*?)" ', re.S)
# regular expression that captures salary, company, position name and link
data = pandas.DataFrame(re.findall(ren, html))  # one page of results as a DataFrame
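# A quick sanity check (not in the original script): assuming the page returned
# matches, give the four captured groups descriptive column names and preview
# the first rows before crawling every page.
data.columns = ['salary', 'company', 'position', 'link']
print(data.head())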
# crawl multiple listing pages and append each one to the CSV
for ii in range(1, 50):
    new_url = 'https://www.lagou.com/zhaopin/' + str(ii)
    time.sleep(2)  # pause between requests
    html = requests.request('GET', new_url, headers=header)
    data = pandas.DataFrame(re.findall(ren, html.text))
    data.to_csv('C:\\Users\\Administrator\\Desktop\\python\\lagou1.csv', header=False, index=False, mode='a+')
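# Optional (not part of the original script): the column splitting described
# below, done in Excel, could also be done directly in pandas. A minimal
# sketch, assuming the CSV holds the four captured fields in the order
# salary, company, position, link, and that salaries look like "10k-20k";
# the output file name lagou1_split.csv is hypothetical.
result = pandas.read_csv('C:\\Users\\Administrator\\Desktop\\python\\lagou1.csv',
                         header=None, names=['salary', 'company', 'position', 'link'])
result[['salary_min', 'salary_max']] = result['salary'].str.split('-', expand=True)
result.to_csv('C:\\Users\\Administrator\\Desktop\\python\\lagou1_split.csv', index=False)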
After splitting the columns in Excel, the result looks like this: