在猎聘网站上搜索关键字“大数据”，搜索结果最多只能显示 100 页；下面的脚本爬取这 100 页中的职位信息，以便后续做分析。
"""Scrape Liepin (liepin.com) job-search results for the keyword "大数据".

The site exposes at most 100 result pages for a search.  For every page the
script extracts the position title, the job conditions, the publish time and
the company name of each listing and appends them to ``job_data/job.csv``
next to this file.
"""
__author__ = 'Fred Zhao'

import csv
import os

import requests
from bs4 import BeautifulSoup


class JobSearch():
    """Fetch Liepin search-result pages and store the listings as CSV rows."""

    def __init__(self):
        # A browser-like User-Agent is sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/55.0.2883.95 Safari/537.36'}
        # Search URL; the page index is appended after the trailing
        # "curPage=".  "&degradeFlag" had been corrupted to "°radeFlag" by
        # HTML-entity decoding ("&deg" -> "°"), which merged two query
        # parameters into one; it is restored here.
        self.base_url = (
            'https://www.liepin.com/zhaopin/?ckid=c1a868fa8b83aa5b'
            '&fromSearchBtn=2&init=-1'
            '&sfrom=click-pc_homepage-centre_searchbox-search_new'
            '&degradeFlag=0&key=大数据&headckid=c1a868fa8b83aa5b'
            '&d_pageSize=40'
            '&siTag=LGV-fc5u_67LtFjetF6ACg~fA9rXquZc5IkJpXC-Ycixw'
            '&d_headId=8e7325814e7ed9919787ee3fe85e1c94'
            '&d_ckId=8e7325814e7ed9919787ee3fe85e1c94'
            '&d_sfrom=search_fp&d_curPage=99&curPage=')
        # Absolute path: makedir() changes the working directory, so a
        # relative base path would resolve differently on every call and
        # create job_data/job_data/... nested directories.
        self.base_path = os.path.dirname(os.path.abspath(__file__))

    def makedir(self, name):
        """Ensure directory *name* exists under the script directory and
        make it the current working directory.

        :param name: directory name, relative to the script's directory.
        """
        path = os.path.join(self.base_path, name)
        if not os.path.exists(path):
            # exist_ok guards against the directory appearing between the
            # existence check and the creation.
            os.makedirs(path, exist_ok=True)
            print("File has been created.")
        else:
            print('OK!The file is existed. You do not need create a new one.')
        # save_to_csv() opens its file relative to the working directory.
        os.chdir(path)

    def request(self, url, timeout=10):
        """Send a GET request for *url* with the configured headers.

        :param url: absolute URL to fetch.
        :param timeout: seconds to wait before giving up; without a timeout
            ``requests`` may block indefinitely on a stalled connection.
        :return: the ``requests.Response`` object.
        """
        return requests.get(url, headers=self.headers, timeout=timeout)

    @staticmethod
    def _parse_item(item):
        """Extract ``[position, job_condition, job_time, company]`` from one
        ``<li>`` of the result list, or return ``None`` when the element does
        not have the expected structure."""
        try:
            main = item.find('div', class_='sojob-item-main clearfix')
            job_info = main.find('div', class_='job-info')
            position = job_info.find('h3').get('title')
            paragraphs = job_info.find_all('p')
            job_condition = paragraphs[0].get('title')
            job_time = paragraphs[1].find('time').get('title')
            company_info = main.find('div', class_='company-info')
            company = (company_info.find('p', class_='company-name')
                       .find('a').get('title'))
        except (AttributeError, IndexError):
            # A find() returned None or a <p> is missing: this <li> is not
            # a regular job entry, so skip it instead of aborting the page.
            return None
        return [position, job_condition, job_time, company]

    def get_detail(self, page):
        """Fetch result page *page*, print every listing found and append
        the listings to the CSV file.

        :param page: page index appended to the search URL (the driver
            below passes it as a string; other values are converted).
        """
        r = self.request(self.base_url + str(page))
        ul = BeautifulSoup(r.text, 'lxml').find('ul', class_='sojob-list')
        if ul is None:
            # No result list in the response; nothing to parse.
            print('No job list found on page %s, skipped.' % page)
            return
        self.makedir('job_data')
        rows = []
        for item in ul.find_all('li'):
            row = self._parse_item(item)
            if row is None:
                continue
            for field in row:
                print(field)
            rows.append(row)
        self.save_to_csv(rows)

    def save_to_csv(self, rows):
        """Append *rows* to ``job.csv`` in the current working directory.

        :param rows: iterable of row sequences to write.
        """
        # newline='' is required by the csv module (otherwise blank lines
        # appear on Windows); an explicit encoding keeps the Chinese text
        # writable regardless of the platform's default encoding.
        with open('job.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(rows)


if __name__ == '__main__':
    job = JobSearch()
    # Request page indices 0..99 (the site shows at most 100 pages).
    for page in range(0, 100):
        job.get_detail(str(page))
原文地址:https://www.cnblogs.com/fredkeke/p/9409560.html
时间: 2024-11-09 01:01:28