Python 3 crawler: fetching Lagou job listings with requests
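The script below uses a requests.Session to first GET Lagou's job-list page (so the session picks up the cookies and the Referer URL that are later sent with the Ajax request), then POSTs to the positionAjax.json endpoint page by page and saves the parsed job fields either as JSON lines in a text file or as an Excel sheet built with tablib.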

import requests, json, time, tablib

def send_ajax_request(data: dict):
    """POST one page's form data to the Ajax endpoint and return the parsed JSON, or {} on failure."""
    try:
        ajax_response = session.post(url=ajax_url,
                                     params={"needAddtionalResult": "false", "city": city},
                                     data=data,
                                     headers=ajax_headers,
                                     timeout=timeout)
        if ajax_response.status_code == 200:
            return ajax_response.json()
        return {}
    except Exception:
        return {}

def get_job_info(info_dic: dict):
    """Yield one record per position; an empty or failed response yields nothing."""
    jobInfoList = ((info_dic.get("content") or {}).get("positionResult") or {}).get("result") or []

    for jobInfoDict in jobInfoList:
        dic = {}
        dic["companyId"] = jobInfoDict.get("companyId")
        dic["companyFullName"] = jobInfoDict.get("companyFullName")
        dic["positionName"] = jobInfoDict.get("positionName")
        dic["workYear"] = jobInfoDict.get("workYear")
        dic["education"] = jobInfoDict.get("education")
        dic["salary"] = jobInfoDict.get("salary")
        dic["jobNature"] = jobInfoDict.get("jobNature")
        dic["companySize"] = jobInfoDict.get("companySize")
        dic["city"] = jobInfoDict.get("city")
        dic["district"] = jobInfoDict.get("district")
        dic["createTime"] = jobInfoDict.get("createTime")
        if is_save_txtfile:
            yield json.dumps(dic, ensure_ascii=False)
        else:
            yield list(dic.values())

def save_to_file(json_data):
    """Write each JSON line to the open text file."""
    for data in json_data:
        f.write(data + "\n")

def save_to_excel(list_data):
    """Append each row of values to the tablib Dataset."""
    for line in list_data:
        dataset.append(line)

def run():
    # Lagou returns the listings in pages; walk the first 30 pages of results.
    for i in range(1, 31):
        data = {
            "first": "false",
            "pn": i,           # page number
            "kd": job_name     # search keyword
        }
        info_dic = send_ajax_request(data)
        data = get_job_info(info_dic)
        if is_save_txtfile:
            save_to_file(data)
        else:
            save_to_excel(data)
        print("Saving data")
        time.sleep(sleeptime)  # pause between requests to reduce the chance of being blocked

if __name__ == "__main__":
    session = requests.Session()
    job_name = "python"
    city = "成都"
    timeout = 5
    sleeptime = 10
    doc_url = "https://www.lagou.com/jobs/list_{job_name}".format(job_name=job_name)
    session.headers["User-Agent"] = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                                     "(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36")
    session.headers["Host"] = "www.lagou.com"

    # Visit the normal list page first so the session picks up the cookies Lagou expects.
    doc_response = session.get(url=doc_url, params={"city": city})

    ajax_headers = {
        "Origin": "https://www.lagou.com",
        "Referer": doc_response.url
    }

    ajax_url = "https://www.lagou.com/jobs/positionAjax.json"

    is_save_txtfile = False

    if not is_save_txtfile:
        dataset = tablib.Dataset()
        dataset.headers = ["companyId", "companyFullName", "positionName", "workYear",
                           "education", "salary", "jobNature", "companySize", "city",
                           "district", "createTime"]
    else:
        f = open("jobinfo.txt", "a", encoding="utf-8")
    try:
        run()
    except Exception:
        print("Something went wrong")
    finally:
        if is_save_txtfile:
            f.close()
        else:
            # dataset.xls renders the collected rows as an Excel workbook.
            with open("jobInfo.xls", "wb") as excel_file:
                excel_file.write(dataset.xls)
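If writing .xls is not desired, tablib can also render the same Dataset as CSV. The sketch below is a hypothetical alternative to the export above, not part of the original script; the save_to_csv name and the jobInfo.csv filename are made up, while dataset is the same tablib.Dataset built earlier.

import tablib

def save_to_csv(dataset: tablib.Dataset, path: str = "jobInfo.csv") -> None:
    # Hypothetical helper: export("csv") returns the rows as CSV text,
    # so no Excel writer backend is needed.
    with open(path, "w", encoding="utf-8", newline="") as csv_file:
        csv_file.write(dataset.export("csv"))

Keeping the export behind a small function like this would also make it easier to switch output formats without touching the collection code.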

Original article: https://www.cnblogs.com/zhuchunyu/p/10765945.html
