Scraping 51job job listings with Python

The Python 2 script below walks every page of a 51job search result, pulls the job title, company, location, salary, and posting date out of each listing with a regular expression, prints each record, and appends it to a text file.

# -*- coding: utf-8 -*-
# @Time    : 2018/3/1 16:38
# @Author  : HT
# @Email   : [email protected]
# @File    : 51job.py
# @Software: PyCharm
import urllib
import re
import sys
reload(sys)  # Python 2 only: restore setdefaultencoding, which site.py removes
sys.setdefaultencoding('utf8')  # so unicode strings write to the file as UTF-8

i = 0  # running total of scraped entries
def url_input(url):
    # fetch a result page; 51job served its pages GBK-encoded at the time
    get_html = urllib.urlopen(url)
    read_html = get_html.read().decode('gbk')
    return read_html

def find_data(html):
    # capture groups: job title, company name, location, salary, posting date
    reg = re.compile(r'class="t1 ">.*?<a target="_blank" title="(.*?)".*?<span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>', re.S)
    items = re.findall(reg, html)
    return items

def find_all_page(html):
    # grab the "共x页" (x pages in total) text next to the page-jump input box
    reg = re.compile(r'<span class="td">(.*?)</span><input id="jump_page" class="mytxt" type="text" value="1"/>', re.S)
    page_all = re.findall(reg, html)
    num = re.sub(r"\D", "", page_all[0])  # keep only the digits: '共5页' -> '5'
    return num

def data_to_txt(text):
    # append one formatted record to the output file
    # (parameter renamed from the original 'str', which shadowed the built-in)
    with open(u"51job北上广深python.txt", 'a+') as f:
        f.write(text)

def print_items(data_items):
    global i
    for data in data_items:
        job = data[0]
        company = data[1]
        address = data[2]
        wages = data[3]
        date = data[4]
        i = i + 1
        str1 ="["+str(i)+"] "+ job+"--"+company+"--"+address+"--"+wages+"--"+date+"\n"
        data_to_txt(str1)
        print(str1)

def urlformat(urlstart):
    # turn the page-1 URL into a template: '...,2,1.html?...' -> '...,2,{}.html?...'
    url = re.sub(r'1\.html', '{}.html', urlstart)
    return url

def get_page_html(page_num, urlstart):
    urls = []
    # result pages are numbered from 1, so iterate 1..page_num inclusive
    # (the original looped over range(page_num), which starts at page 0)
    for page in range(1, page_num + 1):
        url = urlformat(urlstart).format(page)
        urls.append(url)
    return urls
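
# Example: with all_page_num == 3 and the page-1 search URL, get_page_html
# returns the URLs for pages 1, 2 and 3:
#   get_page_html(3, 'http://search.51job.com/list/...,2,1.html?lang=c...')
#   -> ['...,2,1.html?...', '...,2,2.html?...', '...,2,3.html?...']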

if __name__ == '__main__':
    # Python development engineer
    #urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,Python%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # embedded development
    #urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25B5%258C%25E5%2585%25A5%25E5%25BC%258F%25E5%25BC%2580%25E5%258F%2591,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # cloud computing
    #urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E4%25BA%2591%25E8%25AE%25A1%25E7%25AE%2597,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # machine learning
    #urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # artificial intelligence
    #urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # autonomous driving
    #urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E9%25A9%25BE%25E9%25A9%25B6,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # Python jobs in Beijing/Shanghai/Guangzhou/Shenzhen (the query actually used)
    urlstart = 'http://search.51job.com/list/010000%252C040000%252C020000%252C030200,000000,0000,00,9,99,python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    html = url_input(urlstart)
    all_page_num = int(find_all_page(html))
    print("+++++++++++++++++%s++++++++++++++++++++"%(all_page_num))
    urllist = get_page_html(all_page_num,urlstart)
    for url in urllist:
        html = url_input(url)
        data_items = find_data(html)
        print_items(data_items)
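
The script above is Python 2 only: urllib.urlopen and the reload(sys)/sys.setdefaultencoding hack were both removed in Python 3. As a rough porting guide, here is a minimal Python 3 sketch of just the fetch-and-decode step, assuming the pages are still served GBK-encoded (51job's markup and URL scheme have changed since 2018, so the regexes above would need updating as well):

# Python 3 sketch of url_input() above; assumes the response is still GBK
from urllib.request import urlopen

def url_input(url):
    with urlopen(url) as resp:
        # errors='replace' guards against stray bytes that are not valid GBK
        return resp.read().decode('gbk', errors='replace')

On the output side, Python 3 makes the encoding hack unnecessary: open the file with open(u"51job北上广深python.txt", "a", encoding="utf-8") and write str objects directly.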




Original article: https://www.cnblogs.com/acer-haitao/p/8490810.html

