获取安居客小区信息

# -*- coding: utf-8 -*-
"""
Created on Sat Jun 24 22:03:17 2017

@author: willowj
"""
import sys
# Keep references to the standard streams: on Python 2, reload(sys) can reset
# them (for example under IDEs that install their own stream objects), so they
# are restored right after the reload.
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
if sys.version_info[0] < 3:
    # Python 2 only: make UTF-8 the process-wide default encoding so implicit
    # str/unicode conversions of the scraped Chinese text do not fail.
    # sys.setdefaultencoding is removed from the module at interpreter
    # start-up and only becomes available again after reload(sys).
    # Python 3 already defaults to UTF-8 and has no builtin reload().
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
    sys.setdefaultencoding('utf8')

import requests
#import codecs
import pandas as pd
import numpy as np
from lxml import html
import math
import time
import re
#安居客 小区

#广州

# Default HTTP request headers, mimicking a desktop Chrome browser visiting the
# Guangzhou community listing.
# A 'cookie' entry copied from a real browser session may be added here if the
# site starts demanding one; '"Connection": "close"' can be added to disable
# keep-alive.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, sdch',
    'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'cache-control': 'max-age=0',
    'referer': 'https://guangzhou.anjuke.com/community/',
    'Upgrade-Insecure-Requests': "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
}

# Pool of browser User-Agent strings; one is picked at random for each page
# request made by the scraper.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
]

def not_nulist_first(list_):
    """Return the first element of ``list_``, or None when ``list_`` is empty.

    Any falsy argument (empty list, None, ...) yields None.
    """
    return list_[0] if list_ else None

def get_comnuities(page_response_):
    """Parse the community entries out of one listing-page response.

    Args:
        page_response_: response object of a listing page; only its ``url``
            and ``text`` attributes are read.

    Returns:
        A list with one row per ``<div _soj="xqlb">`` element, each row being
        ``[name, price_perm2, established_date, address, community_web]``.
        ``price_perm2`` is None when the entry has no ``<strong>`` text.

    Raises:
        IndexError: if an entry lacks a title, link, address or date node.
    """
    print('%s getting' % page_response_.url)
    etree = html.document_fromstring(page_response_.text)
    communities = etree.xpath('//div[@_soj="xqlb"]')

    page_communities1 = []
    for community in communities:
        name = community.xpath("./a/@title")[0]
        # Links are resolved against the module-level ``init_url``.
        community_web = html.urljoin(init_url, community.xpath("./a/@href")[0])
        address = community.xpath(".//address/text()")[0].strip()
        established_date = community.xpath('.//p[@class="date"]/text()')[0]
        # The price node is optional, so fall back to None instead of raising.
        price_perm2 = not_nulist_first(community.xpath('.//strong/text()'))

        page_communities1.append(
            [name, price_perm2, established_date, address, community_web])

    # Progress output: name of the last community on the page. Guarded because
    # a page without listings leaves no name to print.
    if page_communities1:
        print(page_communities1[-1][0])
    return page_communities1

class AnjukeCommunity(object):
    """Scraper for the paginated community listing of an Anjuke city site.

    Attributes:
        init_url: URL of the first listing page.
        requ: the ``requests`` session used for every request.
        pages_max: number of listing pages, computed once in ``__init__``.
    """

    def __init__(self, init_url=None):
        """Open a session and look up how many listing pages exist.

        Args:
            init_url: URL of the first listing page. Defaults to the
                Guangzhou community listing when omitted.
        """
        self.init_url = init_url or 'https://guangzhou.anjuke.com/community/'
        self.requ = requests.session()
        # Retry failed connections up to 5 times. Retries have to be set on
        # the transport adapters mounted on the session to take effect.
        retrying_adapter = requests.adapters.HTTPAdapter(max_retries=5)
        self.requ.mount('http://', retrying_adapter)
        self.requ.mount('https://', retrying_adapter)
        # NOTE: this instance attribute deliberately shadows the method of the
        # same name, so ``obj.pages_max`` is an int after construction.
        self.pages_max = self.pages_max()

    def pages_max(self):
        """Fetch the first listing page and return the total page count.

        The count is the total number of communities shown on the page divided
        by the number of entries on that page, rounded up.

        Raises:
            ValueError: if the page shows no total or no community entries,
                so the page count cannot be derived.
        """
        page_response = self.requ.get(self.init_url, headers=headers)
        init_etree = html.document_fromstring(page_response.text)
        totals = init_etree.xpath('//span[@class="tit"]/em/text()')
        communities = init_etree.xpath('//div[@_soj="xqlb"]')
        if not totals or not communities:
            raise ValueError(
                'no community listing found at %s' % page_response.url)
        community_Nums = int(totals[-1])
        return int(math.ceil(float(community_Nums) / len(communities)))

    def get_all_comnunities(self, delay=20):
        """Download and parse every listing page.

        Args:
            delay: seconds to wait after each page request (default 20).

        Returns:
            A list of community rows as produced by ``get_comnuities``,
            accumulated over pages ``1 .. self.pages_max``.
        """
        page_communities = []
        # Strip the trailing slash so page URLs do not contain '//'.
        base_url = self.init_url.rstrip('/')

        for page in range(1, self.pages_max + 1):
            url_ = '%s/p%s/' % (base_url, page)
            # Use a random User-Agent per request, on a copy so the shared
            # module-level ``headers`` dict is left untouched.
            request_headers = dict(headers)
            request_headers["User-Agent"] = str(np.random.choice(USER_AGENTS))
            page_response_ = self.requ.get(url_, headers=request_headers)
            print('%s %s start' % (page_response_.url,
                                   page_response_.status_code))

            # Parse the page data before releasing the connection.
            page_communities.extend(get_comnuities(page_response_))

            page_response_.close()
            page_response_.connection.close()
            time.sleep(delay)

        return page_communities

# Scrape every listing page of the Guangzhou community site.
init_url = 'https://guangzhou.anjuke.com/community/'
Anjuke_guangzhou = AnjukeCommunity(init_url)
page_communities = Anjuke_guangzhou.get_all_comnunities()

# Quick sanity check: show the name of the first scraped community.
if page_communities:
    print(page_communities[0][0])

# Tidy the rows with pandas.
communities_pd = pd.DataFrame(
    page_communities,
    columns=['name', 'price_perm2', 'established_date', 'address', 'community_web'])
# Missing or non-numeric prices become NaN instead of aborting the run.
communities_pd['price_perm2'] = pd.to_numeric(
    communities_pd['price_perm2'], errors='coerce')
# Price bucket: lower bound of the 5000-wide band the price falls into.
communities_pd['price_part'] = communities_pd['price_perm2'] // 5000 * 5000
# First run of digits in the date text; NaN when the text has no digits.
communities_pd['established_year'] = pd.to_numeric(
    communities_pd['established_date'].str.extract(r'(\d+)', expand=False),
    errors='coerce')
# Characters 1-2 of the address text.
# NOTE(review): presumably the district name, and 'religion' looks like a
# typo for 'region'; the column name is kept because it is part of the output.
communities_pd['religion'] = communities_pd['address'].str.slice(1, 3)

cols = ['religion', 'name', 'price_part', 'price_perm2', 'established_year',
        'established_date', 'address', 'community_web']

# No ``encoding`` argument is passed: it has no effect on xlsx output.
communities_pd[cols].to_excel(u"安居客广州小区.xlsx", index=False)
 
时间: 2024-12-28 12:21:14

获取安居客小区信息的相关文章

python3爬虫-通过requests获取安居客房屋信息

import requests from fake_useragent import UserAgent from lxml import etree from http import cookiejar import re, time import pymysql import random from requests.exceptions import Timeout ua = UserAgent() session = requests.Session() class MyExceptio

爬取安居客指定市的所有小区信息

在爬取的过程中发现,访问频率太快会导致网站弹出滑动验证,所以设定了时间随机时间延迟,这样子就能保证爬取的信息完整,我选的是青岛市的小区,后续也可以添加输入市名爬取相关内容,二级页面的房子的平均价格是动态生成的,需要发送一个请求得到一个json,请求的url比较复杂,而且还要再发送一次请求,因此直接在一级页面取平均价格,然后传入解析二级页面的函数,这样可以提高效率.代码如下: """ 爬取安居客所有小区信息 """ import requests

获取访客IP、地区位置信息、浏览器、来源页面

<?php //这个类似用来获取访客信息的 //方便统计 class visitorInfo { //获取访客ip public function getIp() { $ip=false; if(!empty($_SERVER["HTTP_CLIENT_IP"])){ $ip = $_SERVER["HTTP_CLIENT_IP"]; } if (!empty($_SERVER['HTTP_X_FORWARDED_FOR'])) { $ips = explod

找到一个在上海租房非常不错的网站 - 一步租房网,推荐推荐,综合了赶集,58同城,搜房,安居客等所有的信息

线上租房平台有以下推荐: 赶集网(www.ganji.com):赶集网租房频道汇聚了海量优质租房信息,包括个人房源.中介房源,既有整租也有合租,你还可以免费发布自己的租房信息.赶集网还推出诚信体系,租客可以从多维度对中介或者房东进行打分和点评.在房产相关页面,用户可以看到发帖人的信用等级.认证信息,以此来综合评估发帖人的靠谱程度,也可以对其进行点评或查看别人的点评情况 58同城(www.58.com):定位于本地社区及免费分类信息服务,帮助人们解决生活和工作所遇到的难题.租房频道为你提供求租房的

安居客二手房爬虫-微信提醒合适房源!

AnjukeSpider 项目地址 https://github.com/X-Mars/AnjukeSpider/ 简介 爬取安居客房源,筛选房源,微信提醒 环境搭建 安装python2.7、pip、setuptools 安装requests pip2.7 install requests 使用 安居客相关 访问安居客网站,选择城市和小区后,复制当前网址,替换AnjukeUrl后面的网址 SizeRange为面积区间,即筛选面积<SizeRange的房源 PriceRange为总价区间,即筛选总价

开源项目成熟度分析工具-利用github api获取代码库的信息

1.github api github api是http形式的api,功能还是比较丰富的,博主因为项目的原因主要用到的是提取project信息这项功能,返回的数据是JSON格式. api页:https://developer.github.com/v3/ Options: (H) means HTTP/HTTPS only, (F) means FTP only --anyauth Pick "any" authentication method (H) -a, --append Ap

微信公众号开发之网页授权认证获取用户的详细信息,实现自动登陆

原创声明:本文转来源本人另一博客[http://blog.csdn.net/liaohaojian/article/details/70175835]绝非他人处转载 从接触公众号到现在,开发维护了2个公众号,开发过程中遇到很多问题,现在把部分模块功能在这备案一下,做个总结也希望能给其他人帮助 工欲善其事,必先利其器,先看看开发公众号需要准备或了解什么 web开发工具:官方提供的开发工具,使用自己的微信号来调试微信网页授权.调试.检验页面的 JS-SDK 相关功能与权限,模拟大部分 SDK 的输入

Android_获取手机各种详细信息

TelephonyManager类主要提供了一系列用于访问与手机通讯相关的状态和信息的get方法.其中包括手机SIM的状态和信息.电信网络的状态及手机用户的信息.在应用程序中可以使用这些get方法获取相关数据. TelephonyManager类的对象可以通过Context.getSystemService(Context.TELEPHONY_SERVICE)方法来获得,需要注意的是有些通讯信息的获取对应用程序的权限有一定的限制,在开发的时候需要为其添加相应的权限. 以下列出TelephonyM

C#获取电脑的相关信息

/* 创建者:菜刀居士的博客 * 创建日期: 2014年08月31号 * 功能:获取电脑的相关信息 * */ namespace Net.String.ConsoleApplication { using System; using System.Management; public class ComputerHelper { /// <summary> /// 获取CPU序列号代码 /// </summary> public static string GetCPUId() {