# -*- coding: utf-8 -*-
"""
Scrape the residential-community listing of Anjuke (Guangzhou) and export
it to an Excel workbook.

Created on Sat Jun 24 22:03:17 2017

@author: willowj
"""
import sys

if sys.version_info[0] == 2:
    # Python 2 only: make utf-8 the implicit str<->unicode codec.
    # reload(sys) re-initialises the std streams, so they are saved first
    # and put back afterwards.
    stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
    sys.setdefaultencoding('utf8')

import math
import re
import time

try:  # Python 2
    from urlparse import urljoin
except ImportError:  # Python 3
    from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from lxml import html

# Entry page of the community listing (Anjuke, Guangzhou).
init_url = 'https://guangzhou.anjuke.com/community/'

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30

# Base request headers; the User-Agent is replaced per request with a random
# entry of USER_AGENTS (on a copy, this dict itself is never mutated).
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, sdch',
    'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'cache-control': 'max-age=0',
    'referer': 'https://guangzhou.anjuke.com/community/',
    'Upgrade-Insecure-Requests': "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
}

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
]


def not_nulist_first(list_):
    """Return the first item of *list_*, or None when it is empty or None."""
    if not list_:
        return None
    return list_[0]


def get_comnuities(page_response_, base_url=None):
    """Parse the community entries out of one listing page.

    Args:
        page_response_: response object of a listing page; its ``url`` and
            ``text`` attributes are read.
        base_url: URL that relative community links are resolved against.
            Defaults to the module-level ``init_url``.

    Returns:
        A list with one ``[name, price_perm2, established_date, address,
        community_web]`` row per ``div[@_soj="xqlb"]`` element. A field the
        page does not provide is None instead of raising.
    """
    if base_url is None:
        base_url = init_url
    print("%s getting" % page_response_.url)
    etree = html.document_fromstring(page_response_.text)
    page_communities1 = []
    for community in etree.xpath('//div[@_soj="xqlb"]'):
        name = not_nulist_first(community.xpath("./a/@title"))
        href = not_nulist_first(community.xpath("./a/@href"))
        community_web = urljoin(base_url, href) if href else None
        address = not_nulist_first(community.xpath(".//address/text()"))
        if address is not None:
            address = address.strip()
        established_date = not_nulist_first(
            community.xpath('.//p[@class="date"]/text()'))
        price_perm2 = not_nulist_first(community.xpath('.//strong/text()'))
        page_communities1.append(
            [name, price_perm2, established_date, address, community_web])
        print(name)
    return page_communities1


class AnjukeCommunity(object):
    """Crawler for the paginated Anjuke community listing of one city.

    Attributes:
        init_url: entry URL of the listing.
        requ: ``requests`` session shared by all page downloads.
        pages_max: number of listing pages, fetched once at construction.
    """

    def __init__(self, init_url=None):
        # Fall back to the Guangzhou listing when no URL is given.
        self.init_url = init_url or 'https://guangzhou.anjuke.com/community/'
        self.requ = requests.session()
        # Retries are a property of the transport adapter, so mount one that
        # retries failed connections up to 5 times.
        retrying_adapter = requests.adapters.HTTPAdapter(max_retries=5)
        self.requ.mount('http://', retrying_adapter)
        self.requ.mount('https://', retrying_adapter)
        # NOTE: kept for backward compatibility - on an instance the integer
        # page count replaces the bound method of the same name.
        self.pages_max = self.pages_max()

    def pages_max(self):
        """Download the entry page and return the number of listing pages.

        The count is ceil(total communities / communities on the first
        page). Returns 0 when the page holds no total or no entries.
        """
        page_response = self.requ.get(
            self.init_url, headers=headers, timeout=REQUEST_TIMEOUT)
        init_etree = html.document_fromstring(page_response.text)
        totals = init_etree.xpath('//span[@class="tit"]/em/text()')
        communities = init_etree.xpath('//div[@_soj="xqlb"]')
        if not totals or not communities:
            print("%s no community listing found" % page_response.url)
            return 0
        community_nums = int(totals[-1])
        return int(math.ceil(float(community_nums) / len(communities)))

    def get_all_comnunities(self, delay=20):
        """Download every listing page and return all community rows.

        Args:
            delay: seconds to sleep after each page request.

        Returns:
            A list of rows as produced by ``get_comnuities``.
        """
        page_communities = []
        base = self.init_url.rstrip('/')  # avoid '//' in the page URL
        for page in range(1, self.pages_max + 1):
            url_ = '%s/p%s/' % (base, page)
            # Per-request copy: rotate the User-Agent without touching the
            # shared module-level headers.
            request_headers = dict(headers)
            request_headers["User-Agent"] = str(np.random.choice(USER_AGENTS))
            page_response_ = self.requ.get(
                url_, headers=request_headers, timeout=REQUEST_TIMEOUT)
            print("%s %s start" % (page_response_.url,
                                   page_response_.status_code))
            time.sleep(delay)
            # The body is already downloaded, so it stays readable after the
            # response and its connection are closed.
            page_response_.close()
            page_response_.connection.close()
            page_communities.extend(
                get_comnuities(page_response_, self.init_url))
        return page_communities


def _tidy_communities(rows):
    """Build the community DataFrame and add the derived columns."""
    frame = pd.DataFrame(
        rows,
        columns=['name', 'price_perm2', 'established_date', 'address',
                 'community_web'])
    # Missing or non-numeric prices become NaN instead of aborting the run.
    frame['price_perm2'] = pd.to_numeric(frame['price_perm2'],
                                         errors='coerce')
    # Price bucket: lower bound of the 5000-wide band the price falls in.
    frame['price_part'] = frame['price_perm2'] // 5000 * 5000
    frame['established_year'] = pd.to_numeric(
        frame['established_date'].str.extract(r'(\d+)', expand=False),
        errors='coerce')
    # NOTE(review): column name is presumably a typo for "region"; it takes
    # characters 1-2 of the address. Name kept so the output is unchanged.
    frame['religion'] = frame['address'].str.slice(1, 3)
    return frame


if __name__ == "__main__":
    Anjuke_guangzhou = AnjukeCommunity(init_url)
    page_communities = Anjuke_guangzhou.get_all_comnunities()

    # Tidy the scraped rows with pandas.
    communities_pd = _tidy_communities(page_communities)
    cols = ['religion', 'name', 'price_part', 'price_perm2',
            'established_year', 'established_date', 'address',
            'community_web']
    # No `encoding` argument: recent pandas releases reject it.
    communities_pd[cols].to_excel(u"安居客广州小区.xlsx", index=False)
# Timestamp: 2024-12-28 12:21:14