以汽车之家为例子,抓取页面并进行解析
# -*- coding=utf-8 -*-
"""Crawl an autohome.com.cn car-series configuration page and extract the
JSON data blobs embedded in its inline <script> tags.

The page body comes back GB2312-encoded; the (Windows) console is
cp936/GBK, so text is transcoded before printing.
Runs on Python 2 with BeautifulSoup 3 and chardet.
"""
import urllib2
from BeautifulSoup import BeautifulSoup as bs3
import json
import codecs
# chardet: used to probe the real encoding of fetched/extracted text.
import chardet


def save_to_file(filename, content):
    """Write `content` to `filename`, closing the file even on error.

    The original opened the file, asserted on the handle, and closed it
    manually; `open` raises on failure (it never returns a falsy handle),
    and `assert` is stripped under -O, so a `with` block is both shorter
    and safer.
    """
    with open(filename, 'w+') as f:
        f.write(content)


def parse_json_data(content):
    """Print the detected encoding of the first JSON fragment and dump it.

    :param content: list of GB2312-encoded JSON byte strings; an empty
        list is tolerated (the original crashed with IndexError).
    """
    if not content:
        return
    print(chardet.detect(content[0]))
    # Names the page assigns its JSON blobs to; kept for reference.
    name_list = ['keyLink', 'config', 'option', 'color', 'innerColor']
    print(json.dumps(content[0].decode('GB2312')))


def parse_content(content):
    """Locate the marker <script> tag in the page and collect its embedded
    JSON assignment payloads.

    :param content: raw GB2312-encoded page body (bytes).
    """
    soup = bs3(content)
    key_text = 'var levelId'
    # Some script tags have no text (None); guard before the `in` test,
    # which crashed the original filter with TypeError.
    elem_lib = soup.find('script', text=lambda x: x is not None and key_text in x)
    if elem_lib is None or elem_lib.string is None:
        # Page layout changed or fetch failed: nothing to parse.
        return
    # str() of a BeautifulSoup NavigableString yields UTF-8 bytes.
    str_script = str(elem_lib.string)
    # Console is cp936/GBK; transcode so later printing works.
    strGBK = str_script.decode('utf-8').encode('gb2312')
    # Remove spaces (original comment said "HTML escape characters";
    # presumably this was meant to strip &nbsp; remnants — TODO confirm).
    strGBK = strGBK.replace(' ', '')
    list_data = []
    for line in strGBK.splitlines():
        if line.isspace():
            continue
        # Short lines are plain variable declarations, not JSON payloads.
        if len(line) < 100:
            continue
        # The JSON object starts at the first '{'.
        idx = line.find('{')
        if idx == -1:
            continue
        # Drop only a trailing ';' when present.  The original sliced
        # [idx:-1], which ate the last character even when no semicolon
        # terminated the statement.
        fragment = line[idx:].rstrip()
        if fragment.endswith(';'):
            fragment = fragment[:-1]
        list_data.append(fragment)
    parse_json_data(list_data)


def crawler_4_autohome():
    """Fetch the configuration page for car series 657 and parse it."""
    autohome_url = 'http://car.autohome.com.cn/config/series/657.html'
    # Response body is GB2312-encoded bytes.
    content = urllib2.urlopen(url=autohome_url).read()
    parse_content(content)


if __name__ == '__main__':
    crawler_4_autohome()
时间: 2024-10-24 03:13:51