# coding:utf-8
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
import urllib2
import re
import json
import chardet
import pymysql
# url = "http://dytt8.net/"
# page = requests.get(url).content
# page_html = BeautifulSoup(page, 'lxml')
# name = page_html.select("td.inddline > a:nth-of-type(2)")
# for n in name:
#     if 'dyzz' in n.encode('gbk'):
#         print n.encode('gbk')
#         file = open("move.txt", "a+")
#         file.write(n.encode('utf-8') + '\n')
#         file.close()
def getmoveinfo(url):
    """Scrape one dytt8 movie-detail page and return its metadata.

    Parameters
    ----------
    url : str
        Absolute URL of the movie detail page.

    Returns
    -------
    dict
        Keys: title, yiming (translated name), leibie (genre), yuyan
        (language), zimu (subtitles), date (release date), douban
        (Douban rating), pianchang (runtime), daoyan (director),
        zhuyan (cast), jianjie (synopsis, with <br/> stripped), and
        addres (download link).  Any field missing from the page is ''.
    """
    page = requests.get(url).content
    page_html = BeautifulSoup(page, 'lxml')
    # Serialize once; the original recomputed str(page_html) for every
    # re.findall call (twice per field).
    html_text = str(page_html)

    def _field(pattern):
        # First regex capture in the page, or '' when the field is absent.
        # Replaces ten copy-pasted if/else blocks that each ran the same
        # findall twice.
        matches = re.findall(pattern, html_text)
        return matches[0] if matches else ''

    # The detail pages render the title in a fixed-color <font> tag.
    # NOTE(review): raises IndexError if the page layout changes, exactly
    # as the original did.
    title_tags = page_html.find_all("font", attrs={"color": "#07519a"})
    title_content = title_tags[0].contents

    # The download link lives in the yellow-background table cell.
    addres_cells = page_html.find_all("td", attrs={"bgcolor": "#fdfddf"})
    if addres_cells:
        addres = addres_cells[0].contents[0].get("href").encode('utf-8')
    else:
        addres = ''

    return {
        'title': title_content[0].encode("utf-8"),
        'yiming': _field(r"译 名(.*?)<br/>"),
        'leibie': _field(r"类 别(.*?)<br/>"),
        'yuyan': _field(r"语 言(.*?)<br/>"),
        'zimu': _field(r"字 幕(.*?)<br/>"),
        'date': _field(r"上映日期(.*?)<br/>"),
        'douban': _field(r"豆瓣评分(.*?)<br/>"),
        'pianchang': _field(r"片 长(.*?)<br/>"),
        'daoyan': _field(r"导 演(.*?)<br/>"),
        'zhuyan': _field(r"主 演(.*?)<br/>"),
        'jianjie': _field(r"简 介(.*?)【下载地址】").replace("<br/>", ""),
        'addres': addres,
    }
# Crawl the dytt8 front page, follow every "dyzz" (latest-release) link,
# and persist each movie's metadata into MySQL.
url = "http://dytt8.net/"
page = requests.get(url).content
page_html = BeautifulSoup(page, 'lxml')
# The second <a> inside each index cell points at the movie detail page.
name = page_html.select("td.inddline > a:nth-of-type(2)")

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='root', db='moves', charset='utf8')
cursor = conn.cursor()

# Parameterized statement: the original interpolated scraped text straight
# into the SQL string, which is injection-prone and breaks on any quote
# character inside a field.
insert_sql = (
    "INSERT INTO move_info"
    "(title,yiming,leibie,yuyan,zimu,date,douban,pianchang,"
    "daoyan,zhuyan,jianjie,addres)"
    "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)

try:
    for n in name:
        # Only follow "dyzz" links.  The original tested
        # 'dyzz' in n.encode('gbk'), which is a str-in-bytes TypeError on
        # Python 3; the href carries the same marker.
        if 'dyzz' not in n.get("href", ""):
            continue
        info = getmoveinfo("http://dytt8.net" + n.get("href"))
        cursor.execute(insert_sql, (
            info['title'], info['yiming'], info['leibie'], info['yuyan'],
            info['zimu'], info['date'], info['douban'], info['pianchang'],
            info['daoyan'], info['zhuyan'], info['jianjie'], info['addres'],
        ))
        conn.commit()  # commit per row, matching the original behavior
finally:
    # Always release the DB handles, even if a page or insert fails.
    cursor.close()
    conn.close()

print('ok')
# Original article: http://blog.51cto.com/itafei/2073434