python 抓取电影天堂电影信息放入数据库

# coding:utf-8
import json
import re
import urllib2
from multiprocessing import Pool

import chardet
import pymysql
import requests
from bs4 import BeautifulSoup

# url = "http://dytt8.net/"
# page = requests.get(url).content
# page_html = BeautifulSoup(page,'lxml')


# name = page_html.select("td.inddline > a:nth-of-type(2)")

# for n in name:

#     if ‘dyzz‘ in n.encode(‘gbk‘):

#         print n.encode(‘gbk‘)

#         file = open("move.txt","a+")

#         file.write(n.encode(‘utf-8‘)+‘\n‘)

#         file.close()
def _extract_field(html, label, terminator="<br/>"):
    """Return the text between ``label`` and ``terminator`` in ``html``.

    Only the first occurrence is used. Returns '' when the label is not found.
    """
    match = re.search(label + r"(.*?)" + terminator, html)
    return match.group(1) if match else ''


def getmoveinfo( url ):
    """Fetch a movie detail page and return its fields as a dict.

    Args:
        url: address of the detail page to download.

    Returns:
        A dict with the keys 'title', 'yiming', 'leibie', 'yuyan', 'zimu',
        'date', 'douban', 'pianchang', 'daoyan', 'zhuyan', 'jianjie' and
        'addres'. Any field that cannot be found on the page is ''.
    """
    page = requests.get(url).content
    page_html = BeautifulSoup(page, 'lxml')
    # Serialize the parsed document once; every regex below scans this text.
    html = str(page_html)

    res = {}

    # The title is the first child of the first <font color="#07519a"> tag.
    title = page_html.find_all("font", attrs={"color": "#07519a"})
    if title and title[0].contents:
        res['title'] = title[0].contents[0].encode("utf-8")
    else:
        res['title'] = ''

    # Each of these fields is the text between its label and the next <br/>.
    for key, label in (
        ('yiming', "译　　名"),
        ('leibie', "类　　别"),
        ('yuyan', "语　　言"),
        ('zimu', "字　　幕"),
        ('date', "上映日期"),
        ('douban', "豆瓣评分"),
        ('pianchang', "片　　长"),
        ('daoyan', "导　　演"),
        ('zhuyan', "主　　演"),
    ):
        res[key] = _extract_field(html, label)

    # The synopsis runs up to the download-address heading; drop inner <br/>.
    jianjie = _extract_field(html, "简　　介", "【下载地址】")
    res['jianjie'] = jianjie.replace("<br/>", "")

    # The download link is the href of the anchor inside the first
    # <td bgcolor="#fdfddf"> cell; fall back to '' when any part is missing.
    cells = page_html.find_all("td", attrs={"bgcolor": "#fdfddf"})
    link = cells[0].find("a") if cells else None
    href = link.get("href") if link is not None else None
    res['addres'] = href.encode('utf-8') if href else ''

    return res
# Script body: read the index page, follow every link whose href contains
# 'dyzz', and store the scraped fields of each linked page in MySQL.
url = "http://dytt8.net/"

page = requests.get(url).content

page_html = BeautifulSoup(page, 'lxml')
name = page_html.select("td.inddline > a:nth-of-type(2)")

# Column order shared by the INSERT statement and the parameter list.
columns = ('title', 'yiming', 'leibie', 'yuyan', 'zimu', 'date', 'douban',
           'pianchang', 'daoyan', 'zhuyan', 'jianjie', 'addres')

# Placeholders let the driver escape the scraped text; building the statement
# with str.format broke on any quote in the data and allowed SQL injection.
insert_sql = (
    "INSERT INTO move_info"
    "(title,yiming,leibie,yuyan,zimu,date,douban,pianchang,daoyan,zhuyan,jianjie,addres)"
    "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
)

conn = pymysql.connect(host='localhost', port=3306, user='root', password='root', db='moves', charset='utf8')
try:
    cursor = conn.cursor()
    try:
        for n in name:
            # Skip anchors without an href as well as links that do not
            # contain 'dyzz'.
            href = n.get("href") or ''
            if 'dyzz' not in href:
                continue

            info = getmoveinfo("http://dytt8.net" + href)

            cursor.execute(insert_sql, [info[column] for column in columns])

            # Commit per row so rows already stored survive a later failure.
            conn.commit()
    finally:
        cursor.close()
finally:
    # Always release the connection, even when a fetch or insert raises.
    conn.close()

print('ok')

原文地址：http://blog.51cto.com/itafei/2073434

时间： 2024-10-03 22:10:23

python 抓取电影天堂电影信息放入数据库的相关文章

python 抓取"一个"网站文章信息放入数据库

# coding:utf-8 import requests from bs4 import BeautifulSoup import json import time import datetime import pymysql import sys reload(sys) sys.setdefaultencoding('utf-8') # 获取文章内容方法 def getartinfo( url ): page = requests.get(url).content soup = Beaut

python爬虫——抓取电影天堂电影信息

做个小练习,抓取的是电影天堂里面最新电影的页面.链接地址:http://www.dytt8.net/html/gndy/dyzz/index.html 首先我们需要获取里面电影详情的网页地址: import urllib2 import os import re import string # 电影URL集合 movieUrls = [] # 获取电影列表 def queryMovieList(): url = 'http://www.dytt8.net/html/gndy/dyzz/index

房东要给我涨800房租，生气的我用Python抓取帝都几万套房源信息，我主动涨了1000。

老猫我在南五环租了一个80平两居室,租房合同马上到期,房东打电话问续租的事,想要加房租:我想现在国家也正在抑制房价,房子价格没怎么涨,房租应该也不会涨,于是霸气拒绝了,以下是聊天记录:确认不续租之后,两三年没找过房的我上网搜索租房,没想到出来一坨自如,蛋壳,贝壳等中介网站:进去看看,各种房照非常漂亮,但是一看房租,想送给自己一首<凉凉>:附近房租居然比我当前房租高1000多RMB:自我安慰下,这些网站房源价格不是真实的,于是切换到我爱我家,链家等大中介平台,结果发现房租价格都差不多:心想这才

《一出好戏》讲述人性，使用Python抓取猫眼近10万条评论并分析，一起揭秘“这出好戏”到底如何？

黄渤首次导演的电影<一出好戏>自8月10日在全国上映,至今已有10天,其主演阵容强大,相信许多观众也都是冲着明星们去的.目前<一出好戏>在猫眼上已经获得近60万个评价,评分为8.2分,票房已破10亿. 作者本人(汤小洋 )今天也走进了电影院,对这部电影做了亲身的观看,看完后的感觉是有些许失落的,本以为是喜剧片,结果发现笑点一般,从搞笑的角度来看,不如<西虹市首富>,影片更多的是反映人类本性的一部电影,不应当做喜剧片来看,影片中展现的人与人之间的关系倒是值得我们去深思.

Python抓取需要cookie的网页

Python抓取需要cookie的网页 在仿照<Python小练习:可视化人人好友关系>一文时,需要模拟登录人人网.然而自从CSDN事件之后,人人网开始使用加密方式处理登录名和密码,直接使用post方式已经无法登录人人网.这时,从豆瓣讨论中找到了解决方法: 1. 首先使用浏览器登录人人,然后找到浏览器中关于登录的Cookie: 2. 将Cookie记录下来,在Python中使用cookie模块模拟浏览器的行为: 3. 取得并解析数据. 1. HTTP协议与Cookie 抓取网页的过程跟浏览

Python抓取网页&批量下载文件方法初探（正则表达式+BeautifulSoup） (转)

Python抓取网页&批量下载文件方法初探(正则表达式+BeautifulSoup) 最近两周都在学习Python抓取网页方法,任务是批量下载网站上的文件.对于一个刚刚入门python的人来说,在很多细节上都有需要注意的地方,以下就分享一下我在初学python过程中遇到的问题及解决方法. 一.用Python抓取网页基本方法: [python] view plaincopyprint? import urllib2,urllib url = 'http://www.baidu.com' req

python抓取百度彩票的双色球数据

最近在学习<机器学习实战>这本书,在学习的过程中不免要自己去实践,写些练习.这些练习的第一步就需要收集数据,所以为了写好自己的练习程序,我得先学会收集一些网络数据.了解到用python抓取网页数据的一些方法后,我就根据别人的demo,自己实践了一下,学着从百度彩票网站上抓取双色球的历史数据.以下我就介绍一下自己的小程序. 大致思路如下找到相关url和其参数找出页面上你要抓取的数据的位置,也就是说这个数据在哪些标签下将每页中需要的数据取下来按一定格式存放在自己本地需要的环境: pytho

使用python抓取并分析数据—链家网(requests+BeautifulSoup)（转）

本篇文章是使用python抓取数据的第一篇,使用requests+BeautifulSoup的方法对页面进行抓取和数据提取.通过使用requests库对链家网二手房列表页进行抓取,通过BeautifulSoup对页面进行解析,并从中获取房源价格,面积,户型和关注度的数据. 准备工作首先是开始抓取前准备工作,导入需要使用的库文件,这里主要使用的是requests和BeautifulSoup两个.Time库负责设置每次抓取的休息时间.这里并非全部,后续还会在过程中导入新的库. 抓取列表页开始抓取

无比强大！Python抓取cssmoban网站的模版并下载

Python实现抓取http://www.cssmoban.com/cssthemes网站的模版并下载实现代码 # -*- coding: utf-8 -*- import urlparse import urllib2 import re import os import os.path URL='http://www.cssmoban.com/cssthemes' #全局超时设置 urllib2.socket.setdefaulttimeout(500) #根据url获取内容 def ge