1 #coding=utf-8 2 3 import re 4 import requests 5 import urllib 6 import os 7 import MySQLdb 8 9 #初始化配置参数 10 path = ‘img/‘ #图片存放目录 11 tableName0 = ‘imgTable‘ #表名 12 #文件操作,如果不存在该目录,创建文件夹 13 if not os.path.exists(path): 14 os.makedirs(path) 15 16 # 连接数据库 mysql 17 conn= MySQLdb.connect( 18 host=‘localhost‘, 19 port = 3306, 20 user=‘root‘, 21 passwd=‘123‘, 22 db =‘test‘, 23 charset=‘utf8‘, 24 ) 25 cur = conn.cursor() 26 print "连接成功" 27 28 # 如果数据表已经存在使用 execute() 方法删除表。 29 cur.execute("DROP TABLE IF EXISTS %s" % tableName0) 30 #创建数据表 31 cur.execute("create table %s(num varchar(2) ,name varchar(200),url varchar(200))" % tableName0) 32 33 #获取html网页 34 def getHtml(url): 35 page = urllib.urlopen(url) 36 html = page.read() 37 return html 38 39 #根据网页内容获取图片地址列表 40 def getImg(html): 41 reg = r‘" src="(.+?\=550x0)" style="‘ 42 imgre = re.compile(reg) 43 imglist = imgre.findall(html) 44 print imglist 45 return imglist 46 #t图片保存 47 def save_img(url,path): 48 message = None 49 print path 50 try: 51 file = open(path + os.path.basename(url),‘wb‘) 52 request = urllib.urlopen(url) 53 file.write(request.read()) 54 except Exception as e: 55 #捕获异常,定义异常实例e,可以捕获除与程序退出sys.exit()相关之外的所有异常 56 message = str(e) 57 print message 58 else: 59 #如果try中的语句没有引发异常,则执行else中的语句 60 message = os.path.basename(url) 61 finally: 62 #无论是否出现异常,都执行的代码 63 if not file.closed: 64 file.close() 65 return message 66 67 #插入数据 68 def insertIntoDb(list): 69 i = 1 70 for imgurl in list: 71 id1 = i 72 urlR = dealURL(imgurl, ‘?imageView&thumbnail=550x0‘) 73 name = os.path.basename(urlR) 74 # 插入数据 75 sql = ‘insert into %s values( \‘%d\‘,\‘%s\‘,\‘%s\‘)‘ % (tableName0, id1, name, urlR) 76 cur.execute(sql) 77 78 #处理url,处理成以.jpg或.png结尾的url 79 def dealURL(url,str): 80 #方法一:截取 81 # 从左往右,第一个(默认)到倒数第三十个结束,截取间距为1. 
82 # urlTemp = url[:-30:1] 83 #方法二:替换 84 urlTemp = url 85 urlTemp = urlTemp.replace(str,‘‘) 86 return urlTemp 87 88 #保存图片到本地,在本程序中没用到 89 def saveImge(imgList): 90 x = 1 91 for imgurl in imgList: 92 #方法一 93 # urllib.urlretrieve(imgurl, ‘%s.jpg‘ % x) 94 # print imgurl 95 96 #方法二,可定义存储位置 97 pic = requests.get(imgurl, timeout=10) 98 string = path + str(x) + ‘.jpg‘ 99 fp = open(string, ‘wb‘) 100 fp.write(pic.content) 101 fp.close() 102 103 #方法三,图片名为url后面的一串 104 # path1 = path + str(x)+‘--‘ #添加数字方便统计 105 # save_img(imgurl,path1) 106 107 x += 1 108 109 # 方法四,从数据库获取,然后下载 110 def saveImageFromDb(): 111 # 获得表中有多少条数据 112 allData = cur.execute("select * from %s" % tableName0) 113 print allData 114 # 打印表中的多少数据 115 list = cur.fetchmany(allData) 116 x=1 117 for data in list: 118 path1 = path + str(x) + ‘--‘ 119 #data有三项,分别[0,1,2],对应的为[num,name,url] 120 url= data[2] 121 print url 122 save_img(url, path1) 123 x+=1 124 print "保存完成" 125 126 html = getHtml("http://news.163.com/17/0831/07/CT5B1SJB000181BT.html") 127 imgList123 = getImg(html) 128 insertIntoDb(imgList123) 129 saveImageFromDb() 130 # saveImge(imgList123) 131 132 # #翻转测试 133 # sStr1 = ‘abcdefg‘ 134 # sStr1 = sStr1[::-1] 135 # print sStr1 136 137 #截取测试 138 # str = ‘0123456789‘ 139 # print "1:",str[:-1:2] 140 # print "2:",str[:0:-3] 141 # print "3:",str[-8:8:2] 142 # print "4:",str[-2:2:-2] 143 # print "5:",str[:-1:] 144 # print "6:",str[::] 145 # print "7:",str[::-1][:3] 146 147 #关闭数据库 148 cur.close() 149 conn.commit() 150 conn.close()
# Environment: Python 2.7
# Time: 2024-10-15 00:17:19