去掉重复的IP,并且存入数据库:
# Deduplicate visitor IPs from t_useripvisittrace and store each unique IP
# with a sequential numeric id into t_userIP_list.
import MySQLdb

# Open the database connection.
# NOTE(review): credentials are hard-coded; move to configuration for production.
db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset="utf8", db="db_websiterecommend")
try:
    cur = db.cursor()
    cur.execute('select ip from t_useripvisittrace')
    rows = cur.fetchall()

    # A set removes duplicates; enumerate assigns each unique IP an id.
    unique_ips = {row[0] for row in rows}
    value = [(i, ip) for i, ip in enumerate(unique_ips)]

    # Parameterized bulk insert (safe against SQL injection, single round trip
    # per batch). Original code built the same rows with a manual index loop.
    cur.executemany('insert into t_userIP_list values(%s,%s)', value)
    db.commit()
finally:
    # Original script never closed the connection; always release it.
    db.close()
根据网站情况,进行网站的分类:
# -*- coding: utf-8 -*-
# Classify every visited page path from t_useripvisittrace into a category
# and store (page_path, category) rows into t_url_classify.
#
# Categories:
#   '/' and '/index.jhtml'        -> '首页' (home page)
#   paths containing 'index'      -> category scraped from the page breadcrumb
#   other '.jhtml' pages          -> category scraped from the page breadcrumb
#   everything else               -> '其他' (other)
import MySQLdb
import re
import requests
from lxml import etree

BASE_URL = 'http://www.tipdm.org'

db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset="utf8", db="db_websiterecommend")
try:
    cur = db.cursor()
    # NOTE(review): the original also ran 'select url_list from urls' on a
    # second cursor but never used the result; the dead query was removed.
    cur.execute('select page_path from t_useripvisittrace')
    data = cur.fetchall()
    print(len(data))

    count = 0
    for each in data:
        path = each[0]
        if path == '/' or path == '/index.jhtml':
            category = '首页'
        elif 'index' in path:
            # Scrape the category label from the page's breadcrumb link.
            html = requests.get(BASE_URL + path)
            selector = etree.HTML(html.text)
            content = selector.xpath(
                '/html/body/div[6]/div[2]/div[1]/div/a[2]/text()')
            # xpath returns a list; a list cannot be bound as a SQL parameter
            # (original bug) — use the first match, fall back to '其他'.
            category = content[0] if content else '其他'
        elif '.jhtml' in path:
            html = requests.get(BASE_URL + path)
            selector = etree.HTML(html.text)
            content = selector.xpath(
                '/html/body/div[5]/div[2]/div[1]/div[1]/a[2]/text()')
            category = content[0] if content else '其他'
        else:
            category = '其他'

        # BUG FIX: the original appended to a shared list and re-ran
        # executemany on the WHOLE list every iteration, re-inserting all
        # previously inserted rows each time. Insert exactly one row here.
        cur.execute('insert into t_url_classify values(%s,%s)',
                    (path, category))
        db.commit()
        count += 1
        print(count)

    print('finish')
finally:
    db.close()
使用pandas读取数据库进行统计
# Count visits per IP by reading t_useripvisittrace in chunks with pandas,
# then store (ip, visit_count) rows into the userip table.
from collections import Counter

import pandas as pd
from sqlalchemy import create_engine

import MySQLdb

# Connection string layout: dialect(mysql)+driver(pymysql)+user:password@host:port/db,
# with utf8 as the charset. chunksize=10000 makes read_sql return an iterator
# of DataFrames (10k rows each) instead of loading the whole table at once.
engine = create_engine(
    'mysql+pymysql://root:[email protected]:3306/db_websiterecommend?charset=utf8')
sql = pd.read_sql('t_useripvisittrace', engine, chunksize=10000)

# BUG FIX: the original took value_counts() per chunk and never merged them,
# then tried to insert whole Python lists as SQL parameters. Aggregate the
# per-chunk counts across all chunks into a single Counter instead.
counts = Counter()
for chunk in sql:
    counts.update(chunk['ip'].value_counts().to_dict())

db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset="utf8", db="db_websiterecommend")
try:
    cur = db.cursor()
    value = list(counts.items())  # [(ip, count), ...]
    # BUG FIX: original SQL was 'values(%,%s)' — first placeholder missing 's'.
    cur.executemany('insert into userip values(%s,%s)', value)
    db.commit()
finally:
    db.close()

print(value)
时间: 2024-11-02 14:10:53