Function description:
Get all the files under a given path, extract the 300 most frequent words from each file, and save them in a database.
Prerequisite: you need to have NLTK set up.
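Before the full script, here is a minimal sketch of the core idea: use NLTK's PlaintextCorpusReader and FreqDist to count word frequencies in a single text file. The directory /tmp/corpus and the file name sample.txt are placeholders for illustration only; this sketch is written in NLTK 3 style, whereas the full script below dates from 2014 and targets Python 2.

# Minimal sketch of the keyword-counting idea (placeholders: /tmp/corpus, sample.txt)
import nltk
from nltk.corpus import PlaintextCorpusReader

reader = PlaintextCorpusReader("/tmp/corpus", ".*")    # any directory of text files
words = [w for w in reader.words("sample.txt")         # tokens of one file
         if w.isalpha() and len(w) > 2]                # keep alphabetic words longer than 2 chars
fdist = nltk.FreqDist(words)                           # frequency distribution over the words
print(fdist.most_common(10))                           # the 10 most frequent (word, count) pairs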
#!/usr/bin/python
#coding=utf-8
'''
function : This script will create a database named mydb and then
           extract keywords from privacy policy files.
author   : Chicho
date     : 2014/7/28
running  : python key_extract.py -d path_of_file
'''

import sys, getopt
import nltk
import MySQLdb
from nltk.corpus import PlaintextCorpusReader

corpus_root = ""

if __name__ == '__main__':
    # get the directory from the command line
    opts, args = getopt.getopt(sys.argv[1:], "d:h", ["directory=", "help"])
    for op, value in opts:
        if op in ("-d", "--directory"):
            corpus_root = value

    # Actually, the above way of getting the directory is a little complicated;
    # you could simply pass the path as a positional argument instead:
    #
    #   running : python key_extract.py path_of_file
    #   corpus_root = sys.argv[1]

    # corpus_root is the directory of the privacy policy files; all of them are html files
    filelists = PlaintextCorpusReader(corpus_root, '.*')

    # get the list of files
    files = filelists.fileids()

    # connect to the database (replace host, user, port and password with your own settings)
    conn = MySQLdb.connect(host='your_personal_host_ip_address',
                           user='your_username',
                           port=your_port,
                           passwd='U_password')

    # get the cursor and force a utf8 connection
    curs = conn.cursor()
    conn.set_character_set('utf8')
    curs.execute('SET NAMES utf8;')
    curs.execute('SET CHARACTER SET utf8;')
    curs.execute('SET character_set_connection=utf8;')

    # (sqlite-style alternative, not used with MySQLdb)
    # conn.text_factory = lambda x: unicode(x, 'utf8', "ignore")
    # conn.text_factory = str

    # create a database named mydb (run once, then keep it commented out)
    '''
    try:
        curs.execute("create database mydb")
    except Exception, e:
        print e
    '''

    conn.select_db('mydb')

    # add 300 keyword columns (key0 ... key299) to the filekeywords table
    try:
        for i in range(300):
            sql = "alter table filekeywords add " + "key" + str(i) + " varchar(45)"
            curs.execute(sql)
    except Exception, e:
        print e

    i = 0
    for privacyfile in files:
        # f = open(privacyfile, 'r', encoding='utf-8')
        # one row per file: store its id and file name
        sql = "insert into filekeywords set id =" + str(i)
        curs.execute(sql)
        sql = "update filekeywords set name =" + "'" + privacyfile + "' where id= " + str(i)
        curs.execute(sql)

        # get the words of the privacy policy, keeping alphabetic tokens longer than 2 characters
        wordlist = [w for w in filelists.words(privacyfile) if w.isalpha() and len(w) > 2]

        # get the keywords; in NLTK 2.x FreqDist.keys() is sorted by decreasing
        # frequency (with NLTK 3 use fdist.most_common() instead)
        fdist = nltk.FreqDist(wordlist)
        vol = fdist.keys()

        key_num = len(vol)
        if key_num > 300:
            key_num = 300

        # store at most the 300 most frequent words in the key0 ... key299 columns
        for j in range(key_num):
            sql = "update filekeywords set " + "key" + str(j) + "=" + "'" + vol[j] + "' where id=" + str(i)
            curs.execute(sql)

        i = i + 1

    conn.commit()
    curs.close()
    conn.close()
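Note that the script only ALTERs the filekeywords table, so the mydb database and a filekeywords table with at least an id and a name column are assumed to exist already. Below is a one-time setup sketch under that assumption; the exact column types are my own choice, not something given in the original post.

# One-time setup sketch (assumption): create the database and base table the script expects.
import MySQLdb

conn = MySQLdb.connect(host='your_personal_host_ip_address',
                       user='your_username',
                       passwd='U_password')
curs = conn.cursor()
curs.execute("CREATE DATABASE IF NOT EXISTS mydb CHARACTER SET utf8")
conn.select_db('mydb')
# column names match the script; the VARCHAR size is an assumption
curs.execute("CREATE TABLE IF NOT EXISTS filekeywords (id INT PRIMARY KEY, name VARCHAR(255))")
conn.commit()
curs.close()
conn.close()

After this setup, the main script can be run as described in its docstring: python key_extract.py -d path_of_file.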
Please credit the source when reprinting: http://blog.csdn.net/chichoxian/article/details/42003603