For a task at work I needed to check whether the URLs listed in a text file are reachable, and then randomly pick N of the URLs that respond normally. My approach: read the file line by line, request each URL with urlopen, append every URL that returns status code 200 to a list, take the length of that list, use random to generate a random index into it, and pull out that entry. The implementation is as follows:
import urllib2, random

def get_responses(url):
    global good_list
    global bad_list
    # Prepend the scheme if it is missing, otherwise use the URL as-is
    # (the original left http_url undefined in that case).
    if not url.startswith("http:"):
        http_url = "http://" + url
    else:
        http_url = url
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1'}
    try:
        request = urllib2.Request(http_url, headers=headers)
        resp = urllib2.urlopen(request)
        print url
    except urllib2.URLError as e:
        print e
        bad_list.append(url)
        return

    retcode = resp.getcode()
    if retcode == 200:
        good_list.append(url)
    else:
        bad_list.append(url)

def readFile():
    try:
        urllist = open(r'C:\Users\888\Desktop\urls.txt', 'r')
    except IOError:
        print "file does not exist.\n"
        return
    for item in urllist:
        item = item.strip('\n')
        get_responses(item)

    urllist.close()
    print "Total URLs: %d, Good URLs: %d, Bad URLs: %d." % (len(good_list) + len(bad_list), len(good_list), len(bad_list))

def writeFile(linenum):
    # Draw random indexes until we have collected `linenum` distinct good URLs.
    # A set avoids writing duplicate lines, and capping the count at the list
    # length keeps the loop from spinning forever when good_list is short.
    result = set()
    linelen = len(good_list)
    linenum = min(int(linenum), linelen)
    while len(result) < linenum:
        s = random.randint(0, linelen - 1)
        result.add(good_list[s])

    # Write the sampled good URLs to goodurl.txt.
    try:
        goodurl = open(r'C:\Users\888\Desktop\goodurl.txt', 'w+')
    except IOError:
        print "file does not exist.\n"
        return

    for item in result:
        goodurl.write(item + '\n')
    goodurl.close()

    print "The mission is done, please check the goodurl.txt file"

if __name__ == "__main__":
    good_list = []
    bad_list = []
    readFile()
    writeFile(100)
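The script above is Python 2 (urllib2, print statements). For reference only, here is a minimal sketch of the same flow in Python 3, assuming the same urls.txt and goodurl.txt paths as above; the 5-second timeout, the shortened User-Agent, and the use of random.sample to draw the 100 distinct URLs in one call are my own choices, not part of the original script.

# A sketch under the assumptions stated above, not the original implementation.
import random
import urllib.request

def check_url(url):
    """Return True if the URL answers with HTTP status 200, else False."""
    if not url.startswith("http"):
        url = "http://" + url
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urllib.request.urlopen(req, timeout=5) as resp:
            return resp.getcode() == 200
    except OSError as e:
        # URLError/HTTPError and socket timeouts are OSError subclasses in Python 3.
        print(url, e)
        return False

def main():
    with open(r"C:\Users\888\Desktop\urls.txt") as f:
        urls = [line.strip() for line in f if line.strip()]

    good = [u for u in urls if check_url(u)]
    print("Total URLs: %d, Good URLs: %d, Bad URLs: %d."
          % (len(urls), len(good), len(urls) - len(good)))

    # random.sample picks distinct entries; cap the count at the list length.
    sample = random.sample(good, min(100, len(good)))
    with open(r"C:\Users\888\Desktop\goodurl.txt", "w") as out:
        for u in sample:
            out.write(u + "\n")

if __name__ == "__main__":
    main()

Note that random.sample raises ValueError if asked for more items than the list holds, which is why the count is capped with min().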