# -*- coding: utf-8 -*-
"""Scrape product headings from zhide.fanli.com.

The script works in three stages, one class per stage:

1. ``Faly`` downloads the paginated listing pages.
2. ``GetData`` extracts the six-digit product ids from that HTML and turns
   them into product detail URLs.
3. ``Href_mg`` downloads every product page and collects its ``<h1>`` tags.

When run as a script, the collected headings are cleaned up and appended to
``yaozhi.txt``.
"""

import re
import socket
import time
import urllib.error
import urllib.request

try:
    from bs4 import BeautifulSoup
except ImportError:
    # bs4 is only needed by Href_mg.show_mg; keep the rest importable.
    BeautifulSoup = None

fanly_url = "http://zhide.fanli.com/p"  # listing pages: <fanly_url><page number>
format_url = "http://zhide.fanli.com/detail/1-"  # product page: <format_url><id>

DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
)
REQUEST_TIMEOUT = 15  # seconds, applied to every HTTP request

# A product id is exactly six digits inside a data-id attribute.
_DATA_ID_RE = re.compile(r'data-id="(\d{6})"')

# urlopen() raises URLError; a stalled read() raises socket.timeout instead.
_NETWORK_ERRORS = (urllib.error.URLError, socket.timeout)

# Applied in this exact order by _clean_text().
_CLEANUP_STEPS = (
    (u"\xa0", u""),
    (",", "\n"),
    ("全网最低", ""),
    ("[", ""),
    ("]", ""),
    (" ", ""),
)


def _fetch(url, user_agent=None, timeout=REQUEST_TIMEOUT):
    """Return the raw response body of *url* as bytes.

    The response is closed before returning. Network errors propagate to the
    caller.
    """
    request = urllib.request.Request(url)
    if user_agent:
        request.add_header("User-Agent", user_agent)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read()


def _clean_text(data):
    """Apply the textual clean-up used before writing headings to disk.

    NOTE(review): HTML tags are left in place, exactly as before; the
    original defined a tag-stripping pattern but never applied it.
    """
    for old, new in _CLEANUP_STEPS:
        data = data.replace(old, new)
    return data.strip()


class Faly():
    """Downloads the listing pages of the site."""

    def __init__(self):
        self.user_agent = DEFAULT_USER_AGENT  # sent as the User-Agent header
        self.html_data = []  # decoded HTML of every page fetched so far

    def get_html(self, start_page=1, end_page=7, delay=2):
        """Fetch listing pages ``start_page``..``end_page`` (inclusive).

        Args:
            start_page: First page number to request.
            end_page: Last page number to request.
            delay: Seconds to wait after each successful download.

        Returns:
            ``str()`` of the list of page sources collected on this
            instance. Pages that fail to download are reported on stdout
            and skipped.
        """
        for page in range(start_page, end_page + 1):
            try:
                raw = _fetch(fanly_url + str(page), self.user_agent)
            except _NETWORK_ERRORS as e:
                print(u"连接失败", getattr(e, "reason", e))
                continue
            # One undecodable byte should not abort the whole crawl.
            self.html_data.append(raw.decode("UTF-8", errors="replace"))
            time.sleep(delay)
        return str(self.html_data)


class GetData():
    """Turns listing-page HTML into a list of product URLs."""

    def __init__(self):
        self.html = Faly().get_html()  # listing HTML (performs network I/O)
        self.href = []  # raw 'data-id="NNNNNN"' matches, duplicates included
        self.ls = []  # unique product ids, in order of first appearance
        self.url = []  # product URLs, parallel to self.ls

    def get_hrefurl(self):
        """Extract product ids from ``self.html`` and build their URLs.

        Returns:
            ``self.url``: one URL per distinct product id, in order of first
            appearance. Calling the method again does not add duplicates.
        """
        seen = set(self.ls)
        for match in _DATA_ID_RE.finditer(self.html):
            self.href.append(match.group(0))
            product_id = match.group(1)
            if product_id in seen:
                continue
            seen.add(product_id)
            self.ls.append(product_id)
            self.url.append(format_url + product_id)
        return self.url


class Href_mg():
    """Downloads product pages and collects their ``<h1>`` headings."""

    def __init__(self):
        self.list = GetData().get_hrefurl()  # product URLs (network I/O)
        self.txt_list = []  # one find_all('h1') result per product page

    def show_mg(self):
        """Fetch every product page and collect (and print) its ``<h1>`` tags.

        Returns:
            ``str()`` of the list of per-page ``find_all('h1')`` results.
            Pages that fail to download are reported on stdout and skipped.

        Raises:
            ImportError: If there are pages to parse but bs4 is missing.
        """
        if self.list and BeautifulSoup is None:
            raise ImportError(
                "Href_mg.show_mg requires the 'beautifulsoup4' package"
            )
        for url in self.list:
            try:
                body = _fetch(str(url), DEFAULT_USER_AGENT)
            except _NETWORK_ERRORS as e:
                print(getattr(e, "reason", e))
                continue
            headings = BeautifulSoup(body, "html.parser").find_all("h1")
            self.txt_list.append(headings)
            print(headings)
        return str(self.txt_list)


def main(path="yaozhi.txt"):
    """Scrape the product headings and append the cleaned text to *path*."""
    data = Href_mg().show_mg()
    # The text is Chinese; do not rely on the platform default encoding.
    with open(path, "a", encoding="utf-8") as file:
        file.write(_clean_text(data))


if __name__ == "__main__":
    main()
# Time: 2024-10-05 05:58:28