定向爬虫实战笔记
流程图如下:
来自追女神助手(痴汉)v0.1:
# -*- coding: utf-8 -*-
"""Poll one weibo.cn profile and e-mail every post that has not been seen yet.

Each cycle fetches the watched page, builds and submits the login form found
on it, extracts one post plus its timestamp from the response, and mails that
text unless it is already recorded in ``weibo.txt``.

Requires Python 3 together with the third-party packages ``requests`` and
``lxml``.
"""
import os
import smtplib
import sys  # unused since the Python 2 default-encoding hack was dropped; retained
import time
from email.mime.text import MIMEText

import requests
from lxml import etree

# Every reported post is appended here, one entry per line.
SEEN_FILE = 'weibo.txt'

# Pause between two polling cycles, in seconds.
POLL_INTERVAL_SECONDS = 30


class mailhelper(object):
    """Send plain-text UTF-8 e-mails through a configured SMTP account."""

    def __init__(self):
        self.mail_host = "smtp.xxxx.com"  # SMTP server host
        self.mail_user = "xxxx"  # account user name
        self.mail_pass = "xxxx"  # account password
        self.mail_postfix = "xxxx.com"  # domain part of the sender address

    def send_mail(self, to_list, sub, content):
        """Send *content* with subject *sub* to every address in *to_list*.

        Args:
            to_list: list of recipient address strings.
            sub: subject line.
            content: message body, sent as text/plain in UTF-8.

        Returns:
            True if the message was handed to the server, False if any step
            (connect, login, send) raised; the error text is printed.
        """
        me = f"xxoohelper<{self.mail_user}@{self.mail_postfix}>"
        msg = MIMEText(content, _subtype='plain', _charset='utf-8')
        msg['Subject'] = sub
        msg['From'] = me
        # Address lists in mail headers are comma-separated (RFC 5322).
        msg['To'] = ", ".join(to_list)

        server = smtplib.SMTP()
        try:
            server.connect(self.mail_host)
            server.login(self.mail_user, self.mail_pass)
            server.sendmail(me, to_list, msg.as_string())
            return True
        except Exception as e:
            # Deliberately broad: a failed notification must not stop the
            # polling loop, so the failure is reported and signalled instead.
            print(str(e))
            return False
        finally:
            # Release the socket on both the success and the failure path.
            server.close()


class xxoohelper(object):
    """Fetch a post from a weibo.cn profile and remember what was reported."""

    def __init__(self):
        self.url = 'http://weibo.cn/u/xxxxxxx'  # profile page to watch
        self.url_login = 'https://login.weibo.cn/login/'
        # Login form target; replaced by getData() once the form is parsed.
        self.new_url = self.url_login

    def getSource(self):
        """Return the raw body (bytes) of the page at ``self.url``."""
        return requests.get(self.url).content

    def getData(self, html):
        """Build the login form payload from the page returned by getSource().

        Reads the password field's name, the hidden ``vk`` value and the
        form's ``action`` from *html*, and stores the resulting POST target
        in ``self.new_url``.

        Args:
            html: page markup containing the login form.

        Returns:
            dict of form fields to POST.

        Raises:
            IndexError: if the page lacks any of the expected form elements.
        """
        selector = etree.HTML(html)
        # The password input's name is read from the page rather than
        # hard-coded, so it is used as a dynamic key below.
        password = selector.xpath('//input[@type="password"]/@name')[0]
        vk = selector.xpath('//input[@name="vk"]/@value')[0]
        action = selector.xpath('//form[@method="post"]/@action')[0]
        self.new_url = self.url_login + action
        return {
            'mobile': '[email protected]',  # login account (placeholder)
            password: 'xxxxxx',  # login password (placeholder)
            'remember': 'on',
            'backURL': 'http://weibo.cn/u/xxxxxx',  # change to the watched profile URL
            'backTitle': '微博',
            'tryCount': '',
            'vk': vk,
            'submit': '登录',
        }

    def getContent(self, data):
        """Submit the login form and return one post's text plus timestamp.

        Args:
            data: form payload produced by getData().

        Returns:
            The text of the third ``span.ctt`` element with every
            ``http://`` removed, followed by the first ``span.ct`` text.

        Raises:
            IndexError: if the response has fewer than three ``span.ctt``
                elements or no ``span.ct`` text.
        """
        newhtml = requests.post(self.new_url, data=data).content
        new_selector = etree.HTML(newhtml)
        content = new_selector.xpath('//span[@class="ctt"]')
        # NOTE(review): index 2 presumably skips two profile-header spans so
        # that the newest post is picked -- confirm against the live markup.
        newcontent = str(content[2].xpath('string(.)')).replace('http://', '')
        sendtime = new_selector.xpath('//span[@class="ct"]/text()')[0]
        return newcontent + sendtime

    def tosave(self, text):
        """Append *text* as one line to the seen-posts file."""
        with open(SEEN_FILE, 'a', encoding='utf-8') as f:
            f.write(text + '\n')

    def tocheck(self, data):
        """Return True if *data* has not been recorded by tosave() yet.

        A missing seen-posts file means nothing was recorded, so every post
        counts as new.
        """
        if not os.path.exists(SEEN_FILE):
            return True
        with open(SEEN_FILE, 'r', encoding='utf-8') as f:
            existweibo = f.readlines()
        return data + '\n' not in existweibo


def main():
    """Poll the profile forever, mailing each post that is not yet recorded."""
    mailto_list = ['[email protected]']  # recipient address(es), fill in
    helper = xxoohelper()
    while True:
        source = helper.getSource()
        data = helper.getData(source)
        content = helper.getContent(data)
        if helper.tocheck(content):
            if mailhelper().send_mail(mailto_list, "女神更新啦", content):
                print("发送成功")
            else:
                print("发送失败")
            # Recorded whether or not the mail went out, so each post
            # triggers at most one send attempt.
            helper.tosave(content)
            print(content)
        else:
            print('pass')
        time.sleep(POLL_INTERVAL_SECONDS)


if __name__ == '__main__':
    main()
时间: 2024-10-06 10:25:57