2018/7/21,这几天整理出来的一些Python 爬虫学习代码。
import urllib2
# Fetch the Baidu home page and print the raw response body.
response = urllib2.urlopen("http://baidu.com")
try:
    html = response.read()
finally:
    # Release the underlying connection even if read() raises.
    response.close()
print(html)
进一步,可以先构造 Request 对象,再交给 urlopen 发起请求
import urllib2
# Fetch the page through an explicit Request object; unlike a bare URL,
# a Request can carry extra headers or POST data.
req = urllib2.Request("http://www.baidu.com")
response = urllib2.urlopen(req)
try:
    html = response.read()
finally:
    # Release the underlying connection even if read() raises.
    response.close()
print(html)
伪装成浏览器(设置 User-Agent 请求头)
import urllib2
# Fetch a page while presenting a browser-like User-Agent header.
url = "http://www.baidu.com"
# Internet Explorer 9 on Windows 7. The original string read "MSTE" and
# lacked the closing parenthesis, so it did not match any real browser.
user_agent = (
    "Mozilla/5.0 (compatible; MSIE 9.0; "
    "Windows NT 6.1; Trident/5.0)"
)
headers = {"User-Agent": user_agent}
req = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(req)
try:
    the_page = response.read()
finally:
    # Release the underlying connection even if read() raises.
    response.close()
print(the_page)
代码:输入贴吧地址和页码,输出网页内容
# _*_ coding:utf-8 _*_
import urllib2
def load_page(url):
    """Download a page and return its raw body.

    The request carries a browser-like User-Agent header.

    :param url: address of the page to fetch.
    :return: the body exactly as returned by ``response.read()``.
    """
    # Internet Explorer 9 on Windows 7 (the original string read "MSTE"
    # and lacked the closing parenthesis).
    user_agent = (
        "Mozilla/5.0 (compatible; MSIE 9.0; "
        "Windows NT 6.1; Trident/5.0)"
    )
    headers = {"User-Agent": user_agent}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the underlying connection even if read() raises.
        response.close()
def tieba_spider(url, begin_page, end_page):
    """Fetch a range of Tieba pages and print each one.

    For every page number from ``begin_page`` to ``end_page`` inclusive,
    the offset ``50 * (page - 1)`` is appended to ``url``, the resulting
    address is downloaded with ``load_page`` and the body is printed
    between banner lines.

    :param url: address prefix the offset is appended to verbatim.
        NOTE(review): presumably it should end with ``pn=`` -- confirm
        against the URLs callers actually pass in.
    :param begin_page: first page number to fetch.
    :param end_page: last page number to fetch (inclusive).
    """
    for i in range(begin_page, end_page + 1):
        pn = 50 * (i - 1)
        my_url = url + str(pn)
        html = load_page(my_url)
        # "%d" replaces the original "%页", which is not a valid
        # conversion and raised ValueError when formatted.
        print("##################第%d页########################" % i)
        print(html)
        print("###############################################")
if __name__ == "__main__":
    # Ask for the address prefix and the inclusive page range, then crawl.
    url = raw_input("请输入贴吧的url地址")
    begin_page = int(raw_input("请输入起始页码"))
    end_page = int(raw_input("请输入终止页码"))
    tieba_spider(url, begin_page, end_page)
代码:输入贴吧地址和页码,并将网页保存到文件
# _*_ coding:utf-8 _*_
import urllib2
def load_page(url):
    """Download a page and return its raw body.

    The request carries a browser-like User-Agent header.

    :param url: address of the page to fetch.
    :return: the body exactly as returned by ``response.read()``.
    """
    # Internet Explorer 9 on Windows 7 (the original string read "MSTE"
    # and lacked the closing parenthesis).
    user_agent = (
        "Mozilla/5.0 (compatible; MSIE 9.0; "
        "Windows NT 6.1; Trident/5.0)"
    )
    headers = {"User-Agent": user_agent}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the underlying connection even if read() raises.
        response.close()
def write_to_file(file_name, txt):
    """Store ``txt`` in the file ``file_name``.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    A progress message naming the file is printed first.

    :param file_name: path of the file to write.
    :param txt: content to write.
    """
    print("正在存储文件" + file_name)
    # The with-block closes the file even if write() raises.
    with open(file_name, "w") as f:
        f.write(txt)


# The function was originally defined under this misspelled name while
# being called as write_to_file; keep the old name working as an alias.
writee_to_file = write_to_file
def tieba_spider(url, begin_page, end_page):
    """Fetch a range of Tieba pages and save each one to a file.

    For every page number from ``begin_page`` to ``end_page`` inclusive,
    the offset ``50 * (page - 1)`` is appended to ``url``, the resulting
    address is downloaded with ``load_page`` and the body is passed to
    ``write_to_file`` under the name ``"<page number>.html"``.

    :param url: address prefix the offset is appended to verbatim.
        NOTE(review): presumably it should end with ``pn=`` -- confirm
        against the URLs callers actually pass in.
    :param begin_page: first page number to fetch.
    :param end_page: last page number to fetch (inclusive).
    """
    for i in range(begin_page, end_page + 1):
        pn = 50 * (i - 1)
        my_url = url + str(pn)
        html = load_page(my_url)
        # Relative name, e.g. "1.html" for page 1.
        file_name = str(i) + ".html"
        write_to_file(file_name, html)
if __name__ == "__main__":
    # Ask for the address prefix and the inclusive page range, then crawl.
    url = raw_input("请输入贴吧的url地址")
    begin_page = int(raw_input("请输入起始页码"))
    end_page = int(raw_input("请输入终止页码"))
    tieba_spider(url, begin_page, end_page)
原文地址:https://www.cnblogs.com/bqwzx/p/9347698.html