from lxml import etree
import urllib3
import requests

# Fetch the blog's news fragment, locate the element whose id is
# "profile_block", print its markup, then print each direct text node of that
# element paired with the text of the <a> child at the same position.

# NOTE(review): TLS certificate verification is switched off in the request
# below (verify=False); this call silences the urllib3 warning that would
# otherwise be emitted for it.
urllib3.disable_warnings()
url = "https://www.cnblogs.com/mvc/blog/news.aspx?blogApp=xiaoyujuan"

# A timeout keeps the script from blocking forever on an unresponsive server.
r = requests.get(url, verify=False, timeout=10)
# print(r.text)

dom = etree.HTML(r.content.decode("utf-8"))
# etree.HTML may yield None for an empty document; treat that as "no match".
block = dom.xpath("//*[@id='profile_block']") if dom is not None else []

if not block:
    # The XPath matched nothing, so there is nothing to index or print.
    print("No element with id 'profile_block' found in the response")
else:
    profile = block[0]
    t = etree.tostring(profile, encoding="utf-8", pretty_print=True)
    print(t.decode("utf-8"))

    t1 = profile.xpath("text()")  # direct text nodes of the current element
    print(t1)
    t2 = profile.xpath("a")  # <a> children of the current element
    # zip stops at the shorter sequence, so unmatched trailing items are skipped.
    for label, link in zip(t1, t2):
        print(f"{label}{link.text}")
from lxml import etree

# Sample HTML document used to demonstrate parsing and serialization with lxml.
htmldemo = '''
<meta charset="UTF-8"> <!-- for HTML5 -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<html><head><title>yoyo ketang</title></head><body><b><!--Hey, this in comment!--></b>
<p class="title"><b>yoyoketang</b></p><p class="yoyo">这里是我的微信公众号:yoyoketang <br>
<a href="http://www.cnblogs.com/yoyoketang/tag/fiddler/" class="sister" id="link1">fiddler教程</a><br>
<a href="http://www.cnblogs.com/yoyoketang/tag/python/" class="sister" id="link2">python笔记</a><br>
<a href="http://www.cnblogs.com/yoyoketang/tag/selenium/" class="sister" id="link3">selenium文档</a><br>
快来关注吧!</p>
<p class="story">...</p>
'''
# etree.HTML parses the HTML text into an element tree.
demo = etree.HTML(htmldemo)
# etree.tostring serializes the parsed tree back to markup (as bytes).
# encoding="utf-8" lets the Chinese text in the document be output correctly.
# pretty_print=True requests formatted (indented) output.
t = etree.tostring(demo, encoding="utf-8", pretty_print=True)
print(t.decode("utf-8"))
# 原文地址:https://www.cnblogs.com/xiaoyujuan/p/11304355.html
# 时间: 2024-10-19 04:56:26