1 from lxml import etree 2 text = "<div><p>nmsl</p><span>nmsl</span></div>" 3 def htmlstree(text): 4 html = etree.HTML(text) 5 result = etree.tostring(html) 6 print(result) 7 return result.decode(‘utf-8‘) 8 #解析html字符串并且会为标签自动加上<html><body></body></html> 9 def parseetree(): 10 parser = etree.HTMLParser(encoding=‘utf-8‘) 11 html = etree.parse("index111.html",parser=parser) 12 result = etree.tostring(html,encoding=‘utf-8‘).decode("utf-8") 13 print(result) 14 #解析xml 由于某写html标签会不全用普通的xml解析器会出错 如<br/> 所以要指定html解析器 15 if __name__ == ‘__main__‘: 16 parseetree()
以上为etree的使用范例
分别解析了html字符串和html文件
from lxml import etree def parseetree(): parser = etree.HTMLParser(encoding=‘utf-8‘) html = etree.parse("index111.html",parser=parser) trs = html.xpath("//a[@onclick][@id]") for tr in trs: result = etree.tostring(tr,encoding=‘utf-8‘).decode("utf-8") print(result) def parseetree1(): parser = etree.HTMLParser(encoding=‘utf-8‘) html = etree.parse("index111.html",parser=parser) tr = html.xpath("//a[@onclick][@id]")[3] result = etree.tostring(tr,encoding=‘utf-8‘).decode("utf-8") print(result) if __name__ == "__main__": parseetree() print("***************") parseetree1()
以上为运用xpath来对html进行解析
以下是运行结果
附:https://www.w3school.com.cn/xpath/xpath_syntax.asp xpath语法
原文地址:https://www.cnblogs.com/jswf/p/12287819.html
时间: 2024-10-29 07:30:49