from lxml import html


def _first(nodes, default=None):
    """Return the first item of an XPath result list, or *default* if empty."""
    return nodes[0] if nodes else default


def parse(path='./venv/static_/index.html'):
    """Extract content from an HTML file with XPath and print each result.

    Args:
        path: Location of the HTML document to read. Defaults to the path
            that was hard-coded in the original script.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read the whole document up front; the context manager closes the file
    # even if reading fails.
    with open(path, 'r', encoding='utf-8') as f:
        selector = html.fromstring(f.read())

    # Heading text. xpath() returns a list, so take the first match
    # (None is printed when nothing matches instead of raising IndexError).
    h3 = selector.xpath('/html/body/h3/text()')
    print(_first(h3))

    # Items of the <ul>; the original notes '//ul/li' as an alternative path.
    ul = selector.xpath('/html/body/ul/li')
    print(len(ul))
    for li in ul:
        # An <li> without direct text yields an empty list.
        print(_first(li.xpath('text()')))

    # Table cell text could be read with
    # '/html/body/form/table/tr/td/text()' (left disabled in the original).

    # Text of the <li> elements carrying class="important" (printed as a list).
    ul2 = selector.xpath('/html/body/ul/li[@class="important"]/text()')
    print(ul2)

    # Link text inside <div id="container">.
    a = selector.xpath('//div[@id="container"]/a/text()')
    print(_first(a))

    # The same link's href attribute.
    alink = selector.xpath('//div[@id="container"]/a/@href')
    print(_first(alink))

    # Paragraph text; '/html/body/p[last()]/text()' would select only the
    # last <p>.
    p = selector.xpath('/html/body/p/text()')
    print(len(p))
    print(_first(p))

    # The browser's XPath generator produced
    # '/html/body/form/table/tbody/tr[1]/th'; it is only a reference, and
    # the path used here omits the 'tbody' step.
    test = selector.xpath('/html/body/form/table/tr[1]/th/text()')
    print(_first(test))


if __name__ == '__main__':
    parse()
# Original post: https://www.cnblogs.com/zsjlovewm/p/11106458.html
# Time: 2024-10-09 22:34:16