from lxml import etree
##################### Basic usage:
#####################
# Sample login page markup used by the basic XPath examples below.
html = '''
<h1 class="header">登录</h1>
<form action="/login" method="post">
<label for="username">用户: </label><input type="text" name="username" />
<label for="password">密码:</label><input type="password" name="password" />
<input type="submit" value="Submit" />
</form>'''

# Build the DOM from the HTML string.
dom = etree.HTML(html)

# Extract text content: append /text() to the element path.
contents = dom.xpath('//h1[@class="header"]/text()')
print(contents)

# Extract an attribute value: append /@attrib to the element path.
attribs = dom.xpath('//form/label[@for="username"]/@for')
print(attribs)

##################### Advanced usage:
#####################
# Sample markup for the advanced examples: sibling tags whose ids share a
# common prefix, and a tag whose text is split across a nested child tag.
html2 = '''
<div class="content">
    ==> 有相同字符开头的属性的标签:
    <p id="test-1">需要的内容1</p>
    <p id="test-2">需要的内容2</p>
    <p id="test-default">需要的内容3</p>
</div>
<div class="question">
    ==> 签嵌套标签:
    <p id="class3">美女, <font color="red">你的微信号是多少?</font> </p>
</div>
'''

dom = etree.HTML(html2)

# Text of every tag whose attribute starts with the same characters:
# starts-with(@attrib, "abcd")
contents2 = dom.xpath('//p[starts-with(@id, "test")]/text()')
print(contents2)

# All text of a tag including the text of its nested tags: xpath('string(.)')
# evaluated on the element itself.
contents3 = dom.xpath('//div[@class="question"]/p')[0].xpath('string(.)')
# Drop newlines and spaces so the joined text prints as one compact string.
contents3 = contents3.replace('\n', '').replace(' ', '')
print(contents3)
# Time: 2024-10-30 13:06:32