import codecs
from xml.dom.minidom import Document

import requests
from bs4 import BeautifulSoup

doc = Document()


def getAllUrl(pageCount):
    # Build the list-page URL for the given page number.
    url = 'https://www.xxx.co/xxxx/{page}'
    return url.format(page=pageCount)


def getHtml(pageCount):
    # Fetch the page and return the response object.
    html = requests.get(getAllUrl(pageCount))
    return html


def writeXml(gName, gImg, wUrl):
    # Append one <person> record (name, image URL, detail-page URL) to the document.
    aperson = doc.createElement("person")
    people.appendChild(aperson)

    name = doc.createElement("name")
    aperson.appendChild(name)
    name.appendChild(doc.createTextNode(gName))

    img = doc.createElement("imgUrl")
    aperson.appendChild(img)
    img.appendChild(doc.createTextNode(gImg))

    weburl = doc.createElement("webUrl")
    aperson.appendChild(weburl)
    weburl.appendChild(doc.createTextNode(wUrl))


if __name__ == '__main__':
    # f = codecs.open('Conker.txt', 'w', 'utf-8')
    filename = "people.xml"
    f = codecs.open(filename, "w", 'utf-8')

    # Root element that collects all scraped records.
    people = doc.createElement("Actresses")
    doc.appendChild(people)

    for count in range(1, 1250):
        html = getHtml(count).text
        soup = BeautifulSoup(html, "lxml")
        imgs = soup.find_all("img")
        for tag in imgs:
            try:
                girlName = tag.attrs["title"]
                girlImage = tag.attrs["src"]
                webUrl = "https://www.xxx.co/xx/" + tag.attrs["src"].split('/')[-1][:-6]
                writeXml(girlName, girlImage, webUrl)
            except KeyError:
                # Skip <img> tags that lack a title or src attribute.
                continue
        print("Page " + str(count) + " finished!!!")

    f.write(doc.toprettyxml(indent=" "))
    f.close()
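For a quick sanity check of the output, here is a minimal sketch that reads the generated people.xml back with xml.dom.minidom and prints each record. It assumes the scrape above has already run and written the file with one <person> element per entry; the text_of helper is introduced here purely for illustration.

from xml.dom.minidom import parse

def text_of(person, tag):
    # Text content of the first <tag> child; strip any whitespace the
    # pretty-printer may have added around the text node.
    node = person.getElementsByTagName(tag)[0]
    return node.firstChild.nodeValue.strip() if node.firstChild else ""

dom = parse("people.xml")
for person in dom.getElementsByTagName("person"):
    print(text_of(person, "name"), text_of(person, "imgUrl"), text_of(person, "webUrl"))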