使用nodejs的puppeteer库爬取瓜子二手车网站

const puppeteer = require(‘puppeteer‘);

(async () => {
    const fs = require("fs");
    const rootUrl = ‘https://www.guazi.com‘
    const workPath = ‘./contents‘;

    if (!fs.existsSync(workPath)) {
        fs.mkdirSync(workPath)
    }
    const browser = await (puppeteer.launch({ headless: false }));
    const page = await browser.newPage()
    await page.setViewport({ width: 1128, height: 736 });
    await page.setRequestInterception(true); // 拦截器
    page.on(‘request‘, request => { //拦截图片
        if (request.resourceType() === ‘image‘)
            request.abort();
        else
            request.continue();
    });

    await page.goto("https://www.guazi.com/fuzhou/buy")

    const m_cityList = await page.evaluate(() => { //           获取所有城市
        const elements = Array.from(document.querySelectorAll(‘.all-city dl‘))
        return elements.map(s => {
            let dd = s.getElementsByTagName("dd").item(0)
            let ddList = []
            for (let i = 0; i < dd.getElementsByTagName("a").length; i++) {
                ddList.push({
                    "cityName": dd.getElementsByTagName("a").item(i).innerHTML,
                    "url": dd.getElementsByTagName("a").item(i).getAttribute("href")
                })
            }
            return ddList
        })
    })
    //数组扁平化
    const flattenNew = arr => arr.reduce((prev, next) => Object.prototype.toString.call(next) == ‘[object Array]‘ ? prev.concat(flattenNew(next)) : prev.concat(next), [])
    const cityList = flattenNew(m_cityList)
    console.log("城市列表爬取完毕")
    await page.waitFor(2000 + Math.round(Math.random() * 100))
    for (let i = 0; i < cityList.length; i++) {
        await page.waitFor(1000 + Math.round(Math.random() * 100))
        await page.goto(rootUrl + cityList[i].url)
        console.log("跳转到" + cityList[i].cityName)
        console.log("开始爬取" + cityList[i].cityName + "的所有二手车品牌")
        try {
            let brandList = await page.evaluate(() => {  //品牌
                let Array = []
                const dl = document.querySelectorAll(‘.screen‘).item(0).getElementsByTagName("dl")
                const div = dl.item(0).getElementsByTagName("dd").item(0).getElementsByTagName("div").item(1)
                const ul = div.getElementsByTagName("ul")
                for (let i = 0; i < ul.length; i++) {
                    let li = ul.item(i).getElementsByTagName("li")
                    for (let j = 0; j < li.length; j++) {
                        let a = li.item(j).getElementsByTagName("p").item(0).getElementsByTagName("a")
                        for (let k = 0; k < a.length; k++) {
                            Array.push({
                                "brand": a.item(k).innerHTML,
                                "url": a.item(k).getAttribute("href")
                            })
                        }
                    }
                }
                return Array
            })

            console.log(cityList[i].cityName + "的所有二手车品牌爬取完毕")

            for (let j = 0; j < brandList.length; j++) {

                console.log("开始爬取" + cityList[i].cityName + "-" + brandList[j].brand + "的所有车系")
                await page.waitFor(1000 + Math.round(Math.random() * 100))
                await page.goto(rootUrl + brandList[j].url)

                try {
                    const carTypeList = await page.evaluate(() => {  //车型
                        let Array = []
                        const dl = document.querySelectorAll(‘.screen‘).item(0).getElementsByTagName("dl")
                        const div = dl.item(1).getElementsByTagName("dd").item(0).getElementsByTagName("div").item(1)
                        const li = div.getElementsByTagName("ul").item(0).getElementsByTagName("li")

                        for (let j = 0; j < li.length; j++) {
                            let a = li.item(j).getElementsByTagName("p").item(0).getElementsByTagName("a")
                            for (let k = 0; k < a.length; k++) {
                                Array.push({
                                    "carType": a.item(k).innerHTML.replace(/\s*/g, ""),
                                    "url": a.item(k).getAttribute("href")
                                })
                            }
                        }
                        return Array
                    })

                    console.log(cityList[i].cityName + "-" + brandList[j].brand + "的所有车系爬取完毕")
                    for (let k = 0; k < carTypeList.length; k++) {
                        await page.waitFor(1000 + Math.round(Math.random() * 100))
                        console.log("开始爬取" + cityList[i].cityName + "-" + brandList[j].brand + "-" + carTypeList[k].carType + "的所有二手车")
                        let newUrl = rootUrl + carTypeList[k].url

                        pathArray = newUrl.split("/") //拿到第一页url,得到后面的页面的url
                        let urlArray = []

                        try {
                            await page.goto(newUrl)
                            const pageNum = await page.evaluate(() => {   //获取总页数
                                let li = document.querySelectorAll("ul.pageLink").item(0).getElementsByTagName("li")
                                let liNum = li.length
                                return li.item(li.length - 2).getElementsByTagName("a").item(0).getElementsByTagName("span").item(0).innerHTML
                            })

                            for (let i = 1; i <= pageNum; i++) { //将所有的页面存入数组中
                                urlArray.push(newUrl.replace(new RegExp("/" + pathArray[pathArray.length - 1], ‘g‘), "/o" + i + "/" + pathArray[pathArray.length - 1]))
                            }

                        } catch (error) {
                            console.log(cityList[i].cityName + "-" + brandList[j].brand + "-" + carTypeList[k].carType + "的所有二手车列表爬取失败,该车型可能只有少量或者没有")
                        }
                        if (urlArray.length != 0) {
                            for (let i = 0; i < urlArray.length; i++) {

                                await page.goto(urlArray[i]);

                                const list = await page.evaluate(() => {
                                    let carArray = []
                                    let li = document.querySelectorAll("ul.carlist").item(0).getElementsByTagName("li")
                                    for (let i = 0; i < li.length; i++) {
                                        a = li.item(i).getElementsByTagName("a").item(0)
                                        carArray.push({
                                            "url": a.getAttribute("href"),
                                            "imgUrl": a.getElementsByTagName("img").item(0).getAttribute("src"),
                                            "carName": a.getElementsByTagName("h2").item(0).innerHTML,
                                            "carData": (a.getElementsByTagName("div").item(0).innerHTML).replace(new RegExp(‘<span class="icon-pad">‘, ‘g‘), "").replace(new RegExp(‘</span>‘, ‘g‘), ""),
                                            "price": a.getElementsByTagName("div").item(1).getElementsByTagName("p").item(0).innerHTML.replace(new RegExp(‘<span>‘, ‘g‘), "").replace(new RegExp(‘</span>‘, ‘g‘), "").replace(/\s*/g, "")
                                        })

                                    }
                                    return carArray
                                })
                                await page.waitFor(500 + Math.round(Math.random() * 100))
                                console.log(list)
                            }
                        }else{
                            try {

                                const list = await page.evaluate(() => {
                                    let carArray = []
                                    let li = document.querySelectorAll("ul.carlist").item(0).getElementsByTagName("li")
                                    console.log("该车型少量")
                                    for (let i = 0; i < li.length; i++) {

                                        a = li.item(i).getElementsByTagName("a").item(0)
                                        carArray.push({
                                            "url": a.getAttribute("href"),
                                            "imgUrl": a.getElementsByTagName("img").item(0).getAttribute("src"),
                                            "carName": a.getElementsByTagName("h2").item(0).innerHTML,
                                            "carData": (a.getElementsByTagName("div").item(0).innerHTML).replace(new RegExp(‘<span class="icon-pad">‘, ‘g‘), "").replace(new RegExp(‘</span>‘, ‘g‘), ""),
                                            "price": a.getElementsByTagName("div").item(1).getElementsByTagName("p").item(0).innerHTML.replace(new RegExp(‘<span>‘, ‘g‘), "").replace(new RegExp(‘</span>‘, ‘g‘), "").replace(/\s*/g, "")
                                        })

                                    }
                                    return carArray
                                })
                                await page.waitFor(500 + Math.round(Math.random() * 100))
                                console.log(list)
                            } catch (error) {
                                console.log("该车型没有")
                            }

                        }

                    }

                } catch (error) {
                    console.log(cityList[i].cityName + "-" + brandList[i].brand + "的所有车系爬取失败")
                }

            }
        } catch (error) {
            console.log(cityList[i].cityName + "二手车品牌爬取失败")
        }
        await page.waitFor(1000 + Math.round(Math.random() * 100))
    }
})();

时间比较赶,先附上代码和运行截图

有兴趣的可以 看一下项目地址

https://gitee.com/xu_hui_hong/nodejs_puppeteer_guazi2

原文地址:https://www.cnblogs.com/BlackFungus/p/12199043.html

时间: 2024-10-15 02:58:30

使用nodejs的puppeteer库爬取瓜子二手车网站的相关文章

使用node爬虫,爬取指定排名网站的JS引用库

前期准备 本爬虫将从网站爬取排名前几的网站,具体前几名可以具体设置,并分别爬取他们的主页,检查是否引用特定库. github地址 所用到的node主要模块 express 不用多说 request http模块 cheerio 运行在服务器端的jQuery node-inspector node调试模块 node-dev 修改文件后自动重启app 关于调试Node 在任意一个文件夹,执行node-inspector,通过打开特定页面,在页面上进行调试,然后运行app,使用node-dev app

scrapy爬取西刺网站ip

# scrapy爬取西刺网站ip # -*- coding: utf-8 -*- import scrapy from xici.items import XiciItem class XicispiderSpider(scrapy.Spider): name = "xicispider" allowed_domains = ["www.xicidaili.com/nn"] start_urls = ['http://www.xicidaili.com/nn/']

用requests库和BeautifulSoup4库爬取新闻列表

1.用requests库和BeautifulSoup4库,爬取校园新闻列表的时间.标题.链接.来源. import requests from bs4 import BeautifulSoup mt="http://news.gzcc.cn/html/xiaoyuanxinwen/" res=requests.get(mt) res.encoding='utf-8' soup=BeautifulSoup(res.text,"html.parser") for new

requests库和BeautifulSoup4库爬取新闻列表

画图显示: import jieba from wordcloud import WordCloud import matplotlib.pyplot as plt txt = open("zuihou.txt","r",encoding='utf-8').read() wordlist = jieba.lcut(txt) wl_split=" ".join(wordlist) mywc = WordCloud().generate(wl_spl

BeautifuSoup库爬取美女图片

爬虫模块(从网页上采集数据 数据放置在网页标签里面)1.requests2.BeautifuSoup3.urllib4.urllib25.scrapy6.lxml爬取步骤1.获取标签的内容数据 :<div> <title> <a> ....找到标签里面的内容 soup.div 2.打开网页获取文件的内容soup.prettify() //打印本地文件的内容 3.html源代码相同标签很多,怎么获取到我想要的那一部分内容网页名字 class id find:查找标签e =

requests库爬取猫眼电影“最受期待榜”榜单 --网络爬虫

目标站点:https://maoyan.com/board/6 # coding:utf8 import requests, re, json from requests.exceptions import RequestException # from multiprocessing import Pool # 获取页面 def get_one_page(url): try: resp = requests.get(url) if resp.status_code == requests.co

python requests库爬取网页小实例:ip地址查询

ip地址查询的全代码: 智力使用ip183网站进行ip地址归属地的查询,我们在查询的过程是通过构造url进行查询的,将要查询的ip地址以参数的形式添加在ip183url后面即可. #ip地址查询的全代码 import requests url="http://m.ip138.com/ip.asp?ip=" try: r=requests.get(url+'202.204.80.112') r.raise_for_status() r.encoding=r.apparent_encodi

使用beautifulSoup库爬取小说

项目地址: https://gitee.com/knightdreams/CrawNovel 爬取的网址: https://www.biqukan.net/ 原文地址:https://www.cnblogs.com/knightdreams6/p/11969798.html

Python爬取知乎网站

1 import urllib.request 2 import re 3 4 url = 'http://daily.zhihu.com/' 5 def get_html(url): 6 html = urllib.request.urlopen(url).read() 7 html = html.decode('utf-8') 8 return html 9 10 def get_url_num(html): 11 res = re.compile('<a href="/story/(