1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
{ "name": "1", "version": "1.0.0", "description": "", "main": "index.js", "dependencies": { "fs": "^0.0.1-security", "https": "^1.0.0", "iconv-lite": "^0.4.21", "jsdom": "^11.7.0" }, "devDependencies": {}, "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, "author": "", "license": "ISC" } |
代码
|
// config/index.js module.exports = { options: { hostname: 'www.ozon.ru', port: 443, path: '/context/detail/id/144054492/', method: 'GET', headers: { 'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'if-modified-since':'Fri, 08 Jun 2018 03:42:08 GMT', 'referer':'https://www.ozon.ru/catalog/1133763/?type=48856', 'upgrade-insecure-requests':1, 'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36' } }, baseURL: 'https://www.ozon.ru', timeout: 3000 } // tool/generateExcel.js // fs 文件系统模块 var fs = require('fs') // 读取json中的数据,用String的concat函数拼接 var datas = require('../tmp/mid_output.json') var culs = new Object() // 数据处理函数,将数据中的','和'n'替换成','和';' var prepare = str => { if (str === undefined) { return null } else { // 这里的/,/g和/n/g是正则表达式 return str.replace(/,/g,',').replace(/n/g, ';') } } // 删除没有params的数据 var num = 0; for(var i = 0;i<datas.length;i++){ if(datas[i].params === undefined){ datas.splice(i,1); } } var num1 = 0; for(var i = 0;i<datas.length;i++){ if(datas[i].params == undefined){ num1 ++; } } datas.splice(datas.length-num1,num1); for (var data of datas) { // console.log(data) for (var param of data.params) { // 将datas中的所有的params的key,添加到culs中 if (culs[param.key] === undefined) culs[param.key] = true // 将datas中的params[i].key和params[i].value变成data[param.key] = param.value data[param.key] = param.value } // 删除data中的params delete data.params } // console.log(datas[0].prototype === datas[1].prototype) // console.log(culs) // console.log(datas) var columnsName = 'number,href,img,name,price,cnum' for (var key in culs) { columnsName += ',' + prepare(key) } columnsName += 'n' fs.writeFileSync(__dirname + '/../output/output.csv', columnsName, {flag: 'a'}, err => console.log(err)) var cnt = 1 for (var data of datas) { var str = '' str += cnt++ str += ',' + prepare(data.href) str += ',' + prepare(data.img) str += ',' + prepare(data.name) str += ',' + 
prepare(data.price) str += ',' + prepare(data.cnum) for (var key in culs) { str += ',' + prepare(data[key]) } str += 'n' // 同步的写文件,将str写到'output.txt',将flag设置为'a',即append,将数据追加到源文件结尾 fs.writeFileSync(__dirname + '/../output/output.csv', str, {flag: 'a'}, err => console.log(err)) } console.log(cnt) // tool/get.js const https = require('https') const fs = require('fs') const iconv = require('iconv-lite') const jsdom = require('jsdom') const { JSDOM } = jsdom var config = require('../config') var items = new Array() var getInput = () => { var result = fs.readFileSync(__dirname + '/../input/input.txt') const dom = new JSDOM(result.toString()) // const dom = new JSDOM('<div><div class="a">1<div/><div class="a"><div>2<div/><div/><div/>') var lines = dom.window.document.getElementsByClassName('bOneTile inline') var i = 1 for (var line of lines) { // console.log(line['href']) var href = line.getElementsByClassName('eOneTile_link')[0].href //链接找整个div参数data-itemid的值 //var id = line.getAttribute('data-itemid') //图片找第一个eOneTile_image_link的参数data-image-src var img = line.getElementsByClassName('eOneTile_image_link')[0].getAttribute('data-image-src') //商品名找整个div参数的data-name var name = line.getAttribute('data-name') //价钱找整个div参数的data-price var price = (line.getAttribute('data-price') !== undefined) ? line.getAttribute('data-price') : 'null' //评论数找eOneTile_ReviewsCount的innerhtml var cnum = (line.getElementsByClassName('eOneTile_ReviewsCount')[0] === undefined ? 
'0' : line.getElementsByClassName('eOneTile_ReviewsCount')[0].innerHTML) items.push({ href: href, img: img, name: name, price: price, cnum: cnum }) // console.log(i++ + ' ' + href + ' ' + cnum) } } var getDetail = idx => { var item = items[idx] config.options.path = item.href const req = https.get(config.options, res => { var datas = [] var size = 0 res.on('data', data => { datas.push(data) size += data.length // console.log(data) }) res.on('end', () => { var buff = Buffer.concat(datas, size) var result = iconv.decode(buff, 'win1251') // console.log(result) // fs.writeFile('out',result, err => console.log(err)) const dom = new JSDOM(result.toString()) // const dom = new JSDOM('<div><div class="a">1<div/><div class="a"><div>2<div/><div/><div/>') var lines = dom.window.document.getElementsByClassName('eItemProperties_line') item.params = new Array() for (var line of lines) { var key = line.childNodes[1].innerHTML var value = line.childNodes[3].innerHTML item.params.push({ key: key, value: value }) } }) }) req.end() config.options.headers.referer = config.baseURL + item.href // 这里修正了他的referer,模拟浏览器 } var getAllDetail = (idx, end) => () => { if (idx < end) { console.log(idx) getDetail(idx) setTimeout(getAllDetail(idx + 1, end), config.timeout) } else { // console.log(items) setOutput() } } var setOutput = () => { fs.writeFile(__dirname + '/../tmp/mid_output.json',JSON.stringify(items), err => console.log(err)) } getInput() setOutput() getAllDetail(0, items.length)() |
使用说明
1.打开Ozon产品列表
2.按F12,选择列表div,右键copy->copy element
3.打开input.txt,将数据删除后粘贴新的数据
4.Ozon.ru点击页面左上角安全Cookies全部禁用,点击一个产品刷新,F12点击Network找到第一个(数字串),复制request headers
5.粘贴到config/index.js的headers中,对比原有内容逐项修改
6.运行 node tool/get.js 和 node tool/generateExcel.js
7.注意完成后将页面的cookies屏蔽解除
原文:大专栏 【实战】Ozon产品列表页及产品详情页nodejs爬虫
原文地址:https://www.cnblogs.com/chinatrump/p/11601740.html
时间: 2024-10-21 04:59:08