1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
{ "name": "1", "version": "1.0.0", "description": "", "main": "index.js", "dependencies": { "fs": "^0.0.1-security", "https": "^1.0.0", "iconv-lite": "^0.4.21", "jsdom": "^11.7.0" }, "devDependencies": {}, "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, "author": "", "license": "ISC" } |
代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
// config/index.js module.exports = { options: { hostname: 'www.ozon.ru', port: 443, path: '/context/detail/id/144054492/', method: 'GET', headers: { 'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'if-modified-since':'Fri, 08 Jun 2018 03:42:08 GMT', 'referer':'https://www.ozon.ru/catalog/1133763/?type=48856', 'upgrade-insecure-requests':1, 'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36' } }, baseURL: 'https://www.ozon.ru', timeout: 3000 } // tool/generateExcel.js // fs 文件系统模块 var fs = require('fs') // 读取json中的数据,用String的concat函数拼接 var datas = require('../tmp/mid_output.json') var culs = new Object() // 数据处理函数,将数据中的','和'n'替换成','和';' var prepare = str => { if (str === undefined) { return null } else { // 这里的/,/g和/n/g是正则表达式 return str.replace(/,/g,',').replace(/n/g, ';') } } // 删除没有params的数据 var num = 0; for(var i = 0;i<datas.length;i++){ if(datas[i].params === undefined){ datas.splice(i,1); } } var num1 = 0; for(var i = 0;i<datas.length;i++){ if(datas[i].params == undefined){ num1 ++; } } datas.splice(datas.length-num1,num1); for (var data of datas) { // console.log(data) for (var param of data.params) { // 将datas中的所有的params的key,添加到culs中 if (culs[param.key] === undefined) culs[param.key] = true // 将datas中的params[i].key和params[i].value变成data[param.key] = param.value data[param.key] = param.value } // 删除data中的params delete data.params } // console.log(datas[0].prototype === datas[1].prototype) // console.log(culs) // console.log(datas) var columnsName = 'number,href,img,name,price,cnum' for (var key in culs) { columnsName += ',' + prepare(key) } columnsName += 'n' fs.writeFileSync(__dirname + '/../output/output.csv', columnsName, {flag: 'a'}, err => console.log(err)) var cnt = 1 for (var data of datas) { var str = '' str += cnt++ str += ',' + prepare(data.href) str += ',' + prepare(data.img) str += ',' + prepare(data.name) str += ',' + 
prepare(data.price) str += ',' + prepare(data.cnum) for (var key in culs) { str += ',' + prepare(data[key]) } str += 'n' // 同步的写文件,将str写到'output.txt',将flag设置为'a',即append,将数据追加到源文件结尾 fs.writeFileSync(__dirname + '/../output/output.csv', str, {flag: 'a'}, err => console.log(err)) } console.log(cnt) // tool/get.js const https = require('https') const fs = require('fs') const iconv = require('iconv-lite') const jsdom = require('jsdom') const { JSDOM } = jsdom var config = require('../config') var items = new Array() var getInput = () => { var result = fs.readFileSync(__dirname + '/../input/input.txt') const dom = new JSDOM(result.toString()) // const dom = new JSDOM('<div><div class="a">1<div/><div class="a"><div>2<div/><div/><div/>') var lines = dom.window.document.getElementsByClassName('bOneTile inline') var i = 1 for (var line of lines) { // console.log(line['href']) var href = line.getElementsByClassName('eOneTile_link')[0].href //链接找整个div参数data-itemid的值 //var id = line.getAttribute('data-itemid') //图片找第一个eOneTile_image_link的参数data-image-src var img = line.getElementsByClassName('eOneTile_image_link')[0].getAttribute('data-image-src') //商品名找整个div参数的data-name var name = line.getAttribute('data-name') //价钱找整个div参数的data-price var price = (line.getAttribute('data-price') !== undefined) ? line.getAttribute('data-price') : 'null' //评论数找eOneTile_ReviewsCount的innerhtml var cnum = (line.getElementsByClassName('eOneTile_ReviewsCount')[0] === undefined ? 
'0' : line.getElementsByClassName('eOneTile_ReviewsCount')[0].innerHTML) items.push({ href: href, img: img, name: name, price: price, cnum: cnum }) // console.log(i++ + ' ' + href + ' ' + cnum) } } var getDetail = idx => { var item = items[idx] config.options.path = item.href const req = https.get(config.options, res => { var datas = [] var size = 0 res.on('data', data => { datas.push(data) size += data.length // console.log(data) }) res.on('end', () => { var buff = Buffer.concat(datas, size) var result = iconv.decode(buff, 'win1251') // console.log(result) // fs.writeFile('out',result, err => console.log(err)) const dom = new JSDOM(result.toString()) // const dom = new JSDOM('<div><div class="a">1<div/><div class="a"><div>2<div/><div/><div/>') var lines = dom.window.document.getElementsByClassName('eItemProperties_line') item.params = new Array() for (var line of lines) { var key = line.childNodes[1].innerHTML var value = line.childNodes[3].innerHTML item.params.push({ key: key, value: value }) } }) }) req.end() config.options.headers.referer = config.baseURL + item.href // 这里修正了他的referer,模拟浏览器 } var getAllDetail = (idx, end) => () => { if (idx < end) { console.log(idx) getDetail(idx) setTimeout(getAllDetail(idx + 1, end), config.timeout) } else { // console.log(items) setOutput() } } var setOutput = () => { fs.writeFile(__dirname + '/../tmp/mid_output.json',JSON.stringify(items), err => console.log(err)) } getInput() setOutput() getAllDetail(0, items.length)() |
使用说明
1.打开Ozon产品列表
2.按F12,选择列表div,右键copy->copy element
3.打开input.txt,将数据删除后粘贴新的数据
4.Ozon.ru点击页面左上角安全Cookies全部禁用,点击一个产品刷新,F12点击Network找到第一个(数字串),复制request headers
5.粘贴到index.js,对比上下修改
6.运行 node tool/get.js 和 node tool/generateExcel.js
7.注意完成后将页面的cookies屏蔽解除
原文:大专栏 【实战】Ozon产品列表页及产品详情页nodejs爬虫
原文地址:https://www.cnblogs.com/chinatrump/p/11601740.html
时间: 2024-10-21 04:59:08