node創建一個簡單的網頁爬蟲
安裝node就可以了,創建index.js
用到request、fs、cheerio、iconv-lite、node-xlsx
request:發送請求
fs:操作文件
cheerio:node里面的jQuery,操作頁面dom
iconv-lite:處理獲取數據編碼問題
node-xlsx:將數據保存為xlsx文檔
文章以房天下的為例。
// HTTP client used to download each listing page
var request = require('request')
// jQuery-like API for querying the downloaded HTML
var cheerio = require('cheerio')
var iconv = require('iconv-lite'); // decodes the raw response body (GBK) into a string
var fs = require('fs')
var xlsx = require('node-xlsx')// builds the xlsx workbook buffer
/**
 * Builds a single-sheet workbook from the given rows and writes it to
 * `test1.xlsx`.
 *
 * @param {Array<Array<*>>} datas - Sheet content passed straight to `xlsx.build`
 *   (callers in this file pass an array of `[title, name, address]` rows).
 * @param {number|string} index - Suffix appended to "sheet" to form the sheet name.
 */
function writeXls(datas, index) {
  const sheet = { name: 'sheet' + index, data: datas };
  const workbook = xlsx.build([sheet]);
  // Flag 'w' opens the file for writing, so each call replaces the previous output.
  fs.writeFileSync('test1.xlsx', workbook, { flag: 'w' });
}
// Formatted text records, accumulated across all pages and written to test.txt.
const list = [];
// Spreadsheet rows, accumulated across all pages and written to test1.xlsx.
const datas = [];
/**
 * Downloads one listing page, extracts each listing's title, community name
 * and community address, appends them to the shared `datas` / `list`
 * accumulators, then rewrites `test1.xlsx` and `test.txt` with everything
 * collected so far.
 *
 * The request is asynchronous; the function returns immediately and the
 * files are written from the response callback.
 *
 * @param {string} url - Address of the listing page to download.
 * @param {number} index - Page number; used for the sheet name and in the log output.
 */
function getHotMovies(url, index) {
  // The body is requested undecoded (encoding: null) because it is decoded from GBK below.
  request({ url, encoding: null, gzip: true }, (err, res, body) => {
    if (err || res.statusCode !== 200) {
      console.info('網(wǎng)頁(yè)加載失敗', err);
      return;
    }

    const $ = cheerio.load(iconv.decode(body, 'GBK'));
    // Run each selector once instead of re-querying the document on every iteration.
    const titleNodes = $('.shop_list .tit_shop');
    const communityNameNodes = $('.shop_list .add_shop a');
    const communityAddrNodes = $('.shop_list .add_shop span');

    // Walk from the last match to the first (same order as before), pairing the
    // three selections by position.
    for (let i = titleNodes.length - 1; i >= 0; i -= 1) {
      const title = titleNodes.eq(i).text();
      const xiaoquName = communityNameNodes.eq(i).text().replace(/\s/g, "");
      const xiaoquAddr = communityAddrNodes.eq(i).text();

      datas.push([title, xiaoquName, xiaoquAddr]);
      // NOTE(review): the labels below contain stray pinyin annotations such as
      // "(qū)"; this looks like a copy artifact — confirm before cleaning them up.
      if (title) {
        list.push(`名字:${title}》》》小區(qū)名稱(chēng):${xiaoquName}》》》小區(qū)地址:${xiaoquAddr}\r\n`);
      }
    }

    writeXls(datas, index);
    // fs.writeFile expects a string or buffer. Passing the array itself is, depending
    // on the Node version, either coerced to a comma-joined string (stray commas
    // between records) or rejected, so join the records explicitly.
    fs.writeFile('test.txt', list.join(''), (writeErr) => {
      if (writeErr) throw writeErr;
      console.log('文件已被保存', index);
    });
  });
}
// The first page uses a different URL from the other pages, so it is requested separately.
getHotMovies('https://cd.esf.fang.com/integrate', 1); // page 1
// Pages 2 through 79 share the `/integrate/i3<page>/` URL pattern.
for (let page = 2; page <= 79; page += 1) {
  getHotMovies(`https://cd.esf.fang.com/integrate/i3${page}/`, page);
}
然后運行
node index.js
然后查看test1.xlsx、test.txt文件,就可以看到node爬下來的數據了。