最近在用 Node.js 編寫爬蟲的過程中,遇到了一個奇怪的問題:爬蟲代碼正常執行,但總是中途停止,不再繼續執行,也不報錯退出,而且發生的時間也很隨機,很頭疼。
開始分析是爆棧,然而檢查代碼,發現基于 event loop 的 Node.js 到處都是回調,很難爆棧。分析是內存占用過多卡死?定時執行 gc,也沒有用。最后終于發現,竟然是因為請求時未設置超時時間,導致回調無法執行,無法進行下一步。
然而 Node.js http 模塊不支持設置超時時間,只能偽實現一個。話不多說,上代碼。用到了 http 模塊、cheerio 模塊(用于將爬取到的網頁變為類 jq 訪問的文檔樹)、iconv-lite(用于改變網頁數據編碼,這里有個坑:http.get 默認會把網頁編碼設置為 utf-8,如果爬取網頁是 gbk 的,就會亂碼)。
const http = require("http");
const fs = require("fs");
const cheerio = require("cheerio");
const iconv = require('iconv-lite');
// Fetch `url`, decode the body as gb2312 and load it into cheerio, guarded by
// an inactivity watchdog so a silent server can never leave the crawler
// waiting forever for a callback.

// Abort the request when no response activity has been seen for this long.
const REQUEST_IDLE_TIMEOUT_MS = 20000;

// Handle of the pending watchdog timer; null while no watchdog is armed.
let request_timer = null;

/** Cancel the pending watchdog, if any. */
function stopWatchdog() {
  clearTimeout(request_timer);
  request_timer = null;
}

/**
 * (Re)start the watchdog. Called once when the request is issued and again on
 * every received chunk, so the limit applies to each period of silence rather
 * than only to the wait for the response headers.
 */
function armWatchdog() {
  clearTimeout(request_timer);
  request_timer = setTimeout(() => {
    console.log('Request Timeout.');
    // destroy() replaces the deprecated abort(). The error passed here is
    // delivered to the 'error' listener of the request, or of the response
    // when the headers have already arrived.
    req.destroy(new Error('Request Timeout.'));
  }, REQUEST_IDLE_TIMEOUT_MS);
}

/**
 * Shared failure handler for the request and the response stream.
 * @param {Error} err - the failure reported by Node.
 */
function onRequestFailed(err) {
  stopWatchdog();
  console.log("錯(cuò)誤暫停", err && err.message);
}

const req = http.get(url, (res) => {
  console.log("準(zhǔn)備獲取數(shù)據(jù)");

  // The body arrives in chunks; collect the raw Buffers and decode once at
  // the end so multi-byte characters split across chunks are not corrupted.
  const chunks = [];

  res.on("data", (chunk) => {
    console.log("數(shù)據(jù)獲取中");
    chunks.push(chunk);
    // Data is still flowing: push the deadline back.
    armWatchdog();
  });

  res.on("end", () => {
    stopWatchdog();
    // Decode with the charset the page declares in its <meta> tag
    // (gb2312 here) instead of assuming utf-8.
    const html = iconv.decode(Buffer.concat(chunks), 'gb2312');
    // Turn the markup into a jQuery-like document tree.
    const $ = cheerio.load(html);
    console.log("處理完畢");
    // Placeholder: the selector string is empty; fill in the selector for
    // the data you actually want to extract.
    const text = $('').eq(1).text();
  });

  // After the headers have arrived, a broken connection or a watchdog abort
  // is reported on the response object rather than on the request.
  res.on("error", onRequestFailed);

  // Whatever way the response finishes, make sure no timer is left behind.
  res.on("close", stopWatchdog);
});

// An 'error' listener is mandatory: without one a failed request throws and
// terminates the process.
req.on("error", onRequestFailed);

// Arm only after `req` exists, so the timer callback can never run against an
// unset request if http.get() throws synchronously.
armWatchdog();