I looked at what other people had written, then put together a crawler of my own.
Modules used:
utils.js --- moment
module_url.js
var http = require("http"); //fetch page data
var cheerio = require("cheerio"); //parse page data and extract content
var sanitize = require("validator"); //filter out useless data such as whitespace
var fs = require('fs'); //file operations, save the results
app.js
var async = require("async"); //async operations such as each, filter
var ts = require("timespans"); //measure elapsed time
var sanitize = require("validator"); //filter out useless data such as whitespace
Fetch the topic list of each page -- in parallel.
Fetch each topic's detail content from that list -- also in parallel, but the final output keeps the original order.
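The ordering trick is easiest to see in isolation. A minimal standalone sketch (the page numbers and delays are made up): each parallel task writes its result into a slot keyed by its own page, so the collected output keeps the input order even though the completion order is random.

var async = require("async");

var pages = [1, 2, 3];
var results = {};

async.forEach(pages, function(page, done) {
    //simulate a fetch that finishes after a random delay
    setTimeout(function() {
        results[page] = 'topics of page ' + page; //write into the slot for this page
        done(null);
    }, Math.random() * 300);
}, function(err) {
    //completion order was random, but results is keyed by page, so the output is ordered
    pages.forEach(function(page) {
        console.log(page, results[page]);
    });
});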
utils.js was copied from elsewhere; it wraps console.log to prepend the output time.

var moment = require('moment');

//async helpers: invoke the callback after a timeout
exports.inc = function(n, callback, timeout) {
    timeout = timeout || 200;
    setTimeout(function() {
        callback(null, n + 1);
    }, timeout);
};

exports.fire = function(obj, callback, timeout) {
    timeout = timeout || 200;
    setTimeout(function() {
        callback(null, obj);
    }, timeout);
};

exports.err = function(errMsg, callback, timeout) {
    timeout = timeout || 200;
    setTimeout(function() {
        callback(errMsg);
    }, timeout);
};

// utils
//console.log wrapper that prefixes each line with the current time (ss.SSS)
exports.log = function(msg, obj) {
    process.stdout.write(moment().format('ss.SSS') + '> ');
    if (obj !== undefined) {
        process.stdout.write(msg);
        console.log(obj);
    } else {
        console.log(msg);
    }
};

//busy-wait for the given number of milliseconds (blocks the event loop)
exports.wait = function(mils) {
    var now = new Date();
    while (new Date() - now <= mils);
};
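A quick usage sketch of the wrapped logger; the timestamps in the comments are just examples:

var log = require('./utils').log;

log('page fetched');          //prints e.g. "12.345> page fetched"
log('topic count: ', 40);     //prints e.g. "12.547> topic count: 40"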
Fetching page data (module_url.js)

//fetch page data
var http = require("http");
//parse page data and extract content
var cheerio = require("cheerio");
//filter out useless data such as whitespace
var sanitize = require("validator");
//file operations, save the results
var fs = require('fs');

var scrapy = {};

scrapy.get = function(url, callback) {
    http.get(url, function(res) {
        var size = 0;
        var chunks = [];
        res.on('data', function(chunk) {
            size += chunk.length;
            chunks.push(chunk);
        });
        res.on('end', function() {
            var data = Buffer.concat(chunks, size);
            callback(null, data);
        });
    }).on('error', function(e) {
        callback(e, null);
    });
};

var getPage = function(pageUrl, callback) {
    scrapy.get(pageUrl, function(err, data) {
        if (err) {
            return callback(err);
        }
        var html = data.toString();
        var $ = cheerio.load(html);
        //title links, pointing to the detail pages
        var news = $('.cell .topic_title_wrapper a');
        //pass $ along so the caller can inspect the matched elements
        callback(null, news, $);
    });
};

var getDetail = function(detailUrl, callback) {
    scrapy.get(detailUrl, function(err, data) {
        if (err) {
            return callback(err);
        }
        var html = data.toString();
        var $ = cheerio.load(html);
        var item = {};
        item.href = detailUrl;
        $('.header .topic_full_title .put_top').remove(); //remove the "pinned" label
        item.title = sanitize.escape(sanitize.trim($('.header .topic_full_title').text()));
        item.content = sanitize.escape(sanitize.trim($('.inner.topic .topic_content').text()));
        callback(null, item);
    });
};

var save = function(fileName, data) {
    var result = JSON.stringify(data);
    fs.writeFileSync(fileName, result);
};
exports.getUrl = scrapy.get;
exports.getPage = getPage;
exports.getDetail = getDetail;
exports.save = save;
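A minimal usage sketch of this module on its own (assuming the cnodejs.org list page layout used throughout this post):

var url = require('./module_url');

url.getPage('http://cnodejs.org/?page=1', function(err, news, $) {
    if (err) {
        return console.log('page error', err);
    }
    //news is the cheerio selection of topic title links on the list page
    console.log('found ' + news.length + ' topic links');
    console.log($(news[0]).text()); //title of the first topic
});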
Main file (app.js)

//custom console.log that prepends the output time
var utils = require("./utils");
var log = utils.log;
//async operations such as each, filter
var async = require("async");
//measure elapsed time
var ts = require("timespans");
//filter out useless data such as whitespace
var sanitize = require("validator");
var url = require("./module_url");
var baseUrl = 'http://cnodejs.org';
var pageUrl = baseUrl + '/?page=';
var isOnlyTitle = true;
var pages = [];
for (var i = 1; i < 4; i++) {
    pages.push(i);
}
ts.start();
var titles = {};
//pages are fetched in parallel
async.forEach(pages, function(page, callback_each) {
    titles[page] = [];
    url.getPage(pageUrl + page, function(err, news, $) {
        if (err) {
            log("page error");
            return callback_each(err);
        }
        if (news.length === 0) {
            log("no data for the page:" + page);
            return callback_each(null);
        }
        //iterate by index so each result can be stored in its original slot
        var indexes = [];
        for (var j = 0; j < news.length; j++) {
            indexes.push(j);
        }
        async.filter(indexes, function(index, callback) {
            var detailUrl = baseUrl + news[index].attribs['href'];
            if (isOnlyTitle) {
                var curNew = news[index];
                var item = {};
                item.href = detailUrl;
                $(curNew).find(".put_top").remove(); //remove the "pinned" label
                item.title = sanitize.escape(sanitize.trim($(curNew).text()));
                titles[page][index] = item;
                callback(true);
            } else {
                url.getDetail(detailUrl, function(err, item) {
                    if (err) {
                        log("detail error");
                        return callback(false);
                    }
                    titles[page][index] = item;
                    callback(true);
                });
            }
        }, function(result) {
            //log("filter news:", result);
            callback_each(null);
        });
    });
}, function(err) {
    ts.stop();
    //ts.pause(); --- ts.continue();
    console.log('total: %s pause: %s used: %s', ts.elapsedtime(), ts.pausetime(), ts.usedtime());
    log(titles);
    //url.save("cnodejs.json", titles);
});
Also: I'd like to crawl only the topics from a given time range; still working on that (a rough idea is sketched below).
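One rough direction, as a sketch only: keep a [from, to) window and drop items whose timestamp falls outside it. The item.time field and its date format below are hypothetical; the real timestamp would have to be scraped from the cnodejs.org detail page first, which this post has not done yet.

var moment = require('moment');

//item.time is a hypothetical field: a date string scraped from the detail page
var items = [
    { title: 'a', time: '2014-06-15 10:00' },
    { title: 'b', time: '2014-05-02 09:30' }
];

//keep only items whose time falls in the [from, to) window
var from = moment('2014-06-01');
var to = moment('2014-07-01');
var wanted = items.filter(function(item) {
    var t = moment(item.time, 'YYYY-MM-DD HH:mm');
    return t.isValid() && !t.isBefore(from) && t.isBefore(to);
});

console.log(wanted); //only 'a' remains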
Source: https://www.cnblogs.com/dfg727/p/3809615.html
