/**
 * crawler - a small web crawler built on Node.js.
 *
 * Usage: run `npm install`, start the server with `node app.js`, then open
 * localhost:3000 in a browser. Every request to `/` crawls the cnodejs.org
 * front page, fetches each listed topic, and writes the collected data to
 * data.js in the current working directory.
 */
var eventproxy = require('eventproxy');
var cheerio = require('cheerio');
var superagent = require('superagent');
var url = require('url');
var async = require('async');
var fs = require('fs');
var path = require('path');
var express = require('express');

var app = express();

var cnodeUrl = 'https://cnodejs.org/';

// Maximum number of topic pages requested at the same time.
var CONCURRENCY_LIMIT = 5;

// Output file for the crawl result, relative to the working directory.
var dataFile = path.join('./', 'data.js');

// Number of topic requests currently in flight (used for logging only).
var count = 0;

/**
 * Collects the absolute URL of every topic linked from the front page.
 *
 * @param {string} html - HTML of the cnodejs.org front page.
 * @returns {string[]} Absolute topic URLs in document order.
 */
function extractTopicUrls(html) {
  var $ = cheerio.load(html);
  var topicUrls = [];
  $('#topic_list .topic_title').each(function(idx, element) {
    var href = $(element).attr('href');
    // url.resolve() needs a string, so skip anchors that carry no href.
    if (href) {
      topicUrls.push(url.resolve(cnodeUrl, href));
    }
  });
  return topicUrls;
}

/**
 * Downloads one topic page. Used as the iterator for async.mapLimit.
 *
 * @param {string} topicUrl - Absolute URL of the topic page.
 * @param {function(Error=, Array=)} callback - Receives `[topicUrl, html]`
 *     on success, or the request error on failure.
 */
function fetchTopic(topicUrl, callback) {
  count++;
  superagent.get(topicUrl).end(function(err, content) {
    count--;
    console.log('现在的并发数是:' + count);
    if (err || !content) {
      // Report the failure instead of dereferencing a missing response.
      return callback(err || new Error('Empty response for ' + topicUrl));
    }
    callback(null, [topicUrl, content.text]);
  });
}

/**
 * Extracts the title and the first reply from a downloaded topic page.
 *
 * @param {Array} topic - A `[topicUrl, html]` pair produced by fetchTopic.
 * @returns {{title: string, href: string, comment: string}}
 */
function parseTopic(topic) {
  var topicUrl = topic[0];
  var $ = cheerio.load(topic[1]);
  return {
    title: $('.topic_full_title').text().trim(),
    href: topicUrl,
    comment: $('.reply_content').eq(0).text().trim()
  };
}

// GET / : crawl the front page, fetch every topic with bounded concurrency,
// persist the result to data.js and return it to the client.
app.get('/', function(req, res, next) {

  superagent.get(cnodeUrl).end(function(err, sres) {
    if (err) {
      // `return` is essential: without it execution falls through and
      // reads `.text` from an undefined response.
      return next(err);
    }

    // All topic URLs of the front page are known at this point.
    var topicUrls = extractTopicUrls(sres.text);

    async.mapLimit(topicUrls, CONCURRENCY_LIMIT, fetchTopic, function(err, pages) {
      if (err) {
        return next(err);
      }

      // All the wanted information is now available as plain objects.
      var result = pages.map(parseTopic);

      // Write the result to disk.
      fs.writeFile(dataFile, JSON.stringify(result), function(err) {
        if (err) {
          // Hand the error to Express rather than throwing from an async
          // callback, which would take the whole process down.
          return next(err);
        }
        console.log("Export Account Success!");

        // Read the file back only after the write has completed, so the
        // logged content is always the data that was just written.
        fs.readFile(dataFile, 'utf-8', function(err, content) {
          if (err) {
            return next(err);
          }
          console.log(content);
          // Finish the HTTP request; previously the browser was left hanging.
          res.json(result);
        });
      });
    });

  });

});


app.listen(3000, function() {
  console.log('listen to port 3000');
});
2015汇总","href":"https://cnodejs.org/topic/5696e43e6272216e51bff67e","comment":"桑大大,很赞👍"},{"title":"高能!你知道或者不知道的虚拟化巨头VMware北京需要Node.js一名","href":"https://cnodejs.org/topic/56ea75a4022b5ba142eec803","comment":"代大家问薪水多少"},{"title":"【阿里巴巴】2016年,新的起航!这里需要你,bingo","href":"https://cnodejs.org/topic/569f8fa08392272262331d40","comment":"这应该是杭州吧?北京的职位有没"},{"title":"nodejs初学,在学做一个网页爬虫,想请教如何用Nodejs将爬下来的html中的script、img、link、a等标签的外链也爬下来","href":"https://cnodejs.org/topic/56ea6845cb3a38d94214c5eb","comment":"nodejs可以用中间件node-jquery把抓过来的页面解析成html,然后有了jQuery你想干嘛都行的"},{"title":"ionicPopover有使用的同学没?上面大小放四个项或超过四个项比较合适,如果只有两个项,下面会留出好大的空白,好丑,怎么解决?","href":"https://cnodejs.org/topic/56ea735ea70420bd420d0054","comment":""},{"title":"有人用极光推送吗?里面那个setTags方法,当我设置好几个标签时,只有第一个标签起作用。有人遇到过这种情况没?","href":"https://cnodejs.org/topic/56e66bebd62bdb576d051d1f","comment":"看不懂你这代码,你要在node连接极光的服务端,通过极光服务端向不同tags的客户端发消息?"},{"title":"koa使用koa-static模块时碰到的问题","href":"https://cnodejs.org/topic/56ea660f022b5ba142eec7f8","comment":"这个是ES6的模板字符串.\n资料: http://es6.ruanyifeng.com/#docs/string#模板字符串"},{"title":"精华\n\n\n\n 浅谈cnode社区如何防止csrf攻击","href":"https://cnodejs.org/topic/5533dd6e9138f09b629674fd","comment":"学习了"},{"title":"express+mysql连接","href":"https://cnodejs.org/topic/56ea6a85a70420bd420d004c","comment":"连接池"},{"title":"[上海-闵行紫竹园区] 知名互联网公司招聘 Node.js 程序员 3 名 一年以上经验即可","href":"https://cnodejs.org/topic/56ea6ca6cb3a38d94214c5ed","comment":""},{"title":"精华\n\n\n\n JavaScript 资源大全中文版","href":"https://cnodejs.org/topic/56e8c95dcf7763a6045c4ae4","comment":"mark"},{"title":"[北京]教育类startup, 求技术合伙人二枚(还剩1枚), MEAN+公有云+敏捷开发,等风来","href":"https://cnodejs.org/topic/56ea67a7022b5ba142eec7f9","comment":""},{"title":"精华\n\n\n\n 基于 Node.js 的自动化持续集成","href":"https://cnodejs.org/topic/56e8d829cf7763a6045c4af8","comment":"想法真多~~"},{"title":"关于mongo 
中数组的删除问题$pull为啥删除不了","href":"https://cnodejs.org/topic/56ea66cfa70420bd420d0047","comment":""},{"title":"哪种付费vpn比较稳定?","href":"https://cnodejs.org/topic/53e5c890977012ba554b3e7b","comment":"代理99"},{"title":"用electron写了个douban.fm mac版本","href":"https://cnodejs.org/topic/56ea5f6f022b5ba142eec7f7","comment":""},{"title":"orm2的模型定义是写在数据库共用链接的地方 还是 写在每个业务model类里面?","href":"https://cnodejs.org/topic/56ea5f43022b5ba142eec7f6","comment":""},{"title":"做个调查,一下大家的前端项目都是如何部署的?","href":"https://cnodejs.org/topic/56ea580d45c032f5425e9b0f","comment":""},{"title":"前后端分离,使用nodejs做为JAVA和前端的中间层,session维护的问题","href":"https://cnodejs.org/topic/56e9177ca70420bd420cffcf","comment":"接口用session作鉴权?好吧。意思是node.js用来模拟登录后进行一系列的调用api吗?调用完了就可以抛弃登录状态?若是如此的话,看下request库,有cookie jaw功能,可以维护cookie状态\n来自酷炫的 CNodeMD"},{"title":"解决ssh的\"Write failed: Broken pipe\"问题","href":"https://cnodejs.org/topic/56ea506fcb3a38d94214c5df","comment":""},{"title":"分享一个buffer库, ebuffer","href":"https://cnodejs.org/topic/56ea4d06022b5ba142eec7f2","comment":""},{"title":"ubuntu和mac哪个更适合做node开发","href":"https://cnodejs.org/topic/56e774b5545c5c736d1238c0","comment":"看自己喜好\nrt"},{"title":"express怎么加权限?","href":"https://cnodejs.org/topic/56e8cce4833b7c8a0492e226","comment":"参考中间件的方式,在自己的中间件中判断是否有权限,有的话调用next函数进去下一层,没有权限则return"},{"title":"jade怎么共用header","href":"https://cnodejs.org/topic/56ea3e3a45c032f5425e9afc","comment":"layout.jade\nh1 Main\nblock content\nxx.jade\nextends layout\nblock content\n h2 users"},{"title":"ui-sref 
如何传递参数","href":"https://cnodejs.org/topic/5608b4d7272b724e5efefcfd","comment":"你这样的写法应该是没有问题的,取参数用$stateParams,给这个state对应的controller加上$stateParams参数,然后$stateParams.type、$stateParams.role"},{"title":"大家开发nodejs用的什么开发工具(IDE)?","href":"https://cnodejs.org/topic/5191ca8563e9f8a542bb4681","comment":"sublime2"},{"title":"分享一个开源博客-Jackblog","href":"https://cnodejs.org/topic/569cf8e7adf526da2aeb2421","comment":"看起来,好好的"},{"title":"nodejs模拟登录网站出错,请大神帮忙解决,成功后支付宝请喝咖啡!","href":"https://cnodejs.org/topic/56e7a642d62bdb576d051d7d","comment":"好悲催,竟无人作答,若有大神帮忙实现,支付宝请喝咖啡的哦。谢啦"},{"title":"linux安装nodejs v5.6.0","href":"https://cnodejs.org/topic/56e8d362833b7c8a0492e22a","comment":"直接去官网下最新的tar.gz 然后自己解压 设置下环境变量 看看能不能运行"},{"title":"写了篇有关CSRF的博客,大家多批评~","href":"https://cnodejs.org/topic/56c833f9d1e0945c614187e6","comment":"做安全的么。。。"},{"title":"json串转Object问题","href":"https://cnodejs.org/topic/56e9184f45c032f5425e9a8b","comment":"typeof Val 看看类型是不是字符串"},{"title":"nodejs如何创建定时任务","href":"https://cnodejs.org/topic/54239e50d13dece96d6a8953","comment":"setInterval(function() {\n //you task here\n}, 5 * 60 * 1000);"},{"title":"Swift,你不得不学的5个原因","href":"https://cnodejs.org/topic/56e929bc45c032f5425e9a96","comment":"强烈支持swift"},{"title":"mysql的orm哪个好用","href":"https://cnodejs.org/topic/56da9c4d820d3c9b3d63e342","comment":"node-mysql直接写SQL语句不好吗?"},{"title":"计算机开放电子书汇总","href":"https://cnodejs.org/topic/56e7fe6383cbb63b6d120352","comment":"mark\n自豪地采用 CNodeJS ionic"},{"title":"《Kubernetes集成外部服务实践》- 第三期Docker技术沙龙主题剖析系列【第一篇】","href":"https://cnodejs.org/topic/56ea1438022b5ba142eec7c8","comment":""},{"title":"demos 一个开源的web开发调试工具","href":"https://cnodejs.org/topic/541bfa40ad60405c1f1597a9","comment":"确实方便,不过po主博客貌似有点问题(⊙o⊙)哦"}] -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "crawler", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "app.js", 
6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "dependencies": { 10 | "async": "^0.9.0", 11 | "cheerio": "^0.17.0", 12 | "express": "^4.9.5", 13 | "superagent": "^0.20.0", 14 | "eventproxy": "^0.3.1" 15 | }, 16 | "author": "", 17 | "license": "ISC" 18 | } 19 | --------------------------------------------------------------------------------