├── .gitignore ├── README.md ├── config ├── config.article.js ├── config.mail.js └── config.mail.sample.js ├── index.js ├── package.json └── src ├── config.js ├── crawler.js └── mailer.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | config/config.mail.js -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Techweekly 2 | 3 | 高可配的技术周报邮件推送工具。 4 | 5 | ![](http://img002.qufenqi.com/products/7c/ba/7cbac61933c8e8d21fe2e3915d4618e4.jpeg) 6 | 7 | ## 快速入门 8 | 9 | 第一步,下载代码,安装依赖: 10 | ```shell 11 | $ git clone https://github.com/xiongwilee/Techweekly.git 12 | $ cd Techweekly && npm install --registry=https://registry.npm.taobao.org 13 | ``` 14 | 15 | 第二步,修改邮件配置`config/config.mail.js`: 16 | ```javascript 17 | module.exports = { 18 | "sender": { 19 | "host": "邮箱服务器host", 20 | "port": "邮箱服务器端口号", 21 | "auth": { 22 | "user": "邮箱地址", 23 | "pass": "邮箱密码" 24 | } 25 | }, 26 | "subject": "邮件主题", 27 | "from": "你的名字 <邮箱地址>", 28 | "to": ["收件人邮箱地址"] 29 | } 30 | ``` 31 | 32 | 或者,你也可以直接使用默认的邮箱配置`config.mail.sample.js`,将`config.mail.sample.js`重命名为`config.mail.js` 33 | 34 | 第三步,发送周报邮件: 35 | ```shell 36 | $ node index.js 37 | ``` 38 | 39 | 40 | **FYI:** 41 | 42 | 如果你需要定时发送邮件(例如每周五 10:00 发送一次),推荐使用`crontab`: 43 | ```shell 44 | 0 10 * * 5 cd /your/project/path/ && node index.js 45 | ``` 46 | 47 | 48 | ## 贡献 49 | 50 | Techweekly默认支持[fex](https://github.com/zenany/weekly/tree/master/software/)和[75team](https://weekly.75team.com/)两个默认周报源,你可以根据自己的需求配置周报来源: 51 | ```javascript 52 | "源ID(可以配置任意字符)": { 53 | /** 54 | * 页面链接,可以是一个string, 也可以是function,如果是function则: 55 | * @return {String} 页面URL 56 | */ 57 | url: function() {}, 58 | 59 | /** 60 | * 通过url获取文章内容URL的方法 61 | * @param {string} html 通过页面链接爬取到的页面html 62 | * @return {String} 从html中解析到的文章内容的链接 63 | */ 64 | getLink: function(html) {}, 65 | 66 | /** 67 | * 通过文章内容的链接爬取到文章主体 68 
| * @param {String} html 通过文章内容的链接爬取到文章的html 69 | * @return {String} 文章主体部分的html 70 | */ 71 | getContent: function(html) {} 72 | } 73 | ``` 74 | 75 | **FYI:** 76 | 77 | 在`getLink`和`getContent`方法里,你可以直接使用[cheerio](https://github.com/cheeriojs/cheerio#cheerio)来解析DOM。 78 | 79 | ## 作者 80 | 81 | * [xiongwilee](https://github.com/xiongwilee) 82 | 83 | -------------------------------------------------------------------------------- /config/config.article.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 文章源配置文件 3 | * @author xiongwilee 4 | */ 5 | 6 | 'use strict'; 7 | 8 | const cheerio = require("cheerio"); 9 | const url_opera = require('url'); 10 | 11 | module.exports = { 12 | "fex": { 13 | /** 14 | * 页面连接,可以是一个string, 也可以是function 15 | * @return {String} 页面URL 16 | */ 17 | url: function() { 18 | let year = new Date().getFullYear(); 19 | return `https://github.com/zenany/weekly/blob/master/software/${year}/` 20 | }, 21 | 22 | /** 23 | * 通过url获取文章内容URL的方法 24 | * @param {string} html 通过页面连接爬取到的页面html 25 | * @return {String} 从html中解析到的文章内容的链接 26 | */ 27 | getLink: function(html) { 28 | try { 29 | let curLink = 'https://github.com/'; 30 | 31 | let $ = cheerio.load(html); 32 | let links = $('table.files .content a'); 33 | for (let i = links.length; i > 0; i--) { 34 | let url = $(links[i - 1]).attr('href'); 35 | 36 | // 匹配这种类型的URL: /zenany/weekly/blob/master/software/2017/0220.md 37 | let urlReg = /.\/[\d]+\.md/g; 38 | if (/.\/[\d]+\.md/g.test(url)) return url_opera.resolve(curLink , url); 39 | } 40 | } catch (err) { 41 | return; 42 | } 43 | }, 44 | 45 | /** 46 | * 通过文章内容的链接爬取到文章主体 47 | * @param {String} html 通过文章内容的链接爬取到文章的html 48 | * @return {String} 文章主体部分的html 49 | */ 50 | getContent: function(html) { 51 | let $ = cheerio.load(html); 52 | try { 53 | let html = $('.entry-content').html(); 54 | html = html.replace('

-- THE END --

', ''); 55 | return html; 56 | } catch (err) { 57 | return; 58 | } 59 | } 60 | }, 61 | "75team": { 62 | url: "https://weekly.75team.com/", 63 | getLink: function(html) { 64 | try { 65 | let curLink = 'https://weekly.75team.com/'; 66 | 67 | let urlMatch = html.match(/href\=\'(issue\d+\.html)/); 68 | if (urlMatch) { 69 | return url_opera.resolve(curLink , urlMatch[1]) 70 | } else { 71 | return; 72 | } 73 | /* 这个页面下的html注释写成了 导致cheerio不识别,改用正则 74 | let $ = cheerio.load(html); 75 | return curLink + $('.issue-list li:first-child a').attr('href'); 76 | */ 77 | } catch (err) { 78 | return; 79 | } 80 | }, 81 | getContent: function(html) { 82 | let $ = cheerio.load(html); 83 | try { 84 | let contentDom = $('#main #content>ul'); 85 | return contentDom.html(); 86 | } catch (err) { 87 | return; 88 | } 89 | } 90 | } 91 | } -------------------------------------------------------------------------------- /config/config.mail.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 邮箱配置文件 3 | * @author xiongwilee 4 | */ 5 | 6 | module.exports = { 7 | "sender": { 8 | "host": "smtp.163.com", 9 | "port": 465, 10 | "auth": { 11 | "user": "wileetest04@163.com", 12 | "pass": "123qwe" 13 | } 14 | }, 15 | "subject": "每周技术文章推荐", 16 | "from": "xiongwilee ", 17 | "to": ["xiongwilee@foxmail.com"] 18 | } 19 | -------------------------------------------------------------------------------- /config/config.mail.sample.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 邮箱配置案例 3 | * @author xiongwilee 4 | */ 5 | 6 | module.exports = { 7 | "sender": { 8 | "host": "smtp.163.com", 9 | "port": 465, 10 | "auth": { 11 | "user": "wileetest04@163.com", 12 | "pass": "123qwe" 13 | } 14 | }, 15 | "subject": "每周技术文章推荐", 16 | "from": "xiongwilee ", 17 | "to": ["xiongwilee@foxmail.com"] 18 | } 19 | -------------------------------------------------------------------------------- /index.js: 
-------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const config = require('./src/config'); 4 | const crawler = require('./src/crawler'); 5 | const mailer = require('./src/mailer'); 6 | 7 | let mailConfig = config.getConfig('mail'); 8 | let articleConfig = config.getConfig('article'); 9 | 10 | crawler(articleConfig, (content) => { 11 | mailer.sendMail(mailConfig, content); 12 | }) 13 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "techweekly", 3 | "version": "1.0.0", 4 | "description": "每周技术文章推荐爬虫工具", 5 | "main": "index.js", 6 | "dependencies": { 7 | "request": "^2.79.0", 8 | "cheerio": "^0.20.0", 9 | "nodemailer": "^2.7.0" 10 | }, 11 | "devDependencies": {}, 12 | "scripts": { 13 | "test": "echo \"Error: no test specified\" && exit 1" 14 | }, 15 | "repository": { 16 | "type": "git", 17 | "url": "git+https://github.com/xiongwilee/tech-weekly-crawler.git" 18 | }, 19 | "keywords": [ 20 | "tech", 21 | "weekly", 22 | "crawler" 23 | ], 24 | "author": "xiongwilee", 25 | "license": "MIT", 26 | "bugs": { 27 | "url": "https://github.com/xiongwilee/tech-weekly-crawler/issues" 28 | }, 29 | "homepage": "https://github.com/xiongwilee/tech-weekly-crawler#readme" 30 | } 31 | -------------------------------------------------------------------------------- /src/config.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const path = require('path'); 4 | 5 | let config = { 6 | article: __dirname + '/../config/config.article.js', 7 | mail: __dirname + '/../config/config.mail.js' 8 | } 9 | 10 | /** 11 | * 获取config 12 | * @param {string} type 配置类型 13 | */ 14 | exports.getConfig = function(type) { 15 | if (!config[type]) return; 16 | 17 | let filePath = path.resolve(config[type]); 18 | 19 | return require(filePath); 20 | } 
-------------------------------------------------------------------------------- /src/crawler.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const request = require("request"); 4 | 5 | /** 6 | * 通过文章列表配置和爬虫获取到每周技术内容,生成html 7 | * @param {Object} articleConfig 文章列表配置 8 | * @param {Function} callback 回调函数 9 | * @return {Undefined} 10 | */ 11 | function crawler(articleConfig, callback) { 12 | let promiseAll = allArticle(articleConfig); 13 | 14 | promiseAll.then((linkList) => { 15 | 16 | let promiseList = []; 17 | 18 | linkList.forEach((article) => { 19 | if (!article.linkBody) return; 20 | 21 | let contentPromise = getContentPromise(article); 22 | contentPromise && promiseList.push(contentPromise); 23 | }) 24 | 25 | Promise.all(promiseList).then((articleList) => { 26 | let contentList = []; 27 | 28 | articleList.forEach((article) => { 29 | if (!article.articleBody) return; 30 | 31 | let content = article.getContent(article.articleBody) 32 | if (!content) { 33 | console.error(`解析文章内容失败:${article.contentLink}`) 34 | return; 35 | } 36 | 37 | contentList.push(Object.assign(article, { 38 | articleHtml: content 39 | })) 40 | }) 41 | 42 | let htmlContent = getMailHtml(contentList); 43 | callback(htmlContent) 44 | }) 45 | }) 46 | } 47 | 48 | /** 49 | * 通过articleList获取HTML片段 50 | * @param {Array} contentList 获取到的文章列表 51 | * @return {String} 生成的HTML内容 52 | */ 53 | function getMailHtml(contentList) { 54 | let htmlContent = ''; 55 | contentList.forEach((article) => { 56 | htmlContent += `

▼ 来源: ${article.contentLink}

`; 57 | htmlContent += article.articleHtml; 58 | htmlContent +=`
` 59 | }) 60 | 61 | htmlContent += `

该技术周报由Techweekly强力驱动

`; 62 | 63 | return htmlContent 64 | } 65 | 66 | /** 67 | * 通过页面连接获取文档内容的HTML 68 | * @param {Object} article 文档列表页 69 | * @return {Object} Promise 70 | */ 71 | function getContentPromise(article) { 72 | if (!article.linkBody) return; 73 | 74 | let contentLink = article.getLink(article.linkBody); 75 | if (!contentLink) return; 76 | 77 | return new Promise((resolve, reject) => { 78 | request(contentLink, (err, res, body) => { 79 | if (err) { console.error(`抓取内容失败:${contentLink}`, err) } 80 | resolve(Object.assign(article, { 81 | articleBody: body, 82 | contentLink: contentLink 83 | })) 84 | }) 85 | }) 86 | } 87 | 88 | /** 89 | * 获取所有文档列表页面的页面HTML 90 | * @param {Object} articleConfig 文章列表配置 91 | * @return {Obejct} Promise 92 | */ 93 | function allArticle(articleConfig) { 94 | let promiseList = []; 95 | 96 | for (let key in articleConfig) { 97 | let article = articleConfig[key]; 98 | let url = typeof article.url == 'function' ? article.url() : article.url; 99 | 100 | promiseList.push(new Promise((resolve, reject) => { 101 | request(url, (err, res, body) => { 102 | if (err) { console.error(`抓取列表失败:${url}`, err) } 103 | resolve(Object.assign(article, { 104 | linkBody: body 105 | })); 106 | }) 107 | })) 108 | } 109 | 110 | return Promise.all(promiseList) 111 | } 112 | 113 | module.exports = crawler; 114 | -------------------------------------------------------------------------------- /src/mailer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const nodemailer = require("nodemailer"); 4 | 5 | //配置邮件服务信息 6 | let smtpTransport; 7 | 8 | 9 | /** 10 | * 发送邮件 11 | * @param {Object} mailConfig 邮箱配置 12 | * @param {String} html 邮件内容 13 | * @return {Undefined} 14 | */ 15 | function sendMail(mailConfig, html) { 16 | smtpTransport = smtpTransport || nodemailer.createTransport(mailConfig.sender); 17 | 18 | smtpTransport.sendMail({ 19 | subject: mailConfig.subject, 20 | from: mailConfig.from, 21 | to: 
mailConfig.to.join(','), 22 | html: html 23 | }, (error, response) => { 24 | if (error) { 25 | console.error(error); 26 | } else { 27 | console.log(response); 28 | } 29 | smtpTransport.close(); 30 | }); 31 | } 32 | 33 | exports.sendMail = sendMail; --------------------------------------------------------------------------------