├── screenshot ├── download.png ├── results.png ├── running.png ├── search.png └── video_list.png ├── package.json ├── .gitignore ├── README.md ├── src └── crawler.coffee └── crawler.js /screenshot/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkeym4ster/imooc-crawler/HEAD/screenshot/download.png -------------------------------------------------------------------------------- /screenshot/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkeym4ster/imooc-crawler/HEAD/screenshot/results.png -------------------------------------------------------------------------------- /screenshot/running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkeym4ster/imooc-crawler/HEAD/screenshot/running.png -------------------------------------------------------------------------------- /screenshot/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkeym4ster/imooc-crawler/HEAD/screenshot/search.png -------------------------------------------------------------------------------- /screenshot/video_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkeym4ster/imooc-crawler/HEAD/screenshot/video_list.png -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "imooc_crawler", 3 | "version": "1.0.0", 4 | "devDependencies": { 5 | "coffee-script": "^1.10.0" 6 | }, 7 | "description": "imooc web crawler", 8 | "main": "crawler.js", 9 | "repository": { 10 | "type": "git", 11 | "url": "git+ssh://git@github.com/monkeym4ster/imooc_crawler.git" 12 | }, 13 | "scripts": { 14 | 
"start": "node crawler.js" 15 | }, 16 | "keywords": [ 17 | "crawler" 18 | ], 19 | "author": "M4ster", 20 | "license": "ISC", 21 | "dependencies": { 22 | "cheerio": "^0.19.0", 23 | "colors": "^1.1.2", 24 | "multi-progress": "^1.0.0", 25 | "request": "^2.64.0" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Node template 3 | # Logs 4 | logs 5 | *.log 6 | npm-debug.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | 13 | # Directory for instrumented libs generated by jscoverage/JSCover 14 | lib-cov 15 | 16 | # Coverage directory used by tools like istanbul 17 | coverage 18 | 19 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 20 | .grunt 21 | 22 | # node-waf configuration 23 | .lock-wscript 24 | 25 | # Compiled binary addons (http://nodejs.org/api/addons.html) 26 | build/Release 27 | 28 | # Dependency directory 29 | # https://docs.npmjs.com/misc/faq#should-i-check-my-node-modules-folder-into-git 30 | node_modules 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # imooc crawler 2 | imooc website: http://www.imooc.com 3 | 4 | ## Usage 5 | $ node crawler.js 6 | 7 | Usage: crawler.js [Options] 8 | --search Search for the specified keywords 9 | --list List the video list under the specified course ID or URL 10 | --download Download the video list under the specified course ID or URL 11 | 12 | ## Screenshot 13 | ![running](https://github.com/monkeym4ster/imooc_crawler/raw/master/screenshot/running.png) 14 | ![search](https://github.com/monkeym4ster/imooc_crawler/raw/master/screenshot/search.png) 15 | 
request = require('request')
cheerio = require('cheerio')
fs = require('fs')
colors = require('colors')
Multiprogress = require('multi-progress')
multi = new Multiprogress(process.stderr)

# Base URL every relative path and API endpoint below is appended to.
website = 'http://www.imooc.com'

###
# Read video list
# Fetches a course page and collects one {id, name} entry per element
# matching `.J-media-item`.
# @param {String} url Course page URL
# @param {Function} callback Called as callback(err) or callback(null, videos)
###
readVideoList = (url, callback) ->
  console.log colors.gray("Read video list: #{url}")
  request.get url, (err, res) ->
    if err
      return callback err
    if not res or res.statusCode isnt 200
      status = if res then res.statusCode else 'no response'
      return callback "Failed to read video list (#{status}): #{url}"
    $ = cheerio.load(res.body)
    videos = []
    $('.J-media-item').each ->
      $me = $(this)
      # Skip anchors without a numeric ID in their href instead of throwing.
      idMatch = ($me.attr('href') or '').match(/\d+/)
      if idMatch
        item = {
          id: idMatch[0]
          name: $me.text().trim()
        }
        videos.push item
      # Explicit return so cheerio never receives `false`, which stops iteration.
      return
    return callback null, videos
  return

###
# Read video detail
# Asks the media-info endpoint for the video's media paths and streams the
# first one into "<name>.mp4" in the current working directory.
# The callback is only invoked on failure; a finished download is silent.
# @param {Object} video Entry produced by readVideoList ({id, name})
# @param {Function} callback Called as callback(err) when something fails
###
readVideoDetailAndDownload = (video, callback) ->
  api = website + '/course/ajaxmediainfo/?mode=flash&mid='
  url = api + video.id
  # Drops everything from the first "(<digit>" onwards
  # (presumably a duration suffix in the link text - confirm).
  filename = video.name.replace(/\(\d.+$/, '').trim() + '.mp4'
  console.log colors.gray "Download course: #{filename}, url: #{url}"
  request.get url, (err, res) ->
    if err
      return callback err
    if not res or res.statusCode isnt 200
      status = if res then res.statusCode else 'no response'
      return callback "Failed to read video detail (#{status}): #{url}"
    try
      body = JSON.parse(res.body)
    catch parseError
      return callback "Invalid video detail response for #{url}: #{parseError.message}"
    if not body or body.result isnt 0
      # Always report a non-empty error so the caller never mistakes it for success.
      message = body?.msg or "Unexpected video detail response for #{url}"
      return callback message
    mediaPaths = body.data?.result?.mpath
    if not mediaPaths or not mediaPaths[0]
      return callback "No media path found for \"#{video.name}\"."
    # Replace characters that are not allowed in file names on common file systems.
    target = filename.replace(/([\\\/\*\:\?\"\<\>\|])/g, '_')
    output = fs.createWriteStream(target)
    output.on 'error', (writeError) ->
      callback writeError
    download = request.get(mediaPaths[0])
    download.on 'error', (downloadError) ->
      callback downloadError
    download.on 'response', (response) ->
      total = parseInt(response.headers['content-length'], 10)
      # Without a usable Content-Length there is no total to draw a bar against.
      if not (total > 0)
        return
      progressBar = multi.newBar("Downloading #{target} [:bar] :percent :etas", {
        width: 50
        total: total
      })
      response.on 'data', (chunk) ->
        progressBar.tick(chunk.length)
      return
    download.pipe(output)
    return
  return

###
# Read course list
# Collects {title, description, url} for every `.course-item` on the page,
# then follows the pager recursively and concatenates the later pages.
# @param {String} url Result page URL; must end with the page number
# @param {Function} callback Called as callback(err) or callback(null, courses)
###
readCourseList = (url, callback) ->
  console.log colors.gray "Read course list: #{url}"
  request url, (err, res) ->
    if err
      return callback(err)
    if not res or res.statusCode isnt 200
      status = if res then res.statusCode else 'no response'
      return callback "Failed to read course list (#{status}): #{url}"
    $ = cheerio.load(res.body)
    courses = []
    $('.course-item').each ->
      $me = $(this)
      item = {
        title: $me.find('.title').text().trim()
        description: $me.find('.description').text().trim()
        url: website + $me.find('a').attr('href')
      }
      courses.push item
      return
    # The pager element following the active one carries the next page number.
    nextPage = $('.page').find('.active').next().attr('data-page')
    if not nextPage
      return callback null, courses
    # Swap the trailing page number of the current URL for the next one.
    nextPageURL = url.replace(/(\d+$)/, nextPage)
    readCourseList nextPageURL, (nextErr, moreCourses) ->
      if nextErr
        return callback nextErr
      return callback null, courses.concat(moreCourses)
    return
  return

###
# Search course
# Checks that the first result page has at least one `.course-item`, then
# hands the same URL to readCourseList to gather every page.
# @param {String} words Search keywords (URL-encoded here)
# @param {Function} callback Called as callback(err) or callback(null, courses)
###
searchCourse = (words, callback) ->
  # Encode the keywords so spaces, "&" and non-ASCII text survive the query string.
  url = website + '/index/search?words=' + encodeURIComponent(words) + '&page=1'
  request url, (err, res) ->
    if err
      return callback(err)
    if not res or res.statusCode isnt 200
      status = if res then res.statusCode else 'no response'
      return callback "Failed to search courses (#{status}): #{url}"
    $ = cheerio.load(res.body)
    if not $('.course-item').length
      return callback("There is no result on \"#{words}\".")
    readCourseList url, callback
    return
  return

# Turn a bare numeric course ID into its /learn/ URL; anything else is
# treated as a URL and returned unchanged.
resolveCourseURL = (value) ->
  if /^\d+$/.test(value) then website + '/learn/' + value else value

###
# Do work
# Dispatches one command-line action to the matching crawler function.
# @param {String} action One of --search, --list, --download
# @param {String} value Keywords, or a course URL / numeric course ID
# @param {Function} callback Called as callback(err) or callback(null, results)
###
doWork = (action, value, callback) ->
  switch action
    when '--search'
      if not value
        return callback 'Please input keywords.'
      return searchCourse(value, callback)
    when '--list'
      if not value
        return callback 'Please input course URL or ID'
      return readVideoList resolveCourseURL(value), callback
    when '--download'
      if not value
        return callback 'Please input course URL or ID'
      readVideoList resolveCourseURL(value), (err, videos) ->
        if err
          return callback err
        # Every video download is started at once; failures arrive through callback.
        for video in videos
          readVideoDetailAndDownload video, callback
        return
      return
    else
      return callback 'Unknown action.'
#!/usr/bin/env node

/*
 * imooc crawler command-line tool.
 *
 * Usage: crawler.js [Options]
 *   --search    Search for the specified keywords
 *   --list      List the video list under the specified course ID or URL
 *   --download  Download the video list under the specified course ID or URL
 *
 * Arguments are consumed in (action, value) pairs, so several actions may be
 * given in one invocation.
 */
(function () {
  'use strict';

  var request = require('request');
  var cheerio = require('cheerio');
  var fs = require('fs');
  var colors = require('colors');
  var Multiprogress = require('multi-progress');

  var multi = new Multiprogress(process.stderr);

  // Base URL every relative path and API endpoint below is appended to.
  var website = 'http://www.imooc.com';

  /*
   * Describe a response for error messages.
   * @param {Object} res Response object, possibly undefined
   * @return {Number|String} Status code, or 'no response' when res is missing
   */
  function describeStatus(res) {
    return res ? res.statusCode : 'no response';
  }

  /*
   * Turn a bare numeric course ID into its /learn/ URL; anything else is
   * treated as a URL and returned unchanged.
   * @param {String} value Course ID or URL from the command line
   * @return {String} Course page URL
   */
  function resolveCourseUrl(value) {
    return /^\d+$/.test(value) ? website + '/learn/' + value : value;
  }

  /*
   * Read video list
   * Fetches a course page and collects one {id, name} entry per element
   * matching `.J-media-item`.
   * @param {String} url Course page URL
   * @param {Function} callback Called as callback(err) or callback(null, videos)
   */
  function readVideoList(url, callback) {
    console.log(colors.gray('Read video list: ' + url));
    request.get(url, function (err, res) {
      if (err) {
        return callback(err);
      }
      if (!res || res.statusCode !== 200) {
        return callback('Failed to read video list (' + describeStatus(res) + '): ' + url);
      }
      var $ = cheerio.load(res.body);
      var videos = [];
      $('.J-media-item').each(function () {
        var $me = $(this);
        // Skip anchors without a numeric ID in their href instead of throwing.
        var idMatch = ($me.attr('href') || '').match(/\d+/);
        if (idMatch) {
          videos.push({
            id: idMatch[0],
            name: $me.text().trim()
          });
        }
      });
      return callback(null, videos);
    });
  }

  /*
   * Read video detail
   * Asks the media-info endpoint for the video's media paths and streams the
   * first one into "<name>.mp4" in the current working directory.
   * The callback is only invoked on failure; a finished download is silent.
   * @param {Object} video Entry produced by readVideoList ({id, name})
   * @param {Function} callback Called as callback(err) when something fails
   */
  function readVideoDetailAndDownload(video, callback) {
    var url = website + '/course/ajaxmediainfo/?mode=flash&mid=' + video.id;
    // Drops everything from the first "(<digit>" onwards
    // (presumably a duration suffix in the link text - confirm).
    var filename = video.name.replace(/\(\d.+$/, '').trim() + '.mp4';
    console.log(colors.gray('Download course: ' + filename + ', url: ' + url));
    request.get(url, function (err, res) {
      var body;
      if (err) {
        return callback(err);
      }
      if (!res || res.statusCode !== 200) {
        return callback('Failed to read video detail (' + describeStatus(res) + '): ' + url);
      }
      try {
        body = JSON.parse(res.body);
      } catch (parseError) {
        return callback('Invalid video detail response for ' + url + ': ' + parseError.message);
      }
      if (!body || body.result !== 0) {
        // Always report a non-empty error so the caller never mistakes it for success.
        return callback((body && body.msg) || 'Unexpected video detail response for ' + url);
      }
      var mediaPaths = body.data && body.data.result && body.data.result.mpath;
      if (!mediaPaths || !mediaPaths[0]) {
        return callback('No media path found for "' + video.name + '".');
      }
      // Replace characters that are not allowed in file names on common file systems.
      var target = filename.replace(/([\\\/\*\:\?\"\<\>\|])/g, '_');
      var output = fs.createWriteStream(target);
      output.on('error', function (writeError) {
        callback(writeError);
      });
      request.get(mediaPaths[0])
        .on('error', function (downloadError) {
          callback(downloadError);
        })
        .on('response', function (response) {
          var total = parseInt(response.headers['content-length'], 10);
          // Without a usable Content-Length there is no total to draw a bar against.
          if (!(total > 0)) {
            return;
          }
          var progressBar = multi.newBar('Downloading ' + target + ' [:bar] :percent :etas', {
            width: 50,
            total: total
          });
          response.on('data', function (chunk) {
            progressBar.tick(chunk.length);
          });
        })
        .pipe(output);
    });
  }

  /*
   * Read course list
   * Collects {title, description, url} for every `.course-item` on the page,
   * then follows the pager recursively and concatenates the later pages.
   * @param {String} url Result page URL; must end with the page number
   * @param {Function} callback Called as callback(err) or callback(null, courses)
   */
  function readCourseList(url, callback) {
    console.log(colors.gray('Read course list: ' + url));
    request(url, function (err, res) {
      if (err) {
        return callback(err);
      }
      if (!res || res.statusCode !== 200) {
        return callback('Failed to read course list (' + describeStatus(res) + '): ' + url);
      }
      var $ = cheerio.load(res.body);
      var courses = [];
      $('.course-item').each(function () {
        var $me = $(this);
        courses.push({
          title: $me.find('.title').text().trim(),
          description: $me.find('.description').text().trim(),
          url: website + $me.find('a').attr('href')
        });
      });
      // The pager element following the active one carries the next page number.
      var nextPage = $('.page').find('.active').next().attr('data-page');
      if (!nextPage) {
        return callback(null, courses);
      }
      // Swap the trailing page number of the current URL for the next one.
      var nextPageURL = url.replace(/(\d+$)/, nextPage);
      readCourseList(nextPageURL, function (nextErr, moreCourses) {
        if (nextErr) {
          return callback(nextErr);
        }
        return callback(null, courses.concat(moreCourses));
      });
    });
  }

  /*
   * Search course
   * Checks that the first result page has at least one `.course-item`, then
   * hands the same URL to readCourseList to gather every page.
   * @param {String} words Search keywords (URL-encoded here)
   * @param {Function} callback Called as callback(err) or callback(null, courses)
   */
  function searchCourse(words, callback) {
    // Encode the keywords so spaces, "&" and non-ASCII text survive the query string.
    var url = website + '/index/search?words=' + encodeURIComponent(words) + '&page=1';
    request(url, function (err, res) {
      if (err) {
        return callback(err);
      }
      if (!res || res.statusCode !== 200) {
        return callback('Failed to search courses (' + describeStatus(res) + '): ' + url);
      }
      var $ = cheerio.load(res.body);
      if (!$('.course-item').length) {
        return callback('There is no result on "' + words + '".');
      }
      readCourseList(url, callback);
    });
  }

  /*
   * Do work
   * Dispatches one command-line action to the matching crawler function.
   * @param {String} action One of --search, --list, --download
   * @param {String} value Keywords, or a course URL / numeric course ID
   * @param {Function} callback Called as callback(err) or callback(null, results)
   */
  function doWork(action, value, callback) {
    switch (action) {
      case '--search':
        if (!value) {
          return callback('Please input keywords.');
        }
        return searchCourse(value, callback);
      case '--list':
        if (!value) {
          return callback('Please input course URL or ID');
        }
        return readVideoList(resolveCourseUrl(value), callback);
      case '--download':
        if (!value) {
          return callback('Please input course URL or ID');
        }
        readVideoList(resolveCourseUrl(value), function (err, videos) {
          if (err) {
            return callback(err);
          }
          // Every video download is started at once; failures arrive through callback.
          videos.forEach(function (video) {
            readVideoDetailAndDownload(video, callback);
          });
        });
        break;
      default:
        return callback('Unknown action.');
    }
  }

  /*
   * Print the outcome of one action: the error in red, or every result
   * object as "key: value" lines separated by a dashed rule.
   * @param {*} err Error value, falsy on success
   * @param {Array} results Result objects; may be missing
   */
  function printResults(err, results) {
    if (err) {
      return console.error(colors.red(err));
    }
    var line = new Array(31).join('-'); // 30 dashes
    (results || []).forEach(function (entry) {
      console.log(line);
      Object.keys(entry).forEach(function (key) {
        console.log((colors.green(key)) + ': ' + entry[key]);
      });
    });
  }

  var argv = process.argv.slice(2);

  if (!argv[0]) {
    console.log("Usage: crawler.js [Options]");
    console.log(" --search\t Search for the specified keywords");
    console.log(" --list\t List the video list under the specified course ID or URL");
    console.log(" --download\t Download the video list under the specified course ID or URL");
    return;
  }

  // Arguments come in (action, value) pairs: even indexes are actions.
  for (var i = 0; i < argv.length; i += 2) {
    doWork(argv[i], argv[i + 1], printResults);
  }
}).call(this);