├── .gitignore
├── .npmignore
├── LICENSE.md
├── README.md
├── bin
│   ├── cli.js
│   └── test.js
└── package.json

/.gitignore:
--------------------------------------------------------------------------------
bower_components
node_modules
*.log
.DS_Store
bundle.js
logos
tmp
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
bower_components
node_modules
*.log
.DS_Store
bundle.js
test
test.js
demo/
.npmignore
LICENSE.md
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
The MIT License (MIT)
Copyright (c) 2015 Matt DesLauriers

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# gh-readme-scrape

[![unstable](http://badges.github.io/stability-badges/dist/unstable.svg)](http://github.com/badges/stability-badges)

![screen](http://i.imgur.com/kfmkGFj.png)

A small CLI that scrapes GitHub readme pages for links and images of a certain file type, and then bulk downloads them into a destination folder.

## Install

```sh
npm install -g gh-readme-scrape
```

## Examples

For example, to bulk download the SVGs from [gilbarbara/logos](https://github.com/gilbarbara/logos):

```sh
gh-readme-scrape gilbarbara/logos logos/ -e svg
```

This will save all the SVGs into a local folder called `logos`; see below:

![image](http://i.imgur.com/69BHg0K.png)

Or, to bulk download PDFs from the [graphics-resources](https://github.com/mattdesl/graphics-resources) readme:

```sh
gh-readme-scrape mattdesl/graphics-resources papers/ -e pdf --rename
```

The `--rename` flag will use the anchor text in the Markdown to determine the file name. Result:

![image](http://i.imgur.com/QnO0iAE.png)

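In other words (using a made-up link rather than one from that readme), an entry such as `[Deferred Shading Tutorial](http://example.com/deferred_shading.pdf)` should be saved as `Deferred Shading Tutorial.pdf`, whereas without `--rename` it would keep its original basename, `deferred_shading.pdf`.
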
## Usage

[![NPM](https://nodei.co/npm/gh-readme-scrape.png)](https://www.npmjs.com/package/gh-readme-scrape)

```sh
Usage:
  gh-readme-scrape repository output [opts]

Options:
  --extension, -e   a list of extensions, comma-separated
  --rename, -r      rename filenames to the Markdown anchor text
  --timeout=n       ms timeout before failing a request (default 4000ms)
  --verbose         log all requests
  --auth            authorize the readme request with GH API
```

The `repository` can be a full URL to the repository, or a shorthand like `gilbarbara/logos`.

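For instance, the first example above could equally be written out with the full repository URL and should behave the same way:

```sh
gh-readme-scrape https://github.com/gilbarbara/logos logos/ -e svg
```
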
The extensions can be comma-separated, such as:

```sh
gh-readme-scrape gilbarbara/logos tmp -e svg,png,gif
```

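The remaining options combine in the same way. A longer, hypothetical run of the second example above (renamed files, a 10-second timeout, per-request logging, and an authorized readme request) might look like this:

```sh
gh-readme-scrape mattdesl/graphics-resources papers/ -e pdf --rename --timeout=10000 --verbose --auth
```
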
## License

MIT, see [LICENSE.md](http://github.com/mattdesl/gh-readme-scrape/blob/master/LICENSE.md) for details.

--------------------------------------------------------------------------------
/bin/cli.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node
var path = require('path')
var getUrls = require('gh-md-urls')
var parseRepo = require('github-url-to-object')
var got = require('got')
var contentType = require('content-type')
var mimeExtensions = require('mime-types').extensions
var async = require('async')
var mkdirp = require('mkdirp')
var fs = require('fs')
var chalk = require('chalk')
var ghauth = require('ghauth')

var ASYNC_LIMIT = 20
var argv = require('minimist')(process.argv.slice(2), {
  alias: {
    extension: 'e',
    rename: 'r'
  },
  boolean: ['verbose', 'auth']
})

var timeout = typeof argv.timeout === 'number' ? argv.timeout : 4000
var repoUrl = argv._[0]
if (!repoUrl) {
  throw new Error('must provide GitHub repository')
}

var repo = parseRepo(repoUrl)
if (!repo) {
  throw new Error('could not parse repository URL')
}

// --extension/-e may be passed multiple times and/or comma-separated;
// flatten everything into a single array of extensions
var findExtensions = argv.extension
if (!Array.isArray(findExtensions)) {
  findExtensions = [ findExtensions ]
}

findExtensions = findExtensions.map(function (ext) {
  return ext && ext.split(',')
}).reduce(function (a, b) {
  return a.concat(b)
}, []).filter(Boolean)

if (findExtensions.length === 0) {
  throw new Error('must provide at least one extension to look for')
}

var output = argv._[1]
if (!output) {
  throw new Error('must specify output folder')
}

output = path.join(process.cwd(), output)

mkdirp(output, function (err) {
  if (err) throw err
  getReadme(repo, function (err, body) {
    if (err) throw err

    // collect all URLs in the readme, ignoring in-page fragment links
    var nodes = getUrls(body, {
      raw: true,
      repository: repoUrl,
      baseUrl: '' // don't resolve fragments
    }).filter(function (node) {
      var url = node.url
      return /^[^\#]/.test(url)
    })

    console.error(chalk.gray('Searching ' + chalk.bold(nodes.length) + ' links'))
    filterContentType(nodes, findExtensions, function (results) {
      nodes = results
      if (argv.verbose) {
        console.error(chalk.gray('Filtered down to ' + chalk.bold(nodes.length) + ' links'))
      }
      async.eachLimit(nodes, ASYNC_LIMIT, function (item, next) {
        saveResource(item, function (err) {
          if (err) {
            console.error(chalk.yellow('Error requesting URL contents: ' + item.url))
            // skip the failed resource but keep draining the queue
            return next(null)
          }
          next(null)
        })
      }, function (err) {
        done(err, nodes)
      })
    })
  })
})

function done (err, urls) {
  if (err) console.error(err)
  else if (urls.length === 0) {
    console.error(chalk.yellow('No resources found'))
  } else {
    var word = urls.length === 1 ? 'resource' : 'resources'
    console.error(chalk.green('Finished downloading ') + chalk.bold(urls.length) + chalk.green(' ' + word))
  }
}

// HEAD each URL and keep only the nodes whose content-type maps to one of
// the requested extensions
function filterContentType (nodes, extensions, cb) {
  async.mapLimit(nodes, ASYNC_LIMIT, function (node, next) {
    var url = node.url
    got.head(url, {
      timeout: timeout
    }, function (err, body, res) {
      if (err) {
        console.error(chalk.yellow('Error requesting URL: ') + url)
        return next(null, null)
      }
      if (argv.verbose) {
        console.error(chalk.dim('HEAD ' + url))
      }

      if (res.headers && res.headers['content-type']) {
        var content = contentType.parse(res.headers['content-type'])
        var exts = mimeExtensions[content.type] || []
        var any = exts.some(function (ext) {
          return extensions.indexOf(ext) >= 0
        })
        return next(null, any ? node : null)
      } else {
        return next(null, null)
      }
    })
  }, function (err, results) {
    if (err) {
      console.error(err)
      process.exit(1)
    }
    cb(results.filter(Boolean))
  })
}

// stream a single resource into the output folder, optionally renaming it
// to the Markdown anchor text
function saveResource (node, cb) {
  var url = node.url
  var ext = path.extname(node.url)
  var name = argv.rename ? (node.text + ext) : path.basename(url)
  var file = path.join(output, name)
  var out = fs.createWriteStream(file)
  console.error(chalk.magenta('Downloading ') + chalk.gray(name))
  var stream = got(url, {
    timeout: timeout
  }).on('error', cb)

  stream.pipe(out)
    .on('close', function () {
      cb(null)
    })
    .on('error', function (err) {
      cb(err)
    })
}

// fetch the readme, authorizing the request via ghauth when --auth is passed
function getReadme (repo, cb) {
  if (argv.auth) {
    ghauth({
      configName: 'gh-readme-scrape',
      scopes: ['user', 'repo']
    }, function (err, data) {
      if (err) throw err
      getReadmeRequest(repo, data.token, cb)
    })
  } else {
    getReadmeRequest(repo, null, cb)
  }
}

function getReadmeRequest (repo, token, cb) {
  var api = 'https://api.github.com/repos/'
  var url = api + repo.user + '/' + repo.repo + '/readme'

  var headers = {
    accept: 'application/vnd.github.v3+json',
    'user-agent': 'gh-api-stream'
  }
  if (token) {
    headers.authorization = 'token ' + token
  }

  got(url, {
    timeout: timeout,
    json: true,
    headers: headers
  }, function (err, result, res) {
    if (res && res.statusCode === 403) {
      console.error(chalk.red('HTTP 403 Error:'), "You've hit the GitHub API Limit")
      console.error('Try running again with --auth')
      process.exit(1)
    }
    if (err) return cb(err)
    if (!(/^2/.test(res.statusCode))) return cb(new Error('invalid status code ' + res.statusCode))
    // the API returns the readme body as encoded content (typically base64)
    var contents = result.content
    try {
      var md = new Buffer(contents, result.encoding).toString('utf8')
      cb(null, md)
    } catch (e) {
      return cb(e)
    }
  })
}
--------------------------------------------------------------------------------
/bin/test.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node
var url = 'http://github.com/mattdesl/graphics-resources/blob/master/LICENSE.md'
require('got')(url, {
  // json: true,
  timeout: 4000
}, function (err, result, res) {
  if (err) {
    return console.error('err', err.message)
  }
  console.log('RESULT', result.url)
})
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "gh-readme-scrape",
  "version": "1.3.0",
  "description": "a CLI to bulk download URLs (images/pdfs/etc) in Markdown",
  "main": "index.js",
  "license": "MIT",
  "author": {
    "name": "Matt DesLauriers",
    "email": "dave.des@gmail.com",
    "url": "https://github.com/mattdesl"
  },
  "dependencies": {
    "async": "^1.3.0",
    "chalk": "^1.1.0",
    "content-type": "^1.0.1",
    "gh-md-urls": "^1.1.1",
    "ghauth": "^3.0.0",
    "github-url-to-object": "^1.5.2",
    "got": "^3.3.0",
    "mime-types": "^2.1.2",
    "minimist": "^1.1.1",
    "mkdirp": "^0.5.1",
    "urljoin": "^0.1.5"
  },
  "devDependencies": {},
  "scripts": {
    "test": "node bin/test.js"
  },
  "keywords": [
    "md",
    "markdown",
    "curl",
    "download",
    "request",
    "get",
    "image",
    "images",
    "pdf",
    "pdfs",
    "link",
    "links",
    "down",
    "load"
  ],
  "repository": {
    "type": "git",
    "url": "git://github.com/mattdesl/gh-readme-scrape.git"
  },
  "homepage": "https://github.com/mattdesl/gh-readme-scrape",
  "bugs": {
    "url": "https://github.com/mattdesl/gh-readme-scrape/issues"
  },
  "bin": {
    "gh-readme-scrape": "./bin/cli.js"
  }
}
--------------------------------------------------------------------------------