├── .gitignore
├── .npmignore
├── LICENSE.md
├── README.md
├── bin
│   ├── cli.js
│   └── test.js
└── package.json

/.gitignore:
--------------------------------------------------------------------------------
bower_components
node_modules
*.log
.DS_Store
bundle.js
logos
tmp
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
bower_components
node_modules
*.log
.DS_Store
bundle.js
test
test.js
demo/
.npmignore
LICENSE.md
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
The MIT License (MIT)
Copyright (c) 2015 Matt DesLauriers

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# gh-readme-scrape

[![unstable](http://badges.github.io/stability-badges/dist/unstable.svg)](http://github.com/badges/stability-badges)

![screen](http://i.imgur.com/kfmkGFj.png)

A small CLI that scrapes GitHub readme pages for links and images of a certain file type, and then bulk downloads them into a destination folder.

## Install

```sh
npm install -g gh-readme-scrape
```

## Examples

For example, to bulk download the SVGs from [gilbarbara/logos](https://github.com/gilbarbara/logos):

```sh
gh-readme-scrape gilbarbara/logos logos/ -e svg
```

This will save all the SVGs into a local folder called `logos`; see below:

![image](http://i.imgur.com/69BHg0K.png)

Or, to bulk download PDFs from the [graphics-resources](https://github.com/mattdesl/graphics-resources) readme:

```sh
gh-readme-scrape mattdesl/graphics-resources papers/ -e pdf --rename
```

The `--rename` flag will use the anchor text in the Markdown to determine the file name. Result:

![image](http://i.imgur.com/QnO0iAE.png)

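In other words (using a made-up link rather than one from that readme), an entry such as `[Deferred Shading Tutorial](http://example.com/deferred_shading.pdf)` should be saved as `Deferred Shading Tutorial.pdf`, whereas without `--rename` it would keep its original basename, `deferred_shading.pdf`.
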
## Usage

[![NPM](https://nodei.co/npm/gh-readme-scrape.png)](https://www.npmjs.com/package/gh-readme-scrape)

```sh
Usage:
  gh-readme-scrape repository output [opts]

Options:
  --extension, -e   a list of extensions, comma-separated
  --rename, -r      rename filenames to the Markdown anchor text
  --timeout=n       ms timeout before failing a request (default 4000ms)
  --verbose         log all requests
  --auth            authorize the readme request with GH API
```

The `repository` can be a full URL to the repository, or a shorthand like `gilbarbara/logos`.

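For instance, the first example above could equally be written out with the full repository URL and should behave the same way:

```sh
gh-readme-scrape https://github.com/gilbarbara/logos logos/ -e svg
```
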
The extensions can be comma-separated, such as:

```sh
gh-readme-scrape gilbarbara/logos tmp -e svg,png,gif
```

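The remaining options combine in the same way. A longer, hypothetical run of the second example above (renamed files, a 10-second timeout, per-request logging, and an authorized readme request) might look like this:

```sh
gh-readme-scrape mattdesl/graphics-resources papers/ -e pdf --rename --timeout=10000 --verbose --auth
```
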
## License

MIT, see [LICENSE.md](http://github.com/mattdesl/gh-readme-scrape/blob/master/LICENSE.md) for details.

--------------------------------------------------------------------------------
/bin/cli.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node
var path = require('path')
var getUrls = require('gh-md-urls')
var parseRepo = require('github-url-to-object')
var got = require('got')
var contentType = require('content-type')
var mimeExtensions = require('mime-types').extensions
var async = require('async')
var mkdirp = require('mkdirp')
var fs = require('fs')
var chalk = require('chalk')
var ghauth = require('ghauth')

var ASYNC_LIMIT = 20
var argv = require('minimist')(process.argv.slice(2), {
  alias: {
    extension: 'e',
    rename: 'r'
  },
  boolean: ['verbose', 'auth']
})

var timeout = typeof argv.timeout === 'number' ? argv.timeout : 4000
var repoUrl = argv._[0]
if (!repoUrl) {
  throw new Error('must provide GitHub repository')
}

var repo = parseRepo(repoUrl)
if (!repo) {
  throw new Error('could not parse repository URL')
}

// --extension/-e may be passed multiple times and/or comma-separated;
// flatten everything into a single array of extensions
var findExtensions = argv.extension
if (!Array.isArray(findExtensions)) {
  findExtensions = [ findExtensions ]
}

findExtensions = findExtensions.map(function (ext) {
  return ext && ext.split(',')
}).reduce(function (a, b) {
  return a.concat(b)
}, []).filter(Boolean)

if (findExtensions.length === 0) {
  throw new Error('must provide at least one extension to look for')
}

var output = argv._[1]
if (!output) {
  throw new Error('must specify output folder')
}

output = path.join(process.cwd(), output)

mkdirp(output, function (err) {
  if (err) throw err
  getReadme(repo, function (err, body) {
    if (err) throw err

    // collect all URLs in the readme, ignoring in-page fragment links
    var nodes = getUrls(body, {
      raw: true,
      repository: repoUrl,
      baseUrl: '' // don't resolve fragments
    }).filter(function (node) {
      var url = node.url
      return /^[^\#]/.test(url)
    })

    console.error(chalk.gray('Searching ' + chalk.bold(nodes.length) + ' links'))
    filterContentType(nodes, findExtensions, function (results) {
      nodes = results
      if (argv.verbose) {
        console.error(chalk.gray('Filtered down to ' + chalk.bold(nodes.length) + ' links'))
      }
      async.eachLimit(nodes, ASYNC_LIMIT, function (item, next) {
        saveResource(item, function (err) {
          if (err) {
            console.error(chalk.yellow('Error requesting URL contents: ' + item.url))
            // skip the failed resource but keep draining the queue
            return next(null)
          }
          next(null)
        })
      }, function (err) {
        done(err, nodes)
      })
    })
  })
})

function done (err, urls) {
  if (err) console.error(err)
  else if (urls.length === 0) {
    console.error(chalk.yellow('No resources found'))
  } else {
    var word = urls.length === 1 ? 'resource' : 'resources'
    console.error(chalk.green('Finished downloading ') + chalk.bold(urls.length) + chalk.green(' ' + word))
  }
}

// HEAD each URL and keep only the nodes whose content-type maps to one of
// the requested extensions
function filterContentType (nodes, extensions, cb) {
  async.mapLimit(nodes, ASYNC_LIMIT, function (node, next) {
    var url = node.url
    got.head(url, {
      timeout: timeout
    }, function (err, body, res) {
      if (err) {
        console.error(chalk.yellow('Error requesting URL: ') + url)
        return next(null, null)
      }
      if (argv.verbose) {
        console.error(chalk.dim('HEAD ' + url))
      }

      if (res.headers && res.headers['content-type']) {
        var content = contentType.parse(res.headers['content-type'])
        var exts = mimeExtensions[content.type] || []
        var any = exts.some(function (ext) {
          return extensions.indexOf(ext) >= 0
        })
        return next(null, any ? node : null)
      } else {
        return next(null, null)
      }
    })
  }, function (err, results) {
    if (err) {
      console.error(err)
      process.exit(1)
    }
    cb(results.filter(Boolean))
  })
}

// stream a single resource into the output folder, optionally renaming it
// to the Markdown anchor text
function saveResource (node, cb) {
  var url = node.url
  var ext = path.extname(node.url)
  var name = argv.rename ? (node.text + ext) : path.basename(url)
  var file = path.join(output, name)
  var out = fs.createWriteStream(file)
  console.error(chalk.magenta('Downloading ') + chalk.gray(name))
  var stream = got(url, {
    timeout: timeout
  }).on('error', cb)

  stream.pipe(out)
    .on('close', function () {
      cb(null)
    })
    .on('error', function (err) {
      cb(err)
    })
}

// fetch the readme, authorizing the request via ghauth when --auth is passed
function getReadme (repo, cb) {
  if (argv.auth) {
    ghauth({
      configName: 'gh-readme-scrape',
      scopes: ['user', 'repo']
    }, function (err, data) {
      if (err) throw err
      getReadmeRequest(repo, data.token, cb)
    })
  } else {
    getReadmeRequest(repo, null, cb)
  }
}

function getReadmeRequest (repo, token, cb) {
  var api = 'https://api.github.com/repos/'
  var url = api + repo.user + '/' + repo.repo + '/readme'

  var headers = {
    accept: 'application/vnd.github.v3+json',
    'user-agent': 'gh-api-stream'
  }
  if (token) {
    headers.authorization = 'token ' + token
  }

  got(url, {
    timeout: timeout,
    json: true,
    headers: headers
  }, function (err, result, res) {
    if (res && res.statusCode === 403) {
      console.error(chalk.red('HTTP 403 Error:'), "You've hit the GitHub API Limit")
      console.error('Try running again with --auth')
      process.exit(1)
    }
    if (err) return cb(err)
    if (!(/^2/.test(res.statusCode))) return cb(new Error('invalid status code ' + res.statusCode))
    // the API returns the readme body as encoded content (typically base64)
    var contents = result.content
    try {
      var md = new Buffer(contents, result.encoding).toString('utf8')
      cb(null, md)
    } catch (e) {
      return cb(e)
    }
  })
}
--------------------------------------------------------------------------------
/bin/test.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node
var url = 'http://github.com/mattdesl/graphics-resources/blob/master/LICENSE.md'
require('got')(url, {
  // json: true,
  timeout: 4000
}, function (err, result, res) {
  if (err) {
    return console.error('err', err.message)
  }
  console.log('RESULT', result.url)
})
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "gh-readme-scrape",
  "version": "1.3.0",
  "description": "a CLI to bulk download URLs (images/pdfs/etc) in Markdown",
  "main": "index.js",
  "license": "MIT",
  "author": {
    "name": "Matt DesLauriers",
    "email": "dave.des@gmail.com",
    "url": "https://github.com/mattdesl"
  },
  "dependencies": {
    "async": "^1.3.0",
    "chalk": "^1.1.0",
    "content-type": "^1.0.1",
    "gh-md-urls": "^1.1.1",
    "ghauth": "^3.0.0",
    "github-url-to-object": "^1.5.2",
    "got": "^3.3.0",
    "mime-types": "^2.1.2",
    "minimist": "^1.1.1",
    "mkdirp": "^0.5.1",
    "urljoin": "^0.1.5"
  },
  "devDependencies": {},
  "scripts": {
    "test": "node bin/test.js"
  },
  "keywords": [
    "md",
    "markdown",
    "curl",
    "download",
    "request",
    "get",
    "image",
    "images",
    "pdf",
    "pdfs",
    "link",
    "links",
    "down",
    "load"
  ],
  "repository": {
    "type": "git",
    "url": "git://github.com/mattdesl/gh-readme-scrape.git"
  },
  "homepage": "https://github.com/mattdesl/gh-readme-scrape",
  "bugs": {
    "url": "https://github.com/mattdesl/gh-readme-scrape/issues"
  },
  "bin": {
    "gh-readme-scrape": "./bin/cli.js"
  }
}
--------------------------------------------------------------------------------