├── .gitignore
├── LICENSE.md
├── README.md
├── config.json
├── github-dorks.txt
├── index.js
├── package.json
└── readme
    ├── .dontdelete
    └── secrets-2.png

/.gitignore:
--------------------------------------------------------------------------------
data*.txt
node_modules*
config.json
collected/*
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Dylan Katz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Installation
## Windows
To use GHScraper, first download and install [Node.js](https://nodejs.org/en/download/), which includes npm. Once that is done, open a command prompt in the GHScraper directory and run

    npm install

to download the required dependencies.

## Ubuntu
Install Node.js, npm, and git, then clone the repository and install its dependencies:

    sudo apt-get install nodejs npm git
    git clone https://github.com/Plazmaz/GHScraper.git
    cd GHScraper
    npm install

# Configuration

To use GHScraper, you'll need to create a new OAuth application. You can do this [here](https://github.com/settings/applications/new). Once you've created the application, GitHub will generate a client ID and client secret.
![secrets](https://github.com/Plazmaz/GHScraper/blob/master/readme/secrets-2.png?raw=true)

From there, open the `config.json` file and fill in your client ID and secret.

Once configured, you can run

    npm start

or

    node index.js

to begin scraping.


# Extension/Contributing
To add new queries or modify existing ones, edit the `github-dorks.txt` file. Each line is a single query ("dork") written in a modified version of GitHub's search syntax; lines beginning with `//` are treated as comments and ignored.

| Query     | Purpose                                                          | Example usage      |
|-----------|------------------------------------------------------------------|--------------------|
| filename  | Matches files by name                                            | filename:README.md |
| path      | Matches files within a specific path                             | path:var/www       |
| extension | Matches files by extension                                       | extension:txt      |
| other     | All other text is searched for in the file's changed contents    | Test one two three |
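
As a hypothetical illustration (this exact line does not ship in `github-dorks.txt`), a dork that combines several qualifiers might look like:

    filename:.env path:config DB_PASSWORD

Every part of a line must match the same changed file: here the file name must end in `.env`, its path must contain a `config/` segment, and the commit diff must contain the text `DB_PASSWORD`.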

If you find something cool, feel free to make a pull request!
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
{
    "client_id": "YOUR_CLIENT_ID_HERE",
    "client_secret": "YOUR_SECRET_HERE"
}
--------------------------------------------------------------------------------
/github-dorks.txt:
--------------------------------------------------------------------------------
filename:.npmrc _auth
filename:.dockercfg auth
extension:pem private
extension:ppk private
filename:id_rsa
filename:id_dsa
extension:sql mysql dump
extension:sql mysql dump password
define("DB_PASSWORD"
e10adc3949ba59abbe56e057f20f883e
filename:secret_token.rb config
extension:rb secret_token
extension:conf FTP server configuration
filename:credentials aws_access_key_id
filename:.s3cfg
filename:wp-config.php
filename:.htpasswd
filename:.env DB_USERNAME
filename:.env MAIL_HOST=smtp.gmail.com
filename:.git-credentials
PT_TOKEN extension:sh
filename:.bashrc password
filename:.bashrc mailchimp
filename:.bash_profile aws
rds.amazonaws.com password
extension:json api.forecast.io
extension:json mongolab.com
extension:yaml mongolab.com
extension:xls mail
extension:xchat
extension:xchat2
extension:dbeaver-data-sources.xml
extension:muttrc
extension:kdb
extension:agilekeychain
extension:keychain
jsforce extension:js conn.login
SF_USERNAME "salesforce"
filename:.tugboat
HEROKU_API_KEY extension:sh
HEROKU_API_KEY extension:json
filename:.netrc password
filename:_netrc password
filename:hub oauth_token
filename:robomongo.json
filename:filezilla.xml Pass
filename:recentservers.xml Pass
filename:config.json auths
filename:idea14.key
filename:config irc_pass
filename:connections.xml
filename:express.conf path:.openshift
filename:.pgpass
filename:proftpdpasswd
filename:ventrilo_srv.ini
[WFClient] Password= extension:ica
filename:server.cfg rcon password
JEKYLL_GITHUB_TOKEN
filename:.bash_history
filename:.cshrc
filename:.sh_history
filename:sshd_config
filename:dhcpd.conf
filename:prod.exs
filename:prod.secret.exs
filename:configuration.php JConfig password
filename:config.php dbpasswd
filename:config.php passw
path:sites databases password
shodan_api_key extension:py
shodan_api_key extension:sh
shodan_api_key extension:json
shodan_api_key extension:rb
filename:shadow path:etc
filename:passwd path:etc
filename:ssh_config
filename:.ssh_config
extension:pkcs12 private
filename:.mysql_history
filename:.pgpass
filename:database.yml pass
filename:credentials.xml
filename:knife.rb
--------------------------------------------------------------------------------
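
Before starting a long scrape, it can help to confirm that the client ID and secret from `config.json` are actually being accepted. The snippet below is not part of GHScraper; it is a minimal standalone sketch using the same `request` dependency, and it assumes GitHub's `/rate_limit` endpoint together with the `client_id`/`client_secret` query parameters that `index.js` (below) already relies on:

    var fs = require('fs');
    var request = require('request');

    // Read the same credentials the scraper uses.
    var config = JSON.parse(fs.readFileSync('config.json', 'utf-8'));
    var url = 'https://api.github.com/rate_limit' +
        '?client_id=' + config.client_id +
        '&client_secret=' + config.client_secret;

    // GitHub rejects API requests that have no User-Agent header.
    request({ url: url, headers: { 'User-Agent': 'GHScraper' }, json: true }, function(err, res, body) {
        if (err) {
            return console.error(err);
        }
        // If the credentials were recognized, the limit should be well above
        // the anonymous 60-requests-per-hour ceiling.
        console.log('Remaining requests this hour: ' + body.rate.remaining + '/' + body.rate.limit);
    });

If the reported limit stays at 60, the credentials were not picked up.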
/index.js:
--------------------------------------------------------------------------------
var request = require('request');
var fs = require('fs');

const USER_AGENT = "GHScraper";
var cid;
var secret;

var dorks = [];
var scannedCommits = 0;
var etags = {};

// Mirror everything passed to console.log into data-log.txt as well.
var oldLog = console.log;
console.log = function(txt) {
    oldLog(txt);
    fs.appendFile('data-log.txt', String(txt).trim() + '\r\n', function(err) {
        if (err) {
            oldLog(err);
        }
    });
};

// A Dork is one line of github-dorks.txt, split into QueryParts. A changed file
// from a commit matches the dork only if every part matches.
var Dork = function(queryParts) {
    this.queryParts = queryParts;
    this.matches = function(fileData) {
        var foundMatch = true;
        for (var i = 0; i < this.queryParts.length; i++) {
            if (this.queryParts[i].isFilename) {
                foundMatch = foundMatch && fileData.filename.toString().trim().endsWith(this.queryParts[i].queryString);
            } else if (this.queryParts[i].isExtension) {
                foundMatch = foundMatch && fileData.filename.toString().trim().endsWith('.' + this.queryParts[i].queryString);
            } else if (this.queryParts[i].isPath) {
                foundMatch = foundMatch && fileData.filename.toString().match(new RegExp('(.*?)\/' + this.queryParts[i].queryString));
            } else {
                // Plain text is searched for in the commit's patch (diff) for this file.
                foundMatch = foundMatch && fileData.patch && fileData.patch.toString().indexOf(this.queryParts[i].queryString) != -1;
            }
        }
        return foundMatch;
    };
};

// Use the native String.prototype.endsWith where available (Node 4+). The
// fallback avoids the original regex-based override, which broke on suffixes
// containing regex metacharacters (e.g. ".env").
if (!String.prototype.endsWith) {
    String.prototype.endsWith = function(suffix) {
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}

fs.readFile('config.json', 'utf-8', function(err, data) {
    if (err) {
        console.error(err);
        return;
    }
    var json;
    try {
        json = JSON.parse(data);
    } catch (e) {
        console.error("config.json is not valid JSON!");
        return;
    }
    if (!json.client_id || !json.client_secret) {
        console.error("Invalid or corrupted configuration file!");
        return;
    }
    cid = json.client_id;
    secret = json.client_secret;

    getDorks('github-dorks.txt');
});

// One token of a dork line: a filename:, extension:, or path: qualifier, or a
// plain string to look for in the diff itself.
function QueryPart() {
    this.isFilename = false;
    this.isExtension = false;
    this.isPath = false;
    this.queryString = '';
}

// Build request options, sending the last known ETag for this URL so GitHub can
// answer with 304 Not Modified, which does not count against the rate limit.
function getOptions(url) {
    return {
        url: url,
        headers: {
            'User-Agent': USER_AGENT,
            'If-None-Match': etags[url] ? etags[url] : ''
        }
    };
}

// Load dork definitions, one per line. Blank lines and lines starting with //
// are skipped.
function getDorks(filename) {
    fs.readFile(filename, 'utf-8', function(err, data) {
        if (err) {
            console.error(err);
            return;
        }
        var tmpDorks = data.split('\n');
        for (var i = 0; i < tmpDorks.length; i++) {
            var rawDork = tmpDorks[i].trim();
            if (!rawDork || rawDork.indexOf("//") == 0) {
                continue;
            }
            dorks.push(new Dork(getQueryParts(rawDork)));
        }
    });
}

// Split a dork line on spaces and turn each token into a QueryPart.
function getQueryParts(dork) {
    var parts = [];
    var stringParts = dork.split(" ");
    for (var i = 0; i < stringParts.length; i++) {
        stringParts[i] = stringParts[i].trim();
        var QueryPartObj = new QueryPart();
        if (stringParts[i].indexOf('filename:') === 0) {
            QueryPartObj.isFilename = true;
            QueryPartObj.queryString = stringParts[i].substring('filename:'.length);
        } else if (stringParts[i].indexOf('extension:') === 0) {
            QueryPartObj.isExtension = true;
            QueryPartObj.queryString = stringParts[i].substring('extension:'.length);
        } else if (stringParts[i].indexOf('path:') === 0) {
            QueryPartObj.isPath = true;
            QueryPartObj.queryString = stringParts[i].substring('path:'.length);
        } else {
            QueryPartObj.queryString = stringParts[i];
        }
        parts.push(QueryPartObj);
    }
    return parts;
}
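
// Worked example, added for illustration only (nothing here executes): given
// the hypothetical dork line
//     filename:.env path:config DB_PASSWORD
// getQueryParts() produces three QueryPart objects:
//     { isFilename: true,  isExtension: false, isPath: false, queryString: '.env' }
//     { isFilename: false, isExtension: false, isPath: true,  queryString: 'config' }
//     { isFilename: false, isExtension: false, isPath: false, queryString: 'DB_PASSWORD' }
// Dork.matches() then requires every part to hold for a single changed file:
// the filename must end with ".env", the path must contain "/config", and the
// patch text must contain "DB_PASSWORD".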
// Poll GitHub's public events feed, then fetch each pushed commit and test its
// changed files against every loaded dork.
function queryTimeline() {
    if (!cid || !secret || dorks.length === 0) {
        // config.json and github-dorks.txt are loaded asynchronously at
        // startup; skip this tick if they are not ready yet.
        return;
    }
    var options = getOptions('https://api.github.com/events?per_page=135&client_id=' + cid + '&client_secret=' + secret);
    request(options, function(err, response, body) {
        if (err) {
            console.error(err);
            return;
        }
        if (response) {
            // Node lowercases incoming header names, so read 'etag', not 'ETag'.
            etags[options.url] = response.headers['etag'];
            if (response.statusCode === 304) {
                // Nothing new since the last poll.
                return;
            }
        }

        var events;
        try {
            events = JSON.parse(body);
        } catch (e) {
            console.error('Could not parse events response: ' + body);
            return;
        }
        if (!Array.isArray(events)) {
            // Error payloads (e.g. rate limiting) come back as an object, not an array.
            console.log(body);
            return;
        }
        for (var i = 0; i < events.length; i++) {
            var evt = events[i];
            var type = evt.type;
            if (type.indexOf('PushEvent') == 0 || type.indexOf('CreateEvent') == 0 || type.indexOf('PullRequestEvent') == 0) {
                // Only PushEvent payloads actually carry a commits array; the
                // guard below skips the other event types.
                if (!evt.payload.commits) {
                    continue;
                }
                var commits = evt.payload.commits;
                commits.forEach(function(commit) {
                    var commitURL = commit.url;
                    var options = getOptions(commitURL + '?client_id=' + cid + '&client_secret=' + secret);

                    request(options, function(err, response, body) {
                        if (response) {
                            etags[options.url] = response.headers['etag'];
                        }
                        scannedCommits++;
                        if (scannedCommits % 50 == 0 && scannedCommits > 0) {
                            console.log(scannedCommits + " commits scanned...");
                        }
                        if (err) {
                            console.error(body);
                            console.error(err);
                            return;
                        }
                        var files;
                        try {
                            files = JSON.parse(body).files;
                        } catch (e) {
                            console.error('Could not parse commit response.');
                            return;
                        }
                        if (!files) {
                            console.error('Ratelimit hit or no files found.');
                            return;
                        }
                        if (files.length === 0) {
                            return;
                        }
                        for (var j = 0; j < dorks.length; j++) {
                            for (var k = 0; k < files.length; k++) {
                                var file = files[k];
                                if (dorks[j].matches(file)) {
                                    console.log(JSON.stringify(dorks[j].queryParts) + ' matched file ' + file.filename + ' with sha ' + file.sha + '!');
                                    console.log('Data url: ' + file.raw_url);
                                    // Save the matched file under collected/, flattening the URL path into a file name.
                                    var fileStream = fs.createWriteStream("collected/" + file.raw_url.substring('https://github.com'.length).replace(/\//g, "_"));
                                    request(file.raw_url).pipe(fileStream);
                                    return;
                                }
                            }
                        }
                    });
                });
            }
        }
    });
}
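
// For reference, these are the fields this scraper reads from the GitHub API
// (abridged and illustrative; the real responses contain many more fields):
//
//   GET /events        -> [ { type: "PushEvent",
//                             payload: { commits: [ { url: "https://api.github.com/repos/.../commits/<sha>" } ] } }, ... ]
//   GET <commit url>   -> { files: [ { filename: "config/.env",
//                                      sha: "...",
//                                      raw_url: "https://github.com/.../raw/...",
//                                      patch: "@@ -0,0 +1 @@ ..." } ] }
//
// Dork.matches() runs against each entry of `files`; raw_url is what gets
// downloaded into collected/ when a dork matches.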
console.log("=== Starting new scan ===");

// collected/ is gitignored, so it may not exist after a fresh clone; create it
// before the first match tries to write into it.
if (!fs.existsSync('collected')) {
    fs.mkdirSync('collected');
}

queryTimeline();
// Poll every 11 seconds. With a client ID and secret the API allows 5,000
// requests per hour, and every poll can fan out into one request per commit,
// so the cadence is kept fairly conservative.
setInterval(queryTimeline, 11000);
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "GHScraper",
  "version": "1.0.0",
  "description": "Scrape GitHub commits for sensitive data",
  "main": "index.js",
  "dependencies": {
    "request": "^2.75.0",
    "xml2js": "^0.4.17"
  },
  "devDependencies": {},
  "scripts": {
    "start": "node index.js"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/Plazmaz/GHScraper.git"
  },
  "author": "Dylan Katz",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/Plazmaz/GHScraper/issues"
  },
  "homepage": "https://github.com/Plazmaz/GHScraper"
}
--------------------------------------------------------------------------------
/readme/.dontdelete:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/readme/secrets-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plazmaz/GHScraper/07b4e7277c8c7d6631e7013b8b850229e2810c2f/readme/secrets-2.png
--------------------------------------------------------------------------------