├── .gitignore
├── LICENSE.md
├── README.md
├── config.json
├── github-dorks.txt
├── index.js
├── package.json
└── readme
    ├── .dontdelete
    └── secrets-2.png

/.gitignore:
--------------------------------------------------------------------------------
data*.txt
node_modules*
config.json
collected/*
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Dylan Katz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Installation
## Windows
To use GHScraper, first download and install [Node.js](https://nodejs.org/en/download/), which includes npm. Once that is done, open a command prompt in the GHScraper directory and run

    npm install

to download the required dependencies.

## Ubuntu
Install Node.js, npm, and git, then clone the repository and install its dependencies:

    sudo apt-get install nodejs npm git
    git clone https://github.com/Plazmaz/GHScraper.git
    cd GHScraper
    npm install

# Configuration

To use GHScraper, you'll need to create a new OAuth application. You can do this [here](https://github.com/settings/applications/new). Once you've created the application, GitHub will generate a client ID and client secret.
![secrets](https://github.com/Plazmaz/GHScraper/blob/master/readme/secrets-2.png?raw=true)

From there, open the `config.json` file and fill in your client ID and secret.

Once configured, you can run

    npm start

or

    node index.js

to begin scraping.


# Extension/Contributing
To add new queries or modify existing ones, edit the `github-dorks.txt` file. Each line is a single query ("dork") written in a modified version of GitHub's search syntax; lines beginning with `//` are treated as comments and ignored.

| Query     | Purpose                                                          | Example usage      |
|-----------|------------------------------------------------------------------|--------------------|
| filename  | Matches files by name                                            | filename:README.md |
| path      | Matches files within a specific path                             | path:var/www       |
| extension | Matches files by extension                                       | extension:txt      |
| other     | All other text is searched for in the file's changed contents    | Test one two three |
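
As a hypothetical illustration (this exact line does not ship in `github-dorks.txt`), a dork that combines several qualifiers might look like:

    filename:.env path:config DB_PASSWORD

Every part of a line must match the same changed file: here the file name must end in `.env`, its path must contain a `config/` segment, and the commit diff must contain the text `DB_PASSWORD`.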

If you find something cool, feel free to make a pull request!
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
{
    "client_id": "YOUR_CLIENT_ID_HERE",
    "client_secret": "YOUR_SECRET_HERE"
}
--------------------------------------------------------------------------------
/github-dorks.txt:
--------------------------------------------------------------------------------
filename:.npmrc _auth
filename:.dockercfg auth
extension:pem private
extension:ppk private
filename:id_rsa
filename:id_dsa
extension:sql mysql dump
extension:sql mysql dump password
define("DB_PASSWORD"
e10adc3949ba59abbe56e057f20f883e
filename:secret_token.rb config
extension:rb secret_token
extension:conf FTP server configuration
filename:credentials aws_access_key_id
filename:.s3cfg
filename:wp-config.php
filename:.htpasswd
filename:.env DB_USERNAME
filename:.env MAIL_HOST=smtp.gmail.com
filename:.git-credentials
PT_TOKEN extension:sh
filename:.bashrc password
filename:.bashrc mailchimp
filename:.bash_profile aws
rds.amazonaws.com password
extension:json api.forecast.io
extension:json mongolab.com
extension:yaml mongolab.com
extension:xls mail
extension:xchat
extension:xchat2
extension:dbeaver-data-sources.xml
extension:muttrc
extension:kdb
extension:agilekeychain
extension:keychain
jsforce extension:js conn.login
SF_USERNAME "salesforce"
filename:.tugboat
HEROKU_API_KEY extension:sh
HEROKU_API_KEY extension:json
filename:.netrc password
filename:_netrc password
filename:hub oauth_token
filename:robomongo.json
filename:filezilla.xml Pass
filename:recentservers.xml Pass
filename:config.json auths
filename:idea14.key
filename:config irc_pass
filename:connections.xml
filename:express.conf path:.openshift
filename:.pgpass
filename:proftpdpasswd
filename:ventrilo_srv.ini
[WFClient] Password= extension:ica
filename:server.cfg rcon password
JEKYLL_GITHUB_TOKEN
filename:.bash_history
filename:.cshrc
filename:.sh_history
filename:sshd_config
filename:dhcpd.conf
filename:prod.exs
filename:prod.secret.exs
filename:configuration.php JConfig password
filename:config.php dbpasswd
filename:config.php passw
path:sites databases password
shodan_api_key extension:py
shodan_api_key extension:sh
shodan_api_key extension:json
shodan_api_key extension:rb
filename:shadow path:etc
filename:passwd path:etc
filename:ssh_config
filename:.ssh_config
extension:pkcs12 private
filename:.mysql_history
filename:.pgpass
filename:database.yml pass
filename:credentials.xml
filename:knife.rb
--------------------------------------------------------------------------------
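
Before starting a long scrape, it can help to confirm that the client ID and secret from `config.json` are actually being accepted. The snippet below is not part of GHScraper; it is a minimal standalone sketch using the same `request` dependency, and it assumes GitHub's `/rate_limit` endpoint together with the `client_id`/`client_secret` query parameters that `index.js` (below) already relies on:

    var fs = require('fs');
    var request = require('request');

    // Read the same credentials the scraper uses.
    var config = JSON.parse(fs.readFileSync('config.json', 'utf-8'));
    var url = 'https://api.github.com/rate_limit' +
        '?client_id=' + config.client_id +
        '&client_secret=' + config.client_secret;

    // GitHub rejects API requests that have no User-Agent header.
    request({ url: url, headers: { 'User-Agent': 'GHScraper' }, json: true }, function(err, res, body) {
        if (err) {
            return console.error(err);
        }
        // If the credentials were recognized, the limit should be well above
        // the anonymous 60-requests-per-hour ceiling.
        console.log('Remaining requests this hour: ' + body.rate.remaining + '/' + body.rate.limit);
    });

If the reported limit stays at 60, the credentials were not picked up.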
/index.js:
--------------------------------------------------------------------------------
var request = require('request');
var fs = require('fs');

const USER_AGENT = "GHScraper";
var cid;
var secret;

var dorks = [];
var scannedCommits = 0;
var etags = {};

// Mirror everything passed to console.log into data-log.txt as well.
var oldLog = console.log;
console.log = function(txt) {
    oldLog(txt);
    fs.appendFile('data-log.txt', String(txt).trim() + '\r\n', function(err) {
        if (err) {
            oldLog(err);
        }
    });
};

// A Dork is one line of github-dorks.txt, split into QueryParts. A changed file
// from a commit matches the dork only if every part matches.
var Dork = function(queryParts) {
    this.queryParts = queryParts;
    this.matches = function(fileData) {
        var foundMatch = true;
        for (var i = 0; i < this.queryParts.length; i++) {
            if (this.queryParts[i].isFilename) {
                foundMatch = foundMatch && fileData.filename.toString().trim().endsWith(this.queryParts[i].queryString);
            } else if (this.queryParts[i].isExtension) {
                foundMatch = foundMatch && fileData.filename.toString().trim().endsWith('.' + this.queryParts[i].queryString);
            } else if (this.queryParts[i].isPath) {
                foundMatch = foundMatch && fileData.filename.toString().match(new RegExp('(.*?)\/' + this.queryParts[i].queryString));
            } else {
                // Plain text is searched for in the commit's patch (diff) for this file.
                foundMatch = foundMatch && fileData.patch && fileData.patch.toString().indexOf(this.queryParts[i].queryString) != -1;
            }
        }
        return foundMatch;
    };
};

// Use the native String.prototype.endsWith where available (Node 4+). The
// fallback avoids the original regex-based override, which broke on suffixes
// containing regex metacharacters (e.g. ".env").
if (!String.prototype.endsWith) {
    String.prototype.endsWith = function(suffix) {
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}

fs.readFile('config.json', 'utf-8', function(err, data) {
    if (err) {
        console.error(err);
        return;
    }
    var json;
    try {
        json = JSON.parse(data);
    } catch (e) {
        console.error("config.json is not valid JSON!");
        return;
    }
    if (!json.client_id || !json.client_secret) {
        console.error("Invalid or corrupted configuration file!");
        return;
    }
    cid = json.client_id;
    secret = json.client_secret;

    getDorks('github-dorks.txt');
});

// One token of a dork line: a filename:, extension:, or path: qualifier, or a
// plain string to look for in the diff itself.
function QueryPart() {
    this.isFilename = false;
    this.isExtension = false;
    this.isPath = false;
    this.queryString = '';
}

// Build request options, sending the last known ETag for this URL so GitHub can
// answer with 304 Not Modified, which does not count against the rate limit.
function getOptions(url) {
    return {
        url: url,
        headers: {
            'User-Agent': USER_AGENT,
            'If-None-Match': etags[url] ? etags[url] : ''
        }
    };
}

// Load dork definitions, one per line. Blank lines and lines starting with //
// are skipped.
function getDorks(filename) {
    fs.readFile(filename, 'utf-8', function(err, data) {
        if (err) {
            console.error(err);
            return;
        }
        var tmpDorks = data.split('\n');
        for (var i = 0; i < tmpDorks.length; i++) {
            var rawDork = tmpDorks[i].trim();
            if (!rawDork || rawDork.indexOf("//") == 0) {
                continue;
            }
            dorks.push(new Dork(getQueryParts(rawDork)));
        }
    });
}

// Split a dork line on spaces and turn each token into a QueryPart.
function getQueryParts(dork) {
    var parts = [];
    var stringParts = dork.split(" ");
    for (var i = 0; i < stringParts.length; i++) {
        stringParts[i] = stringParts[i].trim();
        var QueryPartObj = new QueryPart();
        if (stringParts[i].indexOf('filename:') === 0) {
            QueryPartObj.isFilename = true;
            QueryPartObj.queryString = stringParts[i].substring('filename:'.length);
        } else if (stringParts[i].indexOf('extension:') === 0) {
            QueryPartObj.isExtension = true;
            QueryPartObj.queryString = stringParts[i].substring('extension:'.length);
        } else if (stringParts[i].indexOf('path:') === 0) {
            QueryPartObj.isPath = true;
            QueryPartObj.queryString = stringParts[i].substring('path:'.length);
        } else {
            QueryPartObj.queryString = stringParts[i];
        }
        parts.push(QueryPartObj);
    }
    return parts;
}
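
// Worked example, added for illustration only (nothing here executes): given
// the hypothetical dork line
//     filename:.env path:config DB_PASSWORD
// getQueryParts() produces three QueryPart objects:
//     { isFilename: true,  isExtension: false, isPath: false, queryString: '.env' }
//     { isFilename: false, isExtension: false, isPath: true,  queryString: 'config' }
//     { isFilename: false, isExtension: false, isPath: false, queryString: 'DB_PASSWORD' }
// Dork.matches() then requires every part to hold for a single changed file:
// the filename must end with ".env", the path must contain "/config", and the
// patch text must contain "DB_PASSWORD".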
// Poll GitHub's public events feed, then fetch each pushed commit and test its
// changed files against every loaded dork.
function queryTimeline() {
    if (!cid || !secret || dorks.length === 0) {
        // config.json and github-dorks.txt are loaded asynchronously at
        // startup; skip this tick if they are not ready yet.
        return;
    }
    var options = getOptions('https://api.github.com/events?per_page=135&client_id=' + cid + '&client_secret=' + secret);
    request(options, function(err, response, body) {
        if (err) {
            console.error(err);
            return;
        }
        if (response) {
            // Node lowercases incoming header names, so read 'etag', not 'ETag'.
            etags[options.url] = response.headers['etag'];
            if (response.statusCode === 304) {
                // Nothing new since the last poll.
                return;
            }
        }

        var events;
        try {
            events = JSON.parse(body);
        } catch (e) {
            console.error('Could not parse events response: ' + body);
            return;
        }
        if (!Array.isArray(events)) {
            // Error payloads (e.g. rate limiting) come back as an object, not an array.
            console.log(body);
            return;
        }
        for (var i = 0; i < events.length; i++) {
            var evt = events[i];
            var type = evt.type;
            if (type.indexOf('PushEvent') == 0 || type.indexOf('CreateEvent') == 0 || type.indexOf('PullRequestEvent') == 0) {
                // Only PushEvent payloads actually carry a commits array; the
                // guard below skips the other event types.
                if (!evt.payload.commits) {
                    continue;
                }
                var commits = evt.payload.commits;
                commits.forEach(function(commit) {
                    var commitURL = commit.url;
                    var options = getOptions(commitURL + '?client_id=' + cid + '&client_secret=' + secret);

                    request(options, function(err, response, body) {
                        if (response) {
                            etags[options.url] = response.headers['etag'];
                        }
                        scannedCommits++;
                        if (scannedCommits % 50 == 0 && scannedCommits > 0) {
                            console.log(scannedCommits + " commits scanned...");
                        }
                        if (err) {
                            console.error(body);
                            console.error(err);
                            return;
                        }
                        var files;
                        try {
                            files = JSON.parse(body).files;
                        } catch (e) {
                            console.error('Could not parse commit response.');
                            return;
                        }
                        if (!files) {
                            console.error('Ratelimit hit or no files found.');
                            return;
                        }
                        if (files.length === 0) {
                            return;
                        }
                        for (var j = 0; j < dorks.length; j++) {
                            for (var k = 0; k < files.length; k++) {
                                var file = files[k];
                                if (dorks[j].matches(file)) {
                                    console.log(JSON.stringify(dorks[j].queryParts) + ' matched file ' + file.filename + ' with sha ' + file.sha + '!');
                                    console.log('Data url: ' + file.raw_url);
                                    // Save the matched file under collected/, flattening the URL path into a file name.
                                    var fileStream = fs.createWriteStream("collected/" + file.raw_url.substring('https://github.com'.length).replace(/\//g, "_"));
                                    request(file.raw_url).pipe(fileStream);
                                    return;
                                }
                            }
                        }
                    });
                });
            }
        }
    });
}
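
// For reference, these are the fields this scraper reads from the GitHub API
// (abridged and illustrative; the real responses contain many more fields):
//
//   GET /events        -> [ { type: "PushEvent",
//                             payload: { commits: [ { url: "https://api.github.com/repos/.../commits/<sha>" } ] } }, ... ]
//   GET <commit url>   -> { files: [ { filename: "config/.env",
//                                      sha: "...",
//                                      raw_url: "https://github.com/.../raw/...",
//                                      patch: "@@ -0,0 +1 @@ ..." } ] }
//
// Dork.matches() runs against each entry of `files`; raw_url is what gets
// downloaded into collected/ when a dork matches.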
console.log("=== Starting new scan ===");

// collected/ is gitignored, so it may not exist after a fresh clone; create it
// before the first match tries to write into it.
if (!fs.existsSync('collected')) {
    fs.mkdirSync('collected');
}

queryTimeline();
// Poll every 11 seconds. With a client ID and secret the API allows 5,000
// requests per hour, and every poll can fan out into one request per commit,
// so the cadence is kept fairly conservative.
setInterval(queryTimeline, 11000);
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "GHScraper",
  "version": "1.0.0",
  "description": "Scrape GitHub commits for sensitive data",
  "main": "index.js",
  "dependencies": {
    "request": "^2.75.0",
    "xml2js": "^0.4.17"
  },
  "devDependencies": {},
  "scripts": {
    "start": "node index.js"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/Plazmaz/GHScraper.git"
  },
  "author": "Dylan Katz",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/Plazmaz/GHScraper/issues"
  },
  "homepage": "https://github.com/Plazmaz/GHScraper"
}
--------------------------------------------------------------------------------
/readme/.dontdelete:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/readme/secrets-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plazmaz/GHScraper/07b4e7277c8c7d6631e7013b8b850229e2810c2f/readme/secrets-2.png
--------------------------------------------------------------------------------