├── collections.jpg ├── leaked_credentials.sqlite ├── README.md └── parse2bbdd.py /collections.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/p3pperp0tts/leaks_parser/HEAD/collections.jpg -------------------------------------------------------------------------------- /leaked_credentials.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/p3pperp0tts/leaks_parser/HEAD/leaked_credentials.sqlite -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # leaks_parser 2 | 3 | Parser for data dumps Collection #1 / Collection #2-5 4 | 5 | ## Description 6 | 7 | This python script is a parser for the latest data dumps collections #1, #2-5, Antipublic #1 and Antipublic MYR & ZABUGOR #2. 8 | 9 | It will parse text files from data dumps and will create a sqlite database. 10 | 11 | ## How to use 12 | 13 | The script and the empty database must be moved to the root folder where collections have been decompressed: 14 | 15 | ``` 16 | Collection #1 17 | Collection #2 18 | Collection #3 19 | Collection #4 20 | Collection #5 21 | Antipublic MYR & ZABUGOR #2 22 | Antipublic #1 23 | parse2bbdd.py 24 | leaked_credentials.sqlite 25 | ``` 26 | 27 | Each collection contains subcollections that are compressed tar.gz files, and should be decompressed too, before calling the script. 
For example: 28 | 29 | ``` 30 | dir F:\Collection #1 31 | 32 | Collection #1_BTC combos 33 | Collection #1_Dumps - dehashed 34 | Collection #1_EU combos 35 | Collection #1_EU combos_1 36 | Collection #1_Games combos 37 | Collection #1_Games combos_Dumps 38 | Collection #1_Games combos_Sharpening 39 | Collection #1_MAIL ACCESS combos 40 | Collection #1_Monetary combos 41 | Collection #1_NEW combo semi private_Dumps 42 | Collection #1_NEW combo semi private_EU combo 43 | Collection #1_NEW combo semi private_Private combos 44 | ... 45 | ``` 46 | 47 | Each subcollection contains the files with the credentials to be parsed: 48 | 49 | ``` 50 | dir F:\Collection #1_BTC combos 51 | 52 | 144.txt 53 | 158.txt 54 | 151.txt 55 | 214.txt 56 | 120.txt 57 | 208.txt 58 | 205.txt 59 | 161.txt 60 | ... 61 | ``` 62 | 63 | The script will be able to parse most of these files with credentials. 64 | 65 | When a file is correctly parsed (and credentials are added to the database), it is renamed by adding the extension .ALREADYPARSED. 66 | 67 | The script will create three output files: 68 | 69 | - consistences.txt -> path to files correctly imported to database 70 | - inconsistences.txt -> path to files with unknown format that were not imported to database 71 | - exceptions.txt -> path to files that raised an exception while being processed 72 | 73 | Most of the files are imported correctly. The files that were not imported are logged into inconsistences.txt and exceptions.txt (and, in addition, they are not renamed to *.ALREADYPARSED). A custom parser will probably be necessary for those files. 
74 | 75 | ## Database format 76 | 77 | ### Tables 78 | 79 | - Collections 80 | - Subcollections 81 | - Credentials 82 | 83 | ### Credentials table's columns 84 | 85 | - collection INTEGER -> index for Collections table 86 | - subcollection INTEGER -> index for Subcollections table 87 | - username TEXT 88 | - email TEXT 89 | - password_plaintext TEXT 90 | - password_md5 TEXT 91 | - password_sha1 TEXT 92 | - password_sha256 TEXT 93 | - password_bcrypt TEXT 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /parse2bbdd.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import sys 3 | from validate_email import validate_email 4 | import hashlib 5 | import binascii 6 | import os 7 | import shutil 8 | 9 | ########################################################################## 10 | 11 | class LineParser: 12 | 13 | @staticmethod 14 | def md5(s): 15 | m = hashlib.md5() 16 | m.update(s) 17 | return binascii.hexlify(m.digest()) 18 | 19 | @staticmethod 20 | def sha1(s): 21 | m = hashlib.sha1() 22 | m.update(s) 23 | return binascii.hexlify(m.digest()) 24 | 25 | @staticmethod 26 | def sha256(s): 27 | m = hashlib.sha256() 28 | m.update(s) 29 | return binascii.hexlify(m.digest()) 30 | 31 | @staticmethod 32 | def parsesimplelinebyseparator(s, sep, sepname): 33 | typ = "" 34 | mail = "" 35 | user = "" 36 | passwd = "" 37 | if s.count(sep)==1: #and s[-1]!=sep: 38 | l = s.split(sep) 39 | if validate_email(l[0]): 40 | typ = "user_or_mail_%s_pass" % sepname 41 | user = l[0] 42 | mail = l[0] 43 | else: 44 | typ = "user_or_mail_%s_pass" % sepname 45 | user = l[0] 46 | passwd = l[1] 47 | return True, user, mail, passwd, typ 48 | return False, "", "", "", "" 49 | 50 | @staticmethod 51 | def ismd5(passwd): 52 | if len(passwd)==32: 53 | for e in passwd: 54 | if not ((ord(e)>=ord('0') and ord(e)<=ord('9')) or (ord(e)>=ord('A') and ord(e)<=ord('F')) or (ord(e)>=ord('a') and 
ord(e)<=ord('f'))): 55 | return False 56 | return True 57 | else: 58 | return False 59 | 60 | @staticmethod 61 | def issha1(passwd): 62 | if len(passwd)==40: 63 | for e in passwd: 64 | if not ((ord(e)>=ord('0') and ord(e)<=ord('9')) or (ord(e)>=ord('A') and ord(e)<=ord('F')) or (ord(e)>=ord('a') and ord(e)<=ord('f'))): 65 | return False 66 | return True 67 | else: 68 | return False 69 | 70 | @staticmethod 71 | def issha256(passwd): 72 | if len(passwd)==64: 73 | for e in passwd: 74 | if not ((ord(e)>=ord('0') and ord(e)<=ord('9')) or (ord(e)>=ord('A') and ord(e)<=ord('F')) or (ord(e)>=ord('a') and ord(e)<=ord('f'))): 75 | return False 76 | return True 77 | else: 78 | return False 79 | 80 | @staticmethod 81 | def isbcrypt(passwd): 82 | if len(passwd)>50 and len(passwd)<70 and (passwd[0:4]=="$2a$" or passwd[0:4]=="$2b$" or passwd[0:4]=="$2y$"): 83 | return True 84 | return False 85 | 86 | @staticmethod 87 | def parsepasswd(passwd): 88 | if LineParser.ismd5(passwd): 89 | passwdmd5 = passwd 90 | passwdsha1 = "" 91 | passwdsha256 = "" 92 | passwdbcrypt = "" 93 | passwd = "" 94 | elif LineParser.issha1(passwd): 95 | passwdmd5 = "" 96 | passwdsha1 = passwd 97 | passwdsha256 = "" 98 | passwdbcrypt = "" 99 | passwd = "" 100 | elif LineParser.issha256(passwd): 101 | passwdmd5 = "" 102 | passwdsha1 = "" 103 | passwdsha256 = passwd 104 | passwdbcrypt = "" 105 | passwd = "" 106 | elif LineParser.isbcrypt(passwd): 107 | passwdmd5 = "" 108 | passwdsha1 = "" 109 | passwdsha256 = "" 110 | passwdbcrypt = passwd 111 | passwd = "" 112 | else: 113 | passwdmd5 = LineParser.md5(passwd) 114 | passwdsha1 = LineParser.sha1(passwd) 115 | passwdsha256 = LineParser.sha256(passwd) 116 | passwdbcrypt = "" 117 | 118 | return passwd, passwdmd5, passwdsha1, passwdsha256, passwdbcrypt 119 | 120 | @staticmethod 121 | def parseline(s): 122 | typ = "" 123 | mail = "" 124 | user = "" 125 | passwd = "" 126 | passwdmd5 = "" 127 | passwdsha1 = "" 128 | passwdsha256 = "" 129 | passwdbcrypt = "" 130 | bvalid 
= False 131 | #case mail@mail.com:password 132 | #case username:password 133 | #case mail@mail.com;password 134 | #case username;password 135 | good, user, mail, passwd, typ = LineParser.parsesimplelinebyseparator(s, ':', "doubledots_or_dotcomma") 136 | if not good: good, user, mail, passwd, typ = LineParser.parsesimplelinebyseparator(s, ';', "doubledots_or_dotcomma") 137 | if good: bvalid = True 138 | if bvalid: 139 | passwd, passwdmd5, passwdsha1, passwdsha256, passwdbcrypt = LineParser.parsepasswd(passwd) 140 | return {"type": typ, "mail": mail , "user": user, "pass": passwd, "passmd5": passwdmd5, "passsha1": passwdsha1, "passsha256": passwdsha256, "passwdbcrypt": passwdbcrypt} 141 | 142 | ########################################################################## 143 | 144 | class LeakParser: 145 | 146 | def updatecache(self): 147 | if self.beof: return 148 | if len(self.fleakcache)-self.icurcache > self.maxlinelength: return 149 | newread = self.fleak.read(0x1000000) 150 | if len(newread) < 0x1000000: self.beof = True 151 | self.fleakcache = self.fleakcache[self.icurcache:] + newread 152 | self.icurcache = 0 153 | 154 | def updatecurline(self): 155 | self.updatecache() 156 | for linebreak in self.linebreaks: 157 | try: 158 | ibr = self.fleakcache.index(linebreak, self.icurcache, self.icurcache+self.maxlinelength) 159 | self.icurlinestart = self.icurcache 160 | self.icurlineend = ibr+len(linebreak) 161 | self.icurcache = self.icurlineend 162 | #print self.icurlinestart, "->", self.icurlineend, ":", self.getcurline() 163 | return 164 | except: 165 | continue 166 | if self.beof and (len(self.fleakcache)-self.icurcache<=self.maxlinelength) and (len(self.fleakcache)-self.icurcache!=0): 167 | self.icurlinestart = self.icurcache 168 | self.icurcache = len(self.fleakcache) 169 | self.icurlineend = self.icurcache 170 | return 171 | self.lineerr = True 172 | 173 | def getcurline(self): 174 | return self.fleakcache[self.icurlinestart:self.icurlineend].strip() 175 | 176 | 
def setcollection(self): 177 | scriptdir = os.path.dirname(os.path.realpath(__file__)) 178 | relleakpath = os.path.relpath(self.leakpath, scriptdir) 179 | temp = os.path.normpath(relleakpath) 180 | temp = temp.split(os.sep) 181 | self.collection = str(temp[0]) 182 | self.subcollection = str(temp[1]) 183 | bupdatecollections = False 184 | if not self.BBDDcollections.has_key(self.collection): self.addBBDDcollection(self.collection) 185 | if not self.BBDDsubcollections.has_key(self.subcollection): self.addBBDDsubcollection(self.subcollection) 186 | self.collectionid = self.BBDDcollections[self.collection] 187 | self.subcollectionid = self.BBDDsubcollections[self.subcollection] 188 | 189 | def addBBDDcollection(self, collection): 190 | print collection 191 | sql = """INSERT INTO collections(collectionname) VALUES(?)""" 192 | self.cursor.execute(sql, (collection,)) 193 | self.getBBDDcollections() 194 | 195 | def addBBDDsubcollection(self, subcollection): 196 | print subcollection 197 | sql = """INSERT INTO subcollections(subcollectionname) VALUES(?)""" 198 | self.cursor.execute(sql, (subcollection,)) 199 | self.getBBDDcollections() 200 | 201 | def getBBDDcollections(self): 202 | self.BBDDcollections = {} 203 | l = self.cursor.execute("SELECT * FROM collections") 204 | for e in l: 205 | self.BBDDcollections[str(e[1])] = e[0] 206 | self.BBDDsubcollections = {} 207 | l = self.cursor.execute("SELECT * FROM subcollections") 208 | for e in l: 209 | self.BBDDsubcollections[str(e[1])] = e[0] 210 | print self.BBDDcollections 211 | print self.BBDDsubcollections 212 | 213 | def __init__(self, leakpath): 214 | self.maxlinelength = 200 215 | self.minlinelength = 3 216 | self.linebreaks = ["\r\n", "\r", "\n"] 217 | self.BBDDcollections = {} 218 | self.BBDDsubcollections = {} 219 | self.collection = "" 220 | self.subcollection = "" 221 | self.collectionid = 0 222 | self.subcollectionid = 0 223 | self.leakpath = leakpath 224 | self.conn = sqlite3.connect('leaked_credentials.sqlite') 
225 | self.conn.text_factory = str 226 | self.cursor = self.conn.cursor() 227 | self.getBBDDcollections() 228 | self.setcollection() 229 | self.fleak = open(self.leakpath, "rb") 230 | self.fleakcache = "" 231 | self.icurcache = 0 232 | self.icurlinestart = 0 233 | self.icurlineend = 0 234 | self.beof = False 235 | self.lineerr = False 236 | self.test_info2bbdd_counter = 0 237 | self.info2bbdd = self.info2bbdd_real 238 | 239 | def info2bbdd_test(self, info): 240 | if self.test_info2bbdd_counter % 20000 == 0: 241 | if info: print repr(info) 242 | self.test_info2bbdd_counter += 1 243 | 244 | def info2bbdd_real(self, info): 245 | if self.test_info2bbdd_counter % 20000 == 0: 246 | if info: print repr(info) 247 | self.test_info2bbdd_counter += 1 248 | if info: 249 | sql = """INSERT INTO credentials(collection, subcollection, username, email, password_plaintext, password_md5, password_sha1, password_sha256, password_bcrypt) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)""" 250 | self.cursor.execute(sql, (self.collectionid, self.subcollectionid, str(info["user"]), str(info["mail"]), str(info["pass"]), str(info["passmd5"]), str(info["passsha1"]), str(info["passsha256"]), str(info["passwdbcrypt"]),)) 251 | 252 | def run(self): 253 | lastinconsistences = [] 254 | binconsistence = False 255 | self.updatecurline() 256 | line1 = LineParser.parseline(self.getcurline()) 257 | line2 = LineParser.parseline(self.getcurline()) 258 | line3 = LineParser.parseline(self.getcurline()) 259 | line4 = LineParser.parseline(self.getcurline()) 260 | line5 = LineParser.parseline(self.getcurline()) 261 | if self.lineerr or not(line1!=None and \ 262 | line2!=None and \ 263 | line3!=None and \ 264 | line4!=None and \ 265 | line5!=None and \ 266 | line1["type"] == line2["type"] and \ 267 | line2["type"] == line3["type"] and \ 268 | line3["type"] == line4["type"] and \ 269 | line4["type"] == line5["type"]): 270 | print "Inconsistent file by first lines" 271 | binconsistence = True 272 | if not binconsistence: 273 
| FileLeakType = line1["type"] 274 | print "FileLeakType:", FileLeakType 275 | InconsistencesCounter = 0 276 | self.info2bbdd(line1) 277 | self.info2bbdd(line2) 278 | self.info2bbdd(line3) 279 | self.info2bbdd(line4) 280 | self.info2bbdd(line5) 281 | while not self.lineerr: 282 | self.updatecurline() 283 | line = LineParser.parseline(self.getcurline()) 284 | if not line or line["type"]!=FileLeakType: 285 | InconsistencesCounter += 1 286 | lastinconsistences.append(self.getcurline()) 287 | if len(lastinconsistences)>10: lastinconsistences = lastinconsistences[-10:] 288 | #print "CAREFUL Inconsistent line after pre-filter!!!!", self.getcurline() 289 | else: 290 | if InconsistencesCounter: InconsistencesCounter -= 1 291 | if InconsistencesCounter>=10: 292 | print "CAREFUL Too much Inconsistences, break!" 293 | binconsistence = True 294 | break 295 | self.info2bbdd(line) 296 | if binconsistence: 297 | f = open("inconsistences.txt", "a+b") 298 | f.write(self.leakpath+":::"+repr(lastinconsistences)+"\r\n") 299 | f.close() 300 | else: 301 | f = open("consistences.txt", "a+b") 302 | f.write(self.leakpath+"\r\n") 303 | f.close() 304 | self.conn.commit() 305 | 306 | 307 | def managefile(p): 308 | try: 309 | print "Managing file:", p 310 | lp = LeakParser(p) 311 | print lp.collection 312 | print lp.subcollection 313 | lp.run() 314 | lp.fleak.close() 315 | shutil.move(p, p+".ALREADYPARSED") 316 | except Exception as e: 317 | s = p + "----" + repr(e.message) + "----" + repr(e.args) + "\r\n" 318 | f = open("exceptions.txt", "a+b") 319 | f.write(s) 320 | f.close() 321 | 322 | def recurfiles(p): 323 | for e in os.listdir(p): 324 | if "NOPARSE" not in e and "ALREADYPARSED" not in e: 325 | if os.path.isdir(p+"/"+e): 326 | recurfiles(p+"/"+e) 327 | else: 328 | managefile(p+"/"+e) 329 | 330 | #f = open("inconsistences.txt", "w+b") 331 | #f.close() 332 | #f = open("consistences.txt", "w+b") 333 | #f.close() 334 | #f = open("exceptions.txt", "w+b") 335 | #f.close() 336 | 337 | 
recurfiles(os.path.dirname(os.path.realpath(__file__))) 338 | --------------------------------------------------------------------------------