"""BotDetection (requires Python 3.x).

Neural network trained to detect spammers on Reddit.  Usage is as simple as::

    from models import Classifier
    t = 0.9  # confidence threshold to return True

    Algorithm = Classifier(t)
    print(Algorithm.is_a_bot('ThaChippa'))

The ``is_a_bot`` method from the previous example returns the following
tuple: ``(True, 0.9999999999956763)``, where the first item is the result and
the second item is the network's confidence.  It returns ``None`` when fewer
than 15 usable comments are available for the account.

TODO: Ignore NSFW/Foreign Languages
"""
import csv  # noqa: F401  (unused here; kept so ``models.csv`` keeps resolving)
import operator  # noqa: F401  (unused here; kept for backward compatibility)
import string
from collections import Counter
from functools import lru_cache

import numpy as np
from scipy.special import expit

np.random.seed(1)

blacklisted = ['askouija', 'test', 'freekarma4you']  # Subreddits where people engage in bot-like behavior

# Minimum number of comments needed before an account can be scored.
_MIN_COMMENTS = 15
# Features report this floor instead of an exact zero.
_FLOOR = 0.01
# Substrings whose presence in a comment hints at an automated account.
_BOT_KEYWORDS = ("bot", "source code", "feedback", "contact", "faq", "*", "**", "^")
# Deletes every ASCII punctuation character and newline in a single pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + '\n')


@lru_cache(maxsize=None)
def _reddit():
    """Return the shared ``praw.Reddit`` client, creating it on first use.

    The client is built lazily so that importing this module (and using the
    pure feature functions) does not require praw or its configuration.
    """
    import praw

    return praw.Reddit()


def __getattr__(name):
    """Keep the historical module attribute ``r`` available (PEP 562)."""
    if name == 'r':
        return _reddit()
    raise AttributeError("module {!r} has no attribute {!r}".format(__name__, name))


@lru_cache(maxsize=None)
def _default_stop_words():
    """Return English, Spanish and Portuguese NLTK stop words as a frozenset."""
    from nltk.corpus import stopwords

    return frozenset(
        stopwords.words('english')
        + stopwords.words('spanish')
        + stopwords.words('portuguese')
    )


def botInName(user):
    """Return 1 if 'bot' occurs in the user name (case-insensitive), else 0.01."""
    return 1 if 'bot' in str(user).lower() else 0.01


def getSameLevels(user, comlist):
    """Return the larger of the top-level and reply shares of the comments.

    ``user`` is unused; it is kept for a uniform feature signature.
    Returns 0.01 for an empty list (the original raised ZeroDivisionError).
    """
    comments = list(comlist)
    total = len(comments)
    if not total:
        return _FLOOR
    top_level = sum(1 for comment in comments if comment.is_root)
    return max(top_level / total, (total - top_level) / total)


def uniqueComments(user, comlist):
    """Return the fraction of non-distinguished comments with a distinct body.

    Bodies are compared case-insensitively.  When no comment survives the
    filter the result is 1.0 (nothing was repeated) instead of the
    ZeroDivisionError the original raised.
    """
    comments = [c for c in comlist if c.distinguished is None]
    if not comments:
        return 1.0
    distinct = {c.body.lower() for c in comments}
    return len(distinct) / len(comments)


def getSpread(user, comlist):
    """Return the average number of comments per distinct subreddit.

    Returns 0.01 for an empty list; the original intended that floor but
    divided by zero before reaching it.
    """
    comments = list(comlist)
    subreddits = {str(c.subreddit) for c in comments}
    if not subreddits:
        return _FLOOR
    return len(comments) / len(subreddits)


def isRepetitive(user, comlist, stop_words=None):
    """Return the mean comment share of the two most widespread words.

    Only non-distinguished comments are considered, and fewer than 15 of
    them yields 0.01.  Each word counts at most once per comment; the counts
    of the two most common words are divided by the number of comments and
    averaged (only one share is used when a single word exists).  Returns
    0.01 when no countable word remains, where the original raised
    IndexError from inside its own ``except IndexError`` handler.

    ``stop_words`` optionally overrides the ignored words; by default the
    NLTK English, Spanish and Portuguese stop-word lists are used.
    """
    comments = [c for c in comlist if c.distinguished is None]
    total = len(comments)
    if total < _MIN_COMMENTS:
        return _FLOOR

    # Loop-invariant: the original rebuilt these lists for every comment.
    ignored = _default_stop_words() if stop_words is None else frozenset(stop_words)

    seen_bodies = set()
    comment_counts = Counter()
    for comment in comments:
        text = comment.body.replace("[", " ").replace("]", " ")
        lowered = text.lower()
        if 'http' in lowered or 'www.' in lowered:
            # Break URLs apart so their path segments become separate words.
            text = text.replace('/', ' ')
        body_key = text.lower()
        # NOTE(review): the original indentation was lost; word counting is
        # assumed to apply only to comment bodies not seen before - confirm.
        if body_key in seen_bodies:
            continue
        seen_bodies.add(body_key)
        # Punctuation is already stripped, so the original's "'s"/"'ve"
        # suffix handling could never match and is omitted.
        tokens = text.translate(_PUNCT_TABLE).lower().split(' ')
        comment_counts.update({tok for tok in tokens if tok and tok not in ignored})

    top_two = sorted(comment_counts.values(), reverse=True)[:2]
    if not top_two:
        return _FLOOR
    return sum(count / total for count in top_two) / len(top_two)


def keyWordPres(user, comlist):
    """Return the share of comments containing the most frequent bot keyword.

    Only non-distinguished comments are considered.  Returns 0.01 when no
    keyword occurs or when there are no comments (the original divided by
    zero in the latter case).
    """
    bodies = [c.body.lower() for c in comlist if c.distinguished is None]
    if not bodies:
        return _FLOOR
    best = max(sum(1 for body in bodies if keyword in body) for keyword in _BOT_KEYWORDS)
    return best / len(bodies) if best else _FLOOR


def fewPosts(user, innum):
    """Return ``innum`` divided by the user's submission count (max 100 fetched).

    Returns 100.0 when the user has no submissions.  Queries Reddit.
    """
    submissions = list(_reddit().redditor(user).submissions.new(limit=100))
    if not submissions:
        return 100.0
    return float(innum) / float(len(submissions))


def avTime(user, comlist):
    """Return the mean ``created_utc`` difference between consecutive comments.

    Each difference is ``comment[i] - comment[i + 1]`` in list order.
    Returns 0.0 when fewer than two comments are given, where the original
    raised ZeroDivisionError.
    """
    stamps = [c.created_utc for c in comlist]
    gaps = [earlier - later for earlier, later in zip(stamps, stamps[1:])]
    if not gaps:
        return 0.0
    return float(sum(gaps)) / float(len(gaps))


meanList = [0.26882352941176468, 0.81103500008282825, 0.30068612106439713, 0.84628547896566597, 34.195187552394998, 0.42117868625821758, 7.9323057789799902, 62143.172017787212]
stdList = [0.43502376342044174, 0.17692329372199109, 0.4103803095640271, 0.31432681573818694, 43.278973928533802, 0.40117225759820513, 15.443142775982409, 147452.60000900787]


def normalize_alone(single):
    """Standardise features with the stored means and standard deviations.

    ``single`` holds one value (or one row of values) per feature, in the
    order of ``meanList``/``stdList``.  Returns ``(value - mean) / std`` as a
    plain list, transposed for 2-D input so each entry is one sample.

    Raises:
        IndexError: if more features are supplied than statistics exist.
    """
    values = np.asarray(single, dtype=float)
    count = values.shape[0]
    if count > len(meanList):
        raise IndexError("expected at most {} features, got {}".format(len(meanList), count))
    mean = np.asarray(meanList[:count])
    std = np.asarray(stdList[:count])
    return ((values.T - mean) / std).tolist()


class User:
    """Feature vector for one Reddit account, built from its newest comments.

    ``invalid_flag`` is True when fewer than 15 comments remain after
    dropping blacklisted subreddits; then no features are computed and
    ``data`` holds only the flag.  Otherwise ``data`` is the flag followed by
    the eight features in the order expected by ``normalize_alone``.
    """

    def __init__(self, usr):
        comment_list = [
            c for c in _reddit().redditor(usr).comments.new(limit=100)
            if str(c.subreddit).lower() not in blacklisted
        ]
        self.invalid_flag = len(comment_list) < _MIN_COMMENTS
        if self.invalid_flag:
            # NOTE(review): the original indentation was lost; ``data`` is
            # assumed to have been built only for valid accounts - confirm.
            self.data = [self.invalid_flag]
            return

        self.bot_in_name = botInName(usr)
        self.same_level_comments = getSameLevels(usr, comment_list)
        self.keyword_present = keyWordPres(usr, comment_list)
        self.unique_comments = uniqueComments(usr, comment_list)
        self.few_posts = fewPosts(usr, len(comment_list))
        self.is_repetitive = isRepetitive(usr, comment_list)
        self.spread = getSpread(usr, comment_list)
        # Misspelt attribute name kept: it is part of the public interface.
        self.average_time_beetween_comments = avTime(usr, comment_list)
        self.data = [
            self.invalid_flag,
            self.bot_in_name,
            self.same_level_comments,
            self.keyword_present,
            self.unique_comments,
            self.few_posts,
            self.is_repetitive,
            self.spread,
            self.average_time_beetween_comments,
        ]


# Input-to-hidden weights: 64 rows of 8 values.
_WEIGHTS_INPUT_HIDDEN = (
    (2.69201043e-01, -7.66616015e-02, 5.37590287e-02, -3.97873730e-01,
     5.99034137e-02, -1.65300133e-01, 1.69002916e-01, -1.27212345e-01),
    (-1.05991397e-01, -4.11156476e-01, -5.11020321e-01, 1.69261709e+00,
     1.64746169e-01, -7.77246989e-01, 1.33937963e+00, 3.28459518e-02),
    (2.06643111e-01, 4.85683056e-02, 4.41266255e-01, -1.06308484e+00,
     -2.17737269e-01, 7.76239440e-01, -1.22464070e-02, -3.17771332e-02),
    (-1.41573443e-01, -3.04611663e-01, -5.74945755e-01, 1.17999135e+00,
     1.20915907e-01, -5.94478996e-01, 1.13528085e-01, 6.72478186e-02),
    (-2.28950660e-01, -3.24794564e-01, -5.37459949e-01, 1.41047789e+00,
     7.74998200e-02, -5.10754063e-01, 1.06588535e+00, 1.18869828e-01),
    (1.59138024e-01, 1.64332208e-02, 3.24109756e-01, -7.45074471e-01,
     -1.03254040e-01, 4.13048308e-01, -2.82940292e-02, 1.21081843e-01),
    (1.00072380e-01, 9.20597912e-02, 2.07490602e-01, -4.11541526e-01,
     -1.81249332e-01, 1.42513322e-01, -4.92596801e-02, 1.70184103e-02),
    (3.12570818e-01, 5.07755025e-01, 7.99758276e-01, -1.63859771e+00,
     -1.58640376e-01, 1.04454600e+00, -2.01986067e-01, -9.25766922e-02),
    (2.75542641e-01, 6.19160578e-01, 8.71133768e-01, -2.04872939e+00,
     -6.83477657e-02, 7.86638051e-01, -1.11013966e+00, -1.06634807e-01),
    (-1.53585194e-01, -4.49980553e-02, -3.72068587e-01, 6.36085042e-01,
     6.38797800e-02, -3.61838469e-01, 9.85935942e-02, 1.37879050e-01),
    (4.12251200e-02, -2.71299112e-02, 1.35835530e-01, -2.02469529e-01,
     -1.71733513e-02, 1.38925195e-01, -1.09821733e-01, 8.44875563e-03),
    (2.78102483e-01, 1.06807612e+00, 1.21235330e+00, -2.53614668e+00,
     -1.17519691e-01, 1.20483764e+00, -3.71075156e-01, -1.01154780e-01),
    (2.25389570e-01, 3.09370386e-01, 5.44547807e-01, -1.49165542e+00,
     -1.99947871e-01, 1.00929785e+00, -1.06430638e-01, -4.78790048e-02),
    (2.54594090e-02, 2.37898864e-01, 5.63143892e-01, -1.39453030e+00,
     -1.76180727e-01, 4.77656132e-01, -9.10164438e-01, -2.82810725e-02),
    (3.28965720e-01, 5.21833167e-01, 9.17969457e-01, -2.07818687e+00,
     -5.81314566e-02, 7.63236669e-01, -1.31028428e-01, -1.18704268e-01),
    (-1.29633626e-01, -1.69051323e-01, -2.15957762e-01, 1.41134564e+00,
     1.48512404e-02, -2.14510314e-01, 1.14914144e+00, 1.06685451e-01),
    (1.00092676e-01, 2.75198776e-01, 4.39135632e-01, -1.20503590e+00,
     -2.03239914e-01, 6.50391289e-01, -7.10173947e-02, -9.99902978e-02),
    (3.23926404e-01, 5.57635819e-01, 1.00120707e+00, -2.01821838e+00,
     -1.01963166e-01, 1.01328674e+00, -1.89362723e-01, -8.13646844e-02),
    (3.21827056e-01, 8.87919012e-01, 1.04329572e+00, -2.29413309e+00,
     -9.66993447e-02, 1.20613399e+00, -3.22749086e-01, -1.29832318e-01),
    (-1.27558486e-01, -4.86239046e-02, -4.89004722e-01, 1.24832326e+00,
     1.38180084e-01, -7.06346196e-01, 4.72076492e-02, 5.80445915e-03),
    (-1.46006270e-01, -1.89286239e-01, -1.78092084e-01, 4.33329293e-01,
     1.13966626e-01, -2.62479104e-01, 2.06753831e-01, 7.07932612e-02),
    (3.66512478e-01, -1.82682495e-01, 2.01707403e-01, -4.54146623e-01,
     1.83864766e-01, 2.41694494e-01, -6.43353929e-02, -1.09329263e-01),
    (5.39024204e-02, -1.28627839e-01, -1.40163887e-01, 4.82343704e-01,
     -9.04652845e-02, -2.27451957e-01, 1.03130029e-01, 2.01471007e-01),
    (-1.39615945e-01, -1.64959741e-01, -2.20166078e-01, 7.62747165e-01,
     1.10194009e-01, -3.54911747e-01, 3.93171518e-01, 9.71666412e-02),
    (3.63112136e-01, 3.18736091e-01, 5.92318967e-01, -1.33099806e+00,
     -8.05678723e-02, 6.02688931e-01, -8.35558126e-02, -4.62139469e-02),
    (-1.34306180e-01, 1.34593578e-02, -2.92967753e-01, 7.94008935e-01,
     -3.83106824e-02, -4.82618844e-01, -4.80866897e-02, -4.33226728e-02),
    (-3.37294852e-01, -7.45747760e-01, -1.06874538e+00, 2.23565565e+00,
     1.26981475e-01, -1.31262112e+00, 2.69903365e-01, 7.78344146e-02),
    (2.04461155e-01, -3.34676861e-02, 4.19841412e-01, -5.63357669e-01,
     7.66770103e-02, 3.32390012e-01, 2.52030279e-02, -5.54604252e-02),
    (-2.53579120e-01, 1.78653268e-01, -3.83075279e-01, 5.89676138e-01,
     -4.28175110e-03, -3.58996387e-01, 5.34379442e-02, 8.68467452e-02),
    (-1.87078809e-01, 1.83647219e-02, -4.96025135e-01, 9.70139460e-01,
     3.77531658e-02, -6.34890440e-01, 3.25267034e-02, 1.11052452e-01),
    (-3.03034325e-01, -8.13465697e-01, -1.04615907e+00, 2.16732435e+00,
     1.04377852e-01, -1.10715777e+00, 3.00386420e-01, 1.03154833e-01),
    (3.41427082e-01, 1.10499099e-01, 3.86679053e-01, -1.53370661e+00,
     -2.99358941e-01, 1.23587063e+00, -8.42612365e-04, -4.56316434e-02),
    (-1.71782623e-01, -1.40411476e+00, -1.19929011e+00, 2.96805457e+00,
     1.88671323e-01, -1.65868122e+00, 4.74396048e-01, 6.99827902e-02),
    (-2.54805242e-01, -9.29323201e-01, -9.97025202e-01, 2.41993413e+00,
     1.19669423e-01, -1.28759714e+00, 3.17129346e-01, 1.01123767e-01),
    (2.37391616e-04, 3.42086250e-02, -2.86014519e-01, 3.23062186e-01,
     -1.17703218e-01, -7.98576660e-02, 1.32812019e-02, 1.90666719e-01),
    (2.50798912e-01, 4.95532168e-01, 8.41781067e-01, -1.79160684e+00,
     -1.33605951e-01, 1.00324077e+00, -1.88100963e-01, -7.68031079e-02),
    (2.32219311e-01, 6.41497468e-01, 8.55599388e-01, -2.05184632e+00,
     -9.84776605e-02, 9.44331059e-01, -2.10771260e-01, -1.09559478e-01),
    (-2.02236860e-01, -4.49482082e-01, -7.59721925e-01, 1.78840125e+00,
     2.16292291e-01, -1.14098769e+00, 1.48944915e-01, 2.40171741e-02),
    (-7.36925494e-02, 1.45934437e-01, -1.12976348e-01, 1.89458717e-01,
     1.04799187e-01, -2.15790994e-01, 2.18954331e-01, 5.45174596e-02),
    (-1.19499341e-01, -6.25751439e-01, -2.93438607e-01, 2.43861719e+00,
     2.95423704e-01, -1.46331028e+00, 9.49515302e-02, 5.91275723e-03),
    (2.35405856e-01, 3.74058737e-01, 7.24448923e-01, -1.67665819e+00,
     -1.13997955e-01, 9.31875633e-01, -1.58923543e-01, -5.27549729e-02),
    (5.16763200e-02, -5.79839917e-01, -7.44296134e-01, 2.41156503e+00,
     1.68253800e-01, -5.69324434e-01, 2.42565104e+00, 2.32166896e-02),
    (-2.93799929e-01, -2.83015671e-01, -3.81373585e-01, -6.22640431e-02,
     -5.88684957e-03, 4.73316959e-02, -6.39187758e-03, 4.45030572e-02),
    (2.54029432e-01, 3.74365521e-01, 7.10361022e-01, -1.68714214e+00,
     -9.32929869e-02, 7.00601663e-01, -1.00264096e-01, -7.72111920e-02),
    (-7.85146667e-02, 3.45461034e-02, -2.46972774e-01, 2.71504674e-01,
     6.13933857e-02, -2.05319787e-01, -1.24801131e-02, -4.32126359e-02),
    (2.34946384e-01, 7.77538104e-01, 6.90613090e-01, -2.25403111e+00,
     -1.29924686e-01, 1.18751020e+00, -2.43592404e-01, -1.44155938e-01),
    (3.25670504e-02, -2.66143309e-01, -5.14183759e-01, 1.44945890e+00,
     2.24861704e-01, -4.82809131e-01, 1.58609077e+00, 4.75966320e-02),
    (4.17018104e-01, -6.08063658e-02, -3.87892834e-02, -5.73269319e-01,
     1.14146581e-02, 2.74160559e-01, -5.84783644e-01, -1.06836696e-01),
    (3.87562383e-02, 2.44602282e-01, 4.90723082e-01, -1.50621383e+00,
     -1.75347482e-01, 3.75014305e-01, -1.47040983e+00, 3.38229481e-02),
    (-8.81472344e-02, 6.29379826e-03, -2.10720134e-01, 2.98726776e-01,
     -2.06519712e-01, 5.67631552e-03, 1.49210245e-02, 2.71120169e-01),
    (-1.08386016e-01, 5.39537356e-01, 8.24936618e-01, -2.12773681e+00,
     -2.66895007e-01, 5.73538064e-01, -2.55301070e+00, -1.64158116e-02),
    (2.56365675e-01, 1.31752873e-01, 2.58553279e-01, -8.13983170e-01,
     -4.37185733e-03, 3.58684422e-01, -1.04622499e+00, -1.99652849e-01),
    (-3.37944417e-01, -6.82120424e-01, -1.05350770e+00, 2.11040164e+00,
     9.06747238e-02, -1.01479939e+00, 2.36512356e-01, 1.00239084e-01),
    (1.62081571e-01, 2.59181763e+00, 1.73574855e+00, -4.69140525e+00,
     -2.87320894e-01, 2.08711457e+00, -7.43083957e-01, -1.85138118e-01),
    (2.70214015e-01, 1.50733632e-01, 6.04270744e-01, -1.06985343e+00,
     -9.74478846e-02, 6.49956511e-01, -6.34980691e-02, 1.89956331e-02),
    (1.08497758e-01, -2.20686617e-01, -8.29191760e-02, -7.21376575e-03,
     -7.05909392e-02, 1.33577870e-01, -1.47112657e-01, 9.67671678e-02),
    (-2.43147851e-01, -6.99823425e-01, -9.48364556e-01, 2.12532750e+00,
     1.24008957e-01, -1.12927769e+00, 2.57829417e-01, 7.14155716e-02),
    (-4.37970466e-02, 7.53824429e-02, 4.83613084e-03, -2.69392533e-01,
     -7.62309710e-02, -3.33256947e-02, 1.07912702e-01, 4.72232529e-02),
    (-1.31235628e-01, -1.68244522e-01, -2.82688261e-01, 1.03580891e+00,
     1.52662575e-01, -4.93712595e-01, 2.37808253e-01, 5.68194642e-02),
    (1.62087096e-01, 3.80115578e-01, 4.93934438e-01, -1.21762461e+00,
     -2.11815650e-01, 4.22672012e-01, -7.94037715e-02, -1.37945927e-01),
    (1.70395961e-01, -8.75226368e-02, 1.63154344e-01, -5.17133251e-01,
     6.37618672e-02, 3.62886235e-01, 5.47064418e-02, 1.41940119e-01),
    (9.68741454e-03, 8.25732519e-01, 9.53660860e-01, -2.64244817e+00,
     -2.14481616e-01, 8.23654135e-01, -2.57526723e+00, -4.26209262e-02),
    (1.21986605e-01, -2.78649967e-01, -6.36752363e-02, -2.44618319e-01,
     -2.27719933e-01, 1.32092384e-01, -1.64168078e-01, -5.08826067e-02),
    (1.58820598e-01, 8.86701581e-02, 2.76492325e-01, -8.88567416e-01,
     -1.79629258e-01, 5.33067363e-01, 2.33486904e-02, -5.04859270e-02),
)

# Hidden-to-output weights: 1 row of 64 values.
_WEIGHTS_HIDDEN_OUTPUT = (
    (2.19930175e-01, -1.14900576e+00, 7.70074408e-01, -6.83125143e-01,
     -7.84349487e-01, 6.49719348e-01, 3.16943512e-01, 1.22109311e+00,
     1.58250979e+00, -4.11974072e-01, 2.38638724e-01, 1.93178396e+00,
     1.08295374e+00, 9.09399122e-01, 1.41503976e+00, -6.47088875e-01,
     7.89248701e-01, 1.50133249e+00, 1.75518887e+00, -6.91060905e-01,
     -1.91215143e-01, 3.87755449e-01, -2.18677799e-01, -3.51119487e-01,
     7.94587237e-01, -2.08154381e-01, -1.65203247e+00, 3.93385349e-01,
     -2.41934056e-01, -5.58694363e-01, -1.49773873e+00, 1.07606414e+00,
     -2.22619254e+00, -1.64334689e+00, -1.35823332e-01, 1.34925699e+00,
     1.46613275e+00, -1.06795054e+00, -5.74365781e-02, -1.50217327e+00,
     1.13697996e+00, -1.56887839e+00, -3.34669494e-04, 1.15636887e+00,
     -8.33920428e-02, 1.63907343e+00, -9.00251415e-01, 4.77599898e-01,
     1.06997557e+00, -2.21615106e-04, 1.69975780e+00, 7.18329474e-01,
     -1.52357335e+00, 4.23760131e+00, 7.25643367e-01, 1.85076743e-01,
     -1.33633240e+00, 2.18413371e-01, -4.16495054e-01, 8.21425071e-01,
     4.16765908e-01, 2.10138903e+00, 2.28111855e-01, 6.11093075e-01),
)


class Classifier:
    """Feed-forward network (8 inputs, 64 hidden units, 1 output) for bot scoring.

    ``threshold`` is the minimum confidence at which ``is_a_bot`` reports
    True.  ``wih`` and ``who`` hold the input-to-hidden and hidden-to-output
    weights; each instance gets its own array copies.
    """

    def __init__(self, threshold):
        self.threshold = threshold
        self.wih = np.array(_WEIGHTS_INPUT_HIDDEN)
        self.who = np.array(_WEIGHTS_HIDDEN_OUTPUT)
        # Imported explicitly: ``__import__('scipy').special`` does not
        # reliably load the ``special`` submodule.
        self.activation_function = expit

    def _confidence(self, features):
        """Return the network output for one raw (un-normalised) feature list."""
        inputs = np.array(normalize_alone(features), ndmin=2).T
        hidden_outputs = self.activation_function(np.dot(self.wih, inputs))
        final_outputs = self.activation_function(np.dot(self.who, hidden_outputs))
        return float(final_outputs[0][0])

    def is_a_bot(self, name):
        """Score the Reddit account ``name``.

        Returns ``None`` when the account has too few usable comments,
        otherwise ``(is_bot, confidence)`` where ``is_bot`` is
        ``confidence >= threshold``.  Both items are plain Python values.
        """
        profile = User(name)
        if profile.invalid_flag:
            return None
        confidence = self._confidence(profile.data[1:])
        return (bool(confidence >= self.threshold), confidence)