├── README.md ├── .gitignore ├── CTI_classifer.py ├── construct_tweet_threads.py ├── utility.py ├── results └── current_users.txt ├── main.py ├── LICENSE └── CTI_expert_finder.py /README.md: -------------------------------------------------------------------------------- 1 | # IoCMiner 2 | A Prototype for IoCMiner which is a framework to automaticly extract Indicators of Compromise (IoCs) from Twitter. 3 | 4 | (README is in progress) 5 | 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | config/* 107 | results/* 108 | !results/current_users.txt 109 | .idea 110 | 111 | -------------------------------------------------------------------------------- /CTI_classifer.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn.model_selection import train_test_split 3 | from sklearn import metrics 4 | from sklearn.ensemble import RandomForestClassifier 5 | import pandas as ps 6 | import statistics 7 | from nltk.tokenize import word_tokenize 8 | from nltk.stem import WordNetLemmatizer 9 | import nltk 10 | 11 | 12 | number_of_classifiers = 11 13 | nltk.download('punkt') 14 | lem = WordNetLemmatizer() 15 | 16 | def get_random_forest_classifiers(number, X, y): 17 | classifiers = [] 18 | for i in range(number): 19 | x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=i, stratify=y) 20 | estimator = RandomForestClassifier(n_estimators=300, max_features=.15, criterion='entropy', min_samples_split=4) 21 | estimator.fit(x_train, y_train) 22 | classifiers.append(estimator) 23 | return classifiers 24 | 25 | def construct_classifier(): 26 | # For training the classifier 27 | master_Table = 
ps.read_csv(r'dataset\training-set.csv', delimiter=',')
28 |     X, y = master_Table.iloc[:, 1:-1], master_Table.iloc[:, -1]
29 |     classifiers = get_random_forest_classifiers(number_of_classifiers, X, y)
30 |     columns = master_Table.head()
31 | 
32 |     column_names = []
33 |     for col in columns:
34 |         column_names.append(col)
35 |     column_names = column_names[1:-3]
36 | 
37 |     return classifiers, column_names
38 | 
39 | 
40 | def vectorize(tweet, vocab):
41 |     vector = []
42 | 
43 |     tweet = tweet.lower()
44 |     bag = word_tokenize(tweet)
45 | 
46 |     for i in vocab:
47 |         count = 0
48 |         for j in bag:
49 |             if i == j:
50 |                 count += 1
51 |         vector.append(count)
52 | 
53 |     return vector
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     final_estimate = []
58 | 
59 |     classifiers, col_names = construct_classifier()
60 | 
61 |     # For evaluating the classifier
62 |     test_table = ps.read_csv(r'dataset\test-set-random.csv', delimiter=',')
63 | 
64 |     x2, y2 = test_table.iloc[:, 1:-1], test_table.iloc[:, -1]
65 | 
66 |     estimates = []
67 |     for i in range(number_of_classifiers):
68 |         y_estimate = classifiers[i].predict(x2)
69 |         estimates.append(y_estimate)
70 | 
71 |     aggregated_results = []
72 |     n = 0
73 | 
74 |     # do the majority voting over all test samples
75 |     while n < len(y2):
76 |         for i in estimates:
77 |             vote = i[n]
78 |             aggregated_results.append(vote)
79 |         final_estimate.append(statistics.mode(aggregated_results))
80 |         aggregated_results.clear()
81 |         n += 1
82 |     accuracy = metrics.accuracy_score(y_true=y2, y_pred=final_estimate)
83 |     print(accuracy)
--------------------------------------------------------------------------------
/construct_tweet_threads.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import json
4 | 
5 | ioc_base_dir = r'results'
6 | 
7 | class TweetInfo:
8 |     def __init__(self, tweet):
9 |         self.tweet = tweet
10 |         self.responses = []
11 |         self.reply_to = None
12 | 
13 |     @staticmethod
14 |     def get_all_text(tweet):
15 |         result = ''
16 |         for response in tweet.responses:
17 |             result += response.tweet + '\n'
18 |             result += TweetInfo.get_all_text(response) + '\n'
19 |         return result
20 | 
21 | 
22 |     def __str__(self):
23 |         parent = self.reply_to
24 |         ancestors_text = ''
25 |         while parent is not None:
26 |             ancestors_text = parent.tweet + '\n' + ancestors_text
27 |             parent = parent.reply_to
28 |         return '[ancestors_text]\n' + ancestors_text + '\n[self]\n' + self.tweet + '\n[descendants]\n' + TweetInfo.get_all_text(self)
29 | 
30 | 
31 | def get_text(tweet):
32 |     if 'retweeted_status' in tweet and \
33 |        'extended_tweet' in tweet['retweeted_status'] and \
34 |        'full_text' in tweet['retweeted_status']['extended_tweet']:
35 |         text = tweet['retweeted_status']['extended_tweet']['full_text']
36 |     elif 'extended_tweet' in tweet and 'full_text' in tweet['extended_tweet']:
37 |         text = tweet['extended_tweet']['full_text']
38 |     else:
39 |         text = tweet['text']
40 |     return text
41 | 
42 | def get_ioc_tweet_ids(file):
43 |     result = set()
44 |     iocs = set()
45 |     with open(file, 'r', encoding='utf_8') as input_file:
46 |         for line in input_file:
47 |             segments = line.strip().split(',')
48 |             seg_len = len(segments)
49 |             if seg_len >= 7:
50 | 
51 |                 segments[6] = segments[6].strip()
52 |                 if segments[6] not in iocs and segments[5] != 'email':
53 |                     result.add(segments[0])
54 |                     iocs.add(segments[6])
55 |     return result
56 | 
57 | if __name__ == "__main__":
58 |     # Get the IoCs from the daily IoC files
59 |     ioc_tweet_ids = set()
60 |     for file in glob.glob(os.path.join(ioc_base_dir, '*.ioc.csv')):
61 |         file_name = 
file.split('\\')[-1] 62 | res = get_ioc_tweet_ids(file) 63 | ioc_tweet_ids.update(res) 64 | 65 | tweets = {} 66 | with open(os.path.join(ioc_base_dir, 'top_user_dump.json'), 'r', encoding='utf_8') as input_file: 67 | tag_dataset = {} 68 | seen_tweets = set() 69 | # Construct tweet threads from a tweet dump 70 | for count, line in enumerate(input_file): 71 | tweet = json.loads(line) 72 | current_tweet = TweetInfo(get_text(tweet)) 73 | if tweet['in_reply_to_status_id_str'] == None: 74 | tweets[tweet['id_str']] = current_tweet 75 | else: 76 | if tweet['in_reply_to_status_id_str'] in tweets: 77 | tweets[tweet['in_reply_to_status_id_str']].responses.append(current_tweet) 78 | tweets[tweet['id_str']] = current_tweet 79 | 80 | # Print the tweet threads containing IoCs 81 | for ioc_tweet_id in ioc_tweet_ids: 82 | if ioc_tweet_id in tweets: 83 | if len(tweets[ioc_tweet_id].responses) > 0: 84 | print(tweets[ioc_tweet_id]) 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /utility.py: -------------------------------------------------------------------------------- 1 | import tweepy 2 | import json 3 | import datetime 4 | 5 | def get_twitter_api(): 6 | with open(r'config/tweeter.auth', 'r') as auth_file: 7 | consumer_key, consumer_secret, access_token, access_token_secret = auth_file.read().split() 8 | 9 | # OAuth process, using the keys and tokens 10 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 11 | auth.set_access_token(access_token, access_token_secret) 12 | 13 | api = tweepy.API(auth, wait_on_rate_limit=True) 14 | 15 | return api 16 | 17 | 18 | def get_user_timeline(api, screen_name, count = 200): 19 | # Twitter only allows access to a users most recent 3240 tweets with this method 20 | # initialize a list to hold all the tweepy Tweets 21 | 22 | alltweets = [] 23 | 24 | fetched_count = 200 25 | 26 | # make initial request for most recent tweets (200 is the maximum allowed count) 27 | new_tweets = api.user_timeline(screen_name=screen_name, count=200) 28 | 29 | # save most recent tweets 30 | alltweets.extend(new_tweets) 31 | 32 | print("...%s tweets downloaded so far" % (len(alltweets))) 33 | 34 | if fetched_count >= count: 35 | return alltweets[:count] 36 | 37 | # save the id of the oldest tweet less one 38 | oldest = alltweets[-1].id - 1 39 | 40 | # keep grabbing tweets until there are no tweets left to grab 41 | while len(new_tweets) > 0: 42 | # all subsiquent requests use the max_id param to prevent duplicates 43 | new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest) 44 | 45 | 46 | # save most recent tweets 47 | alltweets.extend(new_tweets) 48 | 49 | print("...%s tweets downloaded so far" % (len(alltweets))) 50 | 51 | if len(alltweets) >= count: 52 | break 53 | 54 | # update the id of the oldest tweet less one 55 | oldest = alltweets[-1].id - 1 56 | 57 | 58 | 59 | 60 | return alltweets[:count] 61 | 62 | 63 | def get_list_timeline(api, list_id, count): 64 | # Twitter only allows access to a users most recent 3240 tweets with this method 65 | # initialize a list to hold all the tweepy Tweets 66 | 67 | alltweets = [] 68 | 69 | fetched_count = 200 70 | # make initial request for most recent tweets (200 is the maximum allowed count) 71 | new_tweets = api.list_timeline(list_id=list_id, count=200) 72 | 73 | # save most recent tweets 74 | alltweets.extend(new_tweets) 75 | 76 | print("...%s list downloaded so far" % (len(alltweets))) 77 | if fetched_count >= count: 78 | return alltweets[:count] 79 | 80 | # save 
the id of the oldest tweet less one 81 | oldest = alltweets[-1].id - 1 82 | 83 | # keep grabbing tweets until there are no tweets left to grab 84 | while len(new_tweets) > 0: 85 | # all subsiquent requests use the max_id param to prevent duplicates 86 | new_tweets = api.user_timeline(list_id=list_id, count=200, max_id=oldest) 87 | 88 | fetched_count += 200 89 | 90 | # save most recent tweets 91 | alltweets.extend(new_tweets) 92 | 93 | print("...%s list downloaded so far" % (len(alltweets))) 94 | 95 | if fetched_count >= count: 96 | break 97 | 98 | # update the id of the oldest tweet less one 99 | oldest = alltweets[-1].id - 1 100 | 101 | 102 | 103 | return alltweets[:count] 104 | 105 | def get_date(): 106 | return datetime.datetime.now().strftime('%Y%m%d') 107 | -------------------------------------------------------------------------------- /results/current_users.txt: -------------------------------------------------------------------------------- 1 | NelsonSecurity,3095065120 2 | http_error_418,2983529314 3 | SfyLabs,3328717103 4 | Bitcoin,1141391982 5 | Qantas,218730857 6 | asset_island_,915670001595682816 7 | AmazonHelp,85741735 8 | SendGrid,42126617 9 | sweatshack,543867579 10 | lunomoney,1274469684 11 | SpiritGroupInc,396292584 12 | WeTransfer,81694711 13 | texlinepk,814796199802896384 14 | evanderburg,16309969 15 | amazon,20793816 16 | CraneHassold,758651689662550016 17 | PassiveTotal,2479900676 18 | ziraatbankasi,213983669 19 | VakifBank,613251601 20 | CEPTETEB,2195601962 21 | letsencrypt,2887837801 22 | unitedlayer,18454193 23 | fumik0_,75868226 24 | phishing,15234215 25 | FedEx,134887156 26 | Ficohsa,78359919 27 | ChaseSupport,274789264 28 | Discover,16147150 29 | ATTCares,62643312 30 | Rabobank,7385462 31 | CyprusMFA,1446632138 32 | comsadotio,878294178693652482 33 | CenterX1,97054852 34 | douglasmun,620341271 35 | SwiftOnSecurity,2436389418 36 | ET_Labs,2835071339 37 | AskNationwide,469358921 38 | HRCSaudi,636184033 39 | MrGlaive,2692504059 40 | Barclays,191781601 41 | HSBC,467368287 42 | instagram,180505807 43 | foofightersbr,28891221 44 | foofighters,19081001 45 | AmericanExpress,42712551 46 | serasaexperian,48455669 47 | HostGator,15770852 48 | Cielo_br,126090155 49 | PosteNews,388197163 50 | comododesktop,19257473 51 | tjbahia,51874865 52 | sicredi_oficial,146573998 53 | Mailbox,624947324 54 | Caixa,229191436 55 | Match,19013415 56 | americanascom,35019751 57 | Bitly,15234886 58 | defesa_digital,328254142 59 | santander_br,48700857 60 | Homehost,31437607 61 | github,13334762 62 | AskAmex,62911603 63 | coinhive_com,963717856356651008 64 | publicww,2546955637 65 | riodejaneiro,35001292 66 | itau,179398386 67 | usponline,24241923 68 | AWSSupport,120967386 69 | 000webhosting,376402154 70 | 000webhost_com,718444759828807680 71 | Adobe,63786611 72 | AskPayPal,63247343 73 | blockchain,125304737 74 | bancosantander,246196403 75 | NavyFederalHelp,22972532 76 | NavyFederal,145227381 77 | inj3ct0r,56179836 78 | AlibabaGroup,2694564780 79 | Cielo,489961357 80 | petrobras,45083460 81 | Cryptopia_NZ,2916954277 82 | Vivoemrede,43218789 83 | Mastercard,75014376 84 | ipiranga,15836990 85 | uolhost,15047744 86 | sekoia_fr,1479558380 87 | mustarddawg,872794204086444032 88 | BroadAnalysis,1055529325 89 | DynamicAnalysis,177813214 90 | Anti_Expl0it,778763839974232065 91 | Arubait,289758610 92 | CoatonDean,2999674613 93 | orange,296824934 94 | Cdiscount,63142684 95 | Eranetcom1,583560916 96 | HSBC_UK,2922732233 97 | WifiRumHam,773924520189169666 98 | NetflixBrasil,231579149 99 | 
PhishTank_Bot,920309658912407552 100 | Gendarmerie,2184489764 101 | hexlax,2610260400 102 | caixabank,270429778 103 | USAA,16584443 104 | web4africa,34330865 105 | errorinn,1567253886 106 | virustotal,145033865 107 | ForstPenguin,37678351 108 | nixcraft,17484680 109 | phish_total,920628142385098754 110 | APWG,15495424 111 | the_root_labs,731181031953293312 112 | serveriusbv,94043958 113 | DCUcreditunion,55548810 114 | juangameztorres,1341334147 115 | TDGroupInc,397527876 116 | aboutworldlangs,65455730 117 | USTreasury,120176950 118 | TMobileHelp,185728888 119 | _DanRoberts_,846248293852049411 120 | abel1ma,225299286 121 | nullcookies,723905075811332096 122 | FourOctets,3185889936 123 | Jan0fficial,735370482896244736 124 | Racco42,450579130 125 | RealRalf9000,900300460514562050 126 | MrClickyClicks,976229553029943296 127 | _odisseus,1630455818 128 | ps66uk,894741613624348672 129 | MalWebHunter,847813680003600384 130 | yashkadakia,18903282 131 | ali0une,127333069 132 | _larry0,17055552 133 | TigzyRK,158836607 134 | kobebryamV2,211117902 135 | KylianXAnalyst,893476344293580802 136 | EllandBack1,569434401 137 | doppelvizsla,783834754722521088 138 | rymcvicar,508460308 139 | 0Btemos_BHS,2570023247 140 | HerbieZimmerman,982533264 141 | JackBurt0n,14089433 142 | stecar792,886573394224971776 143 | Lvanoel,2654177803 144 | mgiovamo,365600157 145 | M157q_News_RSS,4639018735 146 | ebubekirbastama,2993714093 147 | fcafra,323249092 148 | F_kZ_,600820120 149 | Darkn1ght10,708000179773956096 150 | secb0t,864824454945353730 151 | p4r4n0id_il,303725749 152 | misojosgatos,123093184 153 | Jakeashacks,762240284134633472 154 | MalwareHunterBR,1369263216 155 | FewAtoms,951498497584492544 156 | DissectMalware,967487209535361025 157 | PhishingAi,974046842408357894 158 | SecuriTears,909731475314049025 159 | malware_traffic,1612403564 160 | P3pperP0tts,948580944323194880 161 | bry_campbell,837320959 162 | MakFLwana,376580061 163 | CryptoInsane,3740550016 164 | Zerophage1337,736664966980308992 165 | killamjr,744930225352548352 166 | taku888infinity,272034153 167 | packet_Wire,294036033 168 | dvk01uk,4272657142 169 | neonprimetime,48459503 170 | catnap707,265308969 171 | avman1995,3069922981 172 | mateeuslinno,1788573764 173 | ViriBack,3101795085 174 | Dashowl,2896982439 175 | papa_anniekey,171305529 176 | malwrhunterteam,2847021941 177 | certbr,190765950 178 | James_inthe_box,703614655 179 | JAMESWT_MHT,3433210978 180 | Dropbox,14749606 181 | illegalFawn,778528483395895296 182 | pollo290987,838969349998272512 183 | WellsFargo,1178011 184 | bad_packets,856982087101849600 185 | BancodoBrasil,83723557 186 | NaomiSuzuki_,1479126768 187 | autumn_good_35,747100113231745026 188 | virusbay_io,923656996741615617 189 | PayPal,30018058 190 | anyrun_app,833639043862786048 191 | digitalocean,457033547 192 | BankofAmerica,204881628 193 | UOL,70799317 194 | phishingalert,895935810469494786 195 | seen8th,4706453413 196 | JRoosen,29199860 197 | JaromirHorejsi,1550544901 198 | Artilllerie,296167523 199 | facebook,2425151 200 | chronic,86315276 201 | 0x7fff9,809023873245204481 202 | APNews,16744503 203 | LinkedIn,13058772 204 | Techhelplistcom,1222068722 205 | dyngnosis,14268523 206 | ameli_actu,2842497633 207 | Chase,274673392 208 | PayPalInfoSec,23807666 209 | netflix,16573941 210 | GoDaddy,14949454 211 | urlscanio,784841998964514816 212 | umbler,2707504046 213 | Ledtech3,4579549601 214 | EmergingThreats,156671416 215 | buffaloverflow,293922100 216 | ShiaoQu17,973233102947364865 217 | TDBank_US,67378997 218 | 
NCCInfoSec,880355951529820160 219 | Toyota,14219877 220 | CarrefourES,275430095 221 | namesilo,196128578 222 | OVH,317647291 223 | gmail,38679388 224 | Microsoft,74286565 225 | theschoolsleuth,2885930067 226 | DHLUS,2507741466 227 | MobileLegendsOL,762959278189584384 228 | AppleSupport,3309375033 229 | Halkbank,220951448 230 | BofA_Help,18735040 231 | WBPolice,751102643909591040 232 | DeptVetAffairs,78408666 233 | HMRCcustomers,2848281329 234 | ficoba,19187698 235 | Cloudflare,32499999 236 | DocuSign,18909248 237 | SamsungUS,34356968 238 | ASBBank,20730278 239 | Bradesco,79184083 240 | locaweb,15020362 241 | Outlook,605805760 242 | baberpervez2,850551473624797184 243 | ABNAMRO,2270841 244 | fs0c131y,3317233336 245 | PopularResponde,1359682945 246 | ItsReallyNick,73195179 247 | Dinosn,128484298 248 | _devonkerr_,2843933755 249 | lennyzeltser,14780493 250 | retBandit,325116309 251 | LukasStefanko,2936786110 252 | JohnLaTwC,208346105 253 | decalage2,937763623 254 | VK_Intel,3332934374 255 | Mrtn9,91752953 256 | B_H101,867327304850178048 257 | securitydoggo,792480469568937988 258 | Bank_Security,1886777581 259 | cocaman,1169961 260 | blu3_team,876111091818000385 261 | hasherezade,1590754944 262 | powershellcode,951900231347531776 263 | TheMalwareQueen,849590344102682624 264 | TheWit0k,715097237378957312 265 | executemalware,743883460587167744 -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import tweepy # https://github.com/tweepy/tweepy 2 | from queue import Queue 3 | from threading import Thread 4 | from gglsbl import SafeBrowsingList 5 | import requests 6 | import shutil 7 | from CTI_expert_finder import * 8 | from CTI_classifer import * 9 | import numpy 10 | 11 | 12 | class IOCMinerStreamListener(tweepy.StreamListener): 13 | 14 | def __init__(self, api, top_users, rand_users): 15 | self.api = api 16 | self.output = r"results\top_user_dump.json" 17 | self.output_file = open(self.output, "a") 18 | self.rand_users = rand_users 19 | self.top_users = top_users 20 | self.classifiers, self.wordlist = construct_classifier() 21 | self.enclosure_queue = Queue() 22 | self.worker = Thread(target=self.worker, args=(1, self.enclosure_queue,)) 23 | self.worker.setDaemon(True) 24 | self.worker.start() 25 | 26 | 27 | def on_status(self, status): 28 | self.output_file.write(json.dumps(status._json )+'\n') 29 | self.enclosure_queue.put(status) 30 | 31 | def worker(self, id, queue): 32 | 33 | with open(r'config\gglsbl.auth', 'r') as auth_file: 34 | gglsbl_key = auth_file.read().strip() 35 | 36 | sbl = SafeBrowsingList(gglsbl_key, db_path=r"dataset\google_safe_browisng_db") 37 | # sbl.update_hash_prefix_cache() 38 | 39 | turn = True 40 | while True: 41 | 42 | # Update Google SBL database every 12 hours at time X (e.g. 
3 AM and 3 PM)
43 |             hour = datetime.datetime.today().hour
44 |             if hour % 12 == 3 and turn:
45 |                 sbl.update_hash_prefix_cache()
46 |                 turn = False
47 |             elif hour % 12 != 3:
48 |                 turn = True
49 | 
50 |             today = get_date()
51 |             with open(os.path.join('results', today+'.ioc.csv'),'a+',encoding='utf_8') as output_file:
52 |                 tweet = queue.get()
53 |                 try:
54 |                     if hasattr(tweet, 'retweeted_status') and hasattr(tweet.retweeted_status, 'extended_tweet') and 'full_text' in tweet.retweeted_status.extended_tweet:
55 |                         text = tweet.retweeted_status.extended_tweet['full_text']
56 |                     elif hasattr(tweet, 'extended_tweet') and 'full_text' in tweet.extended_tweet:
57 |                         text = tweet.extended_tweet['full_text']
58 |                     elif not hasattr(tweet, 'text'):
59 |                         text = tweet['text']
60 |                     else:
61 |                         text = tweet.text
62 | 
63 |                     if hasattr(tweet, 'retweeted_status'):
64 |                         if hasattr(tweet.retweeted_status, 'extended_tweet'):
65 |                             final_urls = tweet.retweeted_status.extended_tweet['entities']['urls']
66 |                         else:
67 |                             final_urls = tweet.retweeted_status.entities['urls']
68 |                     else:
69 |                         if hasattr(tweet, 'extended_tweet'):
70 |                             final_urls = tweet.extended_tweet['entities']['urls']
71 |                         else:
72 |                             final_urls = tweet.entities['urls']
73 | 
74 |                     for final_url in final_urls:
75 |                         # If a pastebin URL, get the raw content and append it to the tweet content
76 |                         if final_url['expanded_url'].startswith('https://pastebin.com/'):
77 |                             pastebin = final_url['expanded_url']
78 |                             if 'raw' not in pastebin:
79 |                                 pastebin = pastebin.replace('https://pastebin.com/', 'https://pastebin.com/raw/')
80 | 
81 |                             req = requests.get(pastebin)
82 |                             text += '\n' + req.text  # req.text is a str; req.content would be bytes
83 | 
84 |                     user_type = 'top'
85 |                     if tweet.user.id_str in self.rand_users:
86 |                         user_type = 'rand'
87 | 
88 |                     print("###########################$$$$$$$$$$$$$$$$$$$$$$$$$$$")
89 |                     print(text)
90 | 
91 |                     # classifier must be retrained with new data
92 |                     # vector = vectorize(text, self.wordlist)
93 |                     # vector.append(len(tweet.entities['hashtags']))
94 |                     # vector.append(len(tweet.entities['user_mentions']))
95 |                     # vector = numpy.array(vector).reshape(1, -1)
96 |                     # estimates = []
97 |                     # for i in range(number_of_classifiers):
98 |                     #     y_estimate = self.classifiers[i].predict(vector)
99 |                     #     estimates.append(y_estimate)
100 |                     # vote = statistics.mode([x[0] for x in estimates])
101 |                     # print("Prediction: "+vote)
102 | 
103 |                     ips = list(iocextract.extract_ips(text, refang=True))
104 |                     for ip in ips:
105 |                         if ip not in text:  # keep only IoCs that were defanged in the tweet (refanged form is not verbatim)
106 |                             output_file.write('{},{},{},{},{},ip,{}\n'.format(tweet.id,tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, ip))
107 | 
108 |                     urls = list(iocextract.extract_urls(text, refang=True))
109 |                     for url in urls:
110 |                         if url not in text:
111 |                             result = sbl.lookup_url(url.rstrip('.'))
112 |                             if result is not None:
113 |                                 output_file.write('{},{},{},{},{},url,{},{}\n'.format(tweet.id, tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, url.rstrip('.'),result))
114 |                             else:
115 |                                 output_file.write('{},{},{},{},{},url,{},benign\n'.format(tweet.id, tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, url.rstrip('.')))
116 | 
117 |                     emails = list(iocextract.extract_emails(text, refang=True))
118 |                     for email in emails:
119 |                         if email not in text:
120 |                             output_file.write('{},{},{},{},{},email,{}\n'.format(tweet.id, tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, email))
121 |                     hashes = list(iocextract.extract_hashes(text))
122 |                     for hash in hashes:
123 |                         output_file.write('{},{},{},{},{},hash,{}\n'.format(tweet.id, tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, hash))
124 |                 except Exception as exp:
125 |                     print(exp)
126 | 
127 |             queue.task_done()
128 | 
129 |     def on_error(self, status_code):
130 |         if status_code == 420:
131 |             # returning False in on_data disconnects the stream
132 |             return False
133 | 
134 |     def __del__(self):
135 |         self.output_file.close()
136 | 
137 | 
138 | UPDATE_CURRENT_USER = False
139 | 
140 | api = get_twitter_api()
141 | val = api.rate_limit_status()
142 | 
143 | if api.verify_credentials():
144 | 
145 |     today = get_date()
146 | 
147 |     base_dir = os.path.join(r'results\days', today)
148 | 
149 |     top_users_final_path = os.path.join(base_dir, 'top_users_final')
150 |     if not os.path.exists(top_users_final_path):
151 |         top_users = dump_cti_experts(api, base_dir, test_run=True)
152 |         if UPDATE_CURRENT_USER:
153 |             shutil.copy2('results\\current_users.txt', 'results\\current_users.old.txt')
154 |             with open('results\\current_users.txt','w', encoding='utf_8') as current_users_file:
155 |                 for user in top_users:
156 |                     current_users_file.write("{},{}\n".format( user[0], user[1]))
157 | 
158 |     user_ids = []
159 |     with open(os.path.join(base_dir, 'top_users_final'), 'r', encoding='utf_8') as top_user_file:
160 |         next(top_user_file)
161 |         csv_reader = csv.reader(top_user_file)
162 |         for row in csv_reader:
163 |             user_ids.append(row[0])
164 | 
165 |     rand_user_ids = []
166 |     # with open(os.path.join(base_dir, 'rand_users_final_1k'), 'r', encoding='utf_8') as top_user_file:
167 |     #     next(top_user_file)
168 |     #     csv_reader = csv.reader(top_user_file)
169 |     #     for row in csv_reader:
170 |     #         rand_user_ids.append(row[0])
171 | 
172 |     twitter_listener = IOCMinerStreamListener(api, set(user_ids), set(rand_user_ids))
173 |     IOC_stream = tweepy.Stream(auth=api.auth, listener=twitter_listener)
174 | 
175 |     # Collect tweets from a set of top CTI experts and a set of randomly selected users (indefinite loop)
176 |     while True:
177 |         try:
178 |             users = user_ids[0:1000]
179 |             users.extend(rand_user_ids)
180 |             IOC_stream.filter(follow=users)
181 |         except Exception as exp:
182 |             print(str(exp))
183 |             IOC_stream.disconnect()
184 | 
185 |             # If the stream listener is terminated, wait 120 seconds before creating a new one
186 |             time.sleep(120)
187 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /CTI_expert_finder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import csv 4 | import time 5 | import re 6 | import math 7 | from utility import * 8 | import iocextract 9 | from dateutil.parser import parse 10 | 11 | class Dummy(object): 12 | pass 13 | 14 | def get_user_lists(api, user, max_count=1000): 15 | res = api.lists_memberships(screen_name=user, count=max_count) 16 | # Sorting the results based on subscriber_count and member_count 17 | res = sorted([i for i in res if i.mode.lower() == 'public'], key=lambda x: x.member_count, reverse=True) 18 | res = sorted([i for i in res], key=lambda x: x.subscriber_count, reverse=True) 19 | return res 20 | 21 | 22 | def dump_user_lists(user, lists, dump_file_path): 23 | with open(dump_file_path, 'w', encoding='utf_8', newline='') as output_file: 24 | csv_writer = csv.writer(output_file) 25 | row = [ 26 | 'id', 27 | 'name', 28 | 'slug', 29 | 'description', 30 | 'member_count', 31 | 'subscriber_count', 32 | 'mode', 33 | 'created_at', 34 | 'owner.id_str', 35 | 'owner.screen_name', 36 | 'owner.name', 37 | 'owner.favourites_count', 38 | 'owner.followers_count', 39 | 'owner.friends_count', 40 | 'owner.created_at' 41 | ] 42 | csv_writer.writerow(row) 43 | 44 | for list in lists: 45 | row = [ 46 | list.id, 47 | list.name, 48 | list.slug, 49 | list.description, 50 | list.member_count, 51 | list.subscriber_count, 52 | list.mode, 53 | list.created_at, 54 | list.user.id_str, 55 | list.user.screen_name, 56 | list.user.name, 57 | list.user.favourites_count, 58 | list.user.followers_count, 59 | list.user.friends_count, 60 | list.user.created_at, 61 | ] 62 | csv_writer.writerow(row) 63 | 64 | 65 | def dump_list_tweets(list, tweets, dump_file_path): 66 | with open(dump_file_path, 'w', encoding='utf_8') as output_file: 67 | for tweet in tweets: 68 | json_str = json.dumps(tweet._json) 69 | output_file.write(json_str + '\n') 70 | 71 | 72 | def dump_list_users(list, users, dump_file_path): 73 | with open(dump_file_path, 'w', encoding='utf_8') as output_file: 74 | for user in users: 75 | json_str = json.dumps(user._json) 76 | output_file.write(json_str + '\n') 77 | 78 | 79 | def get_list_members(api, list_id, max_count=5000): 80 | res = api.list_members(list_id=list_id, count=max_count) 81 | return res 82 | 83 | 84 | def create_list(api, name): 85 | res = api.create_list(name, mode='private') 86 | return res.id_str 87 | 88 | 89 | def add_to_list(api, list_id, members): 90 | for i in range(0, len(members), 100): 91 | api.add_list_members(list_id=list_id, user_id=members[i: i + 100]) 92 | 93 | 94 | def get_current_user(): 95 | result = [] 96 | current_users = r'results\current_users.txt' 97 | with open(current_users, 'r', encoding='utf_8') as input: 98 | for line in input: 99 | user, id = line.strip().split(',') 100 | result.append((user, id)) 101 | return result 102 | 103 | 104 | def select_top_lists(all_lists, 105 | avg_sec_word_count, 106 | avg_member_score, 107 | avg_subscriber_count, 108 | avg_owner_strenght, 109 | count=1000): 110 | for i in all_lists: 111 | all_lists[i]['score'] = (all_lists[i]['sec_word_count'] / avg_sec_word_count) * \ 112 | (all_lists[i]['member_score'] / avg_member_score) * \ 113 | (all_lists[i]['subscriber_count'] / avg_subscriber_count) * \ 114 | (all_lists[i]['owner_strength'] / avg_owner_strenght) 115 | lists_rank = sorted(all_lists.items(), key=lambda x: x[1]['score'], 
reverse=True) 116 | discard_index = len(all_lists) 117 | counter = 0 118 | for i in lists_rank: 119 | if i[1]['score'] == 0: 120 | discard_index = counter 121 | break 122 | counter += 1 123 | 124 | if discard_index > count: 125 | discard_index = count 126 | 127 | lists_rank = lists_rank[:discard_index] 128 | 129 | return lists_rank 130 | 131 | 132 | def dump_cti_experts(api, base_dir, test_run=False): 133 | # If you want to 134 | top_users_res = [] 135 | user_dir = os.path.join(base_dir, 'users') 136 | user_status_dir = os.path.join(base_dir, r'users\status') 137 | list_dir = os.path.join(base_dir, 'lists') 138 | 139 | if not os.path.exists(user_status_dir): 140 | os.makedirs(user_status_dir) 141 | if not os.path.exists(list_dir): 142 | os.makedirs(list_dir) 143 | 144 | # For each CTI expert (in the input list), dump the info of all the lists that the expert is a member into a file 145 | for user, user_id in get_current_user(): 146 | try: 147 | user_lists_dump_path = os.path.join(user_dir, user_id + '.user.csv') 148 | if not os.path.exists(user_lists_dump_path): 149 | lists = get_user_lists(api, user) 150 | dump_user_lists(user, lists, user_lists_dump_path) 151 | time.sleep(2) 152 | 153 | if test_run: 154 | break 155 | 156 | except Exception as exp: 157 | print('ERROR {}:{}'.format(user, exp.reason)) 158 | 159 | list_rank = [] 160 | specific_words = ['ioc', 161 | 'malware', 162 | 'Indicator.?of.?Compromise', 163 | 'threat.?hunt', 164 | 'threat.?hunt', 165 | 'phishing.?hunt', 166 | 'phish.?hunt', 167 | 'threat.?int', 168 | 'threat.?research', 169 | 'ransomware', 170 | 'mal.?doc'] 171 | 172 | generic_words = ['info.?sec', 173 | 'cyber.?sec', 174 | 'security', 175 | 'ransomware'] 176 | 177 | specific_regex_rule = re.compile('|'.join(specific_words), re.IGNORECASE) 178 | generic_regex_rule = re.compile('|'.join(generic_words), re.IGNORECASE) 179 | 180 | # sub_scores: number of relevant words, number_follower/log(number_followers), number_subscriber, owner_strength 181 | # score is a product of the above sub scores 182 | # each sub score must be in the range [0,+infinity), however, average must be 1 183 | # sub scores that are above average increase the total score 184 | 185 | all_lists = {} 186 | total_sec_word_count = 0 187 | total_member_score = 0 188 | total_subscriber_count = 0 189 | total_owner_strength = 0 190 | 191 | 192 | for file in glob.glob(os.path.join(user_dir, "*.user.csv")): 193 | with open(file, 'r', encoding='utf_8') as input_file: 194 | reader = csv.reader(input_file) 195 | next(reader) 196 | counter = 0 197 | for row in reader: 198 | counter += 1 199 | id = row[0] 200 | if id not in all_lists: 201 | all_lists[id] = {} 202 | all_lists[id]['id'] = row[0] 203 | all_lists[id]['name'] = row[1] 204 | all_lists[id]['text'] = row[1] + ' ' + row[3] 205 | all_lists[id]['sec_word_count'] = len(specific_regex_rule.findall(all_lists[id]['text'])) * 3 + \ 206 | len(generic_regex_rule.findall(all_lists[id]['text'])) 207 | total_sec_word_count += all_lists[id]['sec_word_count'] 208 | 209 | all_lists[id]['member_count'] = int(row[4]) 210 | if all_lists[id]['member_count'] > 1: 211 | all_lists[id]['member_score'] = all_lists[id]['member_count'] / math.log2( 212 | all_lists[id]['member_count']) 213 | else: 214 | all_lists[id]['member_score'] = 0 215 | total_member_score += all_lists[id]['member_score'] 216 | 217 | all_lists[id]['subscriber_count'] = int(row[5]) 218 | all_lists[id]['subscriber_count'] += 1 219 | total_subscriber_count += all_lists[id]['subscriber_count'] 220 | 221 | 
all_lists[id]['owner_screen_name'] = row[9] 222 | all_lists[id]['owner_followers_count'] = int(row[12]) 223 | all_lists[id]['owner_friends_count'] = int(row[13]) 224 | if all_lists[id]['owner_friends_count'] >= 1: 225 | all_lists[id]['owner_strength'] = math.log2( 226 | (all_lists[id]['owner_followers_count'] + all_lists[id]['owner_friends_count']) / 227 | all_lists[id]['owner_friends_count']) 228 | else: 229 | all_lists[id]['owner_strength'] = 0 230 | total_owner_strength += all_lists[id]['owner_strength'] 231 | 232 | if test_run: 233 | if counter > 10: 234 | break 235 | 236 | avg_sec_word_count = total_sec_word_count / len(all_lists) 237 | avg_member_score = total_member_score / len(all_lists) 238 | avg_subscriber_count = total_subscriber_count / len(all_lists) 239 | avg_owner_strength = total_owner_strength / len(all_lists) 240 | 241 | top_lists = select_top_lists(all_lists, 242 | avg_sec_word_count, 243 | avg_member_score, 244 | avg_subscriber_count, 245 | avg_owner_strength) 246 | 247 | counter = 0 248 | 249 | # Dump the latest 1000 timeline tweets of each top lists 250 | for top_list in top_lists: 251 | try: 252 | print(top_list[0] + '\t' + top_list[1]['owner_screen_name'] + '\t\t' + top_list[1]['name']) 253 | file_name = top_list[0] + '---' + top_list[1]['owner_screen_name'] + '---' + top_list[1][ 254 | 'name'].replace('/', '-') + '.dump.list.csv' 255 | list_tweets_file_path = os.path.join(list_dir, file_name) 256 | if not os.path.exists(list_tweets_file_path): 257 | tweets = get_list_timeline(api, top_list[0], 1000) 258 | dump_list_tweets(top_list[0], tweets, list_tweets_file_path) 259 | counter += 1 260 | 261 | if test_run: 262 | if counter > 10: 263 | break 264 | else: 265 | if counter > 150: 266 | break 267 | except Exception as exp: 268 | print('ERROR processing tweets of ' + str(top_list[0])) 269 | print(exp) 270 | 271 | # For each List, count the number of IoCs appread in the dump of the latest 1000 timeline tweets 272 | top_lists_iocs = {} 273 | ioc_global_freq = {} 274 | count = 0 275 | for file in glob.glob(os.path.join(list_dir, "*.dump.list.csv")): 276 | count += 1 277 | name = os.path.basename(file) 278 | print('processing ' + name) 279 | id = name.split('---')[0] 280 | if id not in top_lists_iocs: 281 | top_lists_iocs[id] = set() 282 | with open(file, 'r', encoding='utf_8') as input_file: 283 | for line in input_file: 284 | try: 285 | tweet = json.loads(line) 286 | iocs = iocextract.extract_iocs(tweet['text'], refang=True) 287 | for ioc in iocs: 288 | if ioc not in tweet['text']: 289 | top_lists_iocs[id].add(ioc) 290 | if ioc not in ioc_global_freq: 291 | ioc_global_freq[ioc] = 1 292 | else: 293 | ioc_global_freq[ioc] += 1 294 | except Exception as exp: 295 | print('ERROR processing ' + name + ' tweet: ' + line) 296 | 297 | # Calculate the uniqueness score for each of the lists 298 | list_ranking = {} 299 | average_score = 0 300 | for list_id, iocs in top_lists_iocs.items(): 301 | total_score = 0 302 | for ioc in iocs: 303 | ioc_count = ioc_global_freq[ioc] + 1 304 | # total_score += 1 / math.log2(ioc_count) 305 | total_score += 1 / ioc_count 306 | list_ranking[list_id] = total_score 307 | 308 | average_score += total_score 309 | 310 | average_score = average_score / len(list_ranking) 311 | 312 | list_rank_ioc = [] 313 | for top_list in top_lists: 314 | if top_list[0] in list_ranking: 315 | top_list[1]['ioc_uniqness'] = list_ranking[top_list[0]] / average_score 316 | top_list[1]['score'] *= top_list[1]['ioc_uniqness'] 317 | list_rank_ioc.append(top_list[1]) 318 | 319 | 
ranked_list = sorted(list_rank_ioc, key=lambda x: x['score'], reverse=True) 320 | 321 | with open(os.path.join(list_dir, 'list_ioc_rank'), 'w', encoding='utf_8') as rank_output: 322 | for list in ranked_list: 323 | rank_output.write( 324 | '{},{},{},{}\n'.format(list['id'], list['owner_screen_name'], list['name'], list['score'])) 325 | 326 | member_scores = {} 327 | for list in ranked_list: 328 | try: 329 | 330 | file_name = list['id'] + '---' + list['owner_screen_name'] + '---' + list['name'].replace('/', 331 | '-') + '.members.list.csv' 332 | print('Getting members of ' + list['id']) 333 | list_members_file_path = os.path.join(list_dir, file_name) 334 | if not os.path.exists(list_members_file_path): 335 | members = get_list_members(api, list['id']) 336 | print('List members count ' + str(len(members))) 337 | dump_list_users(list['id'], members, list_members_file_path) 338 | else: 339 | members =[] 340 | with open (list_members_file_path, 'r') as member_file: 341 | for line in member_file: 342 | member = Dummy() 343 | member_json_obj = json.loads(line) 344 | member.screen_name = member_json_obj['screen_name'] 345 | member.id = member_json_obj['id'] 346 | members.append(member) 347 | 348 | for member in members: 349 | if member.id not in member_scores: 350 | member_scores[member.id] = {'score': 0, 'screen_name': member.screen_name, 'lists': set()} 351 | 352 | member_scores[member.id]['lists'].add(list['id']) 353 | member_scores[member.id]['score'] += list['score'] 354 | print('All members count ' + str(len(member_scores))) 355 | except Exception as exp: 356 | print('ERROR getting members ' + list_id) 357 | 358 | member_ranks = sorted(member_scores.items(), key=lambda x: x[1]['score'], reverse=True) 359 | 360 | with open(os.path.join(base_dir, 'top_users'), 'w', encoding='utf_8', newline='') as top_users_output: 361 | writer = csv.writer(top_users_output) 362 | writer.writerow(['id', 'screen_name', 'score', 'lists']) 363 | for member in member_ranks: 364 | writer.writerow([member[0], member[1]['screen_name'], member[1]['score'], member[1]['lists']]) 365 | 366 | 367 | member_ranks = member_ranks[:1000] 368 | 369 | print("Top 1k users before considering users' tweeting history") 370 | print(member_ranks) 371 | 372 | count = 0 373 | # For each user in top_users file 374 | with open(os.path.join(base_dir, 'top_users'), 'r', encoding='utf_8', newline='') as top_users_input: 375 | csv_reader = csv.reader(top_users_input) 376 | next(csv_reader) 377 | user_iocs = {} 378 | ignore = True 379 | for row in csv_reader: 380 | user_id, screen_name, score = row[0], row[1], float(row[2]) 381 | try: 382 | print(str(count) + " - Getting tweets of "+screen_name) 383 | user_tweets_file_path = os.path.join(user_status_dir, '{}_{}_tweets.csv'.format(user_id, screen_name)) 384 | time_now = datetime.datetime.now() 385 | if screen_name not in user_iocs: 386 | user_iocs[screen_name] = {'id': user_id, 'screen_name': screen_name, 'score': score, 'days': {}} 387 | 388 | # If we have not the tweet history of the user, collect the latest of their 400 timeline tweets 389 | if not os.path.exists(user_tweets_file_path): 390 | all_tweets = get_user_timeline(api, screen_name, 400) 391 | # write the csv 392 | with open(os.path.join(user_status_dir, '{}_{}_tweets.csv'.format(user_id, screen_name)), 'w', 393 | encoding='utf_8') as output_file: 394 | # dump tweets 395 | for i in all_tweets: 396 | output_file.write(json.dumps(i._json) + '\n') 397 | output_file.flush() 398 | else: 399 | ignore = False 400 | all_tweets = [] 401 | 
with open(user_tweets_file_path, 'r', encoding='utf_8') as input_file: 402 | next(input_file) 403 | for line in input_file: 404 | try: 405 | all_tweets.append(json.loads(line)) 406 | except Exception as exp: 407 | print("Error loading tweets in "+ user_tweets_file_path) 408 | 409 | 410 | for tweet in all_tweets: 411 | if not hasattr(tweet, 'text'): 412 | text = tweet['text'] 413 | else: 414 | text = tweet.text 415 | 416 | if not hasattr(tweet, 'created_at'): 417 | created_at = parse(tweet['created_at']) 418 | created_at = created_at.replace(tzinfo=None) 419 | else: 420 | created_at = tweet.created_at 421 | 422 | iocs = iocextract.extract_iocs(text, refang=True) 423 | for ioc in iocs: 424 | if ioc not in text: 425 | day_diff = (time_now - created_at).days 426 | if day_diff < 0: 427 | day_diff = 0 428 | if day_diff not in user_iocs[screen_name]['days']: 429 | user_iocs[screen_name]['days'][day_diff] = set() 430 | 431 | user_iocs[screen_name]['days'][day_diff].add(ioc) 432 | 433 | count += 1 434 | except Exception as exp: 435 | print('Error getting statuses of '+ screen_name) 436 | 437 | if test_run: 438 | if count > 20: 439 | break 440 | else: 441 | if count > 5000: 442 | break 443 | if count % 50 == 0: 444 | 445 | print("\n\n\n\ncurrent number " + str(count)+'\n\n\n\n') 446 | 447 | avg_ioc_score = 0 448 | for screen_name, ioc in user_iocs.items(): 449 | ioc_score = 0 450 | for day, iocs in ioc['days'].items(): 451 | ioc_score += len(iocs) / ((int(day)+1)**(1/3)) 452 | ioc_score += 1 453 | ioc['ioc_score'] = ioc_score 454 | avg_ioc_score += ioc_score 455 | 456 | avg_ioc_score = avg_ioc_score / len(user_iocs) 457 | 458 | for screen_name, ioc in user_iocs.items(): 459 | ioc['ioc_score'] /= avg_ioc_score 460 | ioc['total_score'] = ioc['ioc_score']* ioc['score'] 461 | 462 | final_user_rank = sorted(user_iocs.items(), key=lambda x:x[1]['total_score'], reverse=True) 463 | 464 | with open(os.path.join(base_dir, 'top_users_final'), 'w', encoding='utf_8', newline='') as top_users_output: 465 | writer = csv.writer(top_users_output) 466 | writer.writerow(['id', 'screen_name', 'score','ioc_score', 'final_score','days']) 467 | for screen_name, details in final_user_rank: 468 | writer.writerow([details['id'], 469 | details['screen_name'], 470 | details['score'], 471 | details['ioc_score'], 472 | details['total_score'], 473 | json.dumps({x:len(y) for x,y in details['days'].items()})]) 474 | top_users_res.append((details['screen_name'],details['id'])) 475 | 476 | return top_users_res 477 | 478 | --------------------------------------------------------------------------------
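
The list score computed in select_top_lists() (CTI_expert_finder.py) is the product of four sub-scores, each divided by its average over all candidate lists, so a sub-score above its average pushes the product up and one below pulls it down. A toy illustration of that formula, with made-up numbers rather than values from any real list:

# Toy numbers only; in dump_cti_experts() the averages are computed over all collected lists.
sec_word_count, avg_sec_word_count = 6, 2.0        # weighted keyword hits in the list name/description
member_score, avg_member_score = 40.0, 50.0        # member_count / log2(member_count)
subscriber_count, avg_subscriber_count = 12, 4.0   # subscriber_count (incremented by one)
owner_strength, avg_owner_strength = 3.0, 3.0      # log2((owner followers + friends) / friends)

score = (sec_word_count / avg_sec_word_count) * \
        (member_score / avg_member_score) * \
        (subscriber_count / avg_subscriber_count) * \
        (owner_strength / avg_owner_strength)
print(score)  # 3.0 * 0.8 * 3.0 * 1.0 = 7.2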
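
Similarly, below is a minimal sketch of scoring a single tweet with the random-forest ensemble from CTI_classifer.py, mirroring the commented-out classification block in main.py. It is illustrative only: it assumes the training CSV read by construct_classifier() (dataset\training-set.csv) is available, and it fills the hashtag-count and mention-count features with 0 because there is no live tweet object here (in main.py they come from tweet.entities).

import statistics
import numpy
from CTI_classifer import construct_classifier, vectorize

classifiers, wordlist = construct_classifier()   # trains the 11 random forests on the training CSV

tweet_text = "possible #phishing kit hosted at hxxp://example[.]com/login"
vector = vectorize(tweet_text, wordlist)         # bag-of-words counts over the training vocabulary
vector.append(0)                                 # hashtag-count feature (placeholder value)
vector.append(0)                                 # user-mention-count feature (placeholder value)
vector = numpy.array(vector).reshape(1, -1)

votes = [clf.predict(vector)[0] for clf in classifiers]
print("Prediction:", statistics.mode(votes))     # majority vote across the ensemble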