├── README.md ├── .gitignore ├── CTI_classifer.py ├── construct_tweet_threads.py ├── utility.py ├── results └── current_users.txt ├── main.py ├── LICENSE └── CTI_expert_finder.py /README.md: -------------------------------------------------------------------------------- 1 | # IoCMiner 2 | A Prototype for IoCMiner which is a framework to automaticly extract Indicators of Compromise (IoCs) from Twitter. 3 | 4 | (README is in progress) 5 | 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | config/* 107 | results/* 108 | !results/current_users.txt 109 | .idea 110 | 111 | -------------------------------------------------------------------------------- /CTI_classifer.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn.model_selection import train_test_split 3 | from sklearn import metrics 4 | from sklearn.ensemble import RandomForestClassifier 5 | import pandas as ps 6 | import statistics 7 | from nltk.tokenize import word_tokenize 8 | from nltk.stem import WordNetLemmatizer 9 | import nltk 10 | 11 | 12 | number_of_classifiers = 11 13 | nltk.download('punkt') 14 | lem = WordNetLemmatizer() 15 | 16 | def get_random_forest_classifiers(number, X, y): 17 | classifiers = [] 18 | for i in range(number): 19 | x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=i, stratify=y) 20 | estimator = RandomForestClassifier(n_estimators=300, max_features=.15, criterion='entropy', min_samples_split=4) 21 | estimator.fit(x_train, y_train) 22 | classifiers.append(estimator) 23 | return classifiers 24 | 25 | def construct_classifier(): 26 | # For training the classifier 27 | master_Table = 
ps.read_csv(r'dataset\training-set.csv', delimiter=',')
28 |     X, y = master_Table.iloc[:, 1:-1], master_Table.iloc[:, -1]
29 |     classifiers = get_random_forest_classifiers(number_of_classifiers, X, y)
30 |     columns = master_Table.head()
31 | 
32 |     column_names = []
33 |     for col in columns:
34 |         column_names.append(col)
35 |     column_names = column_names[1:-3]
36 | 
37 |     return classifiers, column_names
38 | 
39 | 
40 | def vectorize(tweet, vocab):
41 |     vector = []
42 | 
43 |     tweet = tweet.lower()
44 |     bag = word_tokenize(tweet)
45 | 
46 |     for i in vocab:
47 |         count = 0
48 |         for j in bag:
49 |             if i == j:
50 |                 count += 1
51 |         vector.append(count)
52 | 
53 |     return vector
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     final_estimate = []
58 | 
59 |     classifiers, col_names = construct_classifier()
60 | 
61 |     # For evaluating the classifier
62 |     test_table = ps.read_csv(r'dataset\test-set-random.csv', delimiter=',')
63 | 
64 |     x2, y2 = test_table.iloc[:, 1:-1], test_table.iloc[:, -1]
65 | 
66 |     estimates = []
67 |     for i in range(number_of_classifiers):
68 |         y_estimate = classifiers[i].predict(x2)
69 |         estimates.append(y_estimate)
70 | 
71 |     aggregated_results = []
72 |     n = 0
73 | 
74 |     # do the majority voting over all test samples
75 |     while n < len(y2):
76 |         for i in estimates:
77 |             vote = i[n]
78 |             aggregated_results.append(vote)
79 |         final_estimate.append(statistics.mode(aggregated_results))
80 |         aggregated_results.clear()
81 |         n += 1
82 |     accuracy = metrics.accuracy_score(y_true=y2, y_pred=final_estimate)
83 |     print(accuracy)
--------------------------------------------------------------------------------
/construct_tweet_threads.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import json
4 | 
5 | ioc_base_dir = r'results'
6 | 
7 | class TweetInfo:
8 |     def __init__(self, tweet):
9 |         self.tweet = tweet
10 |         self.responses = []
11 |         self.reply_to = None
12 | 
13 |     @staticmethod
14 |     def get_all_text(tweet):
15 |         result = ''
16 |         for response in tweet.responses:
17 |             result += response.tweet + '\n'
18 |             result += TweetInfo.get_all_text(response) + '\n'
19 |         return result
20 | 
21 | 
22 |     def __str__(self):
23 |         parent = self.reply_to
24 |         ancestors_text = ''
25 |         while parent is not None:
26 |             ancestors_text = parent.tweet + '\n' + ancestors_text
27 |             parent = parent.reply_to
28 |         return '[ancestors_text]\n' + ancestors_text + '\n[self]\n' + self.tweet + '\n[descendants]\n' + TweetInfo.get_all_text(self)
29 | 
30 | 
31 | def get_text(tweet):
32 |     if 'retweeted_status' in tweet and \
33 |        'extended_tweet' in tweet['retweeted_status'] and \
34 |        'full_text' in tweet['retweeted_status']['extended_tweet']:
35 |         text = tweet['retweeted_status']['extended_tweet']['full_text']
36 |     elif 'extended_tweet' in tweet and 'full_text' in tweet['extended_tweet']:
37 |         text = tweet['extended_tweet']['full_text']
38 |     else:
39 |         text = tweet['text']
40 |     return text
41 | 
42 | def get_ioc_tweet_ids(file):
43 |     result = set()
44 |     iocs = set()
45 |     with open(file, 'r', encoding='utf_8') as input_file:
46 |         for line in input_file:
47 |             segments = line.strip().split(',')
48 |             seg_len = len(segments)
49 |             if seg_len >= 7:
50 | 
51 |                 segments[6] = segments[6].strip()
52 |                 if segments[6] not in iocs and segments[5] != 'email':
53 |                     result.add(segments[0])
54 |                     iocs.add(segments[6])
55 |     return result
56 | 
57 | if __name__ == "__main__":
58 |     # Get the IoCs from the daily IoC files
59 |     ioc_tweet_ids = set()
60 |     for file in glob.glob(os.path.join(ioc_base_dir, '*.ioc.csv')):
61 |         file_name = 
file.split('\\')[-1] 62 | res = get_ioc_tweet_ids(file) 63 | ioc_tweet_ids.update(res) 64 | 65 | tweets = {} 66 | with open(os.path.join(ioc_base_dir, 'top_user_dump.json'), 'r', encoding='utf_8') as input_file: 67 | tag_dataset = {} 68 | seen_tweets = set() 69 | # Construct tweet threads from a tweet dump 70 | for count, line in enumerate(input_file): 71 | tweet = json.loads(line) 72 | current_tweet = TweetInfo(get_text(tweet)) 73 | if tweet['in_reply_to_status_id_str'] == None: 74 | tweets[tweet['id_str']] = current_tweet 75 | else: 76 | if tweet['in_reply_to_status_id_str'] in tweets: 77 | tweets[tweet['in_reply_to_status_id_str']].responses.append(current_tweet) 78 | tweets[tweet['id_str']] = current_tweet 79 | 80 | # Print the tweet threads containing IoCs 81 | for ioc_tweet_id in ioc_tweet_ids: 82 | if ioc_tweet_id in tweets: 83 | if len(tweets[ioc_tweet_id].responses) > 0: 84 | print(tweets[ioc_tweet_id]) 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /utility.py: -------------------------------------------------------------------------------- 1 | import tweepy 2 | import json 3 | import datetime 4 | 5 | def get_twitter_api(): 6 | with open(r'config/tweeter.auth', 'r') as auth_file: 7 | consumer_key, consumer_secret, access_token, access_token_secret = auth_file.read().split() 8 | 9 | # OAuth process, using the keys and tokens 10 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 11 | auth.set_access_token(access_token, access_token_secret) 12 | 13 | api = tweepy.API(auth, wait_on_rate_limit=True) 14 | 15 | return api 16 | 17 | 18 | def get_user_timeline(api, screen_name, count = 200): 19 | # Twitter only allows access to a users most recent 3240 tweets with this method 20 | # initialize a list to hold all the tweepy Tweets 21 | 22 | alltweets = [] 23 | 24 | fetched_count = 200 25 | 26 | # make initial request for most recent tweets (200 is the maximum allowed count) 27 | new_tweets = api.user_timeline(screen_name=screen_name, count=200) 28 | 29 | # save most recent tweets 30 | alltweets.extend(new_tweets) 31 | 32 | print("...%s tweets downloaded so far" % (len(alltweets))) 33 | 34 | if fetched_count >= count: 35 | return alltweets[:count] 36 | 37 | # save the id of the oldest tweet less one 38 | oldest = alltweets[-1].id - 1 39 | 40 | # keep grabbing tweets until there are no tweets left to grab 41 | while len(new_tweets) > 0: 42 | # all subsiquent requests use the max_id param to prevent duplicates 43 | new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest) 44 | 45 | 46 | # save most recent tweets 47 | alltweets.extend(new_tweets) 48 | 49 | print("...%s tweets downloaded so far" % (len(alltweets))) 50 | 51 | if len(alltweets) >= count: 52 | break 53 | 54 | # update the id of the oldest tweet less one 55 | oldest = alltweets[-1].id - 1 56 | 57 | 58 | 59 | 60 | return alltweets[:count] 61 | 62 | 63 | def get_list_timeline(api, list_id, count): 64 | # Twitter only allows access to a users most recent 3240 tweets with this method 65 | # initialize a list to hold all the tweepy Tweets 66 | 67 | alltweets = [] 68 | 69 | fetched_count = 200 70 | # make initial request for most recent tweets (200 is the maximum allowed count) 71 | new_tweets = api.list_timeline(list_id=list_id, count=200) 72 | 73 | # save most recent tweets 74 | alltweets.extend(new_tweets) 75 | 76 | print("...%s list downloaded so far" % (len(alltweets))) 77 | if fetched_count >= count: 78 | return alltweets[:count] 79 | 80 | # save 
the id of the oldest tweet less one 81 | oldest = alltweets[-1].id - 1 82 | 83 | # keep grabbing tweets until there are no tweets left to grab 84 | while len(new_tweets) > 0: 85 | # all subsiquent requests use the max_id param to prevent duplicates 86 | new_tweets = api.user_timeline(list_id=list_id, count=200, max_id=oldest) 87 | 88 | fetched_count += 200 89 | 90 | # save most recent tweets 91 | alltweets.extend(new_tweets) 92 | 93 | print("...%s list downloaded so far" % (len(alltweets))) 94 | 95 | if fetched_count >= count: 96 | break 97 | 98 | # update the id of the oldest tweet less one 99 | oldest = alltweets[-1].id - 1 100 | 101 | 102 | 103 | return alltweets[:count] 104 | 105 | def get_date(): 106 | return datetime.datetime.now().strftime('%Y%m%d') 107 | -------------------------------------------------------------------------------- /results/current_users.txt: -------------------------------------------------------------------------------- 1 | NelsonSecurity,3095065120 2 | http_error_418,2983529314 3 | SfyLabs,3328717103 4 | Bitcoin,1141391982 5 | Qantas,218730857 6 | asset_island_,915670001595682816 7 | AmazonHelp,85741735 8 | SendGrid,42126617 9 | sweatshack,543867579 10 | lunomoney,1274469684 11 | SpiritGroupInc,396292584 12 | WeTransfer,81694711 13 | texlinepk,814796199802896384 14 | evanderburg,16309969 15 | amazon,20793816 16 | CraneHassold,758651689662550016 17 | PassiveTotal,2479900676 18 | ziraatbankasi,213983669 19 | VakifBank,613251601 20 | CEPTETEB,2195601962 21 | letsencrypt,2887837801 22 | unitedlayer,18454193 23 | fumik0_,75868226 24 | phishing,15234215 25 | FedEx,134887156 26 | Ficohsa,78359919 27 | ChaseSupport,274789264 28 | Discover,16147150 29 | ATTCares,62643312 30 | Rabobank,7385462 31 | CyprusMFA,1446632138 32 | comsadotio,878294178693652482 33 | CenterX1,97054852 34 | douglasmun,620341271 35 | SwiftOnSecurity,2436389418 36 | ET_Labs,2835071339 37 | AskNationwide,469358921 38 | HRCSaudi,636184033 39 | MrGlaive,2692504059 40 | Barclays,191781601 41 | HSBC,467368287 42 | instagram,180505807 43 | foofightersbr,28891221 44 | foofighters,19081001 45 | AmericanExpress,42712551 46 | serasaexperian,48455669 47 | HostGator,15770852 48 | Cielo_br,126090155 49 | PosteNews,388197163 50 | comododesktop,19257473 51 | tjbahia,51874865 52 | sicredi_oficial,146573998 53 | Mailbox,624947324 54 | Caixa,229191436 55 | Match,19013415 56 | americanascom,35019751 57 | Bitly,15234886 58 | defesa_digital,328254142 59 | santander_br,48700857 60 | Homehost,31437607 61 | github,13334762 62 | AskAmex,62911603 63 | coinhive_com,963717856356651008 64 | publicww,2546955637 65 | riodejaneiro,35001292 66 | itau,179398386 67 | usponline,24241923 68 | AWSSupport,120967386 69 | 000webhosting,376402154 70 | 000webhost_com,718444759828807680 71 | Adobe,63786611 72 | AskPayPal,63247343 73 | blockchain,125304737 74 | bancosantander,246196403 75 | NavyFederalHelp,22972532 76 | NavyFederal,145227381 77 | inj3ct0r,56179836 78 | AlibabaGroup,2694564780 79 | Cielo,489961357 80 | petrobras,45083460 81 | Cryptopia_NZ,2916954277 82 | Vivoemrede,43218789 83 | Mastercard,75014376 84 | ipiranga,15836990 85 | uolhost,15047744 86 | sekoia_fr,1479558380 87 | mustarddawg,872794204086444032 88 | BroadAnalysis,1055529325 89 | DynamicAnalysis,177813214 90 | Anti_Expl0it,778763839974232065 91 | Arubait,289758610 92 | CoatonDean,2999674613 93 | orange,296824934 94 | Cdiscount,63142684 95 | Eranetcom1,583560916 96 | HSBC_UK,2922732233 97 | WifiRumHam,773924520189169666 98 | NetflixBrasil,231579149 99 | 
PhishTank_Bot,920309658912407552 100 | Gendarmerie,2184489764 101 | hexlax,2610260400 102 | caixabank,270429778 103 | USAA,16584443 104 | web4africa,34330865 105 | errorinn,1567253886 106 | virustotal,145033865 107 | ForstPenguin,37678351 108 | nixcraft,17484680 109 | phish_total,920628142385098754 110 | APWG,15495424 111 | the_root_labs,731181031953293312 112 | serveriusbv,94043958 113 | DCUcreditunion,55548810 114 | juangameztorres,1341334147 115 | TDGroupInc,397527876 116 | aboutworldlangs,65455730 117 | USTreasury,120176950 118 | TMobileHelp,185728888 119 | _DanRoberts_,846248293852049411 120 | abel1ma,225299286 121 | nullcookies,723905075811332096 122 | FourOctets,3185889936 123 | Jan0fficial,735370482896244736 124 | Racco42,450579130 125 | RealRalf9000,900300460514562050 126 | MrClickyClicks,976229553029943296 127 | _odisseus,1630455818 128 | ps66uk,894741613624348672 129 | MalWebHunter,847813680003600384 130 | yashkadakia,18903282 131 | ali0une,127333069 132 | _larry0,17055552 133 | TigzyRK,158836607 134 | kobebryamV2,211117902 135 | KylianXAnalyst,893476344293580802 136 | EllandBack1,569434401 137 | doppelvizsla,783834754722521088 138 | rymcvicar,508460308 139 | 0Btemos_BHS,2570023247 140 | HerbieZimmerman,982533264 141 | JackBurt0n,14089433 142 | stecar792,886573394224971776 143 | Lvanoel,2654177803 144 | mgiovamo,365600157 145 | M157q_News_RSS,4639018735 146 | ebubekirbastama,2993714093 147 | fcafra,323249092 148 | F_kZ_,600820120 149 | Darkn1ght10,708000179773956096 150 | secb0t,864824454945353730 151 | p4r4n0id_il,303725749 152 | misojosgatos,123093184 153 | Jakeashacks,762240284134633472 154 | MalwareHunterBR,1369263216 155 | FewAtoms,951498497584492544 156 | DissectMalware,967487209535361025 157 | PhishingAi,974046842408357894 158 | SecuriTears,909731475314049025 159 | malware_traffic,1612403564 160 | P3pperP0tts,948580944323194880 161 | bry_campbell,837320959 162 | MakFLwana,376580061 163 | CryptoInsane,3740550016 164 | Zerophage1337,736664966980308992 165 | killamjr,744930225352548352 166 | taku888infinity,272034153 167 | packet_Wire,294036033 168 | dvk01uk,4272657142 169 | neonprimetime,48459503 170 | catnap707,265308969 171 | avman1995,3069922981 172 | mateeuslinno,1788573764 173 | ViriBack,3101795085 174 | Dashowl,2896982439 175 | papa_anniekey,171305529 176 | malwrhunterteam,2847021941 177 | certbr,190765950 178 | James_inthe_box,703614655 179 | JAMESWT_MHT,3433210978 180 | Dropbox,14749606 181 | illegalFawn,778528483395895296 182 | pollo290987,838969349998272512 183 | WellsFargo,1178011 184 | bad_packets,856982087101849600 185 | BancodoBrasil,83723557 186 | NaomiSuzuki_,1479126768 187 | autumn_good_35,747100113231745026 188 | virusbay_io,923656996741615617 189 | PayPal,30018058 190 | anyrun_app,833639043862786048 191 | digitalocean,457033547 192 | BankofAmerica,204881628 193 | UOL,70799317 194 | phishingalert,895935810469494786 195 | seen8th,4706453413 196 | JRoosen,29199860 197 | JaromirHorejsi,1550544901 198 | Artilllerie,296167523 199 | facebook,2425151 200 | chronic,86315276 201 | 0x7fff9,809023873245204481 202 | APNews,16744503 203 | LinkedIn,13058772 204 | Techhelplistcom,1222068722 205 | dyngnosis,14268523 206 | ameli_actu,2842497633 207 | Chase,274673392 208 | PayPalInfoSec,23807666 209 | netflix,16573941 210 | GoDaddy,14949454 211 | urlscanio,784841998964514816 212 | umbler,2707504046 213 | Ledtech3,4579549601 214 | EmergingThreats,156671416 215 | buffaloverflow,293922100 216 | ShiaoQu17,973233102947364865 217 | TDBank_US,67378997 218 | 
NCCInfoSec,880355951529820160 219 | Toyota,14219877 220 | CarrefourES,275430095 221 | namesilo,196128578 222 | OVH,317647291 223 | gmail,38679388 224 | Microsoft,74286565 225 | theschoolsleuth,2885930067 226 | DHLUS,2507741466 227 | MobileLegendsOL,762959278189584384 228 | AppleSupport,3309375033 229 | Halkbank,220951448 230 | BofA_Help,18735040 231 | WBPolice,751102643909591040 232 | DeptVetAffairs,78408666 233 | HMRCcustomers,2848281329 234 | ficoba,19187698 235 | Cloudflare,32499999 236 | DocuSign,18909248 237 | SamsungUS,34356968 238 | ASBBank,20730278 239 | Bradesco,79184083 240 | locaweb,15020362 241 | Outlook,605805760 242 | baberpervez2,850551473624797184 243 | ABNAMRO,2270841 244 | fs0c131y,3317233336 245 | PopularResponde,1359682945 246 | ItsReallyNick,73195179 247 | Dinosn,128484298 248 | _devonkerr_,2843933755 249 | lennyzeltser,14780493 250 | retBandit,325116309 251 | LukasStefanko,2936786110 252 | JohnLaTwC,208346105 253 | decalage2,937763623 254 | VK_Intel,3332934374 255 | Mrtn9,91752953 256 | B_H101,867327304850178048 257 | securitydoggo,792480469568937988 258 | Bank_Security,1886777581 259 | cocaman,1169961 260 | blu3_team,876111091818000385 261 | hasherezade,1590754944 262 | powershellcode,951900231347531776 263 | TheMalwareQueen,849590344102682624 264 | TheWit0k,715097237378957312 265 | executemalware,743883460587167744 -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import tweepy # https://github.com/tweepy/tweepy 2 | from queue import Queue 3 | from threading import Thread 4 | from gglsbl import SafeBrowsingList 5 | import requests 6 | import shutil 7 | from CTI_expert_finder import * 8 | from CTI_classifer import * 9 | import numpy 10 | 11 | 12 | class IOCMinerStreamListener(tweepy.StreamListener): 13 | 14 | def __init__(self, api, top_users, rand_users): 15 | self.api = api 16 | self.output = r"results\top_user_dump.json" 17 | self.output_file = open(self.output, "a") 18 | self.rand_users = rand_users 19 | self.top_users = top_users 20 | self.classifiers, self.wordlist = construct_classifier() 21 | self.enclosure_queue = Queue() 22 | self.worker = Thread(target=self.worker, args=(1, self.enclosure_queue,)) 23 | self.worker.setDaemon(True) 24 | self.worker.start() 25 | 26 | 27 | def on_status(self, status): 28 | self.output_file.write(json.dumps(status._json )+'\n') 29 | self.enclosure_queue.put(status) 30 | 31 | def worker(self, id, queue): 32 | 33 | with open(r'config\gglsbl.auth', 'r') as auth_file: 34 | gglsbl_key = auth_file.read().strip() 35 | 36 | sbl = SafeBrowsingList(gglsbl_key, db_path=r"dataset\google_safe_browisng_db") 37 | # sbl.update_hash_prefix_cache() 38 | 39 | turn = True 40 | while True: 41 | 42 | # Update Google SBL database every 12 hours at time X (e.g. 
3 AM and 3 PM)
43 |             hour = datetime.datetime.today().hour
44 |             if hour % 12 == 3 and turn:
45 |                 sbl.update_hash_prefix_cache()
46 |                 turn = False
47 |             elif hour % 12 != 3:
48 |                 turn = True
49 | 
50 |             today = get_date()
51 |             with open(os.path.join('results', today+'.ioc.csv'),'a+',encoding='utf_8') as output_file:
52 |                 tweet = queue.get()
53 |                 try:
54 |                     if hasattr(tweet, 'retweeted_status') and hasattr(tweet.retweeted_status, 'extended_tweet') and 'full_text' in tweet.retweeted_status.extended_tweet:
55 |                         text = tweet.retweeted_status.extended_tweet['full_text']
56 |                     elif hasattr(tweet, 'extended_tweet') and 'full_text' in tweet.extended_tweet:
57 |                         text = tweet.extended_tweet['full_text']
58 |                     elif not hasattr(tweet, 'text'):
59 |                         text = tweet['text']
60 |                     else:
61 |                         text = tweet.text
62 | 
63 |                     if hasattr(tweet, 'retweeted_status'):
64 |                         if hasattr(tweet.retweeted_status, 'extended_tweet'):
65 |                             final_urls = tweet.retweeted_status.extended_tweet['entities']['urls']
66 |                         else:
67 |                             final_urls = tweet.retweeted_status.entities['urls']
68 |                     else:
69 |                         if hasattr(tweet, 'extended_tweet'):
70 |                             final_urls = tweet.extended_tweet['entities']['urls']
71 |                         else:
72 |                             final_urls = tweet.entities['urls']
73 | 
74 |                     for final_url in final_urls:
75 |                         # If a pastebin URL, get the raw content and append it to the tweet content
76 |                         if final_url['expanded_url'].startswith('https://pastebin.com/'):
77 |                             pastebin = final_url['expanded_url']
78 |                             if 'raw' not in pastebin:
79 |                                 pastebin = pastebin.replace('https://pastebin.com/', 'https://pastebin.com/raw/')
80 | 
81 |                             req = requests.get(pastebin)
82 |                             text += '\n' + req.text  # req.text is a str; req.content would be bytes
83 | 
84 |                     user_type = 'top'
85 |                     if tweet.user.id_str in self.rand_users:
86 |                         user_type = 'rand'
87 | 
88 |                     print("###########################$$$$$$$$$$$$$$$$$$$$$$$$$$$")
89 |                     print(text)
90 | 
91 |                     # classifier must be retrained with new data
92 |                     # vector = vectorize(text, self.wordlist)
93 |                     # vector.append(len(tweet.entities['hashtags']))
94 |                     # vector.append(len(tweet.entities['user_mentions']))
95 |                     # vector = numpy.array(vector).reshape(1, -1)
96 |                     # estimates = []
97 |                     # for i in range(number_of_classifiers):
98 |                     #     y_estimate = self.classifiers[i].predict(vector)
99 |                     #     estimates.append(y_estimate)
100 |                     # vote = statistics.mode([x[0] for x in estimates])
101 |                     # print("Prediction: "+vote)
102 | 
103 |                     ips = list(iocextract.extract_ips(text, refang=True))
104 |                     for ip in ips:
105 |                         if ip not in text:  # keep only IoCs that were defanged in the tweet (refanged form is not verbatim)
106 |                             output_file.write('{},{},{},{},{},ip,{}\n'.format(tweet.id,tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, ip))
107 | 
108 |                     urls = list(iocextract.extract_urls(text, refang=True))
109 |                     for url in urls:
110 |                         if url not in text:
111 |                             result = sbl.lookup_url(url.rstrip('.'))
112 |                             if result is not None:
113 |                                 output_file.write('{},{},{},{},{},url,{},{}\n'.format(tweet.id, tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, url.rstrip('.'),result))
114 |                             else:
115 |                                 output_file.write('{},{},{},{},{},url,{},benign\n'.format(tweet.id, tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, url.rstrip('.')))
116 | 
117 |                     emails = list(iocextract.extract_emails(text, refang=True))
118 |                     for email in emails:
119 |                         if email not in text:
120 |                             output_file.write('{},{},{},{},{},email,{}\n'.format(tweet.id, tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, email))
121 |                     hashes = list(iocextract.extract_hashes(text))
122 |                     for hash in hashes:
123 |                         output_file.write('{},{},{},{},{},hash,{}\n'.format(tweet.id, tweet.created_at, user_type, tweet.user.id_str, tweet.user.screen_name, hash))
124 |                 except Exception as exp:
125 |                     print(exp)
126 | 
127 |             queue.task_done()
128 | 
129 |     def on_error(self, status_code):
130 |         if status_code == 420:
131 |             # returning False in on_data disconnects the stream
132 |             return False
133 | 
134 |     def __del__(self):
135 |         self.output_file.close()
136 | 
137 | 
138 | UPDATE_CURRENT_USER = False
139 | 
140 | api = get_twitter_api()
141 | val = api.rate_limit_status()
142 | 
143 | if api.verify_credentials():
144 | 
145 |     today = get_date()
146 | 
147 |     base_dir = os.path.join(r'results\days', today)
148 | 
149 |     top_users_final_path = os.path.join(base_dir, 'top_users_final')
150 |     if not os.path.exists(top_users_final_path):
151 |         top_users = dump_cti_experts(api, base_dir, test_run=True)
152 |         if UPDATE_CURRENT_USER:
153 |             shutil.copy2('results\\current_users.txt', 'results\\current_users.old.txt')
154 |             with open('results\\current_users.txt','w', encoding='utf_8') as current_users_file:
155 |                 for user in top_users:
156 |                     current_users_file.write("{},{}\n".format( user[0], user[1]))
157 | 
158 |     user_ids = []
159 |     with open(os.path.join(base_dir, 'top_users_final'), 'r', encoding='utf_8') as top_user_file:
160 |         next(top_user_file)
161 |         csv_reader = csv.reader(top_user_file)
162 |         for row in csv_reader:
163 |             user_ids.append(row[0])
164 | 
165 |     rand_user_ids = []
166 |     # with open(os.path.join(base_dir, 'rand_users_final_1k'), 'r', encoding='utf_8') as top_user_file:
167 |     #     next(top_user_file)
168 |     #     csv_reader = csv.reader(top_user_file)
169 |     #     for row in csv_reader:
170 |     #         rand_user_ids.append(row[0])
171 | 
172 |     twitter_listener = IOCMinerStreamListener(api, set(user_ids), set(rand_user_ids))
173 |     IOC_stream = tweepy.Stream(auth=api.auth, listener=twitter_listener)
174 | 
175 |     # Collect tweets from a set of top CTI experts and a set of randomly selected users (indefinite loop)
176 |     while True:
177 |         try:
178 |             users = user_ids[0:1000]
179 |             users.extend(rand_user_ids)
180 |             IOC_stream.filter(follow=users)
181 |         except Exception as exp:
182 |             print(str(exp))
183 |             IOC_stream.disconnect()
184 | 
185 |             # If the stream listener is terminated, wait 120 seconds before creating a new one
186 |             time.sleep(120)
187 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /CTI_expert_finder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import csv 4 | import time 5 | import re 6 | import math 7 | from utility import * 8 | import iocextract 9 | from dateutil.parser import parse 10 | 11 | class Dummy(object): 12 | pass 13 | 14 | def get_user_lists(api, user, max_count=1000): 15 | res = api.lists_memberships(screen_name=user, count=max_count) 16 | # Sorting the results based on subscriber_count and member_count 17 | res = sorted([i for i in res if i.mode.lower() == 'public'], key=lambda x: x.member_count, reverse=True) 18 | res = sorted([i for i in res], key=lambda x: x.subscriber_count, reverse=True) 19 | return res 20 | 21 | 22 | def dump_user_lists(user, lists, dump_file_path): 23 | with open(dump_file_path, 'w', encoding='utf_8', newline='') as output_file: 24 | csv_writer = csv.writer(output_file) 25 | row = [ 26 | 'id', 27 | 'name', 28 | 'slug', 29 | 'description', 30 | 'member_count', 31 | 'subscriber_count', 32 | 'mode', 33 | 'created_at', 34 | 'owner.id_str', 35 | 'owner.screen_name', 36 | 'owner.name', 37 | 'owner.favourites_count', 38 | 'owner.followers_count', 39 | 'owner.friends_count', 40 | 'owner.created_at' 41 | ] 42 | csv_writer.writerow(row) 43 | 44 | for list in lists: 45 | row = [ 46 | list.id, 47 | list.name, 48 | list.slug, 49 | list.description, 50 | list.member_count, 51 | list.subscriber_count, 52 | list.mode, 53 | list.created_at, 54 | list.user.id_str, 55 | list.user.screen_name, 56 | list.user.name, 57 | list.user.favourites_count, 58 | list.user.followers_count, 59 | list.user.friends_count, 60 | list.user.created_at, 61 | ] 62 | csv_writer.writerow(row) 63 | 64 | 65 | def dump_list_tweets(list, tweets, dump_file_path): 66 | with open(dump_file_path, 'w', encoding='utf_8') as output_file: 67 | for tweet in tweets: 68 | json_str = json.dumps(tweet._json) 69 | output_file.write(json_str + '\n') 70 | 71 | 72 | def dump_list_users(list, users, dump_file_path): 73 | with open(dump_file_path, 'w', encoding='utf_8') as output_file: 74 | for user in users: 75 | json_str = json.dumps(user._json) 76 | output_file.write(json_str + '\n') 77 | 78 | 79 | def get_list_members(api, list_id, max_count=5000): 80 | res = api.list_members(list_id=list_id, count=max_count) 81 | return res 82 | 83 | 84 | def create_list(api, name): 85 | res = api.create_list(name, mode='private') 86 | return res.id_str 87 | 88 | 89 | def add_to_list(api, list_id, members): 90 | for i in range(0, len(members), 100): 91 | api.add_list_members(list_id=list_id, user_id=members[i: i + 100]) 92 | 93 | 94 | def get_current_user(): 95 | result = [] 96 | current_users = r'results\current_users.txt' 97 | with open(current_users, 'r', encoding='utf_8') as input: 98 | for line in input: 99 | user, id = line.strip().split(',') 100 | result.append((user, id)) 101 | return result 102 | 103 | 104 | def select_top_lists(all_lists, 105 | avg_sec_word_count, 106 | avg_member_score, 107 | avg_subscriber_count, 108 | avg_owner_strenght, 109 | count=1000): 110 | for i in all_lists: 111 | all_lists[i]['score'] = (all_lists[i]['sec_word_count'] / avg_sec_word_count) * \ 112 | (all_lists[i]['member_score'] / avg_member_score) * \ 113 | (all_lists[i]['subscriber_count'] / avg_subscriber_count) * \ 114 | (all_lists[i]['owner_strength'] / avg_owner_strenght) 115 | lists_rank = sorted(all_lists.items(), key=lambda x: x[1]['score'], 
reverse=True) 116 | discard_index = len(all_lists) 117 | counter = 0 118 | for i in lists_rank: 119 | if i[1]['score'] == 0: 120 | discard_index = counter 121 | break 122 | counter += 1 123 | 124 | if discard_index > count: 125 | discard_index = count 126 | 127 | lists_rank = lists_rank[:discard_index] 128 | 129 | return lists_rank 130 | 131 | 132 | def dump_cti_experts(api, base_dir, test_run=False): 133 | # If you want to 134 | top_users_res = [] 135 | user_dir = os.path.join(base_dir, 'users') 136 | user_status_dir = os.path.join(base_dir, r'users\status') 137 | list_dir = os.path.join(base_dir, 'lists') 138 | 139 | if not os.path.exists(user_status_dir): 140 | os.makedirs(user_status_dir) 141 | if not os.path.exists(list_dir): 142 | os.makedirs(list_dir) 143 | 144 | # For each CTI expert (in the input list), dump the info of all the lists that the expert is a member into a file 145 | for user, user_id in get_current_user(): 146 | try: 147 | user_lists_dump_path = os.path.join(user_dir, user_id + '.user.csv') 148 | if not os.path.exists(user_lists_dump_path): 149 | lists = get_user_lists(api, user) 150 | dump_user_lists(user, lists, user_lists_dump_path) 151 | time.sleep(2) 152 | 153 | if test_run: 154 | break 155 | 156 | except Exception as exp: 157 | print('ERROR {}:{}'.format(user, exp.reason)) 158 | 159 | list_rank = [] 160 | specific_words = ['ioc', 161 | 'malware', 162 | 'Indicator.?of.?Compromise', 163 | 'threat.?hunt', 164 | 'threat.?hunt', 165 | 'phishing.?hunt', 166 | 'phish.?hunt', 167 | 'threat.?int', 168 | 'threat.?research', 169 | 'ransomware', 170 | 'mal.?doc'] 171 | 172 | generic_words = ['info.?sec', 173 | 'cyber.?sec', 174 | 'security', 175 | 'ransomware'] 176 | 177 | specific_regex_rule = re.compile('|'.join(specific_words), re.IGNORECASE) 178 | generic_regex_rule = re.compile('|'.join(generic_words), re.IGNORECASE) 179 | 180 | # sub_scores: number of relevant words, number_follower/log(number_followers), number_subscriber, owner_strength 181 | # score is a product of the above sub scores 182 | # each sub score must be in the range [0,+infinity), however, average must be 1 183 | # sub scores that are above average increase the total score 184 | 185 | all_lists = {} 186 | total_sec_word_count = 0 187 | total_member_score = 0 188 | total_subscriber_count = 0 189 | total_owner_strength = 0 190 | 191 | 192 | for file in glob.glob(os.path.join(user_dir, "*.user.csv")): 193 | with open(file, 'r', encoding='utf_8') as input_file: 194 | reader = csv.reader(input_file) 195 | next(reader) 196 | counter = 0 197 | for row in reader: 198 | counter += 1 199 | id = row[0] 200 | if id not in all_lists: 201 | all_lists[id] = {} 202 | all_lists[id]['id'] = row[0] 203 | all_lists[id]['name'] = row[1] 204 | all_lists[id]['text'] = row[1] + ' ' + row[3] 205 | all_lists[id]['sec_word_count'] = len(specific_regex_rule.findall(all_lists[id]['text'])) * 3 + \ 206 | len(generic_regex_rule.findall(all_lists[id]['text'])) 207 | total_sec_word_count += all_lists[id]['sec_word_count'] 208 | 209 | all_lists[id]['member_count'] = int(row[4]) 210 | if all_lists[id]['member_count'] > 1: 211 | all_lists[id]['member_score'] = all_lists[id]['member_count'] / math.log2( 212 | all_lists[id]['member_count']) 213 | else: 214 | all_lists[id]['member_score'] = 0 215 | total_member_score += all_lists[id]['member_score'] 216 | 217 | all_lists[id]['subscriber_count'] = int(row[5]) 218 | all_lists[id]['subscriber_count'] += 1 219 | total_subscriber_count += all_lists[id]['subscriber_count'] 220 | 221 | 
all_lists[id]['owner_screen_name'] = row[9] 222 | all_lists[id]['owner_followers_count'] = int(row[12]) 223 | all_lists[id]['owner_friends_count'] = int(row[13]) 224 | if all_lists[id]['owner_friends_count'] >= 1: 225 | all_lists[id]['owner_strength'] = math.log2( 226 | (all_lists[id]['owner_followers_count'] + all_lists[id]['owner_friends_count']) / 227 | all_lists[id]['owner_friends_count']) 228 | else: 229 | all_lists[id]['owner_strength'] = 0 230 | total_owner_strength += all_lists[id]['owner_strength'] 231 | 232 | if test_run: 233 | if counter > 10: 234 | break 235 | 236 | avg_sec_word_count = total_sec_word_count / len(all_lists) 237 | avg_member_score = total_member_score / len(all_lists) 238 | avg_subscriber_count = total_subscriber_count / len(all_lists) 239 | avg_owner_strength = total_owner_strength / len(all_lists) 240 | 241 | top_lists = select_top_lists(all_lists, 242 | avg_sec_word_count, 243 | avg_member_score, 244 | avg_subscriber_count, 245 | avg_owner_strength) 246 | 247 | counter = 0 248 | 249 | # Dump the latest 1000 timeline tweets of each top lists 250 | for top_list in top_lists: 251 | try: 252 | print(top_list[0] + '\t' + top_list[1]['owner_screen_name'] + '\t\t' + top_list[1]['name']) 253 | file_name = top_list[0] + '---' + top_list[1]['owner_screen_name'] + '---' + top_list[1][ 254 | 'name'].replace('/', '-') + '.dump.list.csv' 255 | list_tweets_file_path = os.path.join(list_dir, file_name) 256 | if not os.path.exists(list_tweets_file_path): 257 | tweets = get_list_timeline(api, top_list[0], 1000) 258 | dump_list_tweets(top_list[0], tweets, list_tweets_file_path) 259 | counter += 1 260 | 261 | if test_run: 262 | if counter > 10: 263 | break 264 | else: 265 | if counter > 150: 266 | break 267 | except Exception as exp: 268 | print('ERROR processing tweets of ' + str(top_list[0])) 269 | print(exp) 270 | 271 | # For each List, count the number of IoCs appread in the dump of the latest 1000 timeline tweets 272 | top_lists_iocs = {} 273 | ioc_global_freq = {} 274 | count = 0 275 | for file in glob.glob(os.path.join(list_dir, "*.dump.list.csv")): 276 | count += 1 277 | name = os.path.basename(file) 278 | print('processing ' + name) 279 | id = name.split('---')[0] 280 | if id not in top_lists_iocs: 281 | top_lists_iocs[id] = set() 282 | with open(file, 'r', encoding='utf_8') as input_file: 283 | for line in input_file: 284 | try: 285 | tweet = json.loads(line) 286 | iocs = iocextract.extract_iocs(tweet['text'], refang=True) 287 | for ioc in iocs: 288 | if ioc not in tweet['text']: 289 | top_lists_iocs[id].add(ioc) 290 | if ioc not in ioc_global_freq: 291 | ioc_global_freq[ioc] = 1 292 | else: 293 | ioc_global_freq[ioc] += 1 294 | except Exception as exp: 295 | print('ERROR processing ' + name + ' tweet: ' + line) 296 | 297 | # Calculate the uniqueness score for each of the lists 298 | list_ranking = {} 299 | average_score = 0 300 | for list_id, iocs in top_lists_iocs.items(): 301 | total_score = 0 302 | for ioc in iocs: 303 | ioc_count = ioc_global_freq[ioc] + 1 304 | # total_score += 1 / math.log2(ioc_count) 305 | total_score += 1 / ioc_count 306 | list_ranking[list_id] = total_score 307 | 308 | average_score += total_score 309 | 310 | average_score = average_score / len(list_ranking) 311 | 312 | list_rank_ioc = [] 313 | for top_list in top_lists: 314 | if top_list[0] in list_ranking: 315 | top_list[1]['ioc_uniqness'] = list_ranking[top_list[0]] / average_score 316 | top_list[1]['score'] *= top_list[1]['ioc_uniqness'] 317 | list_rank_ioc.append(top_list[1]) 318 | 319 | 
ranked_list = sorted(list_rank_ioc, key=lambda x: x['score'], reverse=True) 320 | 321 | with open(os.path.join(list_dir, 'list_ioc_rank'), 'w', encoding='utf_8') as rank_output: 322 | for list in ranked_list: 323 | rank_output.write( 324 | '{},{},{},{}\n'.format(list['id'], list['owner_screen_name'], list['name'], list['score'])) 325 | 326 | member_scores = {} 327 | for list in ranked_list: 328 | try: 329 | 330 | file_name = list['id'] + '---' + list['owner_screen_name'] + '---' + list['name'].replace('/', 331 | '-') + '.members.list.csv' 332 | print('Getting members of ' + list['id']) 333 | list_members_file_path = os.path.join(list_dir, file_name) 334 | if not os.path.exists(list_members_file_path): 335 | members = get_list_members(api, list['id']) 336 | print('List members count ' + str(len(members))) 337 | dump_list_users(list['id'], members, list_members_file_path) 338 | else: 339 | members =[] 340 | with open (list_members_file_path, 'r') as member_file: 341 | for line in member_file: 342 | member = Dummy() 343 | member_json_obj = json.loads(line) 344 | member.screen_name = member_json_obj['screen_name'] 345 | member.id = member_json_obj['id'] 346 | members.append(member) 347 | 348 | for member in members: 349 | if member.id not in member_scores: 350 | member_scores[member.id] = {'score': 0, 'screen_name': member.screen_name, 'lists': set()} 351 | 352 | member_scores[member.id]['lists'].add(list['id']) 353 | member_scores[member.id]['score'] += list['score'] 354 | print('All members count ' + str(len(member_scores))) 355 | except Exception as exp: 356 | print('ERROR getting members ' + list_id) 357 | 358 | member_ranks = sorted(member_scores.items(), key=lambda x: x[1]['score'], reverse=True) 359 | 360 | with open(os.path.join(base_dir, 'top_users'), 'w', encoding='utf_8', newline='') as top_users_output: 361 | writer = csv.writer(top_users_output) 362 | writer.writerow(['id', 'screen_name', 'score', 'lists']) 363 | for member in member_ranks: 364 | writer.writerow([member[0], member[1]['screen_name'], member[1]['score'], member[1]['lists']]) 365 | 366 | 367 | member_ranks = member_ranks[:1000] 368 | 369 | print("Top 1k users before considering users' tweeting history") 370 | print(member_ranks) 371 | 372 | count = 0 373 | # For each user in top_users file 374 | with open(os.path.join(base_dir, 'top_users'), 'r', encoding='utf_8', newline='') as top_users_input: 375 | csv_reader = csv.reader(top_users_input) 376 | next(csv_reader) 377 | user_iocs = {} 378 | ignore = True 379 | for row in csv_reader: 380 | user_id, screen_name, score = row[0], row[1], float(row[2]) 381 | try: 382 | print(str(count) + " - Getting tweets of "+screen_name) 383 | user_tweets_file_path = os.path.join(user_status_dir, '{}_{}_tweets.csv'.format(user_id, screen_name)) 384 | time_now = datetime.datetime.now() 385 | if screen_name not in user_iocs: 386 | user_iocs[screen_name] = {'id': user_id, 'screen_name': screen_name, 'score': score, 'days': {}} 387 | 388 | # If we have not the tweet history of the user, collect the latest of their 400 timeline tweets 389 | if not os.path.exists(user_tweets_file_path): 390 | all_tweets = get_user_timeline(api, screen_name, 400) 391 | # write the csv 392 | with open(os.path.join(user_status_dir, '{}_{}_tweets.csv'.format(user_id, screen_name)), 'w', 393 | encoding='utf_8') as output_file: 394 | # dump tweets 395 | for i in all_tweets: 396 | output_file.write(json.dumps(i._json) + '\n') 397 | output_file.flush() 398 | else: 399 | ignore = False 400 | all_tweets = [] 401 | 
with open(user_tweets_file_path, 'r', encoding='utf_8') as input_file: 402 | next(input_file) 403 | for line in input_file: 404 | try: 405 | all_tweets.append(json.loads(line)) 406 | except Exception as exp: 407 | print("Error loading tweets in "+ user_tweets_file_path) 408 | 409 | 410 | for tweet in all_tweets: 411 | if not hasattr(tweet, 'text'): 412 | text = tweet['text'] 413 | else: 414 | text = tweet.text 415 | 416 | if not hasattr(tweet, 'created_at'): 417 | created_at = parse(tweet['created_at']) 418 | created_at = created_at.replace(tzinfo=None) 419 | else: 420 | created_at = tweet.created_at 421 | 422 | iocs = iocextract.extract_iocs(text, refang=True) 423 | for ioc in iocs: 424 | if ioc not in text: 425 | day_diff = (time_now - created_at).days 426 | if day_diff < 0: 427 | day_diff = 0 428 | if day_diff not in user_iocs[screen_name]['days']: 429 | user_iocs[screen_name]['days'][day_diff] = set() 430 | 431 | user_iocs[screen_name]['days'][day_diff].add(ioc) 432 | 433 | count += 1 434 | except Exception as exp: 435 | print('Error getting statuses of '+ screen_name) 436 | 437 | if test_run: 438 | if count > 20: 439 | break 440 | else: 441 | if count > 5000: 442 | break 443 | if count % 50 == 0: 444 | 445 | print("\n\n\n\ncurrent number " + str(count)+'\n\n\n\n') 446 | 447 | avg_ioc_score = 0 448 | for screen_name, ioc in user_iocs.items(): 449 | ioc_score = 0 450 | for day, iocs in ioc['days'].items(): 451 | ioc_score += len(iocs) / ((int(day)+1)**(1/3)) 452 | ioc_score += 1 453 | ioc['ioc_score'] = ioc_score 454 | avg_ioc_score += ioc_score 455 | 456 | avg_ioc_score = avg_ioc_score / len(user_iocs) 457 | 458 | for screen_name, ioc in user_iocs.items(): 459 | ioc['ioc_score'] /= avg_ioc_score 460 | ioc['total_score'] = ioc['ioc_score']* ioc['score'] 461 | 462 | final_user_rank = sorted(user_iocs.items(), key=lambda x:x[1]['total_score'], reverse=True) 463 | 464 | with open(os.path.join(base_dir, 'top_users_final'), 'w', encoding='utf_8', newline='') as top_users_output: 465 | writer = csv.writer(top_users_output) 466 | writer.writerow(['id', 'screen_name', 'score','ioc_score', 'final_score','days']) 467 | for screen_name, details in final_user_rank: 468 | writer.writerow([details['id'], 469 | details['screen_name'], 470 | details['score'], 471 | details['ioc_score'], 472 | details['total_score'], 473 | json.dumps({x:len(y) for x,y in details['days'].items()})]) 474 | top_users_res.append((details['screen_name'],details['id'])) 475 | 476 | return top_users_res 477 | 478 | --------------------------------------------------------------------------------
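
The list score computed in select_top_lists() (CTI_expert_finder.py) is the product of four sub-scores, each divided by its average over all candidate lists, so a sub-score above its average pushes the product up and one below pulls it down. A toy illustration of that formula, with made-up numbers rather than values from any real list:

# Toy numbers only; in dump_cti_experts() the averages are computed over all collected lists.
sec_word_count, avg_sec_word_count = 6, 2.0        # weighted keyword hits in the list name/description
member_score, avg_member_score = 40.0, 50.0        # member_count / log2(member_count)
subscriber_count, avg_subscriber_count = 12, 4.0   # subscriber_count (incremented by one)
owner_strength, avg_owner_strength = 3.0, 3.0      # log2((owner followers + friends) / friends)

score = (sec_word_count / avg_sec_word_count) * \
        (member_score / avg_member_score) * \
        (subscriber_count / avg_subscriber_count) * \
        (owner_strength / avg_owner_strength)
print(score)  # 3.0 * 0.8 * 3.0 * 1.0 = 7.2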
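
Similarly, below is a minimal sketch of scoring a single tweet with the random-forest ensemble from CTI_classifer.py, mirroring the commented-out classification block in main.py. It is illustrative only: it assumes the training CSV read by construct_classifier() (dataset\training-set.csv) is available, and it fills the hashtag-count and mention-count features with 0 because there is no live tweet object here (in main.py they come from tweet.entities).

import statistics
import numpy
from CTI_classifer import construct_classifier, vectorize

classifiers, wordlist = construct_classifier()   # trains the 11 random forests on the training CSV

tweet_text = "possible #phishing kit hosted at hxxp://example[.]com/login"
vector = vectorize(tweet_text, wordlist)         # bag-of-words counts over the training vocabulary
vector.append(0)                                 # hashtag-count feature (placeholder value)
vector.append(0)                                 # user-mention-count feature (placeholder value)
vector = numpy.array(vector).reshape(1, -1)

votes = [clf.predict(vector)[0] for clf in classifiers]
print("Prediction:", statistics.mode(votes))     # majority vote across the ensemble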