├── Corpus ├── .classpath ├── .project ├── CodeJamMetadata.json ├── allusers.py ├── allusers2.py ├── cjcompile.py ├── cjscrape.py ├── cjstats.py └── users │ ├── 1128486.txt │ ├── 1145485.txt │ ├── 1150485.txt │ ├── 1150486.txt │ ├── 1158485.txt │ ├── 1327485.txt │ ├── 1460488.txt │ ├── 1645485.txt │ ├── 1781488.txt │ ├── 1835486.txt │ ├── 1836486.txt │ ├── 1842485.txt │ ├── 186264.txt │ ├── 188266.txt │ ├── 189252.txt │ ├── 204113.txt │ ├── 2075486.txt │ ├── 2270488.txt │ ├── 2418487.txt │ ├── 243103.txt │ ├── 2433487.txt │ ├── 2434486.txt │ ├── 2437488.txt │ ├── 2437491.txt │ ├── 2442487.txt │ ├── 2974486.txt │ ├── 2984486.txt │ ├── 2994486.txt │ ├── 3004486.txt │ ├── 3014486.txt │ ├── 3024486.txt │ ├── 311101.txt │ ├── 32001.txt │ ├── 32002.txt │ ├── 32005.txt │ ├── 32008.txt │ ├── 32010.txt │ ├── 32011.txt │ ├── 32013.txt │ ├── 32015.txt │ ├── 32016.txt │ ├── 32017.txt │ ├── 433101.txt │ ├── 544101.txt │ ├── 619102.txt │ ├── 635101.txt │ ├── 635102.txt │ ├── 639102.txt │ ├── 7214486.txt │ ├── 801485.txt │ ├── 90101.txt │ ├── 975485.txt │ └── users.txt ├── Naive Baseline ├── .classpath ├── .gitignore ├── .project └── src │ ├── ARFFFactory.java │ ├── ARFFFactory2.java │ ├── ARFFFactory3.java │ ├── ARFFFactory4.java │ ├── AbstractExtractor.java │ ├── CodeBlock.java │ ├── ControlStatement.java │ ├── Driver.java │ ├── ExtractorC.java │ ├── ExtractorCPP.java │ ├── FeatureSet.java │ ├── Loops.java │ ├── MultiSet.java │ ├── ReservedC.java │ ├── ReservedCPP.java │ ├── WhiteSpace.java │ ├── c_reserved_words.txt │ └── cpp_reserved_words.txt ├── README.md └── SCAA ├── .classpath ├── .gitignore ├── .project ├── .settings └── org.eclipse.jdt.core.prefs ├── commons-exec-1.2.jar ├── commons-lang3-3.3.2.jar ├── commons.io_2.0.1.jar ├── javacsv.jar ├── src ├── AuthorClassification.java ├── AuthorClassificationRelaxed.java ├── BigramExtractor.java ├── CheckFiles.java ├── DatasetCreator.java ├── DepthASTNode.java ├── DistanceCalculations.java ├── FeatureCalculators.java ├── 
FeatureExtractor.java ├── FeatureExtractorConcurrent.java ├── FeatureExtractorInfoGain.java ├── IndexWrapper.java ├── LevenshteinDistance.java ├── MergeArffFiles.java ├── MergeArffFilesNew.java ├── ProblemSetWriter.java ├── RelaxedEvaluation.java ├── RemoveComments.java ├── Util.java └── WholeWordIndexFinder.java └── weka.jar /Corpus/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /Corpus/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Corpus 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /Corpus/allusers.py: -------------------------------------------------------------------------------- 1 | from urllib import urlopen 2 | import json 3 | 4 | # 5 | # Gets all users who participated in the Google Code Jam competition. 6 | # Posts results in a single text file. 
#

# `os` is needed to locate CodeJamMetadata.json next to this script.  The
# original script called os.path without importing the module, which raised
# NameError before any user was written.
import os

users = {}  # dictionary of all discovered users (username -> True)


# adds all users who participated in the given round to the dictionary
def get_all_users(round_id, num_players):
    """Record every username on the scoreboard of one round in `users`.

    round_id    -- contest id of the round, as a string (it is concatenated
                   into the scoreboard URL)
    num_players -- number of players in the round; anything int() accepts

    Returns nothing; the module-level `users` dictionary is updated in place.
    """
    # The scoreboard is requested in steps of 30 positions, starting at
    # position 1.  The stop value is num_players + 1 so that a last page that
    # starts exactly at position num_players (e.g. 31 players -> pages at 1
    # and 31) is not skipped.
    for pos in range(1, int(num_players) + 1, 30):
        meta_url = "http://code.google.com/codejam/contest/scoreboard/" \
            + "do?cmd=GetScoreboard&contest_id=" \
            + round_id \
            + "&show_type=all&start_pos=" \
            + str(pos) \
            + "&views_time=1&views_file=0&csrfmiddlewaretoken="
        meta_json = json.loads(urlopen(meta_url).read())
        for row in meta_json['rows']:
            users[row['n']] = True  # 'n' is the username field of a row


# load the competition metadata that sits next to this script
metadata_path = os.path.dirname(os.path.realpath(__file__)) + "/CodeJamMetadata.json"
with open(metadata_path) as metadata_handle:
    metadata = json.loads(metadata_handle.read())

# loop through all years
for year_json in metadata['competitions']:
    qual_round = year_json['round'][0]  # get only the qualification round
    # get users for the qualification round of the given year
    get_all_users(qual_round['contest'], qual_round['numPlayers'])

# write out all users, one per line; the output file is only opened once the
# scrape has finished, so a failed run no longer leaves a truncated users.txt
with open('users/users.txt', 'w') as user_file:
    for user in users:
        user_file.write(user)
        user_file.write('\n')

# --------------------------------------------------------------------------------
# /Corpus/allusers2.py:
# --------------------------------------------------------------------------------
from urllib import urlopen
import json
import os

#
# Gets all users who participated in the Google Code Jam competition.
# Posts results according to round number.
#

# writes a list of all users who participated in the round
def get_all_users(round_id, num_players):
    """Write the usernames of one round to users/<round_id>.txt.

    round_id    -- contest id of the round, as a string (used both in the
                   scoreboard URL and in the output file name)
    num_players -- number of players in the round; anything int() accepts

    Every requested URL and every username found is also printed.
    """
    # `with` closes the round file even if a request or a parse fails midway
    with open('users/' + round_id + '.txt', 'w') as round_file:

        # loop through each page of users; the scoreboard is requested in
        # steps of 30 positions starting at 1.  The stop value is
        # num_players + 1 so that a last page starting exactly at position
        # num_players (e.g. 31 players -> pages at 1 and 31) is not skipped.
        for pos in range(1, int(num_players) + 1, 30):
            meta_url = "http://code.google.com/codejam/contest/scoreboard/" \
                + "do?cmd=GetScoreboard&contest_id=" \
                + round_id \
                + "&show_type=all&start_pos=" \
                + str(pos) \
                + "&views_time=1&views_file=0&csrfmiddlewaretoken="
            print(meta_url)
            meta_json = json.loads(urlopen(meta_url).read())

            # find and print usernames
            for row in meta_json['rows']:
                username = row['n']
                round_file.write(username)
                round_file.write('\n')
                print(username)


# load JSON
metadata_path = os.path.dirname(os.path.realpath(__file__)) + "/CodeJamMetadata.json"
with open(metadata_path) as metadata_handle:
    metadata = json.loads(metadata_handle.read())

# loop through all years
for year_json in metadata['competitions']:
    for round_json in year_json['round']:
        get_all_users(round_json['contest'], round_json['numPlayers'])

# --------------------------------------------------------------------------------
# /Corpus/cjcompile.py:
# --------------------------------------------------------------------------------
import os
import sys

#
# Usage: python /directory/path/to/cjcompile.py [compiler flags]
#
# Compiles all C/C++ source files in the current working directory.
# Also recursively compiles all C/C++ source files in all subdirectories.
#

# The compiler is started with an argument list instead of a shell command
# line, so file names containing spaces or shell metacharacters are passed
# through untouched (the corpus file names are built from scraped usernames).
import subprocess

# compiler flags are everything after the script name; slicing makes a copy,
# so sys.argv itself is left unmodified
flags = sys.argv[1:]

# go through all files under the root directory
for (path, dirs, files) in os.walk('.'):
    for f in files:
        # check if file is a C or C++ file
        if not f.endswith(('.c', '.cpp')):
            continue

        # C files go to gcc, C++ files to g++
        compiler = "gcc" if f.endswith('.c') else "g++"

        # full directory path of the source file
        source = path + '/' + f

        # full directory path of the executable (file extension removed)
        executable = path + '/' + os.path.splitext(f)[0]

        # compiler flags (specified in the arguments) go last, as before
        command = [compiler, source, "-o", executable] + flags

        # compile; a missing compiler is reported and the walk continues,
        # like the shell's "command not found" did with os.system
        try:
            subprocess.call(command)
        except OSError as error:
            print("error: could not run %s: %s" % (compiler, error))
        print(' '.join(command))

# --------------------------------------------------------------------------------
# /Corpus/cjscrape.py:
# --------------------------------------------------------------------------------
from urllib import urlopen
from urllib import urlretrieve
import json
import sys
import os
import zipfile
import shutil
import multiprocessing

#
# Scrapes Google Code Jam data, and extracts the C/C++/Python source files.
#
# The directory structure and naming convention of the data is as follows:
#
# ./codejamfolder/ |--> c/   | --> username0 | --> p[problem_number].[user_name]0.c
#                  |         |               | --> p2453486.Bob0.c
#                  |         |               | --> etc...
#                  |         |
#                  |         | --> name0     | --> etc...
#                  |         |               | --> etc...
#                  |         |               | --> etc...
#                  |         |
#                  |         | --> another0  | --> etc...
#                  |         | --> etc...
#                  |
#                  |--> cpp/ | --> etc...    | --> etc...
#                  |         |               | --> etc...
#                  |         |
#                  |         | --> etc...    | --> etc...
#                  |
#                  |--> py/  | --> etc...    | --> etc...
#

# returns the URL to download the user submission
def get_download_url(round_id, problem_id, username):
    """Build the GetSourceCode URL for one user's submission to one problem.

    All three arguments are strings and are concatenated into the query
    string as given.
    """
    return "http://code.google.com/codejam/contest/scoreboard/do?cmd=GetSourceCode&contest=" \
        + round_id \
        + "&problem=" \
        + problem_id \
        + "&io_set_id=0&username=" \
        + username


# returns 'c', 'cpp' or 'py' for a C/C++/Python source file name, else None
def _source_language(member_name):
    for language in ('c', 'cpp', 'py'):
        if member_name.endswith('.' + language):
            return language
    return None


# creates a directory unless it already exists
def _ensure_dir(directory):
    # Every round is scraped in its own process and they all write below
    # codejamfolder/, so another process may create the directory between an
    # exists() check and makedirs(); that case is not an error.
    try:
        os.makedirs(directory)
    except OSError:
        if not os.path.isdir(directory):
            raise


# copies every C/C++/Python file of a downloaded zip into the corpus folder
def _extract_sources(zip_path, problem_id, username):
    # the handle is closed by `with` even when the zip turns out to be invalid
    with open(zip_path, 'rb') as zip_handle:
        my_zip = zipfile.ZipFile(zip_handle)

        # loop through each file in the zip file
        for my_file in my_zip.namelist():
            language = _source_language(my_file)
            if language is None:
                continue

            # directory for language and author, and the corpus file name
            target_source = 'codejamfolder/' + language + '/' + username + '0'
            file_newname = 'p' + problem_id + '.' + username + '0.' + language
            _ensure_dir(target_source)

            # Write the member's bytes straight to the final name.  Unlike
            # extract-then-rename this never creates paths taken from the
            # archive, and leaves no empty sub-directories behind.
            target_path = target_source + '/' + file_newname
            with open(target_path, 'wb') as target_file:
                target_file.write(my_zip.read(my_file))

            # print location of extracted source file
            print(target_path)
            sys.stdout.flush()


# scrapes the C/C++/Python files of the given round
def scrape(round_id, problems, script_path):
    """Download and unpack every submission of one round.

    round_id    -- contest id of the round (string)
    problems    -- list of problem dictionaries; only their 'id' is read
    script_path -- directory holding users/<round_id>.txt, the list of
                   usernames to try

    Sources end up in codejamfolder/<language>/<username>0/.  Zips are kept
    in a per-round temp directory that is removed when the function ends.
    """
    # load list of users
    with open(script_path + '/users/' + round_id + '.txt', 'r') as user_file:
        users = user_file.read().splitlines()

    # temp directory for storing zips; named before the loops so the cleanup
    # below works even when there is nothing to download
    tempdir = round_id + 'temp'

    try:
        # loop through problems in the round
        for problem_json in problems:
            problem_id = problem_json['id']

            # loop through users who participated in the round
            for username in users:
                download_url = get_download_url(round_id, problem_id, username)

                # print and flush URL
                print(download_url)
                sys.stdout.flush()

                if not os.path.exists(tempdir):
                    os.makedirs(tempdir)

                # download zip
                target_zip = tempdir + '/' + problem_id + '.' + username + '0.zip'
                urlretrieve(download_url, target_zip)

                # try-except in case of a bad header; can happen if the user
                # didn't do a problem.  Exception (not a bare except) so that
                # Ctrl-C and SystemExit still stop the scraper.
                try:
                    _extract_sources(target_zip, problem_id, username)
                except Exception:
                    print("error: %s" % (sys.exc_info()[0],))
                    sys.stdout.flush()
    finally:
        # delete temp directory
        if os.path.exists(tempdir):
            shutil.rmtree(tempdir)


# main section of script
if __name__ == '__main__':
    script_path = os.path.dirname(os.path.realpath(__file__))
    with open(script_path + "/CodeJamMetadata.json") as metadata_handle:
        metadata = json.loads(metadata_handle.read())

    scrapers = []

    # loop through years
    for year_json in metadata['competitions']:

        # loop through rounds
        for round_json in year_json['round']:

            # run scraper on current round, one process per round
            scraper = multiprocessing.Process(
                target=scrape,
                args=(round_json['contest'], round_json['problems'], script_path))
            scraper.start()
            scrapers.append(scraper)

    # wait for every round to finish
    for scraper in scrapers:
        scraper.join()

# --------------------------------------------------------------------------------
# /Corpus/cjstats.py:
# --------------------------------------------------------------------------------
import os
import sys
import re
import json

#
# Counts number of files per
# round based off filename.
# Run in root directory of files you want to search.
# CodeJamMetadata.json must be in the same directory as this script.
#

def get_problem_id(filename):
    """Return the first run of digits in `filename`, or None if it has none.

    For corpus files named p<problem_id>.<username>0.<ext> this is the
    problem id, e.g. 'p2453486.Bob0.c' -> '2453486'.
    """
    match = re.search(r'[0-9]+', filename)
    if match is None:
        # not a corpus file (no digits at all); the original crashed here
        return None
    return match.group()


def get_username(filename):
    """Return the author part of a corpus file name.

    The extension and the leading 'p<problem_id>.' are removed, e.g.
    'p2453486.Bob0.c' -> 'Bob0'.
    """
    # The original appended the stem to the full name ('+=' instead of '='),
    # which produced values such as 'Bob0.cBob0'.
    stem = os.path.splitext(filename)[0]
    # anchored, so a 'p<digits>.' sequence inside a username is left alone
    return re.sub(r'^p[0-9]+\.', '', stem)


def main():
    """Print, for every round in the metadata, its description and the
    number of distinct authors that have files under the current directory."""
    metadata_path = os.path.dirname(os.path.realpath(__file__)) + "/CodeJamMetadata.json"
    with open(metadata_path) as metadata_handle:
        metadata = json.loads(metadata_handle.read())

    prob_to_round = {}  # problem id -> index of its round
    round_users = []    # per round: set of usernames seen
    round_to_desc = []  # per round: description text

    for year_json in metadata['competitions']:
        for round_json in year_json['round']:
            round_index = len(round_users)
            round_users.append(set())
            round_to_desc.append(round_json['desc'])
            for problem_json in round_json['problems']:
                prob_to_round[problem_json['id']] = round_index

    # go through all files under the root directory
    for (path, dirs, files) in os.walk('.'):
        for f in files:
            p_id = get_problem_id(f)
            # skip files that do not belong to a known problem instead of
            # aborting the whole count
            if p_id not in prob_to_round:
                continue
            round_users[prob_to_round[p_id]].add(get_username(f))

    for description, found_users in zip(round_to_desc, round_users):
        print(description)
        print(len(found_users))


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /Corpus/users/1158485.txt:
# --------------------------------------------------------------------------------
# 1 | linguo
# 2 | nika
# 3 | winger
# 4 | zyz915
# 5 | misof
# 6 | andrewzta
# 7 | rng..58
# 8 | mystic
# 9 | acrush
# 10 | natalia
# 11 |
hanshuai 12 | meret 13 | darnley 14 | eatmore 15 | ilyakor 16 | g201513 17 | ashmelev 18 | Egor 19 | dolphinigle 20 | omeometo 21 | ilyaraz 22 | Bin.Jin 23 | vepifanov 24 | neal.wu 25 | ir5 26 | RAD. 27 | Palmtenor 28 | cgy4ever 29 | pashka 30 | iddaga 31 | voover 32 | ogiekako 33 | Al.Cash 34 | Maja 35 | pdallago 36 | PaulJefferys 37 | surwdkgo 38 | fagu 39 | chokudai 40 | Ra16bit 41 | stgatilov 42 | Eryx 43 | hos.lyric 44 | mikhailOK 45 | SergeyFedorov 46 | vot 47 | iwi 48 | tikitikirevenge 49 | SergeyRogulenko 50 | tos.lunar 51 | anari 52 | dasko 53 | Jonick 54 | seanwu 55 | Vasyl 56 | open 57 | KAP 58 | izulin 59 | Yaro 60 | Khuc.Anh.Tuan 61 | KennyHORROR 62 | uwi 63 | jh1 64 | Landertxu 65 | Tomato 66 | a70babat 67 | levlam 68 | tom612pl 69 | Stigius 70 | jinlin 71 | qizichao 72 | Klinck 73 | kitamasa 74 | yczhang 75 | Ahyangyi 76 | Gennady.Korotkevich 77 | dAnton 78 | theycallhimtom 79 | earl 80 | slippy 81 | peter50216 82 | JAPLJ 83 | HiltonLange 84 | dan19 85 | WSX 86 | Wataru 87 | Vedensky 88 | Vytis 89 | pawelparys 90 | Milanin 91 | bayleef 92 | wata 93 | Fumiya 94 | emaxx 95 | niyaznigmatul 96 | tckwok 97 | Fdg 98 | masha.and.beer 99 | vsb 100 | watashi 101 | dzhulgakov 102 | Akim 103 | yeputons 104 | Louty 105 | RAVEman 106 | ConanKudo247 107 | qwaker.00 108 | Burunduk1 109 | lqp1831 110 | iwiskimo 111 | Farmer.John 112 | sevenkplus 113 | eduardische 114 | navi 115 | X.Ray 116 | monsoon 117 | LinesPrower 118 | GarnetCrow 119 | sdya 120 | Seyaua 121 | lidaobing 122 | bmerry 123 | blmarket 124 | flashmt 125 | R.R. 
126 | venco5 127 | kubus 128 | aleksey 129 | sisu 130 | alantian 131 | LoRdTaPaKaH 132 | MaxBuzz 133 | wychen 134 | SmileIJP 135 | temper 136 | gaoxin 137 | szsz 138 | Fcdkbear 139 | PhilipPV 140 | ONP 141 | mk.al13n 142 | ACube 143 | exod40 144 | W.Junqiao 145 | Sammarize 146 | MRoizner 147 | jimison 148 | Gassa 149 | RoBa 150 | polesp 151 | kappahouse 152 | Pedro.Bello 153 | stolis 154 | Sempr 155 | Koyaah 156 | eaglet 157 | Connector 158 | EvgeniusASPX 159 | TangKy 160 | emppu 161 | enot.1.10 162 | Bicheng.Cao 163 | DKI 164 | austrin 165 | paladin8 166 | KOTEHOK 167 | Burunduk2 168 | Jonasz 169 | ll931110 170 | charliez 171 | kcm1700 172 | shik 173 | ivan.popelyshev 174 | YiningWang 175 | xreborner 176 | Progbeat 177 | burdakovd 178 | alexkas 179 | pieguy 180 | gawry 181 | Anton.Lunyov 182 | WhiteBear 183 | SoCalledName 184 | XiaoZiqian 185 | akira.nekoneko 186 | kcd 187 | msg555 188 | kia 189 | yayamao 190 | chenwang0517 191 | 2rf 192 | foison 193 | w10d 194 | DjinnKahn 195 | butterfly21 196 | flowlight 197 | huameng 198 | zaq1xsw2tktk 199 | DD.tt 200 | gilesg 201 | nk.karpov 202 | qin 203 | DamianS 204 | janq 205 | thinfaifai 206 | Zhiwei.Li 207 | simp1eton 208 | dano 209 | xlmj531 210 | This 211 | fidels 212 | Alexander86 213 | gustav 214 | Aiz 215 | pasin30055 216 | fero 217 | Pompon 218 | vpj 219 | boris4 220 | zhendongjia 221 | nihao 222 | domeng 223 | DmitryEgorov 224 | Pasqual45 225 | rafaeldaigo 226 | bcloud7 227 | han6 228 | Kirino 229 | xiaowuc1 230 | Gluk 231 | Joshik 232 | Lovro 233 | Dragoon 234 | int9 235 | tsukuno 236 | Ljq 237 | megaterik 238 | tanakh 239 | zibada 240 | VulpesX 241 | SmartSchizo 242 | drazil 243 | Ignat 244 | fuch 245 | VArtem 246 | SkidanovAlexander 247 | Zlobober 248 | VITAKS 249 | guilherme 250 | defrager 251 | imbanoob 252 | humblefool 253 | sprea 254 | yiuyuho 255 | Yakumo 256 | latsyrc 257 | chEEtah 258 | beingryu 259 | Romka 260 | LayCurse 261 | Sergey.Bankevich 262 | gdiver 263 | mrozik 264 | ronalchn 265 | berger 266 | 
Ostap 267 | hpmv 268 | KevinErikLee 269 | ploh 270 | blando 271 | psir 272 | darkKelvin 273 | dooglius 274 | tantian 275 | alyaxey 276 | Vitaliy 277 | madking 278 | Slimper 279 | cedriclin 280 | dennis.lissov 281 | kuba97531 282 | resty 283 | eagleonhill 284 | pmnox 285 | paisa 286 | AVictor 287 | Yao 288 | EpicWu 289 | cocls 290 | .dP. 291 | goodwind 292 | ush 293 | QuJun 294 | C.A. 295 | ikatanic 296 | oioi98 297 | zouxun 298 | technolt 299 | dalex 300 | hillboy 301 | Sanny 302 | GVS 303 | dj3500 304 | AlexLin 305 | stan 306 | balakrishnan.v 307 | alexmat21 308 | jellies 309 | jackchen92 310 | TripleManiac 311 | moonlight 312 | Dembel 313 | love.wenxuan 314 | SceneTree 315 | wuzhengkai 316 | iscsi 317 | AquaSnail 318 | shadowind 319 | MauricioC 320 | dzetkulict 321 | ramlaf 322 | Tony 323 | zhouxiaobo 324 | radeye 325 | sjtu.pigoneand 326 | eMBe 327 | k.kojima 328 | siavosh 329 | mlwong 330 | Ax.h 331 | shreevatsa 332 | zero.lin 333 | AlexanderL 334 | springegg 335 | LordKlotski 336 | dozingcat 337 | dyukha 338 | narri 339 | naagi 340 | EricStansifer 341 | goober 342 | Michael.Levin 343 | Ahbong 344 | 357 345 | BM954 346 | SeMeKh 347 | Kinan.Sarmini 348 | pablo.aguilar 349 | angwuy 350 | dahlukeh 351 | aditsu 352 | aurinegro 353 | pP5438 354 | liymouse 355 | Myth 356 | Lightmoon 357 | strangecow 358 | Navid 359 | xneby 360 | Nikkolloz 361 | azhai 362 | Peteris 363 | Factorial 364 | random.johnnyh 365 | hmao5 366 | wangchaohui 367 | jaehyunp 368 | sbzlyessit 369 | HerrKanzler 370 | almaz 371 | Akai 372 | ttim 373 | coolzzz 374 | dan.banica 375 | berestinsky 376 | forifchen 377 | Weiqi 378 | solidsnake1905 379 | falcon112358 380 | Jed 381 | vlad 382 | Jrdevil.1984 383 | Hitrez 384 | Tomi 385 | seuamigohenry 386 | Kalmakka 387 | piyifan 388 | cypress 389 | WeiLiu 390 | Davidmg 391 | akaki 392 | ghostgold 393 | Handojo1 394 | jzj 395 | TRYang 396 | AekdyCoin 397 | Naonao 398 | ErickW 399 | shouhm 400 | FloppyCat 401 | TDteach 402 | kmjp 403 | gepa 404 | stupidbear 
405 | Nooodles 406 | whh 407 | ArtHoly 408 | -------------------------------------------------------------------------------- /Corpus/users/1327485.txt: -------------------------------------------------------------------------------- 1 | rng..58 2 | mystic 3 | meret 4 | RAD. 5 | misof 6 | g201513 7 | pashka 8 | vepifanov 9 | eatmore 10 | winger 11 | natalia 12 | acrush 13 | ilyakor 14 | ir5 15 | Bin.Jin 16 | voover 17 | hanshuai 18 | Palmtenor 19 | cgy4ever 20 | ashmelev 21 | Egor 22 | linguo 23 | neal.wu 24 | darnley 25 | zyz915 26 | -------------------------------------------------------------------------------- /Corpus/users/1835486.txt: -------------------------------------------------------------------------------- 1 | EgorKulikov 2 | Eryx 3 | SnapDragon 4 | eatmore 5 | bmerry 6 | Ahyangyi 7 | squark 8 | andrewzta 9 | Vasyl 10 | misof 11 | Gennady.Korotkevich 12 | meret 13 | neal.wu 14 | watashi 15 | sdya 16 | Burunduk1 17 | vepifanov 18 | hos.lyric 19 | fhlasek 20 | chokudai 21 | paladin8 22 | Farmer.John 23 | gawry 24 | Dlougach 25 | rng..58 26 | g201513 27 | dzhulgakov 28 | omeometo 29 | sevenkplus 30 | Gassa 31 | exod40 32 | peter50216 33 | mikhailOK 34 | emaxx 35 | Krazul 36 | cos 37 | Romka 38 | kcm1700 39 | Zhuojie 40 | zithral 41 | mk.al13n 42 | shik 43 | kohyatoh 44 | Murphy 45 | latsyrc 46 | Plagapong 47 | ivan.popelyshev 48 | withleave 49 | Myth5 50 | Tigvarts 51 | Tomato 52 | chEEtah 53 | RAD. 
54 | LayCurse 55 | yeputons 56 | azhai 57 | DmitryEgorov 58 | Chmel.Tolstiy 59 | sisu 60 | ConanKudo247 61 | MiminoCoder 62 | gaoyihan 63 | Vitaliy 64 | Peti 65 | Alexander86 66 | DKI 67 | AdrianKuegel 68 | tikitikirevenge 69 | tkociumaka 70 | ploh 71 | ir5 72 | SergeyFedorov 73 | vlad89 74 | Xhark 75 | cedriclin 76 | gnomnain 77 | wap 78 | stgatilov 79 | iwi 80 | tomerun 81 | Pasqual45 82 | Saeed 83 | WhiteBear 84 | Anton.Lunyov 85 | thocevar 86 | PavelKunyavskiy 87 | MikeMirzayanov 88 | KAP 89 | Maryann 90 | kmod 91 | anton.akhi 92 | Zlobober 93 | LGM 94 | razimantv 95 | mystic 96 | DamianS 97 | Ra16bit 98 | LoRdTaPaKaH 99 | AHdoc 100 | atetubou 101 | gepa 102 | T.Insane 103 | darnley 104 | logicmachine 105 | nik239 106 | CNRICville 107 | chavit92 108 | voover 109 | sky58 110 | Jonasz 111 | LXYXYNT 112 | IvanRomanov 113 | Palmtenor 114 | dzetkulict 115 | donvel 116 | bwps 117 | GeKa 118 | acrush 119 | CaseyRoberts711 120 | Cifko 121 | homo.sapiens 122 | vot 123 | AlexUdalov 124 | lightholy 125 | daidailanlan 126 | fuch 127 | uwi 128 | goober 129 | fuseidenamida 130 | fixme 131 | ashmelev 132 | Astein 133 | vexorian 134 | natalia 135 | Borisp 136 | Vytis 137 | cgy4ever 138 | jakubr 139 | AS1 140 | popwax 141 | Smylic 142 | argentony 143 | aropan 144 | Silence 145 | linguo 146 | izulin 147 | Lipstick 148 | natsugiri 149 | hadi 150 | DAle 151 | pasin30055 152 | alexmat21 153 | pjsdream 154 | zw7840 155 | MrRoach 156 | dj3500 157 | llx 158 | Yao 159 | levlam 160 | lidaobing 161 | kelvinlau 162 | anrieff 163 | elfness 164 | angwuy 165 | ZhukovDmitry 166 | xujie 167 | yvasyliv 168 | WSX 169 | unbing 170 | Dovgaluk 171 | fuwenjie 172 | Kirino 173 | pieguy 174 | Imo 175 | FloppyCat 176 | rohanp77 177 | VulpesX 178 | dg. 179 | Cruiser 180 | tfliao 181 | geeky.elk 182 | exKAZUu 183 | maksay 184 | eldering 185 | .dP. 
186 | RedApe 187 | stqn 188 | Nicolas16 189 | boleyn.su 190 | zhujiaye 191 | DoublePointer 192 | 2rf 193 | RiaD 194 | Kristofer 195 | Wataru 196 | WXYZ 197 | JaapB 198 | wata 199 | s.y 200 | shevchen 201 | marcoskwkm 202 | JAPLJ 203 | nika 204 | h4tguy 205 | pawelparys 206 | lzw75 207 | joy32812 208 | darkKelvin 209 | GlebsHP 210 | Aiz 211 | xreborner 212 | MonEtoile 213 | ania7 214 | Sempr 215 | alberist 216 | olalia 217 | wojteks 218 | sbzlyessit 219 | maciejk 220 | ZaN 221 | kitamasa 222 | NP...np 223 | AekdyCoin 224 | rowdark 225 | eduardische 226 | guilherme 227 | kappahouse 228 | pflueger 229 | technolt 230 | dai1741 231 | AHA 232 | Nabb 233 | eddyferreira 234 | KOTEHOK 235 | Michael.Levin 236 | a3nm 237 | beingryu 238 | jzj 239 | Erop 240 | porker2008 241 | immoonancient 242 | Tojot 243 | Milanin 244 | ZbanIlya 245 | jdmetz 246 | W4yneb0t 247 | wuzhengkai 248 | palacios.roy 249 | ush 250 | Ljq 251 | tos.lunar 252 | chnlich 253 | chaemon 254 | jpaulson 255 | alantian 256 | desertfox 257 | kobra 258 | ytj 259 | hillboy 260 | lcch 261 | Ostap 262 | kawatea 263 | spnautilus 264 | iakudryashov 265 | frank12268 266 | Matej 267 | Fatest 268 | EvgeniSergeev 269 | byte 270 | zbwmqlw 271 | simp1eton 272 | Felix 273 | jki14 274 | Ixanezis 275 | MilesEdgeworth 276 | Nyatl 277 | Mister 278 | AndreySiunov 279 | yaray 280 | Caoqinxiang 281 | pashkal 282 | Garyzx 283 | eXtreme 284 | gaoxin 285 | polmauri 286 | k21 287 | SergeyLazarev 288 | tsukuno 289 | imbanoob 290 | MRoizner 291 | xiaowuc1 292 | ulzha 293 | aussie 294 | Arios 295 | Taehyun 296 | jthread 297 | kmjp 298 | i314 299 | ACube 300 | arihayes 301 | jackchen92 302 | evima 303 | IwfWcf 304 | a180285 305 | Amtrix 306 | lintaor1 307 | superjoel 308 | Psyho 309 | Logic..IU 310 | Cronos 311 | goffrie 312 | kAc 313 | xiaodao 314 | Aleks 315 | kdalex 316 | abrackadabra 317 | ChingYunH 318 | K.A.D.R 319 | yangzhe1991 320 | DarLam 321 | geka666 322 | lrgar 323 | mr146 324 | GreenPeace 325 | Abscp 326 | delta2323 327 | 
kormyshov 328 | iPeter 329 | userresu 330 | Imsbuno 331 | williamljb 332 | Fancy 333 | Lovro 334 | daviduarte 335 | coldcutter 336 | Paf 337 | paramaciej 338 | LinesPrower 339 | EmK 340 | bruce3557 341 | jackfeng 342 | Gigz 343 | dogwalker 344 | zouxun 345 | Dener 346 | impetus 347 | issue9 348 | elizarov 349 | strapahuulius 350 | swgr 351 | LYW 352 | Kriii 353 | marim 354 | svick 355 | HexTree 356 | b0b0b0b 357 | domob 358 | SergGr 359 | real 360 | Robert.Newey 361 | tur.turczyn 362 | hansonw 363 | framalex 364 | Yoshiap 365 | jaigupta 366 | Smitty 367 | jinlin 368 | DaniJVaz 369 | YUKI.M 370 | justever86 371 | JC.C. 372 | LiuKe 373 | UESTC.Fish 374 | SumuduF 375 | yairchu 376 | A.Grishchenko 377 | C0pymaster 378 | nullmineral 379 | gasho 380 | DryukAlex 381 | bnulzm 382 | narri 383 | blue.boy 384 | alpc104 385 | fidels 386 | ytau 387 | Jan.D.Huang 388 | Boping 389 | srh 390 | francoisvn 391 | Jens 392 | RalfKistner 393 | Nikolay.Kalinin 394 | pashka 395 | GunnERs 396 | Gordderp 397 | LazyLie 398 | nip 399 | fswenton 400 | kevinsogo 401 | ssssss 402 | -------------------------------------------------------------------------------- /Corpus/users/2075486.txt: -------------------------------------------------------------------------------- 1 | meret 2 | neal.wu 3 | misof 4 | vepifanov 5 | hos.lyric 6 | bmerry 7 | watashi 8 | SnapDragon 9 | dzhulgakov 10 | eatmore 11 | g201513 12 | Farmer.John 13 | Ahyangyi 14 | exod40 15 | EgorKulikov 16 | Burunduk1 17 | gawry 18 | sdya 19 | rng..58 20 | chokudai 21 | Dlougach 22 | paladin8 23 | Eryx 24 | Vasyl 25 | fhlasek 26 | -------------------------------------------------------------------------------- /Corpus/users/243103.txt: -------------------------------------------------------------------------------- 1 | bmerry 2 | qizichao 3 | winger 4 | Ahyangyi 5 | misof 6 | rem 7 | kia 8 | mystic 9 | marek.cygan 10 | dzhulgakov 11 | Vitaliy 12 | wata 13 | kalinov 14 | ACRush 15 | AdrianKuegel 16 | Myth 17 | pashka 18 | ZhukovDmitry 
19 | Khuc.Anh.Tuan 20 | Jeru 21 | PaulJefferys 22 | ploh 23 | emaxx 24 | FloppyCat 25 | nika 26 | alyaxey 27 | halyavin 28 | hos.lyric 29 | Burunduk1 30 | Jiunru 31 | moon5ckq 32 | KOTEHOK 33 | KAP 34 | Imba 35 | linyufly 36 | .Invader 37 | meret 38 | defrager 39 | xlmj531 40 | andrewzta 41 | Lovro 42 | overwise 43 | AS1 44 | marcina 45 | JongMan 46 | ikatanic 47 | ftc 48 | WangDong 49 | ShangJingbo 50 | natalia 51 | logistic 52 | Cheryl 53 | iwi 54 | MikeMirzayanov 55 | ScaleRhyme 56 | Zig 57 | vlad89 58 | blueblimp 59 | Huayang 60 | Lunarmony 61 | neal.wu 62 | Yao 63 | fuwenjie 64 | austrin 65 | diver 66 | LucaB 67 | SpaceFlyer 68 | stone 69 | Helenjyun 70 | MRoizner 71 | yangzhe1990 72 | eatmore 73 | beingryu 74 | tanakh 75 | igorcanadi 76 | RAD 77 | Alexus 78 | EmK 79 | Farmer.John 80 | Michael.Levin 81 | Yarin 82 | Borisp 83 | arti 84 | Gluk 85 | fsouza 86 | gojira 87 | VitalyGoldstein 88 | WSX 89 | Dragoon 90 | gusakov 91 | 1leaf1 92 | PhilipPV 93 | MiminoCoder 94 | CS.Ferng 95 | nya 96 | lewha0 97 | aanastasov 98 | NeT 99 | dgozman 100 | skol 101 | Narg. 
102 | kitamasa 103 | dzwiedziu 104 | SavinovAlex 105 | charliez 106 | TheLlama 107 | u1ik 108 | sidky 109 | stjepan 110 | ardiankp 111 | narri 112 | satchipear 113 | lukasP 114 | Doggy 115 | ush 116 | Jonasz 117 | partisan 118 | araste 119 | Gennady.Korotkevich 120 | jaehyunp 121 | gawry 122 | strapahuulius 123 | Progbeat 124 | Vasyl 125 | hhanger 126 | vitar 127 | oberon 128 | jzd 129 | lympanda 130 | cpphamza 131 | anton.akhi 132 | kana.ikeda 133 | maciejk 134 | bwps 135 | Sorokin 136 | Fire 137 | xreborner 138 | Im2Good 139 | Elmiguel409 140 | gislan 141 | HenryNSW 142 | chEEtah 143 | Romka 144 | stan 145 | Nerevar 146 | snguyen.itim 147 | Zhomart 148 | indifferent 149 | felixh 150 | xgy 151 | Sergey.Bankevich 152 | SergeyRogulenko 153 | wjsw 154 | Flex 155 | jthread 156 | ogiekako 157 | SergeyMelnikov 158 | altertain 159 | SergeyFedorov 160 | eMBe 161 | sisu 162 | Will.Wu 163 | palacios.roy 164 | yiuyuho 165 | Eryx 166 | igoro 167 | tanonev 168 | not2knight 169 | deepblue 170 | Smylic 171 | Egor 172 | Maris 173 | boboo 174 | slippy 175 | dolphinigle 176 | fengzlzl 177 | tudejian 178 | indy256 179 | reiten 180 | Fernando 181 | ytj 182 | aditsu 183 | KreysSergey 184 | Jason911 185 | elhipercubo 186 | piyifan 187 | StanY 188 | jbernadas 189 | MaxBuzz 190 | Doeth 191 | Akim 192 | domeng 193 | eagleonhill 194 | latsyrc 195 | momtchil 196 | aussie 197 | chultquist 198 | DeCowboy 199 | cax 200 | Oracle. 
201 | domino 202 | macs 203 | Xazker 204 | ssaljalu 205 | SHOIT 206 | chc000 207 | Duc 208 | guilherme 209 | VehicleOfPuzzle 210 | tohagnom 211 | asaveljevs 212 | exod40 213 | leehark 214 | frostnova 215 | gnarlycow 216 | Astein 217 | LYW 218 | izulin 219 | SceneTree 220 | MikleB 221 | humblefool 222 | tund 223 | Wataru 224 | Loner 225 | vigo.ph 226 | tdmorgan 227 | pieguy 228 | Fly 229 | forest 230 | Louty 231 | dexy 232 | Xhark 233 | tracyhenry 234 | vot 235 | Palmtenor 236 | Zero.ztz 237 | eduardische 238 | bachelor 239 | Rahenri 240 | arkar 241 | Medeiros 242 | turuthok 243 | zibada 244 | snizovtsev 245 | TDYa127 246 | keshav57 247 | yariv 248 | onp 249 | boolean 250 | carlos.guia 251 | diwulechao1988 252 | xtof.durr 253 | moonancient 254 | tomekkulczynski 255 | RoBa 256 | roman 257 | Al.Cash 258 | Ryan 259 | Arif 260 | Chmel.Tolstiy 261 | PavelKuznetsov 262 | lschyt 263 | MDA 264 | cpcs 265 | ttim 266 | Cjf 267 | ithlony 268 | Ying 269 | lxhgww 270 | billynyh 271 | paladin8 272 | ToN.AC119 273 | serg 274 | tsukuno 275 | Kicd 276 | LoRdTaPaKaH 277 | Tomato 278 | Landertxu 279 | deepakmanohar 280 | jakubr 281 | lucaspiva 282 | try 283 | bayleef 284 | navi 285 | baihacker 286 | Zr40 287 | aleksey 288 | ania7 289 | LastStand.ziliang 290 | SmartSchizo 291 | aurinegro 292 | Shahriar.Rouf.Nafi 293 | haha 294 | ltaravilse 295 | venco5 296 | A.I.R. 
297 | -------------------------------------------------------------------------------- /Corpus/users/2433487.txt: -------------------------------------------------------------------------------- 1 | Gennady.Korotkevich 2 | vepifanov 3 | SnapDragon 4 | theycallhimtom 5 | mystic 6 | mikhailOK 7 | winger 8 | dzhulgakov 9 | qwerty787788 10 | PavelKunyavskiy 11 | nika 12 | hos.lyric 13 | JAPLJ 14 | voover 15 | MiminoCoder 16 | EgorKulikov 17 | staniek 18 | lunae 19 | BNieuwenhuizen 20 | vot 21 | jpaulson 22 | dancho 23 | Psyho 24 | pieguy 25 | Vitaliy 26 | Vasyl 27 | Lovro 28 | mozeda 29 | sdya 30 | NALP 31 | wata 32 | subscriber 33 | niyaznigmatul 34 | stgatilov 35 | GagGuy 36 | ShangJingbo 37 | Ra16bit 38 | JialinOuyang 39 | Jonasz 40 | sevenkplus 41 | bmerry 42 | Tomato 43 | meret 44 | pperm 45 | RAD. 46 | tkociumaka 47 | LayCurse 48 | fagu 49 | peter50216 50 | evima 51 | Mochavic 52 | Myth5 53 | watashi 54 | Romka 55 | rng..58 56 | ush 57 | Thijs. 58 | megaterik 59 | pashka 60 | semiexp. 61 | AppleCplus 62 | fhlasek 63 | Ostap 64 | kevinsogo 65 | levlam 66 | XraY 67 | Merkurev 68 | Soultaker 69 | edly 70 | ShawnDong 71 | Marcin.Smulewicz 72 | dolphinigle 73 | Zlobober 74 | c0cddf 75 | Nerevar 76 | ConanKudo247 77 | Anton.Lunyov 78 | misof 79 | JongMan 80 | cos 81 | shik 82 | linguo 83 | SergeyFedorov 84 | Aksenov239 85 | DKI 86 | maksai 87 | Seyaua 88 | ogiekako 89 | aa2985759 90 | Astein 91 | flowlight0 92 | navi 93 | R.R. 
94 | Nikolay.Kalinin 95 | tomerun 96 | Sereja 97 | ashmelev 98 | KhaustovPavel 99 | darnley 100 | LoRdTaPaKaH 101 | pattara.s 102 | yeputons 103 | ZhukovDmitry 104 | TankEngineer 105 | W.Junqiao 106 | Kaizero 107 | Jonick 108 | chokudai 109 | dj3500 110 | n.vilcins 111 | TRYang 112 | MikeMirzayanov 113 | Logic..IU 114 | pawelparys 115 | darknife 116 | coquelicot 117 | enot.1.10 118 | Wataru 119 | DmitryEgorov 120 | Huziwara 121 | kuba97531 122 | kmod 123 | Kirino 124 | JaNo 125 | LeBron 126 | anrieff 127 | andrewzta 128 | technolt 129 | wafrelka 130 | ZILIANG 131 | littlesheep2014 132 | anton.akhi 133 | HaaS 134 | SumuduF 135 | yh.victor 136 | Chmel.Tolstiy 137 | Palmtenor 138 | Aleksander 139 | lxhgww 140 | fuch 141 | mkn 142 | alantian 143 | pablo.t89 144 | wuzhengkai 145 | mfv 146 | kawatea 147 | ruobaole 148 | joey2005 149 | hex539 150 | zyz915 151 | Handojo1 152 | Ljq 153 | Vytis 154 | W4yneb0t 155 | humblefool 156 | tsukuno 157 | Leewings 158 | wh61 159 | gongbaoa 160 | knightL 161 | climpet 162 | imbanoob 163 | razimantv 164 | freak93 165 | lewha0 166 | buko 167 | rinigan 168 | chavit92 169 | cgy4ever 170 | Dembel 171 | robertre 172 | gilesg 173 | forest 174 | zylber 175 | TimeString 176 | Ballon 177 | EmK 178 | Maryann 179 | liangjiaxing 180 | Breakun 181 | xneby 182 | ikatanic 183 | buaaGG 184 | iwi 185 | Lewin 186 | Sabelan 187 | Nicolas16 188 | rowdark 189 | Xhark 190 | DeCowboy 191 | l521530 192 | RalfKistner 193 | mk.al13n 194 | Stigius 195 | Dlougach 196 | komaki 197 | surwdkgo 198 | Ming.Shen 199 | Caoqinxiang 200 | xcwgf666 201 | Yaro 202 | rotsor 203 | shimps 204 | ZhouYuChen 205 | standy 206 | EbTech 207 | Dumbear2 208 | Gassa 209 | isea 210 | ir5 211 | ania7 212 | jalman 213 | EricStansifer 214 | chnlich 215 | darksteel 216 | boleyn.su 217 | DarLam 218 | radeye 219 | aropan 220 | homo.sapiens 221 | simonlindholm 222 | thalassarche 223 | Angor 224 | Giove 225 | Ahmed.Salama 226 | james0zan 227 | amylase 228 | Fumiya 229 | seanwu 230 | Jimbly 231 | 
irwan.ap 232 | Zerosharp 233 | tongcx 234 | shihanyuan 235 | SurendraM 236 | pjsdream 237 | emaxx 238 | mixsx 239 | kusano 240 | proscriptus 241 | mishun 242 | SkorKNURE 243 | Landertxu 244 | vlpolyansky 245 | Spooky 246 | Nin0 247 | k21 248 | kinodjnz 249 | doudouille 250 | svm11 251 | nwin 252 | polmauri 253 | yarrr 254 | bwps 255 | mrozik 256 | rohanp77 257 | KennyHORROR 258 | masha.and.beer 259 | wikol 260 | Kepnu4 261 | michal27 262 | Lunarmony 263 | logicmachine 264 | CS.Ferng 265 | bayleef 266 | fanhqme 267 | squark 268 | dasko 269 | szefany 270 | Milanin 271 | KrK 272 | zbwmqlw 273 | flashmt 274 | pasin30055 275 | Cruiser 276 | Gleb 277 | slex 278 | davidv 279 | miaout 280 | dspyz 281 | a2stnk 282 | abcsampson 283 | GuyUpLion 284 | Konjac 285 | protos 286 | tpyopyt 287 | rishig 288 | santjuan 289 | IwfWcf 290 | morningsyj 291 | AHdoc 292 | Poldnev 293 | wychen 294 | alexmat21 295 | Fata1ist 296 | CarpathianCoder 297 | kormyshov 298 | 0b00101010 299 | Krig 300 | waitingkuo0527 301 | ptncks0121 302 | paladin8 303 | Lukasz16a 304 | tozangezan 305 | dkirienko 306 | Kriii 307 | natsugiri 308 | yisun 309 | Nekosyndrome 310 | gepa 311 | Dener 312 | -------------------------------------------------------------------------------- /Corpus/users/2437491.txt: -------------------------------------------------------------------------------- 1 | mystic 2 | Vasyl 3 | winger 4 | sdya 5 | pieguy 6 | mikhailOK 7 | jpaulson 8 | EgorKulikov 9 | Lovro 10 | staniek 11 | SnapDragon 12 | Vitaliy 13 | dzhulgakov 14 | PavelKunyavskiy 15 | voover 16 | NALP 17 | vepifanov 18 | nika 19 | GagGuy 20 | BNieuwenhuizen 21 | lunae 22 | MiminoCoder 23 | JAPLJ 24 | theycallhimtom 25 | -------------------------------------------------------------------------------- /Corpus/users/3024486.txt: -------------------------------------------------------------------------------- 1 | EgorKulikov 2 | ivan.popelyshev 3 | Gennady.Korotkevich 4 | vepifanov 5 | sevenkplus 6 | DmitryEgorov 7 | ffao 8 | 
wuzhengkai 9 | eatmore 10 | mk.al13n 11 | Marcin.Smulewicz 12 | isea 13 | WJMZBMR 14 | fhlasek 15 | Lovro 16 | sdya 17 | exod40 18 | kcm1700 19 | vlad89 20 | hos.lyric 21 | mystic 22 | iwi 23 | Romka 24 | Kepnu4 25 | dzhulgakov 26 | ah1926 27 | Zlobober 28 | shik 29 | kawatea 30 | ZhukovDmitry 31 | Bobik 32 | KennyHORROR 33 | meshanya 34 | VArtem 35 | yeputons 36 | uwi 37 | evima 38 | ThinkCreative 39 | K.A.D.R 40 | niyaznigmatul 41 | nika 42 | Zennon 43 | RiaD 44 | chokudai 45 | semiexp. 46 | Nerevar 47 | eduardische 48 | sky58 49 | netkuba 50 | watashi 51 | peter50216 52 | Gassa 53 | Yarin 54 | Ra16bit 55 | LeoYu 56 | mkirsche 57 | komaki 58 | earl 59 | mikhailOK 60 | Merkurev 61 | Nicolas16 62 | ZbanIlya 63 | fagu 64 | Pasqual45 65 | ValenKof 66 | Eryx 67 | wata 68 | gongbaoa 69 | chnlich 70 | mbradac 71 | notime.sea 72 | kevinsogo 73 | fushar 74 | Fcdkbear 75 | misof 76 | LeBron 77 | spnautilus 78 | qwerty787788 79 | ichyo 80 | twds 81 | TTL.135678942 82 | v.haralampiev 83 | subscriber 84 | PavelKunyavskiy 85 | Riatre 86 | logicmachine 87 | Seyaua 88 | JAPLJ 89 | kraskevich 90 | polmauri 91 | Fefer.Ivan 92 | fetetriste 93 | NALP 94 | linguo 95 | winger 96 | Palmtenor 97 | Murphy 98 | azneye 99 | dongockhanh1997 100 | edward.mj 101 | Vytis 102 | bmerry 103 | berger 104 | Fumiya 105 | tunyash 106 | climpet 107 | Lukasz16a 108 | flowlight0 109 | tkociumaka 110 | pattara.s 111 | qwaker.00 112 | cmd 113 | kostyaby 114 | SnapDragon 115 | pmnox 116 | dnkywin 117 | m.radwan 118 | Tomi 119 | Burunduk1 120 | Fdg 121 | smithinsu 122 | likawind 123 | stqn 124 | stgatilov 125 | oldjunyi 126 | R.R. 
127 | NGG 128 | squark 129 | Al.Cash 130 | Tomato 131 | Darsein 132 | YUKI.M 133 | MikeMirzayanov 134 | Sereja 135 | Merlininice.yn 136 | MiminoCoder 137 | pashka 138 | Dembel 139 | Smylic 140 | yaray 141 | S.Yesipenko 142 | Krig 143 | cgy4ever 144 | pieguy 145 | RomaWhite 146 | Mister 147 | c175353 148 | DKI 149 | l521530 150 | caiwaifung 151 | elfness 152 | Dgleich 153 | Zaic 154 | TeaPot 155 | Anta0 156 | W4yneb0t 157 | Stigius 158 | iakudryashov 159 | niquefa.diego 160 | surwdkgo 161 | stevenhao 162 | pandamonium 163 | lzqxh 164 | pperm 165 | Xellos 166 | kohyatoh 167 | andrewzta 168 | tsukuno 169 | MarioYC 170 | sillycross 171 | Mochavic 172 | shimps 173 | Errichto 174 | natalia 175 | zylber 176 | navi 177 | Grzesiu 178 | tckwok 179 | KrK 180 | anrieff 181 | islam.al.aarag 182 | MiSawa 183 | zyz915 184 | Kirino 185 | HiltonLange 186 | Pompon 187 | enarc 188 | k21 189 | ariacas 190 | edgarthewise 191 | pawelparys 192 | koratel 193 | mike.nzk 194 | Smithers 195 | vigo.ph 196 | Sammarize 197 | Robert.Rosolek 198 | elsantodel90 199 | Ming.Shen 200 | Endagorion 201 | carlop 202 | Logic..IU 203 | bwps 204 | HellFalcon 205 | YerzhanU 206 | bit.yangxm 207 | caustique 208 | Lewin 209 | FoodIsGood 210 | Cloud26 211 | liutianren 212 | Prowindy 213 | mR.ilchi 214 | freak93 215 | resty 216 | Ballon 217 | tozangezan 218 | Arterm 219 | marcin.mucha 220 | Sfairat 221 | RiKang 222 | hirosegolf 223 | oml1111 224 | Vasyl 225 | JaNo 226 | Landertxu 227 | applepi 228 | yujinwunz 229 | Hachimori 230 | Carbon.Brother 231 | RuslanGatin 232 | persianpars 233 | xiaodao 234 | Zuza 235 | abcsampson 236 | Shapo 237 | Swistakk 238 | pavelz 239 | Vytenis 240 | yh.victor 241 | W.Junqiao 242 | Ronnoc 243 | Anton.Lunyov 244 | NAFIS 245 | DD.tt 246 | theme 247 | edorundo 248 | przemekkoz 249 | protos 250 | kmod 251 | HowardCheng 252 | Wolfje 253 | MathCrusader 254 | Kyoko 255 | Shangke7788 256 | alantian 257 | gepa 258 | ztxz16 259 | rankalee 260 | DEGwer 261 | thocevar 262 | architkarandikar 
263 | takapt 264 | eXtreme 265 | indy256 266 | betaveros 267 | BNieuwenhuizen 268 | nk.karpov 269 | EmK 270 | ybh 271 | ForeverBell 272 | BabaninIvan 273 | Aksenov239 274 | fixme 275 | radeye 276 | boleyn.su 277 | mixsx 278 | Zwergesel 279 | nathanajah 280 | Gerald. 281 | Ripatti 282 | kmjp 283 | amoebius 284 | abyssmaul 285 | SergeyWeiss 286 | Ryan 287 | sokian 288 | IP314 289 | mamekin 290 | dhh1995 291 | madokamylove 292 | Joshik 293 | gawry 294 | LayCurse 295 | Dumbear2 296 | alkjash 297 | cedriclin 298 | flashmt 299 | GagGuy 300 | piob 301 | daizhenyang 302 | darknife 303 | alex.jh 304 | y3eadgbe 305 | AlexanderBolshakov 306 | sroyal 307 | AlexUdalov 308 | kusano 309 | kerker 310 | Mohammad.JRS 311 | yangyue.cn 312 | darkKelvin 313 | Ljq 314 | Elias 315 | Vani0 316 | NateRiverxkh 317 | gpoeta 318 | dwahler 319 | Nin0 320 | stolis 321 | Flydutchman 322 | Fly33 323 | SqrtPi 324 | gnomnain 325 | TakaakiUmedu 326 | xiaowuc1 327 | darnley 328 | fsouza 329 | ordcoder 330 | amashrabov 331 | 2rf 332 | Breakun 333 | net12k44 334 | sankear 335 | argentony 336 | hamadu 337 | RoBa 338 | binhminh410 339 | Astein 340 | SoEnLit 341 | Krwlng 342 | w.yj 343 | jthread 344 | xcwgf666 345 | bardek 346 | dexy 347 | satashun 348 | POMELO 349 | frank12268 350 | Savlik 351 | rowdark 352 | ush 353 | daimi89 354 | Antinomyra 355 | Tkachev 356 | akashin 357 | lnsuyn 358 | 916852 359 | danielf 360 | johnathan79717 361 | sergio3010 362 | DancingSoul 363 | u8765 364 | iSuneast 365 | smapson 366 | ssmike 367 | lasten 368 | Rubanenko 369 | lby 370 | foxwlog 371 | wangchaohui 372 | wafrelka 373 | d.hui 374 | waitingkuo0527 375 | miketcyue 376 | zck921031 377 | dilsonguim 378 | j.e. 
379 | pwahs 380 | torus711 381 | jevi 382 | azhai 383 | Brainfvck 384 | humblefool 385 | lkq1992yeah 386 | yukis 387 | mingkaidox 388 | Mare.S.Ephemeral 389 | JohnSmith 390 | -------------------------------------------------------------------------------- /Corpus/users/311101.txt: -------------------------------------------------------------------------------- 1 | ACRush 2 | qizichao 3 | wata 4 | ZhukovDmitry 5 | dzhulgakov 6 | nika 7 | Vitaliy 8 | kalinov 9 | halyavin 10 | bmerry 11 | alyaxey 12 | marek.cygan 13 | Khuc.Anh.Tuan 14 | Jiunru 15 | winger 16 | hos.lyric 17 | pashka 18 | misof 19 | Jeru 20 | FloppyCat 21 | AdrianKuegel 22 | emaxx 23 | ploh 24 | -------------------------------------------------------------------------------- /Corpus/users/32002.txt: -------------------------------------------------------------------------------- 1 | bmerry 2 | yuhch123 3 | halyavin 4 | wata 5 | iwi 6 | Ahyangyi 7 | tourist 8 | gawry 9 | vlad89 10 | neal.wu 11 | austrin 12 | Vasyl 13 | Gluk 14 | mystic 15 | falagar 16 | misof 17 | jakubr 18 | Ying 19 | darnley 20 | dgozman 21 | ftc 22 | Reid 23 | fuwenjie 24 | nya 25 | rem 26 | hmich 27 | Burunduk2 28 | FedorTsarev 29 | Nerevar 30 | ardiankp 31 | JongMan 32 | bogdan2412 33 | elizarov 34 | Alexus 35 | lympanda 36 | Loner 37 | try 38 | dzhulgakov 39 | darthur 40 | PavelKuznetsov 41 | blackmath 42 | Psyho 43 | trebe 44 | kurniady 45 | Jacek 46 | tywok 47 | AS1 48 | pmnox 49 | victorsb 50 | Yarin 51 | ploh 52 | andrewzta 53 | ymatsux 54 | subra 55 | ilyaraz 56 | maciejk 57 | OpenGL 58 | Innovative.Cat 59 | pashka 60 | slex 61 | Jimb 62 | Vytis 63 | LucaB 64 | JanKuipers 65 | blueblimp 66 | klopyrev 67 | kubus 68 | almelv 69 | rspeer 70 | Zhuojie 71 | krijgertje 72 | ecprice 73 | Eryx 74 | xreborner 75 | ivan.popelyshev 76 | arti 77 | beingryu 78 | LinesPrower 79 | SkidanovAlexander 80 | AdrianKuegel 81 | mirosuaf 82 | bramandia 83 | ACRush 84 | yariv 85 | Jonick 86 | KOTEHOK 87 | snguyen 88 | inazz 89 | dolphinigle 90 | 
reiten 91 | jbernadas 92 | gevak 93 | LayCurse 94 | Soultaker 95 | SBRS 96 | hekacyr 97 | XiaoZiqian 98 | pdallago 99 | narri 100 | beerscout 101 | tos.lunar 102 | jmzero 103 | RodrigoBurgos 104 | bloodmage 105 | sjelkjd 106 | cpphamza 107 | vexorian 108 | karol1 109 | ZloyBastard 110 | Hachimori 111 | exod40 112 | Vytenis 113 | Gigz 114 | Tsubosaka 115 | CataractGoogly 116 | CrazyScratcher 117 | abikbaev 118 | Wolfje 119 | moonancient 120 | MauricioC 121 | charliez 122 | leehark 123 | nika 124 | StanY 125 | Smitty 126 | yuzmukhametov 127 | ShangJingbo 128 | Jonasz 129 | Jan 130 | Lipstick 131 | crazyb0y 132 | VitalyGoldstein 133 | updog 134 | Sanny 135 | sidky 136 | dano 137 | dbh 138 | CM87 139 | nodchip 140 | Xixas 141 | hhb 142 | jdmetz 143 | vepifanov 144 | windy7926778 145 | Mingfei.Li 146 | Astein 147 | wojteks 148 | moon5ckq 149 | ssancho 150 | altertain 151 | DmitryKlenov 152 | bsonrisa 153 | Alexey 154 | elhipercubo 155 | abiczo 156 | RoBa 157 | earl 158 | Bohua 159 | Rostislav 160 | linguo 161 | dzwiedziu 162 | ben.hwang 163 | Lunarmony 164 | SergeyRogulenko 165 | jthread 166 | ituphanov 167 | zhengzhao 168 | tanakh 169 | microsoft 170 | rlblaster 171 | henryy 172 | PaulJefferys 173 | gmark 174 | turuthok 175 | scorzh 176 | bwps 177 | rafaeldaigo 178 | botay 179 | mozeda 180 | zibada 181 | NeT 182 | victorj 183 | eMBe 184 | ulzha 185 | wywcgs 186 | bhamrick 187 | satej 188 | GunnERs 189 | deviatov 190 | Yulo.K 191 | Amber 192 | DStepanenko 193 | u1ik 194 | Klinck 195 | unbing 196 | kp7 197 | Yao 198 | Olexiy 199 | ralekseenkov 200 | Shahriar.Rouf.Nafi 201 | diam 202 | doudouille 203 | MikeMirzayanov 204 | DeCowboy 205 | terranwy 206 | logistic 207 | hrushikesh.tilak 208 | ilyakor 209 | Zakklars 210 | Qingchun 211 | hsyeo 212 | MikeSeibert 213 | ScaleRhyme 214 | g201513 215 | Dragoon 216 | Sempr 217 | gepa 218 | greenoyster 219 | Mimino 220 | obokaman 221 | GeKa 222 | wap 223 | oberon 224 | kivus 225 | lewha0 226 | antimatter 227 | vladut89 228 | 
strategist333 229 | NefariousZhen 230 | TripleM 231 | Kalq 232 | marius.pungaru 233 | Erik 234 | talchas 235 | xiaobao 236 | Fly 237 | Wataru 238 | TaiTai 239 | xhl.kogitsune 240 | ulyanick 241 | RalfKistner 242 | carlosralv 243 | vpj 244 | RicardoHahn 245 | winger 246 | Die 247 | andersk 248 | AlexanderL 249 | andre.sp 250 | Savior 251 | natalia 252 | Zig 253 | Dlougach 254 | boss 255 | jagjag 256 | Vitaliy 257 | samee 258 | Stigius 259 | maksay 260 | TheHue 261 | mysanal 262 | Aekeri 263 | Hetman 264 | szd 265 | trchen1033 266 | pedroeira 267 | khuebeo 268 | kinaba 269 | Ostap 270 | stone 271 | vitar 272 | pavelz 273 | Kyungryeol 274 | zwdant 275 | Patience 276 | ertesh 277 | Project 278 | Progbeat 279 | MB. 280 | Borisp 281 | LanceHalberd 282 | dotnetcoder 283 | hansonw 284 | frostnova 285 | Helenjyun 286 | HiltonLange 287 | jonathantan86 288 | MRoizner 289 | zzzz 290 | Relja 291 | lrearte 292 | radeye 293 | eXtreme 294 | humblefool 295 | Patriot 296 | araste 297 | mehas 298 | Kee 299 | Jby 300 | vigo.ph 301 | imyoyo 302 | yisun 303 | kk420 304 | tsukuno 305 | Akim 306 | StevieT 307 | Marte 308 | cepheid 309 | Plagapong 310 | irori 311 | diferential 312 | Cupuman 313 | WSX 314 | LBFacci 315 | tienn 316 | adeymo 317 | nicholas 318 | nik239 319 | Jasko 320 | sapal 321 | Rydberg 322 | spencer 323 | gusakov 324 | Huayang 325 | Razvi 326 | burunduk3 327 | lbackstrom 328 | jasonw 329 | JesUltra 330 | kappahouse 331 | Tilps 332 | ssulbbang 333 | vsb 334 | EkTePik 335 | lordmonsoon 336 | JRR 337 | Cheryl 338 | eonx 339 | kangshifu 340 | roypalacios 341 | ltdtl 342 | eldering 343 | EmK 344 | AdamG 345 | Vovka 346 | cmd 347 | anton.akhi 348 | ErickW 349 | licstar 350 | macs 351 | lixs2003 352 | supo 353 | .Invader 354 | Sohel 355 | kik 356 | skatou 357 | FAndy 358 | RomanLipovsky 359 | cvoinescu 360 | eleusive 361 | Baekjoon 362 | Saty 363 | linyufly 364 | agh 365 | IvanRomanov 366 | Fire 367 | mirzman 368 | tckwok 369 | xwbsw 370 | myprasanna 371 | danielp 372 | jfguo 
373 | watashi 374 | Doggy 375 | latsyrc 376 | visq 377 | grizzly 378 | legend12 379 | Murphy 380 | ThinkCreative 381 | ilham 382 | impetus 383 | kunigami 384 | loneknight 385 | WillCodeForFood 386 | felipebart 387 | Al.Cash 388 | eagaeoppooaaa 389 | izulin 390 | firepot 391 | yuta.sawa 392 | kit 393 | gaosimeng 394 | RAD 395 | Chmel.Tolstiy 396 | RAVEman 397 | foobarbaz 398 | slippy 399 | ZhouErjin 400 | Vedensky 401 | msn 402 | wcao 403 | YangYi 404 | momtchil 405 | Fluorine 406 | radames9htv 407 | Tony 408 | Davis 409 | Luciano 410 | YUMEN 411 | JosephWen 412 | javau 413 | yuizumi 414 | otinn 415 | abkqz 416 | domeng 417 | pablo.aguilar 418 | Brian 419 | sclo 420 | srikkbhat 421 | Cho 422 | i0exception 423 | HanoiTower 424 | ThomMerillin 425 | TheLlama 426 | p13 427 | andreitheo 428 | sonyckson 429 | janq 430 | Alvin 431 | edauri 432 | ftfish 433 | gojira 434 | Adhit 435 | wkoder 436 | elsantodel90 437 | aknow 438 | gurug 439 | Laurance 440 | Minilek 441 | ToN.AC119 442 | naguib 443 | Elmiguel409 444 | TonyZ 445 | icecream 446 | Wernie 447 | gcdart 448 | griffon 449 | Stone 450 | chouxiaowen 451 | SCat.Wang 452 | DAle 453 | satchipear 454 | qinlu 455 | Nasa 456 | Prostu 457 | LYW 458 | igoro 459 | Stas 460 | schultz 461 | UnknownException 462 | tck 463 | vnikulin 464 | tanaeem 465 | pavel13 466 | kedaizd 467 | chenxiuwei 468 | carp 469 | OldDonkey 470 | rx201 471 | roma 472 | kitamasa 473 | rbtree 474 | yannis 475 | zerg 476 | Mg9H 477 | Duke 478 | Gleb 479 | Arif 480 | mustafij 481 | JackieX 482 | gutalin 483 | Dream 484 | TAG 485 | kprateek88 486 | frankyym 487 | walchl 488 | elmariachi1414 489 | eagleonhill 490 | Leonid 491 | Landertxu 492 | daiwb 493 | kaasis 494 | ged 495 | AnshAryan 496 | rajkon 497 | Kouprin 498 | alien.i 499 | Ragnarok 500 | shimps 501 | winsty 502 | nakajima 503 | cryboy 504 | insotc 505 | PhilipPV 506 | Ravents 507 | Crush 508 | lukasP 509 | Torax 510 | bigheadghost 511 | yiuyuho 512 | those 513 | Terence 514 | Biskup 515 | Rahenri 516 
| sao3 517 | WildUtah 518 | RichardPang 519 | Vegetable 520 | voyagerr 521 | lqs 522 | yessit 523 | Vintik 524 | hanney 525 | Felix 526 | Slevin 527 | enjoy1009 528 | Rio 529 | iuaaui 530 | leeang 531 | harpreet.singh 532 | rrpai 533 | kaneko 534 | marting 535 | Jackrabbit 536 | Fernando 537 | Foxtail 538 | Darko 539 | Gnefihz 540 | qwynick 541 | koxvqrvtkp 542 | carl 543 | nebula.lam 544 | nutki 545 | X.Ray 546 | dimozzz 547 | q3dm17 548 | FerroMrkva 549 | cpcs 550 | argentony 551 | zmy 552 | SpaceFlyer 553 | squall1729 554 | kwiatek 555 | Ryan 556 | IgorYevchinets 557 | algostorm 558 | HilbertRaum 559 | stavr 560 | zhendongjia 561 | boba5555 562 | mdoan 563 | connect4 564 | i.bogatyi 565 | SmartSchizo 566 | shell 567 | pperm 568 | Irioth 569 | StepInto 570 | uwi 571 | vyxaryx 572 | NilayVaish 573 | gagik 574 | dkorduban 575 | marspeople 576 | RainingStar 577 | HenryHu 578 | popo 579 | AlphaStream111 580 | rwaliany 581 | Kdub 582 | tollek 583 | bug 584 | gnarlycow 585 | SharpC 586 | mdzfirst 587 | rajeshsr 588 | skol 589 | Lerry 590 | aliquis 591 | aleksandari 592 | martin.at.ksp 593 | Sunny 594 | Davidsu 595 | gladiator 596 | groupbuilder 597 | rasto6sk 598 | aditsu 599 | RalphFurmaniak 600 | Koper 601 | Hammer 602 | Safii 603 | abstractwhiz 604 | ouka 605 | Minny 606 | loyeer 607 | SourSpinach 608 | ipknHama 609 | kmod 610 | pzielinski 611 | moonlight 612 | mdruker 613 | MTWTFFF 614 | panczo 615 | giolekva 616 | matkk 617 | wushuangyue 618 | Demasi 619 | Chimed 620 | t.mac 621 | Jakozaur 622 | augustotorres 623 | zgm 624 | fetetriste. 625 | dmytro.korzhyk 626 | lidaobing 627 | indy256 628 | Spieler 629 | blmarket 630 | aussie 631 | MrZZZ 632 | stan 633 | Aesop 634 | cancho 635 | domob 636 | Faber 637 | ziliang 638 | victor.j8 639 | Yiming 640 | jackfeng 641 | emnmon 642 | diaorui 643 | Carrot 644 | Alphard 645 | Mathwhiz1286 646 | b0b0b0b 647 | wInuX 648 | mth 649 | trulo17 650 | doriath 651 | along 652 | Infinity. 
653 | carlop 654 | gizzywump 655 | vlad 656 | Vman 657 | Sergey.Bankevich 658 | AngelClover 659 | marsavic 660 | Dima 661 | SavinovAlex 662 | goober 663 | aleksey 664 | IlyaPonamarev 665 | RenTeng 666 | Ripatti 667 | zxhy2 668 | felixh 669 | Rasifiel 670 | Maryann 671 | vinaysingh 672 | AlMag 673 | cs10520 674 | Rick 675 | tiagomt 676 | microbrain 677 | HenryW 678 | fclaude 679 | mvolke 680 | wuxy 681 | Ramzes2 682 | sanky29288 683 | Chrono 684 | cjoa 685 | MIPTAlex 686 | lh124363042 687 | aanastasov 688 | Abir 689 | Gaizka 690 | deepakmanohar 691 | txandi 692 | DmitriyL 693 | VladS 694 | j.vimal 695 | Ivankovic 696 | ViniciusCabessa 697 | husheyn 698 | joseph 699 | Hackson 700 | morbidel 701 | waterwang 702 | windowmaker 703 | DevilMayCry 704 | soul3434 705 | thecata 706 | Serraa 707 | thunderfyc 708 | FBWolf 709 | Baiger 710 | saltycookie 711 | it3 712 | relic 713 | andrei.info 714 | Hypuk 715 | Keegan 716 | ssaljalu 717 | Askar 718 | MoreFreeze 719 | Valergrad 720 | misko.sz 721 | mhung 722 | ahh 723 | arxor 724 | oaiei 725 | lxhgww 726 | alexkas 727 | lcosvse 728 | TT87 729 | asaveljevs 730 | Alligator 731 | vk91 732 | SergeyAkimov 733 | starforever 734 | acherepanov 735 | Edu 736 | lecoo 737 | rubyeye 738 | DEathkNIghtS 739 | ivo.sluganovic 740 | itoasquall 741 | McFn 742 | the.Fm 743 | upgrade 744 | aleck 745 | ccc5 746 | imrankane2005 747 | brus07 748 | cintana 749 | Rainco 750 | szsz 751 | partychen 752 | Karalabe 753 | sandro 754 | lemonutzf 755 | ThomasDeniau 756 | stpkys 757 | Landrew 758 | Greggypoo 759 | fogwind 760 | star 761 | MonEtoile 762 | kozima 763 | quaji 764 | DragonRidr 765 | Gnurdux 766 | Lvsoft 767 | thobel 768 | kcwu 769 | yappy 770 | pqshq 771 | wistful 772 | mergen 773 | delicato 774 | kittycat 775 | henshiru 776 | SoD 777 | w0nder 778 | Turning 779 | Nikelandjelo 780 | juwon 781 | t3hg0suazn 782 | backluck 783 | zzmike 784 | mihar 785 | stef.lp 786 | spupyrev 787 | mouda 788 | kenyyy 789 | littlej 790 | viclei 791 | levlam 792 | Wam 793 
| fanKarpaty 794 | luanlai 795 | Swarun 796 | HenryKitten 797 | hiperx 798 | igorcanadi 799 | Shadrach 800 | ilya 801 | pauldb 802 | liulibo133 803 | srou 804 | three 805 | bee 806 | Bus 807 | bnuer 808 | bl1n 809 | Robinnibor 810 | 5l2 811 | NevoWin777 812 | Ignat 813 | Jyun 814 | LeoC 815 | Younix 816 | runTarm 817 | zhang 818 | Geniuswj 819 | narendhranath 820 | chenxueyu 821 | Muerte 822 | overwise 823 | Sputnik 824 | fredd4 825 | wuyifan 826 | C.A. 827 | Sorokin 828 | sandaru1 829 | marim 830 | oyy 831 | dzetkulict 832 | sunnn 833 | arctanx 834 | Equinox 835 | Vangoz 836 | AlexanderT 837 | foison 838 | pinpin 839 | sproblvem 840 | kmh4500 841 | my.nickname 842 | deepblue 843 | serg 844 | matsuza 845 | yyt 846 | Levy 847 | wisdompoet 848 | wjsw 849 | Sankozi 850 | Lazarev 851 | kuat 852 | Alec 853 | tangyouze 854 | genzmer 855 | Yak 856 | Teferi 857 | bolek 858 | Hanaban 859 | lvyun 860 | trecio 861 | chenhaifeng 862 | Delsius 863 | Calamitas 864 | dexy 865 | Ferlon 866 | 234226 867 | Xofon 868 | ljwan12 869 | Esquimeau 870 | shubham.mittal 871 | Doeke 872 | qu1ck 873 | .maXim. 
874 | paisa 875 | LynnKaye 876 | Modulator 877 | AliJ 878 | MaiK 879 | Boping 880 | int9 881 | imos 882 | TheRaven 883 | -------------------------------------------------------------------------------- /Corpus/users/32005.txt: -------------------------------------------------------------------------------- 1 | ACRush 2 | Ahyangyi 3 | Amber 4 | xhl.kogitsune 5 | LayCurse 6 | xreborner 7 | stone 8 | TripleM 9 | wata 10 | tckwok 11 | g201513 12 | iwi 13 | humblefool 14 | kinaba 15 | Fire 16 | ymatsux 17 | Lunarmony 18 | windy7926778 19 | Huayang 20 | FAndy 21 | domeng 22 | ardiankp 23 | OpenGL 24 | Zhuojie 25 | lympanda 26 | tanakh 27 | tos.lunar 28 | yuhch123 29 | YangYi 30 | Savior 31 | kitamasa 32 | JongMan 33 | Innovative.Cat 34 | XiaoZiqian 35 | Loner 36 | wywcgs 37 | eagleonhill 38 | Laurance 39 | yuta.sawa 40 | subra 41 | irori 42 | Astein 43 | ScaleRhyme 44 | kik 45 | Fluorine 46 | Murphy 47 | altertain 48 | nya 49 | hhb 50 | araste 51 | daiwb 52 | Sempr 53 | EmK 54 | moonancient 55 | Terence 56 | watashi 57 | lewha0 58 | those 59 | jfguo 60 | henryy 61 | Yulo.K 62 | kprateek88 63 | zwdant 64 | updog 65 | sidky 66 | terranwy 67 | chenxiuwei 68 | Mingfei.Li 69 | AnshAryan 70 | nakajima 71 | Stone 72 | tanaeem 73 | tsukuno 74 | yuizumi 75 | logistic 76 | GunnERs 77 | lqs 78 | HanoiTower 79 | Sanny 80 | zhengzhao 81 | licstar 82 | kappahouse 83 | LYW 84 | inazz 85 | snguyen 86 | Dragoon 87 | Patience 88 | trchen1033 89 | TheHue 90 | Yao 91 | leehark 92 | chouxiaowen 93 | kurniady 94 | xwbsw 95 | Vegetable 96 | samee 97 | RoBa 98 | OldDonkey 99 | legend12 100 | firepot 101 | vpj 102 | javau 103 | Kee 104 | rbtree 105 | ben.hwang 106 | Shahriar.Rouf.Nafi 107 | qinlu 108 | loneknight 109 | gaosimeng 110 | shimps 111 | YUMEN 112 | mozeda 113 | Alvin 114 | EkTePik 115 | zerg 116 | CrazyScratcher 117 | bigheadghost 118 | microsoft 119 | JosephWen 120 | nodchip 121 | Baekjoon 122 | skatou 123 | Zakklars 124 | charliez 125 | i0exception 126 | Hachimori 127 | khuebeo 128 
| Cho 129 | voyagerr 130 | satchipear 131 | Jan 132 | wap 133 | Saty 134 | Kyungryeol 135 | beingryu 136 | Tsubosaka 137 | Dream 138 | SCat.Wang 139 | Tilps 140 | xiaobao 141 | insotc 142 | ftfish 143 | codejam.forget1 144 | nicholas 145 | walchl 146 | rx201 147 | try 148 | Marte 149 | UnknownException 150 | TaiTai 151 | imyoyo 152 | latsyrc 153 | -------------------------------------------------------------------------------- /Corpus/users/32008.txt: -------------------------------------------------------------------------------- 1 | Bohua 2 | SkidanovAlexander 3 | radeye 4 | linguo 5 | andersk 6 | Reid 7 | antimatter 8 | ploh 9 | fuwenjie 10 | pmnox 11 | darthur 12 | macs 13 | yiuyuho 14 | blueblimp 15 | pdallago 16 | igoro 17 | StanY 18 | kp7 19 | Rahenri 20 | edauri 21 | ssancho 22 | Qingchun 23 | ltdtl 24 | jdmetz 25 | ecprice 26 | felipebart 27 | klopyrev 28 | Elmiguel409 29 | rspeer 30 | LanceHalberd 31 | LBFacci 32 | jbernadas 33 | WillCodeForFood 34 | sjelkjd 35 | msn 36 | rafaeldaigo 37 | MikeSeibert 38 | gurug 39 | TonyZ 40 | ged 41 | bsonrisa 42 | JackieX 43 | kit 44 | adeymo 45 | hrushikesh.tilak 46 | Zig 47 | satej 48 | JRR 49 | icecream 50 | NefariousZhen 51 | jmzero 52 | jasonw 53 | talchas 54 | vexorian 55 | karol1 56 | ssulbbang 57 | Mg9H 58 | Nasa 59 | Die 60 | schultz 61 | RicardoHahn 62 | roypalacios 63 | carlosralv 64 | pedroeira 65 | tck 66 | Jimb 67 | lbackstrom 68 | Rostislav 69 | jagjag 70 | CataractGoogly 71 | WildUtah 72 | elsantodel90 73 | sonyckson 74 | lrearte 75 | dbh 76 | spencer 77 | pablo.aguilar 78 | dotnetcoder 79 | wkoder 80 | gcdart 81 | foobarbaz 82 | Aekeri 83 | Luciano 84 | yisun 85 | sclo 86 | Ying 87 | Duke 88 | unbing 89 | eleusive 90 | turuthok 91 | vladut89 92 | myprasanna 93 | RichardPang 94 | beerscout 95 | pavel13 96 | ErickW 97 | narri 98 | kunigami 99 | -------------------------------------------------------------------------------- /Corpus/users/32010.txt: 
-------------------------------------------------------------------------------- 1 | bmerry 2 | dzhulgakov 3 | gawry 4 | dgozman 5 | halyavin 6 | pashka 7 | mystic 8 | Klinck 9 | .Invader 10 | DmitryKlenov 11 | Gluk 12 | PaulJefferys 13 | Eryx 14 | LucaB 15 | austrin 16 | almelv 17 | krijgertje 18 | ilyaraz 19 | gusakov 20 | JanKuipers 21 | Burunduk2 22 | LinesPrower 23 | Vitaliy 24 | zibada 25 | natalia 26 | Jacek 27 | andrewzta 28 | winger 29 | Prostu 30 | anton.akhi 31 | Borisp 32 | nik239 33 | oberon 34 | Lipstick 35 | Wataru 36 | Yarin 37 | Vedensky 38 | dzwiedziu 39 | Chmel.Tolstiy 40 | MB. 41 | AS1 42 | Vytis 43 | KOTEHOK 44 | ftc 45 | misof 46 | Psyho 47 | RAVEman 48 | falagar 49 | Vasyl 50 | andre.sp 51 | wojteks 52 | Nerevar 53 | hekacyr 54 | gevak 55 | radames9htv 56 | Mimino 57 | Kalq 58 | vigo.ph 59 | Vytenis 60 | Crush 61 | abikbaev 62 | Torax 63 | VitalyGoldstein 64 | scorzh 65 | danielp 66 | ralekseenkov 67 | burunduk3 68 | darnley 69 | elizarov 70 | reiten 71 | Dlougach 72 | IvanRomanov 73 | Xixas 74 | nika 75 | botay 76 | Vintik 77 | PhilipPV 78 | Stigius 79 | MikeMirzayanov 80 | griffon 81 | lordmonsoon 82 | Smitty 83 | jakubr 84 | Alexus 85 | kubus 86 | Olexiy 87 | gepa 88 | ituphanov 89 | Al.Cash 90 | eMBe 91 | Landertxu 92 | Hetman 93 | slex 94 | vlad89 95 | JesUltra 96 | FedorTsarev 97 | elhipercubo 98 | vsb 99 | hmich 100 | Vovka 101 | AlexanderL 102 | trebe 103 | boss 104 | dano 105 | alien.i 106 | cvoinescu 107 | deviatov 108 | eldering 109 | ivan.popelyshev 110 | CM87 111 | Fly 112 | TAG 113 | slippy 114 | Ostap 115 | Soultaker 116 | maciejk 117 | WSX 118 | kedaizd 119 | jthread 120 | Patriot 121 | Rydberg 122 | DAle 123 | rajkon 124 | ulzha 125 | lukasP 126 | Gleb 127 | ilyakor 128 | eXtreme 129 | Progbeat 130 | ulyanick 131 | impetus 132 | MRoizner 133 | diam 134 | abkqz 135 | ilham 136 | gmark 137 | naguib 138 | Jasko 139 | TheLlama 140 | AdrianKuegel 141 | eonx 142 | victorj 143 | roma 144 | agh 145 | mirosuaf 146 | grizzly 147 | 
abiczo 148 | Leonid 149 | cmd 150 | ertesh 151 | DStepanenko 152 | Jonasz 153 | u1ik 154 | diferential 155 | janq 156 | RomanLipovsky 157 | frostnova 158 | kk420 159 | andreitheo 160 | Ravents 161 | tywok 162 | izulin 163 | blackmath 164 | doudouille 165 | p13 166 | Kouprin 167 | cpphamza 168 | Wernie 169 | Gigz 170 | szd 171 | StevieT 172 | vitar 173 | gojira 174 | ThinkCreative 175 | yariv 176 | supo 177 | PavelKuznetsov 178 | sapal 179 | AdamG 180 | Razvi 181 | mirzman 182 | momtchil 183 | otinn 184 | zzzz 185 | Stas 186 | Slevin 187 | gutalin 188 | Project 189 | pavelz 190 | eagaeoppooaaa 191 | Wolfje 192 | elmariachi1414 193 | visq 194 | vnikulin 195 | kivus 196 | Biskup 197 | rlblaster 198 | bwps 199 | yuzmukhametov 200 | marius.pungaru 201 | Relja 202 | NeT 203 | hanney 204 | -------------------------------------------------------------------------------- /Corpus/users/32011.txt: -------------------------------------------------------------------------------- 1 | ACRush 2 | Innovative.Cat 3 | bmerry 4 | pmnox 5 | yuhch123 6 | gawry 7 | Eryx 8 | mystic 9 | ploh 10 | blueblimp 11 | windy7926778 12 | PaulJefferys 13 | Chmel.Tolstiy 14 | Yarin 15 | andrewzta 16 | radeye 17 | yiuyuho 18 | AS1 19 | KOTEHOK 20 | Fire 21 | g201513 22 | ftc 23 | Burunduk2 24 | pdallago 25 | Gluk 26 | xreborner 27 | DmitryKlenov 28 | halyavin 29 | kinaba 30 | dzwiedziu 31 | humblefool 32 | Jacek 33 | lympanda 34 | domeng 35 | tckwok 36 | krijgertje 37 | ardiankp 38 | darthur 39 | XiaoZiqian 40 | wata 41 | LucaB 42 | JanKuipers 43 | tanakh 44 | dzhulgakov 45 | LinesPrower 46 | fuwenjie 47 | Ahyangyi 48 | Lunarmony 49 | eagleonhill 50 | SkidanovAlexander 51 | tos.lunar 52 | Vitaliy 53 | Laurance 54 | dgozman 55 | Amber 56 | StanY 57 | ltdtl 58 | ymatsux 59 | iwi 60 | Huayang 61 | antimatter 62 | zibada 63 | LayCurse 64 | Vedensky 65 | OpenGL 66 | oberon 67 | igoro 68 | MB. 
69 | YangYi 70 | Vytis 71 | Savior 72 | austrin 73 | Qingchun 74 | linguo 75 | JongMan 76 | gusakov 77 | macs 78 | Prostu 79 | kp7 80 | ilyaraz 81 | Reid 82 | Bohua 83 | kitamasa 84 | pashka 85 | ssancho 86 | andersk 87 | Wataru 88 | anton.akhi 89 | nik239 90 | subra 91 | TripleM 92 | natalia 93 | almelv 94 | stone 95 | yuta.sawa 96 | Klinck 97 | Borisp 98 | -------------------------------------------------------------------------------- /Corpus/users/639102.txt: -------------------------------------------------------------------------------- 1 | Burunduk1 2 | winger 3 | Eryx 4 | RAVEman 5 | Gennady.Korotkevich 6 | nika 7 | eatmore 8 | pashka 9 | Vasyl 10 | jakubr 11 | meret 12 | ZhukovDmitry 13 | qizichao 14 | marek.cygan 15 | rng..58 16 | halyavin 17 | krijgertje 18 | linguo 19 | FloppyCat 20 | PaulJefferys 21 | mystic 22 | iwi 23 | wojteks 24 | ACRush 25 | elsantodel90 26 | Khuc.Anh.Tuan 27 | yuhch123 28 | marcina 29 | Gassa 30 | earl 31 | Stigius 32 | SergeyRogulenko 33 | Egor 34 | bmerry 35 | omeometo 36 | andrewzta 37 | Gluk 38 | hos.lyric 39 | Louty 40 | MrBald 41 | wata 42 | moon5ckq 43 | msg555 44 | darnley 45 | Milanin 46 | srh 47 | TripleM 48 | xhl.kogitsune 49 | xreborner 50 | ilyakor 51 | Dlougach 52 | alantian 53 | Orfest 54 | GlennMatthews 55 | onp 56 | Xhark 57 | iddaga 58 | gaoyihan 59 | SuZhan 60 | stgatilov 61 | Anton.Lunyov 62 | eMBe 63 | Lunarmony 64 | dzwiedziu 65 | lxx 66 | tikitikirevenge 67 | LiuKe 68 | Zhuojie 69 | blmarket 70 | ShangJingbo 71 | kangaroo 72 | Fumiya 73 | trebe 74 | Jonick 75 | ryuuga 76 | g201513 77 | zyz915 78 | Kimi.Arthur 79 | exod40 80 | paladin8 81 | kozikow 82 | Chmel.Tolstiy 83 | sdya 84 | misof 85 | Nerevar 86 | MrZZZ 87 | Nyatl 88 | ir5 89 | Seyaua 90 | dagon 91 | Alexus 92 | Reid 93 | Flex 94 | Palmtenor 95 | RAD. 
96 | Xazker 97 | pawelparys 98 | levlam 99 | defrager 100 | resty 101 | tomekkulczynski 102 | AS1 103 | ivan.popelyshev 104 | zhengzhao 105 | Maryann 106 | Tomato 107 | maojm 108 | Rydberg 109 | anrieff 110 | oldherl 111 | pperm 112 | adeymo 113 | vitar 114 | StevieT 115 | imabc 116 | joey2005 117 | Imba 118 | dolphinigle 119 | Vegetable 120 | vepifanov 121 | Progbeat 122 | vlad89 123 | dzhulgakov 124 | diver 125 | kia 126 | ralekseenkov 127 | hansonw 128 | dano 129 | RoBa 130 | natalia 131 | gkreitz 132 | mirac 133 | mrc88 134 | rafaeldaigo 135 | AEtheReal 136 | Sunny 137 | Pedro.Bello 138 | DKI 139 | Jed 140 | uwi 141 | ConanKudo247 142 | PavelKunyavskiy 143 | Joshik 144 | pasin30055 145 | forifchen 146 | Shinta 147 | maciejk 148 | zibada 149 | reiten 150 | ArtDitel 151 | Ra16bit 152 | izulin 153 | vigo.ph 154 | Hackson 155 | vot 156 | arti 157 | simonsyd 158 | AdrianKuegel 159 | jaehyunp 160 | guilherme 161 | imazato 162 | ituphanov 163 | GarnetCrow 164 | emaxx 165 | Michael.Levin 166 | JongMan 167 | wRabbits.nevidomy 168 | stone 169 | forest 170 | wRabbits.AlMag 171 | LinesPrower 172 | Ignat 173 | Al.Cash 174 | rotsor 175 | KAP 176 | fixme 177 | jzd 178 | skynet 179 | DAle 180 | kitamasa 181 | XiaoZiqian 182 | blando 183 | ebd 184 | humblefool 185 | .maXim. 
186 | slippy 187 | Romka 188 | ftc 189 | lympanda 190 | danielf 191 | CS.Ferng 192 | sisu 193 | snguyen.itim 194 | mohamedafattah 195 | pietrzkiewicz 196 | bwps 197 | gawry 198 | Ryan 199 | fero 200 | Psyho 201 | aleksey 202 | ashmelev 203 | Multifarious 204 | Wataru 205 | wz12 206 | eireksten 207 | kinaba 208 | Astein 209 | qwaker.00 210 | jdmetz 211 | janq 212 | GagGuy 213 | sidky 214 | wushuangyue 215 | those 216 | lewha0 217 | Steps09 218 | hs484 219 | wefgef 220 | TheLlama 221 | impetus 222 | chibby 223 | RalphFurmaniak 224 | dAnton 225 | anton.akhi 226 | txandi 227 | cmd 228 | sky58 229 | KOTEHOK 230 | Vitaliy 231 | jbernadas 232 | AlexLin 233 | beingryu 234 | ika 235 | cax 236 | sonyckson 237 | vvn 238 | imyourgod 239 | thocevar 240 | eXtreme 241 | SuBaRaSi 242 | LoRdTaPaKaH 243 | nik239 244 | cz.vx.bc 245 | yangzhe1990 246 | goober 247 | Lovro 248 | MiminoCoder 249 | damian.k 250 | try 251 | ggm 252 | raincole 253 | lxhgww 254 | syco 255 | tos.lunar 256 | Myth 257 | Gibon 258 | LayCurse 259 | polmauri 260 | supo 261 | ErrGe 262 | Atol 263 | r5insight 264 | reachnomind 265 | Onufry 266 | tanonev 267 | magicdlf 268 | Ostap 269 | Vedensky 270 | domeng 271 | aanastasov 272 | a9108 273 | Fdg 274 | xgy 275 | a70babat 276 | tantian 277 | abeln 278 | Qinz 279 | wudired 280 | cos 281 | K.A.D.R 282 | venco5 283 | biran0079 284 | kmjp 285 | tanakh 286 | daffes 287 | tkcn 288 | Yeomin 289 | Vytis 290 | arxor 291 | maold 292 | lydxlx 293 | Cheryl 294 | KirillB 295 | chokudai 296 | gojira 297 | samee 298 | Patrick.Nguyen 299 | Tarrasch 300 | AnhDT 301 | hopman 302 | pflueger 303 | Farmer.John 304 | Cai0715 305 | Fire 306 | pzielinski 307 | Isis 308 | zxytim 309 | lidaobing 310 | marcin.mucha 311 | MRain 312 | dante.ltw 313 | Plagapong 314 | eduardische 315 | jinlin 316 | kp7 317 | Therion 318 | hhb 319 | it3 320 | tsun 321 | whh 322 | TheRaven 323 | Dovgaluk 324 | Dumbear2 325 | theli 326 | BBuss 327 | foxlit 328 | xyx 329 | otis 330 | Hetman 331 | JosephWen 332 | AleX 
333 | Connector 334 | blunar 335 | Smylic 336 | WenX 337 | KuoE0 338 | asaveljevs 339 | ged 340 | JoeyScarr 341 | sfe 342 | pP5438 343 | Sergey.Bankevich 344 | abiczo 345 | mehdib 346 | Tsubosaka 347 | TPReal 348 | Nasa 349 | microsoft 350 | PavelKuznetsov 351 | kubus 352 | vexorian 353 | Sanny 354 | bayleef 355 | StephYDX 356 | Theorem 357 | Aerodonkey 358 | WhiteBear 359 | Pro.hessam 360 | pavelz 361 | Undead 362 | pr0ton 363 | AlexanderL 364 | momtchil 365 | MRoizner 366 | Robert.Rosolek 367 | foison 368 | SergeyMelnikov 369 | johny42 370 | LIBe 371 | -------------------------------------------------------------------------------- /Corpus/users/7214486.txt: -------------------------------------------------------------------------------- 1 | Gennady.Korotkevich 2 | eatmore 3 | sevenkplus 4 | mystic 5 | mk.al13n 6 | EgorKulikov 7 | kcm1700 8 | vepifanov 9 | dzhulgakov 10 | Romka 11 | hos.lyric 12 | Marcin.Smulewicz 13 | vlad89 14 | shik 15 | iwi 16 | DmitryEgorov 17 | kawatea 18 | exod40 19 | ivan.popelyshev 20 | sdya 21 | ffao 22 | fhlasek 23 | wuzhengkai 24 | KennyHORROR 25 | isea 26 | Kepnu4 27 | -------------------------------------------------------------------------------- /Corpus/users/801485.txt: -------------------------------------------------------------------------------- 1 | Egor 2 | krijgertje 3 | Burunduk1 4 | ACRush 5 | marek.cygan 6 | meret 7 | rng..58 8 | pashka 9 | iwi 10 | eatmore 11 | halyavin 12 | Eryx 13 | earl 14 | mystic 15 | RAVEman 16 | jakubr 17 | PaulJefferys 18 | SergeyRogulenko 19 | Vasyl 20 | FloppyCat 21 | bmerry 22 | linguo 23 | Khuc.Anh.Tuan 24 | elsantodel90 25 | -------------------------------------------------------------------------------- /Naive Baseline/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Naive Baseline/.gitignore: 
-------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /Naive Baseline/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Naive Baseline 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /Naive Baseline/src/ARFFFactory.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.IOException; 3 | import java.util.HashSet; 4 | import java.util.Iterator; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | import java.util.Map; 8 | import java.util.Set; 9 | import java.util.Stack; 10 | 11 | public class ARFFFactory { 12 | 13 | public static AbstractExtractor getExtractor(File f) throws IOException { 14 | AbstractExtractor x = null; 15 | if (f.getName().matches(".*\\.cpp")) { 16 | x = new ExtractorCPP(f); 17 | } else { 18 | x = new ExtractorC(f); 19 | } 20 | return x; 21 | } 22 | 23 | protected void appendAttributes(FeatureSet f, StringBuffer x) { 24 | x.append(f.numFunctions() + ","); 25 | x.append(f.length() + ","); 26 | x.append(f.numTokens() + ","); 27 | x.append(f.numComments() + ","); 28 | x.append(f.getLiterals().size() + ","); 29 | x.append(f.getReservedWords().size() + ","); 30 | x.append(f.avgLineLength() + ","); 31 | x.append(f.numEmptyLines() + ","); 32 | x.append(f.whiteSpaceRatio() + ","); 33 | x.append(f.avgParamsPerFunction() + ","); 34 | } 35 | 36 | public String getInstanceData(FeatureSet f, Set authors) { 37 | 38 | StringBuffer x = new StringBuffer(); 39 | // 40 | appendAttributes(f, x); 41 | // 42 | x.append(getAuthorName((AbstractExtractor) f) + "\n"); 43 | authors.add(getAuthorName((AbstractExtractor) f)); 44 | return x.toString(); 45 | 
// Util.writeFile(allLines, targetPath, true); 46 | } 47 | 48 | public static String getAuthorName(AbstractExtractor e) { 49 | File f = e.getFile(); 50 | // String s = f.getName(); 51 | // s = s.replaceFirst("p[\\d]+\\.", ""); 52 | // int i = s.lastIndexOf('.'); 53 | // s = s.replaceAll(",", ""); 54 | // return s.substring(0, i - 1); 55 | String s = f.getParentFile().getName(); 56 | return s.substring(0, s.length()); 57 | } 58 | 59 | public void makeARFF(String rootDirectory, String targetPath) { 60 | // recursively spider thru all c/cpp files and make into a list of files 61 | // call method below 62 | // throw new UnsupportedOperationException(); 63 | Stack files = new Stack(); 64 | List programs = new LinkedList(); 65 | File f = new File(rootDirectory); 66 | files.add(f); 67 | while (files.size() > 0) { 68 | File temp = files.pop(); 69 | for (File myFile : temp.listFiles()) { 70 | if (myFile.isDirectory()) { 71 | files.add(myFile); 72 | } else if (myFile.isFile()) { 73 | if (myFile.getName().matches(".*\\.c") 74 | || myFile.getName().matches(".*\\.cpp")) { 75 | programs.add(myFile); 76 | } 77 | } 78 | } 79 | } 80 | makeARFF(programs, targetPath); 81 | } 82 | 83 | public void makeARFF(List files, String targetPath) { 84 | Set authors = new HashSet<>(); 85 | List allLines = new LinkedList(); 86 | // for each file in the list, get instance data 87 | for (File f : files) { 88 | System.out.println(f.getAbsolutePath()); 89 | try { 90 | allLines.add(getInstanceData((FeatureSet) getExtractor(f), 91 | authors)); 92 | } catch (IOException e) { 93 | e.printStackTrace(); 94 | } 95 | } 96 | // call make arffheader 97 | makeARFFHeader(targetPath, authors); 98 | Util.writeFile(allLines, targetPath, true); 99 | System.out.println(authors.size() + " authors"); 100 | System.out.println(files.size() + " files"); 101 | } 102 | 103 | protected void arffAttributes(List allLines) { 104 | allLines.add("@attribute numFunctions numeric\n"); 105 | allLines.add("@attribute length 
numeric\n"); 106 | allLines.add("@attribute numTokens numeric\n"); 107 | allLines.add("@attribute numComments numeric\n"); 108 | allLines.add("@attribute numLiterals numeric\n"); 109 | allLines.add("@attribute numReservedWords numeric\n"); 110 | allLines.add("@attribute avgLineLength numeric\n"); 111 | allLines.add("@attribute numEmptyLines numeric\n"); 112 | allLines.add("@attribute whiteSpaceRatio numeric\n"); 113 | allLines.add("@attribute avgParams numeric\n"); 114 | } 115 | 116 | public void makeARFFHeader(String targetPath, Set authors) { 117 | // put @relation at top 118 | // put all the @attribute lines 119 | List allLines = new LinkedList(); 120 | allLines.add("@relation code_style\n\n"); 121 | // add all the @attributes 122 | arffAttributes(allLines); 123 | // 124 | allLines.add("@attribute author {"); 125 | Iterator author = authors.iterator(); 126 | while (author.hasNext()) { 127 | allLines.add(author.next()); 128 | if (author.hasNext()) { 129 | allLines.add(","); 130 | } 131 | } 132 | allLines.add("}\n\n@data\n"); 133 | Util.writeFile(allLines, targetPath, false); 134 | } 135 | 136 | public static double stdDev(Map mappy) { 137 | List list = new LinkedList(); 138 | for (Integer i : mappy.keySet()) { 139 | for (int j = 0; j < mappy.get(i); j++) { 140 | list.add(i); 141 | } 142 | } 143 | return stdDev(list); 144 | } 145 | 146 | public static double variance(List list) { 147 | int sum1 = 0; // E(x^2) 148 | int sum2 = 0; // E(x) 149 | double size = list.size(); 150 | for (Integer i : list) { 151 | sum1 += i * i; 152 | sum2 += i; 153 | } 154 | return (sum1 / size) - (sum2 / size) * (sum2 / size); 155 | } 156 | 157 | public static double stdDev(List list) { 158 | return Math.sqrt(variance(list)); 159 | } 160 | 161 | } -------------------------------------------------------------------------------- /Naive Baseline/src/ARFFFactory2.java: -------------------------------------------------------------------------------- 1 | import java.util.HashSet; 2 | import 
java.util.Iterator; 3 | import java.util.List; 4 | import java.util.Set; 5 | 6 | 7 | public class ARFFFactory2 extends ARFFFactory { 8 | 9 | protected Set instanceIDs = new HashSet<>(); 10 | 11 | @Override 12 | protected void appendAttributes(FeatureSet f, StringBuffer x) { 13 | x.append(((AbstractExtractor) f).getFile().getName() + ","); 14 | instanceIDs.add(((AbstractExtractor) f).getFile().getName()); 15 | 16 | x.append(f.numFunctions() + ","); 17 | x.append(f.length() + ","); 18 | x.append(f.numTokens() + ","); 19 | x.append(f.numComments() + ","); 20 | x.append(f.getLiterals().size() + ","); 21 | x.append(f.getReservedWords().size() + ","); 22 | x.append(f.avgLineLength() + ","); 23 | x.append(f.numEmptyLines() + ","); 24 | x.append(f.whiteSpaceRatio() + ","); 25 | x.append(f.avgParamsPerFunction() + ","); 26 | 27 | x.append(stdDev(f.lineLengths()) + ","); 28 | x.append(f.numMacros() + ","); 29 | x.append(("" + f.tabsLeadLines()).toUpperCase() + ","); // double check 30 | x.append(f.getWhiteSpace().get(WhiteSpace.tab) + ","); 31 | x.append(f.getWhiteSpace().get(WhiteSpace.space) + ","); 32 | x.append(stdDev(f.numFunctionParams()) + ","); 33 | x.append(f.getControlStructures().get(ControlStatement.ifStatement) + ","); 34 | x.append(f.getControlStructures().get(ControlStatement.elifStatement) + ","); 35 | x.append(f.getControlStructures().get(ControlStatement.elseStatement) + ","); 36 | x.append(f.getControlStructures().get(ControlStatement.switchStatement) + ","); 37 | x.append(f.getControlStructures().get(ControlStatement.ternaryOperator) + ","); 38 | x.append(f.getLoops().get(Loops.forLoop) + ","); 39 | x.append(f.getLoops().get(Loops.whileLoop) + ","); 40 | x.append(f.getLoops().get(Loops.doWhileLoop) + ","); 41 | x.append(("" + f.newLineBrace()).toUpperCase() + ","); 42 | } 43 | 44 | @Override 45 | protected void arffAttributes(List allLines) { 46 | allLines.add("@attribute instanceID {"); 47 | Iterator id = instanceIDs.iterator(); 48 | while (id.hasNext()) 
{ 49 | allLines.add(id.next()); 50 | if (id.hasNext()) { 51 | allLines.add(","); 52 | } 53 | } 54 | allLines.add("}\n"); 55 | 56 | allLines.add("@attribute numFunctions numeric\n"); 57 | allLines.add("@attribute length numeric\n"); 58 | allLines.add("@attribute numTokens numeric\n"); 59 | allLines.add("@attribute numComments numeric\n"); 60 | allLines.add("@attribute numLiterals numeric\n"); 61 | allLines.add("@attribute numReservedWords numeric\n"); 62 | allLines.add("@attribute avgLineLength numeric\n"); 63 | allLines.add("@attribute numEmptyLines numeric\n"); 64 | allLines.add("@attribute whiteSpaceRatio numeric\n"); 65 | allLines.add("@attribute avgParams numeric\n"); 66 | 67 | allLines.add("@attribute stdDevLineLength numeric\n"); 68 | allLines.add("@attribute numMacros numeric\n"); 69 | allLines.add("@attribute tabsLeadLines {TRUE, FALSE}\n"); 70 | allLines.add("@attribute numTabs numeric\n"); 71 | allLines.add("@attribute numSpaces numeric\n"); 72 | allLines.add("@attribute stdDevNumParams numeric\n"); 73 | allLines.add("@attribute numIf numeric\n"); 74 | allLines.add("@attribute numElif numeric\n"); 75 | allLines.add("@attribute numElse numeric\n"); 76 | allLines.add("@attribute numSwitch numeric\n"); 77 | allLines.add("@attribute numTernary numeric\n"); 78 | allLines.add("@attribute numFor numeric\n"); 79 | allLines.add("@attribute numWhile numeric\n"); 80 | allLines.add("@attribute numDo numeric\n"); 81 | allLines.add("@attribute newLineBeforeOpeningBrace {TRUE, FALSE}\n"); 82 | } 83 | } 84 | 85 | //number of functions 86 | //program length 87 | //number of tokens 88 | //number of comments 89 | //number of String/character/numeric literals 90 | //number of unique reserved words used 91 | //average length of lines 92 | //number of empty lines 93 | //the ratio of whitespace to text 94 | //average number of parameters per function 95 | 96 | //standard deviation of length of lines 97 | //number of macros 98 | //whether tabs precede lines (versus spaces) 99 | 
//number of tabs 100 | //number of spaces 101 | //standard deviation of number of parameters 102 | //number of "if" statements 103 | //number of "else if" statements 104 | //number of "else" statements 105 | //number of "switch" statements 106 | //number of ternary operators 107 | //number of "for" loops 108 | //number of "while" loops 109 | //number of "do-while" loops -------------------------------------------------------------------------------- /Naive Baseline/src/ARFFFactory3.java: -------------------------------------------------------------------------------- 1 | import java.util.Iterator; 2 | import java.util.List; 3 | 4 | 5 | public class ARFFFactory3 extends ARFFFactory2 { 6 | 7 | @Override 8 | protected void appendAttributes(FeatureSet f, StringBuffer x) { 9 | double len = f.length(); 10 | 11 | x.append(((AbstractExtractor) f).getFile().getName() + ","); 12 | instanceIDs.add(((AbstractExtractor) f).getFile().getName()); 13 | 14 | x.append(Math.log(f.numFunctions() / len) + ","); 15 | // x.append(f.length() + ","); 16 | x.append(Math.log(f.numTokens() / len) + ","); 17 | x.append(Math.log(f.numComments() / len) + ","); 18 | x.append(Math.log(f.getLiterals().size() / len) + ","); 19 | x.append(Math.log(f.getReservedWords().size() / len) + ","); 20 | x.append(f.avgLineLength() + ","); 21 | x.append(Math.log(f.numEmptyLines() / len) + ","); 22 | x.append(f.whiteSpaceRatio() + ","); 23 | x.append(f.avgParamsPerFunction() + ","); 24 | 25 | x.append(stdDev(f.lineLengths()) + ","); 26 | x.append(Math.log(f.numMacros() / len) + ","); 27 | x.append(("" + f.tabsLeadLines()).toUpperCase() + ","); // double check 28 | x.append(Math.log(f.getWhiteSpace().get(WhiteSpace.tab) / len) + ","); 29 | x.append(Math.log(f.getWhiteSpace().get(WhiteSpace.space) / len) + ","); 30 | x.append(stdDev(f.numFunctionParams()) + ","); 31 | x.append(Math.log(f.getControlStructures().get(ControlStatement.ifStatement) / len) + ","); 32 | 
x.append(Math.log(f.getControlStructures().get(ControlStatement.elifStatement) / len) + ","); 33 | x.append(Math.log(f.getControlStructures().get(ControlStatement.elseStatement) / len) + ","); 34 | x.append(Math.log(f.getControlStructures().get(ControlStatement.switchStatement) / len) + ","); 35 | x.append(Math.log(f.getControlStructures().get(ControlStatement.ternaryOperator) / len) + ","); 36 | x.append(Math.log(f.getLoops().get(Loops.forLoop) / len) + ","); 37 | x.append(Math.log(f.getLoops().get(Loops.whileLoop) / len) + ","); 38 | x.append(Math.log(f.getLoops().get(Loops.doWhileLoop) / len) + ","); 39 | x.append(("" + f.newLineBrace()).toUpperCase() + ","); 40 | } 41 | 42 | @Override 43 | protected void arffAttributes(List allLines) { 44 | allLines.add("@attribute instanceID {"); 45 | Iterator id = instanceIDs.iterator(); 46 | while (id.hasNext()) { 47 | allLines.add(id.next()); 48 | if (id.hasNext()) { 49 | allLines.add(","); 50 | } 51 | } 52 | allLines.add("}\n"); 53 | 54 | allLines.add("@attribute log(numFunctions/length) numeric\n"); 55 | // allLines.add("@attribute length numeric\n"); 56 | allLines.add("@attribute log(numTokens/length) numeric\n"); 57 | allLines.add("@attribute log(numComments/length) numeric\n"); 58 | allLines.add("@attribute log(numLiterals/length) numeric\n"); 59 | allLines.add("@attribute log(numReservedWords/length) numeric\n"); 60 | allLines.add("@attribute avgLineLength numeric\n"); 61 | allLines.add("@attribute log(numEmptyLines/length) numeric\n"); 62 | allLines.add("@attribute whiteSpaceRatio numeric\n"); 63 | allLines.add("@attribute avgParams numeric\n"); 64 | 65 | allLines.add("@attribute stdDevLineLength numeric\n"); 66 | allLines.add("@attribute log(numMacros/length) numeric\n"); 67 | allLines.add("@attribute tabsLeadLines {TRUE, FALSE}\n"); 68 | allLines.add("@attribute log(numTabs/length) numeric\n"); 69 | allLines.add("@attribute log(numSpaces/length) numeric\n"); 70 | allLines.add("@attribute stdDevNumParams 
numeric\n"); 71 | allLines.add("@attribute log(numIf/length) numeric\n"); 72 | allLines.add("@attribute log(numElif/length) numeric\n"); 73 | allLines.add("@attribute log(numElse/length) numeric\n"); 74 | allLines.add("@attribute log(numSwitch/length) numeric\n"); 75 | allLines.add("@attribute log(numTernary/length) numeric\n"); 76 | allLines.add("@attribute log(numFor/length) numeric\n"); 77 | allLines.add("@attribute log(numWhile/length) numeric\n"); 78 | allLines.add("@attribute log(numDo/length) numeric\n"); 79 | allLines.add("@attribute newLineBeforeOpeningBrace {TRUE, FALSE}\n"); 80 | } 81 | } -------------------------------------------------------------------------------- /Naive Baseline/src/ARFFFactory4.java: -------------------------------------------------------------------------------- 1 | import java.util.List; 2 | 3 | 4 | public class ARFFFactory4 extends ARFFFactory3 { 5 | 6 | @Override 7 | protected void appendAttributes(FeatureSet f, StringBuffer x) { 8 | super.appendAttributes(f, x); 9 | 10 | x.append(f.nestingDepth() + ","); 11 | x.append(f.branchingFactor() + ","); 12 | } 13 | 14 | @Override 15 | protected void arffAttributes(List allLines) { 16 | super.arffAttributes(allLines); 17 | 18 | allLines.add("@attribute nestingDepth numeric\n"); 19 | allLines.add("@attribute branchingFactor numeric\n"); 20 | } 21 | } -------------------------------------------------------------------------------- /Naive Baseline/src/AbstractExtractor.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.io.IOException; 5 | import java.util.HashMap; 6 | import java.util.Iterator; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Scanner; 11 | import java.util.Stack; 12 | 13 | /** 14 | * Two big assumptions: the code is valid and no silly macros 15 | * 16 | * @author Andrew Liu 
17 | * 18 | */ 19 | public abstract class AbstractExtractor implements FeatureSet { 20 | 21 | private File file; 22 | static String tokenDelimiter = "[*;\\{\\}\\[\\]()+=\\-&/|%!?:,<>~`\\s\"]"; 23 | MultiSet literals; 24 | List commentList; 25 | CodeBlock blocks; 26 | final String code; // source strippped of literals and comments 27 | int length = 0; 28 | int numWhiteSpaceChars = 0; 29 | List lines; 30 | 31 | public AbstractExtractor(File program) throws IOException { 32 | setTokenDelimiter(); // set what separates a token 33 | 34 | /* reading in the program contents into a StringBuffer */ 35 | this.file = program; 36 | BufferedReader reader = new BufferedReader(new FileReader(program)); 37 | StringBuffer source = new StringBuffer(); 38 | char nextChar; 39 | while (reader.ready()) { // TODO can extract features here 40 | this.length++; 41 | nextChar = (char) reader.read(); 42 | String charStr = "" + nextChar; 43 | if (charStr.matches("\\s")) { 44 | numWhiteSpaceChars++; 45 | } 46 | source.append(nextChar); 47 | } 48 | reader.close(); 49 | 50 | /* 51 | * stripping out the String, character, integer, and floating point 52 | * literals 53 | */ 54 | /* filtering out the comments as well */ 55 | this.literals = new MultiSet(); 56 | this.commentList = new LinkedList<>(); 57 | StringBuffer sink = new StringBuffer(); 58 | 59 | while (source.length() > 0) { 60 | if (matchesLiteral(source)) { 61 | // read in the literal 62 | this.literals.add(readNextLiteral(source)); 63 | } else if (matchesComment(source)) { 64 | // read in the comment 65 | this.commentList.add(readNextComment(source)); 66 | } else { 67 | // read in the code until after the next delimiter 68 | readUntilNextToken(source, sink); 69 | } 70 | } 71 | 72 | /* putting the leftover code back into the source */ 73 | source = sink; 74 | sink = new StringBuffer(); 75 | this.code = source.toString(); // setting the code without literals or 76 | // comments 77 | 78 | /* separating the code by blocks */ 79 | this.blocks 
= new CodeBlock(this.file.getName()); 80 | CodeBlock currentBlock = blocks; 81 | while (source.length() > 0) { 82 | if (isPrototype(source)) { 83 | // adding all statements into the previous block 84 | currentBlock.addStatements(breakIntoStmts(sink)); 85 | sink = new StringBuffer(); 86 | // creating a child block to use 87 | CodeBlock temp = new CodeBlock( 88 | extractPrototype(source)); 89 | currentBlock.addChild(temp); 90 | currentBlock = temp; 91 | } else if (isBlockEnd(source, sink)) { 92 | // adding all statements into the previous block 93 | currentBlock.addStatements(breakIntoStmts(sink)); 94 | sink = new StringBuffer(); 95 | // using the parent block 96 | currentBlock = currentBlock.getParent(); 97 | } else { 98 | readUntilNextToken(source, sink); 99 | } 100 | } 101 | Scanner sc = new Scanner(this.file); 102 | this.lines = new LinkedList(); 103 | while (sc.hasNextLine()) { 104 | this.lines.add(sc.nextLine()); 105 | } 106 | sc.close(); 107 | } 108 | 109 | /** 110 | * Implement this now or make a getter for the token delimiter. Also 111 | * remember to read in the delimiter itself! 
112 | * 113 | * @param source 114 | * @param sink 115 | */ 116 | abstract void readUntilNextToken(StringBuffer source, StringBuffer sink); 117 | 118 | abstract boolean matchesLiteral(StringBuffer source); 119 | 120 | abstract String readNextLiteral(StringBuffer source); 121 | 122 | abstract boolean matchesComment(StringBuffer source); 123 | 124 | abstract String readNextComment(StringBuffer source); 125 | 126 | abstract boolean isPrototype(StringBuffer source); 127 | 128 | /** 129 | * Don't forget to remove the opening delimiter of the next block 130 | * 131 | * @param source 132 | * @return 133 | */ 134 | abstract String extractPrototype(StringBuffer source); 135 | 136 | /** 137 | * Will put the "while" part into sink if it detects a do-while 138 | * 139 | * @param source 140 | * @param sink 141 | * @return 142 | */ 143 | abstract boolean isBlockEnd(StringBuffer source, StringBuffer sink); 144 | 145 | /** 146 | * Does NOT empty buffer when done. 147 | * 148 | * @param source 149 | * @return 150 | */ 151 | abstract List breakIntoStmts(StringBuffer source); 152 | 153 | static void setTokenDelimiter() { 154 | // override if you want 155 | } 156 | 157 | final void extractMultipleChars(StringBuffer source, StringBuffer sink, 158 | int num) { 159 | for (int i = 0; i < num; i++) { 160 | extractChar(source, sink); 161 | } 162 | } 163 | 164 | final void extractChar(StringBuffer source, StringBuffer sink) { 165 | sink.append(source.charAt(0)); 166 | source.deleteCharAt(0); 167 | } 168 | 169 | final char peek(StringBuffer source) { 170 | return source.charAt(0); 171 | } 172 | 173 | /** 174 | * Remember this eats up the regex char! 
175 | * 176 | * @param source 177 | * @param sink 178 | * @param regex 179 | */ 180 | final void readUntil(StringBuffer source, StringBuffer sink, String regex) { 181 | this.readBefore(source, sink, regex); 182 | if (this.peek(source) != '"') { 183 | this.extractChar(source, sink); 184 | } 185 | } 186 | 187 | /** 188 | * Same as readUntil except it doesn't eat the regex. 189 | * 190 | * @param source 191 | * @param sink 192 | * @param regex 193 | */ 194 | final void readBefore(StringBuffer source, StringBuffer sink, String regex) { 195 | while (source.length() > 1 && !source.substring(0, 1).matches(regex)) { 196 | this.extractChar(source, sink); 197 | } 198 | } 199 | 200 | static String getTokenDelimiter() { 201 | return tokenDelimiter; 202 | } 203 | 204 | public File getFile() { 205 | return this.file; 206 | } 207 | 208 | @Override 209 | public int nestingDepth() { 210 | return this.blocks.getHeight(); 211 | } 212 | 213 | @Override 214 | public double branchingFactor() { 215 | List numChildren = new LinkedList<>(); 216 | Stack> stack = new Stack<>(); 217 | stack.add(this.blocks); 218 | while (!stack.empty()) { 219 | CodeBlock myBlock = stack.pop(); 220 | if (myBlock.children.size() > 0) { 221 | numChildren.add(myBlock.children.size()); 222 | for (CodeBlock c : myBlock.children) { 223 | stack.add(c); 224 | } 225 | } 226 | } 227 | int sum = 0; 228 | double size = numChildren.size(); 229 | for (Integer i : numChildren) { 230 | sum += i; 231 | } 232 | return sum / size; 233 | } 234 | 235 | @Override 236 | public int numComments() { 237 | return this.commentList.size(); 238 | } 239 | 240 | @Override 241 | public List getComments() { 242 | return new LinkedList(this.commentList); 243 | } 244 | 245 | @Override 246 | public Map getLiterals() { 247 | return new HashMap(this.literals); 248 | } 249 | 250 | @Override 251 | public int length() { 252 | return this.length; 253 | } 254 | 255 | @Override 256 | public int numEmptyLines() { 257 | int count = 0; 258 | int bufferCount 
= 0; 259 | boolean leadingFlag = false; 260 | for (String line : this.lines) { 261 | if (line.matches("[\\s]*")) { 262 | if (leadingFlag) { 263 | bufferCount++; 264 | } 265 | } else { 266 | count += bufferCount; 267 | bufferCount = 0; 268 | leadingFlag = true; 269 | } 270 | } 271 | return count; 272 | } 273 | 274 | @Override 275 | public List lineLengths() { 276 | List lengths = new LinkedList(); 277 | for (String line : this.lines) { 278 | lengths.add(line.length()); 279 | } 280 | return lengths; 281 | } 282 | 283 | @Override 284 | public double avgLineLength() { 285 | int sum = 0; 286 | int count = 0; 287 | Iterator iter = this.lineLengths().iterator(); 288 | while (iter.hasNext()) { 289 | sum += iter.next(); 290 | count++; 291 | } 292 | return sum / (double) count; 293 | } 294 | 295 | @Override 296 | public double whiteSpaceRatio() { 297 | return this.length / (double) this.numWhiteSpaceChars; 298 | } 299 | 300 | @Override 301 | public boolean tabsLeadLines() { 302 | int tabs = 0; 303 | int spaces = 0; 304 | for (String s : this.code.split("\\n")) { 305 | if (s.matches("\\t.*")) { 306 | tabs++; 307 | } else if (s.matches(" .*")) { 308 | spaces++; 309 | } 310 | } 311 | return tabs >= spaces; 312 | } 313 | 314 | @Override 315 | public String instanceID() { 316 | return this.file.getName(); 317 | } 318 | 319 | @Override 320 | public Map getWhiteSpace() { 321 | MultiSet whitespace = new MultiSet(); 322 | whitespace.put(WhiteSpace.newLine, 0); 323 | whitespace.put(WhiteSpace.tab, 0); 324 | whitespace.put(WhiteSpace.space, 0); 325 | for (int i = 0; i < this.code.length(); i++) { 326 | if (this.code.charAt(i) == '\n') { 327 | whitespace.add(WhiteSpace.newLine); 328 | } else if (this.code.charAt(i) == '\t') { 329 | whitespace.add(WhiteSpace.tab); 330 | } else if (this.code.charAt(i) == ' ') { 331 | whitespace.add(WhiteSpace.space); 332 | } 333 | } 334 | return whitespace; 335 | } 336 | 337 | } 
-------------------------------------------------------------------------------- /Naive Baseline/src/CodeBlock.java: -------------------------------------------------------------------------------- 1 | import java.util.LinkedList; 2 | import java.util.List; 3 | 4 | /* 5 | * TODO 6 | * 7 | * This class is purposefully mutable. Reconsider the shallowness/deepness of some of the getters... 8 | * 9 | * Consider making a node inner class rather than recursively using this class. 10 | * 11 | * EDIT: This class probably shouldn't be purposefully mutable... 12 | */ 13 | 14 | /** 15 | * An data structure that resembles an n-ary tree. It represents blocks of code 16 | * and its nested blocks (and statements). 17 | * 18 | * @author Andrew Liu 19 | * 20 | * @param 21 | * The type of statement each block holds. Usually a String. 22 | */ 23 | public class CodeBlock { 24 | 25 | private String prototype; 26 | private List statements; 27 | List> children; 28 | private CodeBlock parent; 29 | 30 | /** 31 | * Default constructor. 32 | */ 33 | private CodeBlock() { 34 | this.parent = null; 35 | this.statements = new LinkedList(); 36 | this.children = new LinkedList>(); 37 | } 38 | 39 | /** 40 | * Constructor. 41 | * 42 | * @param prototype 43 | * The "prototype" for the block. It can be a function prototype, 44 | * or a class declaration, loop header, etc... 45 | */ 46 | public CodeBlock(String prototype) { 47 | this(); 48 | this.prototype = prototype; 49 | } 50 | 51 | /** 52 | * Copy constructor. 53 | * 54 | * @param copy 55 | * CodeBlock to copy. 56 | */ 57 | public CodeBlock(CodeBlock copy) { 58 | this(); 59 | this.parent = copy.parent; 60 | this.prototype = copy.prototype; 61 | this.addStatements(copy.statements); 62 | for (CodeBlock child : copy.getChildren()) { 63 | this.addChild(new CodeBlock(child)); 64 | } 65 | } 66 | 67 | /** 68 | * Gets the block's parent block. 69 | * 70 | * @return The parent block. 
71 | */ 72 | public CodeBlock getParent() { 73 | return this.parent; 74 | } 75 | 76 | /** 77 | * Changes the block's parent block. 78 | * 79 | * @param parent 80 | * The new parent block. 81 | */ 82 | public void setParent(CodeBlock parent) { 83 | this.parent = parent; 84 | } 85 | 86 | /** 87 | * Gets the prototype for this block. 88 | * 89 | * @return The block's prototype. 90 | */ 91 | public String getPrototype() { 92 | return this.prototype; 93 | } 94 | 95 | /** 96 | * Does a depth-first search to get the prototypes of this code block and 97 | * all child code blocks. 98 | * 99 | * @return All prototypes. 100 | */ 101 | public List getPrototypesRecursively() { 102 | List prototypes = new LinkedList(); 103 | prototypes.add(this.prototype); 104 | for (CodeBlock child : this.children) { 105 | prototypes.addAll(child.getPrototypesRecursively()); 106 | } 107 | return prototypes; 108 | } 109 | 110 | /** 111 | * Changes the block's prototype. 112 | * 113 | * @param prototype 114 | * The new prototype. 115 | */ 116 | public void setPrototype(String prototype) { 117 | this.prototype = prototype.trim(); 118 | } 119 | 120 | /** 121 | * Adds the statement to the list of statements for the block. 122 | * 123 | * @param statement 124 | * The statement to add. 125 | */ 126 | public void addStatement(T statement) { 127 | this.statements.add(statement); 128 | } 129 | 130 | /** 131 | * Adds multiple statements to the list of statements. 132 | * 133 | * @param statements 134 | * The list of statements to add. 135 | */ 136 | public void addStatements(List statements) { 137 | for (T statement : statements) { 138 | this.statements.add(statement); 139 | } 140 | } 141 | 142 | /** 143 | * Gets the list of statements for the current block. 144 | * 145 | * @return The list of statements for the current block. 
146 | */ 147 | public List getStatements() { 148 | return new LinkedList(this.statements); 149 | } 150 | 151 | /** 152 | * Gets a list of all statements for the block and its children depth-first. 153 | * 154 | * @return List of all statements. 155 | */ 156 | public List getStatementsRecursively() { 157 | List allStatements = this.getStatements(); 158 | for (CodeBlock child : this.children) { 159 | allStatements.addAll(child.getStatementsRecursively()); 160 | } 161 | return allStatements; 162 | } 163 | 164 | /** 165 | * Adds a new child to the code block. 166 | * 167 | * @param child 168 | * The new child. 169 | */ 170 | public void addChild(CodeBlock child) { 171 | this.children.add(child); 172 | child.parent = this; 173 | } 174 | 175 | /** 176 | * Gets the list of children for the code block. 177 | * 178 | * @return The list of code block children. 179 | */ 180 | public List> getChildren() { 181 | List> children = new LinkedList>(); 182 | for (CodeBlock block : this.children) { 183 | children.add(new CodeBlock(block)); 184 | } 185 | return children; 186 | } 187 | 188 | /** 189 | * Calculates the height of this tree structure. 190 | * 191 | * @return The height of the tree. 192 | */ 193 | public int getHeight() { 194 | int height = 1; 195 | int max = 0; 196 | for (CodeBlock child : this.children) { 197 | int subHeight = child.getHeight(); 198 | if (subHeight > max) { 199 | max = subHeight; 200 | } 201 | } 202 | return height + max; 203 | } 204 | 205 | /** 206 | * Gets the total number of code block children including the current block. 207 | * 208 | * @return The total number of nodes in the tree. 
209 | */ 210 | public int getTotalNumBlocks() { 211 | int total = 1; 212 | for (CodeBlock child : this.children) { 213 | total += child.getTotalNumBlocks(); 214 | } 215 | return total; 216 | } 217 | 218 | /* 219 | * (non-Javadoc) 220 | * 221 | * @see java.lang.Object#toString() 222 | */ 223 | public String toString() { 224 | return this.toStringAux().toString(); 225 | } 226 | 227 | private StringBuffer toStringAux() { 228 | StringBuffer ret = new StringBuffer("{"); 229 | ret.append(this.prototype); 230 | for (CodeBlock child : this.children) { 231 | ret.append(child.toStringAux()); 232 | } 233 | return ret.append("}"); 234 | } 235 | } -------------------------------------------------------------------------------- /Naive Baseline/src/ControlStatement.java: -------------------------------------------------------------------------------- 1 | public enum ControlStatement { 2 | 3 | ifStatement("if"), elifStatement("elif"), elseStatement("else"), switchStatement( 4 | "switch"), ternaryOperator("ternary"); 5 | 6 | private final String name; 7 | 8 | private ControlStatement(String name) { 9 | this.name = name; 10 | } 11 | 12 | @Override 13 | public String toString() { 14 | return this.name; 15 | } 16 | } -------------------------------------------------------------------------------- /Naive Baseline/src/Driver.java: -------------------------------------------------------------------------------- 1 | public class Driver { 2 | 3 | /** 4 | * Dependencies: Apache Commons IO, Util.java, ARFFFactory*.java, everything 5 | * else in the Naive-Baseline package 6 | */ 7 | public static void main(String args[]) { 8 | if (args.length != 2) { 9 | System.err 10 | .println("Usage: "); 11 | System.exit(1); 12 | } 13 | (new ARFFFactory4()).makeARFF(args[0], args[1]); 14 | 15 | for(int datasetNo=101; datasetNo<102; datasetNo++){ 16 | args[0] ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAA_Datasets/" 17 | + "forMallory/mallory_new_SFS/malloryDataset_"+datasetNo+"/"; 18 | args[1] 
="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/mallory_150/malloryDataset_andrew_"+datasetNo+".arff"; 19 | (new ARFFFactory4()).makeARFF(args[0], args[1]); 20 | } 21 | } 22 | } -------------------------------------------------------------------------------- /Naive Baseline/src/ExtractorC.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.IOException; 3 | import java.util.Arrays; 4 | import java.util.HashSet; 5 | import java.util.Iterator; 6 | import java.util.LinkedList; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.Set; 10 | 11 | public class ExtractorC extends AbstractExtractor { 12 | 13 | protected Set reservedWords; 14 | 15 | public ExtractorC(File program) throws IOException { 16 | super(program); 17 | this.prepareReservedWords(); 18 | } 19 | 20 | protected void prepareReservedWords() { 21 | this.reservedWords = new HashSet(); 22 | for (String s : ReservedC.reservedWords) { 23 | this.reservedWords.add(s); 24 | } 25 | } 26 | 27 | @Override 28 | void readUntilNextToken(StringBuffer source, StringBuffer sink) { 29 | this.readUntil(source, sink, tokenDelimiter); 30 | } 31 | 32 | @Override 33 | boolean matchesLiteral(StringBuffer source) { 34 | return source.charAt(0) == '"' || source.charAt(0) == '\'' 35 | || source.toString().matches("[\\d]+[\\w\\W]*") 36 | || source.toString().matches("[.][\\d]+[\\w\\W]*"); 37 | } 38 | 39 | @Override 40 | String readNextLiteral(StringBuffer source) { 41 | StringBuffer sink = new StringBuffer(); 42 | if (source.charAt(0) == '"') { 43 | // strings 44 | this.extractChar(source, sink); // get opening quote 45 | char prev = '\0'; 46 | char next; 47 | while (source.length() > 0) { 48 | next = source.charAt(0); 49 | this.extractChar(source, sink); 50 | if (prev != '\\' && next == '"') { 51 | break; 52 | } 53 | prev = next; 54 | } 55 | } else if (source.charAt(0) == '\'') { 56 | // characters 57 | if (source.charAt(1) 
== '\\') { 58 | this.extractMultipleChars(source, sink, 4); 59 | } else { 60 | this.extractMultipleChars(source, sink, 3); 61 | } 62 | } else { 63 | // numbers 64 | this.readBefore(source, sink, "\\D"); 65 | if (source.charAt(0) == 'l' || source.charAt(0) == 'L') { 66 | this.extractChar(source, sink); 67 | } else if (source.charAt(0) == '.') { 68 | // is a floating point number 69 | this.extractChar(source, sink); 70 | this.readBefore(source, sink, "\\D"); 71 | if (source.charAt(0) == 'f' || source.charAt(0) == 'F') { 72 | this.extractChar(source, sink); 73 | } 74 | } 75 | } 76 | return sink.toString(); 77 | } 78 | 79 | @Override 80 | boolean matchesComment(StringBuffer source) { 81 | return source.length() >= 2 82 | && (source.substring(0, 2).equals("//") || source.substring(0, 83 | 2).equals("/*")); 84 | } 85 | 86 | @Override 87 | String readNextComment(StringBuffer source) { 88 | StringBuffer sink = new StringBuffer(); 89 | if (source.substring(0, 2).equals("//")) { 90 | this.readUntil(source, sink, "\n"); 91 | } else { 92 | int endIndex = source.toString().indexOf("*/") + 2; 93 | this.extractMultipleChars(source, sink, endIndex); 94 | } 95 | return sink.toString(); 96 | } 97 | 98 | @Override 99 | boolean isPrototype(StringBuffer source) { 100 | String s = source.toString(); 101 | if (s.matches(".*\\{[\\w\\W]*") || s.matches(".*\\n\\{[\\w\\W]*")) { 102 | return true; 103 | } 104 | if (s.matches("for[\\w\\W]*") || s.matches("while[\\w\\W]*") 105 | || s.matches("do [\\w\\W]*") || s.matches("struct[\\w\\W]*") 106 | || s.matches("if[\\w\\W]*") || s.matches("else[\\w\\W]*") 107 | || s.matches("switch[\\w\\W]*")) { 108 | return true; // notice the space after the "do" regex (avoids 109 | // matching "double" 110 | } 111 | if (s.matches("static[\\w\\W]*") || s.matches("extern[\\w\\W]*") 112 | || s.matches("unsigned[\\w\\W]*") 113 | || s.matches("signed[\\w\\W]*") || s.matches("char[\\w\\W]*") 114 | || s.matches("short[\\w\\W]*") || s.matches("int[\\w\\W]*") 115 | || 
s.matches("long[\\w\\W]*") || s.matches("float[\\w\\W]*") 116 | || s.matches("double[\\w\\W]*") || s.matches("enum[\\w\\W]*") 117 | || s.matches("typedef[\\w\\W]*") 118 | || s.matches("register[\\w\\W]*") 119 | || s.matches("union[\\w\\W]*") || s.matches("void[\\w\\W]*")) { 120 | int braceIndex = s.indexOf('{'); 121 | int semicolonIndex = s.indexOf(';'); 122 | if (braceIndex == -1) { 123 | return false; 124 | } 125 | if (semicolonIndex == -1) { 126 | return true; 127 | } 128 | return braceIndex < semicolonIndex; 129 | } 130 | return false; 131 | } 132 | 133 | @Override 134 | String extractPrototype(StringBuffer source) { 135 | StringBuffer sink = new StringBuffer(); 136 | 137 | String s = source.toString(); 138 | if (s.matches("for[\\w\\W]*") || s.matches("while[\\w\\W]*") 139 | || s.matches("do [\\w\\W]*") || s.matches("struct[\\w\\W]*") 140 | || s.matches("if[\\w\\W]*") || s.matches("else[\\w\\W]*") 141 | || s.matches("switch[\\w\\W]*")) { 142 | int lineIndex = s.indexOf("\n"); 143 | int braceIndex = s.indexOf("{"); 144 | if (braceIndex == -1 || braceIndex < lineIndex 145 | || s.substring(lineIndex, braceIndex).matches("[\\s]*")) { 146 | this.readBefore(source, sink, "\\n"); 147 | return sink.toString(); 148 | } 149 | } 150 | 151 | this.readUntil(source, sink, "\\{"); 152 | return sink.substring(0, sink.length() - 1); // we don't want to 153 | // include 154 | // the '{' 155 | } 156 | 157 | @Override 158 | boolean isBlockEnd(StringBuffer source, StringBuffer sink) { 159 | if (source.charAt(0) == '}') { 160 | source.deleteCharAt(0); // get rid of the '}' 161 | if (source.length() > 0 && source.charAt(0) == ';') { 162 | source.deleteCharAt(0); // get rid of the ';' after the '}' 163 | } else if (source.length() > 0 && source.toString().matches("[\\s]*while")) { 164 | // in case of a do-while 165 | int semicolonIndex = source.indexOf(";"); 166 | this.extractMultipleChars(source, sink, semicolonIndex + 1); 167 | } 168 | return true; 169 | } 170 | return false; 171 | 
} 172 | 173 | @Override 174 | List breakIntoStmts(StringBuffer source) { 175 | List stmts = new LinkedList(); 176 | List fragments = Arrays.asList(source.toString() 177 | .split("[\\n;]")); 178 | Iterator iter = fragments.iterator(); 179 | while (iter.hasNext()) { 180 | String s = iter.next(); 181 | if (s.matches("[\\s]*")) { 182 | continue; 183 | } 184 | stmts.add(s.trim()); 185 | } 186 | return stmts; 187 | } 188 | 189 | @Override 190 | public boolean newLineBrace() { 191 | int onLineBrace = 0; 192 | int newLineBrace = 0; 193 | for (String s : this.code.split("\\{")) { 194 | if (s.length() == 0) { 195 | continue; 196 | } 197 | if (s.charAt(s.length() - 1) == '\n') { 198 | newLineBrace++; 199 | } else { 200 | onLineBrace++; 201 | } 202 | } 203 | return newLineBrace >= onLineBrace; 204 | } 205 | 206 | @Override 207 | public int numFunctions() { 208 | int count = 0; 209 | for (String s : this.blocks.getPrototypesRecursively()) { 210 | if (isFunction(s)) { // need to double check 211 | count++; 212 | } 213 | } 214 | return count; 215 | } 216 | 217 | protected static boolean isFunction(String s) { 218 | return !(s.matches("for[\\w\\W]*") || s.matches("while[\\w\\W]*") 219 | || s.matches("do [\\w\\W]*") || s.matches("struct[\\w\\W]*") 220 | || s.matches("if[\\w\\W]*") || s.matches("else[\\w\\W]*") 221 | || s.matches("switch[\\w\\W]*") || s.matches("enum[\\w\\W]*") 222 | || s.matches("typedef[\\w\\W]*") 223 | || s.matches("register[\\w\\W]*") || s 224 | .matches("union[\\w\\W]*")); 225 | } 226 | 227 | @Override 228 | public int numTokens() { 229 | return this.code.split(tokenDelimiter).length; 230 | } // need to double check 231 | 232 | @Override 233 | public Map getReservedWords() { 234 | MultiSet reservedWords = new MultiSet<>(); 235 | String[] tokens = this.code.split(tokenDelimiter); 236 | for (String token : tokens) { 237 | if (this.reservedWords.contains(token)) { 238 | reservedWords.add(token); 239 | } 240 | } 241 | return reservedWords; 242 | } 243 | 244 | 
@Override 245 | public Map getUserDefinedWords() { 246 | MultiSet reservedWords = new MultiSet<>(); 247 | String[] tokens = this.code.split(tokenDelimiter); 248 | for (String token : tokens) { 249 | if (!this.reservedWords.contains(token)) { 250 | reservedWords.add(token); 251 | } 252 | } 253 | return reservedWords; 254 | } 255 | 256 | @Override 257 | public Map getLoops() { 258 | MultiSet myLoops = new MultiSet<>(); 259 | myLoops.put(Loops.doWhileLoop, 0); 260 | myLoops.put(Loops.forLoop, 0); 261 | myLoops.put(Loops.whileLoop, 0); 262 | for (String s : this.blocks.getPrototypesRecursively()) { 263 | if (s.matches("do [\\w\\W]*")) { 264 | myLoops.add(Loops.doWhileLoop); 265 | } else if (s.matches("for [\\w\\W]*")) { 266 | myLoops.add(Loops.forLoop); 267 | } else if (s.matches("while [\\w\\W]*")) { 268 | myLoops.add(Loops.whileLoop); 269 | } 270 | } 271 | return myLoops; 272 | } 273 | 274 | @Override 275 | public Map getControlStructures() { 276 | MultiSet myControls = new MultiSet<>(); 277 | myControls.put(ControlStatement.elifStatement, 0); 278 | myControls.put(ControlStatement.elseStatement, 0); 279 | myControls.put(ControlStatement.ifStatement, 0); 280 | myControls.put(ControlStatement.switchStatement, 0); 281 | myControls.put(ControlStatement.ternaryOperator, 0); 282 | for (String s : this.blocks.getPrototypesRecursively()) { 283 | if (s.matches("else if[\\w\\W]*")) { 284 | myControls.add(ControlStatement.elifStatement); 285 | } else if (s.matches("else [\\w\\W]*")) { 286 | myControls.add(ControlStatement.elseStatement); 287 | } else if (s.matches("if [\\w\\W]*")) { 288 | myControls.add(ControlStatement.ifStatement); 289 | } else if (s.matches("switch [\\w\\W]*")) { 290 | myControls.add(ControlStatement.switchStatement); 291 | } 292 | } 293 | // get ternaries by splitting via "?" 
294 | myControls.put(ControlStatement.ternaryOperator, this.code.split("\\?").length - 1); 295 | return myControls; 296 | } 297 | 298 | @Override 299 | public Map numFunctionParams() { 300 | MultiSet params = new MultiSet<>(); 301 | for (String s : this.blocks.getPrototypesRecursively()) { 302 | if (!isFunction(s)) { 303 | continue; 304 | } 305 | String[] s2 = s.split(","); 306 | params.add(s2.length - 1); 307 | } 308 | return params; 309 | } 310 | 311 | @Override 312 | public double avgParamsPerFunction() { 313 | Map params = this.numFunctionParams(); 314 | Set keys = params.keySet(); 315 | int totalParams = 0; 316 | for (Integer key : keys) { 317 | totalParams += key * params.get(key); 318 | } 319 | return totalParams / (double) this.numFunctions(); 320 | } 321 | 322 | @Override 323 | public Map getVariableLocality() { 324 | // check var in nary tree with its tree depth 325 | // TODO Auto-generated method stub 326 | throw new UnsupportedOperationException(); 327 | } 328 | 329 | @Override 330 | public int numMacros() { 331 | int count = 0; 332 | for (String s : this.code.split("\\n")) { 333 | if (s.matches("#.*")) { 334 | count++; 335 | } 336 | } 337 | return count; 338 | } 339 | 340 | } -------------------------------------------------------------------------------- /Naive Baseline/src/ExtractorCPP.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.IOException; 3 | import java.util.HashSet; 4 | 5 | public class ExtractorCPP extends ExtractorC { 6 | 7 | public ExtractorCPP(File program) throws IOException { 8 | super(program); 9 | } 10 | 11 | @Override 12 | protected void prepareReservedWords() { 13 | this.reservedWords = new HashSet(); 14 | for (String s : ReservedCPP.reservedWords) { 15 | this.reservedWords.add(s); 16 | } 17 | } 18 | 19 | @Override 20 | boolean isPrototype(StringBuffer source) { 21 | String s = source.toString(); 22 | 23 | if (s.matches(".*\\{[\\w\\W]*") || 
s.matches(".*\\n\\{[\\w\\W]*")) { 24 | return true; 25 | } 26 | 27 | if (s.matches("for[\\w\\W]*") || s.matches("while[\\w\\W]*") 28 | || s.matches("do [\\w\\W]*") || s.matches("struct[\\w\\W]*") 29 | || s.matches("if[\\w\\W]*") || s.matches("else[\\w\\W]*") 30 | || s.matches("switch[\\w\\W]*")) { 31 | return true; // notice the space after the "do" regex (avoids 32 | // matching "double" 33 | } 34 | if (s.matches("static[\\w\\W]*") || s.matches("extern[\\w\\W]*") 35 | || s.matches("unsigned[\\w\\W]*") 36 | || s.matches("signed[\\w\\W]*") || s.matches("char[\\w\\W]*") 37 | || s.matches("short[\\w\\W]*") || s.matches("int[\\w\\W]*") 38 | || s.matches("long[\\w\\W]*") || s.matches("float[\\w\\W]*") 39 | || s.matches("double[\\w\\W]*") || s.matches("enum[\\w\\W]*") 40 | || s.matches("typedef[\\w\\W]*") 41 | || s.matches("register[\\w\\W]*") 42 | || s.matches("union[\\w\\W]*") || s.matches("void[\\w\\W]*") 43 | || s.matches("char16_t[\\w\\W]*") 44 | || s.matches("char32_t[\\w\\W]*") 45 | || s.matches("wchar_t[\\w\\W]*") || s.matches("bool[\\w\\W]*")) { 46 | int braceIndex = s.indexOf('{'); 47 | int semicolonIndex = s.indexOf(';'); 48 | if (braceIndex == -1) { 49 | return false; 50 | } 51 | if (semicolonIndex == -1) { 52 | return true; 53 | } 54 | return braceIndex < semicolonIndex; 55 | } 56 | return false; 57 | } 58 | 59 | } -------------------------------------------------------------------------------- /Naive Baseline/src/FeatureSet.java: -------------------------------------------------------------------------------- 1 | import java.util.List; 2 | import java.util.Map; 3 | 4 | public interface FeatureSet { 5 | 6 | /** 7 | * Shows if the code is of the style "[stmt] {\n" or "[stmt]\n{\n". 
*
     * @return presumably {@code true} for the "[stmt]\n{\n" style - confirm
     *         against the implementing classes.
     */
    public boolean newLineBrace();

    public int numFunctions();

    public int nestingDepth();

    public double branchingFactor();

    public int length();

    public int numTokens();

    public int numComments();

    // NOTE(review): the element types of the raw List/Map results below could
    // not be recovered from this file; confirm them against the implementation.
    public List getComments();

    public Map getLiterals();

    public Map<String, Integer> getReservedWords();

    public Map<String, Integer> getUserDefinedWords();

    public Map<Loops, Integer> getLoops();

    public List lineLengths();

    public double avgLineLength();

    /**
     * Map each control structure to the number of times it occurs.
     *
     * @return
     */
    public Map<ControlStatement, Integer> getControlStructures();

    /**
     * Does not count leading and trailing empty lines
     *
     * @return
     */
    public int numEmptyLines();

    public double whiteSpaceRatio();

    public Map<Integer, Integer> numFunctionParams();

    public double avgParamsPerFunction();

    public Map getVariableLocality();

    public Map getWhiteSpace();

    public String instanceID();

    // public Map caseDistr();//////////////

    public int numMacros();

    public boolean tabsLeadLines();

    // ++ vs += 1
    // spaces vs tabs
    // x=1 vs x = 1
    // variable names
}
-------------------------------------------------------------------------------- /Naive Baseline/src/Loops.java: --------------------------------------------------------------------------------
/**
 * The loop kinds that are counted as features. {@link #toString()} returns
 * the keyword given to the constant.
 */
public enum Loops {

    forLoop("for"), doWhileLoop("do"), whileLoop("while");

    private final String name;

    private Loops(String name) {
        this.name = name;
    }

    @Override
    public String toString() {
        return this.name;
    }
}
-------------------------------------------------------------------------------- /Naive Baseline/src/MultiSet.java: --------------------------------------------------------------------------------

import java.util.HashMap;
import java.util.Set;

/**
 * A data structure emulating a set that counts the number of repeated elements.
 *
 * @author Andrew Liu
 *
 * @param <T> The type of element the MultiSet holds.
 */
public class MultiSet<T> extends HashMap<T, Integer> {

    /**
     *
     */
    private static final long serialVersionUID = 1L;

    /**
     * Default constructor.
     */
    public MultiSet() {
    }

    /**
     * Copy constructor.
     *
     * @param copy MultiSet to copy.
     */
    public MultiSet(MultiSet<T> copy) {
        super(copy);
    }

    /**
     * Adds an element to the MultiSet, or increments its count by one if the element already exists.
     *
     * @param key The element to add.
     */
    public void add(T key) {
        // one lookup instead of containsKey + get
        Integer count = this.get(key);
        this.put(key, count == null ? 1 : count + 1);
    }

    /*
     * (non-Javadoc)
     *
     * @see java.util.AbstractMap#toString()
     */
    @Override
    public String toString() {
        // one "key=count" line per element
        StringBuilder s = new StringBuilder();
        for (java.util.Map.Entry<T, Integer> e : this.entrySet()) {
            s.append(e.toString()).append('\n');
        }
        return s.toString();
    }
}
-------------------------------------------------------------------------------- /Naive Baseline/src/ReservedC.java: --------------------------------------------------------------------------------
/**
 * The words that {@link ExtractorC} treats as reserved in C code.
 */
public class ReservedC {
    public static final String[] reservedWords = { "auto", "break", "case",
            "char", "continue", "default", "do", "double", "else", "entry",
            "extern", "float", "for", "goto", "if", "int", "long", "register",
            "return", "short", "sizeof", "static", "struct", "switch",
            "typedef", "union", "unsigned", "while", "enum", "void", "const",
            "signed", "volatile" };
}
-------------------------------------------------------------------------------- /Naive Baseline/src/ReservedCPP.java: --------------------------------------------------------------------------------
/**
 * The words that ExtractorCPP loads as its set of reserved words for C++ code.
 */
public class ReservedCPP {

    public static final String[] reservedWords = { "alignas", "alignof", "and",
            "and_eq", "asm", "auto", "bitand", "bitor", "bool", "break",
            "case", "catch", "char", "char16_t", "char32_t", "class", "compl",
            "const", "constexpr", "const_cast", "continue", "decltype",
            "default", "delete", "do", "double", "dynamic_cast", "else",
            "enum", "explicit", "export", "extern", "false", "float", "for",
            "friend", "goto", "if", "inline", "int", "long", "mutable",
            "namespace", "new", "noexcept", "not", "not_eq", "nullptr",
            "operator", "or", "or_eq", "private", "protected", "public",
            "register", "reinterpret_cast", "return", "short", "signed",
            "sizeof", "static", "static_assert", "static_cast", "struct",
            "switch", "template", "this", "thread_local", "throw", "true",
            "try", "typedef", "typeid", "typename", "union", "unsigned",
            "using", "virtual", "void", "volatile", "wchar_t", "while", "xor",
            "xor_eq" };
}
-------------------------------------------------------------------------------- /Naive Baseline/src/WhiteSpace.java: --------------------------------------------------------------------------------

/**
 * The whitespace characters that are tracked. toString() returns the
 * character written as a quoted literal, e.g. '\t' for the tab.
 */
public enum WhiteSpace {
    space("' '"), tab("'\\t'"), newLine("'\\n'");

    // printable form of the character, returned by toString()
    private final String name;

    private WhiteSpace(String name) {
        this.name = name;
    }

    @Override
    public String toString() {
        return this.name;
    }
}
-------------------------------------------------------------------------------- /Naive Baseline/src/c_reserved_words.txt: --------------------------------------------------------------------------------
auto
break
case
char
continue
default
do
double
else
entry
extern
12 | float 13 | for 14 | goto 15 | if 16 | int 17 | long 18 | register 19 | return 20 | short 21 | sizeof 22 | static 23 | struct 24 | switch 25 | typedef 26 | union 27 | unsigned 28 | while 29 | enum 30 | void 31 | const 32 | signed 33 | volatile -------------------------------------------------------------------------------- /Naive Baseline/src/cpp_reserved_words.txt: -------------------------------------------------------------------------------- 1 | alignas 2 | alignof 3 | and 4 | and_eq 5 | asm 6 | auto 7 | bitand 8 | bitor 9 | bool 10 | break 11 | case 12 | catch 13 | char 14 | char16_t 15 | char32_t 16 | class 17 | compl 18 | const 19 | constexpr 20 | const_cast 21 | continue 22 | decltype 23 | default 24 | delete 25 | do 26 | double 27 | dynamic_cast 28 | else 29 | enum 30 | explicit 31 | export 32 | extern 33 | false 34 | float 35 | for 36 | friend 37 | goto 38 | if 39 | inline 40 | int 41 | long 42 | mutable 43 | namespace 44 | new 45 | noexcept 46 | not 47 | not_eq 48 | nullptr 49 | operator 50 | or 51 | or_eq 52 | private 53 | protected 54 | public 55 | register 56 | reinterpret_cast 57 | return 58 | short 59 | signed 60 | sizeof 61 | static 62 | static_assert 63 | static_cast 64 | struct 65 | switch 66 | template 67 | this 68 | thread_local 69 | throw 70 | true 71 | try 72 | typedef 73 | typeid 74 | typename 75 | union 76 | unsigned 77 | using 78 | virtual 79 | void 80 | volatile 81 | wchar_t 82 | while 83 | xor 84 | xor_eq -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SCAA 2 | ==== 3 | Runs joern on testCode, writes joern-tools script output to text files for each testCode file, 4 | extracts features from the text files to create an arff file that can be used in WEKA for machine learning. 5 | 6 | (This project requires the development branches of joern and python-joern, and also joern-tools to be set up. 
These three git repositories have dependencies and come with thorough documentation.) 7 | 8 | 1) Do preprocessing for all files in the directory structure, year -> author name -> all_cpp_files_ofauthor 9 | run preprocessDataToTXTdepAST(filePath) in FeatureCalculators.java. test_cpp_dir has all the cpp files of an author. Check that all dep, txt, and ast files are created correctly. (E.g., if a cpp file has only comments and no code, the dep, txt and ast files will be empty. Exclude such cases from authorship attribution.) If you only want syntactic features, change the following lines of astLabel.py in joern-tools: 10 | 11 | if len(children) == 0: 12 | node.attr['label'] = attrDict['node'] 13 | else: 14 | node.attr['label'] = attrDict['type'] 15 | 16 | to 17 | 18 | 19 | if len(children) == 0: 20 | node.attr['label'] = attrDict['type'] 21 | else: 22 | node.attr['label'] = attrDict['type'] 23 | 24 | 2) Start writing the attribute declaration to the arff file (this writes the relation, the selected attributes and finally @attribute 'authorName' {cyg4ever,darkKelvin, ....} after getting all the author names). The last attribute defines your test classes. 25 | After preprocessing, run the main method in FeatureExtractor.java 26 | test_dir has all the .txt files written by joern; it can be the same as test_cpp_dir 27 | output_filename is your arff file path 28 | If you want only syntactic features from the syntactic dataset that has only node types, make sure to select the correct ASTTypes in FeatureExtractor.java 29 | 30 | 3) Extract features: from all text files in the directory structure, year -> author name -> all_txt_files_ofauthor (output from joern), extract the desired features to be written to feature vectors. In order to extract some layout and other lexical features, run Driver.java in Naive Baseline. If you want to merge the arffs from the feature extractor and the driver, run MergeArffFiles.java. (The instance order is important; modify the code accordingly.) 
31 | 32 | 4)Once the arff file is written, open it in WEKA or call WEKA from java and use the necessary classifiers, and attribute selection methods to do authorship attribution. AuthorClassification.java can also be used with a random forest and relaxed attribution. 33 | 34 | SCAA 35 | -------------------------------------------------------------------------------- /SCAA/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /SCAA/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /SCAA/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | SCAA 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /SCAA/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /SCAA/commons-exec-1.2.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/commons-exec-1.2.jar -------------------------------------------------------------------------------- /SCAA/commons-lang3-3.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/commons-lang3-3.3.2.jar -------------------------------------------------------------------------------- /SCAA/commons.io_2.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/commons.io_2.0.1.jar -------------------------------------------------------------------------------- /SCAA/javacsv.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/javacsv.jar -------------------------------------------------------------------------------- /SCAA/src/AuthorClassification.java: -------------------------------------------------------------------------------- 1 | import weka.attributeSelection.InfoGainAttributeEval; 2 | import weka.attributeSelection.Ranker; 3 | import weka.classifiers.*; 4 | import weka.classifiers.evaluation.ThresholdCurve; 5 | import weka.classifiers.meta.FilteredClassifier; 6 | import weka.classifiers.trees.RandomForest; 7 | import weka.core.Attribute; 8 | import weka.core.Instances; 9 | import weka.core.Range; 10 | import weka.core.Utils; 11 | import weka.filters.Filter; 12 | import weka.filters.supervised.attribute.AttributeSelection; 13 | import weka.filters.unsupervised.attribute.Remove; 14 | import weka.filters.unsupervised.instance.RemoveRange; 15 | import 
weka.filters.unsupervised.instance.RemoveWithValues; 16 | 17 | import java.io.BufferedWriter; 18 | import java.io.FileReader; 19 | import java.io.FileWriter; 20 | import java.util.*; 21 | 22 | public class AuthorClassification { 23 | 24 | 25 | public static void main(String[] args) throws Exception 26 | { 27 | double accuracy=0; 28 | int endRelax = 1; 29 | int numberFiles; 30 | int numFeatures=0; //0 is the default logM+1 31 | int seedNumber; 32 | double total =0; 33 | double average =0; 34 | 35 | String fileName ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/Results/AutomatedResults/" 36 | + "mallory/"+"mallory_CSFS_new.txt"; 37 | for(int authorNo=6; authorNo<=54; authorNo+=1){ 38 | for(numberFiles=9; numberFiles<10; numberFiles++){ 39 | String arffFile = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/" 40 | + "mallory_150/CSFS/" +"mallory_CSFS_"+authorNo+".arff" ; 41 | 42 | Util.writeFile(numberFiles+"FilesPerAuthor: \n",fileName, true); 43 | for(int relaxPar = 1; relaxPar<=endRelax; relaxPar++){ 44 | total=0; 45 | average=0; 46 | 47 | for(seedNumber=1; seedNumber<2; seedNumber++){ 48 | int foldNumber=numberFiles; 49 | 50 | 51 | 52 | RandomForest cls = new RandomForest(); 53 | Instances data = new Instances(new FileReader(arffFile)); 54 | data.setClassIndex(data.numAttributes() - 1); 55 | //do not stratify if you are going to remove instances for training and testing 56 | // data.stratify(foldNumber); 57 | 58 | 59 | /* //Start information gain that selects up to 200 features that have nonzero infogain 60 | int n = 200; // number of features to select 61 | AttributeSelection attributeSelection = new AttributeSelection(); 62 | Ranker ranker = new Ranker(); 63 | ranker.setNumToSelect(n); 64 | ranker.setThreshold(0); 65 | InfoGainAttributeEval infoGainAttributeEval = new InfoGainAttributeEval(); 66 | attributeSelection.setEvaluator(infoGainAttributeEval); 67 | attributeSelection.setSearch(ranker); 68 | attributeSelection.setInputFormat(data); 69 | data = 
Filter.useFilter(data, attributeSelection); 70 | //end of infogain 71 | */ 72 | 73 | 74 | 75 | 76 | RemoveRange filter1 = new RemoveRange(); 77 | filter1.setInputFormat(data); 78 | filter1.setInstancesIndices("13-last"); 79 | filter1.setInvertSelection(true); 80 | 81 | Instances trainData = Filter.useFilter(data, filter1); 82 | System.out.println("trainData size " + trainData.numInstances()); 83 | BufferedWriter writer = new BufferedWriter(new FileWriter("/Users/Aylin/Desktop/Drexel/" 84 | + "2014/ARLInternship/SCAAarffs/mallory_150/traintest/trainData_"+authorNo+".arff")); 85 | writer.write(trainData.toString()); 86 | writer.flush(); 87 | writer.close(); 88 | /* for(int inst=0; inst<16; inst++) 89 | System.out.println("trainData " + trainData.classAttribute().value((int) trainData.instance(inst).classValue())); 90 | */ 91 | 92 | RemoveRange filter2 = new RemoveRange(); 93 | filter2.setInputFormat(data); 94 | filter2.setInstancesIndices("13-last"); 95 | Instances testData = Filter.useFilter(data, filter2); 96 | System.out.println("testData size " + testData.numInstances()); 97 | writer = new BufferedWriter(new FileWriter("/Users/Aylin/Desktop/Drexel/" 98 | + "2014/ARLInternship/SCAAarffs/mallory_150/traintest/testData_"+authorNo+".arff")); 99 | // writer.write(testData.toString()); 100 | writer.flush(); 101 | writer.close(); 102 | /* for(int inst=0; inst<29; inst++) 103 | System.out.println("testData " + testData.classAttribute().value((int) testData.instance(inst).classValue())); 104 | */ 105 | 106 | 107 | 108 | Remove rm = new Remove(); 109 | int authorName = (data.numAttributes() - 28); 110 | // rm.setAttributeIndices("1," +authorName); // remove 1st and the autor attribute 111 | rm.setAttributeIndices("1"); // remove 1st attribute*/ 112 | 113 | FilteredClassifier fc = new FilteredClassifier(); 114 | fc.setClassifier(new RandomForest()); 115 | fc.setFilter(rm); 116 | 117 | String[] options = weka.core.Utils.splitOptions("-I 300 -K "+numFeatures+" -S 
"+seedNumber); 118 | fc.setOptions(options); 119 | fc.buildClassifier(trainData); 120 | // Evaluation evalMallory=null; 121 | // evalMallory = new Evaluation(testData); 122 | 123 | 124 | Evaluation eval_mal = new Evaluation(testData); 125 | eval_mal.evaluateModel(fc, testData); 126 | 127 | 128 | /* for (int i = 0; i < testData.numInstances(); i++) { 129 | double classVal = fc.classifyInstance(testData 130 | .instance(i)); 131 | System.out 132 | .println("===== Classified instance =====" + classVal); 133 | double[] pred = fc.distributionForInstance(testData 134 | .instance(i)); 135 | System.out.println("===== Classified instance =====" + pred);*/ 136 | // System.out.println("Class predicted: " + testData.instance(i).classAttribute().value((int) pred)); 137 | 138 | // train on trainData and make predictions on testData 139 | fc.buildClassifier(trainData); 140 | for (int i = 0; i < testData.numInstances(); i++) { 141 | double pred = fc.classifyInstance(testData.instance(i)); 142 | System.out.print(fc.getOptions()); 143 | System.out.print("ID: " + testData.instance(i).value(0)); 144 | System.out.print(", actual: " + testData.classAttribute().value((int) testData.instance(i).classValue())); 145 | System.out.println(", predicted: " + testData.classAttribute().value((int) pred)+"\n"); 146 | 147 | 148 | Util.writeFile("ID: " + testData.instance(i).value(0), 149 | fileName, true); 150 | Util.writeFile(", actual: " + testData.classAttribute().value((int) testData.instance(i).classValue()), 151 | fileName, true); 152 | Util.writeFile(", predicted: " + testData.classAttribute().value((int) pred)+"\n", 153 | fileName, true); 154 | 155 | } 156 | 157 | ThresholdCurve tc_mal = new ThresholdCurve(); 158 | int classIndex = 0; 159 | Instances result1 = tc_mal.getCurve(eval_mal.predictions(), classIndex); 160 | tc_mal.getROCArea(result1); 161 | 162 | Instances result2 = tc_mal.getCurve(eval_mal.predictions(), 1); 163 | tc_mal.getROCArea(result2); 164 | System.out.println("AUC class1: 
"+ tc_mal.getROCArea(result1) + " AUC class2: "+ tc_mal.getROCArea(result2)); 165 | // +"\n"+"Number of trees used, "+fc.getNumTrees()+ ", Relaxed by, "+relaxPar+", Correctly classified instances,"+eval.pctCorrect()+", OOB error,"+fc.measureOutOfBagError()); 166 | 167 | Util.writeFile("AUC class1: "+ tc_mal.getROCArea(result1) + " AUC class2: "+ tc_mal.getROCArea(result2) +"\n"+"Number of features used, default is 0 (logM+1) "+cls.getNumFeatures()+ ", Correctly classified instances, "+eval_mal.pctCorrect()+", OOB error,"+cls.measureOutOfBagError()+"\n" 168 | +"Filename is, "+arffFile.toString()+" Number of features used, "+cls.getNumFeatures()+"\n" , 169 | fileName, true); 170 | 171 | 172 | 173 | 174 | 175 | 176 | System.out.println("Number of instances: " + data.numInstances()+" and number of authors: " + data.numClasses()); 177 | 178 | 179 | String[] options1 = weka.core.Utils.splitOptions("-I 300 -K "+numFeatures+" -S "+seedNumber); 180 | cls.setOptions(options); 181 | 182 | cls.buildClassifier(data); 183 | 184 | 185 | 186 | Evaluation eval=null; 187 | 188 | 189 | if(endRelax==1) 190 | eval = new Evaluation(data); 191 | else 192 | eval= new RelaxedEvaluation(data, relaxPar); 193 | 194 | 195 | 196 | 197 | eval.crossValidateModel(cls, data,foldNumber , new Random(seedNumber)); 198 | 199 | /* System.out.println("Relaxed by, "+relaxPar+", seedNo,"+seedNumber+", files,"+numberFiles+", authors,"+data.numClasses()); 200 | Util.writeFile("Relaxed by, "+relaxPar+", seedNo,"+seedNumber+", files,"+numberFiles+", authors,"+data.numClasses(), 201 | fileName, true);*/ 202 | ThresholdCurve tc = new ThresholdCurve(); 203 | classIndex = 0; 204 | result1 = tc.getCurve(eval.predictions(), classIndex); 205 | tc.getROCArea(result1); 206 | 207 | result2 = tc.getCurve(eval.predictions(), 1); 208 | tc.getROCArea(result2); 209 | /* System.out.println("AUC class1: "+ tc.getROCArea(result1) + " AUC class2: "+ tc.getROCArea(result2)); 210 | //"\n"+"Number of features used, 
"+cls.getNumFeatures()+ ", Relaxed by, "+relaxPar+", Correctly classified instances,"+eval.pctCorrect()+", OOB error,"+cls.measureOutOfBagError()); 211 | 212 | Util.writeFile("AUC class1: "+ tc.getROCArea(result1) + " AUC class2: "+ tc.getROCArea(result2) +"\n"+"Number of features used, default is 0 (logM+1) "+cls.getNumFeatures()+ ", Correctly classified instances, "+eval.pctCorrect()+", OOB error,"+cls.measureOutOfBagError()+"\n" 213 | +"Filename is, "+arffFile.toString()+" Number of features used, "+cls.getNumFeatures()+"\n" , 214 | fileName, true);*/ 215 | 216 | 217 | if(numFeatures==0){ 218 | int defaultNumFeatures=(int)Utils.log2(data.numAttributes()) + 1; 219 | /* Util.writeFile("Number of features used, "+defaultNumFeatures+ ", Correctly classified instances, "+eval.pctCorrect()+", OOB error,"+cls.measureOutOfBagError()+"\n" 220 | +"Filename is, "+arffFile.toString()+" Number of features used, "+cls.getNumFeatures()+"max depth of trees"+cls.getMaxDepth()+"\n" , 221 | fileName, true); 222 | System.out.println("Number of features used, "+defaultNumFeatures+ ", Relaxed by, "+relaxPar+", Correctly classified instances,"+eval.pctCorrect()+", OOB error,"+cls.measureOutOfBagError()); 223 | */ 224 | } 225 | 226 | else{ 227 | /* System.out.println("Number of features used, "+cls.getNumFeatures()+ ", Relaxed by, "+relaxPar+", Correctly classified instances,"+eval.pctCorrect()+", OOB error,"+cls.measureOutOfBagError()); 228 | 229 | Util.writeFile("Number of features used, default is 0 (logM+1) "+cls.getNumFeatures()+ ", Correctly classified instances, "+eval.pctCorrect()+", OOB error,"+cls.measureOutOfBagError()+"\n" 230 | +"Filename is, "+arffFile.toString()+" Number of features used, "+cls.getNumFeatures()+"\n" , 231 | fileName, true); */ 232 | } 233 | accuracy=eval.pctCorrect(); 234 | total =total+accuracy; 235 | average = total/seedNumber; 236 | } 237 | 238 | System.out.println("total is "+total); 239 | System.out.println("avg is "+average); 240 | 
System.out.println("accuracy is "+accuracy); 241 | 242 | System.out.println("\nThe average accuracy with "+numberFiles+"files is "+average+"\n"); 243 | Util.writeFile("\nThe average accuracy with "+numberFiles+"files is "+average+", relaxed by, "+relaxPar+", \n", 244 | fileName, true); 245 | 246 | }} 247 | } 248 | } 249 | 250 | } -------------------------------------------------------------------------------- /SCAA/src/AuthorClassificationRelaxed.java: -------------------------------------------------------------------------------- 1 | import weka.attributeSelection.InfoGainAttributeEval; 2 | import weka.attributeSelection.Ranker; 3 | import weka.classifiers.*; 4 | import weka.classifiers.evaluation.ThresholdCurve; 5 | import weka.classifiers.meta.FilteredClassifier; 6 | import weka.classifiers.trees.RandomForest; 7 | import weka.core.Attribute; 8 | import weka.core.AttributeStats; 9 | import weka.core.Instances; 10 | import weka.core.Range; 11 | import weka.core.Utils; 12 | import weka.filters.Filter; 13 | import weka.filters.supervised.attribute.AttributeSelection; 14 | import weka.filters.unsupervised.attribute.Remove; 15 | import weka.filters.unsupervised.instance.RemoveRange; 16 | import weka.filters.unsupervised.instance.RemoveWithValues; 17 | 18 | import java.io.BufferedWriter; 19 | import java.io.FileReader; 20 | import java.io.FileWriter; 21 | import java.util.*; 22 | 23 | public class AuthorClassificationRelaxed { 24 | 25 | 26 | public static void main(String[] args) throws Exception 27 | { 28 | double accuracy=0; 29 | int endRelax = 5; 30 | int numberFiles; 31 | int numFeatures=0; //0 is the default logM+1 32 | int seedNumber; 33 | double total =0; 34 | double average =0; 35 | 36 | String fileName ="textFile"; 37 | 38 | 39 | 40 | for(int authorNo=9; authorNo<10; authorNo++){ 41 | for(numberFiles=9; numberFiles<10; numberFiles++){ 42 | for (int x=28; x<=(18*31); x=x+9){ 43 | String arffFile = "path"; 44 | 45 | Util.writeFile(numberFiles+"FilesPerAuthor: 
\n",fileName, true); 46 | for(int relaxPar = 5; relaxPar<=endRelax; relaxPar++){ 47 | total=0; 48 | average=0; 49 | 50 | for(seedNumber=1; seedNumber<2; seedNumber++){ 51 | int foldNumber=numberFiles; 52 | 53 | 54 | 55 | RandomForest cls = new RandomForest(); 56 | Instances data = new Instances(new FileReader(arffFile)); 57 | data.setClassIndex(data.numAttributes() - 1); 58 | // data.setClassIndex(0); 59 | 60 | //do not stratify if you are going to remove instances for training and testing 61 | // data.stratify(foldNumber); 62 | 63 | 64 | //write classes that have 9 samples to a new arff 65 | System.out.println(data.attributeStats(0)); 66 | 67 | // System.out.println(data.instance(2).stringValue(0)); 68 | // System.out.println(data.instance(2).value(0)); 69 | 70 | /* for(int i=0; i<=data.numInstances();i++){ 71 | int count = data.attributeStats(0).nominalCounts[(int) data.instance(i).value(0)]; 72 | if(count==9){ 73 | Util.writeFile(data.instance(i).toString() + "\n", "/Users/Aylin/Desktop/" 74 | + "python9files.arff", true); 75 | 76 | } 77 | 78 | }*/ 79 | 80 | //Start information gain that selects up to 200 features that have nonzero infogain 81 | int n = 500; // number of features to select 82 | AttributeSelection attributeSelection = new AttributeSelection(); 83 | Ranker ranker = new Ranker(); 84 | ranker.setNumToSelect(n); 85 | ranker.setThreshold(0.001); 86 | InfoGainAttributeEval infoGainAttributeEval = new InfoGainAttributeEval(); 87 | attributeSelection.setEvaluator(infoGainAttributeEval); 88 | attributeSelection.setSearch(ranker); 89 | attributeSelection.setInputFormat(data); 90 | data = Filter.useFilter(data, attributeSelection); 91 | //end of infogain 92 | 93 | 94 | 95 | RemoveRange rm = new RemoveRange(); 96 | rm.setInputFormat(data); 97 | // rm.setInstancesIndices("first-"+(x-19)+","+x+"-last"); 98 | Instances testData = Filter.useFilter(data, rm); 99 | System.out.println("testData size " + testData.numInstances()); 100 | 101 | 102 | FilteredClassifier 
fc = new FilteredClassifier(); 103 | fc.setClassifier(new RandomForest()); 104 | fc.setFilter(rm); 105 | 106 | String[] options = weka.core.Utils.splitOptions("-I 300 -K "+numFeatures+" -S "+seedNumber); 107 | fc.setOptions(options); 108 | // fc.buildClassifier(data); 109 | Evaluation eval_mal = new Evaluation(data); 110 | 111 | System.out.println("Number of instances: " + data.numInstances()+" and number of authors: " + data.numClasses()); 112 | 113 | 114 | String[] options1 = weka.core.Utils.splitOptions("-I 300 -K "+numFeatures+" -S "+seedNumber); 115 | cls.setOptions(options); 116 | // cls.buildClassifier(data); 117 | 118 | 119 | 120 | Evaluation eval=null; 121 | 122 | 123 | if(endRelax==1) 124 | eval = new Evaluation(data); 125 | else 126 | eval= new RelaxedEvaluation(data, relaxPar); 127 | 128 | 129 | eval.crossValidateModel(cls, data,foldNumber , new Random(seedNumber)); 130 | 131 | System.out.println("Relaxed by, "+relaxPar+", seedNo,"+seedNumber+", files,"+numberFiles+", authors,"+data.numClasses()); 132 | Util.writeFile("Relaxed by, "+relaxPar+", seedNo,"+seedNumber+", files,"+numberFiles+", authors,"+data.numClasses(), 133 | fileName, true); 134 | 135 | accuracy=eval.pctCorrect(); 136 | total =total+accuracy; 137 | average = total/seedNumber; 138 | } 139 | 140 | System.out.println("total is "+total); 141 | System.out.println("avg is "+average); 142 | System.out.println("accuracy is "+accuracy); 143 | 144 | System.out.println("\nThe average accuracy with "+numberFiles+"files is "+average+"\n"); 145 | Util.writeFile("\nThe average accuracy with "+numberFiles+"files is "+average+", relaxed by, "+relaxPar+", \n", 146 | fileName, true); 147 | 148 | } 149 | 150 | 151 | }} 152 | } 153 | } 154 | 155 | } -------------------------------------------------------------------------------- /SCAA/src/BigramExtractor.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.IOException; 3 | import 
java.text.SimpleDateFormat; 4 | import java.util.ArrayList; 5 | import java.util.Calendar; 6 | import java.util.HashSet; 7 | import java.util.LinkedHashSet; 8 | import java.util.List; 9 | import java.util.Set; 10 | import java.util.regex.Matcher; 11 | import java.util.regex.Pattern; 12 | 13 | import org.apache.commons.lang3.StringUtils; 14 | 15 | 16 | public class BigramExtractor { 17 | 18 | 19 | public static void main(String[] args) throws IOException 20 | { 21 | Calendar cal = Calendar.getInstance(); 22 | cal.getTime(); 23 | SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss"); 24 | int month = cal.get(Calendar.MONTH); 25 | int dayOfMonth = cal.get(Calendar.DAY_OF_MONTH); 26 | String time = sdf.format(cal.getTime()); 27 | String output_filename = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/bigramArffs/" 28 | +(month+1) + "." + dayOfMonth + "_" + 29 | "9FilesExactlyPerAuthor_2012_validation_exact_bigrams.arff" ; 30 | 31 | String dirPath="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAA_Datasets/" 32 | +"bigExperiments/250authors/9FilesExactlyPerAuthor_2012_validation_exact_allfeatures/"; List test_file_paths = Util.listTextFiles(dirPath); 33 | 34 | 35 | String text = ""; 36 | //Writing the test arff 37 | //first specify relation 38 | Util.writeFile("@relation 9FilesExactlyPerAuthor_2012_validation_bigrams"+"\n"+"\n", output_filename, true); 39 | Util.writeFile("@attribute instanceID {", output_filename, true); 40 | List test_cpp_paths = Util.listCPPFiles(dirPath); 41 | for(int j=0; j < test_cpp_paths.size();j++ ) 42 | { 43 | File fileCPP = new File(test_cpp_paths.get(j).toString()); 44 | String fileName = fileCPP.getName(); 45 | Util.writeFile(fileName+",", output_filename, true); 46 | if ((j+1)==test_cpp_paths.size()) 47 | Util.writeFile("}"+"\n", output_filename, true); 48 | } 49 | String[] ASTNodeBigrams = null; 50 | ASTNodeBigrams= getASTNodeBigrams(dirPath); 51 | 52 | for (int i=0; i uniqueWords = new HashSet(); 65 | 66 | for (String word 
: words) { 67 | uniqueWords.add(word); 68 | } 69 | words = uniqueWords.toArray(new String[0]); 70 | int authorCount = words.length; 71 | if (i+1==test_file_paths.size()){ 72 | for (int j=0; j< authorCount; j++){ 73 | {System.out.println(words[j]); 74 | if(j+1 == authorCount) 75 | { 76 | Util.writeFile(words[j]+"}"+"\n\n",output_filename, true); 77 | } 78 | else 79 | { 80 | Util.writeFile(words[j]+","+"",output_filename, true); 81 | 82 | } 83 | } 84 | } 85 | 86 | } 87 | 88 | } 89 | Util.writeFile("@data"+"\n", output_filename, true); 90 | //Finished defining the attributes 91 | 92 | //EXTRACT LABELED FEATURES 93 | for(int i=0; i< test_file_paths.size(); i++){ 94 | int testIDlength = test_file_paths.get(i).toString().length(); 95 | File authorFileName= new File(test_file_paths.get(i).toString()); 96 | String authorName= authorFileName.getParentFile().getName(); 97 | 98 | System.out.println(test_file_paths.get(i)); 99 | System.out.println(authorName); 100 | 101 | File fileCPPID = new File(test_cpp_paths.get(i).toString()); 102 | String fileNameID = fileCPPID.getName(); 103 | Util.writeFile(fileNameID+",", output_filename, true); 104 | 105 | String DepASTText = Util.readFile(test_file_paths.get(i).toString().substring(0,testIDlength-3)+"dep"); 106 | float[] typeCount = getASTNodeBigramsTF(DepASTText, ASTNodeBigrams ); 107 | for (int j=0; j uniqueWords = new LinkedHashSet(); 127 | List unigrams = new ArrayList(); 128 | Set bigrams = new LinkedHashSet(); 129 | String[] uniquebigrams = null; 130 | 131 | for(int i=0; i< test_file_paths.size(); i++){ 132 | String filePath = test_file_paths.get(i).toString(); 133 | // System.out.println(filePath); 134 | 135 | String inputText =Util.readFile(filePath); 136 | int [] lines = DepthASTNode.getASTDepLines(inputText); 137 | String textAST=null; 138 | for (int j=0; j occurrencesHere = finder.findIndexesForKeyword(str); 48 | occurrences[j] = occurrences[j] + occurrencesHere.size(); 49 | 50 | 51 | for(int k=0; k occurrencesHere = 
finder.findIndexesForKeyword(str); 98 | occurrences[j] = occurrences[j] + occurrencesHere.size(); 99 | 100 | 101 | for(int k=0; k maxDepth[j]) 116 | maxDepth[j]= rightParanthesis-leftParanthesis; 117 | } 118 | 119 | if(occurrences[j]==0) 120 | maxDepth[j]=0; 121 | } 122 | } 123 | List maxDepthall = Arrays.asList(ArrayUtils.toObject(maxDepth)); 124 | return Collections.max(maxDepthall); 125 | 126 | } 127 | 128 | //line number starts from 0 129 | public static int[] getASTDepLines(String featureText) 130 | { 131 | HashSet functionIDs = new HashSet(); 132 | HashSet functionIDs2 = new HashSet(); 133 | 134 | //take the function id in the beginning of the line. 135 | String[] lines = featureText.split("\n"); 136 | for(int i=0; i< lines.length; i++) 137 | { 138 | String firstWord = lines[i].substring(0, featureText.indexOf('\t')); 139 | if(!functionIDs.contains(firstWord)) 140 | functionIDs.add(firstWord); 141 | } 142 | int [] ASTDepLines=new int[functionIDs.size()]; 143 | for(int i=0; i< lines.length; i++) 144 | { 145 | String firstWord = lines[i].substring(0, featureText.indexOf('\t')); 146 | if(i==0) 147 | { 148 | functionIDs2.add(firstWord); 149 | } 150 | else 151 | { 152 | if(!functionIDs2.contains(firstWord)) 153 | { 154 | int lineNumber = i-1; 155 | ASTDepLines[functionIDs2.size()-1] = lineNumber; 156 | } 157 | if(i==lines.length-1) 158 | { 159 | int lineNumber = i; 160 | ASTDepLines[functionIDs2.size()-1] = lineNumber; 161 | } 162 | functionIDs2.add(firstWord); 163 | } 164 | } 165 | return ASTDepLines; 166 | } 167 | 168 | 169 | //starts from 0 170 | public static String readLineNumber (String featureText, int lineNumber) throws IOException 171 | { 172 | List lines = IOUtils.readLines(new StringReader(featureText)); 173 | return lines.get(lineNumber); 174 | } 175 | 176 | public static float[] InfoGainsgetAvgDepthASTNode(String featureText, String[] ASTtypesAvgDepth) throws IOException 177 | { 178 | 179 | int [] lines = getASTDepLines(featureText); 180 | float [] 
occurrences=new float[ASTtypesAvgDepth.length]; 181 | float [] totalDepth=new float[ASTtypesAvgDepth.length]; 182 | float [] avgDepth=new float[ASTtypesAvgDepth.length]; 183 | 184 | String textAST=null; 185 | for (int i=0; i occurrencesHere = finder.findIndexesForKeyword(str); 193 | occurrences[j] = occurrences[j] + occurrencesHere.size(); 194 | 195 | 196 | for(int k=0; k 0) { 54 | int newValue = costs[j - 1]; 55 | if (s1.charAt(i - 1) != s2.charAt(j - 1)) 56 | newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1; 57 | costs[j - 1] = lastValue; 58 | lastValue = newValue; 59 | } 60 | } 61 | } 62 | if (i > 0) 63 | costs[s2.length()] = lastValue; 64 | } 65 | return costs[s2.length()]; 66 | } 67 | 68 | public static void printDistance(String s1, String s2) { 69 | // System.out.println(s1 + "-->" + s2 + ": " + computeDistance(s1, s2)); 70 | System.out.println(levenshteinDistance(s1, s2)); 71 | 72 | } 73 | 74 | @SuppressWarnings("resource") 75 | public static void main(String[] args) throws IOException { 76 | 77 | String parentDir = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/" 78 | + "SCAA_Datasets/bigExperiments/250authors/9FilesExactlyPer250Author_2014/"; 79 | String output_file= "/Users/Aylin/Desktop/similarityTestAcrossAuthors.txt"; 80 | 81 | File file = new File(parentDir); 82 | String[] directories = file.list(new FilenameFilter() 83 | { 84 | @Override 85 | public boolean accept(File current, String name) 86 | { 87 | return new File(current, name).isDirectory(); 88 | } 89 | }); 90 | System.out.println(Arrays.toString(directories)); 91 | //within author similarity 92 | /* for (int i =0; i< directories.length; i++) 93 | { 94 | double author_ratio=0; 95 | //authorname is directoryname - 1 because Andrew put an extra 0 at the end of the authorname 96 | String authorName = directories[i].toString().substring(0, directories[i].toString().length()); 97 | String authorDir = parentDir + directories[i] + "/"; 98 | Util.writeFile(authorName +"\n", 
output_file, true); 99 | System.out.println(authorName); 100 | System.out.println(authorDir); 101 | 102 | BufferedReader br = null; 103 | String line = ""; 104 | List test_cpp_paths = Util.listCPPFiles(authorDir); 105 | 106 | for(int j=0; j < test_cpp_paths.size();j++ ) 107 | { 108 | double avg_ratio=0; 109 | double ratio =0; 110 | 111 | String file1 = Util.readFile(test_cpp_paths.get(j).toString()); 112 | Util.writeFile(test_cpp_paths.get(j).toString() +"\n", output_file, true); 113 | for(int k=0; k < test_cpp_paths.size();k++ ) 114 | { 115 | if(j!=k){ 116 | String file2 = Util.readFile(test_cpp_paths.get(k).toString()); 117 | int distance =computeDistance(file1, file2); 118 | if(file1.length() <= file2.length()){ 119 | ratio = distance/(double)((Integer)file2.length()); 120 | } 121 | if(file2.length() < file1.length()){ 122 | ratio = distance/(double)((Integer)file1.length()); 123 | } 124 | 125 | Util.writeFile("File1 length: "+file1.length() +" " + 126 | "File2 length:"+file2.length()+" ", output_file, true); 127 | Util.writeFile("distance: "+Integer.toString(distance)+ 128 | " "+"ratio: "+ Double.toString(ratio) + "\n" ,output_file, true); 129 | avg_ratio =avg_ratio+ratio; 130 | }} 131 | avg_ratio = avg_ratio/(double)((Integer)(test_cpp_paths.size()-1)); 132 | Util.writeFile("average ratio of file: "+avg_ratio +"\n", output_file, true); 133 | System.out.println("average ratio of file: "+avg_ratio ); 134 | 135 | author_ratio=author_ratio+avg_ratio; 136 | } 137 | author_ratio=author_ratio/(double)((Integer)(test_cpp_paths.size())); 138 | System.out.println("average ratio of author: "+author_ratio); 139 | 140 | Util.writeFile("average ratio of author: "+author_ratio +"\n", output_file, true); 141 | 142 | }*/ 143 | 144 | List all_cpp_paths = Util.listCPPFiles(parentDir); 145 | String authorName2; 146 | double avg_ratio=0; 147 | 148 | for (int i =0; i< all_cpp_paths.size(); i++) 149 | { 150 | //authorname is directoryname - 1 because Andrew put an extra 0 at the end 
of the authorname 151 | File newFile = new File(all_cpp_paths.get(i).toString()); 152 | authorName2 = newFile.getParentFile().getName().toString(); 153 | Util.writeFile(authorName2+":"+newFile.getName().toString() +"\n", output_file, true); 154 | System.out.println(authorName2); 155 | 156 | BufferedReader br = null; 157 | 158 | for(int j=0; j < all_cpp_paths.size();j++ ) 159 | { 160 | File newFile1 = new File(all_cpp_paths.get(j).toString()); 161 | String authorName3 = newFile1.getParentFile().getName().toString(); 162 | 163 | double ratio =0; 164 | if(!authorName2.equals(authorName3)){ 165 | String file1 = Util.readFile(all_cpp_paths.get(i).toString()); 166 | 167 | String file2 = Util.readFile(all_cpp_paths.get(j).toString()); 168 | int distance =levenshteinDistance(file1, file2); 169 | if(file1.length() <= file2.length()){ 170 | ratio = distance/(double)((Integer)file2.length()); 171 | } 172 | if(file2.length() < file1.length()){ 173 | ratio = distance/(double)((Integer)file1.length()); 174 | } 175 | 176 | /* Util.writeFile("File1 length: "+file1.length() +" " + 177 | "File2 length:"+file2.length()+" ", output_file, true); 178 | Util.writeFile("distance: "+Integer.toString(distance)+ 179 | " "+"ratio: "+ Double.toString(ratio) + "\n" ,output_file, true);*/ 180 | // Util.writeFile( Double.toString(ratio) + ", " ,output_file, true); 181 | avg_ratio =avg_ratio+ratio; 182 | }} 183 | avg_ratio = avg_ratio/(double)((Integer)(all_cpp_paths.size()-9)); 184 | Util.writeFile("\n Average distance to all other files: "+avg_ratio +"\n", output_file, true); 185 | System.out.println("Average distance to all other files: "+avg_ratio ); 186 | } 187 | 188 | } 189 | } -------------------------------------------------------------------------------- /SCAA/src/FeatureExtractorInfoGain.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.FileNotFoundException; 3 | import java.io.IOException; 4 | import 
java.util.Calendar; 5 | import java.util.HashSet; 6 | import java.util.List; 7 | import java.util.Set; 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | import java.text.SimpleDateFormat; 11 | 12 | /** 13 | * FeatureExtractor writes extracted features to arff file to be used with WEKA 14 | * @author Aylin Caliskan-Islam (ac993@drexel.edu) 15 | */ 16 | 17 | public class FeatureExtractorInfoGain { 18 | public static void main(String[] args) throws FileNotFoundException, IOException, ClassNotFoundException { 19 | 20 | //list the cppKeywords that appear in infogain 21 | String [] cppKeywords = {"auto","case", "class", "compl", "const","inline","namespace","operator", 22 | "signed", "static", "template", "typedef","typename","unsigned", "using"}; 23 | 24 | 25 | String output_filename = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/bigExperiments/InfoGain/" +"InfoGain_9FilesPer250Author2012_bigExperiments.arff" ; 26 | 27 | String test_dir = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAA_Datasets/bigExperiments/250authors/9FilesExactlyPerAuthor_2012_validation_exact_allfeatures/"; 28 | List test_file_paths = Util.listTextFiles(test_dir); 29 | 30 | String text = ""; 31 | //Writing the test arff 32 | //first specify relation 33 | Util.writeFile("@relation InfoGain "+"\n"+"\n", output_filename, true); 34 | Util.writeFile("@attribute instanceID {", output_filename, true); 35 | 36 | List test_cpp_paths = Util.listCPPFiles(test_dir); 37 | for(int j=0; j < test_cpp_paths.size();j++ ) 38 | { 39 | File fileCPP = new File(test_cpp_paths.get(j).toString()); 40 | String fileName = fileCPP.getName(); 41 | Util.writeFile(fileName+",", output_filename, true); 42 | if ((j+1)==test_cpp_paths.size()) 43 | Util.writeFile("}"+"\n", output_filename, true); 44 | } 45 | 46 | // Util.writeFile("@attribute 'functionIDCount' numeric"+"\n", output_filename, true); 47 | // Util.writeFile("@attribute 'CFGNodeCount' numeric"+"\n", output_filename, true); 48 
| // Util.writeFile("@attribute 'ASTFunctionIDCount' numeric"+"\n", output_filename, true); 49 | Util.writeFile("@attribute 'getMaxDepthASTLeaf' numeric"+"\n", output_filename, true); 50 | 51 | 52 | //List the info gain nodes 53 | String[] ASTtypesTF = {"T", "t", "FOR", "cout", "stdout", "freopen", "in", "tt", "tc", 54 | "test", "open", "ForStatement", "UnaryExpression", "IncDecOp", "scanf", "close", 55 | "argc", "argv", "fin", "stdin", "ofstream", "ForInit", "cin", "solve", "fopen", 56 | "ifstream", "fprintf", "cas", "printf", "ShiftExpression", "REP", "fout", 57 | "forn", "endl", "size_t", "out", "cases", "cerr"}; 58 | String[] ASTtypesTFIDF = {"FOR", "cout", "stdout", "freopen", "tc", "test", "open", 59 | "close", "argc", "argv", "fin", "stdin", "ofstream", "cin", "solve", "fopen", 60 | "ifstream", "fprintf", "cas", "REP", "fout", "forn", "endl", "size_t", "out", "cases", "cerr"}; 61 | String[] ASTtypesAvgDep = {"T", "d", "w", "t", "r", "FOR", "cout", "stdout", "freopen", "small", 62 | "in", "tt", "tc", "input", "test", "open", "ForStatement", "UnaryExpression", "inline", 63 | "IncDecOp", "scanf", "close", "argc", "argv", "const", "fin", "stdin", "ofstream", 64 | "ForInit", "cin", "solve", "txt", "sync_with_stdio", "fopen", "ifstream", "std", "cas", 65 | "printf", "ShiftExpression", "REP", "fout", "forn", "Case", "size_t", "out", "cases", 66 | "output", "cerr"}; 67 | 68 | for (int i=0; i uniqueWords = new HashSet(); 94 | 95 | for (String word : words) { 96 | uniqueWords.add(word); 97 | } 98 | words = uniqueWords.toArray(new String[0]); 99 | int authorCount = words.length; 100 | if (i+1==test_file_paths.size()){ 101 | for (int j=0; j< authorCount; j++){ 102 | {System.out.println(words[j]); 103 | if(j+1 == authorCount) 104 | { 105 | Util.writeFile(words[j]+"}"+"\n\n",output_filename, true); 106 | } 107 | else 108 | { 109 | Util.writeFile(words[j]+","+"",output_filename, true); 110 | 111 | } 112 | } 113 | } 114 | 115 | } 116 | 117 | } 118 | 119 | 120 | 
Util.writeFile("@data"+"\n", output_filename, true); 121 | //Finished defining the attributes 122 | 123 | 124 | //EXTRACT LABELED FEATURES 125 | for(int i=0; i< test_file_paths.size(); i++){ 126 | String featureText = Util.readFile(test_file_paths.get(i).toString()); 127 | int testIDlength = test_file_paths.get(i).toString().length(); 128 | authorFileName= new File(test_file_paths.get(i).toString()); 129 | String authorName= authorFileName.getParentFile().getName(); 130 | 131 | System.out.println(test_file_paths.get(i)); 132 | System.out.println(authorName); 133 | File fileCPPID = new File(test_cpp_paths.get(i).toString()); 134 | String fileNameID = fileCPPID.getName(); 135 | Util.writeFile(fileNameID+",", output_filename, true); 136 | // Util.writeFile(FeatureCalculators.functionIDCount(featureText)+",", output_filename, true); 137 | String ASTText = Util.readFile(test_file_paths.get(i).toString().substring(0,testIDlength-3)+"ast"); 138 | String DepASTText = Util.readFile(test_file_paths.get(i).toString().substring(0,testIDlength-3)+"dep"); 139 | String sourceCode = Util.readFile(test_file_paths.get(i).toString().substring(0,testIDlength-3)+"cpp"); 140 | 141 | // Util.writeFile(FeatureCalculators.CFGNodeCount(ASTText)+",", output_filename, true); 142 | // Util.writeFile(FeatureCalculators.ASTFunctionIDCount(ASTText)+",", output_filename, true); 143 | Util.writeFile(DepthASTNode.getMaxDepthASTLeaf(DepASTText, ASTtypesTF)+",", output_filename, true); 144 | 145 | 146 | 147 | //get count of each ASTtype not-DepAST type present 148 | float[] typeCount = FeatureCalculators.DepASTTypeTF(DepASTText, ASTtypesTF ); 149 | for (int j=0; j uniqueWords = new HashSet(); 197 | 198 | for (String word : words) { 199 | uniqueWords.add(word); 200 | } 201 | words = uniqueWords.toArray(new String[0]); 202 | return words; 203 | } 204 | 205 | 206 | } 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 
--------------------------------------------------------------------------------
/SCAA/src/IndexWrapper.java:
--------------------------------------------------------------------------------
// Taken from http://whyjava.wordpress.com/2010/05/04/finding-all-the-indexes-of-a-whole-word-in-a-given-string-using-java/

/**
 * Value object holding the start and end offsets of one match inside a string.
 * Two wrappers are equal when both offsets are equal, so instances can be used
 * as keys or set members.
 */
public class IndexWrapper {

    private final int start;
    private final int end;

    /**
     * @param start offset stored as the start of the match
     * @param end   offset stored as the end of the match
     */
    public IndexWrapper(int start, int end) {
        this.start = start;
        this.end = end;
    }

    /** @return the end offset given at construction */
    public int getEnd() {
        return end;
    }

    /** @return the start offset given at construction */
    public int getStart() {
        return start;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + end;
        result = prime * result + start;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null || getClass() != obj.getClass())
            return false;
        IndexWrapper other = (IndexWrapper) obj;
        return end == other.end && start == other.start;
    }

}
--------------------------------------------------------------------------------
/SCAA/src/LevenshteinDistance.java:
--------------------------------------------------------------------------------
import java.io.*;
import java.util.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;

import com.csvreader.CsvWriter;


/**
 * Character-level Levenshtein distance between source files, plus a small
 * driver that reports how similar each file is to the other files of a corpus
 * laid out as one sub-directory per author.
 */
public class LevenshteinDistance {

    /** Corpus root used when no directory is passed on the command line. */
    private static final String DEFAULT_PARENT_DIR = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/"
            + "SCAA_Datasets/bigExperiments/250authors/9FilesExactlyPer250Author_2014/";

    /** Report file used when no output path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_FILE = "/Users/Aylin/Desktop/similarityTestAcrossAuthors.txt";

    /**
     * Computes the Levenshtein (edit) distance between two strings.
     * The comparison is case-sensitive. Both lengths are echoed to standard
     * output before the computation starts.
     *
     * @param s1 first string
     * @param s2 second string
     * @return minimum number of single-character insertions, deletions and
     *         substitutions that turn {@code s1} into {@code s2}
     */
    public static int computeDistance(String s1, String s2) {
        System.out.println("Length of first string: "+s1.length());
        System.out.println("Length of second string: "+s2.length());

        final int rows = s1.length();
        final int cols = s2.length();

        // costs[j] holds the distance between the processed prefix of s1 and
        // the first j characters of s2; one row is enough because each cell
        // only needs its left, upper and upper-left neighbours.
        int[] costs = new int[cols + 1];
        for (int j = 0; j <= cols; j++) {
            costs[j] = j;
        }
        for (int i = 1; i <= rows; i++) {
            int diagonal = costs[0];   // value of the upper-left neighbour
            costs[0] = i;
            for (int j = 1; j <= cols; j++) {
                int above = costs[j];
                int substitution = diagonal + (s1.charAt(i - 1) == s2.charAt(j - 1) ? 0 : 1);
                costs[j] = Math.min(substitution, Math.min(above, costs[j - 1]) + 1);
                diagonal = above;
            }
        }
        return costs[cols];
    }

    /**
     * Prints the edit distance between the two strings on its own line.
     */
    public static void printDistance(String s1, String s2) {
        System.out.println(computeDistance(s1, s2));
    }

    /**
     * Scales an edit distance by the length of the longer string.
     *
     * @return {@code distance / max(length)}, or 0 when both strings are empty
     */
    static double normalize(int distance, String a, String b) {
        int longest = Math.max(a.length(), b.length());
        return longest == 0 ? 0.0 : distance / (double) longest;
    }

    /**
     * For every C++ file under {@code parentDir}, appends to {@code output_file}
     * the mean normalized distance to all files that live in a different
     * author directory.
     *
     * The mean is computed per file from the comparisons actually made. The
     * result is NaN when a file has no file of another author to compare with.
     */
    static void writeAcrossAuthorSimilarity(String parentDir, String output_file) throws IOException {
        List<?> all_cpp_paths = Util.listCPPFiles(parentDir);

        for (int i = 0; i < all_cpp_paths.size(); i++) {
            File newFile = new File(all_cpp_paths.get(i).toString());
            String authorName2 = newFile.getParentFile().getName();
            Util.writeFile(authorName2+":"+newFile.getName() +"\n", output_file, true);
            System.out.println(authorName2);

            // Read the reference file once instead of once per comparison.
            String file1 = Util.readFile(all_cpp_paths.get(i).toString());

            // Both accumulators are local to this file so that one file's
            // average cannot leak into the next one.
            double ratioSum = 0;
            int compared = 0;
            for (int j = 0; j < all_cpp_paths.size(); j++) {
                File newFile1 = new File(all_cpp_paths.get(j).toString());
                String authorName3 = newFile1.getParentFile().getName();
                if (authorName2.equals(authorName3)) {
                    continue;
                }
                String file2 = Util.readFile(all_cpp_paths.get(j).toString());
                ratioSum += normalize(computeDistance(file1, file2), file1, file2);
                compared++;
            }

            double avg_ratio = ratioSum / compared;
            Util.writeFile("\n Average distance to all other files: "+avg_ratio +"\n", output_file, true);
            System.out.println("Average distance to all other files: "+avg_ratio );
        }
    }

    /**
     * For every author directory, compares each of the author's C++ files with
     * the author's other files and appends the per-pair, per-file and
     * per-author normalized distances to {@code output_file}.
     *
     * @param directories names of the author sub-directories of
     *                    {@code parentDir}; {@code null} is treated as empty
     */
    static void writeWithinAuthorSimilarity(String parentDir, String[] directories, String output_file)
            throws IOException {
        if (directories == null) {
            return;
        }
        for (int i = 0; i < directories.length; i++) {
            double author_ratio = 0;
            String authorName = directories[i];
            String authorDir = new File(parentDir, directories[i]).getPath() + "/";
            Util.writeFile(authorName +"\n", output_file, true);
            System.out.println(authorName);
            System.out.println(authorDir);

            List<?> test_cpp_paths = Util.listCPPFiles(authorDir);

            for (int j = 0; j < test_cpp_paths.size(); j++) {
                double avg_ratio = 0;
                String file1 = Util.readFile(test_cpp_paths.get(j).toString());
                Util.writeFile(test_cpp_paths.get(j).toString() +"\n", output_file, true);
                for (int k = 0; k < test_cpp_paths.size(); k++) {
                    if (j == k) {
                        continue;
                    }
                    String file2 = Util.readFile(test_cpp_paths.get(k).toString());
                    int distance = computeDistance(file1, file2);
                    double ratio = normalize(distance, file1, file2);

                    Util.writeFile("File1 length: "+file1.length() +" " +
                            "File2 length:"+file2.length()+" ", output_file, true);
                    Util.writeFile("distance: "+Integer.toString(distance)+
                            " "+"ratio: "+ Double.toString(ratio) + "\n" ,output_file, true);
                    avg_ratio = avg_ratio + ratio;
                }
                // NaN when the author has a single file (nothing to compare).
                avg_ratio = avg_ratio / (double) (test_cpp_paths.size() - 1);
                Util.writeFile("average ratio of file: "+avg_ratio +"\n", output_file, true);
                System.out.println("average ratio of file: "+avg_ratio );

                author_ratio = author_ratio + avg_ratio;
            }
            author_ratio = author_ratio / (double) test_cpp_paths.size();
            System.out.println("average ratio of author: "+author_ratio);

            Util.writeFile("average ratio of author: "+author_ratio +"\n", output_file, true);
        }
    }

    /**
     * Runs the similarity report.
     *
     * @param args optional: {@code args[0]} corpus root (one sub-directory per
     *             author), {@code args[1]} report file, {@code args[2]} the
     *             literal {@code within} to compare files of the same author
     *             instead of files of different authors. Missing arguments
     *             fall back to the built-in defaults and the across-author
     *             report.
     */
    public static void main(String[] args) throws IOException {

        String parentDir = args.length > 0 ? args[0] : DEFAULT_PARENT_DIR;
        String output_file = args.length > 1 ? args[1] : DEFAULT_OUTPUT_FILE;

        File file = new File(parentDir);
        String[] directories = file.list(new FilenameFilter()
        {
            @Override
            public boolean accept(File current, String name)
            {
                return new File(current, name).isDirectory();
            }
        });
        System.out.println(Arrays.toString(directories));

        if (args.length > 2 && "within".equals(args[2])) {
            writeWithinAuthorSimilarity(parentDir, directories, output_file);
            return;
        }
        writeAcrossAuthorSimilarity(parentDir, output_file);
    }
}
--------------------------------------------------------------------------------
/SCAA/src/MergeArffFiles.java:
--------------------------------------------------------------------------------
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import
java.nio.charset.Charset; 10 | import java.util.Scanner; 11 | 12 | import org.apache.commons.io.FileUtils; 13 | 14 | /** 15 | * Appends two arff files where each feature vector contains the same ID. 16 | * This can be used to combine extracted features with features extracted from JSylo 17 | * (eg Writeprints limited, save to arff files in the Analysis tab) 18 | * 19 | * 20 | * @author Aylin Caliskan-Islam (ac993@drexel.edu) 21 | */ 22 | public class MergeArffFiles { 23 | 24 | //after @data, if the first csv element is the same as file2's first csv element, 25 | //append file2's that line to file1 and move 26 | public static void main(String[] args) throws Exception{ 27 | 28 | 29 | for(int numberFiles = 1; numberFiles <2; numberFiles++){ 30 | 31 | String word = "@data"; 32 | 33 | String file1 ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/UsenixArffs/62Authors/" 34 | 35 | + "62authors14FilesOnlyUsenixFeatures.arff"; 36 | 37 | String file2 ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/UsenixArffs/62Authors/" 38 | 39 | + "62authors14FilesAndrewFeatures.arff"; 40 | 41 | String outputArffName ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/UsenixArffs/62Authors/" 42 | 43 | + "62authors14FilesUsenixAndrewFeatures.arff"; 44 | 45 | /* String file1_tosort ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/bigramArffs/2014/" 46 | + "9BigExperiment250_2014FS9Andrew.arff"; 47 | String file1_sorted ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/bigramArffs/2014/" 48 | + "9BigExperiment250_2014FS9Andrew_sorted.arff "; 49 | Util.AlphabeticallySortLinesOfTextInFile(file1_tosort, file1_sorted); 50 | }*/ 51 | 52 | int atDataLineNumberFile1 = MergeArffFiles.grepLineNumber(file1, word); 53 | int atDataLineNumberFile2 = MergeArffFiles.grepLineNumber(file2, word); 54 | 55 | //fast copy attributes 56 | /* File file = new File(file2); 57 | FileReader fileReader = new FileReader(file); 58 | BufferedReader bufferedReader = new 
BufferedReader(fileReader); 59 | StringBuffer stringBuffer = new StringBuffer(); 60 | String line; 61 | while ((line = bufferedReader.readLine()) != "@data") { 62 | stringBuffer.append(line); 63 | stringBuffer.append("\n"); 64 | Util.writeFile(line + "\n", outputArffName, true); 65 | } 66 | fileReader.close();*/ 67 | 68 | //write the feature names in order from both files 69 | /* for(int firstFileAttributes=1; firstFileAttributes =file2LineNumberStart; j--) 90 | 91 | //for normal case in ascending order 92 | // for(int j=file2LineNumberStart; j <= atDataLineNumberFile2+numberOfInstances; j++) 93 | { 94 | System.out.println(j); 95 | if (instID.equals(MergeArffFiles.getInstanceID(file2, j))) 96 | { 97 | 98 | String firstPart = getInstance(file1, i); 99 | String secondPart = getInstanceVector(file2, j); 100 | System.out.println(firstPart); 101 | 102 | final Scanner scanner = new Scanner(outputArffName); 103 | while (scanner.hasNextLine()) { 104 | final String lineFromFile = scanner.nextLine(); 105 | if(lineFromFile.equals(firstPart)==false) { 106 | Util.writeFile( firstPart+ "," +secondPart + "\n", outputArffName, true); 107 | System.out.println(j); 108 | } 109 | } 110 | 111 | 112 | //Use this if the second file is in descending order 113 | // if(j= atDataLineNumberFile2 +9){ 131 | 132 | file2LineNumberStart= j+1; 133 | 134 | j = atDataLineNumberFile2+numberOfInstances; 135 | } 136 | }*/ 137 | } 138 | } 139 | } } 140 | 141 | } 142 | public static String getInstanceID(String file, int lineNumber) throws IOException 143 | { 144 | //will give an error if there is onl 145 | String line = MergeArffFiles.readSpecificLineNumber(file, lineNumber); 146 | String arr[] = line.split(",", 2); 147 | String firstWord = arr[0]; 148 | return firstWord; 149 | } 150 | 151 | public static String getInstance(String file, int lineNumber) throws IOException 152 | { 153 | //will give an error if there is onl 154 | String line = MergeArffFiles.readSpecificLineNumber(file, lineNumber); 155 | 
String arr[] = line.split(" ", 1); 156 | String firstWord = arr[0]; 157 | return firstWord; 158 | } 159 | 160 | 161 | public static String getInstanceVector(String file, int lineNumber) throws IOException 162 | { 163 | 164 | String line = MergeArffFiles.readSpecificLineNumber(file, lineNumber); 165 | String arr[] = line.split(",", 2); 166 | 167 | // String firstWord = arr[0]; 168 | String theRest = arr[1]; 169 | return theRest; 170 | } 171 | 172 | 173 | public static String readSpecificLineNumber (String file, int lineNumber) throws IOException 174 | { 175 | String lineString = (String)FileUtils.readLines(new File(file)).get(lineNumber-1); 176 | 177 | return lineString; 178 | } 179 | 180 | 181 | public static int grepLineNumber(String file, String word) throws Exception { 182 | BufferedReader buf = new BufferedReader(new InputStreamReader(new DataInputStream(new FileInputStream(file)))); 183 | 184 | String line; 185 | int lineNumber = 0; 186 | while ((line = buf.readLine()) != null) { 187 | lineNumber++; 188 | if (word.equals(line)) { 189 | return lineNumber; 190 | } 191 | } 192 | return -1; 193 | } 194 | 195 | } 196 | -------------------------------------------------------------------------------- /SCAA/src/MergeArffFilesNew.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.DataInputStream; 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.InputStreamReader; 9 | import java.io.Reader; 10 | import java.nio.charset.Charset; 11 | import java.util.Enumeration; 12 | import java.util.HashSet; 13 | import java.util.Scanner; 14 | import java.util.Set; 15 | import java.util.regex.Matcher; 16 | import java.util.regex.Pattern; 17 | 18 | import org.apache.commons.io.FileUtils; 19 | 20 | import weka.core.Instances; 21 | 22 | /** 23 | * Appends two arff files where 
each feature vector contains the same ID. 24 | * This can be used to combine features at different times. 25 | * The two files can contain the same features. 26 | * It does a right join based on the first file. The resulting file would only have features 27 | * in the first file that also existed in the second file. 28 | * 29 | * 30 | * @author Aylin Caliskan-Islam (aylinc@princeton.edu) 31 | */ 32 | public class MergeArffFilesNew { 33 | 34 | //Find the intersecting userIDs and merge all the features and instances to a new file 35 | //file2's instances and features are appended to file1 36 | public static void main(String[] args) throws Exception{ 37 | 38 | 39 | String file1 ="/Users/Aylin/Desktop/Princeton/BAA/arffs/" 40 | 41 | + "C_62Authors14files_decompiledNEW.arff"; 42 | 43 | String file2 ="/Users/Aylin/Desktop/Princeton/BAA/arffs/" 44 | 45 | // + "merged/C_62Authors14files_original_C++.arff"; 46 | + "62authors14FilesUsenixAndrewFeatures.arff"; 47 | 48 | 49 | String outputArffName ="/Users/Aylin/Desktop/Princeton/BAA/arffs/merged/" 50 | 51 | + "C_62Authors14files_decompiledPlusOriginal.arff"; 52 | 53 | 54 | 55 | 56 | 57 | Util.writeFile("@relation " + file1+file2+"\n" +"\n" , outputArffName, true); 58 | 59 | // Read all the instances in the files 60 | Instances instances = new Instances(new FileReader(file1)); 61 | Instances instances2 = new Instances(new FileReader(file2)); 62 | 63 | for (int att=0; att < instances.numAttributes(); att++) 64 | // for (int att=0; att < 50; att++) 65 | { // System.out.println("instance no:"+att+" "+instances.attribute(att).name()); 66 | String type=""; 67 | String attValues=""; 68 | String name=instances.attribute(att).name(); 69 | name.replace("$", "dollarsign"); 70 | String arr[] = name.split("\n", 10); 71 | if(arr.length>1){ 72 | name=""; 73 | int splits = arr.length; 74 | for(int i =0; i1){ 128 | name=""; 129 | int splits = arr.length; 130 | for(int i =0; i"+"\n", problemSetFilename, true); 31 | Util.writeFile("\t" + 
""+"\n", problemSetFilename, true); 32 | for(int i=0; i< authorName.length; i++) 33 | { 34 | 35 | Util.writeFile("\t"+"\t"+ ""+"\n", problemSetFilename, true); 36 | List test_cpp_paths = Util.listCPPFiles(test_dir + authorName[i] + "/"); 37 | // System.out.println(test_cpp_paths); 38 | for(int j=0; j < test_cpp_paths.size();j++ ) 39 | { 40 | File fileCPP = new File(test_cpp_paths.get(j).toString()); 41 | String fileName = fileCPP.getName(); 42 | Util.writeFile("\t"+"\t"+"\t"+"" 43 | + test_cpp_paths.get(j).toString() + "" 44 | + "\n", problemSetFilename, true); 45 | } 46 | Util.writeFile("\t"+"\t"+ ""+ "\n", problemSetFilename, true); 47 | 48 | } 49 | Util.writeFile("\t"+ ""+ "\n", problemSetFilename, true); 50 | Util.writeFile("\t"+ ""+ "\n", problemSetFilename, true); 51 | Util.writeFile("\t"+ ""+ "\n", problemSetFilename, true); 52 | Util.writeFile(""+ "\n", problemSetFilename, true); 53 | 54 | 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /SCAA/src/RelaxedEvaluation.java: -------------------------------------------------------------------------------- 1 | import weka.classifiers.CostMatrix; 2 | import weka.classifiers.Evaluation; 3 | import weka.core.Instance; 4 | import weka.core.Instances; 5 | import weka.core.Utils; 6 | import java.util.ArrayList; 7 | import java.util.Comparator; 8 | import java.util.List; 9 | import java.util.SortedMap; 10 | import java.util.TreeMap; 11 | //Author, Ariel Stolerman, code taken from the doppelganger finder project 12 | 13 | public class RelaxedEvaluation extends Evaluation { 14 | protected int relaxParam; 15 | 16 | /** 17 | * Initializes all the counters for the evaluation. 18 | * Use useNoPriors() if the dataset is the test set and you 19 | * can't initialize with the priors from the training set via 20 | * setPriors(Instances). 
21 | * 22 | * @param data set of training instances, to get some header 23 | * information and prior class distribution information 24 | * @throws Exception if the class is not defined 25 | * @see #useNoPriors() 26 | * @see #setPriors(Instances) 27 | */ 28 | public RelaxedEvaluation(Instances data, int relaxParam) throws Exception { 29 | super(data); 30 | this.relaxParam = relaxParam; 31 | } 32 | 33 | /** 34 | * Initializes all the counters for the evaluation and also takes a 35 | * cost matrix as parameter. 36 | * Use useNoPriors() if the dataset is the test set and you 37 | * can't initialize with the priors from the training set via 38 | * setPriors(Instances). 39 | * 40 | * @param data set of training instances, to get some header 41 | * information and prior class distribution information 42 | * @param costMatrix the cost matrix---if null, default costs will be used 43 | * @throws Exception if cost matrix is not compatible with 44 | * data, the class is not defined or the class is numeric 45 | * @see #useNoPriors() 46 | * @see #setPriors(Instances) 47 | */ 48 | public RelaxedEvaluation(Instances data, CostMatrix costMatrix, int relaxParam) 49 | throws Exception { 50 | super(data, costMatrix); 51 | this.relaxParam = relaxParam; 52 | } 53 | 54 | /** 55 | * Compares Doubles by ascending order 56 | */ 57 | static Comparator descendingDouble = new Comparator() { 58 | @Override 59 | public int compare(Double arg0, Double arg1) { 60 | return -1 * arg0.compareTo(arg1); 61 | } 62 | }; 63 | 64 | static Comparator descendingInteger = new Comparator() { 65 | @Override 66 | public int compare(Integer arg0, Integer arg1) { 67 | return -1 * arg0.compareTo(arg1); 68 | } 69 | }; 70 | 71 | /** 72 | * Updates all the statistics about a classifiers performance for 73 | * the current test instance. 
74 | * 75 | * @param predictedDistribution the probabilities assigned to 76 | * each class 77 | * @param instance the instance to be classified 78 | * @throws Exception if the class of the instance is not 79 | * set 80 | */ 81 | protected void updateStatsForClassifier(double [] predictedDistribution, 82 | Instance instance) 83 | throws Exception { 84 | 85 | int actualClass = (int)instance.classValue(); 86 | 87 | if (!instance.classIsMissing()) { 88 | updateMargins(predictedDistribution, actualClass, instance.weight()); 89 | 90 | // collect all predictions and their corresponding classes 91 | SortedMap predToClass = 92 | new TreeMap(descendingDouble); 93 | for(int i = 0; i < m_NumClasses; i++) { 94 | predToClass.put(predictedDistribution[i], i); 95 | } 96 | List candidateClasses = new ArrayList(relaxParam); 97 | int count = 0; 98 | for (Double pred: predToClass.keySet()) 99 | { 100 | candidateClasses.add(predToClass.get(pred)); 101 | count++; 102 | if (count == relaxParam) 103 | break; 104 | } 105 | // check if relaxed set of candidates contains actual, if so - 106 | // attribute that prediction 107 | // otherwise - take the to pprediction 108 | int predictedClass = -1; 109 | if (candidateClasses.contains(actualClass)) 110 | predictedClass = actualClass; 111 | else 112 | predictedClass = candidateClasses.get(0); 113 | 114 | /* 115 | // Determine the predicted class (doesn't detect multiple 116 | // classifications) 117 | int predictedClass = -1; 118 | double bestProb = 0.0; 119 | for(int i = 0; i < m_NumClasses; i++) { 120 | if (predictedDistribution[i] > bestProb) { 121 | predictedClass = i; 122 | bestProb = predictedDistribution[i]; 123 | } 124 | } 125 | */ 126 | 127 | m_WithClass += instance.weight(); 128 | 129 | // Determine misclassification cost 130 | if (m_CostMatrix != null) { 131 | if (predictedClass < 0) { 132 | // For missing predictions, we assume the worst possible cost. 133 | // This is pretty harsh. 
134 | // Perhaps we could take the negative of the cost of a correct 135 | // prediction (-m_CostMatrix.getElement(actualClass,actualClass)), 136 | // although often this will be zero 137 | m_TotalCost += instance.weight() 138 | * m_CostMatrix.getMaxCost(actualClass, instance); 139 | } else { 140 | m_TotalCost += instance.weight() 141 | * m_CostMatrix.getElement(actualClass, predictedClass, 142 | instance); 143 | } 144 | } 145 | 146 | // Update counts when no class was predicted 147 | if (predictedClass < 0) { 148 | m_Unclassified += instance.weight(); 149 | return; 150 | } 151 | 152 | double predictedProb = Math.max(MIN_SF_PROB, 153 | predictedDistribution[actualClass]); 154 | double priorProb = Math.max(MIN_SF_PROB, 155 | m_ClassPriors[actualClass] 156 | / m_ClassPriorsSum); 157 | if (predictedProb >= priorProb) { 158 | m_SumKBInfo += (Utils.log2(predictedProb) - 159 | Utils.log2(priorProb)) 160 | * instance.weight(); 161 | } else { 162 | m_SumKBInfo -= (Utils.log2(1.0-predictedProb) - 163 | Utils.log2(1.0-priorProb)) 164 | * instance.weight(); 165 | } 166 | 167 | m_SumSchemeEntropy -= Utils.log2(predictedProb) * instance.weight(); 168 | m_SumPriorEntropy -= Utils.log2(priorProb) * instance.weight(); 169 | 170 | updateNumericScores(predictedDistribution, 171 | makeDistribution(instance.classValue()), 172 | instance.weight()); 173 | 174 | // Update other stats 175 | m_ConfusionMatrix[actualClass][predictedClass] += instance.weight(); 176 | if (predictedClass != actualClass) { 177 | m_Incorrect += instance.weight(); 178 | } else { 179 | m_Correct += instance.weight(); 180 | } 181 | } else { 182 | m_MissingClass += instance.weight(); 183 | } 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /SCAA/src/RemoveComments.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.List; 3 | 4 | 5 | public class RemoveComments { 6 | 7 | public 
static void main(String[] args) throws IOException 8 | { 9 | String test = "githubManySmallSnippets/"; 10 | List test_file_paths = Util.listCPPFiles(test); //use this for preprocessing 11 | for(int i=0; i< test_file_paths.size(); i++) 12 | { 13 | String fileName = test_file_paths.get(i).toString(); 14 | System.out.println(fileName); 15 | String sourceCode = Util.readFile(fileName); 16 | // System.out.println(sourceCode.replaceAll("(?:/\\*(?:[^*]|(?:\\*+[^*/]))*\\*+/)|(?://.*)","")); 17 | //does not catch Gleb.kalachev's comments, removed them manually. Has a lot of commented code. 18 | Util.writeFile(sourceCode.replaceAll("(?:/\\*(?:[^*]|(?:\\*+[^*/]))*\\*+/)|(?://.*)",""), fileName, false); 19 | }} 20 | public static void removeComments(String test) throws IOException 21 | { 22 | List test_file_paths = Util.listCPPFiles(test); //use this for preprocessing 23 | for(int i=0; i< test_file_paths.size(); i++) 24 | { 25 | String fileName = test_file_paths.get(i).toString(); 26 | System.out.println(fileName); 27 | String sourceCode = Util.readFile(fileName); 28 | // System.out.println(sourceCode.replaceAll("(?:/\\*(?:[^*]|(?:\\*+[^*/]))*\\*+/)|(?://.*)","")); 29 | //does not catch Gleb.kalachev's comments, removed them manually. Has a lot of commented code. 
30 | Util.writeFile(sourceCode.replaceAll("(?:/\\*(?:[^*]|(?:\\*+[^*/]))*\\*+/)|(?://.*)",""), fileName, false); 31 | } 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /SCAA/src/WholeWordIndexFinder.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.List; 3 | import java.util.regex.Matcher; 4 | import java.util.regex.Pattern; 5 | 6 | // Taken from http://whyjava.wordpress.com/2010/05/04/finding-all-the-indexes-of-a-whole-word-in-a-given-string-using-java/ 7 | public class WholeWordIndexFinder { 8 | 9 | private String searchString; 10 | 11 | public WholeWordIndexFinder(String searchString) { 12 | this.searchString = searchString; 13 | } 14 | 15 | public List findIndexesForKeyword(String keyword) { 16 | String regex = "\\b"+keyword+"\\b"; 17 | Pattern pattern = Pattern.compile(regex); 18 | Matcher matcher = pattern.matcher(searchString); 19 | 20 | List wrappers = new ArrayList(); 21 | 22 | while(matcher.find() == true){ 23 | int end = matcher.end(); 24 | int start = matcher.start(); 25 | IndexWrapper wrapper = new IndexWrapper(start, end); 26 | wrappers.add(wrapper); 27 | } 28 | return wrappers; 29 | } 30 | 31 | public static void main(String[] args) { 32 | WholeWordIndexFinder finder = new WholeWordIndexFinder( 33 | "2 (FunctionDef(((CompoundStatement((ForStatement((ForInit((IdentifierDeclStatement(IdentifierDecl)))))((Condition((RelationalExpression(x)(10)))))((IncDecOp((x))((++))))"); 34 | List indexes = finder.findIndexesForKeyword("Condition"); 35 | System.out.println("Indexes found "+indexes.size() +" keyword found at index : " +indexes.get(0).getStart()); 36 | 37 | //input should be the dep file, do this for each line 38 | //take the last line that a function id appears in that has the whole depth structure 39 | String input = "1111 t (flag)"; 40 | //take the function id in the beginning of the line. 
41 | String firstWord = input.substring(0, input.indexOf('\t')); 42 | System.out.println(firstWord); 43 | 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /SCAA/weka.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/weka.jar --------------------------------------------------------------------------------