├── Corpus
├── .classpath
├── .project
├── CodeJamMetadata.json
├── allusers.py
├── allusers2.py
├── cjcompile.py
├── cjscrape.py
├── cjstats.py
└── users
│ ├── 1128486.txt
│ ├── 1145485.txt
│ ├── 1150485.txt
│ ├── 1150486.txt
│ ├── 1158485.txt
│ ├── 1327485.txt
│ ├── 1460488.txt
│ ├── 1645485.txt
│ ├── 1781488.txt
│ ├── 1835486.txt
│ ├── 1836486.txt
│ ├── 1842485.txt
│ ├── 186264.txt
│ ├── 188266.txt
│ ├── 189252.txt
│ ├── 204113.txt
│ ├── 2075486.txt
│ ├── 2270488.txt
│ ├── 2418487.txt
│ ├── 243103.txt
│ ├── 2433487.txt
│ ├── 2434486.txt
│ ├── 2437488.txt
│ ├── 2437491.txt
│ ├── 2442487.txt
│ ├── 2974486.txt
│ ├── 2984486.txt
│ ├── 2994486.txt
│ ├── 3004486.txt
│ ├── 3014486.txt
│ ├── 3024486.txt
│ ├── 311101.txt
│ ├── 32001.txt
│ ├── 32002.txt
│ ├── 32005.txt
│ ├── 32008.txt
│ ├── 32010.txt
│ ├── 32011.txt
│ ├── 32013.txt
│ ├── 32015.txt
│ ├── 32016.txt
│ ├── 32017.txt
│ ├── 433101.txt
│ ├── 544101.txt
│ ├── 619102.txt
│ ├── 635101.txt
│ ├── 635102.txt
│ ├── 639102.txt
│ ├── 7214486.txt
│ ├── 801485.txt
│ ├── 90101.txt
│ ├── 975485.txt
│ └── users.txt
├── Naive Baseline
├── .classpath
├── .gitignore
├── .project
└── src
│ ├── ARFFFactory.java
│ ├── ARFFFactory2.java
│ ├── ARFFFactory3.java
│ ├── ARFFFactory4.java
│ ├── AbstractExtractor.java
│ ├── CodeBlock.java
│ ├── ControlStatement.java
│ ├── Driver.java
│ ├── ExtractorC.java
│ ├── ExtractorCPP.java
│ ├── FeatureSet.java
│ ├── Loops.java
│ ├── MultiSet.java
│ ├── ReservedC.java
│ ├── ReservedCPP.java
│ ├── WhiteSpace.java
│ ├── c_reserved_words.txt
│ └── cpp_reserved_words.txt
├── README.md
└── SCAA
├── .classpath
├── .gitignore
├── .project
├── .settings
└── org.eclipse.jdt.core.prefs
├── commons-exec-1.2.jar
├── commons-lang3-3.3.2.jar
├── commons.io_2.0.1.jar
├── javacsv.jar
├── src
├── AuthorClassification.java
├── AuthorClassificationRelaxed.java
├── BigramExtractor.java
├── CheckFiles.java
├── DatasetCreator.java
├── DepthASTNode.java
├── DistanceCalculations.java
├── FeatureCalculators.java
├── FeatureExtractor.java
├── FeatureExtractorConcurrent.java
├── FeatureExtractorInfoGain.java
├── IndexWrapper.java
├── LevenshteinDistance.java
├── MergeArffFiles.java
├── MergeArffFilesNew.java
├── ProblemSetWriter.java
├── RelaxedEvaluation.java
├── RemoveComments.java
├── Util.java
└── WholeWordIndexFinder.java
└── weka.jar
/Corpus/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/Corpus/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | Corpus
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Corpus/allusers.py:
--------------------------------------------------------------------------------
1 | from urllib import urlopen
2 | import json
3 |
4 | #
5 | # Gets all users who participated in the Google Code Jam competition.
6 | # Posts results in a single text file.
7 | #
8 |
users = {}  # every username discovered so far; used as a set (value is always True)


def get_all_users(round_id, num_players):
    """Record the usernames on one round's scoreboard in ``users``.

    round_id    -- contest id string, spliced into the scoreboard URL
    num_players -- number of scoreboard rows to cover (anything int() accepts)

    The scoreboard is requested repeatedly with start_pos advancing by 30
    (presumably the server's page size -- confirm against the API). The 'n'
    field of every returned row is stored as a key of the module-level
    ``users`` dictionary. Returns None.
    """
    page_size = 30
    # The loop treats start_pos as 1-based, so the stop value has to be
    # num_players + 1; with a stop of num_players the final player was
    # skipped whenever num_players % 30 == 1.
    for pos in range(1, int(num_players) + 1, page_size):
        meta_url = ("http://code.google.com/codejam/contest/scoreboard/"
                    "do?cmd=GetScoreboard&contest_id=" + round_id
                    + "&show_type=all&start_pos=" + str(pos)
                    + "&views_time=1&views_file=0&csrfmiddlewaretoken=")
        meta_json = json.loads(urlopen(meta_url).read())
        for row in meta_json['rows']:
            # ``users`` is only mutated, never rebound, so no ``global`` needed
            users[row['n']] = True
26 |
import os  # needed for os.path below; the script used it without importing it

# The output file is opened first, so a missing users/ directory fails
# before any network requests are made.
with open('users/users.txt', 'w') as user_file:
    # load the contest metadata stored next to this script
    script_dir = os.path.dirname(os.path.realpath(__file__))
    with open(script_dir + "/CodeJamMetadata.json") as metadata_handle:
        metadatafile = metadata_handle.read()
    metadata = json.loads(metadatafile)

    # loop through all years
    for year_json in metadata['competitions']:
        qual_round = year_json['round'][0]  # get only the qualification round
        # collect the users of that year's qualification round
        get_all_users(qual_round['contest'], qual_round['numPlayers'])

    # write out all users, one per line
    for user in users:
        user_file.write(user)
        user_file.write('\n')
43 |
--------------------------------------------------------------------------------
/Corpus/allusers2.py:
--------------------------------------------------------------------------------
1 | from urllib import urlopen
2 | import json
3 | import os
4 |
5 | #
6 | # Gets all users who participated in the Google Code Jam competition.
7 | # Posts results according to round number.
8 | #
9 |
10 | # writes a list of all users who participated in the round
def get_all_users(round_id, num_players):
    """Write the usernames on one round's scoreboard to users/<round_id>.txt.

    round_id    -- contest id string; used in the URL and the output filename
    num_players -- number of scoreboard rows to cover (anything int() accepts)

    The scoreboard is requested repeatedly with start_pos advancing by 30
    (presumably the server's page size -- confirm against the API). Each
    request URL and each username (the row's 'n' field) is echoed to stdout;
    usernames are written to the file one per line. Returns None.
    """
    page_size = 30
    # ``with`` closes the output file even if a request or JSON parse fails
    with open('users/' + round_id + '.txt', 'w') as round_file:
        # The loop treats start_pos as 1-based, so the stop value has to be
        # num_players + 1; with a stop of num_players the final player was
        # skipped whenever num_players % 30 == 1.
        for pos in range(1, int(num_players) + 1, page_size):
            meta_url = ("http://code.google.com/codejam/contest/scoreboard/"
                        "do?cmd=GetScoreboard&contest_id=" + round_id
                        + "&show_type=all&start_pos=" + str(pos)
                        + "&views_time=1&views_file=0&csrfmiddlewaretoken=")
            print(meta_url)
            meta_json = json.loads(urlopen(meta_url).read())

            # find and print usernames
            for row in meta_json['rows']:
                username = row['n']
                round_file.write(username + '\n')
                print(username)
33 |
# load the contest metadata stored next to this script; ``with`` closes the
# handle instead of leaving it to the garbage collector
script_dir = os.path.dirname(os.path.realpath(__file__))
with open(script_dir + "/CodeJamMetadata.json") as metadata_handle:
    metadatafile = metadata_handle.read()
metadata = json.loads(metadatafile)

# loop through every round of every year
for year_json in metadata['competitions']:
    for round_json in year_json['round']:
        get_all_users(round_json['contest'], round_json['numPlayers'])
44 |
--------------------------------------------------------------------------------
/Corpus/cjcompile.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | #
5 | # Usage: python /directory/path/to/cjcompile.py [compiler flags]
6 | #
7 | # Compiles all C/C++ source files in the current working directory.
8 | # Also recursively compiles all C/C++ source files in all subdirectories.
9 | #
10 |
import subprocess  # imported here because the file's top-level imports lack it

# compiler flags are every argument after the script name; slicing leaves
# sys.argv itself unmodified
flags = sys.argv[1:]

# go through all files under the root directory
for (path, dirs, files) in os.walk('.'):
    for f in files:
        # choose the compiler from the extension; ignore everything else
        if f.endswith('.c'):       # C file
            compiler = "gcc"
        elif f.endswith('.cpp'):   # C++ file
            compiler = "g++"
        else:
            continue

        source = os.path.join(path, f)
        # the executable sits next to the source, minus the file extension
        executable = os.path.join(path, os.path.splitext(f)[0])

        # An argument list (no shell involved) keeps paths that contain
        # spaces or shell metacharacters intact.
        command = [compiler, source, "-o", executable] + flags

        # compile
        try:
            subprocess.call(command)
        except OSError as err:
            # compiler could not be started; report it and carry on with the
            # remaining files, as os.system would have
            print("error: could not run %s: %s" % (compiler, err))
        print(' '.join(command))
43 |
--------------------------------------------------------------------------------
/Corpus/cjscrape.py:
--------------------------------------------------------------------------------
1 | from urllib import urlopen
2 | from urllib import urlretrieve
3 | import json
4 | import sys
5 | import os
6 | import zipfile
7 | import shutil
8 | import multiprocessing
9 |
10 | #
11 | # Scrapes Google Code Jam data, and extracts the C/C++/Python source files.
12 | #
13 | # The directory structure and naming convention of the data is as follows:
14 | #
15 | # ./codejamfolder/ |--> c/ | --> username0 | --> p[problem_number].[user_name]0.c
16 | # | | | --> p2453486.Bob0.c
17 | # | | | --> etc...
18 | # | |
19 | # | | --> name0 | --> etc...
20 | # | | | --> etc...
21 | # | | | --> etc...
22 | # | |
23 | # | | --> another0 | --> etc...
24 | # | | --> etc...
25 | # |
26 | # |--> cpp/ | --> etc... | --> etc...
27 | # | | | --> etc...
28 | # | |
29 | # | | --> etc... | --> etc...
30 | # |
31 | # |--> py/ | --> etc... | --> etc...
32 | #
33 |
34 | # returns the URL to download the user submission
def get_download_url(round_id, problem_id, username):
    """Build the scoreboard URL that serves one user's submitted source.

    All three arguments are strings; they are placed into the query string
    exactly as given (no URL escaping is applied).
    """
    pieces = [
        "http://code.google.com/codejam/contest/scoreboard/do?cmd=GetSourceCode&contest=",
        round_id,
        "&problem=",
        problem_id,
        "&io_set_id=0&username=",
        username,
    ]
    return "".join(pieces)
42 |
43 | # scrapes the C/C++/Python files of the given round
def _ensure_dir(path):
    """Create *path* (with parents) unless it already is a directory."""
    if os.path.isdir(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # scrape() is started once per round in separate processes, so
        # another process may create the directory between the check above
        # and makedirs(); only re-raise if it is really still missing.
        if not os.path.isdir(path):
            raise


def _extract_sources(zip_path, problem_id, username):
    """Extract the C/C++/Python members of one downloaded submission zip.

    Each matching member is extracted into
    codejamfolder/<c|cpp|py>/<username>0/ and renamed to
    p<problem_id>.<username>0.<ext>; the resulting path is printed.
    Raises whatever zipfile/os raise (e.g. on a bad zip header).
    """
    with open(zip_path, 'rb') as zip_handle:  # closed even on a bad header
        my_zip = zipfile.ZipFile(zip_handle)

        # loop through each file in the zip file
        for my_file in my_zip.namelist():
            # check for C/C++/Python source
            if my_file.endswith('.c'):
                ext = 'c'
            elif my_file.endswith('.cpp'):
                ext = 'cpp'
            elif my_file.endswith('.py'):
                ext = 'py'
            else:
                continue

            # directory per language and author, file named after the problem
            target_source = 'codejamfolder/' + ext + '/' + username + '0'
            file_newname = 'p' + problem_id + '.' + username + '0.' + ext
            _ensure_dir(target_source)

            # extract and rename source file
            my_zip.extract(my_file, target_source)
            os.rename(target_source + '/' + my_file,
                      target_source + '/' + file_newname)

            # print location of extracted source file
            print(target_source + '/' + file_newname)
            sys.stdout.flush()


# scrapes the C/C++/Python files of the given round
def scrape(round_id, problems, script_path):
    """Download and unpack every listed user's submissions for one round.

    round_id    -- contest id string; selects <script_path>/users/<round_id>.txt
    problems    -- iterable of dicts, each with an 'id' string
    script_path -- directory that contains the users/ folder

    For every (problem, user) pair the submission zip is downloaded into a
    per-round temp directory and its source files are extracted under
    codejamfolder/. A zip that cannot be processed is reported on stdout and
    skipped. Download errors are not caught and propagate to the caller.
    The temp directory is removed when the function exits. Returns None.
    """
    # load list of users
    with open(script_path + '/users/' + round_id + '.txt', 'r') as user_file:
        users = user_file.read().splitlines()

    # temp directory for storing zips (one per round, so parallel rounds
    # do not collide)
    tempdir = round_id + 'temp'
    try:
        # loop through problems in the round
        for problem_json in problems:
            problem_id = problem_json['id']

            # loop through users who participated in the round
            for username in users:
                download_url = get_download_url(round_id, problem_id, username)

                # print and flush URL
                print(download_url)
                sys.stdout.flush()

                # download zip
                _ensure_dir(tempdir)
                target_zip = tempdir + '/' + problem_id + '.' + username + '0.zip'
                urlretrieve(download_url, target_zip)

                # A bad zip header can happen if the user didn't do a
                # problem. ``Exception`` rather than a bare except so that
                # Ctrl-C / SystemExit still stop the scraper.
                try:
                    _extract_sources(target_zip, problem_id, username)
                except Exception:
                    print("error: %s" % (sys.exc_info()[0],))
                    sys.stdout.flush()
    finally:
        # delete temp directory, also when a download error aborts the round
        if os.path.exists(tempdir):
            shutil.rmtree(tempdir)
    return
113 |
114 | # main section of script
if __name__ == '__main__':
    # the metadata file and the users/ folder live next to this script
    script_path = os.path.dirname(os.path.realpath(__file__))
    with open(script_path + "/CodeJamMetadata.json") as metadata_handle:
        metadata = json.loads(metadata_handle.read())

    # start one scraper process per round, across all years
    scrapers = []
    for year_json in metadata['competitions']:
        for round_json in year_json['round']:
            scraper = multiprocessing.Process(
                target=scrape,
                args=(round_json['contest'], round_json['problems'], script_path))
            scraper.start()
            scrapers.append(scraper)

    # wait for every round explicitly rather than relying on interpreter
    # shutdown to collect the child processes
    for scraper in scrapers:
        scraper.join()
132 |
--------------------------------------------------------------------------------
/Corpus/cjstats.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import re
4 | import json
5 |
6 | #
7 | # Counts number of files per round based off filename.
8 | # Run in root directory of files you want to search.
9 | # CodeJamMetadata.json must be in the same directory as this script.
10 | #
11 |
def get_problem_id(filename):
    """Return the first run of ASCII digits found in *filename*.

    Raises AttributeError when the name contains no digit at all, because
    the failed search yields None.
    """
    digits = re.compile('[0-9]+')
    first_run = digits.search(filename)
    return first_run.group(0)
15 |
def get_username(filename):
    """Return the author part of a scraped source filename.

    The final extension and the leading 'p<digits>.' problem prefix are
    removed, so 'p2453486.Bob0.c' gives 'Bob0'. Whatever follows the prefix
    is returned unchanged, including dots and a trailing '0' if present.
    A name without the prefix is returned with only its extension removed.
    """
    # Plain assignment: the previous ``+=`` appended the stem to the full
    # name and produced results such as 'Bob0.cBob0'.
    stem = os.path.splitext(filename)[0]
    # Anchored so that only the leading prefix is stripped, never a
    # 'p<digits>.' sequence that happens to occur inside the username.
    return re.sub(r'^p[0-9]+\.', '', stem)
19 |
# load the contest metadata stored next to this script
script_dir = os.path.dirname(os.path.realpath(__file__))
with open(script_dir + "/CodeJamMetadata.json") as metadata_handle:
    metadatafile = metadata_handle.read()
metadata = json.loads(metadatafile)

# prob_to_round: problem id -> index of its round (metadata order)
# round_users:   per round index, {username -> True} for users seen in files
# round_to_desc: per round index, the round's description
prob_to_round = {}
round_users = []
round_to_desc = []

round_count = 0

for year_json in metadata['competitions']:
    for round_json in year_json['round']:
        round_users.append({})
        round_to_desc.append(round_json['desc'])
        for problem_json in round_json['problems']:
            prob_to_round[problem_json['id']] = round_count
        round_count += 1

# go through all files under the root directory
for (path, dirs, files) in os.walk('.'):
    for f in files:
        # Files that do not follow the p<problem id>.<user>.<ext> naming are
        # skipped instead of aborting the whole count.
        try:
            p_id = get_problem_id(f)
        except AttributeError:
            # get_problem_id found no digits in the name
            continue
        if p_id not in prob_to_round:
            # digits present, but not a problem id listed in the metadata
            continue
        round_users[prob_to_round[p_id]][get_username(f)] = True

# per round: its description, then the number of distinct users seen
for description, seen_users in zip(round_to_desc, round_users):
    print(description)
    print(len(seen_users))
59 |
--------------------------------------------------------------------------------
/Corpus/users/1158485.txt:
--------------------------------------------------------------------------------
1 | linguo
2 | nika
3 | winger
4 | zyz915
5 | misof
6 | andrewzta
7 | rng..58
8 | mystic
9 | acrush
10 | natalia
11 | hanshuai
12 | meret
13 | darnley
14 | eatmore
15 | ilyakor
16 | g201513
17 | ashmelev
18 | Egor
19 | dolphinigle
20 | omeometo
21 | ilyaraz
22 | Bin.Jin
23 | vepifanov
24 | neal.wu
25 | ir5
26 | RAD.
27 | Palmtenor
28 | cgy4ever
29 | pashka
30 | iddaga
31 | voover
32 | ogiekako
33 | Al.Cash
34 | Maja
35 | pdallago
36 | PaulJefferys
37 | surwdkgo
38 | fagu
39 | chokudai
40 | Ra16bit
41 | stgatilov
42 | Eryx
43 | hos.lyric
44 | mikhailOK
45 | SergeyFedorov
46 | vot
47 | iwi
48 | tikitikirevenge
49 | SergeyRogulenko
50 | tos.lunar
51 | anari
52 | dasko
53 | Jonick
54 | seanwu
55 | Vasyl
56 | open
57 | KAP
58 | izulin
59 | Yaro
60 | Khuc.Anh.Tuan
61 | KennyHORROR
62 | uwi
63 | jh1
64 | Landertxu
65 | Tomato
66 | a70babat
67 | levlam
68 | tom612pl
69 | Stigius
70 | jinlin
71 | qizichao
72 | Klinck
73 | kitamasa
74 | yczhang
75 | Ahyangyi
76 | Gennady.Korotkevich
77 | dAnton
78 | theycallhimtom
79 | earl
80 | slippy
81 | peter50216
82 | JAPLJ
83 | HiltonLange
84 | dan19
85 | WSX
86 | Wataru
87 | Vedensky
88 | Vytis
89 | pawelparys
90 | Milanin
91 | bayleef
92 | wata
93 | Fumiya
94 | emaxx
95 | niyaznigmatul
96 | tckwok
97 | Fdg
98 | masha.and.beer
99 | vsb
100 | watashi
101 | dzhulgakov
102 | Akim
103 | yeputons
104 | Louty
105 | RAVEman
106 | ConanKudo247
107 | qwaker.00
108 | Burunduk1
109 | lqp1831
110 | iwiskimo
111 | Farmer.John
112 | sevenkplus
113 | eduardische
114 | navi
115 | X.Ray
116 | monsoon
117 | LinesPrower
118 | GarnetCrow
119 | sdya
120 | Seyaua
121 | lidaobing
122 | bmerry
123 | blmarket
124 | flashmt
125 | R.R.
126 | venco5
127 | kubus
128 | aleksey
129 | sisu
130 | alantian
131 | LoRdTaPaKaH
132 | MaxBuzz
133 | wychen
134 | SmileIJP
135 | temper
136 | gaoxin
137 | szsz
138 | Fcdkbear
139 | PhilipPV
140 | ONP
141 | mk.al13n
142 | ACube
143 | exod40
144 | W.Junqiao
145 | Sammarize
146 | MRoizner
147 | jimison
148 | Gassa
149 | RoBa
150 | polesp
151 | kappahouse
152 | Pedro.Bello
153 | stolis
154 | Sempr
155 | Koyaah
156 | eaglet
157 | Connector
158 | EvgeniusASPX
159 | TangKy
160 | emppu
161 | enot.1.10
162 | Bicheng.Cao
163 | DKI
164 | austrin
165 | paladin8
166 | KOTEHOK
167 | Burunduk2
168 | Jonasz
169 | ll931110
170 | charliez
171 | kcm1700
172 | shik
173 | ivan.popelyshev
174 | YiningWang
175 | xreborner
176 | Progbeat
177 | burdakovd
178 | alexkas
179 | pieguy
180 | gawry
181 | Anton.Lunyov
182 | WhiteBear
183 | SoCalledName
184 | XiaoZiqian
185 | akira.nekoneko
186 | kcd
187 | msg555
188 | kia
189 | yayamao
190 | chenwang0517
191 | 2rf
192 | foison
193 | w10d
194 | DjinnKahn
195 | butterfly21
196 | flowlight
197 | huameng
198 | zaq1xsw2tktk
199 | DD.tt
200 | gilesg
201 | nk.karpov
202 | qin
203 | DamianS
204 | janq
205 | thinfaifai
206 | Zhiwei.Li
207 | simp1eton
208 | dano
209 | xlmj531
210 | This
211 | fidels
212 | Alexander86
213 | gustav
214 | Aiz
215 | pasin30055
216 | fero
217 | Pompon
218 | vpj
219 | boris4
220 | zhendongjia
221 | nihao
222 | domeng
223 | DmitryEgorov
224 | Pasqual45
225 | rafaeldaigo
226 | bcloud7
227 | han6
228 | Kirino
229 | xiaowuc1
230 | Gluk
231 | Joshik
232 | Lovro
233 | Dragoon
234 | int9
235 | tsukuno
236 | Ljq
237 | megaterik
238 | tanakh
239 | zibada
240 | VulpesX
241 | SmartSchizo
242 | drazil
243 | Ignat
244 | fuch
245 | VArtem
246 | SkidanovAlexander
247 | Zlobober
248 | VITAKS
249 | guilherme
250 | defrager
251 | imbanoob
252 | humblefool
253 | sprea
254 | yiuyuho
255 | Yakumo
256 | latsyrc
257 | chEEtah
258 | beingryu
259 | Romka
260 | LayCurse
261 | Sergey.Bankevich
262 | gdiver
263 | mrozik
264 | ronalchn
265 | berger
266 | Ostap
267 | hpmv
268 | KevinErikLee
269 | ploh
270 | blando
271 | psir
272 | darkKelvin
273 | dooglius
274 | tantian
275 | alyaxey
276 | Vitaliy
277 | madking
278 | Slimper
279 | cedriclin
280 | dennis.lissov
281 | kuba97531
282 | resty
283 | eagleonhill
284 | pmnox
285 | paisa
286 | AVictor
287 | Yao
288 | EpicWu
289 | cocls
290 | .dP.
291 | goodwind
292 | ush
293 | QuJun
294 | C.A.
295 | ikatanic
296 | oioi98
297 | zouxun
298 | technolt
299 | dalex
300 | hillboy
301 | Sanny
302 | GVS
303 | dj3500
304 | AlexLin
305 | stan
306 | balakrishnan.v
307 | alexmat21
308 | jellies
309 | jackchen92
310 | TripleManiac
311 | moonlight
312 | Dembel
313 | love.wenxuan
314 | SceneTree
315 | wuzhengkai
316 | iscsi
317 | AquaSnail
318 | shadowind
319 | MauricioC
320 | dzetkulict
321 | ramlaf
322 | Tony
323 | zhouxiaobo
324 | radeye
325 | sjtu.pigoneand
326 | eMBe
327 | k.kojima
328 | siavosh
329 | mlwong
330 | Ax.h
331 | shreevatsa
332 | zero.lin
333 | AlexanderL
334 | springegg
335 | LordKlotski
336 | dozingcat
337 | dyukha
338 | narri
339 | naagi
340 | EricStansifer
341 | goober
342 | Michael.Levin
343 | Ahbong
344 | 357
345 | BM954
346 | SeMeKh
347 | Kinan.Sarmini
348 | pablo.aguilar
349 | angwuy
350 | dahlukeh
351 | aditsu
352 | aurinegro
353 | pP5438
354 | liymouse
355 | Myth
356 | Lightmoon
357 | strangecow
358 | Navid
359 | xneby
360 | Nikkolloz
361 | azhai
362 | Peteris
363 | Factorial
364 | random.johnnyh
365 | hmao5
366 | wangchaohui
367 | jaehyunp
368 | sbzlyessit
369 | HerrKanzler
370 | almaz
371 | Akai
372 | ttim
373 | coolzzz
374 | dan.banica
375 | berestinsky
376 | forifchen
377 | Weiqi
378 | solidsnake1905
379 | falcon112358
380 | Jed
381 | vlad
382 | Jrdevil.1984
383 | Hitrez
384 | Tomi
385 | seuamigohenry
386 | Kalmakka
387 | piyifan
388 | cypress
389 | WeiLiu
390 | Davidmg
391 | akaki
392 | ghostgold
393 | Handojo1
394 | jzj
395 | TRYang
396 | AekdyCoin
397 | Naonao
398 | ErickW
399 | shouhm
400 | FloppyCat
401 | TDteach
402 | kmjp
403 | gepa
404 | stupidbear
405 | Nooodles
406 | whh
407 | ArtHoly
408 |
--------------------------------------------------------------------------------
/Corpus/users/1327485.txt:
--------------------------------------------------------------------------------
1 | rng..58
2 | mystic
3 | meret
4 | RAD.
5 | misof
6 | g201513
7 | pashka
8 | vepifanov
9 | eatmore
10 | winger
11 | natalia
12 | acrush
13 | ilyakor
14 | ir5
15 | Bin.Jin
16 | voover
17 | hanshuai
18 | Palmtenor
19 | cgy4ever
20 | ashmelev
21 | Egor
22 | linguo
23 | neal.wu
24 | darnley
25 | zyz915
26 |
--------------------------------------------------------------------------------
/Corpus/users/1835486.txt:
--------------------------------------------------------------------------------
1 | EgorKulikov
2 | Eryx
3 | SnapDragon
4 | eatmore
5 | bmerry
6 | Ahyangyi
7 | squark
8 | andrewzta
9 | Vasyl
10 | misof
11 | Gennady.Korotkevich
12 | meret
13 | neal.wu
14 | watashi
15 | sdya
16 | Burunduk1
17 | vepifanov
18 | hos.lyric
19 | fhlasek
20 | chokudai
21 | paladin8
22 | Farmer.John
23 | gawry
24 | Dlougach
25 | rng..58
26 | g201513
27 | dzhulgakov
28 | omeometo
29 | sevenkplus
30 | Gassa
31 | exod40
32 | peter50216
33 | mikhailOK
34 | emaxx
35 | Krazul
36 | cos
37 | Romka
38 | kcm1700
39 | Zhuojie
40 | zithral
41 | mk.al13n
42 | shik
43 | kohyatoh
44 | Murphy
45 | latsyrc
46 | Plagapong
47 | ivan.popelyshev
48 | withleave
49 | Myth5
50 | Tigvarts
51 | Tomato
52 | chEEtah
53 | RAD.
54 | LayCurse
55 | yeputons
56 | azhai
57 | DmitryEgorov
58 | Chmel.Tolstiy
59 | sisu
60 | ConanKudo247
61 | MiminoCoder
62 | gaoyihan
63 | Vitaliy
64 | Peti
65 | Alexander86
66 | DKI
67 | AdrianKuegel
68 | tikitikirevenge
69 | tkociumaka
70 | ploh
71 | ir5
72 | SergeyFedorov
73 | vlad89
74 | Xhark
75 | cedriclin
76 | gnomnain
77 | wap
78 | stgatilov
79 | iwi
80 | tomerun
81 | Pasqual45
82 | Saeed
83 | WhiteBear
84 | Anton.Lunyov
85 | thocevar
86 | PavelKunyavskiy
87 | MikeMirzayanov
88 | KAP
89 | Maryann
90 | kmod
91 | anton.akhi
92 | Zlobober
93 | LGM
94 | razimantv
95 | mystic
96 | DamianS
97 | Ra16bit
98 | LoRdTaPaKaH
99 | AHdoc
100 | atetubou
101 | gepa
102 | T.Insane
103 | darnley
104 | logicmachine
105 | nik239
106 | CNRICville
107 | chavit92
108 | voover
109 | sky58
110 | Jonasz
111 | LXYXYNT
112 | IvanRomanov
113 | Palmtenor
114 | dzetkulict
115 | donvel
116 | bwps
117 | GeKa
118 | acrush
119 | CaseyRoberts711
120 | Cifko
121 | homo.sapiens
122 | vot
123 | AlexUdalov
124 | lightholy
125 | daidailanlan
126 | fuch
127 | uwi
128 | goober
129 | fuseidenamida
130 | fixme
131 | ashmelev
132 | Astein
133 | vexorian
134 | natalia
135 | Borisp
136 | Vytis
137 | cgy4ever
138 | jakubr
139 | AS1
140 | popwax
141 | Smylic
142 | argentony
143 | aropan
144 | Silence
145 | linguo
146 | izulin
147 | Lipstick
148 | natsugiri
149 | hadi
150 | DAle
151 | pasin30055
152 | alexmat21
153 | pjsdream
154 | zw7840
155 | MrRoach
156 | dj3500
157 | llx
158 | Yao
159 | levlam
160 | lidaobing
161 | kelvinlau
162 | anrieff
163 | elfness
164 | angwuy
165 | ZhukovDmitry
166 | xujie
167 | yvasyliv
168 | WSX
169 | unbing
170 | Dovgaluk
171 | fuwenjie
172 | Kirino
173 | pieguy
174 | Imo
175 | FloppyCat
176 | rohanp77
177 | VulpesX
178 | dg.
179 | Cruiser
180 | tfliao
181 | geeky.elk
182 | exKAZUu
183 | maksay
184 | eldering
185 | .dP.
186 | RedApe
187 | stqn
188 | Nicolas16
189 | boleyn.su
190 | zhujiaye
191 | DoublePointer
192 | 2rf
193 | RiaD
194 | Kristofer
195 | Wataru
196 | WXYZ
197 | JaapB
198 | wata
199 | s.y
200 | shevchen
201 | marcoskwkm
202 | JAPLJ
203 | nika
204 | h4tguy
205 | pawelparys
206 | lzw75
207 | joy32812
208 | darkKelvin
209 | GlebsHP
210 | Aiz
211 | xreborner
212 | MonEtoile
213 | ania7
214 | Sempr
215 | alberist
216 | olalia
217 | wojteks
218 | sbzlyessit
219 | maciejk
220 | ZaN
221 | kitamasa
222 | NP...np
223 | AekdyCoin
224 | rowdark
225 | eduardische
226 | guilherme
227 | kappahouse
228 | pflueger
229 | technolt
230 | dai1741
231 | AHA
232 | Nabb
233 | eddyferreira
234 | KOTEHOK
235 | Michael.Levin
236 | a3nm
237 | beingryu
238 | jzj
239 | Erop
240 | porker2008
241 | immoonancient
242 | Tojot
243 | Milanin
244 | ZbanIlya
245 | jdmetz
246 | W4yneb0t
247 | wuzhengkai
248 | palacios.roy
249 | ush
250 | Ljq
251 | tos.lunar
252 | chnlich
253 | chaemon
254 | jpaulson
255 | alantian
256 | desertfox
257 | kobra
258 | ytj
259 | hillboy
260 | lcch
261 | Ostap
262 | kawatea
263 | spnautilus
264 | iakudryashov
265 | frank12268
266 | Matej
267 | Fatest
268 | EvgeniSergeev
269 | byte
270 | zbwmqlw
271 | simp1eton
272 | Felix
273 | jki14
274 | Ixanezis
275 | MilesEdgeworth
276 | Nyatl
277 | Mister
278 | AndreySiunov
279 | yaray
280 | Caoqinxiang
281 | pashkal
282 | Garyzx
283 | eXtreme
284 | gaoxin
285 | polmauri
286 | k21
287 | SergeyLazarev
288 | tsukuno
289 | imbanoob
290 | MRoizner
291 | xiaowuc1
292 | ulzha
293 | aussie
294 | Arios
295 | Taehyun
296 | jthread
297 | kmjp
298 | i314
299 | ACube
300 | arihayes
301 | jackchen92
302 | evima
303 | IwfWcf
304 | a180285
305 | Amtrix
306 | lintaor1
307 | superjoel
308 | Psyho
309 | Logic..IU
310 | Cronos
311 | goffrie
312 | kAc
313 | xiaodao
314 | Aleks
315 | kdalex
316 | abrackadabra
317 | ChingYunH
318 | K.A.D.R
319 | yangzhe1991
320 | DarLam
321 | geka666
322 | lrgar
323 | mr146
324 | GreenPeace
325 | Abscp
326 | delta2323
327 | kormyshov
328 | iPeter
329 | userresu
330 | Imsbuno
331 | williamljb
332 | Fancy
333 | Lovro
334 | daviduarte
335 | coldcutter
336 | Paf
337 | paramaciej
338 | LinesPrower
339 | EmK
340 | bruce3557
341 | jackfeng
342 | Gigz
343 | dogwalker
344 | zouxun
345 | Dener
346 | impetus
347 | issue9
348 | elizarov
349 | strapahuulius
350 | swgr
351 | LYW
352 | Kriii
353 | marim
354 | svick
355 | HexTree
356 | b0b0b0b
357 | domob
358 | SergGr
359 | real
360 | Robert.Newey
361 | tur.turczyn
362 | hansonw
363 | framalex
364 | Yoshiap
365 | jaigupta
366 | Smitty
367 | jinlin
368 | DaniJVaz
369 | YUKI.M
370 | justever86
371 | JC.C.
372 | LiuKe
373 | UESTC.Fish
374 | SumuduF
375 | yairchu
376 | A.Grishchenko
377 | C0pymaster
378 | nullmineral
379 | gasho
380 | DryukAlex
381 | bnulzm
382 | narri
383 | blue.boy
384 | alpc104
385 | fidels
386 | ytau
387 | Jan.D.Huang
388 | Boping
389 | srh
390 | francoisvn
391 | Jens
392 | RalfKistner
393 | Nikolay.Kalinin
394 | pashka
395 | GunnERs
396 | Gordderp
397 | LazyLie
398 | nip
399 | fswenton
400 | kevinsogo
401 | ssssss
402 |
--------------------------------------------------------------------------------
/Corpus/users/2075486.txt:
--------------------------------------------------------------------------------
1 | meret
2 | neal.wu
3 | misof
4 | vepifanov
5 | hos.lyric
6 | bmerry
7 | watashi
8 | SnapDragon
9 | dzhulgakov
10 | eatmore
11 | g201513
12 | Farmer.John
13 | Ahyangyi
14 | exod40
15 | EgorKulikov
16 | Burunduk1
17 | gawry
18 | sdya
19 | rng..58
20 | chokudai
21 | Dlougach
22 | paladin8
23 | Eryx
24 | Vasyl
25 | fhlasek
26 |
--------------------------------------------------------------------------------
/Corpus/users/243103.txt:
--------------------------------------------------------------------------------
1 | bmerry
2 | qizichao
3 | winger
4 | Ahyangyi
5 | misof
6 | rem
7 | kia
8 | mystic
9 | marek.cygan
10 | dzhulgakov
11 | Vitaliy
12 | wata
13 | kalinov
14 | ACRush
15 | AdrianKuegel
16 | Myth
17 | pashka
18 | ZhukovDmitry
19 | Khuc.Anh.Tuan
20 | Jeru
21 | PaulJefferys
22 | ploh
23 | emaxx
24 | FloppyCat
25 | nika
26 | alyaxey
27 | halyavin
28 | hos.lyric
29 | Burunduk1
30 | Jiunru
31 | moon5ckq
32 | KOTEHOK
33 | KAP
34 | Imba
35 | linyufly
36 | .Invader
37 | meret
38 | defrager
39 | xlmj531
40 | andrewzta
41 | Lovro
42 | overwise
43 | AS1
44 | marcina
45 | JongMan
46 | ikatanic
47 | ftc
48 | WangDong
49 | ShangJingbo
50 | natalia
51 | logistic
52 | Cheryl
53 | iwi
54 | MikeMirzayanov
55 | ScaleRhyme
56 | Zig
57 | vlad89
58 | blueblimp
59 | Huayang
60 | Lunarmony
61 | neal.wu
62 | Yao
63 | fuwenjie
64 | austrin
65 | diver
66 | LucaB
67 | SpaceFlyer
68 | stone
69 | Helenjyun
70 | MRoizner
71 | yangzhe1990
72 | eatmore
73 | beingryu
74 | tanakh
75 | igorcanadi
76 | RAD
77 | Alexus
78 | EmK
79 | Farmer.John
80 | Michael.Levin
81 | Yarin
82 | Borisp
83 | arti
84 | Gluk
85 | fsouza
86 | gojira
87 | VitalyGoldstein
88 | WSX
89 | Dragoon
90 | gusakov
91 | 1leaf1
92 | PhilipPV
93 | MiminoCoder
94 | CS.Ferng
95 | nya
96 | lewha0
97 | aanastasov
98 | NeT
99 | dgozman
100 | skol
101 | Narg.
102 | kitamasa
103 | dzwiedziu
104 | SavinovAlex
105 | charliez
106 | TheLlama
107 | u1ik
108 | sidky
109 | stjepan
110 | ardiankp
111 | narri
112 | satchipear
113 | lukasP
114 | Doggy
115 | ush
116 | Jonasz
117 | partisan
118 | araste
119 | Gennady.Korotkevich
120 | jaehyunp
121 | gawry
122 | strapahuulius
123 | Progbeat
124 | Vasyl
125 | hhanger
126 | vitar
127 | oberon
128 | jzd
129 | lympanda
130 | cpphamza
131 | anton.akhi
132 | kana.ikeda
133 | maciejk
134 | bwps
135 | Sorokin
136 | Fire
137 | xreborner
138 | Im2Good
139 | Elmiguel409
140 | gislan
141 | HenryNSW
142 | chEEtah
143 | Romka
144 | stan
145 | Nerevar
146 | snguyen.itim
147 | Zhomart
148 | indifferent
149 | felixh
150 | xgy
151 | Sergey.Bankevich
152 | SergeyRogulenko
153 | wjsw
154 | Flex
155 | jthread
156 | ogiekako
157 | SergeyMelnikov
158 | altertain
159 | SergeyFedorov
160 | eMBe
161 | sisu
162 | Will.Wu
163 | palacios.roy
164 | yiuyuho
165 | Eryx
166 | igoro
167 | tanonev
168 | not2knight
169 | deepblue
170 | Smylic
171 | Egor
172 | Maris
173 | boboo
174 | slippy
175 | dolphinigle
176 | fengzlzl
177 | tudejian
178 | indy256
179 | reiten
180 | Fernando
181 | ytj
182 | aditsu
183 | KreysSergey
184 | Jason911
185 | elhipercubo
186 | piyifan
187 | StanY
188 | jbernadas
189 | MaxBuzz
190 | Doeth
191 | Akim
192 | domeng
193 | eagleonhill
194 | latsyrc
195 | momtchil
196 | aussie
197 | chultquist
198 | DeCowboy
199 | cax
200 | Oracle.
201 | domino
202 | macs
203 | Xazker
204 | ssaljalu
205 | SHOIT
206 | chc000
207 | Duc
208 | guilherme
209 | VehicleOfPuzzle
210 | tohagnom
211 | asaveljevs
212 | exod40
213 | leehark
214 | frostnova
215 | gnarlycow
216 | Astein
217 | LYW
218 | izulin
219 | SceneTree
220 | MikleB
221 | humblefool
222 | tund
223 | Wataru
224 | Loner
225 | vigo.ph
226 | tdmorgan
227 | pieguy
228 | Fly
229 | forest
230 | Louty
231 | dexy
232 | Xhark
233 | tracyhenry
234 | vot
235 | Palmtenor
236 | Zero.ztz
237 | eduardische
238 | bachelor
239 | Rahenri
240 | arkar
241 | Medeiros
242 | turuthok
243 | zibada
244 | snizovtsev
245 | TDYa127
246 | keshav57
247 | yariv
248 | onp
249 | boolean
250 | carlos.guia
251 | diwulechao1988
252 | xtof.durr
253 | moonancient
254 | tomekkulczynski
255 | RoBa
256 | roman
257 | Al.Cash
258 | Ryan
259 | Arif
260 | Chmel.Tolstiy
261 | PavelKuznetsov
262 | lschyt
263 | MDA
264 | cpcs
265 | ttim
266 | Cjf
267 | ithlony
268 | Ying
269 | lxhgww
270 | billynyh
271 | paladin8
272 | ToN.AC119
273 | serg
274 | tsukuno
275 | Kicd
276 | LoRdTaPaKaH
277 | Tomato
278 | Landertxu
279 | deepakmanohar
280 | jakubr
281 | lucaspiva
282 | try
283 | bayleef
284 | navi
285 | baihacker
286 | Zr40
287 | aleksey
288 | ania7
289 | LastStand.ziliang
290 | SmartSchizo
291 | aurinegro
292 | Shahriar.Rouf.Nafi
293 | haha
294 | ltaravilse
295 | venco5
296 | A.I.R.
297 |
--------------------------------------------------------------------------------
/Corpus/users/2433487.txt:
--------------------------------------------------------------------------------
1 | Gennady.Korotkevich
2 | vepifanov
3 | SnapDragon
4 | theycallhimtom
5 | mystic
6 | mikhailOK
7 | winger
8 | dzhulgakov
9 | qwerty787788
10 | PavelKunyavskiy
11 | nika
12 | hos.lyric
13 | JAPLJ
14 | voover
15 | MiminoCoder
16 | EgorKulikov
17 | staniek
18 | lunae
19 | BNieuwenhuizen
20 | vot
21 | jpaulson
22 | dancho
23 | Psyho
24 | pieguy
25 | Vitaliy
26 | Vasyl
27 | Lovro
28 | mozeda
29 | sdya
30 | NALP
31 | wata
32 | subscriber
33 | niyaznigmatul
34 | stgatilov
35 | GagGuy
36 | ShangJingbo
37 | Ra16bit
38 | JialinOuyang
39 | Jonasz
40 | sevenkplus
41 | bmerry
42 | Tomato
43 | meret
44 | pperm
45 | RAD.
46 | tkociumaka
47 | LayCurse
48 | fagu
49 | peter50216
50 | evima
51 | Mochavic
52 | Myth5
53 | watashi
54 | Romka
55 | rng..58
56 | ush
57 | Thijs.
58 | megaterik
59 | pashka
60 | semiexp.
61 | AppleCplus
62 | fhlasek
63 | Ostap
64 | kevinsogo
65 | levlam
66 | XraY
67 | Merkurev
68 | Soultaker
69 | edly
70 | ShawnDong
71 | Marcin.Smulewicz
72 | dolphinigle
73 | Zlobober
74 | c0cddf
75 | Nerevar
76 | ConanKudo247
77 | Anton.Lunyov
78 | misof
79 | JongMan
80 | cos
81 | shik
82 | linguo
83 | SergeyFedorov
84 | Aksenov239
85 | DKI
86 | maksai
87 | Seyaua
88 | ogiekako
89 | aa2985759
90 | Astein
91 | flowlight0
92 | navi
93 | R.R.
94 | Nikolay.Kalinin
95 | tomerun
96 | Sereja
97 | ashmelev
98 | KhaustovPavel
99 | darnley
100 | LoRdTaPaKaH
101 | pattara.s
102 | yeputons
103 | ZhukovDmitry
104 | TankEngineer
105 | W.Junqiao
106 | Kaizero
107 | Jonick
108 | chokudai
109 | dj3500
110 | n.vilcins
111 | TRYang
112 | MikeMirzayanov
113 | Logic..IU
114 | pawelparys
115 | darknife
116 | coquelicot
117 | enot.1.10
118 | Wataru
119 | DmitryEgorov
120 | Huziwara
121 | kuba97531
122 | kmod
123 | Kirino
124 | JaNo
125 | LeBron
126 | anrieff
127 | andrewzta
128 | technolt
129 | wafrelka
130 | ZILIANG
131 | littlesheep2014
132 | anton.akhi
133 | HaaS
134 | SumuduF
135 | yh.victor
136 | Chmel.Tolstiy
137 | Palmtenor
138 | Aleksander
139 | lxhgww
140 | fuch
141 | mkn
142 | alantian
143 | pablo.t89
144 | wuzhengkai
145 | mfv
146 | kawatea
147 | ruobaole
148 | joey2005
149 | hex539
150 | zyz915
151 | Handojo1
152 | Ljq
153 | Vytis
154 | W4yneb0t
155 | humblefool
156 | tsukuno
157 | Leewings
158 | wh61
159 | gongbaoa
160 | knightL
161 | climpet
162 | imbanoob
163 | razimantv
164 | freak93
165 | lewha0
166 | buko
167 | rinigan
168 | chavit92
169 | cgy4ever
170 | Dembel
171 | robertre
172 | gilesg
173 | forest
174 | zylber
175 | TimeString
176 | Ballon
177 | EmK
178 | Maryann
179 | liangjiaxing
180 | Breakun
181 | xneby
182 | ikatanic
183 | buaaGG
184 | iwi
185 | Lewin
186 | Sabelan
187 | Nicolas16
188 | rowdark
189 | Xhark
190 | DeCowboy
191 | l521530
192 | RalfKistner
193 | mk.al13n
194 | Stigius
195 | Dlougach
196 | komaki
197 | surwdkgo
198 | Ming.Shen
199 | Caoqinxiang
200 | xcwgf666
201 | Yaro
202 | rotsor
203 | shimps
204 | ZhouYuChen
205 | standy
206 | EbTech
207 | Dumbear2
208 | Gassa
209 | isea
210 | ir5
211 | ania7
212 | jalman
213 | EricStansifer
214 | chnlich
215 | darksteel
216 | boleyn.su
217 | DarLam
218 | radeye
219 | aropan
220 | homo.sapiens
221 | simonlindholm
222 | thalassarche
223 | Angor
224 | Giove
225 | Ahmed.Salama
226 | james0zan
227 | amylase
228 | Fumiya
229 | seanwu
230 | Jimbly
231 | irwan.ap
232 | Zerosharp
233 | tongcx
234 | shihanyuan
235 | SurendraM
236 | pjsdream
237 | emaxx
238 | mixsx
239 | kusano
240 | proscriptus
241 | mishun
242 | SkorKNURE
243 | Landertxu
244 | vlpolyansky
245 | Spooky
246 | Nin0
247 | k21
248 | kinodjnz
249 | doudouille
250 | svm11
251 | nwin
252 | polmauri
253 | yarrr
254 | bwps
255 | mrozik
256 | rohanp77
257 | KennyHORROR
258 | masha.and.beer
259 | wikol
260 | Kepnu4
261 | michal27
262 | Lunarmony
263 | logicmachine
264 | CS.Ferng
265 | bayleef
266 | fanhqme
267 | squark
268 | dasko
269 | szefany
270 | Milanin
271 | KrK
272 | zbwmqlw
273 | flashmt
274 | pasin30055
275 | Cruiser
276 | Gleb
277 | slex
278 | davidv
279 | miaout
280 | dspyz
281 | a2stnk
282 | abcsampson
283 | GuyUpLion
284 | Konjac
285 | protos
286 | tpyopyt
287 | rishig
288 | santjuan
289 | IwfWcf
290 | morningsyj
291 | AHdoc
292 | Poldnev
293 | wychen
294 | alexmat21
295 | Fata1ist
296 | CarpathianCoder
297 | kormyshov
298 | 0b00101010
299 | Krig
300 | waitingkuo0527
301 | ptncks0121
302 | paladin8
303 | Lukasz16a
304 | tozangezan
305 | dkirienko
306 | Kriii
307 | natsugiri
308 | yisun
309 | Nekosyndrome
310 | gepa
311 | Dener
312 |
--------------------------------------------------------------------------------
/Corpus/users/2437491.txt:
--------------------------------------------------------------------------------
1 | mystic
2 | Vasyl
3 | winger
4 | sdya
5 | pieguy
6 | mikhailOK
7 | jpaulson
8 | EgorKulikov
9 | Lovro
10 | staniek
11 | SnapDragon
12 | Vitaliy
13 | dzhulgakov
14 | PavelKunyavskiy
15 | voover
16 | NALP
17 | vepifanov
18 | nika
19 | GagGuy
20 | BNieuwenhuizen
21 | lunae
22 | MiminoCoder
23 | JAPLJ
24 | theycallhimtom
25 |
--------------------------------------------------------------------------------
/Corpus/users/3024486.txt:
--------------------------------------------------------------------------------
1 | EgorKulikov
2 | ivan.popelyshev
3 | Gennady.Korotkevich
4 | vepifanov
5 | sevenkplus
6 | DmitryEgorov
7 | ffao
8 | wuzhengkai
9 | eatmore
10 | mk.al13n
11 | Marcin.Smulewicz
12 | isea
13 | WJMZBMR
14 | fhlasek
15 | Lovro
16 | sdya
17 | exod40
18 | kcm1700
19 | vlad89
20 | hos.lyric
21 | mystic
22 | iwi
23 | Romka
24 | Kepnu4
25 | dzhulgakov
26 | ah1926
27 | Zlobober
28 | shik
29 | kawatea
30 | ZhukovDmitry
31 | Bobik
32 | KennyHORROR
33 | meshanya
34 | VArtem
35 | yeputons
36 | uwi
37 | evima
38 | ThinkCreative
39 | K.A.D.R
40 | niyaznigmatul
41 | nika
42 | Zennon
43 | RiaD
44 | chokudai
45 | semiexp.
46 | Nerevar
47 | eduardische
48 | sky58
49 | netkuba
50 | watashi
51 | peter50216
52 | Gassa
53 | Yarin
54 | Ra16bit
55 | LeoYu
56 | mkirsche
57 | komaki
58 | earl
59 | mikhailOK
60 | Merkurev
61 | Nicolas16
62 | ZbanIlya
63 | fagu
64 | Pasqual45
65 | ValenKof
66 | Eryx
67 | wata
68 | gongbaoa
69 | chnlich
70 | mbradac
71 | notime.sea
72 | kevinsogo
73 | fushar
74 | Fcdkbear
75 | misof
76 | LeBron
77 | spnautilus
78 | qwerty787788
79 | ichyo
80 | twds
81 | TTL.135678942
82 | v.haralampiev
83 | subscriber
84 | PavelKunyavskiy
85 | Riatre
86 | logicmachine
87 | Seyaua
88 | JAPLJ
89 | kraskevich
90 | polmauri
91 | Fefer.Ivan
92 | fetetriste
93 | NALP
94 | linguo
95 | winger
96 | Palmtenor
97 | Murphy
98 | azneye
99 | dongockhanh1997
100 | edward.mj
101 | Vytis
102 | bmerry
103 | berger
104 | Fumiya
105 | tunyash
106 | climpet
107 | Lukasz16a
108 | flowlight0
109 | tkociumaka
110 | pattara.s
111 | qwaker.00
112 | cmd
113 | kostyaby
114 | SnapDragon
115 | pmnox
116 | dnkywin
117 | m.radwan
118 | Tomi
119 | Burunduk1
120 | Fdg
121 | smithinsu
122 | likawind
123 | stqn
124 | stgatilov
125 | oldjunyi
126 | R.R.
127 | NGG
128 | squark
129 | Al.Cash
130 | Tomato
131 | Darsein
132 | YUKI.M
133 | MikeMirzayanov
134 | Sereja
135 | Merlininice.yn
136 | MiminoCoder
137 | pashka
138 | Dembel
139 | Smylic
140 | yaray
141 | S.Yesipenko
142 | Krig
143 | cgy4ever
144 | pieguy
145 | RomaWhite
146 | Mister
147 | c175353
148 | DKI
149 | l521530
150 | caiwaifung
151 | elfness
152 | Dgleich
153 | Zaic
154 | TeaPot
155 | Anta0
156 | W4yneb0t
157 | Stigius
158 | iakudryashov
159 | niquefa.diego
160 | surwdkgo
161 | stevenhao
162 | pandamonium
163 | lzqxh
164 | pperm
165 | Xellos
166 | kohyatoh
167 | andrewzta
168 | tsukuno
169 | MarioYC
170 | sillycross
171 | Mochavic
172 | shimps
173 | Errichto
174 | natalia
175 | zylber
176 | navi
177 | Grzesiu
178 | tckwok
179 | KrK
180 | anrieff
181 | islam.al.aarag
182 | MiSawa
183 | zyz915
184 | Kirino
185 | HiltonLange
186 | Pompon
187 | enarc
188 | k21
189 | ariacas
190 | edgarthewise
191 | pawelparys
192 | koratel
193 | mike.nzk
194 | Smithers
195 | vigo.ph
196 | Sammarize
197 | Robert.Rosolek
198 | elsantodel90
199 | Ming.Shen
200 | Endagorion
201 | carlop
202 | Logic..IU
203 | bwps
204 | HellFalcon
205 | YerzhanU
206 | bit.yangxm
207 | caustique
208 | Lewin
209 | FoodIsGood
210 | Cloud26
211 | liutianren
212 | Prowindy
213 | mR.ilchi
214 | freak93
215 | resty
216 | Ballon
217 | tozangezan
218 | Arterm
219 | marcin.mucha
220 | Sfairat
221 | RiKang
222 | hirosegolf
223 | oml1111
224 | Vasyl
225 | JaNo
226 | Landertxu
227 | applepi
228 | yujinwunz
229 | Hachimori
230 | Carbon.Brother
231 | RuslanGatin
232 | persianpars
233 | xiaodao
234 | Zuza
235 | abcsampson
236 | Shapo
237 | Swistakk
238 | pavelz
239 | Vytenis
240 | yh.victor
241 | W.Junqiao
242 | Ronnoc
243 | Anton.Lunyov
244 | NAFIS
245 | DD.tt
246 | theme
247 | edorundo
248 | przemekkoz
249 | protos
250 | kmod
251 | HowardCheng
252 | Wolfje
253 | MathCrusader
254 | Kyoko
255 | Shangke7788
256 | alantian
257 | gepa
258 | ztxz16
259 | rankalee
260 | DEGwer
261 | thocevar
262 | architkarandikar
263 | takapt
264 | eXtreme
265 | indy256
266 | betaveros
267 | BNieuwenhuizen
268 | nk.karpov
269 | EmK
270 | ybh
271 | ForeverBell
272 | BabaninIvan
273 | Aksenov239
274 | fixme
275 | radeye
276 | boleyn.su
277 | mixsx
278 | Zwergesel
279 | nathanajah
280 | Gerald.
281 | Ripatti
282 | kmjp
283 | amoebius
284 | abyssmaul
285 | SergeyWeiss
286 | Ryan
287 | sokian
288 | IP314
289 | mamekin
290 | dhh1995
291 | madokamylove
292 | Joshik
293 | gawry
294 | LayCurse
295 | Dumbear2
296 | alkjash
297 | cedriclin
298 | flashmt
299 | GagGuy
300 | piob
301 | daizhenyang
302 | darknife
303 | alex.jh
304 | y3eadgbe
305 | AlexanderBolshakov
306 | sroyal
307 | AlexUdalov
308 | kusano
309 | kerker
310 | Mohammad.JRS
311 | yangyue.cn
312 | darkKelvin
313 | Ljq
314 | Elias
315 | Vani0
316 | NateRiverxkh
317 | gpoeta
318 | dwahler
319 | Nin0
320 | stolis
321 | Flydutchman
322 | Fly33
323 | SqrtPi
324 | gnomnain
325 | TakaakiUmedu
326 | xiaowuc1
327 | darnley
328 | fsouza
329 | ordcoder
330 | amashrabov
331 | 2rf
332 | Breakun
333 | net12k44
334 | sankear
335 | argentony
336 | hamadu
337 | RoBa
338 | binhminh410
339 | Astein
340 | SoEnLit
341 | Krwlng
342 | w.yj
343 | jthread
344 | xcwgf666
345 | bardek
346 | dexy
347 | satashun
348 | POMELO
349 | frank12268
350 | Savlik
351 | rowdark
352 | ush
353 | daimi89
354 | Antinomyra
355 | Tkachev
356 | akashin
357 | lnsuyn
358 | 916852
359 | danielf
360 | johnathan79717
361 | sergio3010
362 | DancingSoul
363 | u8765
364 | iSuneast
365 | smapson
366 | ssmike
367 | lasten
368 | Rubanenko
369 | lby
370 | foxwlog
371 | wangchaohui
372 | wafrelka
373 | d.hui
374 | waitingkuo0527
375 | miketcyue
376 | zck921031
377 | dilsonguim
378 | j.e.
379 | pwahs
380 | torus711
381 | jevi
382 | azhai
383 | Brainfvck
384 | humblefool
385 | lkq1992yeah
386 | yukis
387 | mingkaidox
388 | Mare.S.Ephemeral
389 | JohnSmith
390 |
--------------------------------------------------------------------------------
/Corpus/users/311101.txt:
--------------------------------------------------------------------------------
1 | ACRush
2 | qizichao
3 | wata
4 | ZhukovDmitry
5 | dzhulgakov
6 | nika
7 | Vitaliy
8 | kalinov
9 | halyavin
10 | bmerry
11 | alyaxey
12 | marek.cygan
13 | Khuc.Anh.Tuan
14 | Jiunru
15 | winger
16 | hos.lyric
17 | pashka
18 | misof
19 | Jeru
20 | FloppyCat
21 | AdrianKuegel
22 | emaxx
23 | ploh
24 |
--------------------------------------------------------------------------------
/Corpus/users/32002.txt:
--------------------------------------------------------------------------------
1 | bmerry
2 | yuhch123
3 | halyavin
4 | wata
5 | iwi
6 | Ahyangyi
7 | tourist
8 | gawry
9 | vlad89
10 | neal.wu
11 | austrin
12 | Vasyl
13 | Gluk
14 | mystic
15 | falagar
16 | misof
17 | jakubr
18 | Ying
19 | darnley
20 | dgozman
21 | ftc
22 | Reid
23 | fuwenjie
24 | nya
25 | rem
26 | hmich
27 | Burunduk2
28 | FedorTsarev
29 | Nerevar
30 | ardiankp
31 | JongMan
32 | bogdan2412
33 | elizarov
34 | Alexus
35 | lympanda
36 | Loner
37 | try
38 | dzhulgakov
39 | darthur
40 | PavelKuznetsov
41 | blackmath
42 | Psyho
43 | trebe
44 | kurniady
45 | Jacek
46 | tywok
47 | AS1
48 | pmnox
49 | victorsb
50 | Yarin
51 | ploh
52 | andrewzta
53 | ymatsux
54 | subra
55 | ilyaraz
56 | maciejk
57 | OpenGL
58 | Innovative.Cat
59 | pashka
60 | slex
61 | Jimb
62 | Vytis
63 | LucaB
64 | JanKuipers
65 | blueblimp
66 | klopyrev
67 | kubus
68 | almelv
69 | rspeer
70 | Zhuojie
71 | krijgertje
72 | ecprice
73 | Eryx
74 | xreborner
75 | ivan.popelyshev
76 | arti
77 | beingryu
78 | LinesPrower
79 | SkidanovAlexander
80 | AdrianKuegel
81 | mirosuaf
82 | bramandia
83 | ACRush
84 | yariv
85 | Jonick
86 | KOTEHOK
87 | snguyen
88 | inazz
89 | dolphinigle
90 | reiten
91 | jbernadas
92 | gevak
93 | LayCurse
94 | Soultaker
95 | SBRS
96 | hekacyr
97 | XiaoZiqian
98 | pdallago
99 | narri
100 | beerscout
101 | tos.lunar
102 | jmzero
103 | RodrigoBurgos
104 | bloodmage
105 | sjelkjd
106 | cpphamza
107 | vexorian
108 | karol1
109 | ZloyBastard
110 | Hachimori
111 | exod40
112 | Vytenis
113 | Gigz
114 | Tsubosaka
115 | CataractGoogly
116 | CrazyScratcher
117 | abikbaev
118 | Wolfje
119 | moonancient
120 | MauricioC
121 | charliez
122 | leehark
123 | nika
124 | StanY
125 | Smitty
126 | yuzmukhametov
127 | ShangJingbo
128 | Jonasz
129 | Jan
130 | Lipstick
131 | crazyb0y
132 | VitalyGoldstein
133 | updog
134 | Sanny
135 | sidky
136 | dano
137 | dbh
138 | CM87
139 | nodchip
140 | Xixas
141 | hhb
142 | jdmetz
143 | vepifanov
144 | windy7926778
145 | Mingfei.Li
146 | Astein
147 | wojteks
148 | moon5ckq
149 | ssancho
150 | altertain
151 | DmitryKlenov
152 | bsonrisa
153 | Alexey
154 | elhipercubo
155 | abiczo
156 | RoBa
157 | earl
158 | Bohua
159 | Rostislav
160 | linguo
161 | dzwiedziu
162 | ben.hwang
163 | Lunarmony
164 | SergeyRogulenko
165 | jthread
166 | ituphanov
167 | zhengzhao
168 | tanakh
169 | microsoft
170 | rlblaster
171 | henryy
172 | PaulJefferys
173 | gmark
174 | turuthok
175 | scorzh
176 | bwps
177 | rafaeldaigo
178 | botay
179 | mozeda
180 | zibada
181 | NeT
182 | victorj
183 | eMBe
184 | ulzha
185 | wywcgs
186 | bhamrick
187 | satej
188 | GunnERs
189 | deviatov
190 | Yulo.K
191 | Amber
192 | DStepanenko
193 | u1ik
194 | Klinck
195 | unbing
196 | kp7
197 | Yao
198 | Olexiy
199 | ralekseenkov
200 | Shahriar.Rouf.Nafi
201 | diam
202 | doudouille
203 | MikeMirzayanov
204 | DeCowboy
205 | terranwy
206 | logistic
207 | hrushikesh.tilak
208 | ilyakor
209 | Zakklars
210 | Qingchun
211 | hsyeo
212 | MikeSeibert
213 | ScaleRhyme
214 | g201513
215 | Dragoon
216 | Sempr
217 | gepa
218 | greenoyster
219 | Mimino
220 | obokaman
221 | GeKa
222 | wap
223 | oberon
224 | kivus
225 | lewha0
226 | antimatter
227 | vladut89
228 | strategist333
229 | NefariousZhen
230 | TripleM
231 | Kalq
232 | marius.pungaru
233 | Erik
234 | talchas
235 | xiaobao
236 | Fly
237 | Wataru
238 | TaiTai
239 | xhl.kogitsune
240 | ulyanick
241 | RalfKistner
242 | carlosralv
243 | vpj
244 | RicardoHahn
245 | winger
246 | Die
247 | andersk
248 | AlexanderL
249 | andre.sp
250 | Savior
251 | natalia
252 | Zig
253 | Dlougach
254 | boss
255 | jagjag
256 | Vitaliy
257 | samee
258 | Stigius
259 | maksay
260 | TheHue
261 | mysanal
262 | Aekeri
263 | Hetman
264 | szd
265 | trchen1033
266 | pedroeira
267 | khuebeo
268 | kinaba
269 | Ostap
270 | stone
271 | vitar
272 | pavelz
273 | Kyungryeol
274 | zwdant
275 | Patience
276 | ertesh
277 | Project
278 | Progbeat
279 | MB.
280 | Borisp
281 | LanceHalberd
282 | dotnetcoder
283 | hansonw
284 | frostnova
285 | Helenjyun
286 | HiltonLange
287 | jonathantan86
288 | MRoizner
289 | zzzz
290 | Relja
291 | lrearte
292 | radeye
293 | eXtreme
294 | humblefool
295 | Patriot
296 | araste
297 | mehas
298 | Kee
299 | Jby
300 | vigo.ph
301 | imyoyo
302 | yisun
303 | kk420
304 | tsukuno
305 | Akim
306 | StevieT
307 | Marte
308 | cepheid
309 | Plagapong
310 | irori
311 | diferential
312 | Cupuman
313 | WSX
314 | LBFacci
315 | tienn
316 | adeymo
317 | nicholas
318 | nik239
319 | Jasko
320 | sapal
321 | Rydberg
322 | spencer
323 | gusakov
324 | Huayang
325 | Razvi
326 | burunduk3
327 | lbackstrom
328 | jasonw
329 | JesUltra
330 | kappahouse
331 | Tilps
332 | ssulbbang
333 | vsb
334 | EkTePik
335 | lordmonsoon
336 | JRR
337 | Cheryl
338 | eonx
339 | kangshifu
340 | roypalacios
341 | ltdtl
342 | eldering
343 | EmK
344 | AdamG
345 | Vovka
346 | cmd
347 | anton.akhi
348 | ErickW
349 | licstar
350 | macs
351 | lixs2003
352 | supo
353 | .Invader
354 | Sohel
355 | kik
356 | skatou
357 | FAndy
358 | RomanLipovsky
359 | cvoinescu
360 | eleusive
361 | Baekjoon
362 | Saty
363 | linyufly
364 | agh
365 | IvanRomanov
366 | Fire
367 | mirzman
368 | tckwok
369 | xwbsw
370 | myprasanna
371 | danielp
372 | jfguo
373 | watashi
374 | Doggy
375 | latsyrc
376 | visq
377 | grizzly
378 | legend12
379 | Murphy
380 | ThinkCreative
381 | ilham
382 | impetus
383 | kunigami
384 | loneknight
385 | WillCodeForFood
386 | felipebart
387 | Al.Cash
388 | eagaeoppooaaa
389 | izulin
390 | firepot
391 | yuta.sawa
392 | kit
393 | gaosimeng
394 | RAD
395 | Chmel.Tolstiy
396 | RAVEman
397 | foobarbaz
398 | slippy
399 | ZhouErjin
400 | Vedensky
401 | msn
402 | wcao
403 | YangYi
404 | momtchil
405 | Fluorine
406 | radames9htv
407 | Tony
408 | Davis
409 | Luciano
410 | YUMEN
411 | JosephWen
412 | javau
413 | yuizumi
414 | otinn
415 | abkqz
416 | domeng
417 | pablo.aguilar
418 | Brian
419 | sclo
420 | srikkbhat
421 | Cho
422 | i0exception
423 | HanoiTower
424 | ThomMerillin
425 | TheLlama
426 | p13
427 | andreitheo
428 | sonyckson
429 | janq
430 | Alvin
431 | edauri
432 | ftfish
433 | gojira
434 | Adhit
435 | wkoder
436 | elsantodel90
437 | aknow
438 | gurug
439 | Laurance
440 | Minilek
441 | ToN.AC119
442 | naguib
443 | Elmiguel409
444 | TonyZ
445 | icecream
446 | Wernie
447 | gcdart
448 | griffon
449 | Stone
450 | chouxiaowen
451 | SCat.Wang
452 | DAle
453 | satchipear
454 | qinlu
455 | Nasa
456 | Prostu
457 | LYW
458 | igoro
459 | Stas
460 | schultz
461 | UnknownException
462 | tck
463 | vnikulin
464 | tanaeem
465 | pavel13
466 | kedaizd
467 | chenxiuwei
468 | carp
469 | OldDonkey
470 | rx201
471 | roma
472 | kitamasa
473 | rbtree
474 | yannis
475 | zerg
476 | Mg9H
477 | Duke
478 | Gleb
479 | Arif
480 | mustafij
481 | JackieX
482 | gutalin
483 | Dream
484 | TAG
485 | kprateek88
486 | frankyym
487 | walchl
488 | elmariachi1414
489 | eagleonhill
490 | Leonid
491 | Landertxu
492 | daiwb
493 | kaasis
494 | ged
495 | AnshAryan
496 | rajkon
497 | Kouprin
498 | alien.i
499 | Ragnarok
500 | shimps
501 | winsty
502 | nakajima
503 | cryboy
504 | insotc
505 | PhilipPV
506 | Ravents
507 | Crush
508 | lukasP
509 | Torax
510 | bigheadghost
511 | yiuyuho
512 | those
513 | Terence
514 | Biskup
515 | Rahenri
516 | sao3
517 | WildUtah
518 | RichardPang
519 | Vegetable
520 | voyagerr
521 | lqs
522 | yessit
523 | Vintik
524 | hanney
525 | Felix
526 | Slevin
527 | enjoy1009
528 | Rio
529 | iuaaui
530 | leeang
531 | harpreet.singh
532 | rrpai
533 | kaneko
534 | marting
535 | Jackrabbit
536 | Fernando
537 | Foxtail
538 | Darko
539 | Gnefihz
540 | qwynick
541 | koxvqrvtkp
542 | carl
543 | nebula.lam
544 | nutki
545 | X.Ray
546 | dimozzz
547 | q3dm17
548 | FerroMrkva
549 | cpcs
550 | argentony
551 | zmy
552 | SpaceFlyer
553 | squall1729
554 | kwiatek
555 | Ryan
556 | IgorYevchinets
557 | algostorm
558 | HilbertRaum
559 | stavr
560 | zhendongjia
561 | boba5555
562 | mdoan
563 | connect4
564 | i.bogatyi
565 | SmartSchizo
566 | shell
567 | pperm
568 | Irioth
569 | StepInto
570 | uwi
571 | vyxaryx
572 | NilayVaish
573 | gagik
574 | dkorduban
575 | marspeople
576 | RainingStar
577 | HenryHu
578 | popo
579 | AlphaStream111
580 | rwaliany
581 | Kdub
582 | tollek
583 | bug
584 | gnarlycow
585 | SharpC
586 | mdzfirst
587 | rajeshsr
588 | skol
589 | Lerry
590 | aliquis
591 | aleksandari
592 | martin.at.ksp
593 | Sunny
594 | Davidsu
595 | gladiator
596 | groupbuilder
597 | rasto6sk
598 | aditsu
599 | RalphFurmaniak
600 | Koper
601 | Hammer
602 | Safii
603 | abstractwhiz
604 | ouka
605 | Minny
606 | loyeer
607 | SourSpinach
608 | ipknHama
609 | kmod
610 | pzielinski
611 | moonlight
612 | mdruker
613 | MTWTFFF
614 | panczo
615 | giolekva
616 | matkk
617 | wushuangyue
618 | Demasi
619 | Chimed
620 | t.mac
621 | Jakozaur
622 | augustotorres
623 | zgm
624 | fetetriste.
625 | dmytro.korzhyk
626 | lidaobing
627 | indy256
628 | Spieler
629 | blmarket
630 | aussie
631 | MrZZZ
632 | stan
633 | Aesop
634 | cancho
635 | domob
636 | Faber
637 | ziliang
638 | victor.j8
639 | Yiming
640 | jackfeng
641 | emnmon
642 | diaorui
643 | Carrot
644 | Alphard
645 | Mathwhiz1286
646 | b0b0b0b
647 | wInuX
648 | mth
649 | trulo17
650 | doriath
651 | along
652 | Infinity.
653 | carlop
654 | gizzywump
655 | vlad
656 | Vman
657 | Sergey.Bankevich
658 | AngelClover
659 | marsavic
660 | Dima
661 | SavinovAlex
662 | goober
663 | aleksey
664 | IlyaPonamarev
665 | RenTeng
666 | Ripatti
667 | zxhy2
668 | felixh
669 | Rasifiel
670 | Maryann
671 | vinaysingh
672 | AlMag
673 | cs10520
674 | Rick
675 | tiagomt
676 | microbrain
677 | HenryW
678 | fclaude
679 | mvolke
680 | wuxy
681 | Ramzes2
682 | sanky29288
683 | Chrono
684 | cjoa
685 | MIPTAlex
686 | lh124363042
687 | aanastasov
688 | Abir
689 | Gaizka
690 | deepakmanohar
691 | txandi
692 | DmitriyL
693 | VladS
694 | j.vimal
695 | Ivankovic
696 | ViniciusCabessa
697 | husheyn
698 | joseph
699 | Hackson
700 | morbidel
701 | waterwang
702 | windowmaker
703 | DevilMayCry
704 | soul3434
705 | thecata
706 | Serraa
707 | thunderfyc
708 | FBWolf
709 | Baiger
710 | saltycookie
711 | it3
712 | relic
713 | andrei.info
714 | Hypuk
715 | Keegan
716 | ssaljalu
717 | Askar
718 | MoreFreeze
719 | Valergrad
720 | misko.sz
721 | mhung
722 | ahh
723 | arxor
724 | oaiei
725 | lxhgww
726 | alexkas
727 | lcosvse
728 | TT87
729 | asaveljevs
730 | Alligator
731 | vk91
732 | SergeyAkimov
733 | starforever
734 | acherepanov
735 | Edu
736 | lecoo
737 | rubyeye
738 | DEathkNIghtS
739 | ivo.sluganovic
740 | itoasquall
741 | McFn
742 | the.Fm
743 | upgrade
744 | aleck
745 | ccc5
746 | imrankane2005
747 | brus07
748 | cintana
749 | Rainco
750 | szsz
751 | partychen
752 | Karalabe
753 | sandro
754 | lemonutzf
755 | ThomasDeniau
756 | stpkys
757 | Landrew
758 | Greggypoo
759 | fogwind
760 | star
761 | MonEtoile
762 | kozima
763 | quaji
764 | DragonRidr
765 | Gnurdux
766 | Lvsoft
767 | thobel
768 | kcwu
769 | yappy
770 | pqshq
771 | wistful
772 | mergen
773 | delicato
774 | kittycat
775 | henshiru
776 | SoD
777 | w0nder
778 | Turning
779 | Nikelandjelo
780 | juwon
781 | t3hg0suazn
782 | backluck
783 | zzmike
784 | mihar
785 | stef.lp
786 | spupyrev
787 | mouda
788 | kenyyy
789 | littlej
790 | viclei
791 | levlam
792 | Wam
793 | fanKarpaty
794 | luanlai
795 | Swarun
796 | HenryKitten
797 | hiperx
798 | igorcanadi
799 | Shadrach
800 | ilya
801 | pauldb
802 | liulibo133
803 | srou
804 | three
805 | bee
806 | Bus
807 | bnuer
808 | bl1n
809 | Robinnibor
810 | 5l2
811 | NevoWin777
812 | Ignat
813 | Jyun
814 | LeoC
815 | Younix
816 | runTarm
817 | zhang
818 | Geniuswj
819 | narendhranath
820 | chenxueyu
821 | Muerte
822 | overwise
823 | Sputnik
824 | fredd4
825 | wuyifan
826 | C.A.
827 | Sorokin
828 | sandaru1
829 | marim
830 | oyy
831 | dzetkulict
832 | sunnn
833 | arctanx
834 | Equinox
835 | Vangoz
836 | AlexanderT
837 | foison
838 | pinpin
839 | sproblvem
840 | kmh4500
841 | my.nickname
842 | deepblue
843 | serg
844 | matsuza
845 | yyt
846 | Levy
847 | wisdompoet
848 | wjsw
849 | Sankozi
850 | Lazarev
851 | kuat
852 | Alec
853 | tangyouze
854 | genzmer
855 | Yak
856 | Teferi
857 | bolek
858 | Hanaban
859 | lvyun
860 | trecio
861 | chenhaifeng
862 | Delsius
863 | Calamitas
864 | dexy
865 | Ferlon
866 | 234226
867 | Xofon
868 | ljwan12
869 | Esquimeau
870 | shubham.mittal
871 | Doeke
872 | qu1ck
873 | .maXim.
874 | paisa
875 | LynnKaye
876 | Modulator
877 | AliJ
878 | MaiK
879 | Boping
880 | int9
881 | imos
882 | TheRaven
883 |
--------------------------------------------------------------------------------
/Corpus/users/32005.txt:
--------------------------------------------------------------------------------
1 | ACRush
2 | Ahyangyi
3 | Amber
4 | xhl.kogitsune
5 | LayCurse
6 | xreborner
7 | stone
8 | TripleM
9 | wata
10 | tckwok
11 | g201513
12 | iwi
13 | humblefool
14 | kinaba
15 | Fire
16 | ymatsux
17 | Lunarmony
18 | windy7926778
19 | Huayang
20 | FAndy
21 | domeng
22 | ardiankp
23 | OpenGL
24 | Zhuojie
25 | lympanda
26 | tanakh
27 | tos.lunar
28 | yuhch123
29 | YangYi
30 | Savior
31 | kitamasa
32 | JongMan
33 | Innovative.Cat
34 | XiaoZiqian
35 | Loner
36 | wywcgs
37 | eagleonhill
38 | Laurance
39 | yuta.sawa
40 | subra
41 | irori
42 | Astein
43 | ScaleRhyme
44 | kik
45 | Fluorine
46 | Murphy
47 | altertain
48 | nya
49 | hhb
50 | araste
51 | daiwb
52 | Sempr
53 | EmK
54 | moonancient
55 | Terence
56 | watashi
57 | lewha0
58 | those
59 | jfguo
60 | henryy
61 | Yulo.K
62 | kprateek88
63 | zwdant
64 | updog
65 | sidky
66 | terranwy
67 | chenxiuwei
68 | Mingfei.Li
69 | AnshAryan
70 | nakajima
71 | Stone
72 | tanaeem
73 | tsukuno
74 | yuizumi
75 | logistic
76 | GunnERs
77 | lqs
78 | HanoiTower
79 | Sanny
80 | zhengzhao
81 | licstar
82 | kappahouse
83 | LYW
84 | inazz
85 | snguyen
86 | Dragoon
87 | Patience
88 | trchen1033
89 | TheHue
90 | Yao
91 | leehark
92 | chouxiaowen
93 | kurniady
94 | xwbsw
95 | Vegetable
96 | samee
97 | RoBa
98 | OldDonkey
99 | legend12
100 | firepot
101 | vpj
102 | javau
103 | Kee
104 | rbtree
105 | ben.hwang
106 | Shahriar.Rouf.Nafi
107 | qinlu
108 | loneknight
109 | gaosimeng
110 | shimps
111 | YUMEN
112 | mozeda
113 | Alvin
114 | EkTePik
115 | zerg
116 | CrazyScratcher
117 | bigheadghost
118 | microsoft
119 | JosephWen
120 | nodchip
121 | Baekjoon
122 | skatou
123 | Zakklars
124 | charliez
125 | i0exception
126 | Hachimori
127 | khuebeo
128 | Cho
129 | voyagerr
130 | satchipear
131 | Jan
132 | wap
133 | Saty
134 | Kyungryeol
135 | beingryu
136 | Tsubosaka
137 | Dream
138 | SCat.Wang
139 | Tilps
140 | xiaobao
141 | insotc
142 | ftfish
143 | codejam.forget1
144 | nicholas
145 | walchl
146 | rx201
147 | try
148 | Marte
149 | UnknownException
150 | TaiTai
151 | imyoyo
152 | latsyrc
153 |
--------------------------------------------------------------------------------
/Corpus/users/32008.txt:
--------------------------------------------------------------------------------
1 | Bohua
2 | SkidanovAlexander
3 | radeye
4 | linguo
5 | andersk
6 | Reid
7 | antimatter
8 | ploh
9 | fuwenjie
10 | pmnox
11 | darthur
12 | macs
13 | yiuyuho
14 | blueblimp
15 | pdallago
16 | igoro
17 | StanY
18 | kp7
19 | Rahenri
20 | edauri
21 | ssancho
22 | Qingchun
23 | ltdtl
24 | jdmetz
25 | ecprice
26 | felipebart
27 | klopyrev
28 | Elmiguel409
29 | rspeer
30 | LanceHalberd
31 | LBFacci
32 | jbernadas
33 | WillCodeForFood
34 | sjelkjd
35 | msn
36 | rafaeldaigo
37 | MikeSeibert
38 | gurug
39 | TonyZ
40 | ged
41 | bsonrisa
42 | JackieX
43 | kit
44 | adeymo
45 | hrushikesh.tilak
46 | Zig
47 | satej
48 | JRR
49 | icecream
50 | NefariousZhen
51 | jmzero
52 | jasonw
53 | talchas
54 | vexorian
55 | karol1
56 | ssulbbang
57 | Mg9H
58 | Nasa
59 | Die
60 | schultz
61 | RicardoHahn
62 | roypalacios
63 | carlosralv
64 | pedroeira
65 | tck
66 | Jimb
67 | lbackstrom
68 | Rostislav
69 | jagjag
70 | CataractGoogly
71 | WildUtah
72 | elsantodel90
73 | sonyckson
74 | lrearte
75 | dbh
76 | spencer
77 | pablo.aguilar
78 | dotnetcoder
79 | wkoder
80 | gcdart
81 | foobarbaz
82 | Aekeri
83 | Luciano
84 | yisun
85 | sclo
86 | Ying
87 | Duke
88 | unbing
89 | eleusive
90 | turuthok
91 | vladut89
92 | myprasanna
93 | RichardPang
94 | beerscout
95 | pavel13
96 | ErickW
97 | narri
98 | kunigami
99 |
--------------------------------------------------------------------------------
/Corpus/users/32010.txt:
--------------------------------------------------------------------------------
1 | bmerry
2 | dzhulgakov
3 | gawry
4 | dgozman
5 | halyavin
6 | pashka
7 | mystic
8 | Klinck
9 | .Invader
10 | DmitryKlenov
11 | Gluk
12 | PaulJefferys
13 | Eryx
14 | LucaB
15 | austrin
16 | almelv
17 | krijgertje
18 | ilyaraz
19 | gusakov
20 | JanKuipers
21 | Burunduk2
22 | LinesPrower
23 | Vitaliy
24 | zibada
25 | natalia
26 | Jacek
27 | andrewzta
28 | winger
29 | Prostu
30 | anton.akhi
31 | Borisp
32 | nik239
33 | oberon
34 | Lipstick
35 | Wataru
36 | Yarin
37 | Vedensky
38 | dzwiedziu
39 | Chmel.Tolstiy
40 | MB.
41 | AS1
42 | Vytis
43 | KOTEHOK
44 | ftc
45 | misof
46 | Psyho
47 | RAVEman
48 | falagar
49 | Vasyl
50 | andre.sp
51 | wojteks
52 | Nerevar
53 | hekacyr
54 | gevak
55 | radames9htv
56 | Mimino
57 | Kalq
58 | vigo.ph
59 | Vytenis
60 | Crush
61 | abikbaev
62 | Torax
63 | VitalyGoldstein
64 | scorzh
65 | danielp
66 | ralekseenkov
67 | burunduk3
68 | darnley
69 | elizarov
70 | reiten
71 | Dlougach
72 | IvanRomanov
73 | Xixas
74 | nika
75 | botay
76 | Vintik
77 | PhilipPV
78 | Stigius
79 | MikeMirzayanov
80 | griffon
81 | lordmonsoon
82 | Smitty
83 | jakubr
84 | Alexus
85 | kubus
86 | Olexiy
87 | gepa
88 | ituphanov
89 | Al.Cash
90 | eMBe
91 | Landertxu
92 | Hetman
93 | slex
94 | vlad89
95 | JesUltra
96 | FedorTsarev
97 | elhipercubo
98 | vsb
99 | hmich
100 | Vovka
101 | AlexanderL
102 | trebe
103 | boss
104 | dano
105 | alien.i
106 | cvoinescu
107 | deviatov
108 | eldering
109 | ivan.popelyshev
110 | CM87
111 | Fly
112 | TAG
113 | slippy
114 | Ostap
115 | Soultaker
116 | maciejk
117 | WSX
118 | kedaizd
119 | jthread
120 | Patriot
121 | Rydberg
122 | DAle
123 | rajkon
124 | ulzha
125 | lukasP
126 | Gleb
127 | ilyakor
128 | eXtreme
129 | Progbeat
130 | ulyanick
131 | impetus
132 | MRoizner
133 | diam
134 | abkqz
135 | ilham
136 | gmark
137 | naguib
138 | Jasko
139 | TheLlama
140 | AdrianKuegel
141 | eonx
142 | victorj
143 | roma
144 | agh
145 | mirosuaf
146 | grizzly
147 | abiczo
148 | Leonid
149 | cmd
150 | ertesh
151 | DStepanenko
152 | Jonasz
153 | u1ik
154 | diferential
155 | janq
156 | RomanLipovsky
157 | frostnova
158 | kk420
159 | andreitheo
160 | Ravents
161 | tywok
162 | izulin
163 | blackmath
164 | doudouille
165 | p13
166 | Kouprin
167 | cpphamza
168 | Wernie
169 | Gigz
170 | szd
171 | StevieT
172 | vitar
173 | gojira
174 | ThinkCreative
175 | yariv
176 | supo
177 | PavelKuznetsov
178 | sapal
179 | AdamG
180 | Razvi
181 | mirzman
182 | momtchil
183 | otinn
184 | zzzz
185 | Stas
186 | Slevin
187 | gutalin
188 | Project
189 | pavelz
190 | eagaeoppooaaa
191 | Wolfje
192 | elmariachi1414
193 | visq
194 | vnikulin
195 | kivus
196 | Biskup
197 | rlblaster
198 | bwps
199 | yuzmukhametov
200 | marius.pungaru
201 | Relja
202 | NeT
203 | hanney
204 |
--------------------------------------------------------------------------------
/Corpus/users/32011.txt:
--------------------------------------------------------------------------------
1 | ACRush
2 | Innovative.Cat
3 | bmerry
4 | pmnox
5 | yuhch123
6 | gawry
7 | Eryx
8 | mystic
9 | ploh
10 | blueblimp
11 | windy7926778
12 | PaulJefferys
13 | Chmel.Tolstiy
14 | Yarin
15 | andrewzta
16 | radeye
17 | yiuyuho
18 | AS1
19 | KOTEHOK
20 | Fire
21 | g201513
22 | ftc
23 | Burunduk2
24 | pdallago
25 | Gluk
26 | xreborner
27 | DmitryKlenov
28 | halyavin
29 | kinaba
30 | dzwiedziu
31 | humblefool
32 | Jacek
33 | lympanda
34 | domeng
35 | tckwok
36 | krijgertje
37 | ardiankp
38 | darthur
39 | XiaoZiqian
40 | wata
41 | LucaB
42 | JanKuipers
43 | tanakh
44 | dzhulgakov
45 | LinesPrower
46 | fuwenjie
47 | Ahyangyi
48 | Lunarmony
49 | eagleonhill
50 | SkidanovAlexander
51 | tos.lunar
52 | Vitaliy
53 | Laurance
54 | dgozman
55 | Amber
56 | StanY
57 | ltdtl
58 | ymatsux
59 | iwi
60 | Huayang
61 | antimatter
62 | zibada
63 | LayCurse
64 | Vedensky
65 | OpenGL
66 | oberon
67 | igoro
68 | MB.
69 | YangYi
70 | Vytis
71 | Savior
72 | austrin
73 | Qingchun
74 | linguo
75 | JongMan
76 | gusakov
77 | macs
78 | Prostu
79 | kp7
80 | ilyaraz
81 | Reid
82 | Bohua
83 | kitamasa
84 | pashka
85 | ssancho
86 | andersk
87 | Wataru
88 | anton.akhi
89 | nik239
90 | subra
91 | TripleM
92 | natalia
93 | almelv
94 | stone
95 | yuta.sawa
96 | Klinck
97 | Borisp
98 |
--------------------------------------------------------------------------------
/Corpus/users/639102.txt:
--------------------------------------------------------------------------------
1 | Burunduk1
2 | winger
3 | Eryx
4 | RAVEman
5 | Gennady.Korotkevich
6 | nika
7 | eatmore
8 | pashka
9 | Vasyl
10 | jakubr
11 | meret
12 | ZhukovDmitry
13 | qizichao
14 | marek.cygan
15 | rng..58
16 | halyavin
17 | krijgertje
18 | linguo
19 | FloppyCat
20 | PaulJefferys
21 | mystic
22 | iwi
23 | wojteks
24 | ACRush
25 | elsantodel90
26 | Khuc.Anh.Tuan
27 | yuhch123
28 | marcina
29 | Gassa
30 | earl
31 | Stigius
32 | SergeyRogulenko
33 | Egor
34 | bmerry
35 | omeometo
36 | andrewzta
37 | Gluk
38 | hos.lyric
39 | Louty
40 | MrBald
41 | wata
42 | moon5ckq
43 | msg555
44 | darnley
45 | Milanin
46 | srh
47 | TripleM
48 | xhl.kogitsune
49 | xreborner
50 | ilyakor
51 | Dlougach
52 | alantian
53 | Orfest
54 | GlennMatthews
55 | onp
56 | Xhark
57 | iddaga
58 | gaoyihan
59 | SuZhan
60 | stgatilov
61 | Anton.Lunyov
62 | eMBe
63 | Lunarmony
64 | dzwiedziu
65 | lxx
66 | tikitikirevenge
67 | LiuKe
68 | Zhuojie
69 | blmarket
70 | ShangJingbo
71 | kangaroo
72 | Fumiya
73 | trebe
74 | Jonick
75 | ryuuga
76 | g201513
77 | zyz915
78 | Kimi.Arthur
79 | exod40
80 | paladin8
81 | kozikow
82 | Chmel.Tolstiy
83 | sdya
84 | misof
85 | Nerevar
86 | MrZZZ
87 | Nyatl
88 | ir5
89 | Seyaua
90 | dagon
91 | Alexus
92 | Reid
93 | Flex
94 | Palmtenor
95 | RAD.
96 | Xazker
97 | pawelparys
98 | levlam
99 | defrager
100 | resty
101 | tomekkulczynski
102 | AS1
103 | ivan.popelyshev
104 | zhengzhao
105 | Maryann
106 | Tomato
107 | maojm
108 | Rydberg
109 | anrieff
110 | oldherl
111 | pperm
112 | adeymo
113 | vitar
114 | StevieT
115 | imabc
116 | joey2005
117 | Imba
118 | dolphinigle
119 | Vegetable
120 | vepifanov
121 | Progbeat
122 | vlad89
123 | dzhulgakov
124 | diver
125 | kia
126 | ralekseenkov
127 | hansonw
128 | dano
129 | RoBa
130 | natalia
131 | gkreitz
132 | mirac
133 | mrc88
134 | rafaeldaigo
135 | AEtheReal
136 | Sunny
137 | Pedro.Bello
138 | DKI
139 | Jed
140 | uwi
141 | ConanKudo247
142 | PavelKunyavskiy
143 | Joshik
144 | pasin30055
145 | forifchen
146 | Shinta
147 | maciejk
148 | zibada
149 | reiten
150 | ArtDitel
151 | Ra16bit
152 | izulin
153 | vigo.ph
154 | Hackson
155 | vot
156 | arti
157 | simonsyd
158 | AdrianKuegel
159 | jaehyunp
160 | guilherme
161 | imazato
162 | ituphanov
163 | GarnetCrow
164 | emaxx
165 | Michael.Levin
166 | JongMan
167 | wRabbits.nevidomy
168 | stone
169 | forest
170 | wRabbits.AlMag
171 | LinesPrower
172 | Ignat
173 | Al.Cash
174 | rotsor
175 | KAP
176 | fixme
177 | jzd
178 | skynet
179 | DAle
180 | kitamasa
181 | XiaoZiqian
182 | blando
183 | ebd
184 | humblefool
185 | .maXim.
186 | slippy
187 | Romka
188 | ftc
189 | lympanda
190 | danielf
191 | CS.Ferng
192 | sisu
193 | snguyen.itim
194 | mohamedafattah
195 | pietrzkiewicz
196 | bwps
197 | gawry
198 | Ryan
199 | fero
200 | Psyho
201 | aleksey
202 | ashmelev
203 | Multifarious
204 | Wataru
205 | wz12
206 | eireksten
207 | kinaba
208 | Astein
209 | qwaker.00
210 | jdmetz
211 | janq
212 | GagGuy
213 | sidky
214 | wushuangyue
215 | those
216 | lewha0
217 | Steps09
218 | hs484
219 | wefgef
220 | TheLlama
221 | impetus
222 | chibby
223 | RalphFurmaniak
224 | dAnton
225 | anton.akhi
226 | txandi
227 | cmd
228 | sky58
229 | KOTEHOK
230 | Vitaliy
231 | jbernadas
232 | AlexLin
233 | beingryu
234 | ika
235 | cax
236 | sonyckson
237 | vvn
238 | imyourgod
239 | thocevar
240 | eXtreme
241 | SuBaRaSi
242 | LoRdTaPaKaH
243 | nik239
244 | cz.vx.bc
245 | yangzhe1990
246 | goober
247 | Lovro
248 | MiminoCoder
249 | damian.k
250 | try
251 | ggm
252 | raincole
253 | lxhgww
254 | syco
255 | tos.lunar
256 | Myth
257 | Gibon
258 | LayCurse
259 | polmauri
260 | supo
261 | ErrGe
262 | Atol
263 | r5insight
264 | reachnomind
265 | Onufry
266 | tanonev
267 | magicdlf
268 | Ostap
269 | Vedensky
270 | domeng
271 | aanastasov
272 | a9108
273 | Fdg
274 | xgy
275 | a70babat
276 | tantian
277 | abeln
278 | Qinz
279 | wudired
280 | cos
281 | K.A.D.R
282 | venco5
283 | biran0079
284 | kmjp
285 | tanakh
286 | daffes
287 | tkcn
288 | Yeomin
289 | Vytis
290 | arxor
291 | maold
292 | lydxlx
293 | Cheryl
294 | KirillB
295 | chokudai
296 | gojira
297 | samee
298 | Patrick.Nguyen
299 | Tarrasch
300 | AnhDT
301 | hopman
302 | pflueger
303 | Farmer.John
304 | Cai0715
305 | Fire
306 | pzielinski
307 | Isis
308 | zxytim
309 | lidaobing
310 | marcin.mucha
311 | MRain
312 | dante.ltw
313 | Plagapong
314 | eduardische
315 | jinlin
316 | kp7
317 | Therion
318 | hhb
319 | it3
320 | tsun
321 | whh
322 | TheRaven
323 | Dovgaluk
324 | Dumbear2
325 | theli
326 | BBuss
327 | foxlit
328 | xyx
329 | otis
330 | Hetman
331 | JosephWen
332 | AleX
333 | Connector
334 | blunar
335 | Smylic
336 | WenX
337 | KuoE0
338 | asaveljevs
339 | ged
340 | JoeyScarr
341 | sfe
342 | pP5438
343 | Sergey.Bankevich
344 | abiczo
345 | mehdib
346 | Tsubosaka
347 | TPReal
348 | Nasa
349 | microsoft
350 | PavelKuznetsov
351 | kubus
352 | vexorian
353 | Sanny
354 | bayleef
355 | StephYDX
356 | Theorem
357 | Aerodonkey
358 | WhiteBear
359 | Pro.hessam
360 | pavelz
361 | Undead
362 | pr0ton
363 | AlexanderL
364 | momtchil
365 | MRoizner
366 | Robert.Rosolek
367 | foison
368 | SergeyMelnikov
369 | johny42
370 | LIBe
371 |
--------------------------------------------------------------------------------
/Corpus/users/7214486.txt:
--------------------------------------------------------------------------------
1 | Gennady.Korotkevich
2 | eatmore
3 | sevenkplus
4 | mystic
5 | mk.al13n
6 | EgorKulikov
7 | kcm1700
8 | vepifanov
9 | dzhulgakov
10 | Romka
11 | hos.lyric
12 | Marcin.Smulewicz
13 | vlad89
14 | shik
15 | iwi
16 | DmitryEgorov
17 | kawatea
18 | exod40
19 | ivan.popelyshev
20 | sdya
21 | ffao
22 | fhlasek
23 | wuzhengkai
24 | KennyHORROR
25 | isea
26 | Kepnu4
27 |
--------------------------------------------------------------------------------
/Corpus/users/801485.txt:
--------------------------------------------------------------------------------
1 | Egor
2 | krijgertje
3 | Burunduk1
4 | ACRush
5 | marek.cygan
6 | meret
7 | rng..58
8 | pashka
9 | iwi
10 | eatmore
11 | halyavin
12 | Eryx
13 | earl
14 | mystic
15 | RAVEman
16 | jakubr
17 | PaulJefferys
18 | SergeyRogulenko
19 | Vasyl
20 | FloppyCat
21 | bmerry
22 | linguo
23 | Khuc.Anh.Tuan
24 | elsantodel90
25 |
--------------------------------------------------------------------------------
/Naive Baseline/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/Naive Baseline/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 |
--------------------------------------------------------------------------------
/Naive Baseline/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | Naive Baseline
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Naive Baseline/src/ARFFFactory.java:
--------------------------------------------------------------------------------
1 | import java.io.File;
2 | import java.io.IOException;
3 | import java.util.HashSet;
4 | import java.util.Iterator;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 | import java.util.Map;
8 | import java.util.Set;
9 | import java.util.Stack;
10 |
11 | public class ARFFFactory {
12 |
13 | public static AbstractExtractor getExtractor(File f) throws IOException { // Chooses the extractor implementation from the file name.
14 | AbstractExtractor x = null;
15 | if (f.getName().matches(".*\\.cpp")) { // names ending in ".cpp" get the C++ extractor
16 | x = new ExtractorCPP(f);
17 | } else { // every other name falls through to the C extractor
18 | x = new ExtractorC(f);
19 | }
20 | return x;
21 | }
22 |
23 | protected void appendAttributes(FeatureSet f, StringBuffer x) { // Appends ten comma-terminated feature values for f to x.
24 | x.append(f.numFunctions() + ","); // column order here mirrors the @attribute order in arffAttributes
25 | x.append(f.length() + ",");
26 | x.append(f.numTokens() + ",");
27 | x.append(f.numComments() + ",");
28 | x.append(f.getLiterals().size() + ","); // size of the literals map, not a sum of its values
29 | x.append(f.getReservedWords().size() + ",");
30 | x.append(f.avgLineLength() + ",");
31 | x.append(f.numEmptyLines() + ",");
32 | x.append(f.whiteSpaceRatio() + ",");
33 | x.append(f.avgParamsPerFunction() + ","); // trailing comma is intentional: the author label is appended by getInstanceData
34 | }
35 |
36 | public String getInstanceData(FeatureSet f, Set authors) { // Builds one data row for f and records its author in the authors set.
37 |
38 | StringBuffer x = new StringBuffer();
39 | // feature columns first (overridable by subclasses)
40 | appendAttributes(f, x);
41 | // class label last, followed by the row terminator
42 | x.append(getAuthorName((AbstractExtractor) f) + "\n"); // f is cast, so it must actually be an AbstractExtractor
43 | authors.add(getAuthorName((AbstractExtractor) f)); // side effect: the caller's set collects every author seen
44 | return x.toString();
45 | // Util.writeFile(allLines, targetPath, true);
46 | }
47 |
48 | public static String getAuthorName(AbstractExtractor e) { // Returns the name of the directory that directly contains the extractor's file.
49 | File f = e.getFile();
50 | // String s = f.getName();
51 | // s = s.replaceFirst("p[\\d]+\\.", "");
52 | // int i = s.lastIndexOf('.');
53 | // s = s.replaceAll(",", "");
54 | // return s.substring(0, i - 1);
55 | String s = f.getParentFile().getName(); // the author label is the parent directory name, not the file name
56 | return s.substring(0, s.length()); // substring over the full range, i.e. the whole name unchanged
57 | }
58 |
59 | public void makeARFF(String rootDirectory, String targetPath) { // Collects every .c/.cpp file under rootDirectory and delegates to makeARFF(List, String).
60 | // walk the directory tree with an explicit stack and gather all c/cpp files
61 | // into a list, then call the list-based overload below
62 | // throw new UnsupportedOperationException();
63 | Stack files = new Stack(); // directories still to be visited
64 | List programs = new LinkedList(); // source files found so far
65 | File f = new File(rootDirectory);
66 | files.add(f);
67 | while (files.size() > 0) {
68 | File temp = files.pop();
69 | for (File myFile : temp.listFiles()) { // NOTE(review): listFiles() is used unchecked; confirm rootDirectory is always a readable directory
70 | if (myFile.isDirectory()) {
71 | files.add(myFile); // descend later
72 | } else if (myFile.isFile()) {
73 | if (myFile.getName().matches(".*\\.c")
74 | || myFile.getName().matches(".*\\.cpp")) { // only names ending in ".c" or ".cpp" are kept
75 | programs.add(myFile);
76 | }
77 | }
78 | }
79 | }
80 | makeARFF(programs, targetPath);
81 | }
82 |
83 | public void makeARFF(List files, String targetPath) { // Extracts one data row per file, then writes the header followed by the rows to targetPath.
84 | Set authors = new HashSet<>(); // filled as a side effect of getInstanceData
85 | List allLines = new LinkedList();
86 | // for each file in the list, get instance data
87 | for (File f : files) {
88 | System.out.println(f.getAbsolutePath()); // progress output
89 | try {
90 | allLines.add(getInstanceData((FeatureSet) getExtractor(f),
91 | authors));
92 | } catch (IOException e) {
93 | e.printStackTrace(); // the failing file is reported and contributes no row
94 | }
95 | }
96 | // the header is written only now because it needs the complete author set
97 | makeARFFHeader(targetPath, authors);
98 | Util.writeFile(allLines, targetPath, true); // presumably the flag means "append" (header used false) - confirm in Util
99 | System.out.println(authors.size() + " authors");
100 | System.out.println(files.size() + " files");
101 | }
102 |
103 | protected void arffAttributes(List allLines) { // Adds the ten numeric @attribute declarations to allLines.
104 | allLines.add("@attribute numFunctions numeric\n"); // declaration order mirrors the value order in appendAttributes
105 | allLines.add("@attribute length numeric\n");
106 | allLines.add("@attribute numTokens numeric\n");
107 | allLines.add("@attribute numComments numeric\n");
108 | allLines.add("@attribute numLiterals numeric\n");
109 | allLines.add("@attribute numReservedWords numeric\n");
110 | allLines.add("@attribute avgLineLength numeric\n");
111 | allLines.add("@attribute numEmptyLines numeric\n");
112 | allLines.add("@attribute whiteSpaceRatio numeric\n");
113 | allLines.add("@attribute avgParams numeric\n");
114 | }
115 |
116 | public void makeARFFHeader(String targetPath, Set authors) { // Writes the @relation line, the attribute declarations, the author class attribute and "@data" to targetPath.
117 | // put @relation at top
118 | // put all the @attribute lines
119 | List allLines = new LinkedList();
120 | allLines.add("@relation code_style\n\n");
121 | // add all the @attributes (overridable by subclasses)
122 | arffAttributes(allLines);
123 | // class attribute: the collected author names, comma-separated inside braces
124 | allLines.add("@attribute author {");
125 | Iterator author = authors.iterator();
126 | while (author.hasNext()) {
127 | allLines.add(author.next());
128 | if (author.hasNext()) { // a separator is added only between names, never after the last one
129 | allLines.add(",");
130 | }
131 | }
132 | allLines.add("}\n\n@data\n");
133 | Util.writeFile(allLines, targetPath, false); // presumably false means "overwrite" (data rows use true) - confirm in Util
134 | }
135 |
136 | public static double stdDev(Map mappy) { // Standard deviation of a value->count map, computed by expanding it into a flat list.
137 | List list = new LinkedList();
138 | for (Integer i : mappy.keySet()) {
139 | for (int j = 0; j < mappy.get(i); j++) { // each key is repeated as many times as its mapped count
140 | list.add(i);
141 | }
142 | }
143 | return stdDev(list);
144 | }
145 |
146 | public static double variance(List list) { // Population variance E(x^2) - E(x)^2 of the integers in list; an empty list yields NaN (0 / 0.0).
147 | long sum1 = 0; // sum of x^2; long so that long lines or many samples cannot overflow the accumulator
148 | long sum2 = 0; // sum of x
149 | double size = list.size(); // double so the divisions below are floating-point
150 | for (Integer i : list) {
151 | sum1 += (long) i * i; // widen before multiplying: an int product wraps once i exceeds 46340
152 | sum2 += i;
153 | }
154 | return (sum1 / size) - (sum2 / size) * (sum2 / size);
155 | }
156 |
157 | public static double stdDev(List list) { // Standard deviation of the integers in list.
158 | return Math.sqrt(variance(list)); // square root of the value returned by variance(list)
159 | }
160 |
161 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/ARFFFactory2.java:
--------------------------------------------------------------------------------
1 | import java.util.HashSet;
2 | import java.util.Iterator;
3 | import java.util.List;
4 | import java.util.Set;
5 |
6 |
7 | public class ARFFFactory2 extends ARFFFactory {
8 |
9 | protected Set instanceIDs = new HashSet<>();
10 |
11 | @Override
12 | protected void appendAttributes(FeatureSet f, StringBuffer x) { // Appends the instance ID, the ten base features and fifteen further features, each comma-terminated.
13 | x.append(((AbstractExtractor) f).getFile().getName() + ","); // the instance ID is the source file name
14 | instanceIDs.add(((AbstractExtractor) f).getFile().getName()); // remembered so arffAttributes can declare the nominal ID values
15 |
16 | x.append(f.numFunctions() + ","); // same ten columns as ARFFFactory.appendAttributes
17 | x.append(f.length() + ",");
18 | x.append(f.numTokens() + ",");
19 | x.append(f.numComments() + ",");
20 | x.append(f.getLiterals().size() + ",");
21 | x.append(f.getReservedWords().size() + ",");
22 | x.append(f.avgLineLength() + ",");
23 | x.append(f.numEmptyLines() + ",");
24 | x.append(f.whiteSpaceRatio() + ",");
25 | x.append(f.avgParamsPerFunction() + ",");
26 |
27 | x.append(stdDev(f.lineLengths()) + ","); // additional columns start here
28 | x.append(f.numMacros() + ",");
29 | x.append(("" + f.tabsLeadLines()).toUpperCase() + ","); // boolean rendered as TRUE/FALSE to match the nominal declaration
30 | x.append(f.getWhiteSpace().get(WhiteSpace.tab) + ",");
31 | x.append(f.getWhiteSpace().get(WhiteSpace.space) + ",");
32 | x.append(stdDev(f.numFunctionParams()) + ",");
33 | x.append(f.getControlStructures().get(ControlStatement.ifStatement) + ",");
34 | x.append(f.getControlStructures().get(ControlStatement.elifStatement) + ",");
35 | x.append(f.getControlStructures().get(ControlStatement.elseStatement) + ",");
36 | x.append(f.getControlStructures().get(ControlStatement.switchStatement) + ",");
37 | x.append(f.getControlStructures().get(ControlStatement.ternaryOperator) + ",");
38 | x.append(f.getLoops().get(Loops.forLoop) + ",");
39 | x.append(f.getLoops().get(Loops.whileLoop) + ",");
40 | x.append(f.getLoops().get(Loops.doWhileLoop) + ",");
41 | x.append(("" + f.newLineBrace()).toUpperCase() + ","); // boolean rendered as TRUE/FALSE
42 | }
43 |
44 | @Override
45 | protected void arffAttributes(List allLines) { // Declares the nominal instanceID attribute followed by the 25 feature attributes.
46 | allLines.add("@attribute instanceID {");
47 | Iterator id = instanceIDs.iterator(); // IDs were collected by appendAttributes, so rows must be built before the header
48 | while (id.hasNext()) {
49 | allLines.add(id.next());
50 | if (id.hasNext()) { // separator only between IDs
51 | allLines.add(",");
52 | }
53 | }
54 | allLines.add("}\n");
55 |
56 | allLines.add("@attribute numFunctions numeric\n"); // declaration order mirrors the value order in appendAttributes
57 | allLines.add("@attribute length numeric\n");
58 | allLines.add("@attribute numTokens numeric\n");
59 | allLines.add("@attribute numComments numeric\n");
60 | allLines.add("@attribute numLiterals numeric\n");
61 | allLines.add("@attribute numReservedWords numeric\n");
62 | allLines.add("@attribute avgLineLength numeric\n");
63 | allLines.add("@attribute numEmptyLines numeric\n");
64 | allLines.add("@attribute whiteSpaceRatio numeric\n");
65 | allLines.add("@attribute avgParams numeric\n");
66 |
67 | allLines.add("@attribute stdDevLineLength numeric\n");
68 | allLines.add("@attribute numMacros numeric\n");
69 | allLines.add("@attribute tabsLeadLines {TRUE, FALSE}\n");
70 | allLines.add("@attribute numTabs numeric\n");
71 | allLines.add("@attribute numSpaces numeric\n");
72 | allLines.add("@attribute stdDevNumParams numeric\n");
73 | allLines.add("@attribute numIf numeric\n");
74 | allLines.add("@attribute numElif numeric\n");
75 | allLines.add("@attribute numElse numeric\n");
76 | allLines.add("@attribute numSwitch numeric\n");
77 | allLines.add("@attribute numTernary numeric\n");
78 | allLines.add("@attribute numFor numeric\n");
79 | allLines.add("@attribute numWhile numeric\n");
80 | allLines.add("@attribute numDo numeric\n");
81 | allLines.add("@attribute newLineBeforeOpeningBrace {TRUE, FALSE}\n");
82 | }
83 | }
84 |
85 | //number of functions
86 | //program length
87 | //number of tokens
88 | //number of comments
89 | //number of String/character/numeric literals
90 | //number of unique reserved words used
91 | //average length of lines
92 | //number of empty lines
93 | //the ratio of whitespace to text
94 | //average number of parameters per function
95 |
96 | //standard deviation of length of lines
97 | //number of macros
98 | //whether tabs precede lines (versus spaces)
99 | //number of tabs
100 | //number of spaces
101 | //standard deviation of number of parameters
102 | //number of "if" statements
103 | //number of "else if" statements
104 | //number of "else" statements
105 | //number of "switch" statements
106 | //number of ternary operators
107 | //number of "for" loops
108 | //number of "while" loops
109 | //number of "do-while" loops
--------------------------------------------------------------------------------
/Naive Baseline/src/ARFFFactory3.java:
--------------------------------------------------------------------------------
1 | import java.util.Iterator;
2 | import java.util.List;
3 |
4 |
5 | public class ARFFFactory3 extends ARFFFactory2 {
6 |
7 | @Override
8 | protected void appendAttributes(FeatureSet f, StringBuffer x) { // Like ARFFFactory2, but count features are emitted as log(count / length) and the raw length column is dropped.
9 | double len = f.length(); // held as double so every count / len below is a floating-point division
10 |
11 | x.append(((AbstractExtractor) f).getFile().getName() + ","); // the instance ID is the source file name
12 | instanceIDs.add(((AbstractExtractor) f).getFile().getName()); // remembered for the nominal instanceID declaration
13 |
14 | x.append(Math.log(f.numFunctions() / len) + ","); // NOTE(review): a zero count makes Math.log return -Infinity - confirm the ARFF consumer accepts that
15 | // x.append(f.length() + ",");
16 | x.append(Math.log(f.numTokens() / len) + ",");
17 | x.append(Math.log(f.numComments() / len) + ",");
18 | x.append(Math.log(f.getLiterals().size() / len) + ",");
19 | x.append(Math.log(f.getReservedWords().size() / len) + ",");
20 | x.append(f.avgLineLength() + ","); // averages, ratios and standard deviations are left untransformed
21 | x.append(Math.log(f.numEmptyLines() / len) + ",");
22 | x.append(f.whiteSpaceRatio() + ",");
23 | x.append(f.avgParamsPerFunction() + ",");
24 |
25 | x.append(stdDev(f.lineLengths()) + ",");
26 | x.append(Math.log(f.numMacros() / len) + ",");
27 | x.append(("" + f.tabsLeadLines()).toUpperCase() + ","); // boolean rendered as TRUE/FALSE to match the nominal declaration
28 | x.append(Math.log(f.getWhiteSpace().get(WhiteSpace.tab) / len) + ",");
29 | x.append(Math.log(f.getWhiteSpace().get(WhiteSpace.space) / len) + ",");
30 | x.append(stdDev(f.numFunctionParams()) + ",");
31 | x.append(Math.log(f.getControlStructures().get(ControlStatement.ifStatement) / len) + ",");
32 | x.append(Math.log(f.getControlStructures().get(ControlStatement.elifStatement) / len) + ",");
33 | x.append(Math.log(f.getControlStructures().get(ControlStatement.elseStatement) / len) + ",");
34 | x.append(Math.log(f.getControlStructures().get(ControlStatement.switchStatement) / len) + ",");
35 | x.append(Math.log(f.getControlStructures().get(ControlStatement.ternaryOperator) / len) + ",");
36 | x.append(Math.log(f.getLoops().get(Loops.forLoop) / len) + ",");
37 | x.append(Math.log(f.getLoops().get(Loops.whileLoop) / len) + ",");
38 | x.append(Math.log(f.getLoops().get(Loops.doWhileLoop) / len) + ",");
39 | x.append(("" + f.newLineBrace()).toUpperCase() + ","); // boolean rendered as TRUE/FALSE
40 | }
41 |
42 | @Override
43 | protected void arffAttributes(List allLines) { // Declares the instanceID attribute and the 24 feature attributes emitted by this class's appendAttributes.
44 | allLines.add("@attribute instanceID {");
45 | Iterator id = instanceIDs.iterator(); // IDs were collected while the data rows were built
46 | while (id.hasNext()) {
47 | allLines.add(id.next());
48 | if (id.hasNext()) { // separator only between IDs
49 | allLines.add(",");
50 | }
51 | }
52 | allLines.add("}\n");
53 |
54 | allLines.add("@attribute log(numFunctions/length) numeric\n"); // names state the transform applied in appendAttributes
55 | // allLines.add("@attribute length numeric\n");
56 | allLines.add("@attribute log(numTokens/length) numeric\n");
57 | allLines.add("@attribute log(numComments/length) numeric\n");
58 | allLines.add("@attribute log(numLiterals/length) numeric\n");
59 | allLines.add("@attribute log(numReservedWords/length) numeric\n");
60 | allLines.add("@attribute avgLineLength numeric\n");
61 | allLines.add("@attribute log(numEmptyLines/length) numeric\n");
62 | allLines.add("@attribute whiteSpaceRatio numeric\n");
63 | allLines.add("@attribute avgParams numeric\n");
64 |
65 | allLines.add("@attribute stdDevLineLength numeric\n");
66 | allLines.add("@attribute log(numMacros/length) numeric\n");
67 | allLines.add("@attribute tabsLeadLines {TRUE, FALSE}\n");
68 | allLines.add("@attribute log(numTabs/length) numeric\n");
69 | allLines.add("@attribute log(numSpaces/length) numeric\n");
70 | allLines.add("@attribute stdDevNumParams numeric\n");
71 | allLines.add("@attribute log(numIf/length) numeric\n");
72 | allLines.add("@attribute log(numElif/length) numeric\n");
73 | allLines.add("@attribute log(numElse/length) numeric\n");
74 | allLines.add("@attribute log(numSwitch/length) numeric\n");
75 | allLines.add("@attribute log(numTernary/length) numeric\n");
76 | allLines.add("@attribute log(numFor/length) numeric\n");
77 | allLines.add("@attribute log(numWhile/length) numeric\n");
78 | allLines.add("@attribute log(numDo/length) numeric\n");
79 | allLines.add("@attribute newLineBeforeOpeningBrace {TRUE, FALSE}\n");
80 | }
81 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/ARFFFactory4.java:
--------------------------------------------------------------------------------
1 | import java.util.List;
2 |
3 |
4 | public class ARFFFactory4 extends ARFFFactory3 { // Extends the ARFFFactory3 feature set with two block-structure features.
5 |
6 | @Override
7 | protected void appendAttributes(FeatureSet f, StringBuffer x) { // Appends the inherited columns, then nesting depth and branching factor.
8 | super.appendAttributes(f, x);
9 |
10 | x.append(f.nestingDepth() + ",");
11 | x.append(f.branchingFactor() + ",");
12 | }
13 |
14 | @Override
15 | protected void arffAttributes(List allLines) { // Declares the inherited attributes, then the two added ones in the same order as the values.
16 | super.arffAttributes(allLines);
17 |
18 | allLines.add("@attribute nestingDepth numeric\n");
19 | allLines.add("@attribute branchingFactor numeric\n");
20 | }
21 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/AbstractExtractor.java:
--------------------------------------------------------------------------------
1 | import java.io.BufferedReader;
2 | import java.io.File;
3 | import java.io.FileReader;
4 | import java.io.IOException;
5 | import java.util.HashMap;
6 | import java.util.Iterator;
7 | import java.util.LinkedList;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Scanner;
11 | import java.util.Stack;
12 |
13 | /**
14 | * Two big assumptions: the code is valid and no silly macros
15 | *
16 | * @author Andrew Liu
17 | *
18 | */
19 | public abstract class AbstractExtractor implements FeatureSet {
20 |
21 | private File file; // the program file passed to the constructor
22 | static String tokenDelimiter = "[*;\\{\\}\\[\\]()+=\\-&/|%!?:,<>~`\\s\"]"; // regex character class of token-separating characters
23 | MultiSet literals; // literals pulled out of the source by the constructor
24 | List commentList; // comments pulled out of the source by the constructor
25 | CodeBlock blocks; // root of the block tree; its prototype is the file name
26 | final String code; // source stripped of literals and comments
27 | int length = 0; // number of characters read from the file
28 | int numWhiteSpaceChars = 0; // number of those characters that match "\\s"
29 | List lines; // the file's lines as read by a Scanner
30 |
31 | public AbstractExtractor(File program) throws IOException { // Reads the file, separates literals and comments from code, builds the block tree, then stores the raw lines.
32 | setTokenDelimiter(); // set what separates a token
33 |
34 | /* reading in the program contents into a StringBuffer */
35 | this.file = program;
36 | BufferedReader reader = new BufferedReader(new FileReader(program));
37 | StringBuffer source = new StringBuffer();
38 | char nextChar;
39 | while (reader.ready()) { // TODO can extract features here
40 | this.length++; // one increment per character read
41 | nextChar = (char) reader.read();
42 | String charStr = "" + nextChar;
43 | if (charStr.matches("\\s")) { // whitespace is counted while reading
44 | numWhiteSpaceChars++;
45 | }
46 | source.append(nextChar);
47 | }
48 | reader.close(); // NOTE(review): not reached if a read throws; the reader is not closed in a finally block
49 |
50 | /*
51 | * stripping out the String, character, integer, and floating point
52 | * literals
53 | */
54 | /* filtering out the comments as well */
55 | this.literals = new MultiSet();
56 | this.commentList = new LinkedList<>();
57 | StringBuffer sink = new StringBuffer(); // receives whatever is neither literal nor comment
58 |
59 | while (source.length() > 0) { // relies on each branch consuming something from source
60 | if (matchesLiteral(source)) {
61 | // read in the literal
62 | this.literals.add(readNextLiteral(source));
63 | } else if (matchesComment(source)) {
64 | // read in the comment
65 | this.commentList.add(readNextComment(source));
66 | } else {
67 | // read in the code until after the next delimiter
68 | readUntilNextToken(source, sink);
69 | }
70 | }
71 |
72 | /* putting the leftover code back into the source */
73 | source = sink;
74 | sink = new StringBuffer();
75 | this.code = source.toString(); // setting the code without literals or
76 | // comments
77 |
78 | /* separating the code by blocks */
79 | this.blocks = new CodeBlock(this.file.getName()); // the root block is labelled with the file name
80 | CodeBlock currentBlock = blocks;
81 | while (source.length() > 0) {
82 | if (isPrototype(source)) {
83 | // adding all statements into the previous block
84 | currentBlock.addStatements(breakIntoStmts(sink));
85 | sink = new StringBuffer(); // breakIntoStmts does not empty the buffer, so start a fresh one
86 | // creating a child block to use
87 | CodeBlock temp = new CodeBlock(
88 | extractPrototype(source));
89 | currentBlock.addChild(temp);
90 | currentBlock = temp; // descend into the new block
91 | } else if (isBlockEnd(source, sink)) {
92 | // adding all statements into the previous block
93 | currentBlock.addStatements(breakIntoStmts(sink));
94 | sink = new StringBuffer();
95 | // using the parent block
96 | currentBlock = currentBlock.getParent(); // NOTE(review): presumably null once the root is closed - relies on balanced blocks, confirm
97 | } else {
98 | readUntilNextToken(source, sink);
99 | }
100 | }
101 | Scanner sc = new Scanner(this.file); // second pass over the file, this time line by line
102 | this.lines = new LinkedList();
103 | while (sc.hasNextLine()) {
104 | this.lines.add(sc.nextLine());
105 | }
106 | sc.close();
107 | }
108 |
109 | /**
110 | * Implement this now or make a getter for the token delimiter. Also
111 | * remember to read in the delimiter itself! The constructor calls this in a loop until source is empty.
112 | *
113 | * @param source buffer the text is taken from
114 | * @param sink buffer the constructor later uses as the stripped code
115 | */
116 | abstract void readUntilNextToken(StringBuffer source, StringBuffer sink);
117 |
118 | abstract boolean matchesLiteral(StringBuffer source); // checked by the constructor before it calls readNextLiteral
119 |
120 | abstract String readNextLiteral(StringBuffer source); // the returned value is stored in the literals multiset
121 |
122 | abstract boolean matchesComment(StringBuffer source); // checked by the constructor before it calls readNextComment
123 |
124 | abstract String readNextComment(StringBuffer source); // the returned value is stored in commentList
125 |
126 | abstract boolean isPrototype(StringBuffer source); // when true, the constructor opens a child CodeBlock
127 |
128 | /**
129 | * Don't forget to remove the opening delimiter of the next block
130 | *
131 | * @param source buffer the prototype is taken from
132 | * @return the text used as the new child block's prototype
133 | */
134 | abstract String extractPrototype(StringBuffer source);
135 |
136 | /**
137 | * Will put the "while" part into sink if it detects a do-while
138 | *
139 | * @param source buffer being scanned
140 | * @param sink buffer holding the current block's pending statements
141 | * @return true when the constructor should close the current block
142 | */
143 | abstract boolean isBlockEnd(StringBuffer source, StringBuffer sink);
144 |
145 | /**
146 | * Does NOT empty buffer when done.
147 | *
148 | * @param source buffer holding the statements to split
149 | * @return the statements, which the constructor adds to the current block
150 | */
151 | abstract List breakIntoStmts(StringBuffer source);
152 |
153 | static void setTokenDelimiter() { // Empty in this class; called first thing in the constructor.
154 | // override if you want
155 | }
156 |
157 | final void extractMultipleChars(StringBuffer source, StringBuffer sink,
158 | int num) { // Moves the first num characters of source to the end of sink, one at a time.
159 | for (int i = 0; i < num; i++) {
160 | extractChar(source, sink);
161 | }
162 | }
163 |
164 | final void extractChar(StringBuffer source, StringBuffer sink) { // Moves the first character of source to the end of sink.
165 | sink.append(source.charAt(0)); // charAt(0) requires source to be non-empty
166 | source.deleteCharAt(0);
167 | }
168 |
169 | final char peek(StringBuffer source) { // Returns the first character of source without removing it.
170 | return source.charAt(0); // requires source to be non-empty
171 | }
172 |
173 | /**
174 | * Remember this eats up the regex char! (unless that char is a double quote, which is left in source)
175 | *
176 | * @param source buffer the characters are taken from
177 | * @param sink buffer the characters are appended to
178 | * @param regex pattern tested against one character at a time
179 | */
180 | final void readUntil(StringBuffer source, StringBuffer sink, String regex) {
181 | this.readBefore(source, sink, regex); // copy everything up to the stopping character
182 | if (this.peek(source) != '"') { // a double quote is deliberately not consumed
183 | this.extractChar(source, sink);
184 | }
185 | }
186 |
187 | /**
188 | * Same as readUntil except it doesn't eat the regex. Also stops once a single character remains in source.
189 | *
190 | * @param source buffer the characters are taken from
191 | * @param sink buffer the characters are appended to
192 | * @param regex pattern tested against one character at a time
193 | */
194 | final void readBefore(StringBuffer source, StringBuffer sink, String regex) {
195 | while (source.length() > 1 && !source.substring(0, 1).matches(regex)) { // the last character is never moved by this loop
196 | this.extractChar(source, sink);
197 | }
198 | }
199 |
200 | static String getTokenDelimiter() { // Returns the static token-delimiter regex.
201 | return tokenDelimiter;
202 | }
203 |
204 | public File getFile() { // Returns the program file this extractor was constructed from.
205 | return this.file;
206 | }
207 |
208 | @Override
209 | public int nestingDepth() { // Reports the height of the root code block as the nesting depth.
210 | return this.blocks.getHeight();
211 | }
212 |
213 | @Override
214 | public double branchingFactor() { // Average child count over all blocks that have at least one child.
215 | List numChildren = new LinkedList<>(); // one entry per block that has children
216 | Stack> stack = new Stack<>();
217 | stack.add(this.blocks); // iterative traversal starting at the root block
218 | while (!stack.empty()) {
219 | CodeBlock myBlock = stack.pop();
220 | if (myBlock.children.size() > 0) { // childless blocks are left out of the average
221 | numChildren.add(myBlock.children.size());
222 | for (CodeBlock c : myBlock.children) {
223 | stack.add(c);
224 | }
225 | }
226 | }
227 | int sum = 0;
228 | double size = numChildren.size(); // double so the division below is floating-point
229 | for (Integer i : numChildren) {
230 | sum += i;
231 | }
232 | return sum / size; // 0 / 0.0, i.e. NaN, when no block has children
233 | }
234 |
235 | @Override
236 | public int numComments() { // Number of comments collected by the constructor.
237 | return this.commentList.size();
238 | }
239 |
240 | @Override
241 | public List getComments() { // Returns the collected comments as a new list.
242 | return new LinkedList(this.commentList); // copy, so callers cannot modify the internal list
243 | }
244 |
245 | @Override
246 | public Map getLiterals() { // Returns the collected literals as a new map.
247 | return new HashMap(this.literals); // copy, so callers cannot modify the internal multiset
248 | }
249 |
250 | @Override
251 | public int length() { // Number of characters the constructor read from the file.
252 | return this.length;
253 | }
254 |
255 | @Override
256 | public int numEmptyLines() { // Counts whitespace-only lines lying between the first and the last non-blank line.
257 | int count = 0;
258 | int bufferCount = 0; // blank lines seen since the last non-blank line, not yet confirmed as interior
259 | boolean leadingFlag = false; // becomes true once the first non-blank line has been seen
260 | for (String line : this.lines) {
261 | if (line.matches("[\\s]*")) {
262 | if (leadingFlag) { // blank lines before the first non-blank line are ignored
263 | bufferCount++;
264 | }
265 | } else {
266 | count += bufferCount; // a non-blank line confirms the buffered blanks; trailing blanks are never added
267 | bufferCount = 0;
268 | leadingFlag = true;
269 | }
270 | }
271 | return count;
272 | }
273 |
274 | @Override
275 | public List lineLengths() { // Returns the character length of every stored line, in file order.
276 | List lengths = new LinkedList();
277 | for (String line : this.lines) {
278 | lengths.add(line.length());
279 | }
280 | return lengths;
281 | }
282 |
283 | @Override
284 | public double avgLineLength() { // Mean of the values returned by lineLengths().
285 | int sum = 0;
286 | int count = 0;
287 | Iterator iter = this.lineLengths().iterator();
288 | while (iter.hasNext()) {
289 | sum += iter.next();
290 | count++;
291 | }
292 | return sum / (double) count; // 0 / 0.0, i.e. NaN, when there are no lines
293 | }
294 |
295 | @Override
296 | public double whiteSpaceRatio() { // Total character count divided by the whitespace character count.
297 | return this.length / (double) this.numWhiteSpaceChars; // NOTE(review): this is total/whitespace, not whitespace/text as the name suggests - confirm intent; a zero whitespace count divides by 0.0
298 | }
299 |
300 | @Override
301 | public boolean tabsLeadLines() { // True when at least as many lines of the stripped code start with a tab as start with a space.
302 | int tabs = 0;
303 | int spaces = 0;
304 | for (String s : this.code.split("\\n")) { // works on the stripped code, not on the raw file lines
305 | if (s.matches("\\t.*")) { // line begins with a tab
306 | tabs++;
307 | } else if (s.matches(" .*")) { // line begins with a space
308 | spaces++;
309 | }
310 | }
311 | return tabs >= spaces; // a tie, including 0 versus 0, counts as tabs
312 | }
313 |
314 | @Override
315 | public String instanceID() { // The instance ID is the name of the program file.
316 | return this.file.getName();
317 | }
318 |
319 | @Override
320 | public Map getWhiteSpace() { // Tallies newline, tab and space characters found in the stripped code.
321 | MultiSet whitespace = new MultiSet();
322 | whitespace.put(WhiteSpace.newLine, 0); // all three keys are seeded so each is present even when its count stays 0
323 | whitespace.put(WhiteSpace.tab, 0);
324 | whitespace.put(WhiteSpace.space, 0);
325 | for (int i = 0; i < this.code.length(); i++) { // scans the stripped code, not the raw file
326 | if (this.code.charAt(i) == '\n') {
327 | whitespace.add(WhiteSpace.newLine); // presumably MultiSet.add increments the key's count - confirm in MultiSet
328 | } else if (this.code.charAt(i) == '\t') {
329 | whitespace.add(WhiteSpace.tab);
330 | } else if (this.code.charAt(i) == ' ') {
331 | whitespace.add(WhiteSpace.space);
332 | }
333 | }
334 | return whitespace;
335 | }
336 |
337 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/CodeBlock.java:
--------------------------------------------------------------------------------
1 | import java.util.LinkedList;
2 | import java.util.List;
3 |
4 | /*
5 | * TODO
6 | *
7 | * This class is purposefully mutable. Reconsider the shallowness/deepness of some of the getters...
8 | *
9 | * Consider making a node inner class rather than recursively using this class.
10 | *
11 | * EDIT: This class probably shouldn't be purposefully mutable...
12 | */
13 |
14 | /**
15 | * A data structure that resembles an n-ary tree. It represents blocks of code
16 | * and its nested blocks (and statements).
17 | *
18 | * @author Andrew Liu
19 | *
20 | * @param <T>
21 | * The type of statement each block holds. Usually a String.
22 | */
23 | public class CodeBlock {
24 |
25 | private String prototype;
26 | private List statements;
27 | List> children;
28 | private CodeBlock parent;
29 |
30 | /**
31 | * Default constructor.
32 | */
33 | private CodeBlock() {
34 | this.parent = null;
35 | this.statements = new LinkedList();
36 | this.children = new LinkedList>();
37 | }
38 |
39 | /**
40 | * Constructor.
41 | *
42 | * @param prototype
43 | * The "prototype" for the block. It can be a function prototype,
44 | * or a class declaration, loop header, etc...
45 | */
46 | public CodeBlock(String prototype) {
47 | this();
48 | this.prototype = prototype;
49 | }
50 |
51 | /**
52 | * Copy constructor.
53 | *
54 | * @param copy
55 | * CodeBlock to copy.
56 | */
57 | public CodeBlock(CodeBlock copy) {
58 | this();
59 | this.parent = copy.parent;
60 | this.prototype = copy.prototype;
61 | this.addStatements(copy.statements);
62 | for (CodeBlock child : copy.getChildren()) {
63 | this.addChild(new CodeBlock(child));
64 | }
65 | }
66 |
67 | /**
68 | * Gets the block's parent block.
69 | *
70 | * @return The parent block.
71 | */
72 | public CodeBlock getParent() {
73 | return this.parent;
74 | }
75 |
76 | /**
77 | * Changes the block's parent block.
78 | *
79 | * @param parent
80 | * The new parent block.
81 | */
82 | public void setParent(CodeBlock parent) {
83 | this.parent = parent;
84 | }
85 |
86 | /**
87 | * Gets the prototype for this block.
88 | *
89 | * @return The block's prototype.
90 | */
91 | public String getPrototype() {
92 | return this.prototype;
93 | }
94 |
95 | /**
96 | * Does a depth-first search to get the prototypes of this code block and
97 | * all child code blocks.
98 | *
99 | * @return All prototypes.
100 | */
101 | public List getPrototypesRecursively() {
102 | List prototypes = new LinkedList();
103 | prototypes.add(this.prototype);
104 | for (CodeBlock child : this.children) {
105 | prototypes.addAll(child.getPrototypesRecursively());
106 | }
107 | return prototypes;
108 | }
109 |
110 | /**
111 | * Changes the block's prototype.
112 | *
113 | * @param prototype
114 | * The new prototype.
115 | */
116 | public void setPrototype(String prototype) {
117 | this.prototype = prototype.trim();
118 | }
119 |
120 | /**
121 | * Adds the statement to the list of statements for the block.
122 | *
123 | * @param statement
124 | * The statement to add.
125 | */
126 | public void addStatement(T statement) {
127 | this.statements.add(statement);
128 | }
129 |
130 | /**
131 | * Adds multiple statements to the list of statements.
132 | *
133 | * @param statements
134 | * The list of statements to add.
135 | */
136 | public void addStatements(List statements) {
137 | for (T statement : statements) {
138 | this.statements.add(statement);
139 | }
140 | }
141 |
142 | /**
143 | * Gets the list of statements for the current block.
144 | *
145 | * @return The list of statements for the current block.
146 | */
147 | public List getStatements() {
148 | return new LinkedList(this.statements);
149 | }
150 |
151 | /**
152 | * Gets a list of all statements for the block and its children depth-first.
153 | *
154 | * @return List of all statements.
155 | */
156 | public List getStatementsRecursively() {
157 | List allStatements = this.getStatements();
158 | for (CodeBlock child : this.children) {
159 | allStatements.addAll(child.getStatementsRecursively());
160 | }
161 | return allStatements;
162 | }
163 |
164 | /**
165 | * Adds a new child to the code block.
166 | *
167 | * @param child
168 | * The new child.
169 | */
170 | public void addChild(CodeBlock child) {
171 | this.children.add(child);
172 | child.parent = this;
173 | }
174 |
175 | /**
176 | * Gets the list of children for the code block.
177 | *
178 | * @return The list of code block children.
179 | */
180 | public List> getChildren() {
181 | List> children = new LinkedList>();
182 | for (CodeBlock block : this.children) {
183 | children.add(new CodeBlock(block));
184 | }
185 | return children;
186 | }
187 |
188 | /**
189 | * Calculates the height of this tree structure.
190 | *
191 | * @return The height of the tree.
192 | */
193 | public int getHeight() {
194 | int height = 1;
195 | int max = 0;
196 | for (CodeBlock child : this.children) {
197 | int subHeight = child.getHeight();
198 | if (subHeight > max) {
199 | max = subHeight;
200 | }
201 | }
202 | return height + max;
203 | }
204 |
205 | /**
206 | * Gets the total number of code block children including the current block.
207 | *
208 | * @return The total number of nodes in the tree.
209 | */
210 | public int getTotalNumBlocks() {
211 | int total = 1;
212 | for (CodeBlock child : this.children) {
213 | total += child.getTotalNumBlocks();
214 | }
215 | return total;
216 | }
217 |
218 | /*
219 | * (non-Javadoc)
220 | *
221 | * @see java.lang.Object#toString()
222 | */
223 | public String toString() {
224 | return this.toStringAux().toString();
225 | }
226 |
227 | private StringBuffer toStringAux() {
228 | StringBuffer ret = new StringBuffer("{");
229 | ret.append(this.prototype);
230 | for (CodeBlock child : this.children) {
231 | ret.append(child.toStringAux());
232 | }
233 | return ret.append("}");
234 | }
235 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/ControlStatement.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Kinds of control statements that are counted as features. Each constant
3 |  * prints as its short label.
4 |  */
5 | public enum ControlStatement {
6 |
7 |     ifStatement("if"),
8 |     elifStatement("elif"),
9 |     elseStatement("else"),
10 |     switchStatement("switch"),
11 |     ternaryOperator("ternary");
12 |
13 |     /** Short label returned by toString(). */
14 |     private final String label;
15 |
16 |     private ControlStatement(String label) {
17 |         this.label = label;
18 |     }
19 |
20 |     /** @return the short label of this control statement. */
21 |     @Override
22 |     public String toString() {
23 |         return label;
24 |     }
25 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/Driver.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Command-line entry point of the Naive Baseline feature extractor.
3 |  */
4 | public class Driver {
5 |
6 |     /**
7 |      * Builds one ARFF file from one dataset directory.
8 |      *
9 |      * Dependencies: Apache Commons IO, Util.java, ARFFFactory*.java,
10 |      * everything else in the Naive-Baseline package
11 |      *
12 |      * @param args
13 |      *            args[0] is the dataset directory to read, args[1] is the
14 |      *            path of the .arff file to write; both are handed to
15 |      *            ARFFFactory4.makeARFF unchanged.
16 |      */
17 |     public static void main(String args[]) {
18 |         if (args.length != 2) {
19 |             System.err
20 |                     .println("Usage: java Driver <input directory> <output .arff file>");
21 |             System.exit(1);
22 |         }
23 |         // The run is driven by the command line only. An earlier revision
24 |         // followed this call with a second run over hard-coded paths on the
25 |         // original developer's machine, which cannot work anywhere else.
26 |         (new ARFFFactory4()).makeARFF(args[0], args[1]);
27 |     }
28 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/ExtractorC.java:
--------------------------------------------------------------------------------
1 | import java.io.File;
2 | import java.io.IOException;
3 | import java.util.Arrays;
4 | import java.util.HashSet;
5 | import java.util.Iterator;
6 | import java.util.LinkedList;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Set;
10 |
11 | public class ExtractorC extends AbstractExtractor {
12 |
13 | protected Set reservedWords;
14 |
15 | public ExtractorC(File program) throws IOException {
16 | super(program);
17 | this.prepareReservedWords();
18 | }
19 |
20 | protected void prepareReservedWords() {
21 | this.reservedWords = new HashSet();
22 | for (String s : ReservedC.reservedWords) {
23 | this.reservedWords.add(s);
24 | }
25 | }
26 |
27 | @Override
28 | void readUntilNextToken(StringBuffer source, StringBuffer sink) {
29 | this.readUntil(source, sink, tokenDelimiter);
30 | }
31 |
32 | @Override
33 | boolean matchesLiteral(StringBuffer source) {
34 | return source.charAt(0) == '"' || source.charAt(0) == '\''
35 | || source.toString().matches("[\\d]+[\\w\\W]*")
36 | || source.toString().matches("[.][\\d]+[\\w\\W]*");
37 | }
38 |
39 | @Override
40 | String readNextLiteral(StringBuffer source) {
41 | StringBuffer sink = new StringBuffer();
42 | if (source.charAt(0) == '"') {
43 | // strings
44 | this.extractChar(source, sink); // get opening quote
45 | char prev = '\0';
46 | char next;
47 | while (source.length() > 0) {
48 | next = source.charAt(0);
49 | this.extractChar(source, sink);
50 | if (prev != '\\' && next == '"') {
51 | break;
52 | }
53 | prev = next;
54 | }
55 | } else if (source.charAt(0) == '\'') {
56 | // characters
57 | if (source.charAt(1) == '\\') {
58 | this.extractMultipleChars(source, sink, 4);
59 | } else {
60 | this.extractMultipleChars(source, sink, 3);
61 | }
62 | } else {
63 | // numbers
64 | this.readBefore(source, sink, "\\D");
65 | if (source.charAt(0) == 'l' || source.charAt(0) == 'L') {
66 | this.extractChar(source, sink);
67 | } else if (source.charAt(0) == '.') {
68 | // is a floating point number
69 | this.extractChar(source, sink);
70 | this.readBefore(source, sink, "\\D");
71 | if (source.charAt(0) == 'f' || source.charAt(0) == 'F') {
72 | this.extractChar(source, sink);
73 | }
74 | }
75 | }
76 | return sink.toString();
77 | }
78 |
79 | @Override
80 | boolean matchesComment(StringBuffer source) {
81 | return source.length() >= 2
82 | && (source.substring(0, 2).equals("//") || source.substring(0,
83 | 2).equals("/*"));
84 | }
85 |
86 | @Override
87 | String readNextComment(StringBuffer source) {
88 | StringBuffer sink = new StringBuffer();
89 | if (source.substring(0, 2).equals("//")) {
90 | this.readUntil(source, sink, "\n");
91 | } else {
92 | int endIndex = source.toString().indexOf("*/") + 2;
93 | this.extractMultipleChars(source, sink, endIndex);
94 | }
95 | return sink.toString();
96 | }
97 |
98 | @Override
99 | boolean isPrototype(StringBuffer source) {
100 | String s = source.toString();
101 | if (s.matches(".*\\{[\\w\\W]*") || s.matches(".*\\n\\{[\\w\\W]*")) {
102 | return true;
103 | }
104 | if (s.matches("for[\\w\\W]*") || s.matches("while[\\w\\W]*")
105 | || s.matches("do [\\w\\W]*") || s.matches("struct[\\w\\W]*")
106 | || s.matches("if[\\w\\W]*") || s.matches("else[\\w\\W]*")
107 | || s.matches("switch[\\w\\W]*")) {
108 | return true; // notice the space after the "do" regex (avoids
109 | // matching "double"
110 | }
111 | if (s.matches("static[\\w\\W]*") || s.matches("extern[\\w\\W]*")
112 | || s.matches("unsigned[\\w\\W]*")
113 | || s.matches("signed[\\w\\W]*") || s.matches("char[\\w\\W]*")
114 | || s.matches("short[\\w\\W]*") || s.matches("int[\\w\\W]*")
115 | || s.matches("long[\\w\\W]*") || s.matches("float[\\w\\W]*")
116 | || s.matches("double[\\w\\W]*") || s.matches("enum[\\w\\W]*")
117 | || s.matches("typedef[\\w\\W]*")
118 | || s.matches("register[\\w\\W]*")
119 | || s.matches("union[\\w\\W]*") || s.matches("void[\\w\\W]*")) {
120 | int braceIndex = s.indexOf('{');
121 | int semicolonIndex = s.indexOf(';');
122 | if (braceIndex == -1) {
123 | return false;
124 | }
125 | if (semicolonIndex == -1) {
126 | return true;
127 | }
128 | return braceIndex < semicolonIndex;
129 | }
130 | return false;
131 | }
132 |
133 | @Override
134 | String extractPrototype(StringBuffer source) {
135 | StringBuffer sink = new StringBuffer();
136 |
137 | String s = source.toString();
138 | if (s.matches("for[\\w\\W]*") || s.matches("while[\\w\\W]*")
139 | || s.matches("do [\\w\\W]*") || s.matches("struct[\\w\\W]*")
140 | || s.matches("if[\\w\\W]*") || s.matches("else[\\w\\W]*")
141 | || s.matches("switch[\\w\\W]*")) {
142 | int lineIndex = s.indexOf("\n");
143 | int braceIndex = s.indexOf("{");
144 | if (braceIndex == -1 || braceIndex < lineIndex
145 | || s.substring(lineIndex, braceIndex).matches("[\\s]*")) {
146 | this.readBefore(source, sink, "\\n");
147 | return sink.toString();
148 | }
149 | }
150 |
151 | this.readUntil(source, sink, "\\{");
152 | return sink.substring(0, sink.length() - 1); // we don't want to
153 | // include
154 | // the '{'
155 | }
156 |
157 | @Override
158 | boolean isBlockEnd(StringBuffer source, StringBuffer sink) {
159 | if (source.charAt(0) == '}') {
160 | source.deleteCharAt(0); // get rid of the '}'
161 | if (source.length() > 0 && source.charAt(0) == ';') {
162 | source.deleteCharAt(0); // get rid of the ';' after the '}'
163 | } else if (source.length() > 0 && source.toString().matches("[\\s]*while")) {
164 | // in case of a do-while
165 | int semicolonIndex = source.indexOf(";");
166 | this.extractMultipleChars(source, sink, semicolonIndex + 1);
167 | }
168 | return true;
169 | }
170 | return false;
171 | }
172 |
173 | @Override
174 | List breakIntoStmts(StringBuffer source) {
175 | List stmts = new LinkedList();
176 | List fragments = Arrays.asList(source.toString()
177 | .split("[\\n;]"));
178 | Iterator iter = fragments.iterator();
179 | while (iter.hasNext()) {
180 | String s = iter.next();
181 | if (s.matches("[\\s]*")) {
182 | continue;
183 | }
184 | stmts.add(s.trim());
185 | }
186 | return stmts;
187 | }
188 |
189 | @Override
190 | public boolean newLineBrace() {
191 | int onLineBrace = 0;
192 | int newLineBrace = 0;
193 | for (String s : this.code.split("\\{")) {
194 | if (s.length() == 0) {
195 | continue;
196 | }
197 | if (s.charAt(s.length() - 1) == '\n') {
198 | newLineBrace++;
199 | } else {
200 | onLineBrace++;
201 | }
202 | }
203 | return newLineBrace >= onLineBrace;
204 | }
205 |
206 | @Override
207 | public int numFunctions() {
208 | int count = 0;
209 | for (String s : this.blocks.getPrototypesRecursively()) {
210 | if (isFunction(s)) { // need to double check
211 | count++;
212 | }
213 | }
214 | return count;
215 | }
216 |
217 | protected static boolean isFunction(String s) {
218 | return !(s.matches("for[\\w\\W]*") || s.matches("while[\\w\\W]*")
219 | || s.matches("do [\\w\\W]*") || s.matches("struct[\\w\\W]*")
220 | || s.matches("if[\\w\\W]*") || s.matches("else[\\w\\W]*")
221 | || s.matches("switch[\\w\\W]*") || s.matches("enum[\\w\\W]*")
222 | || s.matches("typedef[\\w\\W]*")
223 | || s.matches("register[\\w\\W]*") || s
224 | .matches("union[\\w\\W]*"));
225 | }
226 |
227 | @Override
228 | public int numTokens() {
229 | return this.code.split(tokenDelimiter).length;
230 | } // need to double check
231 |
232 | @Override
233 | public Map getReservedWords() {
234 | MultiSet reservedWords = new MultiSet<>();
235 | String[] tokens = this.code.split(tokenDelimiter);
236 | for (String token : tokens) {
237 | if (this.reservedWords.contains(token)) {
238 | reservedWords.add(token);
239 | }
240 | }
241 | return reservedWords;
242 | }
243 |
244 | @Override
245 | public Map getUserDefinedWords() {
246 | MultiSet reservedWords = new MultiSet<>();
247 | String[] tokens = this.code.split(tokenDelimiter);
248 | for (String token : tokens) {
249 | if (!this.reservedWords.contains(token)) {
250 | reservedWords.add(token);
251 | }
252 | }
253 | return reservedWords;
254 | }
255 |
256 | @Override
257 | public Map getLoops() {
258 | MultiSet myLoops = new MultiSet<>();
259 | myLoops.put(Loops.doWhileLoop, 0);
260 | myLoops.put(Loops.forLoop, 0);
261 | myLoops.put(Loops.whileLoop, 0);
262 | for (String s : this.blocks.getPrototypesRecursively()) {
263 | if (s.matches("do [\\w\\W]*")) {
264 | myLoops.add(Loops.doWhileLoop);
265 | } else if (s.matches("for [\\w\\W]*")) {
266 | myLoops.add(Loops.forLoop);
267 | } else if (s.matches("while [\\w\\W]*")) {
268 | myLoops.add(Loops.whileLoop);
269 | }
270 | }
271 | return myLoops;
272 | }
273 |
274 | @Override
275 | public Map getControlStructures() {
276 | MultiSet myControls = new MultiSet<>();
277 | myControls.put(ControlStatement.elifStatement, 0);
278 | myControls.put(ControlStatement.elseStatement, 0);
279 | myControls.put(ControlStatement.ifStatement, 0);
280 | myControls.put(ControlStatement.switchStatement, 0);
281 | myControls.put(ControlStatement.ternaryOperator, 0);
282 | for (String s : this.blocks.getPrototypesRecursively()) {
283 | if (s.matches("else if[\\w\\W]*")) {
284 | myControls.add(ControlStatement.elifStatement);
285 | } else if (s.matches("else [\\w\\W]*")) {
286 | myControls.add(ControlStatement.elseStatement);
287 | } else if (s.matches("if [\\w\\W]*")) {
288 | myControls.add(ControlStatement.ifStatement);
289 | } else if (s.matches("switch [\\w\\W]*")) {
290 | myControls.add(ControlStatement.switchStatement);
291 | }
292 | }
293 | // get ternaries by splitting via "?"
294 | myControls.put(ControlStatement.ternaryOperator, this.code.split("\\?").length - 1);
295 | return myControls;
296 | }
297 |
298 | @Override
299 | public Map numFunctionParams() {
300 | MultiSet params = new MultiSet<>();
301 | for (String s : this.blocks.getPrototypesRecursively()) {
302 | if (!isFunction(s)) {
303 | continue;
304 | }
305 | String[] s2 = s.split(",");
306 | params.add(s2.length - 1);
307 | }
308 | return params;
309 | }
310 |
311 | @Override
312 | public double avgParamsPerFunction() {
313 | Map params = this.numFunctionParams();
314 | Set keys = params.keySet();
315 | int totalParams = 0;
316 | for (Integer key : keys) {
317 | totalParams += key * params.get(key);
318 | }
319 | return totalParams / (double) this.numFunctions();
320 | }
321 |
322 | @Override
323 | public Map getVariableLocality() {
324 | // check var in nary tree with its tree depth
325 | // TODO Auto-generated method stub
326 | throw new UnsupportedOperationException();
327 | }
328 |
329 | @Override
330 | public int numMacros() {
331 | int count = 0;
332 | for (String s : this.code.split("\\n")) {
333 | if (s.matches("#.*")) {
334 | count++;
335 | }
336 | }
337 | return count;
338 | }
339 |
340 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/ExtractorCPP.java:
--------------------------------------------------------------------------------
1 | import java.io.File;
2 | import java.io.IOException;
3 | import java.util.HashSet;
4 |
5 | /**
6 |  * C++ flavour of ExtractorC: swaps in the C++ reserved words and also
7 |  * recognises the C++-only builtin types when looking for prototypes.
8 |  */
9 | public class ExtractorCPP extends ExtractorC {
10 |
11 |     /** Prefixes that always mark a prototype ("do " avoids "double"). */
12 |     private static final String[] BLOCK_PREFIXES = { "for", "while", "do ",
13 |             "struct", "if", "else", "switch" };
14 |
15 |     /** Prefixes that mark a prototype only if '{' comes before ';'. */
16 |     private static final String[] TYPE_PREFIXES = { "static", "extern",
17 |             "unsigned", "signed", "char", "short", "int", "long", "float",
18 |             "double", "enum", "typedef", "register", "union", "void",
19 |             "char16_t", "char32_t", "wchar_t", "bool" };
20 |
21 |     public ExtractorCPP(File program) throws IOException {
22 |         super(program);
23 |     }
24 |
25 |     @Override
26 |     protected void prepareReservedWords() {
27 |         HashSet<String> words = new HashSet<String>();
28 |         for (String word : ReservedCPP.reservedWords) {
29 |             words.add(word);
30 |         }
31 |         this.reservedWords = words;
32 |     }
33 |
34 |     /** @return true if text begins with one of the prefixes. */
35 |     private static boolean beginsWithAny(String text, String[] prefixes) {
36 |         for (String prefix : prefixes) {
37 |             if (text.startsWith(prefix)) {
38 |                 return true;
39 |             }
40 |         }
41 |         return false;
42 |     }
43 |
44 |     @Override
45 |     boolean isPrototype(StringBuffer source) {
46 |         String s = source.toString();
47 |
48 |         // a '{' on the first line, or at the very start of the second line
49 |         if (s.matches(".*\\{[\\w\\W]*") || s.matches(".*\\n\\{[\\w\\W]*")) {
50 |             return true;
51 |         }
52 |         if (beginsWithAny(s, BLOCK_PREFIXES)) {
53 |             return true;
54 |         }
55 |         if (!beginsWithAny(s, TYPE_PREFIXES)) {
56 |             return false;
57 |         }
58 |
59 |         // declaration or definition: a definition opens its body first
60 |         int braceIndex = s.indexOf('{');
61 |         if (braceIndex == -1) {
62 |             return false;
63 |         }
64 |         int semicolonIndex = s.indexOf(';');
65 |         return semicolonIndex == -1 || braceIndex < semicolonIndex;
66 |     }
67 |
68 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/FeatureSet.java:
--------------------------------------------------------------------------------
1 | import java.util.List;
2 | import java.util.Map;
3 |
4 | /**
5 |  * The stylometric features an extractor reports for one source file.
6 |  *
7 |  * NOTE(review): the List and Map return types appear raw here; their type
8 |  * arguments are not recoverable from this file - confirm against the
9 |  * implementing classes.
10 |  */
11 | public interface FeatureSet {
12 |
13 |     /**
14 |      * Tells whether the code is of the style "[stmt] {\n" or "[stmt]\n{\n".
15 |      *
16 |      * @return
17 |      */
18 |     boolean newLineBrace();
19 |
20 |     int numFunctions();
21 |
22 |     int nestingDepth();
23 |
24 |     double branchingFactor();
25 |
26 |     int length();
27 |
28 |     int numTokens();
29 |
30 |     int numComments();
31 |
32 |     List getComments();
33 |
34 |     Map getLiterals();
35 |
36 |     Map getReservedWords();
37 |
38 |     Map getUserDefinedWords();
39 |
40 |     Map getLoops();
41 |
42 |     List lineLengths();
43 |
44 |     double avgLineLength();
45 |
46 |     /**
47 |      * Maps each control structure to the number of times it occurs.
48 |      *
49 |      * @return
50 |      */
51 |     Map getControlStructures();
52 |
53 |     /**
54 |      * Leading and trailing empty lines are not counted.
55 |      *
56 |      * @return
57 |      */
58 |     int numEmptyLines();
59 |
60 |     double whiteSpaceRatio();
61 |
62 |     Map numFunctionParams();
63 |
64 |     double avgParamsPerFunction();
65 |
66 |     Map getVariableLocality();
67 |
68 |     Map getWhiteSpace();
69 |
70 |     String instanceID();
71 |
72 |     // public Map caseDistr();//////////////
73 |
74 |     int numMacros();
75 |
76 |     boolean tabsLeadLines();
77 |
78 |     // Ideas for further features:
79 |     // ++ vs += 1
80 |     // spaces vs tabs
81 |     // x=1 vs x = 1
82 |     // variable names
83 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/Loops.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Kinds of loops that are counted as features. Each constant prints as the
3 |  * keyword that opens the loop.
4 |  */
5 | public enum Loops {
6 |
7 |     forLoop("for"),
8 |     doWhileLoop("do"),
9 |     whileLoop("while");
10 |
11 |     /** Keyword returned by toString(). */
12 |     private final String keyword;
13 |
14 |     private Loops(String keyword) {
15 |         this.keyword = keyword;
16 |     }
17 |
18 |     /** @return the keyword that opens this kind of loop. */
19 |     @Override
20 |     public String toString() {
21 |         return keyword;
22 |     }
23 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/MultiSet.java:
--------------------------------------------------------------------------------
1 |
2 | import java.util.HashMap;
3 | import java.util.Set;
4 |
5 | /**
6 |  * A data structure emulating a set that counts the number of repeated
7 |  * elements. It is a HashMap from element to occurrence count, so counts
8 |  * can also be read or preset through the inherited map methods.
9 |  *
10 |  * @author Andrew Liu
11 |  *
12 |  * @param <T>
13 |  *            The type of element the MultiSet holds.
14 |  */
15 | public class MultiSet<T> extends HashMap<T, Integer> {
16 |
17 |     private static final long serialVersionUID = 1L;
18 |
19 |     /**
20 |      * Default constructor. Creates an empty MultiSet.
21 |      */
22 |     public MultiSet() {
23 |         super();
24 |     }
25 |
26 |     /**
27 |      * Copy constructor.
28 |      *
29 |      * @param copy
30 |      *            MultiSet whose elements and counts are copied.
31 |      */
32 |     public MultiSet(MultiSet<T> copy) {
33 |         super(copy);
34 |     }
35 |
36 |     /**
37 |      * Adds an element to the MultiSet, or increments its count by one if
38 |      * the element already exists. An element mapped to null is treated as
39 |      * absent instead of failing on the increment.
40 |      *
41 |      * @param key
42 |      *            The element to add.
43 |      */
44 |     public void add(T key) {
45 |         Integer count = this.get(key);
46 |         this.put(key, count == null ? 1 : count + 1);
47 |     }
48 |
49 |     /**
50 |      * Lists one "element=count" entry per line, in map iteration order.
51 |      *
52 |      * @see java.util.AbstractMap#toString()
53 |      */
54 |     @Override
55 |     public String toString() {
56 |         StringBuilder s = new StringBuilder();
57 |         for (java.util.Map.Entry<T, Integer> e : this.entrySet()) {
58 |             s.append(e.toString()).append('\n');
59 |         }
60 |         return s.toString();
61 |     }
62 | }
63 |
--------------------------------------------------------------------------------
/Naive Baseline/src/ReservedC.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * The reserved words of C that the extractors count as keywords.
3 |  */
4 | public class ReservedC {
5 |     public static final String[] reservedWords = {
6 |             "auto",
7 |             "break",
8 |             "case", "char", "continue",
9 |             "default", "do", "double",
10 |             "else", "entry", "extern",
11 |             "float", "for",
12 |             "goto",
13 |             "if", "int",
14 |             "long",
15 |             "register", "return",
16 |             "short", "sizeof", "static", "struct", "switch",
17 |             "typedef",
18 |             "union", "unsigned",
19 |             "while",
20 |             "enum",
21 |             "void",
22 |             "const",
23 |             "signed",
24 |             "volatile" };
25 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/ReservedCPP.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * The reserved words of C++ that the extractors count as keywords, in
3 |  * alphabetical order.
4 |  */
5 | public class ReservedCPP {
6 |
7 |     public static final String[] reservedWords = {
8 |             "alignas", "alignof", "and", "and_eq", "asm", "auto",
9 |             "bitand", "bitor", "bool", "break",
10 |             "case", "catch", "char", "char16_t", "char32_t", "class",
11 |             "compl", "const", "constexpr", "const_cast", "continue",
12 |             "decltype", "default", "delete", "do", "double", "dynamic_cast",
13 |             "else", "enum", "explicit", "export", "extern",
14 |             "false", "float", "for", "friend",
15 |             "goto",
16 |             "if", "inline", "int",
17 |             "long",
18 |             "mutable",
19 |             "namespace", "new", "noexcept", "not", "not_eq", "nullptr",
20 |             "operator", "or", "or_eq",
21 |             "private", "protected", "public",
22 |             "register", "reinterpret_cast", "return",
23 |             "short", "signed", "sizeof", "static", "static_assert",
24 |             "static_cast", "struct", "switch",
25 |             "template", "this", "thread_local", "throw", "true", "try",
26 |             "typedef", "typeid", "typename",
27 |             "union", "unsigned", "using",
28 |             "virtual", "void", "volatile",
29 |             "wchar_t", "while",
30 |             "xor", "xor_eq" };
31 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/WhiteSpace.java:
--------------------------------------------------------------------------------
1 |
2 | /**
3 |  * Kinds of whitespace characters that are counted as features. Each
4 |  * constant prints as the quoted character it stands for.
5 |  */
6 | public enum WhiteSpace {
7 |
8 |     space("' '"),
9 |     tab("'\\t'"),
10 |     newLine("'\\n'");
11 |
12 |     /** Quoted, escaped form returned by toString(). */
13 |     private final String display;
14 |
15 |     private WhiteSpace(String display) {
16 |         this.display = display;
17 |     }
18 |
19 |     /** @return the quoted, escaped form of this whitespace character. */
20 |     @Override
21 |     public String toString() {
22 |         return display;
23 |     }
24 | }
--------------------------------------------------------------------------------
/Naive Baseline/src/c_reserved_words.txt:
--------------------------------------------------------------------------------
1 | auto
2 | break
3 | case
4 | char
5 | continue
6 | default
7 | do
8 | double
9 | else
10 | entry
11 | extern
12 | float
13 | for
14 | goto
15 | if
16 | int
17 | long
18 | register
19 | return
20 | short
21 | sizeof
22 | static
23 | struct
24 | switch
25 | typedef
26 | union
27 | unsigned
28 | while
29 | enum
30 | void
31 | const
32 | signed
33 | volatile
--------------------------------------------------------------------------------
/Naive Baseline/src/cpp_reserved_words.txt:
--------------------------------------------------------------------------------
1 | alignas
2 | alignof
3 | and
4 | and_eq
5 | asm
6 | auto
7 | bitand
8 | bitor
9 | bool
10 | break
11 | case
12 | catch
13 | char
14 | char16_t
15 | char32_t
16 | class
17 | compl
18 | const
19 | constexpr
20 | const_cast
21 | continue
22 | decltype
23 | default
24 | delete
25 | do
26 | double
27 | dynamic_cast
28 | else
29 | enum
30 | explicit
31 | export
32 | extern
33 | false
34 | float
35 | for
36 | friend
37 | goto
38 | if
39 | inline
40 | int
41 | long
42 | mutable
43 | namespace
44 | new
45 | noexcept
46 | not
47 | not_eq
48 | nullptr
49 | operator
50 | or
51 | or_eq
52 | private
53 | protected
54 | public
55 | register
56 | reinterpret_cast
57 | return
58 | short
59 | signed
60 | sizeof
61 | static
62 | static_assert
63 | static_cast
64 | struct
65 | switch
66 | template
67 | this
68 | thread_local
69 | throw
70 | true
71 | try
72 | typedef
73 | typeid
74 | typename
75 | union
76 | unsigned
77 | using
78 | virtual
79 | void
80 | volatile
81 | wchar_t
82 | while
83 | xor
84 | xor_eq
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | SCAA
2 | ====
3 | Runs joern on testCode, writes joern-tools script output to text files for each testCode file,
4 | extracts features from the text files to create an arff file that can be used in WEKA for machine learning.
5 |
6 | (This project requires the development branches of joern and python-joern, and also joern-tools to be set up. These three git repositories have dependencies and come with thorough documentation.)
7 |
8 | 1) Do preprocessing for all files in the directory structure, year-> author name -> all_cpp_files_ofauthor
9 | Run preprocessDataToTXTdepAST(filePath) in FeatureCalculators.java, where test_cpp_dir contains all the cpp files of an author. Check that all dep, txt, and ast files are created correctly (e.g., if a cpp file has only comments and no code, the dep, txt and ast files will be empty; exclude such cases from authorship attribution). If you only want syntactic features, change the following lines of astLabel.py in joern-tools:
10 |
11 | if len(children) == 0:
12 | node.attr['label'] = attrDict['node']
13 | else:
14 | node.attr['label'] = attrDict['type']
15 |
16 | to
17 |
18 |
19 | if len(children) == 0:
20 | node.attr['label'] = attrDict['type']
21 | else:
22 | node.attr['label'] = attrDict['type']
23 |
24 | 2) Start writing the attribute declaration to the arff file (this writes the relation, the selected attributes and, lastly, @attribute 'authorName' {cyg4ever,darkKelvin, ....} after getting all the author names). The last attribute defines your test classes.
25 | After preprocessing, run the main method in FeatureExtractor.java
26 | test_dir has all the .txt files written from joern, can be the same as test_cpp_dir
27 | output_filename is your arff file path
28 | If you want only syntactic features from the syntactic dataset that has only node types, make sure to select the correct ASTTypes in FeatureExtractor.java
29 |
30 | 3) Extract features: from all text files in the directory structure, year-> author name -> all_txt_files_ofauthor (output from joern) extract the desired features to be written to feature vectors. In order to extract some layout and other lexical features, run Driver.java in Naive Baseline. If you want to merge the arffs from feature extractor and driver, run MergeArffFiles.java. (the instance order is important, modify code accordingly.)
31 |
32 | 4) Once the arff file is written, open it in WEKA or call WEKA from Java and use the necessary classifiers and attribute selection methods to do authorship attribution. AuthorClassification.java can also be used with a random forest and relaxed attribution.
33 |
34 | SCAA
35 |
--------------------------------------------------------------------------------
/SCAA/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/SCAA/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 |
--------------------------------------------------------------------------------
/SCAA/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | SCAA
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/SCAA/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.7
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.source=1.7
12 |
--------------------------------------------------------------------------------
/SCAA/commons-exec-1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/commons-exec-1.2.jar
--------------------------------------------------------------------------------
/SCAA/commons-lang3-3.3.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/commons-lang3-3.3.2.jar
--------------------------------------------------------------------------------
/SCAA/commons.io_2.0.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/commons.io_2.0.1.jar
--------------------------------------------------------------------------------
/SCAA/javacsv.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/javacsv.jar
--------------------------------------------------------------------------------
/SCAA/src/AuthorClassification.java:
--------------------------------------------------------------------------------
1 | import weka.attributeSelection.InfoGainAttributeEval;
2 | import weka.attributeSelection.Ranker;
3 | import weka.classifiers.*;
4 | import weka.classifiers.evaluation.ThresholdCurve;
5 | import weka.classifiers.meta.FilteredClassifier;
6 | import weka.classifiers.trees.RandomForest;
7 | import weka.core.Attribute;
8 | import weka.core.Instances;
9 | import weka.core.Range;
10 | import weka.core.Utils;
11 | import weka.filters.Filter;
12 | import weka.filters.supervised.attribute.AttributeSelection;
13 | import weka.filters.unsupervised.attribute.Remove;
14 | import weka.filters.unsupervised.instance.RemoveRange;
15 | import weka.filters.unsupervised.instance.RemoveWithValues;
16 |
17 | import java.io.BufferedWriter;
18 | import java.io.FileReader;
19 | import java.io.FileWriter;
20 | import java.util.*;
21 |
22 | /**
23 |  * Author-attribution experiment driver built on WEKA random forests.
24 |  *
25 |  * For every per-author ARFF file the program (1) trains a forest on instances
26 |  * 13..last, evaluates it on the remaining instances and logs per-instance
27 |  * predictions plus per-class AUC, and (2) cross-validates a second forest on
28 |  * the complete data set. Results are reported through Util.writeFile.
29 |  */
30 | public class AuthorClassification {
31 |
32 |     /**
33 |      * Builds a fresh option array for a random forest.
34 |      * A new array is created on every call because WEKA's option parsing
35 |      * blanks out the entries it consumes, so an array must not be reused.
36 |      *
37 |      * @param numFeatures value for -K (0 is the default logM+1)
38 |      * @param seedNumber value for -S, the random seed
39 |      * @return the tokens of "-I 300 -K numFeatures -S seedNumber"
40 |      * @throws Exception if the option string cannot be split
41 |      */
42 |     private static String[] forestOptions(int numFeatures, int seedNumber) throws Exception {
43 |         return Utils.splitOptions("-I 300 -K "+numFeatures+" -S "+seedNumber);
44 |     }
45 |
46 |     /** Reads an ARFF file into memory and closes the reader (the original leaked it). */
47 |     private static Instances loadArff(String arffFile) throws Exception {
48 |         try (FileReader reader = new FileReader(arffFile)) {
49 |             return new Instances(reader);
50 |         }
51 |     }
52 |
53 |     /** Writes text to the file at path, replacing its contents, and closes the writer. */
54 |     private static void writeText(String path, String text) throws Exception {
55 |         try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
56 |             writer.write(text);
57 |         }
58 |     }
59 |
60 |     /**
61 |      * Runs the experiment for authorNo 6..54.
62 |      *
63 |      * @param args unused
64 |      * @throws Exception on any I/O or WEKA failure
65 |      */
66 |     public static void main(String[] args) throws Exception
67 |     {
68 |         double accuracy=0;
69 |         int endRelax = 1;
70 |         int numberFiles;
71 |         int numFeatures=0; //0 is the default logM+1
72 |         int seedNumber;
73 |         double total =0;
74 |         double average =0;
75 |
76 |         String fileName ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/Results/AutomatedResults/"
77 |                 + "mallory/"+"mallory_CSFS_new.txt";
78 |         for(int authorNo=6; authorNo<=54; authorNo+=1){
79 |         for(numberFiles=9; numberFiles<10; numberFiles++){
80 |             String arffFile = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/"
81 |                     + "mallory_150/CSFS/" +"mallory_CSFS_"+authorNo+".arff" ;
82 |
83 |             Util.writeFile(numberFiles+"FilesPerAuthor: \n",fileName, true);
84 |             for(int relaxPar = 1; relaxPar<=endRelax; relaxPar++){
85 |                 total=0;
86 |                 average=0;
87 |
88 |                 for(seedNumber=1; seedNumber<2; seedNumber++){
89 |                     int foldNumber=numberFiles;
90 |
91 |                     Instances data = loadArff(arffFile);
92 |                     data.setClassIndex(data.numAttributes() - 1);
93 |                     //do not stratify if you are going to remove instances for training and testing
94 |
95 |                     // Training split: the inverted selection keeps only instances 13..last.
96 |                     RemoveRange filter1 = new RemoveRange();
97 |                     filter1.setInputFormat(data);
98 |                     filter1.setInstancesIndices("13-last");
99 |                     filter1.setInvertSelection(true);
100 |                     Instances trainData = Filter.useFilter(data, filter1);
101 |                     System.out.println("trainData size " + trainData.numInstances());
102 |                     writeText("/Users/Aylin/Desktop/Drexel/"
103 |                             + "2014/ARLInternship/SCAAarffs/mallory_150/traintest/trainData_"+authorNo+".arff", trainData.toString());
104 |
105 |                     // Test split: everything except instances 13..last.
106 |                     RemoveRange filter2 = new RemoveRange();
107 |                     filter2.setInputFormat(data);
108 |                     filter2.setInstancesIndices("13-last");
109 |                     Instances testData = Filter.useFilter(data, filter2);
110 |                     System.out.println("testData size " + testData.numInstances());
111 |                     // As before, the test file is only created empty; its contents are not dumped.
112 |                     writeText("/Users/Aylin/Desktop/Drexel/"
113 |                             + "2014/ARLInternship/SCAAarffs/mallory_150/traintest/testData_"+authorNo+".arff", "");
114 |
115 |                     Remove rm = new Remove();
116 |                     rm.setAttributeIndices("1"); // remove 1st attribute
117 |
118 |                     // Configure the forest itself. Passing the forest flags to
119 |                     // FilteredClassifier.setOptions (as before) does not configure the wrapped
120 |                     // forest: that method parses the wrapper's own -F/-W specification.
121 |                     RandomForest malForest = new RandomForest();
122 |                     malForest.setOptions(forestOptions(numFeatures, seedNumber));
123 |                     FilteredClassifier fc = new FilteredClassifier();
124 |                     fc.setClassifier(malForest);
125 |                     fc.setFilter(rm);
126 |                     // train on trainData and make predictions on testData
127 |                     fc.buildClassifier(trainData);
128 |
129 |                     Evaluation eval_mal = new Evaluation(testData);
130 |                     eval_mal.evaluateModel(fc, testData);
131 |
132 |                     // Render the options as text; printing the array itself only shows its identity.
133 |                     String fcOptions = Utils.joinOptions(fc.getOptions());
134 |                     for (int i = 0; i < testData.numInstances(); i++) {
135 |                         double pred = fc.classifyInstance(testData.instance(i));
136 |                         System.out.print(fcOptions + " ");
137 |                         System.out.print("ID: " + testData.instance(i).value(0));
138 |                         System.out.print(", actual: " + testData.classAttribute().value((int) testData.instance(i).classValue()));
139 |                         System.out.println(", predicted: " + testData.classAttribute().value((int) pred)+"\n");
140 |
141 |                         Util.writeFile("ID: " + testData.instance(i).value(0),
142 |                                 fileName, true);
143 |                         Util.writeFile(", actual: " + testData.classAttribute().value((int) testData.instance(i).classValue()),
144 |                                 fileName, true);
145 |                         Util.writeFile(", predicted: " + testData.classAttribute().value((int) pred)+"\n",
146 |                                 fileName, true);
147 |                     }
148 |
149 |                     ThresholdCurve tc_mal = new ThresholdCurve();
150 |                     double aucClass1 = tc_mal.getROCArea(tc_mal.getCurve(eval_mal.predictions(), 0));
151 |                     double aucClass2 = tc_mal.getROCArea(tc_mal.getCurve(eval_mal.predictions(), 1));
152 |                     System.out.println("AUC class1: "+ aucClass1 + " AUC class2: "+ aucClass2);
153 |
154 |                     // Report the forest that was actually trained and evaluated above.
155 |                     RandomForest trained = (RandomForest) fc.getClassifier();
156 |                     Util.writeFile("AUC class1: "+ aucClass1 + " AUC class2: "+ aucClass2 +"\n"+"Number of features used, default is 0 (logM+1) "+trained.getNumFeatures()+ ", Correctly classified instances, "+eval_mal.pctCorrect()+", OOB error,"+trained.measureOutOfBagError()+"\n"
157 |                             +"Filename is, "+arffFile.toString()+" Number of features used, "+trained.getNumFeatures()+"\n" ,
158 |                             fileName, true);
159 |
160 |                     System.out.println("Number of instances: " + data.numInstances()+" and number of authors: " + data.numClasses());
161 |
162 |                     // Second experiment: cross-validation on the complete data set.
163 |                     RandomForest cls = new RandomForest();
164 |                     cls.setOptions(forestOptions(numFeatures, seedNumber));
165 |                     cls.buildClassifier(data);
166 |
167 |                     Evaluation eval;
168 |                     if(endRelax==1)
169 |                         eval = new Evaluation(data);
170 |                     else
171 |                         eval= new RelaxedEvaluation(data, relaxPar);
172 |
173 |                     eval.crossValidateModel(cls, data,foldNumber , new Random(seedNumber));
174 |
175 |                     accuracy=eval.pctCorrect();
176 |                     total =total+accuracy;
177 |                     // seedNumber doubles as the run count because the seeds start at 1
178 |                     average = total/seedNumber;
179 |                 }
180 |
181 |                 System.out.println("total is "+total);
182 |                 System.out.println("avg is "+average);
183 |                 System.out.println("accuracy is "+accuracy);
184 |
185 |                 System.out.println("\nThe average accuracy with "+numberFiles+"files is "+average+"\n");
186 |                 Util.writeFile("\nThe average accuracy with "+numberFiles+"files is "+average+", relaxed by, "+relaxPar+", \n",
187 |                         fileName, true);
188 |
189 |             }}
190 |         }
191 |     }
192 |
193 | }
--------------------------------------------------------------------------------
/SCAA/src/AuthorClassificationRelaxed.java:
--------------------------------------------------------------------------------
1 | import weka.attributeSelection.InfoGainAttributeEval;
2 | import weka.attributeSelection.Ranker;
3 | import weka.classifiers.*;
4 | import weka.classifiers.evaluation.ThresholdCurve;
5 | import weka.classifiers.meta.FilteredClassifier;
6 | import weka.classifiers.trees.RandomForest;
7 | import weka.core.Attribute;
8 | import weka.core.AttributeStats;
9 | import weka.core.Instances;
10 | import weka.core.Range;
11 | import weka.core.Utils;
12 | import weka.filters.Filter;
13 | import weka.filters.supervised.attribute.AttributeSelection;
14 | import weka.filters.unsupervised.attribute.Remove;
15 | import weka.filters.unsupervised.instance.RemoveRange;
16 | import weka.filters.unsupervised.instance.RemoveWithValues;
17 |
18 | import java.io.BufferedWriter;
19 | import java.io.FileReader;
20 | import java.io.FileWriter;
21 | import java.util.*;
22 |
23 | /**
24 |  * Relaxed-accuracy variant of the author-attribution experiment.
25 |  *
26 |  * Loads an ARFF file, keeps the attributes ranked best by information gain and
27 |  * cross-validates a WEKA random forest, scoring with RelaxedEvaluation unless
28 |  * endRelax is 1. Results are reported through Util.writeFile.
29 |  */
30 | public class AuthorClassificationRelaxed {
31 |
32 |     /** Reads an ARFF file into memory and closes the reader (the original leaked it). */
33 |     private static Instances loadArff(String arffFile) throws Exception {
34 |         try (FileReader reader = new FileReader(arffFile)) {
35 |             return new Instances(reader);
36 |         }
37 |     }
38 |
39 |     /**
40 |      * Runs the relaxed cross-validation experiment.
41 |      *
42 |      * @param args unused
43 |      * @throws Exception on any I/O or WEKA failure
44 |      */
45 |     public static void main(String[] args) throws Exception
46 |     {
47 |         double accuracy=0;
48 |         int endRelax = 5;
49 |         int numberFiles;
50 |         int numFeatures=0; //0 is the default logM+1
51 |         int seedNumber;
52 |         double total =0;
53 |         double average =0;
54 |
55 |         String fileName ="textFile";
56 |
57 |         for(int authorNo=9; authorNo<10; authorNo++){
58 |         for(numberFiles=9; numberFiles<10; numberFiles++){
59 |         // x is only consumed by the disabled setInstancesIndices call below,
60 |         // so every pass of this loop currently repeats the same configuration.
61 |         for (int x=28; x<=(18*31); x=x+9){
62 |             String arffFile = "path";
63 |
64 |             Util.writeFile(numberFiles+"FilesPerAuthor: \n",fileName, true);
65 |             for(int relaxPar = 5; relaxPar<=endRelax; relaxPar++){
66 |                 total=0;
67 |                 average=0;
68 |
69 |                 for(seedNumber=1; seedNumber<2; seedNumber++){
70 |                     int foldNumber=numberFiles;
71 |
72 |                     Instances data = loadArff(arffFile);
73 |                     data.setClassIndex(data.numAttributes() - 1);
74 |
75 |                     //do not stratify if you are going to remove instances for training and testing
76 |
77 |                     System.out.println(data.attributeStats(0));
78 |
79 |                     //Start information gain that selects up to n features, discarding those below the ranker threshold
80 |                     // NOTE(review): selection runs on the full data set before cross-validation,
81 |                     // so the held-out folds influence which features are kept - confirm this is intended.
82 |                     int n = 500; // number of features to select
83 |                     AttributeSelection attributeSelection = new AttributeSelection();
84 |                     Ranker ranker = new Ranker();
85 |                     ranker.setNumToSelect(n);
86 |                     ranker.setThreshold(0.001);
87 |                     InfoGainAttributeEval infoGainAttributeEval = new InfoGainAttributeEval();
88 |                     attributeSelection.setEvaluator(infoGainAttributeEval);
89 |                     attributeSelection.setSearch(ranker);
90 |                     attributeSelection.setInputFormat(data);
91 |                     data = Filter.useFilter(data, attributeSelection);
92 |                     //end of infogain
93 |
94 |                     RemoveRange rm = new RemoveRange();
95 |                     rm.setInputFormat(data);
96 |                     // rm.setInstancesIndices("first-"+(x-19)+","+x+"-last");
97 |                     Instances testData = Filter.useFilter(data, rm);
98 |                     System.out.println("testData size " + testData.numInstances());
99 |
100 |                     System.out.println("Number of instances: " + data.numInstances()+" and number of authors: " + data.numClasses());
101 |
102 |                     // The forest gets its own option array: WEKA's option parsing blanks out the
103 |                     // entries it consumes, so an array already handed to another setOptions call
104 |                     // (as the old code did) may no longer contain every flag.
105 |                     RandomForest cls = new RandomForest();
106 |                     cls.setOptions(Utils.splitOptions("-I 300 -K "+numFeatures+" -S "+seedNumber));
107 |
108 |                     Evaluation eval;
109 |                     if(endRelax==1)
110 |                         eval = new Evaluation(data);
111 |                     else
112 |                         eval= new RelaxedEvaluation(data, relaxPar);
113 |
114 |                     eval.crossValidateModel(cls, data,foldNumber , new Random(seedNumber));
115 |
116 |                     System.out.println("Relaxed by, "+relaxPar+", seedNo,"+seedNumber+", files,"+numberFiles+", authors,"+data.numClasses());
117 |                     Util.writeFile("Relaxed by, "+relaxPar+", seedNo,"+seedNumber+", files,"+numberFiles+", authors,"+data.numClasses(),
118 |                             fileName, true);
119 |
120 |                     accuracy=eval.pctCorrect();
121 |                     total =total+accuracy;
122 |                     // seedNumber doubles as the run count because the seeds start at 1
123 |                     average = total/seedNumber;
124 |                 }
125 |
126 |                 System.out.println("total is "+total);
127 |                 System.out.println("avg is "+average);
128 |                 System.out.println("accuracy is "+accuracy);
129 |
130 |                 System.out.println("\nThe average accuracy with "+numberFiles+"files is "+average+"\n");
131 |                 Util.writeFile("\nThe average accuracy with "+numberFiles+"files is "+average+", relaxed by, "+relaxPar+", \n",
132 |                         fileName, true);
133 |
134 |             }
135 |
136 |         }}
137 |         }
138 |     }
139 |
140 | }
--------------------------------------------------------------------------------
/SCAA/src/BigramExtractor.java:
--------------------------------------------------------------------------------
1 | import java.io.File;
2 | import java.io.IOException;
3 | import java.text.SimpleDateFormat;
4 | import java.util.ArrayList;
5 | import java.util.Calendar;
6 | import java.util.HashSet;
7 | import java.util.LinkedHashSet;
8 | import java.util.List;
9 | import java.util.Set;
10 | import java.util.regex.Matcher;
11 | import java.util.regex.Pattern;
12 |
13 | import org.apache.commons.lang3.StringUtils;
14 |
15 |
16 | public class BigramExtractor {
17 |
18 |
19 | public static void main(String[] args) throws IOException
20 | {
21 | Calendar cal = Calendar.getInstance();
22 | cal.getTime();
23 | SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
24 | int month = cal.get(Calendar.MONTH);
25 | int dayOfMonth = cal.get(Calendar.DAY_OF_MONTH);
26 | String time = sdf.format(cal.getTime());
27 | String output_filename = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/bigramArffs/"
28 | +(month+1) + "." + dayOfMonth + "_" +
29 | "9FilesExactlyPerAuthor_2012_validation_exact_bigrams.arff" ;
30 |
31 | String dirPath="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAA_Datasets/"
32 | +"bigExperiments/250authors/9FilesExactlyPerAuthor_2012_validation_exact_allfeatures/"; List test_file_paths = Util.listTextFiles(dirPath);
33 |
34 |
35 | String text = "";
36 | //Writing the test arff
37 | //first specify relation
38 | Util.writeFile("@relation 9FilesExactlyPerAuthor_2012_validation_bigrams"+"\n"+"\n", output_filename, true);
39 | Util.writeFile("@attribute instanceID {", output_filename, true);
40 | List test_cpp_paths = Util.listCPPFiles(dirPath);
41 | for(int j=0; j < test_cpp_paths.size();j++ )
42 | {
43 | File fileCPP = new File(test_cpp_paths.get(j).toString());
44 | String fileName = fileCPP.getName();
45 | Util.writeFile(fileName+",", output_filename, true);
46 | if ((j+1)==test_cpp_paths.size())
47 | Util.writeFile("}"+"\n", output_filename, true);
48 | }
49 | String[] ASTNodeBigrams = null;
50 | ASTNodeBigrams= getASTNodeBigrams(dirPath);
51 |
52 | for (int i=0; i uniqueWords = new HashSet();
65 |
66 | for (String word : words) {
67 | uniqueWords.add(word);
68 | }
69 | words = uniqueWords.toArray(new String[0]);
70 | int authorCount = words.length;
71 | if (i+1==test_file_paths.size()){
72 | for (int j=0; j< authorCount; j++){
73 | {System.out.println(words[j]);
74 | if(j+1 == authorCount)
75 | {
76 | Util.writeFile(words[j]+"}"+"\n\n",output_filename, true);
77 | }
78 | else
79 | {
80 | Util.writeFile(words[j]+","+"",output_filename, true);
81 |
82 | }
83 | }
84 | }
85 |
86 | }
87 |
88 | }
89 | Util.writeFile("@data"+"\n", output_filename, true);
90 | //Finished defining the attributes
91 |
92 | //EXTRACT LABELED FEATURES
93 | for(int i=0; i< test_file_paths.size(); i++){
94 | int testIDlength = test_file_paths.get(i).toString().length();
95 | File authorFileName= new File(test_file_paths.get(i).toString());
96 | String authorName= authorFileName.getParentFile().getName();
97 |
98 | System.out.println(test_file_paths.get(i));
99 | System.out.println(authorName);
100 |
101 | File fileCPPID = new File(test_cpp_paths.get(i).toString());
102 | String fileNameID = fileCPPID.getName();
103 | Util.writeFile(fileNameID+",", output_filename, true);
104 |
105 | String DepASTText = Util.readFile(test_file_paths.get(i).toString().substring(0,testIDlength-3)+"dep");
106 | float[] typeCount = getASTNodeBigramsTF(DepASTText, ASTNodeBigrams );
107 | for (int j=0; j uniqueWords = new LinkedHashSet();
127 | List unigrams = new ArrayList();
128 | Set bigrams = new LinkedHashSet();
129 | String[] uniquebigrams = null;
130 |
131 | for(int i=0; i< test_file_paths.size(); i++){
132 | String filePath = test_file_paths.get(i).toString();
133 | // System.out.println(filePath);
134 |
135 | String inputText =Util.readFile(filePath);
136 | int [] lines = DepthASTNode.getASTDepLines(inputText);
137 | String textAST=null;
138 | for (int j=0; j occurrencesHere = finder.findIndexesForKeyword(str);
48 | occurrences[j] = occurrences[j] + occurrencesHere.size();
49 |
50 |
51 | for(int k=0; k occurrencesHere = finder.findIndexesForKeyword(str);
98 | occurrences[j] = occurrences[j] + occurrencesHere.size();
99 |
100 |
101 | for(int k=0; k maxDepth[j])
116 | maxDepth[j]= rightParanthesis-leftParanthesis;
117 | }
118 |
119 | if(occurrences[j]==0)
120 | maxDepth[j]=0;
121 | }
122 | }
123 | List maxDepthall = Arrays.asList(ArrayUtils.toObject(maxDepth));
124 | return Collections.max(maxDepthall);
125 |
126 | }
127 |
128 | //line number starts from 0
129 | public static int[] getASTDepLines(String featureText)
130 | {
131 | HashSet functionIDs = new HashSet();
132 | HashSet functionIDs2 = new HashSet();
133 |
134 | //take the function id in the beginning of the line.
135 | String[] lines = featureText.split("\n");
136 | for(int i=0; i< lines.length; i++)
137 | {
138 | String firstWord = lines[i].substring(0, featureText.indexOf('\t'));
139 | if(!functionIDs.contains(firstWord))
140 | functionIDs.add(firstWord);
141 | }
142 | int [] ASTDepLines=new int[functionIDs.size()];
143 | for(int i=0; i< lines.length; i++)
144 | {
145 | String firstWord = lines[i].substring(0, featureText.indexOf('\t'));
146 | if(i==0)
147 | {
148 | functionIDs2.add(firstWord);
149 | }
150 | else
151 | {
152 | if(!functionIDs2.contains(firstWord))
153 | {
154 | int lineNumber = i-1;
155 | ASTDepLines[functionIDs2.size()-1] = lineNumber;
156 | }
157 | if(i==lines.length-1)
158 | {
159 | int lineNumber = i;
160 | ASTDepLines[functionIDs2.size()-1] = lineNumber;
161 | }
162 | functionIDs2.add(firstWord);
163 | }
164 | }
165 | return ASTDepLines;
166 | }
167 |
168 |
169 | //starts from 0: returns line number lineNumber of featureText (List.get throws IndexOutOfBoundsException when out of range)
170 | public static String readLineNumber (String featureText, int lineNumber) throws IOException
171 | {
172 |     List<String> lines = IOUtils.readLines(new StringReader(featureText)); // typed list: a raw List would hand back Object
173 |     return lines.get(lineNumber);
174 | }
175 |
176 | public static float[] InfoGainsgetAvgDepthASTNode(String featureText, String[] ASTtypesAvgDepth) throws IOException
177 | {
178 |
179 | int [] lines = getASTDepLines(featureText);
180 | float [] occurrences=new float[ASTtypesAvgDepth.length];
181 | float [] totalDepth=new float[ASTtypesAvgDepth.length];
182 | float [] avgDepth=new float[ASTtypesAvgDepth.length];
183 |
184 | String textAST=null;
185 | for (int i=0; i occurrencesHere = finder.findIndexesForKeyword(str);
193 | occurrences[j] = occurrences[j] + occurrencesHere.size();
194 |
195 |
196 | for(int k=0; k 0) {
54 | int newValue = costs[j - 1];
55 | if (s1.charAt(i - 1) != s2.charAt(j - 1))
56 | newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1;
57 | costs[j - 1] = lastValue;
58 | lastValue = newValue;
59 | }
60 | }
61 | }
62 | if (i > 0)
63 | costs[s2.length()] = lastValue;
64 | }
65 | return costs[s2.length()];
66 | }
67 |
68 | public static void printDistance(String s1, String s2) {
69 | // System.out.println(s1 + "-->" + s2 + ": " + computeDistance(s1, s2));
70 | System.out.println(levenshteinDistance(s1, s2));
71 |
72 | }
73 |
74 | @SuppressWarnings("resource")
75 | public static void main(String[] args) throws IOException {
76 |     // Reports, for every .cpp file, its mean normalized Levenshtein distance to the files of all other authors.
77 |     String parentDir = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/"
78 |             + "SCAA_Datasets/bigExperiments/250authors/9FilesExactlyPer250Author_2014/";
79 |     String output_file= "/Users/Aylin/Desktop/similarityTestAcrossAuthors.txt";
80 |
81 |     File file = new File(parentDir);
82 |     String[] directories = file.list(new FilenameFilter()
83 |     {
84 |         @Override
85 |         public boolean accept(File current, String name)
86 |         {
87 |             return new File(current, name).isDirectory();
88 |         }
89 |     });
90 |     System.out.println(Arrays.toString(directories));
91 |     //within author similarity
92 | /* for (int i =0; i< directories.length; i++)
93 | {
94 | double author_ratio=0;
95 | //authorname is directoryname - 1 because Andrew put an extra 0 at the end of the authorname
96 | String authorName = directories[i].toString().substring(0, directories[i].toString().length());
97 | String authorDir = parentDir + directories[i] + "/";
98 | Util.writeFile(authorName +"\n", output_file, true);
99 | System.out.println(authorName);
100 | System.out.println(authorDir);
101 |
102 | BufferedReader br = null;
103 | String line = "";
104 | List test_cpp_paths = Util.listCPPFiles(authorDir);
105 |
106 | for(int j=0; j < test_cpp_paths.size();j++ )
107 | {
108 | double avg_ratio=0;
109 | double ratio =0;
110 |
111 | String file1 = Util.readFile(test_cpp_paths.get(j).toString());
112 | Util.writeFile(test_cpp_paths.get(j).toString() +"\n", output_file, true);
113 | for(int k=0; k < test_cpp_paths.size();k++ )
114 | {
115 | if(j!=k){
116 | String file2 = Util.readFile(test_cpp_paths.get(k).toString());
117 | int distance =computeDistance(file1, file2);
118 | if(file1.length() <= file2.length()){
119 | ratio = distance/(double)((Integer)file2.length());
120 | }
121 | if(file2.length() < file1.length()){
122 | ratio = distance/(double)((Integer)file1.length());
123 | }
124 |
125 | Util.writeFile("File1 length: "+file1.length() +" " +
126 | "File2 length:"+file2.length()+" ", output_file, true);
127 | Util.writeFile("distance: "+Integer.toString(distance)+
128 | " "+"ratio: "+ Double.toString(ratio) + "\n" ,output_file, true);
129 | avg_ratio =avg_ratio+ratio;
130 | }}
131 | avg_ratio = avg_ratio/(double)((Integer)(test_cpp_paths.size()-1));
132 | Util.writeFile("average ratio of file: "+avg_ratio +"\n", output_file, true);
133 | System.out.println("average ratio of file: "+avg_ratio );
134 |
135 | author_ratio=author_ratio+avg_ratio;
136 | }
137 | author_ratio=author_ratio/(double)((Integer)(test_cpp_paths.size()));
138 | System.out.println("average ratio of author: "+author_ratio);
139 |
140 | Util.writeFile("average ratio of author: "+author_ratio +"\n", output_file, true);
141 |
142 | }*/
143 |
144 |     List all_cpp_paths = Util.listCPPFiles(parentDir);
145 |     String authorName2;
146 |     double avg_ratio;
147 |
148 |     for (int i =0; i< all_cpp_paths.size(); i++)
149 |     {
150 |         //the author name is the name of the directory that contains the file
151 |         File newFile = new File(all_cpp_paths.get(i).toString());
152 |         authorName2 = newFile.getParentFile().getName().toString();
153 |         Util.writeFile(authorName2+":"+newFile.getName().toString() +"\n", output_file, true);
154 |         System.out.println(authorName2);
155 |         avg_ratio = 0; // fresh sum per file; the old code carried the previous file's average into the next one
156 |         int compared = 0; // number of other-author files actually compared with this one
157 |         String file1 = Util.readFile(all_cpp_paths.get(i).toString()); // does not depend on j, so read it once
158 |         for(int j=0; j < all_cpp_paths.size();j++ )
159 |         {
160 |             File newFile1 = new File(all_cpp_paths.get(j).toString());
161 |             String authorName3 = newFile1.getParentFile().getName().toString();
162 |
163 |             double ratio =0;
164 |             if(!authorName2.equals(authorName3)){
165 |                 // normalize the distance by the length of the longer file
166 |
167 |                 String file2 = Util.readFile(all_cpp_paths.get(j).toString());
168 |                 int distance =levenshteinDistance(file1, file2);
169 |                 int longer = Math.max(file1.length(), file2.length());
170 |                 // two empty files: keep ratio at 0 instead of the NaN that 0/0.0 would yield
171 |                 if(longer > 0){
172 |                     ratio = distance/(double)longer;
173 |                 }
174 |                 compared++;
175 |
176 | /* Util.writeFile("File1 length: "+file1.length() +" " +
177 | "File2 length:"+file2.length()+" ", output_file, true);
178 | Util.writeFile("distance: "+Integer.toString(distance)+
179 | " "+"ratio: "+ Double.toString(ratio) + "\n" ,output_file, true);*/
180 | // Util.writeFile( Double.toString(ratio) + ", " ,output_file, true);
181 |                 avg_ratio =avg_ratio+ratio;
182 |         }}
183 |         avg_ratio = (compared == 0) ? 0 : avg_ratio/compared; // divide by the files really compared, not by an assumed 9 files per author
184 |         Util.writeFile("\n Average distance to all other files: "+avg_ratio +"\n", output_file, true);
185 |         System.out.println("Average distance to all other files: "+avg_ratio );
186 |     }
187 |
188 | }
189 | }
--------------------------------------------------------------------------------
/SCAA/src/FeatureExtractorInfoGain.java:
--------------------------------------------------------------------------------
1 | import java.io.File;
2 | import java.io.FileNotFoundException;
3 | import java.io.IOException;
4 | import java.util.Calendar;
5 | import java.util.HashSet;
6 | import java.util.List;
7 | import java.util.Set;
8 | import java.util.regex.Matcher;
9 | import java.util.regex.Pattern;
10 | import java.text.SimpleDateFormat;
11 |
12 | /**
13 | * FeatureExtractor writes extracted features to arff file to be used with WEKA
14 | * @author Aylin Caliskan-Islam (ac993@drexel.edu)
15 | */
16 |
17 | public class FeatureExtractorInfoGain {
18 | public static void main(String[] args) throws FileNotFoundException, IOException, ClassNotFoundException {
19 |
20 | //list the cppKeywords that appear in infogain
21 | String [] cppKeywords = {"auto","case", "class", "compl", "const","inline","namespace","operator",
22 | "signed", "static", "template", "typedef","typename","unsigned", "using"};
23 |
24 |
25 | String output_filename = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/bigExperiments/InfoGain/" +"InfoGain_9FilesPer250Author2012_bigExperiments.arff" ;
26 |
27 | String test_dir = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAA_Datasets/bigExperiments/250authors/9FilesExactlyPerAuthor_2012_validation_exact_allfeatures/";
28 | List test_file_paths = Util.listTextFiles(test_dir);
29 |
30 | String text = "";
31 | //Writing the test arff
32 | //first specify relation
33 | Util.writeFile("@relation InfoGain "+"\n"+"\n", output_filename, true);
34 | Util.writeFile("@attribute instanceID {", output_filename, true);
35 |
36 | List test_cpp_paths = Util.listCPPFiles(test_dir);
37 | for(int j=0; j < test_cpp_paths.size();j++ )
38 | {
39 | File fileCPP = new File(test_cpp_paths.get(j).toString());
40 | String fileName = fileCPP.getName();
41 | Util.writeFile(fileName+",", output_filename, true);
42 | if ((j+1)==test_cpp_paths.size())
43 | Util.writeFile("}"+"\n", output_filename, true);
44 | }
45 |
46 | // Util.writeFile("@attribute 'functionIDCount' numeric"+"\n", output_filename, true);
47 | // Util.writeFile("@attribute 'CFGNodeCount' numeric"+"\n", output_filename, true);
48 | // Util.writeFile("@attribute 'ASTFunctionIDCount' numeric"+"\n", output_filename, true);
49 | Util.writeFile("@attribute 'getMaxDepthASTLeaf' numeric"+"\n", output_filename, true);
50 |
51 |
52 | //List the info gain nodes
53 | String[] ASTtypesTF = {"T", "t", "FOR", "cout", "stdout", "freopen", "in", "tt", "tc",
54 | "test", "open", "ForStatement", "UnaryExpression", "IncDecOp", "scanf", "close",
55 | "argc", "argv", "fin", "stdin", "ofstream", "ForInit", "cin", "solve", "fopen",
56 | "ifstream", "fprintf", "cas", "printf", "ShiftExpression", "REP", "fout",
57 | "forn", "endl", "size_t", "out", "cases", "cerr"};
58 | String[] ASTtypesTFIDF = {"FOR", "cout", "stdout", "freopen", "tc", "test", "open",
59 | "close", "argc", "argv", "fin", "stdin", "ofstream", "cin", "solve", "fopen",
60 | "ifstream", "fprintf", "cas", "REP", "fout", "forn", "endl", "size_t", "out", "cases", "cerr"};
61 | String[] ASTtypesAvgDep = {"T", "d", "w", "t", "r", "FOR", "cout", "stdout", "freopen", "small",
62 | "in", "tt", "tc", "input", "test", "open", "ForStatement", "UnaryExpression", "inline",
63 | "IncDecOp", "scanf", "close", "argc", "argv", "const", "fin", "stdin", "ofstream",
64 | "ForInit", "cin", "solve", "txt", "sync_with_stdio", "fopen", "ifstream", "std", "cas",
65 | "printf", "ShiftExpression", "REP", "fout", "forn", "Case", "size_t", "out", "cases",
66 | "output", "cerr"};
67 |
68 | for (int i=0; i uniqueWords = new HashSet();
94 |
95 | for (String word : words) {
96 | uniqueWords.add(word);
97 | }
98 | words = uniqueWords.toArray(new String[0]);
99 | int authorCount = words.length;
100 | if (i+1==test_file_paths.size()){
101 | for (int j=0; j< authorCount; j++){
102 | {System.out.println(words[j]);
103 | if(j+1 == authorCount)
104 | {
105 | Util.writeFile(words[j]+"}"+"\n\n",output_filename, true);
106 | }
107 | else
108 | {
109 | Util.writeFile(words[j]+","+"",output_filename, true);
110 |
111 | }
112 | }
113 | }
114 |
115 | }
116 |
117 | }
118 |
119 |
120 | Util.writeFile("@data"+"\n", output_filename, true);
121 | //Finished defining the attributes
122 |
123 |
124 | //EXTRACT LABELED FEATURES
125 | for(int i=0; i< test_file_paths.size(); i++){
126 | String featureText = Util.readFile(test_file_paths.get(i).toString());
127 | int testIDlength = test_file_paths.get(i).toString().length();
128 | authorFileName= new File(test_file_paths.get(i).toString());
129 | String authorName= authorFileName.getParentFile().getName();
130 |
131 | System.out.println(test_file_paths.get(i));
132 | System.out.println(authorName);
133 | File fileCPPID = new File(test_cpp_paths.get(i).toString());
134 | String fileNameID = fileCPPID.getName();
135 | Util.writeFile(fileNameID+",", output_filename, true);
136 | // Util.writeFile(FeatureCalculators.functionIDCount(featureText)+",", output_filename, true);
137 | String ASTText = Util.readFile(test_file_paths.get(i).toString().substring(0,testIDlength-3)+"ast");
138 | String DepASTText = Util.readFile(test_file_paths.get(i).toString().substring(0,testIDlength-3)+"dep");
139 | String sourceCode = Util.readFile(test_file_paths.get(i).toString().substring(0,testIDlength-3)+"cpp");
140 |
141 | // Util.writeFile(FeatureCalculators.CFGNodeCount(ASTText)+",", output_filename, true);
142 | // Util.writeFile(FeatureCalculators.ASTFunctionIDCount(ASTText)+",", output_filename, true);
143 | Util.writeFile(DepthASTNode.getMaxDepthASTLeaf(DepASTText, ASTtypesTF)+",", output_filename, true);
144 |
145 |
146 |
147 | //get count of each ASTtype not-DepAST type present
148 | float[] typeCount = FeatureCalculators.DepASTTypeTF(DepASTText, ASTtypesTF );
149 | for (int j=0; j uniqueWords = new HashSet();
197 |
198 | for (String word : words) {
199 | uniqueWords.add(word);
200 | }
201 | words = uniqueWords.toArray(new String[0]);
202 | return words;
203 | }
204 |
205 |
206 | }
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
--------------------------------------------------------------------------------
/SCAA/src/IndexWrapper.java:
--------------------------------------------------------------------------------
1 | // Taken from http://whyjava.wordpress.com/2010/05/04/finding-all-the-indexes-of-a-whole-word-in-a-given-string-using-java/
2 |
3 | /**
4 |  * Holds the start and end offsets of one match found in a string.
5 |  * Instances compare equal when both offsets are equal.
6 |  */
7 | public class IndexWrapper {
8 |
9 |     private final int start;
10 |     private final int end;
11 |
12 |     /**
13 |      * @param start offset where the match begins
14 |      * @param end offset where the match ends
15 |      */
16 |     public IndexWrapper(int start, int end) {
17 |         this.start = start;
18 |         this.end = end;
19 |     }
20 |
21 |     /** @return the start offset given at construction */
22 |     public int getStart() {
23 |         return start;
24 |     }
25 |
26 |     /** @return the end offset given at construction */
27 |     public int getEnd() {
28 |         return end;
29 |     }
30 |
31 |     /** Folds {@code end} and then {@code start} into a base-31 hash seeded with 1. */
32 |     @Override
33 |     public int hashCode() {
34 |         return 31 * (31 + end) + start;
35 |     }
36 |
37 |     /** Equal only to an object of exactly this class carrying the same two offsets. */
38 |     @Override
39 |     public boolean equals(Object obj) {
40 |         if (this == obj) {
41 |             return true;
42 |         }
43 |         if (obj == null || getClass() != obj.getClass()) {
44 |             return false;
45 |         }
46 |         IndexWrapper that = (IndexWrapper) obj;
47 |         return start == that.start && end == that.end;
48 |     }
49 | }
--------------------------------------------------------------------------------
/SCAA/src/LevenshteinDistance.java:
--------------------------------------------------------------------------------
1 | import java.io.*;
2 | import java.util.*;
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileNotFoundException;
6 | import java.io.FilenameFilter;
7 | import java.io.IOException;
8 |
9 | import com.csvreader.CsvWriter;
10 |
11 |
12 | public class LevenshteinDistance {
13 |
14 | public static int computeDistance(String s1, String s2) {
15 | // s1 = s1.toLowerCase();
16 | // s2 = s2.toLowerCase();
17 |
18 | System.out.println("Length of first string: "+s1.length());
19 | System.out.println("Length of second string: "+s2.length());
20 |
21 | int[] costs = new int[s2.length() + 1];
22 | for (int i = 0; i <= s1.length(); i++) {
23 | int lastValue = i;
24 | for (int j = 0; j <= s2.length(); j++) {
25 | if (i == 0)
26 | costs[j] = j;
27 | else {
28 | if (j > 0) {
29 | int newValue = costs[j - 1];
30 | if (s1.charAt(i - 1) != s2.charAt(j - 1))
31 | newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1;
32 | costs[j - 1] = lastValue;
33 | lastValue = newValue;
34 | }
35 | }
36 | }
37 | if (i > 0)
38 | costs[s2.length()] = lastValue;
39 | }
40 | return costs[s2.length()];
41 | }
42 |
43 | public static void printDistance(String s1, String s2) {
44 | // System.out.println(s1 + "-->" + s2 + ": " + computeDistance(s1, s2));
45 | System.out.println(computeDistance(s1, s2));
46 |
47 | }
48 |
49 | @SuppressWarnings("resource")
50 | public static void main(String[] args) throws IOException {
51 |
52 | String parentDir = "/Users/Aylin/Desktop/Drexel/2014/ARLInternship/"
53 | + "SCAA_Datasets/bigExperiments/250authors/9FilesExactlyPer250Author_2014/";
54 | String output_file= "/Users/Aylin/Desktop/similarityTestAcrossAuthors.txt";
55 |
56 | File file = new File(parentDir);
57 | String[] directories = file.list(new FilenameFilter()
58 | {
59 | @Override
60 | public boolean accept(File current, String name)
61 | {
62 | return new File(current, name).isDirectory();
63 | }
64 | });
65 | System.out.println(Arrays.toString(directories));
66 | //within author similarity
67 | /* for (int i =0; i< directories.length; i++)
68 | {
69 | double author_ratio=0;
70 | //authorname is directoryname - 1 because Andrew put an extra 0 at the end of the authorname
71 | String authorName = directories[i].toString().substring(0, directories[i].toString().length());
72 | String authorDir = parentDir + directories[i] + "/";
73 | Util.writeFile(authorName +"\n", output_file, true);
74 | System.out.println(authorName);
75 | System.out.println(authorDir);
76 |
77 | BufferedReader br = null;
78 | String line = "";
79 | List test_cpp_paths = Util.listCPPFiles(authorDir);
80 |
81 | for(int j=0; j < test_cpp_paths.size();j++ )
82 | {
83 | double avg_ratio=0;
84 | double ratio =0;
85 |
86 | String file1 = Util.readFile(test_cpp_paths.get(j).toString());
87 | Util.writeFile(test_cpp_paths.get(j).toString() +"\n", output_file, true);
88 | for(int k=0; k < test_cpp_paths.size();k++ )
89 | {
90 | if(j!=k){
91 | String file2 = Util.readFile(test_cpp_paths.get(k).toString());
92 | int distance =computeDistance(file1, file2);
93 | if(file1.length() <= file2.length()){
94 | ratio = distance/(double)((Integer)file2.length());
95 | }
96 | if(file2.length() < file1.length()){
97 | ratio = distance/(double)((Integer)file1.length());
98 | }
99 |
100 | Util.writeFile("File1 length: "+file1.length() +" " +
101 | "File2 length:"+file2.length()+" ", output_file, true);
102 | Util.writeFile("distance: "+Integer.toString(distance)+
103 | " "+"ratio: "+ Double.toString(ratio) + "\n" ,output_file, true);
104 | avg_ratio =avg_ratio+ratio;
105 | }}
106 | avg_ratio = avg_ratio/(double)((Integer)(test_cpp_paths.size()-1));
107 | Util.writeFile("average ratio of file: "+avg_ratio +"\n", output_file, true);
108 | System.out.println("average ratio of file: "+avg_ratio );
109 |
110 | author_ratio=author_ratio+avg_ratio;
111 | }
112 | author_ratio=author_ratio/(double)((Integer)(test_cpp_paths.size()));
113 | System.out.println("average ratio of author: "+author_ratio);
114 |
115 | Util.writeFile("average ratio of author: "+author_ratio +"\n", output_file, true);
116 |
117 | }*/
118 |
119 | List all_cpp_paths = Util.listCPPFiles(parentDir);
120 | String authorName2;
121 | double avg_ratio=0;
122 |
123 | for (int i =0; i< all_cpp_paths.size(); i++)
124 | {
125 | //authorname is directoryname - 1 because Andrew put an extra 0 at the end of the authorname
126 | File newFile = new File(all_cpp_paths.get(i).toString());
127 | authorName2 = newFile.getParentFile().getName().toString();
128 | Util.writeFile(authorName2+":"+newFile.getName().toString() +"\n", output_file, true);
129 | System.out.println(authorName2);
130 |
131 | BufferedReader br = null;
132 |
133 | for(int j=0; j < all_cpp_paths.size();j++ )
134 | {
135 | File newFile1 = new File(all_cpp_paths.get(j).toString());
136 | String authorName3 = newFile1.getParentFile().getName().toString();
137 |
138 | double ratio =0;
139 | if(!authorName2.equals(authorName3)){
140 | String file1 = Util.readFile(all_cpp_paths.get(i).toString());
141 |
142 | String file2 = Util.readFile(all_cpp_paths.get(j).toString());
143 | int distance =computeDistance(file1, file2);
144 | if(file1.length() <= file2.length()){
145 | ratio = distance/(double)((Integer)file2.length());
146 | }
147 | if(file2.length() < file1.length()){
148 | ratio = distance/(double)((Integer)file1.length());
149 | }
150 |
151 | /* Util.writeFile("File1 length: "+file1.length() +" " +
152 | "File2 length:"+file2.length()+" ", output_file, true);
153 | Util.writeFile("distance: "+Integer.toString(distance)+
154 | " "+"ratio: "+ Double.toString(ratio) + "\n" ,output_file, true);*/
155 | // Util.writeFile( Double.toString(ratio) + ", " ,output_file, true);
156 | avg_ratio =avg_ratio+ratio;
157 | }}
158 | avg_ratio = avg_ratio/(double)((Integer)(all_cpp_paths.size()-9));
159 | Util.writeFile("\n Average distance to all other files: "+avg_ratio +"\n", output_file, true);
160 | System.out.println("Average distance to all other files: "+avg_ratio );
161 | }
162 |
163 | }
164 | }
--------------------------------------------------------------------------------
/SCAA/src/MergeArffFiles.java:
--------------------------------------------------------------------------------
1 | import java.io.BufferedReader;
2 | import java.io.DataInputStream;
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.io.InputStream;
8 | import java.io.InputStreamReader;
9 | import java.nio.charset.Charset;
10 | import java.util.Scanner;
11 |
12 | import org.apache.commons.io.FileUtils;
13 |
14 | /**
15 | * Appends two arff files where each feature vector contains the same ID.
16 | * This can be used to combine extracted features with features extracted from JStylo
17 | * (e.g., Writeprints Limited, saved to ARFF files in the Analysis tab).
18 | *
19 | *
20 | * @author Aylin Caliskan-Islam (ac993@drexel.edu)
21 | */
22 | public class MergeArffFiles {
23 |
24 | //after @data, if the first csv element is the same as file2's first csv element,
25 | //append file2's that line to file1 and move
26 | public static void main(String[] args) throws Exception{
27 | // Merge driver for two hard-coded ARFF files. Large parts of the row-matching loop are missing from this listing, so only the visible steps are annotated.
28 |
29 | for(int numberFiles = 1; numberFiles <2; numberFiles++){ // body runs once: numberFiles only takes the value 1
30 |
31 | String word = "@data";
32 | // Paths of the first input, the second input and the merged output (machine-specific absolute paths).
33 | String file1 ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/UsenixArffs/62Authors/"
34 |
35 | + "62authors14FilesOnlyUsenixFeatures.arff";
36 |
37 | String file2 ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/UsenixArffs/62Authors/"
38 |
39 | + "62authors14FilesAndrewFeatures.arff";
40 |
41 | String outputArffName ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/UsenixArffs/62Authors/"
42 |
43 | + "62authors14FilesUsenixAndrewFeatures.arff";
44 |
45 | /* String file1_tosort ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/bigramArffs/2014/"
46 | + "9BigExperiment250_2014FS9Andrew.arff";
47 | String file1_sorted ="/Users/Aylin/Desktop/Drexel/2014/ARLInternship/SCAAarffs/bigramArffs/2014/"
48 | + "9BigExperiment250_2014FS9Andrew_sorted.arff ";
49 | Util.AlphabeticallySortLinesOfTextInFile(file1_tosort, file1_sorted);
50 | }*/
51 | // grepLineNumber (defined later in this class) returns the 1-based number of the first line equal to "@data", or -1 when no line matches.
52 | int atDataLineNumberFile1 = MergeArffFiles.grepLineNumber(file1, word);
53 | int atDataLineNumberFile2 = MergeArffFiles.grepLineNumber(file2, word);
54 |
55 | //fast copy attributes
56 | /* File file = new File(file2);
57 | FileReader fileReader = new FileReader(file);
58 | BufferedReader bufferedReader = new BufferedReader(fileReader);
59 | StringBuffer stringBuffer = new StringBuffer();
60 | String line;
61 | while ((line = bufferedReader.readLine()) != "@data") {
62 | stringBuffer.append(line);
63 | stringBuffer.append("\n");
64 | Util.writeFile(line + "\n", outputArffName, true);
65 | }
66 | fileReader.close();*/
67 |
68 | //write the feature names in order from both files
69 | /* for(int firstFileAttributes=1; firstFileAttributes =file2LineNumberStart; j--)
90 |
91 | //for normal case in ascending order
92 | // for(int j=file2LineNumberStart; j <= atDataLineNumberFile2+numberOfInstances; j++)
93 | {
94 | System.out.println(j);
95 | if (instID.equals(MergeArffFiles.getInstanceID(file2, j)))
96 | {
97 |
98 | String firstPart = getInstance(file1, i);
99 | String secondPart = getInstanceVector(file2, j);
100 | System.out.println(firstPart);
101 |
102 | final Scanner scanner = new Scanner(outputArffName); // NOTE(review): Scanner(String) scans the text of the path itself, not the file at that path - confirm intent
103 | while (scanner.hasNextLine()) {
104 | final String lineFromFile = scanner.nextLine();
105 | if(lineFromFile.equals(firstPart)==false) {
106 | Util.writeFile( firstPart+ "," +secondPart + "\n", outputArffName, true);
107 | System.out.println(j);
108 | }
109 | }
110 |
111 |
112 | //Use this if the second file is in descending order
113 | // if(j= atDataLineNumberFile2 +9){
131 |
132 | file2LineNumberStart= j+1;
133 |
134 | j = atDataLineNumberFile2+numberOfInstances;
135 | }
136 | }*/
137 | }
138 | }
139 | } }
140 |
141 | }
142 | public static String getInstanceID(String file, int lineNumber) throws IOException
143 | {
144 | //will give an error if there is onl
145 | String line = MergeArffFiles.readSpecificLineNumber(file, lineNumber);
146 | String arr[] = line.split(",", 2);
147 | String firstWord = arr[0];
148 | return firstWord;
149 | }
150 |
151 | public static String getInstance(String file, int lineNumber) throws IOException
152 | {
153 | //will give an error if there is onl
154 | String line = MergeArffFiles.readSpecificLineNumber(file, lineNumber);
155 | String arr[] = line.split(" ", 1);
156 | String firstWord = arr[0];
157 | return firstWord;
158 | }
159 |
160 |
161 | public static String getInstanceVector(String file, int lineNumber) throws IOException
162 | {
163 |
164 | String line = MergeArffFiles.readSpecificLineNumber(file, lineNumber);
165 | String arr[] = line.split(",", 2);
166 |
167 | // String firstWord = arr[0];
168 | String theRest = arr[1];
169 | return theRest;
170 | }
171 |
172 |
173 | public static String readSpecificLineNumber (String file, int lineNumber) throws IOException
174 | {
175 | String lineString = (String)FileUtils.readLines(new File(file)).get(lineNumber-1);
176 |
177 | return lineString;
178 | }
179 |
180 |
181 | /** Returns the 1-based number of the first line equal to word, or -1 if none; the reader is always closed. */
182 | public static int grepLineNumber(String file, String word) throws Exception {
183 |     BufferedReader buf = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
184 |     try {
185 |         String line;
186 |         int lineNumber = 0;
187 |         while ((line = buf.readLine()) != null) {
188 |             lineNumber++;
189 |             if (word.equals(line)) return lineNumber;
190 |         }
191 |         return -1;
192 |     } finally { buf.close(); } // previously the stream leaked on both the match and the no-match path
193 | }
194 |
195 | }
196 |
--------------------------------------------------------------------------------
/SCAA/src/MergeArffFilesNew.java:
--------------------------------------------------------------------------------
1 | import java.io.BufferedReader;
2 | import java.io.DataInputStream;
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.io.InputStream;
8 | import java.io.InputStreamReader;
9 | import java.io.Reader;
10 | import java.nio.charset.Charset;
11 | import java.util.Enumeration;
12 | import java.util.HashSet;
13 | import java.util.Scanner;
14 | import java.util.Set;
15 | import java.util.regex.Matcher;
16 | import java.util.regex.Pattern;
17 |
18 | import org.apache.commons.io.FileUtils;
19 |
20 | import weka.core.Instances;
21 |
22 | /**
23 | * Appends two arff files where each feature vector contains the same ID.
24 | * This can be used to combine features at different times.
25 | * The two files can contain the same features.
26 | * It does a right join based on the first file. The resulting file would only have features
27 | * in the first file that also existed in the second file.
28 | *
29 | *
30 | * @author Aylin Caliskan-Islam (aylinc@princeton.edu)
31 | */
32 | public class MergeArffFilesNew {
33 |
34 | //Find the intersecting userIDs and merge all the features and instances to a new file
35 | //file2's instances and features are appended to file1
36 | public static void main(String[] args) throws Exception{
37 |
38 |
39 | String file1 ="/Users/Aylin/Desktop/Princeton/BAA/arffs/"
40 |
41 | + "C_62Authors14files_decompiledNEW.arff";
42 |
43 | String file2 ="/Users/Aylin/Desktop/Princeton/BAA/arffs/"
44 |
45 | // + "merged/C_62Authors14files_original_C++.arff";
46 | + "62authors14FilesUsenixAndrewFeatures.arff";
47 |
48 |
49 | String outputArffName ="/Users/Aylin/Desktop/Princeton/BAA/arffs/merged/"
50 |
51 | + "C_62Authors14files_decompiledPlusOriginal.arff";
52 |
53 |
54 |
55 |
56 |
57 | Util.writeFile("@relation " + file1+file2+"\n" +"\n" , outputArffName, true);
58 |
59 | // Read all the instances in the files
60 | Instances instances = new Instances(new FileReader(file1));
61 | Instances instances2 = new Instances(new FileReader(file2));
62 |
63 | for (int att=0; att < instances.numAttributes(); att++)
64 | // for (int att=0; att < 50; att++)
65 | { // System.out.println("instance no:"+att+" "+instances.attribute(att).name());
66 | String type="";
67 | String attValues="";
68 | String name=instances.attribute(att).name();
69 | name.replace("$", "dollarsign");
70 | String arr[] = name.split("\n", 10);
71 | if(arr.length>1){
72 | name="";
73 | int splits = arr.length;
74 | for(int i =0; i1){
128 | name="";
129 | int splits = arr.length;
130 | for(int i =0; i"+"\n", problemSetFilename, true);
31 | Util.writeFile("\t" + ""+"\n", problemSetFilename, true);
32 | for(int i=0; i< authorName.length; i++)
33 | {
34 |
35 | Util.writeFile("\t"+"\t"+ ""+"\n", problemSetFilename, true);
36 | List test_cpp_paths = Util.listCPPFiles(test_dir + authorName[i] + "/");
37 | // System.out.println(test_cpp_paths);
38 | for(int j=0; j < test_cpp_paths.size();j++ )
39 | {
40 | File fileCPP = new File(test_cpp_paths.get(j).toString());
41 | String fileName = fileCPP.getName();
42 | Util.writeFile("\t"+"\t"+"\t"+""
43 | + test_cpp_paths.get(j).toString() + ""
44 | + "\n", problemSetFilename, true);
45 | }
46 | Util.writeFile("\t"+"\t"+ ""+ "\n", problemSetFilename, true);
47 |
48 | }
49 | Util.writeFile("\t"+ ""+ "\n", problemSetFilename, true);
50 | Util.writeFile("\t"+ ""+ "\n", problemSetFilename, true);
51 | Util.writeFile("\t"+ ""+ "\n", problemSetFilename, true);
52 | Util.writeFile(""+ "\n", problemSetFilename, true);
53 |
54 |
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/SCAA/src/RelaxedEvaluation.java:
--------------------------------------------------------------------------------
1 | import weka.classifiers.CostMatrix;
2 | import weka.classifiers.Evaluation;
3 | import weka.core.Instance;
4 | import weka.core.Instances;
5 | import weka.core.Utils;
6 | import java.util.ArrayList;
7 | import java.util.Comparator;
8 | import java.util.List;
9 | import java.util.SortedMap;
10 | import java.util.TreeMap;
11 | //Author, Ariel Stolerman, code taken from the doppelganger finder project
12 |
13 | public class RelaxedEvaluation extends Evaluation {
14 | protected int relaxParam;
15 |
16 | /**
17 | * Initializes all the counters for the evaluation.
18 | * Use useNoPriors()
if the dataset is the test set and you
19 | * can't initialize with the priors from the training set via
20 | * setPriors(Instances)
.
21 | *
22 | * @param data set of training instances, to get some header
23 | * information and prior class distribution information
24 | * @throws Exception if the class is not defined
25 | * @see #useNoPriors()
26 | * @see #setPriors(Instances)
27 | */
28 | public RelaxedEvaluation(Instances data, int relaxParam) throws Exception {
29 | super(data);
30 | this.relaxParam = relaxParam;
31 | }
32 |
33 | /**
34 | * Initializes all the counters for the evaluation and also takes a
35 | * cost matrix as parameter.
36 | * Use useNoPriors()
if the dataset is the test set and you
37 | * can't initialize with the priors from the training set via
38 | * setPriors(Instances)
.
39 | *
40 | * @param data set of training instances, to get some header
41 | * information and prior class distribution information
42 | * @param costMatrix the cost matrix---if null, default costs will be used
43 | * @throws Exception if cost matrix is not compatible with
44 | * data, the class is not defined or the class is numeric
45 | * @see #useNoPriors()
46 | * @see #setPriors(Instances)
47 | */
48 | public RelaxedEvaluation(Instances data, CostMatrix costMatrix, int relaxParam)
49 | throws Exception {
50 | super(data, costMatrix);
51 | this.relaxParam = relaxParam;
52 | }
53 |
54 | /**
55 | * Compares Doubles by ascending order
56 | */
57 | static Comparator descendingDouble = new Comparator() {
58 | @Override
59 | public int compare(Double arg0, Double arg1) {
60 | return -1 * arg0.compareTo(arg1);
61 | }
62 | };
63 |
64 | static Comparator descendingInteger = new Comparator() {
65 | @Override
66 | public int compare(Integer arg0, Integer arg1) {
67 | return -1 * arg0.compareTo(arg1);
68 | }
69 | };
70 |
71 | /**
72 | * Updates all the statistics about a classifiers performance for
73 | * the current test instance.
74 | *
75 | * @param predictedDistribution the probabilities assigned to
76 | * each class
77 | * @param instance the instance to be classified
78 | * @throws Exception if the class of the instance is not
79 | * set
80 | */
81 | protected void updateStatsForClassifier(double [] predictedDistribution,
82 | Instance instance)
83 | throws Exception {
84 |
85 | int actualClass = (int)instance.classValue();
86 |
87 | if (!instance.classIsMissing()) {
88 | updateMargins(predictedDistribution, actualClass, instance.weight());
89 |
90 | // collect all predictions and their corresponding classes
91 | SortedMap predToClass =
92 | new TreeMap(descendingDouble);
93 | for(int i = 0; i < m_NumClasses; i++) {
94 | predToClass.put(predictedDistribution[i], i);
95 | }
96 | List candidateClasses = new ArrayList(relaxParam);
97 | int count = 0;
98 | for (Double pred: predToClass.keySet())
99 | {
100 | candidateClasses.add(predToClass.get(pred));
101 | count++;
102 | if (count == relaxParam)
103 | break;
104 | }
105 | // check if relaxed set of candidates contains actual, if so -
106 | // attribute that prediction
107 | // otherwise - take the to pprediction
108 | int predictedClass = -1;
109 | if (candidateClasses.contains(actualClass))
110 | predictedClass = actualClass;
111 | else
112 | predictedClass = candidateClasses.get(0);
113 |
114 | /*
115 | // Determine the predicted class (doesn't detect multiple
116 | // classifications)
117 | int predictedClass = -1;
118 | double bestProb = 0.0;
119 | for(int i = 0; i < m_NumClasses; i++) {
120 | if (predictedDistribution[i] > bestProb) {
121 | predictedClass = i;
122 | bestProb = predictedDistribution[i];
123 | }
124 | }
125 | */
126 |
127 | m_WithClass += instance.weight();
128 |
129 | // Determine misclassification cost
130 | if (m_CostMatrix != null) {
131 | if (predictedClass < 0) {
132 | // For missing predictions, we assume the worst possible cost.
133 | // This is pretty harsh.
134 | // Perhaps we could take the negative of the cost of a correct
135 | // prediction (-m_CostMatrix.getElement(actualClass,actualClass)),
136 | // although often this will be zero
137 | m_TotalCost += instance.weight()
138 | * m_CostMatrix.getMaxCost(actualClass, instance);
139 | } else {
140 | m_TotalCost += instance.weight()
141 | * m_CostMatrix.getElement(actualClass, predictedClass,
142 | instance);
143 | }
144 | }
145 |
146 | // Update counts when no class was predicted
147 | if (predictedClass < 0) {
148 | m_Unclassified += instance.weight();
149 | return;
150 | }
151 |
152 | double predictedProb = Math.max(MIN_SF_PROB,
153 | predictedDistribution[actualClass]);
154 | double priorProb = Math.max(MIN_SF_PROB,
155 | m_ClassPriors[actualClass]
156 | / m_ClassPriorsSum);
157 | if (predictedProb >= priorProb) {
158 | m_SumKBInfo += (Utils.log2(predictedProb) -
159 | Utils.log2(priorProb))
160 | * instance.weight();
161 | } else {
162 | m_SumKBInfo -= (Utils.log2(1.0-predictedProb) -
163 | Utils.log2(1.0-priorProb))
164 | * instance.weight();
165 | }
166 |
167 | m_SumSchemeEntropy -= Utils.log2(predictedProb) * instance.weight();
168 | m_SumPriorEntropy -= Utils.log2(priorProb) * instance.weight();
169 |
170 | updateNumericScores(predictedDistribution,
171 | makeDistribution(instance.classValue()),
172 | instance.weight());
173 |
174 | // Update other stats
175 | m_ConfusionMatrix[actualClass][predictedClass] += instance.weight();
176 | if (predictedClass != actualClass) {
177 | m_Incorrect += instance.weight();
178 | } else {
179 | m_Correct += instance.weight();
180 | }
181 | } else {
182 | m_MissingClass += instance.weight();
183 | }
184 | }
185 | }
186 |
--------------------------------------------------------------------------------
/SCAA/src/RemoveComments.java:
--------------------------------------------------------------------------------
1 | import java.io.IOException;
2 | import java.util.List;
3 |
4 |
5 | public class RemoveComments {
6 |
7 | public static void main(String[] args) throws IOException
8 | {
9 | String test = "githubManySmallSnippets/";
10 | List test_file_paths = Util.listCPPFiles(test); //use this for preprocessing
11 | for(int i=0; i< test_file_paths.size(); i++)
12 | {
13 | String fileName = test_file_paths.get(i).toString();
14 | System.out.println(fileName);
15 | String sourceCode = Util.readFile(fileName);
16 | // System.out.println(sourceCode.replaceAll("(?:/\\*(?:[^*]|(?:\\*+[^*/]))*\\*+/)|(?://.*)",""));
17 | //does not catch Gleb.kalachev's comments, removed them manually. Has a lot of commented code.
18 | Util.writeFile(sourceCode.replaceAll("(?:/\\*(?:[^*]|(?:\\*+[^*/]))*\\*+/)|(?://.*)",""), fileName, false);
19 | }}
20 | public static void removeComments(String test) throws IOException
21 | {
22 | List test_file_paths = Util.listCPPFiles(test); //use this for preprocessing
23 | for(int i=0; i< test_file_paths.size(); i++)
24 | {
25 | String fileName = test_file_paths.get(i).toString();
26 | System.out.println(fileName);
27 | String sourceCode = Util.readFile(fileName);
28 | // System.out.println(sourceCode.replaceAll("(?:/\\*(?:[^*]|(?:\\*+[^*/]))*\\*+/)|(?://.*)",""));
29 | //does not catch Gleb.kalachev's comments, removed them manually. Has a lot of commented code.
30 | Util.writeFile(sourceCode.replaceAll("(?:/\\*(?:[^*]|(?:\\*+[^*/]))*\\*+/)|(?://.*)",""), fileName, false);
31 | }
32 |
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/SCAA/src/WholeWordIndexFinder.java:
--------------------------------------------------------------------------------
1 | import java.util.ArrayList;
2 | import java.util.List;
3 | import java.util.regex.Matcher;
4 | import java.util.regex.Pattern;
5 |
6 | // Taken from http://whyjava.wordpress.com/2010/05/04/finding-all-the-indexes-of-a-whole-word-in-a-given-string-using-java/
7 | /**
8 |  * Locates whole-word occurrences of a keyword inside a fixed search string.
9 |  */
10 | public class WholeWordIndexFinder {
11 |
12 |     private String searchString;
13 |
14 |     /** @param searchString text that later keyword searches run against */
15 |     public WholeWordIndexFinder(String searchString) {
16 |         this.searchString = searchString;
17 |     }
18 |
19 |     /**
20 |      * Finds every match of the keyword delimited by word boundaries ({@code \b}).
21 |      * The keyword is spliced into the regular expression unescaped, so any regex
22 |      * metacharacters in it are interpreted as regex syntax.
23 |      *
24 |      * @param keyword word (or regex fragment) to look for
25 |      * @return one wrapper per match, in order of occurrence, holding the offset of
26 |      *         the first matched character and the offset just past the last one
27 |      */
28 |     public List<IndexWrapper> findIndexesForKeyword(String keyword) {
29 |         Pattern pattern = Pattern.compile("\\b"+keyword+"\\b");
30 |         Matcher matcher = pattern.matcher(searchString);
31 |
32 |         List<IndexWrapper> wrappers = new ArrayList<IndexWrapper>();
33 |         while (matcher.find()) {
34 |             wrappers.add(new IndexWrapper(matcher.start(), matcher.end()));
35 |         }
36 |         return wrappers;
37 |     }
38 |
39 |     /** Small demo: looks up "Condition" in a sample AST line and splits off a leading id. */
40 |     public static void main(String[] args) {
41 |         WholeWordIndexFinder finder = new WholeWordIndexFinder(
42 |                 "2 (FunctionDef(((CompoundStatement((ForStatement((ForInit((IdentifierDeclStatement(IdentifierDecl)))))((Condition((RelationalExpression(x)(10)))))((IncDecOp((x))((++))))");
43 |         List<IndexWrapper> indexes = finder.findIndexesForKeyword("Condition");
44 |         String report = "Indexes found "+indexes.size();
45 |         if (!indexes.isEmpty()) {
46 |             report += " keyword found at index : " +indexes.get(0).getStart();
47 |         }
48 |         System.out.println(report);
49 |
50 |         //input should be the dep file, do this for each line
51 |         //take the last line that a function id appears in that has the whole depth structure
52 |         String input = "1111 t (flag)";
53 |         //take the function id in the beginning of the line.
54 |         // Without a tab in the line, indexOf returns -1 and substring would throw; use the whole line then.
55 |         int tab = input.indexOf('\t');
56 |         String firstWord = tab < 0 ? input : input.substring(0, tab);
57 |         System.out.println(firstWord);
58 |     }
59 | }
--------------------------------------------------------------------------------
/SCAA/weka.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/calaylin/CodeStylometry/e170576d49d6a5c4c0e345b82230e9024acd9db4/SCAA/weka.jar
--------------------------------------------------------------------------------