├── bestbuy ├── bestbuy.py └── readme ├── emc ├── data │ ├── SiteLocations.csv │ ├── SiteLocations_with_more_sites.csv │ ├── SubmissionZerosExceptNAs.csv │ ├── TrainingData (copy).csv │ ├── TrainingData.csv │ └── result.csv ├── feature_extraction.py ├── log ├── main.py ├── preprocess.py ├── readme ├── regression.py └── utilities.py ├── fb_suggest_missing_link ├── candidate.py ├── main.py ├── rank.py ├── readme ├── utilities.py └── validation.py ├── insult_detect ├── insult_detect.py └── readme ├── kicked_car ├── classification.py ├── data │ ├── .~lock.pos.csv# │ ├── .~lock.test.csv# │ ├── .~lock.training.csv# │ ├── feature_idx.csv │ ├── idx │ ├── log │ ├── log2 │ ├── res.csv │ ├── test.csv │ └── training.csv ├── feature_extraction.py ├── main.py ├── preprocess.py ├── readme └── utilities.py ├── music_rating ├── music_rating.py └── readme ├── photo_quality_prediction ├── classification.py ├── data │ ├── result.csv │ ├── statistics │ ├── test.csv │ └── training.csv ├── feature_selection.py ├── main.py ├── readme └── utilities.py ├── readme.md └── titanic ├── data ├── data.csv ├── data2.csv ├── error.csv ├── result.csv ├── result2.csv ├── result3.csv ├── result4.csv ├── test.csv ├── test2.csv └── train.csv ├── logistic_regression.py ├── readme └── titanic.py /bestbuy/bestbuy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | import re 4 | import operator 5 | import Levenshtein 6 | 7 | __train__ = './data/train.csv' 8 | __test__ = './data/test.csv' 9 | 10 | def read_data(path, cols,ignore_header=True): 11 | csv_file_object = csv.reader(open(path, 'rb')) 12 | if ignore_header: 13 | header = csv_file_object.next() 14 | x = [] 15 | for row in csv_file_object: 16 | r = [] 17 | for col in cols: 18 | r.append(row[col]) 19 | x.append(r) 20 | return x 21 | 22 | def string_normalize(s): 23 | res = s.lower() 24 | res = res.replace(' ', '') 25 | res = ''.join(c for c in res if c.isalnum()) 26 | return res 27 | 28 | def preprocess_data(raw_data, col): 29 | for row in raw_data: 30 | row[col] = string_normalize(row[col]) 31 | 32 | def best_match_key(keys, query): 33 | similarity = 0 34 | best_key = None 35 | for key in keys: 36 | sim = Levenshtein.ratio(key, query) 37 | if sim > similarity: 38 | similarity = sim 39 | best_key = key 40 | return (best_key, similarity) 41 | 42 | def create_match(x, thresh=.85): 43 | match = {} 44 | for row in x: 45 | matched_key = None 46 | sku, query = row 47 | # Fuzzy matching. 48 | best_key, similarity = best_match_key(match.keys(), query) 49 | if similarity > thresh: 50 | matched_key = best_key 51 | else: 52 | match[query] = {sku : 1} 53 | if matched_key is None: 54 | continue 55 | if not match[matched_key].has_key(sku): 56 | match[matched_key][sku] = 1 57 | else: 58 | match[matched_key][sku] += 1 59 | 60 | # Sorts the dictionary. 
61 |     for key in match.keys():
62 |         tmp_dict = match[key]
63 |         tmp_dict = sorted(tmp_dict.iteritems(), key=operator.itemgetter(1))
64 |         tmp_dict.reverse()
65 |         match[key] = tmp_dict
66 |     return match
67 | 
68 | def get_top(x):
69 |     sku_count_dict = {}
70 |     for row in x:
71 |         if not sku_count_dict.has_key(row[0]):
72 |             sku_count_dict[row[0]] = 1
73 |         else:
74 |             sku_count_dict[row[0]] += 1
75 |     sorted_dict = sorted(sku_count_dict.iteritems(), key=operator.itemgetter(1))
76 |     sorted_dict.reverse()
77 | 
78 |     res = []
79 |     for i in range(len(sorted_dict)):
80 |         res.append(sorted_dict[i][0])
81 |     return res
82 | 
83 | def predict(match, top, query, k, thresh=.7):
84 |     res = []
85 |     matched_key, similarity = best_match_key(match.keys(), query)
86 |     # if similarity < 0.8:
87 |     #     print 'matched_key = %s, query = %s, sim = %s' \
88 |     #         % (matched_key, query, similarity)
89 |     if similarity > thresh:
90 |         for i in range(min(k, len(match[matched_key]))):
91 |             res.append(match[matched_key][i][0])
92 |     if len(res) < k:
93 |         for i in range(len(top)):
94 |             if top[i] not in res:
95 |                 res.append(top[i])
96 |             if len(res) == k:
97 |                 break
98 |     return res
99 | 
100 | if __name__ == '__main__':
101 |     # Reads training data.
102 |     print 'Reading and preprocessing data...'
103 |     x = read_data(__train__, [1, 3])
104 |     preprocess_data(x, 1)
105 | 
106 |     # Divides into training and cv.
107 |     l = int(len(x) * 1)
108 |     x_cv = x[l - 10 : :]
109 |     x = x[0 : l]
110 | 
111 |     top = get_top(x)
112 | 
113 |     # Predicts on cv.
114 |     print 'Predicting...'
115 |     k = 5
116 |     # thresh_match = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
117 |     # thresh_predict = [0, 0.2, 0.4, 0.6, 0.7, 0.8]
118 |     thresh_match = [0.75]
119 |     thresh_predict = [0]
120 |     best_match = None
121 |     thresh1 = 0
122 |     thresh2 = 0
123 |     accuracy = -1
124 |     for t1 in thresh_match:
125 |         for t2 in thresh_predict:
126 |             match = create_match(x, t1)
127 |             p_cv = []
128 |             for row in x_cv:
129 |                 q = row[1]
130 |                 p_cv.append(predict(match, top, q, k, t2))
131 |             correct = 0.
132 |             for i in range(len(x_cv)):
133 |                 if x_cv[i][0] in p_cv[i]:
134 |                     correct += 1.
135 |             ac = correct / len(x_cv)
136 |             print 't1 = %f, t2 = %f, accuracy = %f' % (t1, t2, ac)
137 |             if ac > accuracy:
138 |                 accuracy = ac
139 |                 thresh1 = t1
140 |                 thresh2 = t2
141 |                 best_match = match
142 |     print 'thresh1 = %f, thresh2 = %f, accuracy = %f' \
143 |         % (thresh1, thresh2, accuracy)
144 | 
145 |     # Reads test set.
146 |     x_test = read_data(__test__, [2])
147 |     preprocess_data(x_test, 0)
148 | 
149 |     # Predicts.
150 |     res = []
151 |     k = 5
152 |     for row in x_test:
153 |         q = row[0]
154 |         res.append(predict(best_match, top, q, k, thresh2))
155 | 
156 |     open_file_object = csv.writer(open("./data/result.csv", "wb"))
157 |     open_file_object.writerow(['sku'])
158 |     for p in res:
159 |         open_file_object.writerow([' '.join(p)])
160 | 
--------------------------------------------------------------------------------
/bestbuy/readme:
--------------------------------------------------------------------------------
1 | This code is for Bestbuy - Predict which Xbox game a visitor will be most interested in based on their search query (https://www.kaggle.com/c/acm-sf-chapter-hackathon-small).
2 | The approach is pretty straightforward: the basic idea is to match a query with a game (so I ignore all other features, like time and game name). The initial approach is to create a map whose key is the user's query and whose value is another map, recording which games users clicked for that query and how frequently; a minimal sketch of this structure is shown below.
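The sketch assumes the python-Levenshtein package (already used by bestbuy.py) and made-up click data:

import Levenshtein

clicks = [('2078113', 'halo reach'), ('2078113', 'halo reech'),
          ('9854804', 'gears of war')]  # (sku, query) pairs, illustrative only
match = {}
for sku, query in clicks:
    best_key, best_sim = None, 0.0
    for key in match:
        sim = Levenshtein.ratio(key, query)
        if sim > best_sim:
            best_key, best_sim = key, sim
    if best_sim > 0.85:
        # close enough to an existing query: count the click there
        match[best_key][sku] = match[best_key].get(sku, 0) + 1
    else:
        # otherwise start a new canonical query
        match[query] = {sku: 1}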
After that, when predicting the game, we simply choose the most frequently clicked games for this query, and if there's less than 5 games, I use the most popular games among all queries to fill the gap. 3 | One optimization I used is to correct users' queries. Since there are lots of typos or short forms, I use Levenshtein to calculate two queries similarity, and if it is above a threshold, the algorithm believe they are the same query. The threshold is chosen by testing on cross validation set. 4 | Finally, I got around 74.3% and the leader is 78.9%. 5 | -------------------------------------------------------------------------------- /emc/data/SiteLocations.csv: -------------------------------------------------------------------------------- 1 | "SITE_ID","LATITUDE","LONGITUDE" 2 | 1,41.6709918952829,-87.7324568962847 3 | 32,41.755832412403,-87.545349670582 4 | 50,41.7075695897648,-87.5685738570845 5 | 57,41.9128621248178,-87.7227234452095 6 | 64,41.7907868783739,-87.6016464917605 7 | 1003,41.9843323270383,-87.7920016971163 8 | 1018,41.773889,-87.815278 9 | 1601,41.6681203371799,-87.9905696935943 10 | 2001,41.6621094347378,-87.6964665157993 11 | 4002,41.8552431328191,-87.7524696987103 12 | 4101,42.053333,-88.108056 13 | 6004,41.8721168410596,-87.8290802510295 14 | 6006,41.8728971999587,-87.8258724913966 15 | 8003,41.631389,-87.568056 16 | -------------------------------------------------------------------------------- /emc/data/SiteLocations_with_more_sites.csv: -------------------------------------------------------------------------------- 1 | "SITE_ID","LATITUDE","LONGITUDE" 2 | 1,41.6709918952829,-87.7324568962847 3 | 14,41.834243,-87.6238 4 | 22,41.6871654376343,-87.5393154841479 5 | 32,41.755832412403,-87.545349670582 6 | 50,41.7075695897648,-87.5685738570845 7 | 52,41.9654848301767,-87.7499280553202 8 | 57,41.9128621248178,-87.7227234452095 9 | 64,41.7907868783739,-87.6016464917605 10 | 76,41.7513999786378,-87.7134881520007 11 | 1003,41.9843323270383,-87.7920016971163 12 | 1018,41.773889,-87.815278 13 | 1601,41.6681203371799,-87.9905696935943 14 | 2001,41.6621094347378,-87.6964665157993 15 | 3301,41.7827660079251,-87.8053767946675 16 | 4002,41.8552431328191,-87.7524696987103 17 | 4101,42.053333,-88.108056 18 | 6004,41.8721168410596,-87.8290802510295 19 | 6005,41.8644264230095,-87.7489023825124 20 | 6006,41.8728971999587,-87.8258724913966 21 | 8003,41.631389,-87.568056 22 | -------------------------------------------------------------------------------- /emc/feature_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import preprocess 3 | import utilities 4 | 5 | def create_x_y(data, target, pos): 6 | span = 24 7 | x = [] 8 | y = [] 9 | for i in range(len(data)): 10 | if i % 192 < pos + span: 11 | continue 12 | chunk_id = data[i][1] 13 | hour = data[i][5] 14 | t = len(data[0]) - 39 + target 15 | 16 | features = [] 17 | prev_hour = 0 18 | for j in range(i - pos - span, i - pos): 19 | features.append(float(data[j][t])) 20 | if data[j][5] == hour: 21 | prev_hour = float(data[j][t]) 22 | 23 | features.append(prev_hour) 24 | 25 | # Binary hour features. 26 | for h in range(24): 27 | if h == int(hour): 28 | features.append(1) 29 | else: 30 | features.append(0) 31 | 32 | # Binary month features. 33 | month = int(data[i][3]) 34 | for m in range(1, 13): 35 | if m == month: 36 | features.append(1) 37 | else: 38 | features.append(0) 39 | 40 | # Weather features. 
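        # Columns 6-55 appear to hold the raw weather measurements; the two
        # loops below append them for the most recent available hour (i - pos)
        # and for the hour before that (i - pos - 1).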
41 | for k in range(6, 56): 42 | features.append(float(data[i - pos][k])) 43 | for k in range(6, 56): 44 | features.append(float(data[i - pos - 1][k])) 45 | 46 | x.append(features) 47 | 48 | y.append(float(data[i][t])) 49 | 50 | return x, y 51 | 52 | def get_features(chunk_id, weekday, hour, chunk_avg, hour_avg_by_chunk, 53 | weekday_avg_by_chunk, hour_avg, weekday_avg): 54 | avg = [0.0] * 39 55 | for chunk_id in chunk_avg.keys(): 56 | for i in range(len(avg)): 57 | avg[i] += chunk_avg[chunk_id][i] 58 | 59 | for i in range(len(avg)): 60 | avg[i] /= float(len(chunk_avg)) 61 | 62 | tmp = [] 63 | if chunk_id in chunk_avg: 64 | tmp.append(chunk_avg[chunk_id]) 65 | else: 66 | tmp.append(avg) 67 | # if weekday in weekday_avg_by_chunk[chunk_id]: 68 | # tmp.append(weekday_avg_by_chunk[chunk_id][weekday]) 69 | # else: 70 | # tmp.append(weekday_avg[weekday]) 71 | if chunk_id in chunk_avg and hour in hour_avg_by_chunk[chunk_id]: 72 | tmp.append(hour_avg_by_chunk[chunk_id][hour]) 73 | else: 74 | tmp.append(hour_avg[hour]) 75 | return tmp 76 | 77 | def get_avgs(data, chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, 78 | hour_avg, weekday_avg): 79 | res = [] 80 | for line in data: 81 | chunk_id = line[1] 82 | weekday = line[4] 83 | hour = line[5] 84 | 85 | tmp = get_features(chunk_id, weekday, hour, chunk_avg, 86 | hour_avg_by_chunk, weekday_avg_by_chunk, 87 | hour_avg, weekday_avg) 88 | res.append(tmp) 89 | return res 90 | 91 | def get_avg_maps(train_data): 92 | chunk_avg = utilities.get_chunk_avg(train_data) 93 | hour_avg = utilities.get_hour_avg(train_data) 94 | hour_avg_by_chunk = utilities.get_hour_avg_by_chunk(train_data) 95 | weekday_avg = utilities.get_weekday_avg(train_data) 96 | weekday_avg_by_chunk = utilities.get_weekday_avg_by_chunk(train_data) 97 | 98 | return (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, 99 | hour_avg, weekday_avg) 100 | 101 | def get_x_by_avg(train_data, cv_data, chunk_avg, hour_avg_by_chunk, 102 | weekday_avg_by_chunk, hour_avg, weekday_avg): 103 | x_train = get_avgs(train_data, chunk_avg, hour_avg_by_chunk, 104 | weekday_avg_by_chunk, hour_avg, weekday_avg) 105 | x_cv = get_avgs(cv_data, chunk_avg, hour_avg_by_chunk, 106 | weekday_avg_by_chunk, hour_avg, weekday_avg) 107 | return x_train, x_cv 108 | 109 | def get_x_y_by_target(x_train_all, x_cv_all, targets_train, targets_cv, index): 110 | x_train = [] 111 | y_train = [] 112 | for i in range(len(targets_train)): 113 | if not targets_train[i][index] == 'NA': 114 | tmp = [] 115 | for features in x_train_all[i]: 116 | tmp.append(features[index]) 117 | x_train.append(tmp) 118 | y_train.append(float(targets_train[i][index])) 119 | 120 | x_cv = [] 121 | y_cv = [] 122 | for i in range(len(targets_cv)): 123 | if not targets_cv[i][index] == 'NA': 124 | tmp = [] 125 | for features in x_cv_all[i]: 126 | tmp.append(features[index]) 127 | x_cv.append(tmp) 128 | y_cv.append(float(targets_cv[i][index])) 129 | 130 | return x_train, y_train, x_cv, y_cv 131 | 132 | if __name__ == '__main__': 133 | pass 134 | -------------------------------------------------------------------------------- /emc/log: -------------------------------------------------------------------------------- 1 | average by chunk 0.28652 2 | average by hour in chunk 0.27529 3 | 4 | LR with hour_avg and chunk_avg 0.29876 5 | 6 | LR with previous 24h for each pos,target 0.22850 7 | LR with previous 48h for each pos,target 0.22686 8 | 24h prev, hour mean 0.23569 9 | 10 | 24h prev, prev_our 0.22792 11 | 24h prev, prev_our, binary_hour 0.22166 12 | 24h prev, 
prev_our, binary_hour, in chunk 0.22147 13 | 24h prev, prev_our, binary_hour, binary_weekday, in chunk 0.22252 14 | 24h prev, prev_our, binary_hour, binary_month, in chunk 0.22105 15 | 48h prev, prev_our, binary_hour, binary_month, in chunk 0.22098 16 | 24h prev, prev_our, binary_hour, binary_month, binary_site, in chunk 0.22105 17 | 24h prev, prev_our, binary_hour, binary_month, in chunk, last_weather 0.21862 18 | 24h prev, prev_our, binary_hour, binary_month, in chunk, last_2_weather 0.21795 19 | 24h prev, prev_our, binary_hour, binary_month, in chunk, last_3_weather 0.21827 20 | 48h prev, prev_our, binary_hour, binary_month, in chunk, last_2_weather 0.21820 21 | -------------------------------------------------------------------------------- /emc/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import utilities 4 | import preprocess 5 | import feature_extraction 6 | import regression 7 | 8 | def time_series(training_file, submission_file, output_file): 9 | data = utilities.read_file(training_file, True) 10 | first_line = data[0] 11 | data = data[1 : :] 12 | data = preprocess.fill_NAs(data) 13 | 14 | (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, 15 | hour_avg, weekday_avg) = feature_extraction.get_avg_maps(data) 16 | 17 | clf_map = regression.linear_regression_2(data) 18 | 19 | print 'Filling submission file...' 20 | chunk_map = utilities.get_chunk_map(data, 1) 21 | sub_data = utilities.read_file(submission_file, True) 22 | 23 | positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72] 24 | for i in range(1, len(sub_data)): 25 | chunk_id = sub_data[i][1] 26 | hour = sub_data[i][3] 27 | pos = positions[(i - 1) % 10] 28 | for j in range(5, len(sub_data[i])): 29 | target = j - 5 30 | if sub_data[i][j] == '0': 31 | if not chunk_id in chunk_map: 32 | sub_data[i][j] = hour_avg[hour][target] 33 | else: 34 | data_in_chunk = chunk_map[chunk_id] 35 | start = len(data_in_chunk) - 24 36 | t = len(data_in_chunk[0]) - 39 + target 37 | features = [] 38 | prev_hour = 0 39 | for k in range(start, len(data_in_chunk)): 40 | features.append(float(data_in_chunk[k][t])) 41 | if data_in_chunk[k][5] == hour: 42 | prev_hour = float(data_in_chunk[k][t]) 43 | 44 | features.append(prev_hour) 45 | 46 | # Binary hour features. 47 | for h in range(24): 48 | if h == int(hour): 49 | features.append(1) 50 | else: 51 | features.append(0) 52 | 53 | # Binary month features. 54 | month = int(sub_data[i][4]) 55 | for m in range(1, 13): 56 | if m == month: 57 | features.append(1) 58 | else: 59 | features.append(0) 60 | 61 | # Weather features. 
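                    # Mirrors create_x_y in feature_extraction.py: append the
                    # weather columns (6-55) from the last two observed rows of
                    # the chunk.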
62 | tmp_length = len(data_in_chunk) 63 | for k in range(6, 56): 64 | features.append(float(data_in_chunk[tmp_length - 1][k])) 65 | for k in range(6, 56): 66 | features.append(float(data_in_chunk[tmp_length - 2][k])) 67 | 68 | sub_data[i][j] = \ 69 | clf_map[(target, pos)].predict([features])[0] 70 | 71 | utilities.write_file(output_file, sub_data) 72 | 73 | def avg(training_file, submission_file, output_file): 74 | data = utilities.read_file(training_file) 75 | 76 | train_data, cv_data = preprocess.get_train_cv_data_by_chunk(data) 77 | targets_train, targets_cv = preprocess.get_train_cv_targets( 78 | train_data, cv_data) 79 | 80 | (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, 81 | hour_avg, weekday_avg) = feature_extraction.get_avg_maps(train_data) 82 | 83 | x_train_all, x_cv_all = feature_extraction.get_x_by_avg( 84 | train_data, cv_data, chunk_avg, hour_avg_by_chunk, 85 | weekday_avg_by_chunk, hour_avg, weekday_avg) 86 | 87 | clfs = regression.linear_regression( 88 | x_train_all, x_cv_all, targets_train, targets_cv) 89 | clfs = regression.random_forest( 90 | x_train_all, x_cv_all, targets_train, targets_cv) 91 | 92 | print 'Filling submission file...' 93 | sub_data = utilities.read_file(submission_file, True) 94 | for i in range(1, len(sub_data)): 95 | chunk_id = sub_data[i][1] 96 | hour = sub_data[i][3] 97 | weekday = '' 98 | all_features = feature_extraction.get_features( 99 | chunk_id, weekday, hour, chunk_avg, hour_avg_by_chunk, 100 | weekday_avg_by_chunk, hour_avg, weekday_avg) 101 | 102 | for j in range(5, len(sub_data[i])): 103 | if sub_data[i][j] == '0': 104 | feature = [] 105 | for f in all_features: 106 | feature.append(f[j - 5]) 107 | sub_data[i][j] = clfs[j - 5].predict([feature])[0] 108 | 109 | utilities.write_file(output_file, sub_data) 110 | 111 | def baseline(training_file, submission_file, output_file): 112 | data = utilities.read_file(training_file) 113 | sub_data = utilities.read_file(submission_file, True) 114 | 115 | print 'Calculating hour averages...' 116 | hour_avg_by_chunk = utilities.get_hour_avg_by_chunk(data) 117 | hour_avg = utilities.get_hour_avg(data) 118 | 119 | print 'Filling submission file...' 120 | for i in range(1, len(sub_data)): 121 | chunk_id = sub_data[i][1] 122 | hour = sub_data[i][3] 123 | for j in range(5, len(sub_data[i])): 124 | if sub_data[i][j] == '0': 125 | if chunk_id in hour_avg_by_chunk: 126 | sub_data[i][j] = hour_avg_by_chunk[chunk_id][hour][j - 5] 127 | else: 128 | sub_data[i][j] = hour_avg[hour][j - 5] 129 | 130 | utilities.write_file(output_file, sub_data) 131 | 132 | if __name__ == '__main__': 133 | start_time = time.time() 134 | time_series('./data/TrainingData.csv', 135 | './data/SubmissionZerosExceptNAs.csv', 136 | './data/result.csv') 137 | print (time.time() - start_time) / 60.0, 'minutes' 138 | -------------------------------------------------------------------------------- /emc/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import utilities 3 | 4 | def translate_weekday(data): 5 | print 'Translating weekdays...' 6 | weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 7 | 'Friday', 'Saturday', 'Sunday'] 8 | for i in range(len(data)): 9 | for j in range(len(weekdays)): 10 | if data[i][4] == weekdays[j]: 11 | data[i][4] = j + 1 12 | return data 13 | 14 | def fill_NAs(data): 15 | print 'Filling NAs...' 
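    # Back-fill: if a column starts with NAs, copy the first observed value
    # into every row before it; then forward-fill any remaining NAs from the
    # previous row.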
16 |     for target in range(6, len(data[0])):
17 |         if data[0][target] == 'NA':
18 |             for i in range(len(data)):
19 |                 if not data[i][target] == 'NA':
20 |                     for j in range(0, i):
21 |                         data[j][target] = data[i][target]
22 |                     break
23 | 
24 |     for i in range(len(data)):
25 |         for j in range(6, len(data[0])):
26 |             if data[i][j] == 'NA':
27 |                 if i > 0 and not data[i - 1][j] == 'NA':
28 |                     data[i][j] = data[i - 1][j]
29 | 
30 |     return data
31 | 
32 | def get_train_cv_data_by_chunk(data):
33 |     chunk_map = utilities.get_chunk_map(data, 1)
34 | 
35 |     train_data = []
36 |     cv_data = []
37 |     for chunk_id in chunk_map.keys():
38 |         num = len(chunk_map[chunk_id])
39 |         train_num = 147
40 |         train_data += chunk_map[chunk_id][0 : train_num]
41 |         cv_data += chunk_map[chunk_id][train_num : :]
42 |     return train_data, cv_data
43 | 
44 | def get_train_cv_targets(train_data, cv_data):
45 |     return get_targets(train_data), get_targets(cv_data)
46 | 
47 | def get_targets(data):
48 |     targets = []
49 |     for line in data:
50 |         n = len(line)
51 |         targets.append(line[n - 39 : :])
52 |     return targets
53 | 
54 | if __name__ == '__main__':
55 |     pass
56 | 
--------------------------------------------------------------------------------
/emc/readme:
--------------------------------------------------------------------------------
1 | This is the code for the EMC Data Science Global Hackathon (Air Quality Prediction)
2 | (http://www.kaggle.com/c/dsg-hackathon). The problem is to predict future air quality
3 | based on past air quality and some other weather info (some of the data may be missing).
4 | My best approach builds a linear regression model for each predicted target and for each position
5 | within the chunk, so there are 390 models in total. Features are mainly past target information.
6 | Specifically, I include the past 24 hours of target data in the features, which seem to be the
7 | most effective ones. Hour and month information is also important; I binarized both into the features.
8 | Additionally, the most recent 2 days of weather information and the most recent hour of target
9 | data also improve the result. One thing I learned is that for this kind of time-series problem,
10 | past target data is very important: even using only that data, the result is already good enough.
11 | Finally, after about 68 minutes of training, I achieved an MAE of 0.21795, ranking 4th.
12 | 
--------------------------------------------------------------------------------
/emc/regression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from sklearn import linear_model
3 | from sklearn.ensemble import RandomForestClassifier
4 | import feature_extraction
5 | import utilities
6 | 
7 | def linear_regression_2(data):
8 |     print 'Training with linear regression 2...'
9 |     clf_map = {}
10 |     positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]
11 |     mae = 0.0
12 |     num = 0
13 |     for target in range(0, 39):
14 |         for pos in positions:
15 |             t = len(data[0]) - 39 + target
16 |             key = (target, pos)
17 |             x, y = feature_extraction.create_x_y(data, target, pos)
18 | 
19 |             clf = linear_model.LinearRegression()
20 |             clf.fit(x, y)
21 |             clf_map[key] = clf
22 | 
23 |             p = clf.predict(x)
24 |             mae += utilities.ae(y, p)
25 |             num += len(y)
26 | 
27 |             print '(%s, %s) completed.' % (target, pos)
28 |     mae /= float(num)
29 |     print 'MAE = %s' % mae
30 |     return clf_map
31 | 
32 | def linear_regression(x_train_all, x_cv_all, targets_train, targets_cv):
33 |     print 'Training with linear regression...'
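    # Delegates to regression() below, which fits one model per target column
    # and reports MAE on both the training and cv splits.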
34 | clfs = regression(x_train_all, x_cv_all, targets_train, targets_cv, 35 | linear_model.LinearRegression) 36 | return clfs 37 | 38 | def random_forest(x_train_all, x_cv_all, targets_train, targets_cv): 39 | print 'Training with random forest...' 40 | clfs = regression(x_train_all, x_cv_all, targets_train, targets_cv, 41 | m_random_forest) 42 | return clfs 43 | 44 | def m_random_forest(): 45 | return RandomForestClassifier(n_estimators=10, max_depth=None, 46 | min_samples_split=1, random_state=0) 47 | 48 | def regression(x_train_all, x_cv_all, targets_train, targets_cv, classifier): 49 | clfs = [] 50 | mae_train = 0 51 | mae_cv = 0 52 | num_train = 0 53 | num_cv = 0 54 | for i in range(len(targets_train[0])): 55 | x_train, y_train, x_cv, y_cv = feature_extraction.get_x_y_by_target( 56 | x_train_all, x_cv_all, targets_train, targets_cv, i) 57 | 58 | clf = classifier() 59 | clf.fit(x_train, y_train) 60 | clfs.append(clf) 61 | 62 | p = clf.predict(x_cv) 63 | mae_cv += utilities.ae(y_cv, p) 64 | num_cv += len(y_cv) 65 | 66 | p = clf.predict(x_train) 67 | mae_train += utilities.ae(y_train, p) 68 | num_train += len(y_train) 69 | 70 | print 'Round %s completed.' % i 71 | 72 | mae_train /= float(num_train) 73 | mae_cv /= float(num_cv) 74 | 75 | print 'MAE in training set: %s' % mae_train 76 | print 'MAE in cv set: %s' % mae_cv 77 | return clfs 78 | 79 | if __name__ == '__main__': 80 | pass 81 | -------------------------------------------------------------------------------- /emc/utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | 4 | def read_file(file_name, header=False): 5 | print 'Reading file...' 6 | f = open(file_name) 7 | reader = csv.reader(f) 8 | res = [] 9 | if not header: 10 | reader.next() 11 | for line in reader: 12 | res.append(line) 13 | 14 | f.close() 15 | return res 16 | 17 | def write_file(file_name, data): 18 | print 'Writing submission file...' 
19 | f = open(file_name, 'w') 20 | writer = csv.writer(f) 21 | for line in data: 22 | writer.writerow(line) 23 | f.close() 24 | 25 | def get_site_map(first_line): 26 | res = [] 27 | site_map = {} 28 | index = 0 29 | start = len(first_line) - 39 30 | for i in range(start, len(first_line)): 31 | site = first_line[i] 32 | idx = site.rfind('_', 0, len(site)) 33 | site_num = int(site[idx + 1 : :]) 34 | if site_num not in site_map: 35 | site_map[site_num] = index 36 | index += 1 37 | 38 | res.append(site_map[site_num]) 39 | return res 40 | 41 | def get_chunk_map(data, index): 42 | chunk_map = {} 43 | for line in data: 44 | key = line[index] 45 | if key not in chunk_map: 46 | chunk_map[key] = [] 47 | chunk_map[key].append(line) 48 | return chunk_map 49 | 50 | def get_avg_by_index(data, index): 51 | avg = {} 52 | num = {} 53 | for line in data: 54 | key = line[index] 55 | if key not in avg: 56 | avg[key] = [0.0] * 39 57 | num[key] = [0] * 39 58 | for i in range(56, len(line)): 59 | if not line[i] == 'NA': 60 | num[key][i - 56] += 1 61 | avg[key][i - 56] += float(line[i]) 62 | 63 | for key in avg.keys(): 64 | for i in range(len(avg[key])): 65 | if num[key][i] > 0: 66 | avg[key][i] /= float(num[key][i]) 67 | return avg 68 | 69 | def get_chunk_avg(data): 70 | return get_avg_by_index(data, 1) 71 | 72 | def get_hour_avg(data): 73 | return get_avg_by_index(data, 5) 74 | 75 | def get_weekday_avg(data): 76 | return get_avg_by_index(data, 4) 77 | 78 | def get_hour_avg_by_chunk(data): 79 | chunk_map = get_chunk_map(data, 1) 80 | 81 | hour_avg_by_chunk = {} 82 | for chunk_id in chunk_map.keys(): 83 | hour_avg_by_chunk[chunk_id] = get_hour_avg(chunk_map[chunk_id]) 84 | return hour_avg_by_chunk 85 | 86 | def get_weekday_avg_by_chunk(data): 87 | chunk_map = get_chunk_map(data, 1) 88 | 89 | weekday_avg_by_chunk = {} 90 | for chunk_id in chunk_map.keys(): 91 | weekday_avg_by_chunk[chunk_id] = get_weekday_avg(chunk_map[chunk_id]) 92 | return weekday_avg_by_chunk 93 | 94 | def get_weekday_in_sub(chunk_id, pos_in_chunk, chunk_map): 95 | chunk = chunk_map[chunk_id] 96 | last = chunk[len(chunk) - 1] 97 | last_weekday = last[4] 98 | last_hour = int(last[5]) 99 | last_pos_in_chunk = int(last[2]) 100 | 101 | hour_diff = last_pos_in_chunk - pos_in_chunk 102 | if last_hour + hour_diff < 24: 103 | return last_weekday 104 | else: 105 | hour_diff -= 23 - last_hour 106 | day_diff = int(hour_diff / 24) 107 | weekday = last_weekday + day_diff + 1 108 | if weekday > 7: 109 | weekday -= 7 110 | return weekday 111 | 112 | def ae(y, p): 113 | ae = 0.0 114 | for i in range(len(y)): 115 | ae += abs(float(y[i]) - p[i]) 116 | return ae 117 | 118 | 119 | if __name__ == '__main__': 120 | pass 121 | # res = read_file('./data/TrainingData.csv') 122 | # print res[0] 123 | -------------------------------------------------------------------------------- /fb_suggest_missing_link/candidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Responsible for selecting candidates. 3 | """ 4 | 5 | #!/usr/bin/env python 6 | 7 | def get_surroundings(follow, followed, nodes): 8 | """ Gets all followers and followees of a given sets of nodes. """ 9 | 10 | followers_and_followees = set() 11 | for node in nodes: 12 | followers_and_followees.update(follow[node]) 13 | followers_and_followees.update(followed[node]) 14 | return followers_and_followees 15 | 16 | def get_candidates(follow, followed, node): 17 | """ Gets candidates for node to suggest follow. 
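    Candidates are every node within three hops of the given node (following
    or being followed at each hop), excluding the node itself and everyone it
    already follows.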
""" 18 | 19 | nodes_exclude = follow[node].copy() 20 | nodes_exclude.add(node) 21 | 22 | l1_candidates = get_surroundings(follow, followed, [node]) 23 | l2_candidates = get_surroundings(follow, followed, l1_candidates) 24 | l3_candidates = get_surroundings(follow, followed, l2_candidates) 25 | 26 | candidates = set() 27 | candidates.update(l1_candidates) 28 | candidates.update(l2_candidates) 29 | candidates.update(l3_candidates) 30 | 31 | candidates.difference_update(nodes_exclude) 32 | return candidates 33 | 34 | if __name__ == '__main__': 35 | pass 36 | 37 | -------------------------------------------------------------------------------- /fb_suggest_missing_link/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Based on number of mutual friends - 15% 3 | Based on number of mutual follows - 30% 4 | Remove non-followed suggestion - 66.3% 5 | """ 6 | 7 | #!/usr/bin/env python 8 | from collections import deque 9 | import utilities 10 | import time 11 | import rank 12 | import candidate 13 | import validation 14 | 15 | def get_popular_people(followed, num): 16 | """ Gets people with most followers. """ 17 | 18 | dict_num_followers = {} 19 | for node in followed.keys(): 20 | dict_num_followers[node] = len(followed[node]) 21 | 22 | popular_people = sorted(dict_num_followers, 23 | key=dict_num_followers.__getitem__, 24 | reverse=True) 25 | 26 | return popular_people[0 : num] 27 | 28 | def suggest_friends(follow, followed, clf, node, popular_people, 29 | max_suggestion): 30 | """ Suggests friends for a given node. """ 31 | 32 | if not follow.has_key(node): 33 | return [] 34 | 35 | candidates = candidate.get_candidates(follow, followed, node) 36 | suggested = rank.rank_candidates(follow, followed, clf, node, candidates) 37 | 38 | # Suggests most popular people when candidates are less than 10. 39 | if len(suggested) < max_suggestion: 40 | for star in popular_people: 41 | if star not in suggested: 42 | suggested.append(star) 43 | if len(suggested) >= max_suggestion: 44 | break 45 | else: 46 | suggested = suggested[0 : max_suggestion] 47 | 48 | return suggested 49 | 50 | def main(follow, followed, test_file, submission_file, data_file, 51 | validation_file, max_suggestion): 52 | """ The main method for the problem. """ 53 | 54 | print 'Reading graph...' 55 | test_nodes = utilities.read_nodes_list(test_file) 56 | 57 | print 'Training with logistic regression...' 58 | clf = rank.train(data_file, validation_file) 59 | 60 | print 'Getting popular people...' 61 | popular_people = get_popular_people(followed, max_suggestion) 62 | 63 | print 'Predicting...' 64 | predictions = [] 65 | count = 0 66 | for node in test_nodes: 67 | suggested = suggest_friends(follow, followed, clf, node, 68 | popular_people, max_suggestion) 69 | predictions.append(suggested) 70 | 71 | count += 1 72 | if count % 100 == 0: 73 | print 'Suggested %d friends.' % count 74 | 75 | print 'Writing submission files...' 
76 | utilities.write_submission_file(submission_file, test_nodes, predictions) 77 | 78 | if __name__ == '__main__': 79 | start_time = time.time() 80 | follow, followed = utilities.read_graph('./data/train.csv') 81 | 82 | validation.generate_test_set(follow, followed, 83 | './data/test.csv', 84 | './data/validation.csv', 85 | './data/solution.csv', 86 | 2000, 10) 87 | 88 | main(follow, followed, 89 | './data/validation.csv', 90 | './data/result.csv', 91 | './data/data.csv', 92 | './data/data_test.csv', 93 | 10) 94 | 95 | # main(follow, followed, 96 | # './data/test.csv', 97 | # './data/result.csv', 98 | # './data/data.csv', 99 | # 10) 100 | 101 | print (time.time() - start_time) / 60.0, 'minutes' 102 | 103 | -------------------------------------------------------------------------------- /fb_suggest_missing_link/rank.py: -------------------------------------------------------------------------------- 1 | """ 2 | Responsible for extracting features, classification and ranking. 3 | """ 4 | 5 | #!/usr/bin/env python 6 | import utilities 7 | from numpy import * 8 | from sklearn import cross_validation 9 | from sklearn import linear_model 10 | from sklearn.metrics import precision_recall_fscore_support 11 | 12 | def get_features(follow, followed, n1, n2): 13 | """ Creates features for a given pair of nodes. """ 14 | 15 | # Level 1 features. 16 | does_follow = 0 17 | if n1 in follow[n2]: 18 | does_follow = 1 19 | 20 | # Level 2 features. 21 | followees_follow = set.intersection(follow[n1], followed[n2]) 22 | percent_followees_follow = 0.0 23 | if len(follow[n1]) > 0: 24 | percent_followees_follow = 1.0 * len(followees_follow) / len(follow[n1]) 25 | 26 | followees_followed = set.intersection(follow[n1], follow[n2]) 27 | percent_followees_followed = 0.0 28 | if len(follow[n1]) > 0: 29 | percent_followees_followed = 1.0 * len(followees_followed) \ 30 | / len(follow[n1]) 31 | 32 | followers_follow = set.intersection(followed[n1], followed[n2]) 33 | percent_followers_follow = 0.0 34 | if len(followed[n1]) > 0: 35 | percent_followers_follow = 1.0 * len(followers_follow) \ 36 | / len(followed[n1]) 37 | 38 | followers_followed = set.intersection(followed[n1], follow[n2]) 39 | percent_followers_followed = 0.0 40 | if len(followed[n1]) > 0: 41 | percent_followers_followed = 1.0 * len(followers_followed) \ 42 | / len(followed[n1]) 43 | 44 | return [does_follow, percent_followees_follow, percent_followees_followed, 45 | percent_followers_follow, percent_followers_followed] 46 | 47 | def rank_candidates(follow, followed, clf, node, candidates): 48 | """ Ranks the candidates based on the chance they will be followed. """ 49 | 50 | if not candidates: 51 | return [] 52 | 53 | # Generates feature matrix. 54 | candidates = list(candidates) 55 | x_candidates = [] 56 | for candidate in candidates: 57 | features = get_features(follow, followed, node, candidate) 58 | x_candidates.append(features) 59 | 60 | # Uses classifier to estimate probability. 61 | candidate_score = {} 62 | prob = clf.predict_proba(x_candidates) 63 | for i in range(len(candidates)): 64 | candidate_score[candidates[i]] = prob[i][1] 65 | 66 | # Ranks candidates based on the score 67 | return sorted(candidate_score, key=candidate_score.__getitem__, 68 | reverse=True) 69 | 70 | def get_data(data_file, test_file): 71 | """ Produces training set, cross validation set and test set. 
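    The labelled data file is split 70/30 into train and cv; the test file is
    read separately and keeps the original class distribution.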
""" 72 | 73 | raw_data = utilities.read_file(data_file, True) 74 | test_data = utilities.read_file(test_file, True) 75 | x = array(raw_data, float64) 76 | y = x[:, 0] 77 | x = x[:, 1 : :] 78 | x_train, x_cv, y_train, y_cv = cross_validation.train_test_split( 79 | x, y, test_size=0.3, random_state=None) 80 | x = array(test_data, float64) 81 | y_test = x[:, 0] 82 | x_test = x[:, 1 : :] 83 | 84 | return (x_train, y_train, x_cv, y_cv, x_test, y_test) 85 | 86 | def train(data_file, test_file): 87 | """ Uses random forest to train the model. """ 88 | 89 | x_train, y_train, x_cv, y_cv, x_test, y_test = get_data(data_file, 90 | test_file) 91 | 92 | clf = linear_model.LogisticRegression(penalty='l1', C=1) 93 | clf.fit(x_train, y_train) 94 | print clf.coef_ 95 | 96 | print 'Accuracy in training set: %f'% clf.score(x_train, y_train) 97 | print 'Accuracy in cv: %f' % clf.score(x_cv, y_cv) 98 | print 'Accuracy in test: %f' % clf.score(x_test, y_test) 99 | 100 | precision, recall, f1, support = precision_recall_fscore_support( 101 | y_test, clf.predict(x_test)) 102 | print precision, recall, f1 103 | 104 | return clf 105 | 106 | if __name__ == '__main__': 107 | train('./data/data.csv', 108 | './data/data_test.csv') 109 | 110 | -------------------------------------------------------------------------------- /fb_suggest_missing_link/readme: -------------------------------------------------------------------------------- 1 | This is the code for Kaggle - Facebook Recruiting Competion. The task is about predicting missing links in asymmetric social network. (http://www.kaggle.com/c/FacebookRecruiting) 2 | My approach can be divided into two phases. The first phase is selecting candidates and the second is ranking the candidates. 3 | In the first phase, for each predicting node, I select all surrounding nodes up to 3 levels as the candidates, and statistics show that this method usually get only 8% missing rate and is quite efficient in time. 4 | In the second phase, all candidates are ranked based on the probability to be potentially followed by the given node. This turns out to be a classification problem. Features I use includes: whether it follows the given node, percent of followers of the node follow it, percent of followers of the node followed by it, percent of followees of the node follow it, percent of followees of the node followed by it. 5 | One thing that bothers me for a whole week is that this is a skewed classification, in which the fraction of postive training examples are less than 1%. So the classifier might get very very low recall, thus to be terrible. To mitigate this issue, I under-sampled the negative examples with about 1:10 (this ratio is achieved by experiments) and test the classifier in the original distribution. Logistic regression got a good result (about 73% recall and 23% precision). 6 | My best score is about 71.4% of mean average precision, and the leader is 72.98%. 
7 | -------------------------------------------------------------------------------- /fb_suggest_missing_link/utilities.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from numpy import * 3 | 4 | def edges_generator(file_name): 5 | """ 6 | Generator that returns edges given a 2-column csv graph file 7 | """ 8 | 9 | f = open(file_name) 10 | reader = csv.reader(f) 11 | # Ignore the header 12 | reader.next() 13 | 14 | for edges in reader: 15 | nodes = [int(node) for node in edges] 16 | yield nodes 17 | 18 | f.close() 19 | 20 | def read_graph(file_name): 21 | """ 22 | Reads a sparsely represented directed graph into a dictionary 23 | """ 24 | 25 | # Store the graph as a dictionary of edges 26 | follow = {} 27 | followed = {} 28 | 29 | def initialize_node(node): 30 | if node not in follow: 31 | follow[node] = set() 32 | if node not in followed: 33 | followed[node] = set() 34 | 35 | count = 0 36 | for nodes in edges_generator(file_name): 37 | for node in nodes: 38 | initialize_node(node) 39 | follow[nodes[0]].add(nodes[1]) 40 | followed[nodes[1]].add(nodes[0]) 41 | count += 1 42 | if count % 1000000 == 0: 43 | print 'Already read %d nodes.' % count 44 | 45 | return (follow, followed) 46 | 47 | def read_file(data_file, ignore_header=True): 48 | """ Reads data from the file. """ 49 | 50 | f = open(data_file) 51 | reader = csv.reader(f) 52 | if ignore_header: 53 | reader.next() 54 | 55 | data = [] 56 | for row in reader: 57 | data.append(row) 58 | 59 | f.close() 60 | return data 61 | 62 | def read_nodes_list(test_file): 63 | """ 64 | Reads of single-column list of nodes 65 | """ 66 | 67 | f = open(test_file) 68 | reader = csv.reader(f) 69 | reader.next() 70 | 71 | nodes = [] 72 | for row in reader: 73 | nodes.append(int(row[0])) 74 | return nodes 75 | f.close() 76 | 77 | def write_file(data_file, data): 78 | """ Writes the data to the data_file. """ 79 | 80 | f = open(data_file, 'w') 81 | writer = csv.writer(f) 82 | for row in data: 83 | writer.writerow(row) 84 | f.close() 85 | 86 | def write_submission_file(submission_file, test_nodes, test_predictions): 87 | """ 88 | Writes the submission file 89 | """ 90 | 91 | f = open(submission_file, "w") 92 | writer = csv.writer(f) 93 | writer.writerow(["source_node", "destination_nodes"]) 94 | 95 | for source_node, dest_nodes in zip(test_nodes, test_predictions): 96 | writer.writerow([str(source_node), 97 | " ".join([str(n) for n in dest_nodes])]) 98 | f.close() 99 | 100 | -------------------------------------------------------------------------------- /fb_suggest_missing_link/validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is responsible for generating test data and analyzing the 3 | prediction result. 4 | """ 5 | 6 | #!/usr/bin/env python 7 | import utilities 8 | import candidate 9 | from numpy import * 10 | from random import randint 11 | import rank 12 | 13 | def generate_test_nodes(follow, nodes_exclude, num): 14 | """ Generates nodes for test. """ 15 | 16 | test_nodes = [] 17 | nodes_all = list(follow.keys()) 18 | perm = random.permutation(len(nodes_all)) 19 | for index in perm: 20 | node = nodes_all[index] 21 | if len(follow[node]) > 4 and node not in nodes_exclude: 22 | test_nodes.append(node) 23 | if len(test_nodes) >= num: 24 | break 25 | return test_nodes 26 | 27 | def remove_edges(follow, followed, node, max_remove_num): 28 | """ Randomly remove edges for the given node. 
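    Between 4 and max_remove_num followees are removed at random.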
The dict follow
29 |     and followed are modified in this method. """
30 | 
31 |     followees = list(follow[node])
32 |     num_followees = len(followees)
33 |     r = randint(4, min(num_followees, max_remove_num))
34 |     perm = random.permutation(num_followees)
35 |     perm = perm[0 : r]
36 | 
37 |     edges_removed = []
38 |     for index in perm:
39 |         n = followees[index]
40 |         follow[node].remove(n)
41 |         followed[n].remove(node)
42 |         edges_removed.append(n)
43 |     return edges_removed
44 | 
45 | def generate_solution(follow, followed, nodes_test, max_remove_num):
46 |     """ Generates the solution for suggesting missing links. """
47 | 
48 |     solution = []
49 |     for node in nodes_test:
50 |         s = [node]
51 |         edges_removed = remove_edges(follow, followed, node, max_remove_num)
52 |         s += edges_removed
53 |         solution.append(s)
54 |     return solution
55 | 
56 | def generate_test_set(follow, followed, test_file, validation_file,
57 |                       solution_file, num, max_remove_num):
58 |     """ Generates the test set for analysis. """
59 | 
60 |     nodes_exclude = utilities.read_nodes_list(test_file)
61 | 
62 |     print 'Generating test nodes...'
63 |     nodes_test = generate_test_nodes(follow, nodes_exclude, num)
64 |     writable_nodes_test = [[n] for n in nodes_test]
65 |     solution = generate_solution(follow, followed, nodes_test, max_remove_num)
66 | 
67 |     utilities.write_file(validation_file, writable_nodes_test)
68 |     utilities.write_file(solution_file, solution)
69 | 
70 | def generate_training_set(follow, followed, ratio, solution_file, data_file):
71 |     """ Uses the solution file to generate a training set for the model,
72 |     in the hope that this gets a better result.
73 |     Ratio controls the fraction of pos and neg examples; if ratio is -1,
74 |     the original fraction is kept. """
75 | 
76 |     raw_solution = utilities.read_file(solution_file, False)
77 |     dict_solution = {}
78 |     for i in range(len(raw_solution)):
79 |         row = raw_solution[i]
80 |         dict_solution[int(row[0])] = set(int(n) for n in row[1 : :])
81 | 
82 |     x_train = [['spring brother is a true man']]  # placeholder header row, skipped on read
83 |     for node in dict_solution.keys():
84 |         nodes_pos = dict_solution[node]
85 |         for n in nodes_pos:
86 |             features = rank.get_features(follow, followed, node, n)
87 |             x_train.append([1] + features)
88 | 
89 |         nodes_neg = candidate.get_candidates(follow, followed, node)
90 |         nodes_neg.difference_update(nodes_pos)
91 |         nodes_neg = list(nodes_neg)
92 |         perm = random.permutation(len(nodes_neg))
93 |         if ratio != -1:
94 |             num = min(int(len(nodes_pos) * ratio), len(nodes_neg))
95 |         else:
96 |             num = len(nodes_neg)
97 |         for i in range(num):
98 |             neg = nodes_neg[perm[i]]
99 |             features = rank.get_features(follow, followed, node, neg)
100 |             x_train.append([0] + features)
101 | 
102 |     utilities.write_file(data_file, x_train)
103 | 
104 | def analyze_candidates(solution_file, follow, followed):
105 |     """ Analyzes the method get_candidates. """
106 | 
107 |     raw_solution = utilities.read_file(solution_file, False)
108 |     dict_solution = {}
109 |     for row in raw_solution:
110 |         dict_solution[int(row[0])] = set(int(n) for n in row[1 : :])
111 | 
112 |     count_total = 0
113 |     count_miss = 0
114 |     for node in dict_solution:
115 |         candidates = candidate.get_candidates(follow, followed, node)
116 |         for n in dict_solution[node]:
117 |             if n not in candidates:
118 |                 count_miss += 1
119 |         count_total += len(dict_solution[node])
120 | 
121 |     print 'count_total = %d, count_miss = %d' % (
122 |         count_total, count_miss)
123 | 
124 | def ap(ground_truth, prediction):
125 |     """ Calculates the average precision.
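    Each correct suggestion at (1-based) rank i contributes hits_so_far / i;
    the sum is divided by the number of true links.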
""" 126 | 127 | ap = 0.0 128 | already_hit = 0 129 | for i in range(len(prediction)): 130 | if prediction[i] in ground_truth: 131 | already_hit += 1 132 | ap += 1.0 * already_hit / (i + 1) 133 | ap /= len(ground_truth) 134 | return ap 135 | 136 | def mean_average_precision(result_file, solution_file): 137 | """ Calculates the mean average precision. """ 138 | 139 | raw_result = utilities.read_file(result_file, True) 140 | raw_solution = utilities.read_file(solution_file, False) 141 | dict_result = {} 142 | for row in raw_result: 143 | dict_result[row[0]] = row[1 : :] 144 | dict_solution = {} 145 | for row in raw_solution: 146 | dict_solution[row[0]] = set(row[1 : :]) 147 | 148 | res = 0.0 149 | for key in dict_result.keys(): 150 | prediction = dict_result[key][0].split() 151 | ground_truth = dict_solution[key] 152 | res += ap(ground_truth, prediction) 153 | res /= len(dict_result) 154 | print 'mean average precision = %f' % res 155 | 156 | if __name__ == '__main__': 157 | mean_average_precision('./data/result.csv', 158 | './data/solution.csv') 159 | 160 | # follow, followed = utilities.read_graph('./data/train.csv') 161 | # generate_test_set(follow, followed, 162 | # './data/test.csv', 163 | # './data/validation.csv', 164 | # './data/solution.csv', 165 | # 10000, 10) 166 | # print 'Generating training set...' 167 | # generate_training_set(follow, followed, 10, 168 | # './data/solution.csv', 169 | # './data/data.csv') 170 | 171 | # print 'Generating test set...' 172 | # generate_training_set2(follow, followed, -1, 173 | # './data/solution.csv', 174 | # './data/data_test.csv') 175 | 176 | # analyze_candidates('./data/solution.csv', follow, followed) 177 | 178 | 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /insult_detect/insult_detect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | from numpy import * 4 | from sklearn import preprocessing 5 | from sklearn import cross_validation 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from sklearn.naive_bayes import GaussianNB 8 | import string 9 | import re 10 | 11 | """ The solution is based on tf-idf text vectorization and Gaussian 12 | Naive Bayes classification, achieving accuracy 93%. """ 13 | 14 | __train__ = './data/train.csv' 15 | __test__ = './data/test.csv' 16 | 17 | def read_data(path, ignore_header=True): 18 | csv_file_object = csv.reader(open(path, 'rb')) 19 | if ignore_header: 20 | header = csv_file_object.next() 21 | x = [] 22 | for row in csv_file_object: 23 | x.append(row) 24 | return x 25 | 26 | def feature_extract(raw_data): 27 | y = [] 28 | x = [] 29 | for row in raw_data: 30 | y.append(row[0]) 31 | x.append(row[2]) 32 | y = array(y, dtype=int32) 33 | return (y, x) 34 | 35 | def comment_filter(comment): 36 | comment = comment.translate(string.maketrans('\n\t\r', ' ')) 37 | comment = comment.lower() 38 | comment = comment.replace('\\', '') 39 | comment = comment.replace('\'s', '') 40 | comment = comment.replace('\'re', '') 41 | comment = re.sub(r'([^\s\w]|_)+', '', comment) 42 | comment = re.sub('[%s]' % string.digits, '9', comment) 43 | return comment 44 | 45 | if __name__ == '__main__': 46 | print 'Preprocessing...' 47 | raw_data = read_data(__train__) 48 | test_data = read_data(__test__) 49 | y, x = feature_extract(raw_data + test_data) 50 | for i in range(len(x)): 51 | x[i] = comment_filter(x[i]) 52 | 53 | print 'Vectorizing...' 
54 |     vectorizer = TfidfVectorizer(min_df=1, norm='l2', smooth_idf=True)
55 |     x = vectorizer.fit_transform(x)
56 |     x = x.toarray()
57 | 
58 |     print 'Dividing into training set and cv set...'
59 |     num_train = len(raw_data)
60 |     x_test = x[num_train : :, :]
61 |     y_test = y[num_train : :]
62 |     x = x[0 : num_train, :]
63 |     y = y[0 : num_train]
64 | 
65 |     x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
66 |         x, y, test_size=0.3, random_state=None)
67 |     print 'Training set size: %d, cv set size: %d' % (
68 |         y_train.shape[0], y_cv.shape[0])
69 | 
70 |     print 'Fitting Naive Bayes model...'
71 |     clf = GaussianNB()
72 |     clf.fit(x_train, y_train)  # fit on the training split only, so the cv score is meaningful
73 | 
74 |     print 'Predicting...'
75 |     print 'Accuracy in training set: %f' % clf.score(x_train, y_train)
76 |     print 'Accuracy in cv set: %f' % clf.score(x_cv, y_cv)
77 | 
78 |     print 'Predicting the test set...'
79 |     clf.fit(x, y)  # refit on all labelled data before predicting the test set
80 |     p_test = clf.predict(x_test)
81 |     open_file_object = csv.writer(open("./data/result.csv", "wb"))
82 |     for i in range(len(test_data)):
83 |         test_data[i][0] = p_test[i] * 1.0
84 |         open_file_object.writerow(test_data[i])
85 | 
--------------------------------------------------------------------------------
/insult_detect/readme:
--------------------------------------------------------------------------------
1 | The task is to predict whether a comment posted during a public discussion is considered insulting to one of the participants (https://www.kaggle.com/c/detecting-insults-in-social-commentary).
2 | The problem is a straightforward two-class text classification task. After trying several methods, my best approach uses classic tf-idf feature extraction (with normalization and idf smoothing) and a Gaussian Naive Bayes classifier.
3 | One optimization is the string preprocessing: the raw comments contain lots of meaningless characters, so I filter them out and also replace all digits with 9. This part could be optimized further in a great many ways.
4 | Finally, I achieved 93% classification accuracy.
5 | 
--------------------------------------------------------------------------------
/kicked_car/classification.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.metrics import precision_recall_fscore_support
5 | 
6 | def random_forest(x_train, y_train, x_cv, y_cv):
7 |     """ Uses Random Forest to classify the data. """
8 | 
9 |     print 'Training with RF...'
10 |     clf = RandomForestClassifier(n_estimators = 10)
11 |     clf.fit(x_train, y_train)
12 | 
13 |     print 'Predicting...'
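    # clf.score reports mean accuracy; precision_recall_fscore_support below
    # returns per-class arrays, where index 1 corresponds to the kicked cars.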
14 | print 'Accuracy in training set: %f' % clf.score(x_train, y_train) 15 | if y_cv != None: 16 | print 'Accuracy in cv set: %f' % clf.score(x_cv, y_cv) 17 | precision, recall, f1, support = precision_recall_fscore_support( 18 | y_cv, clf.predict(x_cv)) 19 | print precision, recall, f1 20 | 21 | return clf 22 | 23 | if __name__ == '__main__': 24 | pass 25 | -------------------------------------------------------------------------------- /kicked_car/data/.~lock.pos.csv#: -------------------------------------------------------------------------------- 1 | hang ,hang,hang,10.01.2013 20:29,file:///home/hang/.config/libreoffice/3; -------------------------------------------------------------------------------- /kicked_car/data/.~lock.test.csv#: -------------------------------------------------------------------------------- 1 | hang ,hang,hang,02.01.2013 11:32,file:///home/hang/.config/libreoffice/3; -------------------------------------------------------------------------------- /kicked_car/data/.~lock.training.csv#: -------------------------------------------------------------------------------- 1 | hang ,hang,hang,02.01.2013 11:31,file:///home/hang/.config/libreoffice/3; -------------------------------------------------------------------------------- /kicked_car/data/feature_idx.csv: -------------------------------------------------------------------------------- 1 | 5 2 | 48 3 | 71 4 | 110 5 | 122 6 | 128 7 | 320 8 | 359 9 | 364 10 | 414 11 | 419 12 | 463 13 | 470 14 | 501 15 | 517 16 | 520 17 | 521 18 | 523 19 | 524 20 | 525 21 | 526 22 | 527 23 | 528 24 | 530 25 | 532 26 | 533 27 | 538 28 | 541 29 | 553 30 | 558 31 | 559 32 | 567 33 | 568 34 | 577 35 | 584 36 | 590 37 | 600 38 | 601 39 | 616 40 | 623 41 | 624 42 | 633 43 | 634 44 | 639 45 | 649 46 | 657 47 | 668 48 | 674 49 | 687 50 | 690 51 | 692 52 | 694 53 | 696 54 | 700 55 | 721 56 | 726 57 | 745 58 | 746 59 | 769 60 | 771 61 | 776 62 | 777 63 | 810 64 | 832 65 | 857 66 | 878 67 | 889 68 | 895 69 | 915 70 | 923 71 | 928 72 | 933 73 | 953 74 | 956 75 | 975 76 | 984 77 | 986 78 | 988 79 | 992 80 | 998 81 | 1031 82 | 1038 83 | 1039 84 | 1045 85 | 1050 86 | 1054 87 | 1087 88 | 1101 89 | 1119 90 | 1120 91 | 1122 92 | 1167 93 | 1174 94 | 1179 95 | 1190 96 | 1195 97 | 1200 98 | 1218 99 | 1222 100 | 1223 101 | 1226 102 | 1227 103 | 1231 104 | 1232 105 | 1237 106 | 1239 107 | 1244 108 | 1252 109 | 1269 110 | 1287 111 | 1301 112 | 1310 113 | 1311 114 | 1324 115 | 1329 116 | 1343 117 | 1344 118 | 1370 119 | 1373 120 | 1393 121 | 1396 122 | 1409 123 | 1433 124 | 1435 125 | 1442 126 | 1445 127 | 1464 128 | 1474 129 | 1501 130 | 1502 131 | 1503 132 | 1504 133 | 1516 134 | 1522 135 | 1527 136 | 1530 137 | 1537 138 | 1539 139 | 1573 140 | 1601 141 | 1610 142 | 1618 143 | 1629 144 | 1634 145 | 1638 146 | 1652 147 | 1654 148 | 1655 149 | 1656 150 | 1657 151 | 1662 152 | 1663 153 | 1683 154 | 1700 155 | 1705 156 | 1717 157 | 1726 158 | 1729 159 | 1730 160 | 1734 161 | 1735 162 | 1736 163 | 1741 164 | 1743 165 | 1750 166 | 1755 167 | 1759 168 | 1764 169 | 1770 170 | 1782 171 | 1804 172 | 1809 173 | 1810 174 | 1812 175 | 1855 176 | 1870 177 | 1883 178 | 1907 179 | 1911 180 | 1938 181 | 1975 182 | 1991 183 | 1993 184 | 1998 185 | 2002 186 | 2020 187 | 2037 188 | 2041 189 | 2051 190 | 2101 191 | 2102 192 | 2112 193 | 2117 194 | 2127 195 | 2141 196 | 2175 197 | 2183 198 | 2207 199 | 2210 200 | 2216 201 | 2231 202 | 2234 203 | 2240 204 | 2241 205 | 2248 206 | 2253 207 | 2254 208 | 2267 209 | 2275 210 | 2282 211 | 2286 212 | 2297 213 | 2301 214 | 2306 215 | 
2323 216 | 2329 217 | 2342 218 | 2352 219 | 2370 220 | 2376 221 | 2379 222 | 2383 223 | 2405 224 | 2411 225 | 2417 226 | 2424 227 | 2467 228 | 2477 229 | 2481 230 | 2488 231 | 2496 232 | 2516 233 | 2519 234 | 2526 235 | 2540 236 | 2544 237 | 2545 238 | 2549 239 | 2597 240 | 2598 241 | 2600 242 | 2635 243 | 2640 244 | 2648 245 | 2649 246 | 2650 247 | 2653 248 | 2654 249 | 2655 250 | 2657 251 | 2665 252 | 2666 253 | 2669 254 | 2676 255 | 2678 256 | 2679 257 | 2681 258 | 2682 259 | 2683 260 | 2684 261 | 2685 262 | 2686 263 | 2687 264 | 2688 265 | 2690 266 | 2693 267 | 2697 268 | 2710 269 | 2714 270 | 2730 271 | 2733 272 | 2734 273 | 2740 274 | 2741 275 | 2747 276 | 2751 277 | 2765 278 | 2784 279 | 2800 280 | 2802 281 | 2805 282 | 2826 283 | 2827 284 | 2829 285 | 2840 286 | 2843 287 | 2889 288 | 2900 289 | 2904 290 | 2905 291 | 2906 292 | 2911 293 | 2914 294 | 2915 295 | 2920 296 | 2932 297 | 2939 298 | 2948 299 | 2959 300 | 2962 301 | -------------------------------------------------------------------------------- /kicked_car/data/idx: -------------------------------------------------------------------------------- 1 | Field Name Definition 2 | RefID Unique (sequential) number assigned to vehicles 3 | IsBadBuy Identifies if the kicked vehicle was an avoidable purchase 4 | 0 PurchDate The Date the vehicle was Purchased at Auction 5 | 1 Auction (3) Auction provider at which the vehicle was purchased 6 | 2 VehYear (10) The manufacturer's year of the vehicle 7 | *3 VehicleAge The Years elapsed since the manufacturer's year 8 | 4 Make (33) Vehicle Manufacturer 9 | 5 Model (1063) Vehicle Model 10 | 6 Trim (135) Vehicle Trim Level 11 | 7 SubModel (864) Vehicle Submodel 12 | 8 Color (17) Vehicle Color 13 | 9 Transmission (5) Vehicles transmission type (Automatic, Manual) 14 | 10 WheelTypeID (5) The type id of the vehicle wheel 15 | 11 WheelType (4) The vehicle wheel type description (Alloy, Covers) 16 | *12 VehOdo The vehicles odometer reading 17 | 13 Nationality (5) The Manufacturer's country 18 | 14 Size (13) The size category of the vehicle (Compact, SUV, etc.) 
19 | 15 TopThreeAmericanName (5) Identifies if the manufacturer is one of the top three American manufacturers 20 | *16 MMRAcquisitionAuctionAveragePrice Acquisition price for this vehicle in average condition at time of purchase 21 | *17 MMRAcquisitionAuctionCleanPrice Acquisition price for this vehicle in the above Average condition at time of purchase 22 | *18 MMRAcquisitionRetailAveragePrice Acquisition price for this vehicle in the retail market in average condition at time of purchase 23 | *19 MMRAcquisitonRetailCleanPrice Acquisition price for this vehicle in the retail market in above average condition at time of purchase 24 | *20 MMRCurrentAuctionAveragePrice Acquisition price for this vehicle in average condition as of current day 25 | *21 MMRCurrentAuctionCleanPrice Acquisition price for this vehicle in the above condition as of current day 26 | *22 MMRCurrentRetailAveragePrice Acquisition price for this vehicle in the retail market in average condition as of current day 27 | *23 MMRCurrentRetailCleanPrice Acquisition price for this vehicle in the retail market in above average condition as of current day 28 | 24 PRIMEUNIT (3) Identifies if the vehicle would have a higher demand than a standard purchase 29 | 25 AUCGUART (3) The level guarntee provided by auction for the vehicle (Green light - Guaranteed/arbitratable, Yellow Light - caution/issue, red light - sold as is) 30 | 26 BYRNO (74) Unique number assigned to the buyer that purchased the vehicle 31 | 27 VNZIP (163) Zipcode where the car was purchased 32 | 28 VNST (37) State where the the car was purchased 33 | *29 VehBCost Acquisition cost paid for the vehicle at time of purchase 34 | 30 IsOnlineSale (2) Identifies if the vehicle was originally purchased online 35 | *31 WarrantyCost Warranty price (term=36month and millage=36K) 36 | 37 | 38 | 39 | [ 0.11074237 0.10457275 0.04262179 0.04206746 0.03988719 0.0396514 40 | 0.03864223 0.03826018 0.03824352 0.03791842 0.03714719 0.0364236 41 | 0.03529239 0.03211145 0.03013207 0.02963083 0.02820693 0.0277857 42 | 0.02660546 0.0241205 0.02401912 0.02379755 0.02328335 0.0194586 43 | 0.01837794 0.01630656 0.01116595 0.00631357 0.00553677 0.0055280 44 | 0.0038861 0.00226295] 45 | [10, 11, 29, 17, 12, 16, 20, 21, 23, 19, 22, 18, 0, 31, 26, 27, 5, 3, 7, 28, 6, 2, 8, 4, 1, 14, 15, 13, 24, 25, 9, 30] 46 | 47 | [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 24, 25, 26, 27, 28, 30] 48 | 49 | 50 | [36, 41, 0, 12, 32, 29, 16, 1, 8, 20, 18, 5, 22, 38, 4, 27, 49, 26, 21, 51 | 48, 17, 6, 7, 31, 3, 2, 23, 39, 19, 50, 28, 37, 10, 14, 44, 35, 9, 24, 52 | 13, 47, 30, 15, 34, 45, 46, 33, 40, 25, 42, 43, 11, 51] 53 | 54 | 55 | 36, 41, 32, 38, 49, 48, 39, 50, 37, 44, 35, 47, 34, 45, 46, 33, 40, 42, 43, 51 56 | 5, 10, 0, 7, 27, 26, 8, 28, 6, 14, 4, 25, 2, 15, 24, 1, 9, 11, 13, 30 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /kicked_car/data/log: -------------------------------------------------------------------------------- 1 | RF 30, all features .902855 .845 .245 0.3804 0.23019 2 | RF 300, all features .888102 .948 .083 0.1534 0.22366 3 | RF 30, all features, ds 5 .890340 .865 .115 0.2036 0.22477 4 | RF 300, all features, ds 5 .900754 .783 .253 0.3828 0.23201 5 | RF 30, all features, ds 3 .896186 .736 .227 0.3466 0.22719 6 | RF 300, all features, ds 3 .893126 .624 .303 0.4082 0.23433 7 | RF 300, all features, ds 3, scale .893674 .632 .299 0.4058 0.23378 8 | RF 30, all features, ds 1 .688787 .231 .674 0.3447 0.21998 9 | RF 
300, all features, ds 1 .724549 .253 .648 0.3636 0.23506 10 | 11 | LR, 5 binary features, ds 1 .746609 .265 .612 0.3701 0.23383 12 | LR, 10 binary features, ds 0.8 .646852 .222 .717 0.3392 0.23412 13 | LR, 10 binary features, ds 1 .749516 .277 .608 0.3804 0.23606 14 | LR, 10 binary features, ds 1, scale .676251 .226 .646 0.3353 0.21143 15 | LR, 10 binary features, ds 3 0.11535 16 | LR, all binary features, ds 1 .720096 .255 .633 0.3637 0.23073 17 | 18 | RF 300, avg cate .904681 .858 .258 0.3969 0.23040 19 | RF 300, avg cate, ds 3 .896278 .631 .352 0.4518 0.23541 20 | RF 300, all + avg cate, ds 3 .897008 .643 .342 0.4466 0.23622 21 | RF 300, all + avg cate, ds 2 .876364 .490 .434 0.4602 0.23950 22 | RF 300, all + avg cate, ds 2, chi 40 .874629 .483 .455 0.4685 0.23477 23 | RF 300, all + mean std cate, ds 2 .870701 .467 .453 0.4596 0.23232 24 | RF 300, all + avg cate, ds 1 .749258 .279 .672 0.3943 0.23399 25 | RF 300, all + avg cate .904362 .858 .255 0.3930 0.23313 26 | RF 300, avg cate, ds 1 .746792 .278 .678 0.3941 0.22876 27 | RF 300, 10 avg cate, ds 3 .894953 .622 .345 0.4440 0.23053 28 | 29 | RF 300, 5 binary, ds 1 .725655 .256 .617 0.3623 0.23070 30 | 31 | RF 300, 5 binary, chi 200, ds 2 .877004 .492 .385 0.4319 0.23977 32 | RF 300, 5 binary, chi 300, ds 2 .877753 .496 .384 0.4330 0.24076 33 | RF 100, 5 binary, chi 300, ds 2 .879013 .503 .376 0.4302 0.23772 34 | RF 300, 5 binary, chi 200 (ds 2), ds 2 .876364 .489 .383 0.4293 0.23961 35 | RF 300, 5 binary, chi 100, ds 2 .872071 .468 .388 0.4240 0.23696 36 | RF 300, 5 binary, RF_select 28, ds 2 .867367 .447 .386 0.4143 0.23044 37 | RF 300, 5 binary, LR_select 31, ds 2 .862480 .429 .399 0.4136 0.22871 38 | RF 300, 5 bin, LR_select 87(ds 2), ds 2 .869422 .457 .398 0.4253 0.23783 39 | RF 300, 5 bin, LR_select 268(ds 2), ds 2 .880201 .509 .385 0.4382 0.23926 40 | RF 300, 5 bin, LR_select 256, ds 2 .878831 .502 .379 0.4320 0.23911 41 | 42 | RF 300, 5 binary + avg, chi 300, ds 2 .879333 .504 .380 0.4335 0.24221 43 | RF 300, 5 binary + avg, chi 400, ds 2 .881799 .520 .354 0.4213 0.24150 44 | RF 300, 5 binary + avg, pca 300, ds 2 .880749 .514 .329 0.4018 0.21122 45 | RF 300, 5 binary + avg, tree 300, ds 2 .880566 .512 .374 0.4324 0.24024 46 | RF 300, all binary + avg, LR 350, ds 2 0.24222 47 | RF 300, all binary + avg, LR 400, ds 2 .876684 .490 .385 0.4311 0.23812 48 | RF 300, all binary + avg, LR 300, ds 2 .876501 .490 .393 0.4362 0.24153 49 | RF 300, all binary + avg, chi2 300, ds 2 .876639 .490 .370 0.4215 0.24025 50 | 51 | RF 300, all num .876958 .369 .018 0.0344 0.14464 52 | -------------------------------------------------------------------------------- /kicked_car/data/log2: -------------------------------------------------------------------------------- 1 | RF 300, num .876958 .369 .018 0.0344 0.14464 2 | RF 300, num, ds 2 .813428 .262 .296 0.278 0.15263 3 | RF 300, num+diff, ds 2 .814524 .257 .279 0.268 0.14948 4 | RF 300, num+diff2, ds 2 .813 .262 .296 0.278 0.15370 5 | RF 300, num+diff2+avg_meter, ds 2 .814 .258 .282 0.260 0.15031 6 | RF 300, num+diff2-price, ds 2 .823 .273 .278 0.275 0.15144 7 | RF 300, num+diff2, ds 1.5 .755 .230 .433 0.300 0.15396 8 | RF 300, num+diff3, ds 1.5 .772 .253 .448 0.323 0.17862 9 | 10 | 11 | RF 300, num+diff2, ds 1.5 .755 .230 .433 0.300 0.15396 12 | RF 300, num+diff2, ds 2 .813 .262 .296 0.278 0.15370 13 | RF 300, num+diff2+auction, ds 2 .820 .276 .297 0.286 0.15909 6 (1) 14 | RF 300, num+diff2+year, ds 2 .816 .260 .278 0.268 0.15254 (2) 15 | RF 300, num+diff2+make, ds 2 .823 .272 .274 0.273 0.15659 9 
(4) 16 | RF 300, num+diff2+trim, ds 2 .839 .292 .239 0.257 0.15980 5 (6) 17 | RF 300, num+diff2+color, ds 2 .823 .270 .268 0.270 0.15550 10 (8) 18 | RF 300, num+diff2+trans, ds 2 .820 .272 .286 0.279 0.15452 11 (9) 19 | RF 300, num+diff2+w_type_id, ds 2 .864 .432 .378 0.403 0.22412 2 (10) 20 | RF 300, num+diff2+w_type, ds 2 .864 .429 .367 0.396 0.22440 1 (11) 21 | RF 300, num+diff2+nation, ds 2 .814 .263 .294 0.278 0.15165 (13) 22 | RF 300, num+diff2+size, ds 2 .819 .266 .279 0.272 0.15121 (14) 23 | RF 300, num+diff2+top3, ds 2 .819 .266 .277 0.271 0.15377 12 (15) 24 | RF 300, num+diff2+prim, ds 2 .820 .284 .316 0.300 0.16330 3 (24) 25 | RF 300, num+diff2+byrno, ds 2 .832 .285 .252 0.268 0.15859 8 (26) 26 | RF 300, num+diff2+zip, ds 2 .837 .292 .237 0.262 0.16025 4 (27) 27 | RF 300, num+diff2+state, ds 2 .833 .291 .258 0.273 0.15901 7 (28) 28 | RF 300, num+diff2+online, ds 2 .820 .270 .288 0.279 0.15365 (30) 29 | 30 | RF 300, num+diff2+top2, ds 2 .862 .422 .373 0.396 0.22514 31 | RF 300, num+diff2+top3, ds 2 .862 .423 .381 0.401 0.22804 32 | RF 300, num+diff2+top4, ds 2 .875 .483 .361 0.413 0.23637 33 | RF 300, num+diff2+top5, ds 2 .874 .476 .364 0.412 0.23707 34 | RF 300, num+diff2+top4+6, ds 2 .878 .499 .361 0.419 0.23448 35 | RF 300, num+diff2+top6, ds 2 .878 .499 .362 0.420 0.23848 36 | RF 300, num+diff2+top12, ds 2 .881 .512 .362 0.424 0.23776 37 | RF 300, num+diff2+top12, ds 1.5 .838 .370 .477 0.417 0.23885 38 | RF 300, num+diff2+top6, ds 1.5 .841 .373 .457 0.410 0.23785 39 | RF 300, num+diff2+top12, chi 300 ds 1.5 .840 .373 .465 0.414 0.24088 40 | RF 300, num+diff2+top12, chi 300 ds 2 .880 .505 .360 0.420 0.23887 41 | RF 300, num+diff2+top12, chi 350 ds 2 .879 .503 .365 0.423 0.23774 42 | RF 300, num+diff2+top12, chi 350 ds 1.5 .843 .383 .476 0.424 0.24114 43 | RF 1000, num+diff2+top12, chi 350 ds 1.5 -- -- -- -- 0.24327 44 | RF 300, num+diff2+top12, chi 400 ds 1.5 .845 .384 .464 0.420 0.24021 45 | RF 300, num+diff2+top12+log, chi 350 ds 1.5 .846 .388 .468 0.425 0.24037 46 | RF 300, num+diff2+all_cate, chi 350 ds 1.5 .836 .367 .476 0.415 0.24048 47 | RF 300, num+diff2+top12+tree, chi 350 ds 1.5 .840 .374 .469 0.416 0.24097 48 | RF 300, num+diff2+top12+tree, chi 300 ds 1.5 .844 .383 .462 0.419 0.23820 49 | RF 300, num+diff2+top12+tree, chi 300 ds 2 .882 .517 .368 0.430 0.23821 50 | RF 300, num+diff2+top12+tree, chi 350 ds 2 .800 .508 .371 0.429 0.23855 51 | RF 300, num+diff2+top12+tree, chi 400 ds 1.5 .844 .383 .464 0.419 0.24075 52 | 53 | RF 1000, num+diff2+top12+avg, chi 350 ds 1.5 -- -- -- -- 0.24221 54 | RF 300, num+diff2+top12+avg, chi 350 ds 1.5 .845 .386 .459 0.419 0.23697 55 | RF 300, num+diff2+top12+avg, chi 300 ds 1.5 .843 .382 .467 0.420 0.23885 56 | 57 | 58 | RF 300, num+diff3+top12, chi 350 ds 1.5 .843 .383 .476 0.424 0.24166 59 | RF 300, num+diff3+tree, ds 1.5 .837 .368 .474 0.415 0.23893 60 | RF 300, num+diff3+top2+tree, ds 1.5 .837 .366 .468 0.411 0.23570 61 | RF 300, num+diff3+tree2, ds 1.5 .822 .334 .469 0.390 0.22707 62 | RF 1000, (11, 15, 27, 24, 1) 0.24151 63 | RF 300, (11, 15, 27, 24, 1), ds 1.5 .834 .365 .476 .413 0.23842 64 | 65 | RF 300, num+diff3+top_all, chi 350 ds 1.5 .837 .368 .473 0.414 0.24085 66 | RF 300, num+diff2+top12, chi 350 ds 1.5 .843 .383 .476 0.424 0.24114 67 | RF 300, num+diff2+top12, tree 100 ds 1.5 .836 .364 .470 0.410 0.23528 68 | RF 300, num+diff2+top12, tree 200 ds 1.5 .839 .369 .462 0.410 0.23745 69 | RF 300, num+diff2+top12, tree 300 ds 1.5 .832 .354 .470 0.404 0.23866 70 | RF 300, num+diff2+top12, tree 300 ds 2 .880 .508 .373 0.430 
0.23965 71 | RF 300, num+diff2+top12, tree 350 ds 2 .880 .507 .368 0.427 0.23686 72 | RF 1000, [10, 11, 24, 27, 6, 1, 28, 4, 9, 15], ds 1.5 0.24364 73 | RF 300, [10, 11, 24, 27, 6, 1, 28, 4, 9, 15], ds 1.5 .834 .363 .488 .416 .23936 74 | 75 | RF 300, num+diff3+all, chi 300 ds 1.5 .837 .370 .480 0.417 0.24263 76 | RF 300, num+diff3+all, chi 320 ds 1.5 .838 .371 .481 0.419 0.24126 77 | RF 300, num+diff3+all, chi 350 ds 1.5 .837 .365 .467 0.410 0.23921 78 | RF 300, num+diff3+all, chi 280 ds 1.5 .829 .353 .487 0.410 0.24036 79 | RF 300, num+diff3+all, chi 250 ds 1.5 .832 .358 .480 0.410 0.24113 80 | 81 | RF 300, num+diff3+all+avg, chi 300 ds 1.5 .838 .372 .483 0.421 0.24431 82 | RF 1000, num+diff3+all+ratio, chi 300 ds 1.5 0.24644 83 | RF 300, num+diff3+all+ratio, chi 300 ds 1.5 .838 .372 .483 0.421 0.24005 84 | RF 300, num+diff3+all+ratio, chi 300 ds 1.5 .838 .372 .483 0.421 0.24462 85 | 86 | RF 300, num+diff3+all+ratio, tree 300 ds 1.5 0.24421 87 | RF 1000, num+diff3+all+ratio, tree 300 ds 1.5 0.24981 88 | RF 300, num+diff3+all+ratio, tree 300 ds 1.7 0.24270 89 | RF 300, num+diff3+all+ratio, tree 300 ds 1.3 0.24393 90 | RF 300, num+diff3+all+avg, tree 300 ds 1.5 0.24289 91 | RF 300, num+diff3+all+tree, tree 300 ds 1.5 0.24263 92 | RF 300, num+diff3+all+ratio+year, tree 300 ds 1.5 0.24565 93 | RF 300, num+diff3+all+ratio+year,month, tree 300 ds 1.5 0.24696 94 | RF 1000, num+diff3+all+ratio+year,month, tree 300 ds 1.5 0.25147 (31) 95 | RF 300, num+diff3+all+all_ratio+year,month, tree 300 ds 1.5 0.24627 96 | RF 1000, num+diff3+all+all_ratio+year,month, tree 300 ds 1.5 0.24958 97 | ------------------------------------------------------------------------------------------------------------ 98 | 99 | -------------------------------------------------------------------------------- /kicked_car/feature_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from sklearn.tree import DecisionTreeClassifier 4 | from collections import defaultdict 5 | import math 6 | 7 | _NUM_FEATURE_INDICES = [3, 12, 16, 17, 18, 19, 20, 21, 22, 23, 29, 31] 8 | _CATE_FEATURE_INDICES = [0, 1, 2, 4, 6, 8, 9, 10, 11, 13, 14, 15, 24, 26, 9 | 27, 28, 30, 32] 10 | 11 | def get_loglikelihood_ratio(x, y, idx, cate_range): 12 | pos_num = 0 13 | neg_num = 0 14 | pos_map = defaultdict(int) 15 | neg_map = defaultdict(int) 16 | for i in range(len(x)): 17 | cate = x[i][idx] 18 | if y[i] == 1: 19 | pos_num += 1 20 | pos_map[cate] += 1 21 | else: 22 | neg_num += 1 23 | neg_map[cate] += 1 24 | 25 | ratio_map = defaultdict(lambda: 0) 26 | for cate in range(cate_range + 1): 27 | p_pos = -100 28 | if cate in pos_map: 29 | p_pos = math.log10(pos_map[cate] / float(pos_num)) 30 | p_neg = -100 31 | if cate in neg_map: 32 | p_neg = math.log10(neg_map[cate] / float(neg_num)) 33 | ratio_map[cate] = p_pos - p_neg 34 | return ratio_map 35 | 36 | def get_feature(x, range_map, ratio_map): 37 | x_new = [] 38 | for line in x: 39 | # Numerical features. 40 | features = [line[idx] for idx in _NUM_FEATURE_INDICES] 41 | # Cur - avg. 42 | features.append(line[17] - line[16]) 43 | features.append(line[19] - line[18]) 44 | features.append(line[21] - line[20]) 45 | features.append(line[23] - line[22]) 46 | 47 | # Diff cur. 
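# (Added note, not in the original: per data/idx, columns 16-23 are the eight MMR prices -- acquisition/current x auction/retail x average/clean -- so the appends below are pairwise spreads between those price columns, the hand-built price-difference features the readme credits as informative.)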
48 | features.append(line[19] - line[17]) 49 | features.append(line[21] - line[19]) 50 | features.append(line[23] - line[21]) 51 | features.append(line[21] - line[17]) 52 | features.append(line[23] - line[17]) 53 | features.append(line[23] - line[19]) 54 | 55 | # Diff avg. 56 | features.append(line[18] - line[16]) 57 | features.append(line[20] - line[18]) 58 | features.append(line[22] - line[20]) 59 | features.append(line[22] - line[18]) 60 | 61 | # Categorical features. 62 | for idx in _CATE_FEATURE_INDICES: 63 | for i in range(range_map[idx] + 1): 64 | if i == line[idx]: 65 | features.append(1) 66 | else: 67 | features.append(0) 68 | 69 | # Log likelihood ratio. 70 | for idx in _CATE_FEATURE_INDICES: 71 | cate = line[idx] 72 | cur_ratio_map = ratio_map[idx] 73 | features.append(cur_ratio_map[cate]) 74 | 75 | x_new.append(features) 76 | return x_new 77 | 78 | def create_feature(x, y, x_test): 79 | range_map = {} 80 | ratio_map = {} 81 | for idx in _CATE_FEATURE_INDICES: 82 | range_map[idx] = max(x, key=lambda s: s[idx])[idx] 83 | ratio_map[idx] = get_loglikelihood_ratio(x, y, idx, range_map[idx]) 84 | 85 | x_new = get_feature(x, range_map, ratio_map) 86 | x_test_new = get_feature(x_test, range_map, ratio_map) 87 | return (x_new, x_test_new) 88 | 89 | def get_best_k_feature_indices(x, y, k): 90 | print 'Getting best k features...' 91 | clf = DecisionTreeClassifier(random_state=0, compute_importances=True) 92 | clf.fit(x, y) 93 | importance_pairs = [(i, clf.feature_importances_[i]) 94 | for i in range(len(clf.feature_importances_))] 95 | importance_pairs = sorted(importance_pairs, key=lambda s: s[1], reverse=True)  # bug fix: sort descending so the top-k really are the most important features 96 | return [importance_pairs[i][0] for i in range(k)] 97 | 98 | def get_best_k_features(x, indices): 99 | x_important = [] 100 | for line in x: 101 | features = [line[idx] for idx in indices] 102 | x_important.append(features) 103 | return x_important 104 | 105 | if __name__ == '__main__': 106 | pass 107 | -------------------------------------------------------------------------------- /kicked_car/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import time 4 | import utilities 5 | import preprocess 6 | import classification 7 | import feature_extraction 8 | from sklearn import cross_validation 9 | 10 | def main(training_file, test_file, submission_file, ratio): 11 | data = utilities.read_file(training_file) 12 | test_data = utilities.read_file(test_file) 13 | 14 | print 'Preparing data...' 15 | x, y = preprocess.prepare_data(data) 16 | refid, x_test = preprocess.prepare_test_data(test_data) 17 | x, x_test = preprocess.preprocess_features(x, x_test) 18 | 19 | print 'Feature extracting...' 20 | x, x_test = feature_extraction.create_feature(x, y, x_test) 21 | 22 | indices = feature_extraction.get_best_k_feature_indices(x, y, 300) 23 | x = feature_extraction.get_best_k_features(x, indices) 24 | x_test = feature_extraction.get_best_k_features(x_test, indices) 25 | print 'Got %s features.' % len(x[0]) 26 | 27 | x_train, x_cv, y_train, y_cv = cross_validation.train_test_split( 28 | x, y, test_size=.3, random_state=0) 29 | x_train, y_train = preprocess.down_sample(x_train, y_train, ratio) 30 | 31 | clf = classification.random_forest(x_train, y_train, x_cv, y_cv) 32 | 33 | print 'Predicting...'
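# (Added note: sklearn's predict_proba returns one column per class in the order of clf.classes_, so with 0/1 labels column 1 is P(IsBadBuy = 1); utilities.write_submission_file below writes predict[i][1].)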
34 | predict = clf.predict_proba(x_test) 35 | utilities.write_submission_file(submission_file, refid, predict) 36 | 37 | if __name__ == '__main__': 38 | start_time = time.time() 39 | 40 | main('./data/training.csv', 41 | './data/test.csv', 42 | './data/res.csv', 43 | 1.5) 44 | 45 | print (time.time() - start_time) / 60.0, 'minutes' 46 | -------------------------------------------------------------------------------- /kicked_car/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from random import shuffle 3 | 4 | def extract_year_month(x): 5 | idx = 0 6 | for i in range(len(x)): 7 | pos = x[i][idx].rfind('/', 0, len(x[i][idx])) 8 | pos0 = x[i][idx].find('/', 0, len(x[i][idx])) 9 | x[i].append(x[i][idx][0 : pos0]) 10 | x[i][idx] = x[i][idx][pos + 1 : :] 11 | 12 | def create_category_map(x, idx): 13 | category_map = {} 14 | cur = 0 15 | for line in x: 16 | cate = line[idx] 17 | if not cate in category_map: 18 | category_map[cate] = cur 19 | cur += 1 20 | return category_map 21 | 22 | def convert_category_to_int(x, idx, category_map): 23 | cur = max(category_map.values()) + 1 24 | for i in range(len(x)): 25 | cate = x[i][idx] 26 | if cate in category_map: 27 | cate_num = category_map[cate] 28 | else: 29 | cate_num = category_map[cate] = cur  # bug fix: cate_num was never assigned for unseen categories 30 | cur += 1 31 | x[i][idx] = cate_num 32 | return x 33 | 34 | def convert_categories(x, x_test): 35 | cate_feature_indices = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 36 | 24, 25, 26, 27, 28, 30, 32] 37 | for idx in cate_feature_indices: 38 | cate_map = create_category_map(x, idx) 39 | x = convert_category_to_int(x, idx, cate_map) 40 | x_test = convert_category_to_int(x_test, idx, cate_map) 41 | return (x, x_test) 42 | 43 | def get_single_numerical_median(x, idx): 44 | all_values = [float(line[idx]) for line in x 45 | if not line[idx] == '' and not line[idx] == 'NULL'] 46 | all_values.sort() 47 | return all_values[len(all_values) / 2] 48 | 49 | def fill_missing_numerical_feature(x, idx, median): 50 | for i in range(len(x)): 51 | if x[i][idx] == '' or x[i][idx] == 'NULL': 52 | x[i][idx] = median 53 | else: 54 | x[i][idx] = float(x[i][idx]) 55 | return x 56 | 57 | def fill_numerical_features(x, x_test): 58 | num_feature_indices = [3, 12, 16, 17, 18, 19, 20, 21, 22, 23, 29, 31] 59 | for idx in num_feature_indices: 60 | median = get_single_numerical_median(x, idx) 61 | x = fill_missing_numerical_feature(x, idx, median) 62 | x_test = fill_missing_numerical_feature(x_test, idx, median) 63 | return (x, x_test) 64 | 65 | def preprocess_features(x, x_test): 66 | extract_year_month(x) 67 | extract_year_month(x_test) 68 | x, x_test = fill_numerical_features(x, x_test) 69 | x, x_test = convert_categories(x, x_test) 70 | return (x, x_test) 71 | 72 | def down_sample(x, y, ratio): 73 | print 'Down sampling...' 74 | pos_indices = [i for i in range(len(y)) if y[i] == 1] 75 | neg_indices = [i for i in range(len(y)) if y[i] == 0] 76 | 77 | neg_num = min(int(len(pos_indices) * ratio), len(neg_indices)) 78 | shuffle(neg_indices) 79 | sample_indices = pos_indices + neg_indices[0 : neg_num] 80 | shuffle(sample_indices) 81 | 82 | # Down sampling.
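# (Added note: sample_indices keeps every positive index plus at most ratio times as many shuffled negative indices, so the ratio=1.5 passed from main.py yields the pos/neg mix of roughly 1:1.5 described in the readme.)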
83 | x_ds = [x[idx] for idx in sample_indices] 84 | y_ds = [y[idx] for idx in sample_indices] 85 | return (x_ds, y_ds) 86 | 87 | def prepare_data(data): 88 | x = [line[2 : :] for line in data] 89 | y = [int(line[1]) for line in data] 90 | return (x, y) 91 | 92 | def prepare_test_data(data): 93 | x = [line[1 : :] for line in data] 94 | refid = [line[0] for line in data] 95 | return (refid, x) 96 | 97 | if __name__ == '__main__': 98 | pass 99 | -------------------------------------------------------------------------------- /kicked_car/readme: -------------------------------------------------------------------------------- 1 | This is the code for Kaggle - Don't Get Kicked! The task is to predict whether a car purchased at an auto auction is a "kick", i.e. a car bought by an auto dealership that has issues serious enough to prevent it from being resold to customers. About 30 features are given for each car, most of them categorical, such as the car's model, the country that produced it, etc. Also, this is a 2 | skewed-class problem, as only about 1/7 of the cars are marked as kicks. For more details, check Kaggle's official description: http://www.kaggle.com/c/DontGetKicked. 3 | My approach is based on random forests, and most of my time was spent on feature engineering. For numerical features, I found differences between prices quite informative and added about 14 features based on them. For categorical features, I binarized all of them except "model" and "submodel"; I also added the log-likelihood ratio of each categorical feature, which boosted the result a little. In total I had 500+ features, which turned out to be redundant, so I trained a decision tree on these 500+ features and selected the best 300 by Gini importance. Since the classes are skewed, I down-sampled the training set to a pos/neg ratio of 1:1.5. 4 | With about 1h of training, my best score was 0.25147, ranking 31st among 571 teams; the leader scored 0.26720. 5 | -------------------------------------------------------------------------------- /kicked_car/utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | from numpy import * 4 | 5 | def read_file(file_name, header=False): 6 | print 'Reading file...' 7 | f = open(file_name) 8 | reader = csv.reader(f) 9 | if not header: 10 | reader.next() 11 | res = [line for line in reader] 12 | f.close() 13 | return res 14 | 15 | def write_submission_file(file_name, refid, predict): 16 | print 'Writing submission file...' 17 | f = open(file_name, 'w') 18 | writer = csv.writer(f) 19 | for i in range(len(refid)): 20 | writer.writerow([refid[i], predict[i][1]]) 21 | f.close() 22 | 23 | if __name__ == '__main__': 24 | pass 25 | -------------------------------------------------------------------------------- /music_rating/music_rating.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | import math  # added: used by get_std() and rmse() below; star-importing numpy does not reliably provide math 4 | from numpy import * 5 | from sklearn import preprocessing 6 | from sklearn import cross_validation, linear_model 7 | 8 | __train__ = './data/train.csv' 9 | __test__ = './data/test.csv' 10 | __users__ = './data/users.csv' 11 | __words__ = './data/words.csv' 12 | 13 | __music_clf_map__ = {} 14 | 15 | def read_data(path, ignore_header=True, max_line=-1): 16 | """ Reads data from file.
""" 17 | csv_file_object = csv.reader(open(path, 'rb')) 18 | if ignore_header: 19 | header = csv_file_object.next() 20 | x = [] 21 | for row in csv_file_object: 22 | if max_line >= 0 and len(row) >= max_line: 23 | break 24 | x.append(row) 25 | return x 26 | 27 | def generate_user_features(): 28 | """ Generate user features from users.csv. 29 | Features include sex, age, questions.""" 30 | profiles = read_data(__users__) 31 | indices_features = [0, 1, 2] + range(8, 27) 32 | user_map = {} 33 | for row in profiles: 34 | features = [] 35 | for index in indices_features: 36 | features.append(row[index]) 37 | if features[1] == 'Male': 38 | features[1] = 0 39 | else: 40 | features[1] = 1 41 | user_map[features[0]] = features[1 : :] 42 | return user_map 43 | 44 | def get_mean(user_map, num_features): 45 | mean = [0.0] * num_features 46 | count = [0] * num_features 47 | for key in user_map.keys(): 48 | features = user_map[key] 49 | for i in range(num_features): 50 | if features[i] != '': 51 | mean[i] += float(features[i]) 52 | count[i] += 1 53 | for i in range(num_features): 54 | mean[i] /= count[i] 55 | return mean 56 | 57 | def get_std(user_map, num_features, mean): 58 | std = [0.0] * num_features 59 | count = [0] * num_features 60 | for key in user_map.keys(): 61 | features = user_map[key] 62 | for i in range(num_features): 63 | if features[i] != '': 64 | std[i] += (float(features[i]) - mean[i]) ** 2 65 | count[i] += 1 66 | for i in range(num_features): 67 | std[i] = math.sqrt(std[i] / count[i]) 68 | return std 69 | 70 | def preprocess_feature(user_map): 71 | """ Fills empty features with averages and scales the data.""" 72 | num_features = 21 73 | mean = get_mean(user_map, num_features) 74 | std = get_std(user_map, num_features, mean) 75 | # Scaling. 76 | for key in user_map.keys(): 77 | features = user_map[key] 78 | for i in range(len(features)): 79 | if features[i] == '': 80 | features[i] = 0.0 81 | else: 82 | features[i] = (float(features[i]) - mean[i]) / std[i] 83 | 84 | def extract_rating(data, artist): 85 | """ Extracts all the data includes rating, track, user etc. given 86 | an artist id. """ 87 | ratings = [] 88 | for row in data: 89 | if row[0] == artist: 90 | ratings.append(row) 91 | return ratings 92 | 93 | def generate_train_set(user_map, ratings, artist_user_pref): 94 | """ Generates training set based on all ratings of a particular artist, 95 | features combine both user profile and features from word.csv. """ 96 | x = [] 97 | y = [] 98 | cnt = 0 99 | for row in ratings: 100 | if user_map.has_key(row[2]): 101 | artist_user = (row[0], row[2]) 102 | if artist_user_pref.has_key(artist_user): 103 | y.append(row[3]) 104 | x.append(user_map[row[2]] + artist_user_pref[artist_user]) 105 | else: 106 | cnt += 1 107 | print cnt 108 | x = array(x, float64) 109 | y = array(y, float64) 110 | return (x, y) 111 | 112 | def rmse(real_value, predict_value): 113 | """ Calculating RMSE error. """ 114 | rmse = 0.0 115 | for i in range(real_value.shape[0]): 116 | rmse += (real_value[i] - predict_value[i]) ** 2 117 | rmse = math.sqrt(rmse / real_value.shape[0]) 118 | return rmse 119 | 120 | def generate_music_clf_map(data, artist_user_pref): 121 | """ Generates classifiers for each artist. 
""" 122 | for row in data: 123 | artist = row[0] 124 | if __music_clf_map__.has_key(artist): 125 | continue 126 | ratings = extract_rating(data, artist) 127 | x, y = generate_train_set(user_map, ratings, artist_user_pref) 128 | clf = linear_model.Lasso(alpha=.5) 129 | clf.fit(x, y) 130 | __music_clf_map__[artist] = clf 131 | print 'RMSE for %s: %f' % (artist, rmse(y, clf.predict(x))) 132 | 133 | def generate_artist_user_pref(): 134 | """ Generates features for each (artist, user) pair from word.csv. """ 135 | words = read_data(__words__) 136 | artist_user_pref = {} 137 | for row in words: 138 | artist_user = (row[0], row[1]) 139 | pref = row[4 : :] 140 | for i in range(len(pref)): 141 | if pref[i] == '': 142 | pref[i] = 0.0 143 | else: 144 | pref[i] = float(pref[i]) 145 | if len(pref) == 82: 146 | pref.append(0) 147 | artist_user_pref[artist_user] = pref 148 | return artist_user_pref 149 | 150 | def generate_artist_mean(data): 151 | """ Calculate average rating for each artist. """ 152 | artist_mean = {} 153 | artist_rate = {} 154 | for row in data: 155 | artist = row[0] 156 | rate = row[3] 157 | if artist_rate.has_key(artist): 158 | artist_rate[artist].append(float(rate)) 159 | else: 160 | artist_rate[artist] = [float(rate)] 161 | for key in artist_rate.keys(): 162 | artist_mean[key] = sum(artist_rate[key]) / len(artist_rate[key]) 163 | return artist_mean 164 | 165 | if __name__ == '__main__': 166 | print 'Generating user features...' 167 | user_map = generate_user_features() 168 | preprocess_feature(user_map) 169 | data = read_data(__train__) 170 | artist_mean = generate_artist_mean(data) 171 | artist_user_pref = generate_artist_user_pref() 172 | 173 | print 'Generating classifiers for each artist...' 174 | generate_music_clf_map(data, artist_user_pref) 175 | test_data = read_data(__test__) 176 | p_test = [] 177 | for row in test_data: 178 | miss = False 179 | feature = None 180 | artist = row[0] 181 | uid = row[2] 182 | clf = __music_clf_map__[artist] 183 | if user_map.has_key(uid): 184 | feature = list(user_map[uid]) 185 | if artist_user_pref.has_key((artist, uid)): 186 | feature += artist_user_pref[(artist, uid)] 187 | else: 188 | miss = True 189 | else: 190 | miss = True 191 | if not miss: 192 | p_test.append(clf.predict(feature)) 193 | else: 194 | # Uses average ratings when user cannot be found. 195 | p_test.append(artist_mean[artist]) 196 | 197 | open_file_object = csv.writer(open("./data/result.csv", "wb")) 198 | for p in p_test: 199 | open_file_object.writerow([p]) 200 | -------------------------------------------------------------------------------- /music_rating/readme: -------------------------------------------------------------------------------- 1 | The task is to predict the rating a user will give to a song (https://www.kaggle.com/c/MusicHackathon). 2 | The interesting part is that this problem provides us with tremendous amount of data, including users's rating, profile, preferences etc.. And they are in various format, ratings, words, binary... So the big challange here is how to select features, which turns out to be the key to this problem. 3 | The basic idea of my approach is to create models for each artist (rather than each artist, track pair). For a particular artist, we extract all its ratings from train.csv, and the features for each user we create from both users.csv and words.csv. I first extract features from users.csv (the file contains users' profiles) for each user, the feature includes age, sex, and the answer for their habbit questions. 
Then, from words.csv (a user survey), I use the word scores this user gave for the artist as additional features. Basically I combine these two feature sets and use Lasso regression (L1 regularization) to build the model. 4 | Due to time constraints I did not fully optimize the algorithm, and a lot of work remains to be done. I finally got an RMSE of 16.68; the leader got 13.24. 5 | -------------------------------------------------------------------------------- /photo_quality_prediction/classification.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from numpy import * 4 | from sklearn import cross_validation 5 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 6 | from sklearn.ensemble import RandomForestClassifier 7 | from sklearn import linear_model 8 | from sklearn.neighbors import KNeighborsClassifier 9 | 10 | def prepare_data(x, y, size=0.3, state=0): 11 | """ Divides data into training set and cross validation set. """ 12 | 13 | x_train, x_cv, y_train, y_cv = cross_validation.train_test_split( 14 | x, y, test_size=size, random_state=state) 15 | 16 | return (x_train, y_train, x_cv, y_cv) 17 | 18 | def knn(x_train, y_train, x_cv, y_cv, k=3): 19 | """ Using KNN to classify the data. """ 20 | 21 | print 'Training with KNN...' 22 | clf = KNeighborsClassifier(n_neighbors=k) 23 | clf.fit(x_train, y_train) 24 | 25 | print 'Accuracy in training set: %f' % clf.score(x_train, y_train) 26 | print 'Accuracy in cv set: %f' % clf.score(x_cv, y_cv) 27 | return clf 28 | 29 | def bernoulli_naive_bayes(x_train, y_train, x_cv, y_cv): 30 | """ Using Bernoulli Naive Bayes to classify the data. """ 31 | 32 | print 'Training with NB...' 33 | clf = BernoulliNB() 34 | clf.fit(x_train, y_train) 35 | 36 | print 'Accuracy in training set: %f' % clf.score(x_train, y_train) 37 | print 'Accuracy in cv set: %f' % clf.score(x_cv, y_cv) 38 | return clf 39 | 40 | def naive_bayes(x_train, y_train, x_cv, y_cv): 41 | """ Using Multinomial Naive Bayes to classify the data. """ 42 | 43 | print 'Training with NB...' 44 | clf = MultinomialNB() 45 | clf.fit(x_train, y_train) 46 | 47 | print 'Accuracy in training set: %f' % clf.score(x_train, y_train) 48 | print 'Accuracy in cv set: %f' % clf.score(x_cv, y_cv) 49 | return clf 50 | 51 | def random_forest(x_train, y_train, x_cv, y_cv): 52 | """ Using Random Forest to classify the data. """ 53 | 54 | print 'Training with RF...' 55 | clf = RandomForestClassifier(n_estimators=2000, max_features=2) 56 | clf.fit(x_train, y_train) 57 | 58 | print 'Predicting...' 59 | print 'Accuracy in training set: %f' % clf.score(x_train, y_train) 60 | if y_cv is not None: 61 | print 'Accuracy in cv set: %f' % clf.score(x_cv, y_cv) 62 | return clf 63 | 64 | def logistic_regression(x_train, y_train, x_cv, y_cv): 65 | """ Using Logistic Regression to classify the data. """ 66 | 67 | print 'Training with LR...' 68 | clf = linear_model.LogisticRegression(penalty='l2', C=.03) 69 | clf.fit(x_train, y_train) 70 | 71 | print 'Accuracy in training set: %f' % clf.score(x_train, y_train) 72 | if y_cv is not None: 73 | print 'Accuracy in cv set: %f' % clf.score(x_cv, y_cv) 74 | return clf 75 | 76 | def get_prob(clf, x): 77 | """ Gets the probability of being good.
""" 78 | 79 | prob = array(clf.predict_proba(x)) 80 | return prob[:, 1] 81 | 82 | if __name__ == '__main__': 83 | pass 84 | -------------------------------------------------------------------------------- /photo_quality_prediction/data/statistics: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FindBoat/Kaggle/870cb9884d67725a8fe9bad90fceb6ac286f9fe1/photo_quality_prediction/data/statistics -------------------------------------------------------------------------------- /photo_quality_prediction/feature_selection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from numpy import * 3 | import utilities 4 | import classification 5 | 6 | def generate_features(meta_data_train, y_train, meta_data_test): 7 | """ Generates features for classifier. """ 8 | 9 | # Generate maps. 10 | name_score_map, desc_score_map, caption_score_map, word_score_map = \ 11 | generate_text_score_map(meta_data_train, y_train) 12 | geo_score_map, lat_score_map, lon_score_map = generate_geo_score_map( 13 | meta_data_train, y_train) 14 | shape_score_map, size_score_map, width_score_map, height_score_map = \ 15 | generate_size_score_map(meta_data_train, y_train) 16 | 17 | # Genearte text features. 18 | text_features_train = generate_text_features(meta_data_train, 19 | name_score_map, desc_score_map, caption_score_map, word_score_map) 20 | text_features_test = generate_text_features(meta_data_test, 21 | name_score_map, desc_score_map, caption_score_map, word_score_map) 22 | 23 | # Generates geo features. 24 | geo_features_train = generate_geo_features(meta_data_train, geo_score_map, 25 | lat_score_map, lon_score_map) 26 | geo_features_test = generate_geo_features(meta_data_test, geo_score_map, 27 | lat_score_map, lon_score_map) 28 | 29 | # Generates size features 30 | size_features_train = generate_size_features(meta_data_train, 31 | shape_score_map, size_score_map, width_score_map, height_score_map) 32 | size_features_test = generate_size_features(meta_data_test, 33 | shape_score_map, size_score_map, width_score_map, height_score_map) 34 | 35 | # Combines all features. 36 | x_train = [] 37 | for i in range(len(text_features_train)): 38 | x_train.append(text_features_train[i] + size_features_train[i] \ 39 | + geo_features_train[i]) 40 | 41 | x_test = [] 42 | for i in range(len(text_features_test)): 43 | x_test.append(text_features_test[i] + size_features_test[i] \ 44 | + geo_features_test[i]) 45 | 46 | return (x_train, x_test) 47 | 48 | def generate_geo_features(meta_data, geo_score_map, lat_score_map, 49 | lon_score_map): 50 | """ Generates features for geo information. """ 51 | 52 | geo_avg_score = get_map_avg(geo_score_map) 53 | lat_avg_score = get_map_avg(lat_score_map) 54 | lon_avg_score = get_map_avg(lon_score_map) 55 | 56 | geo_score_features = [] 57 | for line in meta_data: 58 | lat = line[0] 59 | lon = line[1] 60 | geo = (lat, lon) 61 | 62 | geo_score = geo_avg_score 63 | if geo in geo_score_map: 64 | geo_score = geo_score_map[geo] 65 | 66 | lat_score = lat_avg_score 67 | if lat in lat_score_map: 68 | lat_score = lat_score_map[lat] 69 | 70 | lon_score = lon_avg_score 71 | if lon in lon_score_map: 72 | lon_score = lon_score_map[lon] 73 | 74 | geo_score_features.append([geo_score, lat_score, lon_score]) 75 | return geo_score_features 76 | 77 | def generate_geo_score_map(meta_data, y): 78 | """ Generates score map for geo information. """ 79 | 80 | print 'Extracting geo features...' 
81 | geo_score_pairs = [] 82 | lat_score_pairs = [] 83 | lon_score_pairs = [] 84 | for i in range(len(y)): 85 | lat = meta_data[i][0] 86 | lon = meta_data[i][1] 87 | geo = (lat, lon) 88 | 89 | geo_score_pairs.append((geo, y[i])) 90 | lat_score_pairs.append((lat, y[i])) 91 | lon_score_pairs.append((lon, y[i])) 92 | 93 | geo_score_map = create_key_avg_map(geo_score_pairs) 94 | lat_score_map = create_key_avg_map(lat_score_pairs) 95 | lon_score_map = create_key_avg_map(lon_score_pairs) 96 | return (geo_score_map, lat_score_map, lon_score_map) 97 | 98 | def generate_size_features(meta_data, shape_score_map, size_score_map, 99 | width_score_map, height_score_map): 100 | """ Generates features for shape, size. """ 101 | 102 | avg_shape_score = get_map_avg(shape_score_map) 103 | avg_size_score = get_map_avg(size_score_map) 104 | avg_width_score = get_map_avg(width_score_map) 105 | avg_height_score = get_map_avg(height_score_map) 106 | 107 | size_score_features = [] 108 | for line in meta_data: 109 | width = line[2] 110 | height = line[3] 111 | shape = (width, height) 112 | size = line[4] 113 | 114 | shape_score = avg_shape_score 115 | if shape in shape_score_map: 116 | shape_score = shape_score_map[shape] 117 | 118 | size_score = avg_size_score 119 | if size in size_score_map: 120 | size_score = size_score_map[size] 121 | 122 | width_score = avg_width_score 123 | if width in width_score_map: 124 | width_score = width_score_map[width] 125 | 126 | height_score = avg_height_score 127 | if height in height_score_map: 128 | height_score = height_score_map[height] 129 | 130 | size_score_features.append( 131 | [shape_score, size_score, width_score, height_score]) 132 | return size_score_features 133 | 134 | def generate_size_score_map(meta_data, y): 135 | """ Generates score map for width, heigth, size. """ 136 | 137 | print 'Extracting size features...' 138 | shape_score_pairs = [] 139 | size_score_pairs = [] 140 | width_score_pairs = [] 141 | height_score_pairs = [] 142 | for i in range(len(y)): 143 | width = meta_data[i][2] 144 | height = meta_data[i][3] 145 | shape = (width, height) 146 | size = meta_data[i][4] 147 | 148 | shape_score_pairs.append((shape, y[i])) 149 | size_score_pairs.append((size, y[i])) 150 | width_score_pairs.append((width, y[i])) 151 | height_score_pairs.append((height, y[i])) 152 | 153 | shape_score_map = create_key_avg_map(shape_score_pairs) 154 | size_score_map = create_key_avg_map(size_score_pairs) 155 | width_score_map = create_key_avg_map(width_score_pairs) 156 | height_score_map = create_key_avg_map(height_score_pairs) 157 | return (shape_score_map, size_score_map, width_score_map, height_score_map) 158 | 159 | def generate_text_features(meta_data, name_score_map, desc_score_map, 160 | caption_score_map, word_score_map): 161 | """ Generates features from name, desc, caption. 
""" 162 | 163 | avg_name_score = get_map_avg(name_score_map) 164 | avg_desc_score = get_map_avg(desc_score_map) 165 | avg_caption_score = get_map_avg(caption_score_map) 166 | 167 | text_score_features = [] 168 | for i in range(len(meta_data)): 169 | name = meta_data[i][5].split(' ') 170 | desc = meta_data[i][6].split(' ') 171 | caption = meta_data[i][7].split(' ') 172 | 173 | name_scores = [] 174 | for s in name: 175 | if s in name_score_map: 176 | name_scores.append(name_score_map[s]) 177 | elif s in word_score_map: 178 | name_scores.append(word_score_map[s]) 179 | else: 180 | name_scores.append(avg_name_score) 181 | 182 | desc_scores = [] 183 | for s in desc: 184 | if s in desc_score_map: 185 | desc_scores.append(desc_score_map[s]) 186 | elif s in word_score_map: 187 | desc_scores.append(word_score_map[s]) 188 | else: 189 | desc_scores.append(avg_desc_score) 190 | 191 | caption_scores = [] 192 | for s in caption: 193 | if s in caption_score_map: 194 | caption_scores.append(caption_score_map[s]) 195 | elif s in word_score_map: 196 | caption_scores.append(word_score_map[s]) 197 | else: 198 | caption_scores.append(avg_caption_score) 199 | 200 | # Generates features. 201 | name_avg_score = float(sum(name_scores)) / len(name_scores) 202 | desc_avg_score = float(sum(desc_scores)) / len(desc_scores) 203 | caption_avg_score = float(sum(caption_scores)) / len(caption_scores) 204 | 205 | all_scores = name_scores + desc_scores + caption_scores 206 | total_avg_score = float(sum(all_scores)) / len(all_scores) 207 | 208 | name_std = std(name_scores, name_avg_score) 209 | desc_std = std(desc_scores, desc_avg_score) 210 | caption_std = std(caption_scores, caption_avg_score) 211 | total_std = std(all_scores, total_avg_score) 212 | 213 | name_len = 0 214 | if name[0] != '': 215 | name_len = len(name) 216 | desc_len = 0 217 | if desc[0] != '': 218 | desc_len = len(desc) 219 | caption_len = 0 220 | if caption[0] != '': 221 | caption_len = len(caption) 222 | 223 | text_score_features.append([name_avg_score, desc_avg_score, 224 | caption_avg_score, total_avg_score, name_len, desc_len, 225 | caption_len, name_std, desc_std, caption_std, total_std]) 226 | return text_score_features 227 | 228 | def generate_text_score_map(meta_data, y): 229 | """ Generates the text score map for text features. """ 230 | 231 | print 'Extracting text features...' 232 | name_y_pairs = [] 233 | desc_y_pairs = [] 234 | caption_y_pairs = [] 235 | for i in range(len(y)): 236 | name = meta_data[i][5].split(' ') 237 | desc = meta_data[i][6].split(' ') 238 | caption = meta_data[i][7].split(' ') 239 | 240 | for s in name: 241 | name_y_pairs.append((s, y[i])) 242 | for s in desc: 243 | desc_y_pairs.append((s, y[i])) 244 | for s in caption: 245 | caption_y_pairs.append((s, y[i])) 246 | word_y_pairs = name_y_pairs + desc_y_pairs + caption_y_pairs 247 | 248 | name_score_map = create_key_avg_map(name_y_pairs) 249 | desc_score_map = create_key_avg_map(desc_y_pairs) 250 | caption_score_map = create_key_avg_map(caption_y_pairs) 251 | word_score_map = create_key_avg_map(word_y_pairs) 252 | return (name_score_map, desc_score_map, caption_score_map, word_score_map) 253 | 254 | def std(iterable, avg): 255 | """ Calculate the standard deviation. """ 256 | 257 | std = 0.0 258 | for n in iterable: 259 | std += (n - avg) ** 2 260 | return math.sqrt(std) 261 | 262 | def get_map_avg(k_v_map): 263 | """ Calculates the average value of a map. 
""" 264 | 265 | avg = 0.0 266 | for key in k_v_map.keys(): 267 | avg += k_v_map[key] 268 | return float(avg) / len(k_v_map) 269 | 270 | def create_key_avg_map(k_v_pairs): 271 | """ Creates a map which maps a key to its average value. """ 272 | 273 | key_avg_map = {} 274 | for pair in k_v_pairs: 275 | k = pair[0] 276 | v = pair[1] 277 | if k not in key_avg_map: 278 | key_avg_map[k] = [v, 1] 279 | else: 280 | key_avg_map[k][0] += v 281 | key_avg_map[k][1] += 1 282 | 283 | for key in key_avg_map.keys(): 284 | key_avg_map[key] = float(key_avg_map[key][0]) / key_avg_map[key][1] 285 | 286 | return key_avg_map 287 | 288 | if __name__ == '__main__': 289 | pass 290 | -------------------------------------------------------------------------------- /photo_quality_prediction/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import utilities 4 | import classification 5 | import feature_selection 6 | 7 | def all_feature_classify(training_file, num): 8 | """ Classifier using all features. """ 9 | 10 | y, meta_data = utilities.read_training_file(training_file) 11 | y, meta_data = utilities.sample(y, meta_data, num) 12 | 13 | meta_data_train, y_train, meta_data_cv, y_cv = \ 14 | classification.prepare_data(meta_data, y) 15 | 16 | x_train, x_cv = feature_selection.generate_features(meta_data_train, 17 | y_train, meta_data_cv) 18 | 19 | clf = classification.random_forest(x_train, y_train, x_cv, y_cv) 20 | print utilities.binomial_deviance(y_train, 21 | classification.get_prob(clf, x_train)) 22 | print utilities.binomial_deviance(y_cv, classification.get_prob(clf, x_cv)) 23 | 24 | def spring_brother(training_file, test_file, submission_file): 25 | """ Running on the test file. """ 26 | 27 | y, meta_data = utilities.read_training_file(training_file) 28 | ids, meta_data_test = utilities.read_test_file(test_file) 29 | 30 | x_train, x_test = feature_selection.generate_features(meta_data, 31 | y, meta_data_test) 32 | 33 | clf = classification.random_forest(x_train, y, None, None) 34 | 35 | p = classification.get_prob(clf, x_test) 36 | utilities.write_submission_file(submission_file, ids, p) 37 | 38 | if __name__ == '__main__': 39 | start_time = time.time() 40 | 41 | spring_brother('./data/training.csv', 42 | './data/test.csv', 43 | './data/result.csv') 44 | 45 | # all_feature_classify('./data/training.csv', 40000) 46 | 47 | print (time.time() - start_time) / 60.0, 'minutes' 48 | -------------------------------------------------------------------------------- /photo_quality_prediction/readme: -------------------------------------------------------------------------------- 1 | This is the code for Kaggle competition Photo Qaulity Prediction (http://www.kaggle.com/c/PhotoQualityPrediction). 2 | The problem is to predict whether a given photo is of good quality or not based on its meta data rather than the image file. The meta data contains: location of this photo(latitude, longitude), width, heigth, size, and the name, description and caption. 3 | The approach is based on Random Forest. The key is choosing features from the meta data. Since the name, description and caption usually have few words, text classification method does not get very good result (Naive Bayes with tf-idf only gets around 0.22). Features finally I chose include: avg score of locations, avg score of shape and size, avg score for name, description and caption based on the score of each word etc.. 
Random Forest with max_features = 2 turned out to be the most effective. 4 | In the end the approach got a binomial deviance of 0.19131 (ranking 28th/200); the leader got 0.18434. 5 | -------------------------------------------------------------------------------- /photo_quality_prediction/utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | import math  # added: math.log10 is used in binomial_deviance() below 4 | from numpy import * 5 | def write_submission_file(file_name, ids, p): 6 | """ Writes the submission file. """ 7 | print 'Writing submission file...' 8 | f = open(file_name, 'w') 9 | writer = csv.writer(f) 10 | for i in range(len(p)): 11 | writer.writerow([ids[i], p[i]]) 12 | f.close() 13 | 14 | def write_file(file_name, data, single=True): 15 | """ Writes data to the file. """ 16 | 17 | print 'Writing output file...' 18 | f = open(file_name, 'w') 19 | writer = csv.writer(f) 20 | for line in data: 21 | if single: 22 | writer.writerow([line]) 23 | else: 24 | writer.writerow(line) 25 | f.close() 26 | 27 | def read_file(file_name, single=True): 28 | """ Reads file. """ 29 | 30 | print 'Reading file...' 31 | f = open(file_name) 32 | reader = csv.reader(f) 33 | res = [] 34 | for line in reader: 35 | if single: 36 | res.append(line[0]) 37 | else: 38 | res.append(line) 39 | 40 | f.close() 41 | return res 42 | 43 | def read_test_file(file_name): 44 | print 'Reading test file...' 45 | f = open(file_name) 46 | reader = csv.reader(f) 47 | reader.next() 48 | 49 | ids = [] 50 | meta_data = [] 51 | for line in reader: 52 | latitude = int(line[1]) 53 | longitude = int(line[2]) 54 | width = int(line[3]) 55 | height = int(line[4]) 56 | size = int(line[5]) 57 | name = line[6] 58 | description = line[7] 59 | caption = line[8] 60 | 61 | ids.append(line[0]) 62 | meta_data.append([latitude, longitude, width, height, size, name, 63 | description, caption]) 64 | 65 | f.close() 66 | return (ids, meta_data) 67 | 68 | def read_training_file(file_name): 69 | """ Reads training file and generates data. """ 70 | 71 | print 'Reading training file...' 72 | f = open(file_name) 73 | reader = csv.reader(f) 74 | reader.next() 75 | 76 | y = [] 77 | meta_data = [] 78 | for line in reader: 79 | latitude = int(line[1]) 80 | longitude = int(line[2]) 81 | width = int(line[3]) 82 | height = int(line[4]) 83 | size = int(line[5]) 84 | name = line[6] 85 | description = line[7] 86 | caption = line[8] 87 | good = int(line[9]) 88 | 89 | y.append(good) 90 | meta_data.append([latitude, longitude, width, height, size, name, 91 | description, caption]) 92 | 93 | f.close() 94 | return (y, meta_data) 95 | 96 | def sample(y, meta_data, num, randomly=True): 97 | """ Randomly samples num data from the whole data set. """ 98 | 99 | if num == -1: 100 | num = len(y) 101 | y_sample = [] 102 | meta_data_sample = [] 103 | perm = range(len(y)) 104 | if randomly: 105 | perm = random.permutation(len(y)) 106 | perm = perm[0 : min(num, len(y))] 107 | for index in perm: 108 | y_sample.append(y[index]) 109 | meta_data_sample.append(meta_data[index]) 110 | return (y_sample, meta_data_sample) 111 | 112 | def binomial_deviance(y, prediction): 113 | """ Calculates the binomial deviance for the prediction.
""" 114 | 115 | binomial_deviance = 0.0 116 | for i in range(len(prediction)): 117 | if prediction[i] > .99: 118 | prediction[i] = .99 119 | elif prediction[i] < .1: 120 | prediction[i] = .1 121 | tmp = y[i] * math.log10(prediction[i]) 122 | tmp += (1 - y[i]) * math.log10(1 - prediction[i]) 123 | binomial_deviance -= tmp 124 | binomial_deviance /= float(len(prediction)) 125 | return binomial_deviance 126 | 127 | if __name__ == '__main__': 128 | pass 129 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | This repository contains codes for several competitions in Kaggle. 2 | 3 | **If you like this repo, you can tweet me @sgzhaohang, why thank you!** 4 | -------------------------------------------------------------------------------- /titanic/data/data.csv: -------------------------------------------------------------------------------- 1 | 0,3,0,22,7.25 2 | 1,1,1,38,71.2833 3 | 1,3,1,26,7.925 4 | 1,1,1,35,53.1 5 | 0,3,0,35,8.05 6 | 0,1,0,54,51.8625 7 | 0,3,0,2,21.075 8 | 1,3,1,27,11.1333 9 | 1,2,1,14,30.0708 10 | 1,3,1,4,16.7 11 | 1,1,1,58,26.55 12 | 0,3,0,20,8.05 13 | 0,3,0,39,31.275 14 | 0,3,1,14,7.8542 15 | 1,2,1,55,16 16 | 0,3,0,2,29.125 17 | 0,3,1,31,18 18 | 0,2,0,35,26 19 | 1,2,0,34,13 20 | 1,3,1,15,8.0292 21 | 1,1,0,28,35.5 22 | 0,3,1,8,21.075 23 | 1,3,1,38,31.3875 24 | 0,1,0,19,263 25 | 0,1,0,40,27.7208 26 | 0,2,0,66,10.5 27 | 0,1,0,28,82.1708 28 | 0,1,0,42,52 29 | 0,3,0,21,8.05 30 | 0,3,1,18,18 31 | 1,3,1,14,11.2417 32 | 0,3,1,40,9.475 33 | 0,2,1,27,21 34 | 1,2,1,3,41.5792 35 | 1,3,1,19,7.8792 36 | 0,3,1,18,17.8 37 | 0,3,0,7,39.6875 38 | 0,3,0,21,7.8 39 | 1,1,1,49,76.7292 40 | 1,2,1,29,26 41 | 0,1,0,65,61.9792 42 | 1,2,1,21,10.5 43 | 0,3,0,28.5,7.2292 44 | 1,2,1,5,27.75 45 | 0,3,0,11,46.9 46 | 0,3,0,22,7.2292 47 | 1,1,1,38,80 48 | 0,1,0,45,83.475 49 | 0,3,0,4,27.9 50 | 1,2,1,29,10.5 51 | 0,3,0,19,8.1583 52 | 1,3,1,17,7.925 53 | 0,3,0,26,8.6625 54 | 0,2,0,32,10.5 55 | 0,3,1,16,46.9 56 | 0,2,0,21,73.5 57 | 0,3,0,26,14.4542 58 | 1,3,0,32,56.4958 59 | 0,3,0,25,7.65 60 | 1,2,0,0.83,29 61 | 1,3,1,30,12.475 62 | 0,3,0,22,9 63 | 1,3,0,29,9.5 64 | 0,1,0,28,47.1 65 | 1,2,1,17,10.5 66 | 1,3,1,33,15.85 67 | 0,3,0,16,34.375 68 | 1,1,1,23,263 69 | 0,3,0,24,8.05 70 | 0,3,0,29,8.05 71 | 0,3,0,20,7.8542 72 | 0,1,0,46,61.175 73 | 0,3,0,26,20.575 74 | 0,3,0,59,7.25 75 | 0,1,0,71,34.6542 76 | 1,1,0,23,63.3583 77 | 1,2,1,34,23 78 | 0,2,0,34,26 79 | 0,3,1,28,7.8958 80 | 0,1,0,21,77.2875 81 | 0,3,0,33,8.6542 82 | 0,3,0,37,7.925 83 | 0,3,0,28,7.8958 84 | 1,3,1,21,7.65 85 | 0,3,0,38,7.8958 86 | 0,1,0,47,52 87 | 0,3,1,14.5,14.4542 88 | 0,3,0,22,8.05 89 | 0,3,1,20,9.825 90 | 0,3,1,17,14.4583 91 | 0,3,0,21,7.925 92 | 0,3,0,70.5,7.75 93 | 0,2,0,29,21 94 | 0,1,0,24,247.5208 95 | 0,3,1,2,31.275 96 | 0,2,0,21,73.5 97 | 0,2,0,32.5,30.0708 98 | 1,2,1,32.5,13 99 | 0,1,0,54,77.2875 100 | 1,3,0,12,11.2417 101 | 1,3,0,24,7.1417 102 | 0,3,0,45,6.975 103 | 0,3,0,33,7.8958 104 | 0,3,0,20,7.05 105 | 0,3,1,47,14.5 106 | 1,2,1,29,26 107 | 0,2,0,25,13 108 | 0,2,0,23,15.0458 109 | 1,1,1,19,26.2833 110 | 0,1,0,37,53.1 111 | 0,3,0,16,9.2167 112 | 0,1,0,24,79.2 113 | 1,3,1,22,7.75 114 | 1,3,1,24,15.85 115 | 0,3,0,19,6.75 116 | 0,2,0,18,11.5 117 | 0,2,0,19,36.75 118 | 1,3,0,27,7.7958 119 | 0,3,1,9,34.375 120 | 0,2,0,36.5,26 121 | 0,2,0,42,13 122 | 0,2,0,51,12.525 123 | 1,1,1,22,66.6 124 | 0,3,0,55.5,8.05 125 | 0,3,0,40.5,14.5 126 | 0,1,0,51,61.3792 127 | 1,3,1,16,7.7333 128 | 0,3,0,30,8.05 129 | 
0,3,0,44,16.1 130 | 1,2,1,40,15.75 131 | 0,3,0,26,7.775 132 | 0,3,0,17,8.6625 133 | 0,3,0,1,39.6875 134 | 1,3,0,9,20.525 135 | 0,3,1,45,27.9 136 | 0,3,0,28,56.4958 137 | 0,1,0,61,33.5 138 | 0,3,0,4,29.125 139 | 1,3,1,1,11.1333 140 | 0,3,0,21,7.925 141 | 0,1,0,56,30.6958 142 | 0,3,0,18,7.8542 143 | 0,1,1,50,28.7125 144 | 0,2,0,30,13 145 | 0,3,0,36,0 146 | 0,3,0,9,31.3875 147 | 1,2,0,1,39 148 | 1,3,1,4,22.025 149 | 1,1,0,45,26.55 150 | 0,3,0,40,15.5 151 | 0,3,0,36,7.8958 152 | 1,2,1,32,13 153 | 0,2,0,19,13 154 | 1,3,1,19,7.8542 155 | 1,2,0,3,26 156 | 1,1,1,44,27.7208 157 | 1,1,1,58,146.5208 158 | 0,3,0,42,8.4042 159 | 0,2,1,24,13 160 | 0,3,0,28,9.5 161 | 0,3,0,34,6.4958 162 | 0,3,0,45.5,7.225 163 | 1,3,0,18,8.05 164 | 0,3,1,2,10.4625 165 | 0,3,0,32,15.85 166 | 1,3,0,26,18.7875 167 | 1,3,1,16,7.75 168 | 1,1,0,40,31 169 | 0,3,0,24,7.05 170 | 1,2,1,35,21 171 | 0,3,0,22,7.25 172 | 0,2,0,30,13 173 | 1,1,1,31,113.275 174 | 1,3,1,27,7.925 175 | 0,2,0,42,27 176 | 1,1,1,32,76.2917 177 | 0,2,0,30,10.5 178 | 1,3,0,16,8.05 179 | 0,2,0,27,13 180 | 0,3,0,51,8.05 181 | 1,1,0,38,90 182 | 0,3,0,22,9.35 183 | 1,2,0,19,10.5 184 | 0,3,0,20.5,7.25 185 | 0,2,0,18,13 186 | 1,1,1,35,83.475 187 | 0,3,0,29,7.775 188 | 0,2,0,59,13.5 189 | 1,3,1,5,31.3875 190 | 0,2,0,24,10.5 191 | 0,2,0,44,26 192 | 1,2,1,8,26.25 193 | 0,2,0,19,10.5 194 | 0,2,0,33,12.275 195 | 0,2,0,29,10.5 196 | 0,3,0,22,7.125 197 | 0,3,0,30,7.225 198 | 0,1,0,44,90 199 | 0,3,1,25,7.775 200 | 1,2,1,24,14.5 201 | 1,1,0,37,52.5542 202 | 0,2,0,54,26 203 | 0,3,1,29,10.4625 204 | 0,1,0,62,26.55 205 | 0,3,0,30,16.1 206 | 0,3,1,41,20.2125 207 | 1,3,1,29,15.2458 208 | 1,1,1,30,86.5 209 | 1,1,1,35,512.3292 210 | 1,2,1,50,26 211 | 1,3,0,3,31.3875 212 | 0,1,0,52,79.65 213 | 0,1,0,40,0 214 | 0,2,0,36,10.5 215 | 0,3,0,16,39.6875 216 | 1,3,0,25,7.775 217 | 1,1,1,58,153.4625 218 | 1,1,1,35,135.6333 219 | 1,3,0,25,0 220 | 1,2,1,41,19.5 221 | 0,1,0,37,29.7 222 | 1,1,1,63,77.9583 223 | 0,3,1,45,7.75 224 | 0,3,0,7,29.125 225 | 1,3,1,35,20.25 226 | 0,3,0,65,7.75 227 | 0,3,0,28,7.8542 228 | 0,3,0,16,9.5 229 | 1,3,0,19,8.05 230 | 0,3,0,33,8.6625 231 | 1,3,0,30,9.5 232 | 0,3,0,22,7.8958 233 | 1,2,0,42,13 234 | 1,3,1,22,7.75 235 | 1,1,1,26,78.85 236 | 1,1,1,19,91.0792 237 | 0,2,0,36,12.875 238 | 0,3,1,24,8.85 239 | 0,3,0,24,7.8958 240 | 0,3,0,23.5,7.2292 241 | 0,1,1,2,151.55 242 | 1,1,1,50,247.5208 243 | 0,3,0,19,0 244 | 1,1,0,0.92,151.55 245 | 1,1,1,17,108.9 246 | 0,2,0,30,24 247 | 1,1,1,30,56.9292 248 | 1,1,1,24,83.1583 249 | 1,1,1,18,262.375 250 | 0,2,1,26,26 251 | 0,3,0,28,7.8958 252 | 0,2,0,43,26.25 253 | 1,3,1,26,7.8542 254 | 1,2,1,24,26 255 | 0,2,0,54,14 256 | 1,1,1,31,164.8667 257 | 1,1,1,40,134.5 258 | 0,3,0,22,7.25 259 | 0,3,0,27,7.8958 260 | 1,2,1,30,12.35 261 | 1,2,1,22,29 262 | 1,1,1,36,135.6333 263 | 0,3,0,61,6.2375 264 | 1,2,1,36,13 265 | 1,3,1,31,20.525 266 | 1,1,1,16,57.9792 267 | 0,1,0,45.5,28.5 268 | 0,1,0,38,153.4625 269 | 0,3,0,16,18 270 | 0,1,0,29,66.6 271 | 1,1,1,41,134.5 272 | 1,3,0,45,8.05 273 | 0,1,0,45,35.5 274 | 1,2,0,2,26 275 | 1,1,1,24,263 276 | 0,2,0,28,13 277 | 0,2,0,25,13 278 | 0,2,0,36,13 279 | 1,2,1,24,13 280 | 1,2,1,40,13 281 | 1,3,0,3,15.9 282 | 0,3,0,42,8.6625 283 | 0,3,0,23,9.225 284 | 0,3,0,15,7.2292 285 | 0,3,0,25,17.8 286 | 0,3,0,28,9.5 287 | 1,1,1,22,55 288 | 0,2,1,38,13 289 | 0,3,0,40,27.9 290 | 0,2,0,29,27.7208 291 | 0,3,1,45,14.4542 292 | 0,3,0,35,7.05 293 | 0,3,0,30,7.25 294 | 1,1,1,60,75.25 295 | 1,1,1,24,69.3 296 | 1,1,0,25,55.4417 297 | 0,3,0,18,6.4958 298 | 0,3,0,19,8.05 299 | 0,1,0,22,135.6333 300 | 0,3,1,3,21.075 301 | 
1,3,1,22,7.25 302 | 0,1,0,27,211.5 303 | 0,3,0,20,4.0125 304 | 0,3,0,19,7.775 305 | 1,1,1,42,227.525 306 | 1,3,1,1,15.7417 307 | 0,3,0,32,7.925 308 | 1,1,1,35,52 309 | 0,2,0,18,73.5 310 | 0,3,0,1,46.9 311 | 1,2,1,36,13 312 | 1,2,1,17,12 313 | 1,1,0,36,120 314 | 1,3,0,21,7.7958 315 | 0,3,0,28,7.925 316 | 1,1,1,23,113.275 317 | 1,3,1,24,16.7 318 | 0,3,0,22,7.7958 319 | 0,3,1,31,7.8542 320 | 0,2,0,46,26 321 | 0,2,0,23,10.5 322 | 1,2,1,28,12.65 323 | 1,3,0,39,7.925 324 | 0,3,0,26,8.05 325 | 0,3,1,21,9.825 326 | 0,3,0,28,15.85 327 | 0,3,1,20,8.6625 328 | 0,2,0,34,21 329 | 0,3,0,51,7.75 330 | 1,2,0,3,18.75 331 | 0,3,0,21,7.775 332 | 1,1,1,33,90 333 | 1,3,0,44,7.925 334 | 1,2,1,34,32.5 335 | 1,2,1,18,13 336 | 0,2,0,30,13 337 | 0,3,1,10,24.15 338 | 0,3,0,21,7.7333 339 | 0,3,0,29,7.875 340 | 0,3,1,28,14.4 341 | 0,3,0,18,20.2125 342 | 1,2,1,28,26 343 | 1,2,1,19,26 344 | 1,3,0,32,8.05 345 | 1,1,0,28,26.55 346 | 1,2,1,42,26 347 | 0,3,0,17,7.125 348 | 0,1,0,50,55.9 349 | 1,1,1,14,120 350 | 0,3,1,21,34.375 351 | 1,2,1,24,18.75 352 | 0,1,0,64,263 353 | 0,2,0,31,10.5 354 | 1,2,1,45,26.25 355 | 0,3,0,20,9.5 356 | 0,3,0,25,7.775 357 | 1,2,1,28,13 358 | 1,1,0,4,81.8583 359 | 1,2,1,13,19.5 360 | 1,1,0,34,26.55 361 | 1,3,1,5,19.2583 362 | 1,1,0,52,30.5 363 | 0,2,0,36,27.75 364 | 0,1,0,30,27.75 365 | 1,1,0,49,89.1042 366 | 1,3,0,29,7.8958 367 | 0,1,0,65,26.55 368 | 1,2,1,50,10.5 369 | 1,1,0,48,26.55 370 | 0,3,0,34,8.05 371 | 0,1,0,47,38.5 372 | 0,2,0,48,13 373 | 0,3,0,38,7.05 374 | 0,1,0,56,26.55 375 | 1,3,1,0.75,19.2583 376 | 0,3,0,38,8.6625 377 | 1,2,1,33,27.75 378 | 1,2,1,23,13.7917 379 | 0,3,1,22,9.8375 380 | 0,2,0,34,21 381 | 0,3,0,29,7.0458 382 | 0,3,0,22,7.5208 383 | 1,3,1,2,12.2875 384 | 0,3,0,9,46.9 385 | 0,3,0,50,8.05 386 | 1,3,1,63,9.5875 387 | 1,1,0,25,91.0792 388 | 1,1,1,35,90 389 | 0,1,0,58,29.7 390 | 0,3,0,30,8.05 391 | 1,3,0,9,15.9 392 | 0,3,0,21,7.25 393 | 0,1,0,55,30.5 394 | 0,1,0,71,49.5042 395 | 0,3,0,21,8.05 396 | 1,1,1,54,78.2667 397 | 0,1,1,25,151.55 398 | 0,3,0,24,7.7958 399 | 0,3,0,17,8.6625 400 | 0,3,1,21,7.75 401 | 0,3,1,37,9.5875 402 | 1,1,1,16,86.5 403 | 0,1,0,18,108.9 404 | 1,2,1,33,26 405 | 0,3,0,28,22.525 406 | 1,3,0,26,56.4958 407 | 1,3,0,29,7.75 408 | 1,1,0,36,26.2875 409 | 1,1,1,54,59.4 410 | 0,3,0,24,7.4958 411 | 0,1,0,47,34.0208 412 | 1,2,1,34,10.5 413 | 1,2,1,36,26 414 | 0,3,0,32,7.8958 415 | 1,1,1,30,93.5 416 | 0,3,0,22,7.8958 417 | 1,1,1,44,57.9792 418 | 0,3,0,40.5,7.75 419 | 1,2,1,50,10.5 420 | 0,3,0,39,7.925 421 | 0,2,0,23,11.5 422 | 1,2,1,2,26 423 | 0,3,0,17,7.2292 424 | 0,3,1,30,8.6625 425 | 1,2,1,7,26.25 426 | 0,1,0,45,26.55 427 | 1,1,1,30,106.425 428 | 1,1,1,22,49.5 429 | 1,1,1,36,71 430 | 0,3,1,9,31.275 431 | 0,3,1,11,31.275 432 | 1,2,0,32,26 433 | 0,1,0,50,106.425 434 | 0,1,0,64,26 435 | 1,2,1,19,26 436 | 0,3,0,33,20.525 437 | 1,2,0,8,36.75 438 | 1,1,0,17,110.8833 439 | 0,2,0,27,26 440 | 1,3,0,22,7.225 441 | 1,3,1,22,7.775 442 | 0,1,0,62,26.55 443 | 1,1,1,48,39.6 444 | 1,1,1,39,79.65 445 | 1,3,1,36,17.4 446 | 0,3,0,40,7.8958 447 | 0,2,0,28,13.5 448 | 0,3,0,24,24.15 449 | 0,3,0,19,7.8958 450 | 0,3,1,29,21.075 451 | 1,3,0,32,7.8542 452 | 1,2,0,62,10.5 453 | 1,1,1,53,51.4792 454 | 1,1,0,36,26.3875 455 | 0,3,0,16,8.05 456 | 0,3,0,19,14.5 457 | 1,2,1,34,13 458 | 1,1,1,39,55.9 459 | 1,3,0,32,7.925 460 | 1,2,1,25,30 461 | 1,1,1,39,110.8833 462 | 0,2,0,54,26 463 | 0,1,0,36,40.125 464 | 1,1,1,18,79.65 465 | 0,2,0,47,15 466 | 1,1,0,60,79.2 467 | 0,3,0,22,8.05 468 | 0,3,0,35,7.125 469 | 1,1,1,52,78.2667 470 | 0,3,0,47,7.25 471 | 0,2,0,37,26 472 | 0,3,0,36,24.15 473 | 
0,3,0,49,0 474 | 1,1,0,49,56.9292 475 | 1,2,1,24,27 476 | 0,3,0,44,8.05 477 | 1,1,0,35,26.55 478 | 0,3,0,36,15.55 479 | 0,3,0,30,7.8958 480 | 1,1,0,27,30.5 481 | 1,2,1,22,41.5792 482 | 1,1,1,40,153.4625 483 | 0,3,1,39,31.275 484 | 0,3,0,35,8.05 485 | 1,2,1,24,65 486 | 0,3,0,34,14.4 487 | 0,3,1,26,16.1 488 | 1,2,1,4,39 489 | 0,2,0,26,10.5 490 | 0,3,0,27,14.4542 491 | 1,1,0,42,52.5542 492 | 1,3,0,20,15.7417 493 | 0,3,0,21,7.8542 494 | 0,3,0,21,16.1 495 | 0,1,0,61,32.3208 496 | 0,2,0,57,12.35 497 | 1,1,1,21,77.9583 498 | 0,3,0,26,7.8958 499 | 1,1,0,80,30 500 | 0,3,0,51,7.0542 501 | 1,1,0,32,30.5 502 | 0,3,1,9,27.9 503 | 1,2,1,28,13 504 | 0,3,0,32,7.925 505 | 0,2,0,31,26.25 506 | 0,3,1,41,39.6875 507 | 0,3,0,20,7.8542 508 | 1,1,1,24,69.3 509 | 0,3,1,2,27.9 510 | 1,3,1,0.75,19.2583 511 | 1,1,0,48,76.7292 512 | 0,3,0,19,7.8958 513 | 1,1,0,56,35.5 514 | 1,3,1,23,7.55 515 | 1,2,1,18,23 516 | 0,3,0,21,8.4333 517 | 0,3,1,18,6.75 518 | 0,2,0,24,73.5 519 | 0,3,1,32,15.5 520 | 0,2,0,23,13 521 | 0,1,0,58,113.275 522 | 1,1,0,50,133.65 523 | 0,3,0,40,7.225 524 | 0,1,0,47,25.5875 525 | 0,3,0,36,7.4958 526 | 1,3,0,20,7.925 527 | 0,2,0,32,73.5 528 | 0,2,0,25,13 529 | 0,3,0,43,8.05 530 | 1,2,1,40,39 531 | 0,1,0,31,52 532 | 0,2,0,70,10.5 533 | 1,2,0,31,13 534 | 0,3,0,18,7.775 535 | 0,3,0,24.5,8.05 536 | 1,3,1,18,9.8417 537 | 0,3,1,43,46.9 538 | 1,1,0,36,512.3292 539 | 1,1,0,27,76.7292 540 | 0,3,0,20,9.225 541 | 0,3,0,14,46.9 542 | 0,2,0,60,39 543 | 0,2,0,25,41.5792 544 | 0,3,0,14,39.6875 545 | 0,3,0,19,10.1708 546 | 0,3,0,18,7.7958 547 | 1,1,1,15,211.3375 548 | 1,1,0,31,57 549 | 1,3,1,4,13.4167 550 | 0,3,0,25,7.225 551 | 0,1,0,60,26.55 552 | 0,2,0,52,13.5 553 | 0,3,0,44,8.05 554 | 0,1,0,49,110.8833 555 | 0,3,0,42,7.65 556 | 1,1,1,18,227.525 557 | 1,1,0,35,26.2875 558 | 0,3,1,18,14.4542 559 | 0,3,0,25,7.7417 560 | 0,3,0,26,7.8542 561 | 0,2,0,39,26 562 | 1,2,1,45,13.5 563 | 1,1,0,42,26.2875 564 | 1,1,1,22,151.55 565 | 1,1,1,24,49.5042 566 | 1,1,0,48,52 567 | 0,3,0,29,9.4833 568 | 0,2,0,52,13 569 | 0,3,0,19,7.65 570 | 1,1,1,38,227.525 571 | 1,2,1,27,10.5 572 | 0,3,0,33,7.775 573 | 1,2,1,6,33 574 | 0,3,0,17,7.0542 575 | 0,2,0,34,13 576 | 0,2,0,50,13 577 | 1,1,0,27,53.1 578 | 0,3,0,20,8.6625 579 | 1,2,1,30,21 580 | 0,2,0,25,26 581 | 0,3,1,25,7.925 582 | 1,1,1,29,211.3375 583 | 0,3,0,11,18.7875 584 | 0,2,0,23,13 585 | 0,2,0,23,13 586 | 0,3,0,28.5,16.1 587 | 0,3,1,48,34.375 588 | 1,1,0,35,512.3292 589 | 0,1,0,36,78.85 590 | 1,1,1,21,262.375 591 | 0,3,0,24,16.1 592 | 1,3,0,31,7.925 593 | 0,1,0,70,71 594 | 0,3,0,16,20.25 595 | 1,2,1,30,13 596 | 0,1,0,19,53.1 597 | 0,3,0,31,7.75 598 | 1,2,1,4,23 599 | 1,3,0,6,12.475 600 | 0,3,0,33,9.5 601 | 0,3,0,23,7.8958 602 | 1,2,1,48,65 603 | 1,2,0,0.67,14.5 604 | 0,3,0,28,7.7958 605 | 0,2,0,18,11.5 606 | 0,3,0,34,8.05 607 | 1,1,1,33,86.5 608 | 0,3,0,41,7.125 609 | 1,3,0,20,7.2292 610 | 1,1,1,36,120 611 | 0,3,0,16,7.775 612 | 1,1,1,51,77.9583 613 | 0,3,1,30.5,7.75 614 | 0,3,0,32,8.3625 615 | 0,3,0,24,9.5 616 | 0,3,0,48,7.8542 617 | 0,2,1,57,10.5 618 | 1,2,1,54,23 619 | 0,3,0,18,7.75 620 | 1,3,1,5,12.475 621 | 1,1,1,43,211.3375 622 | 1,3,1,13,7.2292 623 | 1,1,1,17,57 624 | 0,1,0,29,30 625 | 0,3,0,25,7.05 626 | 0,3,0,25,7.25 627 | 1,3,1,18,7.4958 628 | 0,3,0,8,29.125 629 | 1,3,0,1,20.575 630 | 0,1,0,46,79.2 631 | 0,2,0,16,26 632 | 0,3,0,25,7.8958 633 | 0,2,0,39,13 634 | 1,1,1,49,25.9292 635 | 1,3,1,31,8.6833 636 | 0,3,0,30,7.2292 637 | 0,3,1,30,24.15 638 | 0,2,0,34,13 639 | 1,2,1,31,26.25 640 | 1,1,0,11,120 641 | 1,3,0,0.42,8.5167 642 | 1,3,0,27,6.975 643 | 0,3,0,31,7.775 644 | 
0,1,0,39,0 645 | 0,3,1,18,7.775 646 | 0,2,0,39,13 647 | 1,1,1,33,53.1 648 | 0,3,0,26,7.8875 649 | 0,3,0,39,24.15 650 | 0,2,0,35,10.5 651 | 0,3,1,6,31.275 652 | 0,3,0,30.5,8.05 653 | 0,3,1,23,7.925 654 | 0,2,0,31,37.0042 655 | 0,3,0,43,6.45 656 | 0,3,0,10,27.9 657 | 1,1,1,52,93.5 658 | 1,3,0,27,8.6625 659 | 0,1,0,38,0 660 | 1,3,1,27,12.475 661 | 0,3,0,2,39.6875 662 | 1,2,0,1,37.0042 663 | 1,1,1,62,80 664 | 1,3,1,15,14.4542 665 | 1,2,0,0.83,18.75 666 | 0,3,0,23,7.8542 667 | 0,3,0,18,8.3 668 | 1,1,1,39,83.1583 669 | 0,3,0,21,8.6625 670 | 1,3,0,32,56.4958 671 | 0,3,0,20,7.925 672 | 0,2,0,16,10.5 673 | 1,1,1,30,31 674 | 0,3,0,34.5,6.4375 675 | 0,3,0,17,8.6625 676 | 0,3,0,42,7.55 677 | 0,3,0,35,7.8958 678 | 0,2,0,28,33 679 | 0,3,0,4,31.275 680 | 0,3,0,74,7.775 681 | 0,3,1,9,15.2458 682 | 1,1,1,16,39.4 683 | 0,2,1,44,26 684 | 1,3,1,18,9.35 685 | 1,1,1,45,164.8667 686 | 1,1,0,51,26.55 687 | 1,3,1,24,19.2583 688 | 0,3,0,41,14.1083 689 | 0,2,0,21,11.5 690 | 1,1,1,48,25.9292 691 | 0,2,0,24,13 692 | 1,2,1,42,13 693 | 1,2,1,27,13.8583 694 | 0,1,0,31,50.4958 695 | 1,3,0,4,11.1333 696 | 0,3,0,26,7.8958 697 | 1,1,1,47,52.5542 698 | 0,1,0,33,5 699 | 0,3,0,47,9 700 | 1,2,1,28,24 701 | 1,3,1,15,7.225 702 | 0,3,0,20,9.8458 703 | 0,3,0,19,7.8958 704 | 1,1,1,56,83.1583 705 | 1,2,1,25,26 706 | 0,3,0,33,7.8958 707 | 0,3,1,22,10.5167 708 | 0,2,0,28,10.5 709 | 0,3,0,25,7.05 710 | 0,3,1,39,29.125 711 | 0,2,0,27,13 712 | 1,1,1,19,30 713 | 1,1,0,26,30 714 | 0,3,0,32,7.75 715 | -------------------------------------------------------------------------------- /titanic/data/data2.csv: -------------------------------------------------------------------------------- 1 | 0,3,0,22,7.25 2 | 1,1,1,38,71.2833 3 | 1,3,1,26,7.925 4 | 1,1,1,35,53.1 5 | 0,3,0,35,8.05 6 | 0,3,0,30,8.4583 7 | 0,1,0,54,51.8625 8 | 0,3,0,2,21.075 9 | 1,3,1,27,11.1333 10 | 1,2,1,14,30.0708 11 | 1,3,1,4,16.7 12 | 1,1,1,58,26.55 13 | 0,3,0,20,8.05 14 | 0,3,0,39,31.275 15 | 0,3,1,14,7.8542 16 | 1,2,1,55,16 17 | 0,3,0,2,29.125 18 | 1,2,0,30,13 19 | 0,3,1,31,18 20 | 1,3,1,30,7.225 21 | 0,2,0,35,26 22 | 1,2,0,34,13 23 | 1,3,1,15,8.0292 24 | 1,1,0,28,35.5 25 | 0,3,1,8,21.075 26 | 1,3,1,38,31.3875 27 | 0,3,0,30,7.225 28 | 0,1,0,19,263 29 | 1,3,1,30,7.8792 30 | 0,3,0,30,7.8958 31 | 0,1,0,40,27.7208 32 | 1,1,1,30,146.5208 33 | 1,3,1,30,7.75 34 | 0,2,0,66,10.5 35 | 0,1,0,28,82.1708 36 | 0,1,0,42,52 37 | 1,3,0,30,7.2292 38 | 0,3,0,21,8.05 39 | 0,3,1,18,18 40 | 1,3,1,14,11.2417 41 | 0,3,1,40,9.475 42 | 0,2,1,27,21 43 | 0,3,0,30,7.8958 44 | 1,2,1,3,41.5792 45 | 1,3,1,19,7.8792 46 | 0,3,0,30,8.05 47 | 0,3,0,30,15.5 48 | 1,3,1,30,7.75 49 | 0,3,0,30,21.6792 50 | 0,3,1,18,17.8 51 | 0,3,0,7,39.6875 52 | 0,3,0,21,7.8 53 | 1,1,1,49,76.7292 54 | 1,2,1,29,26 55 | 0,1,0,65,61.9792 56 | 1,1,0,30,35.5 57 | 1,2,1,21,10.5 58 | 0,3,0,28.5,7.2292 59 | 1,2,1,5,27.75 60 | 0,3,0,11,46.9 61 | 0,3,0,22,7.2292 62 | 1,1,1,38,80 63 | 0,1,0,45,83.475 64 | 0,3,0,4,27.9 65 | 0,1,0,30,27.7208 66 | 1,3,0,30,15.2458 67 | 1,2,1,29,10.5 68 | 0,3,0,19,8.1583 69 | 1,3,1,17,7.925 70 | 0,3,0,26,8.6625 71 | 0,2,0,32,10.5 72 | 0,3,1,16,46.9 73 | 0,2,0,21,73.5 74 | 0,3,0,26,14.4542 75 | 1,3,0,32,56.4958 76 | 0,3,0,25,7.65 77 | 0,3,0,30,7.8958 78 | 0,3,0,30,8.05 79 | 1,2,0,0.83,29 80 | 1,3,1,30,12.475 81 | 0,3,0,22,9 82 | 1,3,0,29,9.5 83 | 1,3,1,30,7.7875 84 | 0,1,0,28,47.1 85 | 1,2,1,17,10.5 86 | 1,3,1,33,15.85 87 | 0,3,0,16,34.375 88 | 0,3,0,30,8.05 89 | 1,1,1,23,263 90 | 0,3,0,24,8.05 91 | 0,3,0,29,8.05 92 | 0,3,0,20,7.8542 93 | 0,1,0,46,61.175 94 | 0,3,0,26,20.575 95 | 0,3,0,59,7.25 96 | 0,3,0,30,8.05 
97 | 0,1,0,71,34.6542 98 | 1,1,0,23,63.3583 99 | 1,2,1,34,23 100 | 0,2,0,34,26 101 | 0,3,1,28,7.8958 102 | 0,3,0,30,7.8958 103 | 0,1,0,21,77.2875 104 | 0,3,0,33,8.6542 105 | 0,3,0,37,7.925 106 | 0,3,0,28,7.8958 107 | 1,3,1,21,7.65 108 | 1,3,0,30,7.775 109 | 0,3,0,38,7.8958 110 | 1,3,1,30,24.15 111 | 0,1,0,47,52 112 | 0,3,1,14.5,14.4542 113 | 0,3,0,22,8.05 114 | 0,3,1,20,9.825 115 | 0,3,1,17,14.4583 116 | 0,3,0,21,7.925 117 | 0,3,0,70.5,7.75 118 | 0,2,0,29,21 119 | 0,1,0,24,247.5208 120 | 0,3,1,2,31.275 121 | 0,2,0,21,73.5 122 | 0,3,0,30,8.05 123 | 0,2,0,32.5,30.0708 124 | 1,2,1,32.5,13 125 | 0,1,0,54,77.2875 126 | 1,3,0,12,11.2417 127 | 0,3,0,30,7.75 128 | 1,3,0,24,7.1417 129 | 1,3,1,30,22.3583 130 | 0,3,0,45,6.975 131 | 0,3,0,33,7.8958 132 | 0,3,0,20,7.05 133 | 0,3,1,47,14.5 134 | 1,2,1,29,26 135 | 0,2,0,25,13 136 | 0,2,0,23,15.0458 137 | 1,1,1,19,26.2833 138 | 0,1,0,37,53.1 139 | 0,3,0,16,9.2167 140 | 0,1,0,24,79.2 141 | 0,3,1,30,15.2458 142 | 1,3,1,22,7.75 143 | 1,3,1,24,15.85 144 | 0,3,0,19,6.75 145 | 0,2,0,18,11.5 146 | 0,2,0,19,36.75 147 | 1,3,0,27,7.7958 148 | 0,3,1,9,34.375 149 | 0,2,0,36.5,26 150 | 0,2,0,42,13 151 | 0,2,0,51,12.525 152 | 1,1,1,22,66.6 153 | 0,3,0,55.5,8.05 154 | 0,3,0,40.5,14.5 155 | 0,3,0,30,7.3125 156 | 0,1,0,51,61.3792 157 | 1,3,1,16,7.7333 158 | 0,3,0,30,8.05 159 | 0,3,0,30,8.6625 160 | 0,3,0,30,69.55 161 | 0,3,0,44,16.1 162 | 1,2,1,40,15.75 163 | 0,3,0,26,7.775 164 | 0,3,0,17,8.6625 165 | 0,3,0,1,39.6875 166 | 1,3,0,9,20.525 167 | 1,1,1,30,55 168 | 0,3,1,45,27.9 169 | 0,1,0,30,25.925 170 | 0,3,0,28,56.4958 171 | 0,1,0,61,33.5 172 | 0,3,0,4,29.125 173 | 1,3,1,1,11.1333 174 | 0,3,0,21,7.925 175 | 0,1,0,56,30.6958 176 | 0,3,0,18,7.8542 177 | 0,3,0,30,25.4667 178 | 0,1,1,50,28.7125 179 | 0,2,0,30,13 180 | 0,3,0,36,0 181 | 0,3,1,30,69.55 182 | 0,2,0,30,15.05 183 | 0,3,0,9,31.3875 184 | 1,2,0,1,39 185 | 1,3,1,4,22.025 186 | 0,1,0,30,50 187 | 1,3,1,30,15.5 188 | 1,1,0,45,26.55 189 | 0,3,0,40,15.5 190 | 0,3,0,36,7.8958 191 | 1,2,1,32,13 192 | 0,2,0,19,13 193 | 1,3,1,19,7.8542 194 | 1,2,0,3,26 195 | 1,1,1,44,27.7208 196 | 1,1,1,58,146.5208 197 | 0,3,0,30,7.75 198 | 0,3,0,42,8.4042 199 | 1,3,1,30,7.75 200 | 0,2,1,24,13 201 | 0,3,0,28,9.5 202 | 0,3,0,30,69.55 203 | 0,3,0,34,6.4958 204 | 0,3,0,45.5,7.225 205 | 1,3,0,18,8.05 206 | 0,3,1,2,10.4625 207 | 0,3,0,32,15.85 208 | 1,3,0,26,18.7875 209 | 1,3,1,16,7.75 210 | 1,1,0,40,31 211 | 0,3,0,24,7.05 212 | 1,2,1,35,21 213 | 0,3,0,22,7.25 214 | 0,2,0,30,13 215 | 0,3,0,30,7.75 216 | 1,1,1,31,113.275 217 | 1,3,1,27,7.925 218 | 0,2,0,42,27 219 | 1,1,1,32,76.2917 220 | 0,2,0,30,10.5 221 | 1,3,0,16,8.05 222 | 0,2,0,27,13 223 | 0,3,0,51,8.05 224 | 0,3,0,30,7.8958 225 | 1,1,0,38,90 226 | 0,3,0,22,9.35 227 | 1,2,0,19,10.5 228 | 0,3,0,20.5,7.25 229 | 0,2,0,18,13 230 | 0,3,1,30,25.4667 231 | 1,1,1,35,83.475 232 | 0,3,0,29,7.775 233 | 0,2,0,59,13.5 234 | 1,3,1,5,31.3875 235 | 0,2,0,24,10.5 236 | 0,3,1,30,7.55 237 | 0,2,0,44,26 238 | 1,2,1,8,26.25 239 | 0,2,0,19,10.5 240 | 0,2,0,33,12.275 241 | 0,3,1,30,14.4542 242 | 1,3,1,30,15.5 243 | 0,2,0,29,10.5 244 | 0,3,0,22,7.125 245 | 0,3,0,30,7.225 246 | 0,1,0,44,90 247 | 0,3,1,25,7.775 248 | 1,2,1,24,14.5 249 | 1,1,0,37,52.5542 250 | 0,2,0,54,26 251 | 0,3,0,30,7.25 252 | 0,3,1,29,10.4625 253 | 0,1,0,62,26.55 254 | 0,3,0,30,16.1 255 | 0,3,1,41,20.2125 256 | 1,3,1,29,15.2458 257 | 1,1,1,30,79.2 258 | 1,1,1,30,86.5 259 | 1,1,1,35,512.3292 260 | 1,2,1,50,26 261 | 0,3,0,30,7.75 262 | 1,3,0,3,31.3875 263 | 0,1,0,52,79.65 264 | 0,1,0,40,0 265 | 0,3,1,30,7.75 266 | 0,2,0,36,10.5 267 | 0,3,0,16,39.6875 
268 | 1,3,0,25,7.775 269 | 1,1,1,58,153.4625 270 | 1,1,1,35,135.6333 271 | 0,1,0,30,31 272 | 1,3,0,25,0 273 | 1,2,1,41,19.5 274 | 0,1,0,37,29.7 275 | 1,3,1,30,7.75 276 | 1,1,1,63,77.9583 277 | 0,3,1,45,7.75 278 | 0,2,0,30,0 279 | 0,3,0,7,29.125 280 | 1,3,1,35,20.25 281 | 0,3,0,65,7.75 282 | 0,3,0,28,7.8542 283 | 0,3,0,16,9.5 284 | 1,3,0,19,8.05 285 | 0,1,0,30,26 286 | 0,3,0,33,8.6625 287 | 1,3,0,30,9.5 288 | 0,3,0,22,7.8958 289 | 1,2,0,42,13 290 | 1,3,1,22,7.75 291 | 1,1,1,26,78.85 292 | 1,1,1,19,91.0792 293 | 0,2,0,36,12.875 294 | 0,3,1,24,8.85 295 | 0,3,0,24,7.8958 296 | 0,1,0,30,27.7208 297 | 0,3,0,23.5,7.2292 298 | 0,1,1,2,151.55 299 | 1,1,0,30,30.5 300 | 1,1,1,50,247.5208 301 | 1,3,1,30,7.75 302 | 1,3,0,30,23.25 303 | 0,3,0,19,0 304 | 1,2,1,30,12.35 305 | 0,3,0,30,8.05 306 | 1,1,0,0.92,151.55 307 | 1,1,1,30,110.8833 308 | 1,1,1,17,108.9 309 | 0,2,0,30,24 310 | 1,1,1,30,56.9292 311 | 1,1,1,24,83.1583 312 | 1,1,1,18,262.375 313 | 0,2,1,26,26 314 | 0,3,0,28,7.8958 315 | 0,2,0,43,26.25 316 | 1,3,1,26,7.8542 317 | 1,2,1,24,26 318 | 0,2,0,54,14 319 | 1,1,1,31,164.8667 320 | 1,1,1,40,134.5 321 | 0,3,0,22,7.25 322 | 0,3,0,27,7.8958 323 | 1,2,1,30,12.35 324 | 1,2,1,22,29 325 | 0,3,0,30,69.55 326 | 1,1,1,36,135.6333 327 | 0,3,0,61,6.2375 328 | 1,2,1,36,13 329 | 1,3,1,31,20.525 330 | 1,1,1,16,57.9792 331 | 1,3,1,30,23.25 332 | 0,1,0,45.5,28.5 333 | 0,1,0,38,153.4625 334 | 0,3,0,16,18 335 | 1,1,1,30,133.65 336 | 0,3,0,30,7.8958 337 | 0,1,0,29,66.6 338 | 1,1,1,41,134.5 339 | 1,3,0,45,8.05 340 | 0,1,0,45,35.5 341 | 1,2,0,2,26 342 | 1,1,1,24,263 343 | 0,2,0,28,13 344 | 0,2,0,25,13 345 | 0,2,0,36,13 346 | 1,2,1,24,13 347 | 1,2,1,40,13 348 | 1,3,1,30,16.1 349 | 1,3,0,3,15.9 350 | 0,3,0,42,8.6625 351 | 0,3,0,23,9.225 352 | 0,1,0,30,35 353 | 0,3,0,15,7.2292 354 | 0,3,0,25,17.8 355 | 0,3,0,30,7.225 356 | 0,3,0,28,9.5 357 | 1,1,1,22,55 358 | 0,2,1,38,13 359 | 1,3,1,30,7.8792 360 | 1,3,1,30,7.8792 361 | 0,3,0,40,27.9 362 | 0,2,0,29,27.7208 363 | 0,3,1,45,14.4542 364 | 0,3,0,35,7.05 365 | 0,3,0,30,15.5 366 | 0,3,0,30,7.25 367 | 1,1,1,60,75.25 368 | 1,3,1,30,7.2292 369 | 1,3,1,30,7.75 370 | 1,1,1,24,69.3 371 | 1,1,0,25,55.4417 372 | 0,3,0,18,6.4958 373 | 0,3,0,19,8.05 374 | 0,1,0,22,135.6333 375 | 0,3,1,3,21.075 376 | 1,1,1,30,82.1708 377 | 1,3,1,22,7.25 378 | 0,1,0,27,211.5 379 | 0,3,0,20,4.0125 380 | 0,3,0,19,7.775 381 | 1,1,1,42,227.525 382 | 1,3,1,1,15.7417 383 | 0,3,0,32,7.925 384 | 1,1,1,35,52 385 | 0,3,0,30,7.8958 386 | 0,2,0,18,73.5 387 | 0,3,0,1,46.9 388 | 1,2,1,36,13 389 | 0,3,0,30,7.7292 390 | 1,2,1,17,12 391 | 1,1,0,36,120 392 | 1,3,0,21,7.7958 393 | 0,3,0,28,7.925 394 | 1,1,1,23,113.275 395 | 1,3,1,24,16.7 396 | 0,3,0,22,7.7958 397 | 0,3,1,31,7.8542 398 | 0,2,0,46,26 399 | 0,2,0,23,10.5 400 | 1,2,1,28,12.65 401 | 1,3,0,39,7.925 402 | 0,3,0,26,8.05 403 | 0,3,1,21,9.825 404 | 0,3,0,28,15.85 405 | 0,3,1,20,8.6625 406 | 0,2,0,34,21 407 | 0,3,0,51,7.75 408 | 1,2,0,3,18.75 409 | 0,3,0,21,7.775 410 | 0,3,1,30,25.4667 411 | 0,3,0,30,7.8958 412 | 0,3,0,30,6.8583 413 | 1,1,1,33,90 414 | 0,2,0,30,0 415 | 1,3,0,44,7.925 416 | 0,3,1,30,8.05 417 | 1,2,1,34,32.5 418 | 1,2,1,18,13 419 | 0,2,0,30,13 420 | 0,3,1,10,24.15 421 | 0,3,0,30,7.8958 422 | 0,3,0,21,7.7333 423 | 0,3,0,29,7.875 424 | 0,3,1,28,14.4 425 | 0,3,0,18,20.2125 426 | 0,3,0,30,7.25 427 | 1,2,1,28,26 428 | 1,2,1,19,26 429 | 0,3,0,30,7.75 430 | 1,3,0,32,8.05 431 | 1,1,0,28,26.55 432 | 1,3,1,30,16.1 433 | 1,2,1,42,26 434 | 0,3,0,17,7.125 435 | 0,1,0,50,55.9 436 | 1,1,1,14,120 437 | 0,3,1,21,34.375 438 | 1,2,1,24,18.75 439 | 0,1,0,64,263 440 | 
0,2,0,31,10.5 441 | 1,2,1,45,26.25 442 | 0,3,0,20,9.5 443 | 0,3,0,25,7.775 444 | 1,2,1,28,13 445 | 1,3,0,30,8.1125 446 | 1,1,0,4,81.8583 447 | 1,2,1,13,19.5 448 | 1,1,0,34,26.55 449 | 1,3,1,5,19.2583 450 | 1,1,0,52,30.5 451 | 0,2,0,36,27.75 452 | 0,3,0,30,19.9667 453 | 0,1,0,30,27.75 454 | 1,1,0,49,89.1042 455 | 0,3,0,30,8.05 456 | 1,3,0,29,7.8958 457 | 0,1,0,65,26.55 458 | 1,1,1,30,51.8625 459 | 1,2,1,50,10.5 460 | 0,3,0,30,7.75 461 | 1,1,0,48,26.55 462 | 0,3,0,34,8.05 463 | 0,1,0,47,38.5 464 | 0,2,0,48,13 465 | 0,3,0,30,8.05 466 | 0,3,0,38,7.05 467 | 0,2,0,30,0 468 | 0,1,0,56,26.55 469 | 0,3,0,30,7.725 470 | 1,3,1,0.75,19.2583 471 | 0,3,0,30,7.25 472 | 0,3,0,38,8.6625 473 | 1,2,1,33,27.75 474 | 1,2,1,23,13.7917 475 | 0,3,1,22,9.8375 476 | 0,1,0,30,52 477 | 0,2,0,34,21 478 | 0,3,0,29,7.0458 479 | 0,3,0,22,7.5208 480 | 1,3,1,2,12.2875 481 | 0,3,0,9,46.9 482 | 0,2,0,30,0 483 | 0,3,0,50,8.05 484 | 1,3,1,63,9.5875 485 | 1,1,0,25,91.0792 486 | 0,3,1,30,25.4667 487 | 1,1,1,35,90 488 | 0,1,0,58,29.7 489 | 0,3,0,30,8.05 490 | 1,3,0,9,15.9 491 | 0,3,0,30,19.9667 492 | 0,3,0,21,7.25 493 | 0,1,0,55,30.5 494 | 0,1,0,71,49.5042 495 | 0,3,0,21,8.05 496 | 0,3,0,30,14.4583 497 | 1,1,1,54,78.2667 498 | 0,3,0,30,15.1 499 | 0,1,1,25,151.55 500 | 0,3,0,24,7.7958 501 | 0,3,0,17,8.6625 502 | 0,3,1,21,7.75 503 | 0,3,1,30,7.6292 504 | 0,3,1,37,9.5875 505 | 1,1,1,16,86.5 506 | 0,1,0,18,108.9 507 | 1,2,1,33,26 508 | 1,1,0,30,26.55 509 | 0,3,0,28,22.525 510 | 1,3,0,26,56.4958 511 | 1,3,0,29,7.75 512 | 0,3,0,30,8.05 513 | 1,1,0,36,26.2875 514 | 1,1,1,54,59.4 515 | 0,3,0,24,7.4958 516 | 0,1,0,47,34.0208 517 | 1,2,1,34,10.5 518 | 0,3,0,30,24.15 519 | 1,2,1,36,26 520 | 0,3,0,32,7.8958 521 | 1,1,1,30,93.5 522 | 0,3,0,22,7.8958 523 | 0,3,0,30,7.225 524 | 1,1,1,44,57.9792 525 | 0,3,0,30,7.2292 526 | 0,3,0,40.5,7.75 527 | 1,2,1,50,10.5 528 | 0,1,0,30,221.7792 529 | 0,3,0,39,7.925 530 | 0,2,0,23,11.5 531 | 1,2,1,2,26 532 | 0,3,0,30,7.2292 533 | 0,3,0,17,7.2292 534 | 1,3,1,30,22.3583 535 | 0,3,1,30,8.6625 536 | 1,2,1,7,26.25 537 | 0,1,0,45,26.55 538 | 1,1,1,30,106.425 539 | 0,3,0,30,14.5 540 | 1,1,1,22,49.5 541 | 1,1,1,36,71 542 | 0,3,1,9,31.275 543 | 0,3,1,11,31.275 544 | 1,2,0,32,26 545 | 0,1,0,50,106.425 546 | 0,1,0,64,26 547 | 1,2,1,19,26 548 | 1,2,0,30,13.8625 549 | 0,3,0,33,20.525 550 | 1,2,0,8,36.75 551 | 1,1,0,17,110.8833 552 | 0,2,0,27,26 553 | 0,3,0,30,7.8292 554 | 1,3,0,22,7.225 555 | 1,3,1,22,7.775 556 | 0,1,0,62,26.55 557 | 1,1,1,48,39.6 558 | 0,1,0,30,227.525 559 | 1,1,1,39,79.65 560 | 1,3,1,36,17.4 561 | 0,3,0,30,7.75 562 | 0,3,0,40,7.8958 563 | 0,2,0,28,13.5 564 | 0,3,0,30,8.05 565 | 0,3,1,30,8.05 566 | 0,3,0,24,24.15 567 | 0,3,0,19,7.8958 568 | 0,3,1,29,21.075 569 | 0,3,0,30,7.2292 570 | 1,3,0,32,7.8542 571 | 1,2,0,62,10.5 572 | 1,1,1,53,51.4792 573 | 1,1,0,36,26.3875 574 | 1,3,1,30,7.75 575 | 0,3,0,16,8.05 576 | 0,3,0,19,14.5 577 | 1,2,1,34,13 578 | 1,1,1,39,55.9 579 | 0,3,1,30,14.4583 580 | 1,3,0,32,7.925 581 | 1,2,1,25,30 582 | 1,1,1,39,110.8833 583 | 0,2,0,54,26 584 | 0,1,0,36,40.125 585 | 0,3,0,30,8.7125 586 | 1,1,1,18,79.65 587 | 0,2,0,47,15 588 | 1,1,0,60,79.2 589 | 0,3,0,22,8.05 590 | 0,3,0,30,8.05 591 | 0,3,0,35,7.125 592 | 1,1,1,52,78.2667 593 | 0,3,0,47,7.25 594 | 0,3,1,30,7.75 595 | 0,2,0,37,26 596 | 0,3,0,36,24.15 597 | 1,2,1,30,33 598 | 0,3,0,49,0 599 | 0,3,0,30,7.225 600 | 1,1,0,49,56.9292 601 | 1,2,1,24,27 602 | 0,3,0,30,7.8958 603 | 0,1,0,30,42.4 604 | 0,3,0,44,8.05 605 | 1,1,0,35,26.55 606 | 0,3,0,36,15.55 607 | 0,3,0,30,7.8958 608 | 1,1,0,27,30.5 609 | 1,2,1,22,41.5792 610 | 
1,1,1,40,153.4625 611 | 0,3,1,39,31.275 612 | 0,3,0,30,7.05 613 | 1,3,1,30,15.5 614 | 0,3,0,30,7.75 615 | 0,3,0,35,8.05 616 | 1,2,1,24,65 617 | 0,3,0,34,14.4 618 | 0,3,1,26,16.1 619 | 1,2,1,4,39 620 | 0,2,0,26,10.5 621 | 0,3,0,27,14.4542 622 | 1,1,0,42,52.5542 623 | 1,3,0,20,15.7417 624 | 0,3,0,21,7.8542 625 | 0,3,0,21,16.1 626 | 0,1,0,61,32.3208 627 | 0,2,0,57,12.35 628 | 1,1,1,21,77.9583 629 | 0,3,0,26,7.8958 630 | 0,3,0,30,7.7333 631 | 1,1,0,80,30 632 | 0,3,0,51,7.0542 633 | 1,1,0,32,30.5 634 | 0,1,0,30,0 635 | 0,3,1,9,27.9 636 | 1,2,1,28,13 637 | 0,3,0,32,7.925 638 | 0,2,0,31,26.25 639 | 0,3,1,41,39.6875 640 | 0,3,0,30,16.1 641 | 0,3,0,20,7.8542 642 | 1,1,1,24,69.3 643 | 0,3,1,2,27.9 644 | 1,3,0,30,56.4958 645 | 1,3,1,0.75,19.2583 646 | 1,1,0,48,76.7292 647 | 0,3,0,19,7.8958 648 | 1,1,0,56,35.5 649 | 0,3,0,30,7.55 650 | 1,3,1,23,7.55 651 | 0,3,0,30,7.8958 652 | 1,2,1,18,23 653 | 0,3,0,21,8.4333 654 | 1,3,1,30,7.8292 655 | 0,3,1,18,6.75 656 | 0,2,0,24,73.5 657 | 0,3,0,30,7.8958 658 | 0,3,1,32,15.5 659 | 0,2,0,23,13 660 | 0,1,0,58,113.275 661 | 1,1,0,50,133.65 662 | 0,3,0,40,7.225 663 | 0,1,0,47,25.5875 664 | 0,3,0,36,7.4958 665 | 1,3,0,20,7.925 666 | 0,2,0,32,73.5 667 | 0,2,0,25,13 668 | 0,3,0,30,7.775 669 | 0,3,0,43,8.05 670 | 1,1,1,30,52 671 | 1,2,1,40,39 672 | 0,1,0,31,52 673 | 0,2,0,70,10.5 674 | 1,2,0,31,13 675 | 0,2,0,30,0 676 | 0,3,0,18,7.775 677 | 0,3,0,24.5,8.05 678 | 1,3,1,18,9.8417 679 | 0,3,1,43,46.9 680 | 1,1,0,36,512.3292 681 | 0,3,1,30,8.1375 682 | 1,1,0,27,76.7292 683 | 0,3,0,20,9.225 684 | 0,3,0,14,46.9 685 | 0,2,0,60,39 686 | 0,2,0,25,41.5792 687 | 0,3,0,14,39.6875 688 | 0,3,0,19,10.1708 689 | 0,3,0,18,7.7958 690 | 1,1,1,15,211.3375 691 | 1,1,0,31,57 692 | 1,3,1,4,13.4167 693 | 1,3,0,30,56.4958 694 | 0,3,0,25,7.225 695 | 0,1,0,60,26.55 696 | 0,2,0,52,13.5 697 | 0,3,0,44,8.05 698 | 1,3,1,30,7.7333 699 | 0,1,0,49,110.8833 700 | 0,3,0,42,7.65 701 | 1,1,1,18,227.525 702 | 1,1,0,35,26.2875 703 | 0,3,1,18,14.4542 704 | 0,3,0,25,7.7417 705 | 0,3,0,26,7.8542 706 | 0,2,0,39,26 707 | 1,2,1,45,13.5 708 | 1,1,0,42,26.2875 709 | 1,1,1,22,151.55 710 | 1,3,0,30,15.2458 711 | 1,1,1,24,49.5042 712 | 0,1,0,30,26.55 713 | 1,1,0,48,52 714 | 0,3,0,29,9.4833 715 | 0,2,0,52,13 716 | 0,3,0,19,7.65 717 | 1,1,1,38,227.525 718 | 1,2,1,27,10.5 719 | 0,3,0,30,15.5 720 | 0,3,0,33,7.775 721 | 1,2,1,6,33 722 | 0,3,0,17,7.0542 723 | 0,2,0,34,13 724 | 0,2,0,50,13 725 | 1,1,0,27,53.1 726 | 0,3,0,20,8.6625 727 | 1,2,1,30,21 728 | 1,3,1,30,7.7375 729 | 0,2,0,25,26 730 | 0,3,1,25,7.925 731 | 1,1,1,29,211.3375 732 | 0,3,0,11,18.7875 733 | 0,2,0,30,0 734 | 0,2,0,23,13 735 | 0,2,0,23,13 736 | 0,3,0,28.5,16.1 737 | 0,3,1,48,34.375 738 | 1,1,0,35,512.3292 739 | 0,3,0,30,7.8958 740 | 0,3,0,30,7.8958 741 | 1,1,0,30,30 742 | 0,1,0,36,78.85 743 | 1,1,1,21,262.375 744 | 0,3,0,24,16.1 745 | 1,3,0,31,7.925 746 | 0,1,0,70,71 747 | 0,3,0,16,20.25 748 | 1,2,1,30,13 749 | 0,1,0,19,53.1 750 | 0,3,0,31,7.75 751 | 1,2,1,4,23 752 | 1,3,0,6,12.475 753 | 0,3,0,33,9.5 754 | 0,3,0,23,7.8958 755 | 1,2,1,48,65 756 | 1,2,0,0.67,14.5 757 | 0,3,0,28,7.7958 758 | 0,2,0,18,11.5 759 | 0,3,0,34,8.05 760 | 1,1,1,33,86.5 761 | 0,3,0,30,14.5 762 | 0,3,0,41,7.125 763 | 1,3,0,20,7.2292 764 | 1,1,1,36,120 765 | 0,3,0,16,7.775 766 | 1,1,1,51,77.9583 767 | 0,1,0,30,39.6 768 | 0,3,1,30.5,7.75 769 | 0,3,0,30,24.15 770 | 0,3,0,32,8.3625 771 | 0,3,0,24,9.5 772 | 0,3,0,48,7.8542 773 | 0,2,1,57,10.5 774 | 0,3,0,30,7.225 775 | 1,2,1,54,23 776 | 0,3,0,18,7.75 777 | 0,3,0,30,7.75 778 | 1,3,1,5,12.475 779 | 0,3,0,30,7.7375 780 | 1,1,1,43,211.3375 781 | 
1,3,1,13,7.2292 782 | 1,1,1,17,57 783 | 0,1,0,29,30 784 | 0,3,0,30,23.45 785 | 0,3,0,25,7.05 786 | 0,3,0,25,7.25 787 | 1,3,1,18,7.4958 788 | 0,3,0,8,29.125 789 | 1,3,0,1,20.575 790 | 0,1,0,46,79.2 791 | 0,3,0,30,7.75 792 | 0,2,0,16,26 793 | 0,3,1,30,69.55 794 | 0,1,0,30,30.6958 795 | 0,3,0,25,7.8958 796 | 0,2,0,39,13 797 | 1,1,1,49,25.9292 798 | 1,3,1,31,8.6833 799 | 0,3,0,30,7.2292 800 | 0,3,1,30,24.15 801 | 0,2,0,34,13 802 | 1,2,1,31,26.25 803 | 1,1,0,11,120 804 | 1,3,0,0.42,8.5167 805 | 1,3,0,27,6.975 806 | 0,3,0,31,7.775 807 | 0,1,0,39,0 808 | 0,3,1,18,7.775 809 | 0,2,0,39,13 810 | 1,1,1,33,53.1 811 | 0,3,0,26,7.8875 812 | 0,3,0,39,24.15 813 | 0,2,0,35,10.5 814 | 0,3,1,6,31.275 815 | 0,3,0,30.5,8.05 816 | 0,1,0,30,0 817 | 0,3,1,23,7.925 818 | 0,2,0,31,37.0042 819 | 0,3,0,43,6.45 820 | 0,3,0,10,27.9 821 | 1,1,1,52,93.5 822 | 1,3,0,27,8.6625 823 | 0,1,0,38,0 824 | 1,3,1,27,12.475 825 | 0,3,0,2,39.6875 826 | 0,3,0,30,6.95 827 | 0,3,0,30,56.4958 828 | 1,2,0,1,37.0042 829 | 1,3,0,30,7.75 830 | 1,1,1,62,80 831 | 1,3,1,15,14.4542 832 | 1,2,0,0.83,18.75 833 | 0,3,0,30,7.2292 834 | 0,3,0,23,7.8542 835 | 0,3,0,18,8.3 836 | 1,1,1,39,83.1583 837 | 0,3,0,21,8.6625 838 | 0,3,0,30,8.05 839 | 1,3,0,32,56.4958 840 | 1,1,0,30,29.7 841 | 0,3,0,20,7.925 842 | 0,2,0,16,10.5 843 | 1,1,1,30,31 844 | 0,3,0,34.5,6.4375 845 | 0,3,0,17,8.6625 846 | 0,3,0,42,7.55 847 | 0,3,0,30,69.55 848 | 0,3,0,35,7.8958 849 | 0,2,0,28,33 850 | 1,1,1,30,89.1042 851 | 0,3,0,4,31.275 852 | 0,3,0,74,7.775 853 | 0,3,1,9,15.2458 854 | 1,1,1,16,39.4 855 | 0,2,1,44,26 856 | 1,3,1,18,9.35 857 | 1,1,1,45,164.8667 858 | 1,1,0,51,26.55 859 | 1,3,1,24,19.2583 860 | 0,3,0,30,7.2292 861 | 0,3,0,41,14.1083 862 | 0,2,0,21,11.5 863 | 1,1,1,48,25.9292 864 | 0,3,1,30,69.55 865 | 0,2,0,24,13 866 | 1,2,1,42,13 867 | 1,2,1,27,13.8583 868 | 0,1,0,31,50.4958 869 | 0,3,0,30,9.5 870 | 1,3,0,4,11.1333 871 | 0,3,0,26,7.8958 872 | 1,1,1,47,52.5542 873 | 0,1,0,33,5 874 | 0,3,0,47,9 875 | 1,2,1,28,24 876 | 1,3,1,15,7.225 877 | 0,3,0,20,9.8458 878 | 0,3,0,19,7.8958 879 | 0,3,0,30,7.8958 880 | 1,1,1,56,83.1583 881 | 1,2,1,25,26 882 | 0,3,0,33,7.8958 883 | 0,3,1,22,10.5167 884 | 0,2,0,28,10.5 885 | 0,3,0,25,7.05 886 | 0,3,1,39,29.125 887 | 0,2,0,27,13 888 | 1,1,1,19,30 889 | 0,3,1,30,23.45 890 | 1,1,0,26,30 891 | 0,3,0,32,7.75 892 | -------------------------------------------------------------------------------- /titanic/data/error.csv: -------------------------------------------------------------------------------- 1 | 0,2,1,57,10.5,0.7302221251009093 2 | 0,1,0,29,30,0.675536885938038 3 | 0,3,1,18,7.774999999999999,0.6282204455573266 4 | 0,3,1,23,7.925000000000001,0.5079028840062008 5 | 0,3,1,9,15.2458,0.6488754889618661 6 | 0,2,1,44,26,0.8607264852516288 7 | 0,1,0,31,50.4958,0.5351885699944017 8 | 0,1,0,33,5,0.6211402657508864 9 | 0,3,1,39,29.125,0.5269026514727263 10 | 1,3,0,20,7.229199999999999,0.1977638232454237 11 | 1,3,0,1,20.575,0.440943917205827 12 | 1,3,1,31,8.683299999999999,0.3704616329942221 13 | 1,3,0,27,6.975000000000001,0.1233117481969939 14 | 1,3,0,27,8.662500000000001,0.1267544048066883 15 | 1,3,1,27,12.475,0.3964072664612249 16 | 1,3,0,32,56.4958,0.3914447790949062 17 | 1,1,0,51,26.55,0.2160881298374403 18 | 1,3,1,24,19.2583,0.3570362276990257 19 | -------------------------------------------------------------------------------- /titanic/data/result.csv: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 0 4 | 0 5 | 0 6 | 0 7 | 0 8 | 0 9 | 1 10 | 0 11 | 0 12 | 0 13 | 1 14 | 0 15 | 1 16 | 1 17 | 0 18 
| 0 19 | 0 20 | 0 21 | 0 22 | 0 23 | 1 24 | 1 25 | 1 26 | 0 27 | 1 28 | 0 29 | 0 30 | 0 31 | 0 32 | 0 33 | 0 34 | 0 35 | 1 36 | 0 37 | 0 38 | 1 39 | 0 40 | 0 41 | 0 42 | 1 43 | 0 44 | 1 45 | 1 46 | 0 47 | 0 48 | 0 49 | 1 50 | 0 51 | 1 52 | 0 53 | 1 54 | 1 55 | 0 56 | 0 57 | 0 58 | 0 59 | 0 60 | 1 61 | 0 62 | 0 63 | 0 64 | 1 65 | 0 66 | 1 67 | 1 68 | 0 69 | 1 70 | 1 71 | 1 72 | 0 73 | 0 74 | 1 75 | 1 76 | 0 77 | 0 78 | 1 79 | 0 80 | 1 81 | 0 82 | 0 83 | 0 84 | 0 85 | 0 86 | 0 87 | 0 88 | 1 89 | 0 90 | 1 91 | 0 92 | 0 93 | 1 94 | 0 95 | 1 96 | 0 97 | 1 98 | 0 99 | 1 100 | 0 101 | 1 102 | 0 103 | 0 104 | 0 105 | 1 106 | 0 107 | 0 108 | 0 109 | 0 110 | 0 111 | 0 112 | 0 113 | 1 114 | 1 115 | 1 116 | 0 117 | 0 118 | 1 119 | 1 120 | 1 121 | 1 122 | 0 123 | 1 124 | 0 125 | 0 126 | 0 127 | 0 128 | 0 129 | 0 130 | 0 131 | 0 132 | 0 133 | 0 134 | 0 135 | 0 136 | 0 137 | 0 138 | 0 139 | 1 140 | 0 141 | 0 142 | 1 143 | 0 144 | 0 145 | 0 146 | 0 147 | 1 148 | 0 149 | 1 150 | 0 151 | 1 152 | 0 153 | 0 154 | 0 155 | 0 156 | 0 157 | 1 158 | 1 159 | 0 160 | 0 161 | 0 162 | 0 163 | 1 164 | 0 165 | 0 166 | 0 167 | 0 168 | 0 169 | 1 170 | 1 171 | 0 172 | 0 173 | 0 174 | 0 175 | 0 176 | 1 177 | 1 178 | 0 179 | 1 180 | 1 181 | 0 182 | 0 183 | 1 184 | 0 185 | 1 186 | 0 187 | 1 188 | 0 189 | 0 190 | 0 191 | 0 192 | 1 193 | 0 194 | 0 195 | 1 196 | 0 197 | 1 198 | 1 199 | 0 200 | 0 201 | 0 202 | 1 203 | 0 204 | 1 205 | 0 206 | 1 207 | 0 208 | 0 209 | 1 210 | 0 211 | 0 212 | 0 213 | 0 214 | 1 215 | 0 216 | 0 217 | 0 218 | 0 219 | 1 220 | 0 221 | 1 222 | 0 223 | 1 224 | 0 225 | 1 226 | 0 227 | 0 228 | 0 229 | 0 230 | 0 231 | 1 232 | 1 233 | 0 234 | 0 235 | 0 236 | 0 237 | 0 238 | 0 239 | 1 240 | 1 241 | 1 242 | 1 243 | 0 244 | 0 245 | 0 246 | 0 247 | 1 248 | 0 249 | 1 250 | 0 251 | 1 252 | 0 253 | 0 254 | 0 255 | 0 256 | 0 257 | 0 258 | 0 259 | 1 260 | 0 261 | 0 262 | 0 263 | 1 264 | 1 265 | 0 266 | 0 267 | 0 268 | 0 269 | 0 270 | 0 271 | 0 272 | 0 273 | 1 274 | 0 275 | 0 276 | 1 277 | 0 278 | 0 279 | 0 280 | 0 281 | 1 282 | 1 283 | 0 284 | 1 285 | 0 286 | 0 287 | 0 288 | 1 289 | 0 290 | 0 291 | 1 292 | 0 293 | 0 294 | 0 295 | 0 296 | 0 297 | 1 298 | 0 299 | 1 300 | 0 301 | 0 302 | 0 303 | 0 304 | 0 305 | 0 306 | 1 307 | 0 308 | 1 309 | 0 310 | 0 311 | 0 312 | 0 313 | 0 314 | 0 315 | 1 316 | 1 317 | 0 318 | 0 319 | 0 320 | 0 321 | 0 322 | 0 323 | 0 324 | 0 325 | 1 326 | 0 327 | 1 328 | 0 329 | 0 330 | 0 331 | 1 332 | 0 333 | 0 334 | 0 335 | 0 336 | 1 337 | 0 338 | 0 339 | 0 340 | 0 341 | 0 342 | 0 343 | 0 344 | 1 345 | 0 346 | 1 347 | 0 348 | 0 349 | 0 350 | 1 351 | 1 352 | 0 353 | 0 354 | 0 355 | 0 356 | 0 357 | 1 358 | 0 359 | 0 360 | 0 361 | 0 362 | 1 363 | 1 364 | 0 365 | 1 366 | 0 367 | 0 368 | 0 369 | 1 370 | 0 371 | 0 372 | 1 373 | 0 374 | 0 375 | 1 376 | 1 377 | 1 378 | 0 379 | 0 380 | 0 381 | 0 382 | 0 383 | 0 384 | 0 385 | 0 386 | 1 387 | 0 388 | 0 389 | 0 390 | 0 391 | 1 392 | 1 393 | 0 394 | 0 395 | 0 396 | 1 397 | 0 398 | 1 399 | 0 400 | 0 401 | 1 402 | 0 403 | 1 404 | 1 405 | 0 406 | 0 407 | 0 408 | 0 409 | 0 410 | 1 411 | 0 412 | 1 413 | 0 414 | 0 415 | 1 416 | 0 417 | 0 418 | 0 419 | -------------------------------------------------------------------------------- /titanic/data/result2.csv: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 0 4 | 0 5 | 1 6 | 0 7 | 1 8 | 0 9 | 1 10 | 0 11 | 0 12 | 0 13 | 1 14 | 0 15 | 1 16 | 1 17 | 0 18 | 0 19 | 1 20 | 0 21 | 0 22 | 0 23 | 1 24 | 1 25 | 1 26 | 0 27 | 1 28 | 0 29 | 0 30 | 0 31 | 0 32 | 0 33 | 0 34 | 0 35 | 0 
36 | 0 37 | 1 38 | 1 39 | 0 40 | 0 41 | 0 42 | 0 43 | 0 44 | 1 45 | 1 46 | 0 47 | 0 48 | 0 49 | 1 50 | 0 51 | 1 52 | 0 53 | 1 54 | 1 55 | 0 56 | 0 57 | 0 58 | 0 59 | 0 60 | 1 61 | 0 62 | 0 63 | 0 64 | 1 65 | 0 66 | 1 67 | 1 68 | 0 69 | 0 70 | 1 71 | 1 72 | 0 73 | 1 74 | 0 75 | 1 76 | 0 77 | 0 78 | 1 79 | 0 80 | 1 81 | 0 82 | 0 83 | 0 84 | 0 85 | 0 86 | 0 87 | 1 88 | 1 89 | 1 90 | 1 91 | 1 92 | 0 93 | 1 94 | 0 95 | 0 96 | 0 97 | 1 98 | 0 99 | 1 100 | 0 101 | 1 102 | 0 103 | 0 104 | 0 105 | 1 106 | 0 107 | 0 108 | 0 109 | 0 110 | 0 111 | 0 112 | 1 113 | 1 114 | 1 115 | 1 116 | 0 117 | 0 118 | 1 119 | 0 120 | 1 121 | 1 122 | 0 123 | 1 124 | 0 125 | 0 126 | 1 127 | 0 128 | 0 129 | 0 130 | 0 131 | 0 132 | 0 133 | 0 134 | 0 135 | 0 136 | 0 137 | 0 138 | 0 139 | 1 140 | 0 141 | 0 142 | 1 143 | 0 144 | 0 145 | 0 146 | 0 147 | 0 148 | 0 149 | 0 150 | 0 151 | 1 152 | 0 153 | 0 154 | 0 155 | 0 156 | 0 157 | 1 158 | 1 159 | 0 160 | 0 161 | 1 162 | 0 163 | 1 164 | 0 165 | 0 166 | 0 167 | 0 168 | 0 169 | 1 170 | 1 171 | 0 172 | 0 173 | 0 174 | 0 175 | 0 176 | 1 177 | 1 178 | 0 179 | 1 180 | 1 181 | 0 182 | 0 183 | 1 184 | 0 185 | 1 186 | 0 187 | 1 188 | 0 189 | 0 190 | 0 191 | 0 192 | 0 193 | 0 194 | 0 195 | 1 196 | 0 197 | 1 198 | 1 199 | 0 200 | 1 201 | 0 202 | 0 203 | 0 204 | 1 205 | 0 206 | 0 207 | 0 208 | 0 209 | 1 210 | 0 211 | 0 212 | 0 213 | 0 214 | 1 215 | 0 216 | 0 217 | 1 218 | 0 219 | 1 220 | 0 221 | 1 222 | 0 223 | 1 224 | 0 225 | 1 226 | 0 227 | 0 228 | 1 229 | 0 230 | 0 231 | 0 232 | 1 233 | 0 234 | 0 235 | 0 236 | 0 237 | 0 238 | 0 239 | 1 240 | 1 241 | 1 242 | 1 243 | 0 244 | 0 245 | 0 246 | 0 247 | 1 248 | 0 249 | 1 250 | 0 251 | 1 252 | 0 253 | 0 254 | 0 255 | 0 256 | 0 257 | 0 258 | 0 259 | 1 260 | 0 261 | 0 262 | 0 263 | 1 264 | 1 265 | 0 266 | 0 267 | 0 268 | 0 269 | 1 270 | 0 271 | 0 272 | 0 273 | 1 274 | 0 275 | 0 276 | 1 277 | 0 278 | 0 279 | 0 280 | 0 281 | 1 282 | 0 283 | 1 284 | 1 285 | 1 286 | 0 287 | 0 288 | 1 289 | 0 290 | 0 291 | 0 292 | 1 293 | 0 294 | 0 295 | 0 296 | 0 297 | 1 298 | 0 299 | 0 300 | 0 301 | 0 302 | 0 303 | 0 304 | 0 305 | 1 306 | 1 307 | 0 308 | 1 309 | 0 310 | 0 311 | 0 312 | 0 313 | 0 314 | 0 315 | 1 316 | 1 317 | 0 318 | 0 319 | 0 320 | 0 321 | 0 322 | 0 323 | 0 324 | 0 325 | 1 326 | 0 327 | 1 328 | 0 329 | 0 330 | 0 331 | 1 332 | 0 333 | 0 334 | 1 335 | 0 336 | 0 337 | 0 338 | 0 339 | 0 340 | 0 341 | 0 342 | 0 343 | 0 344 | 1 345 | 0 346 | 1 347 | 0 348 | 0 349 | 0 350 | 1 351 | 1 352 | 0 353 | 0 354 | 0 355 | 1 356 | 0 357 | 1 358 | 0 359 | 0 360 | 0 361 | 0 362 | 1 363 | 1 364 | 0 365 | 1 366 | 0 367 | 0 368 | 0 369 | 1 370 | 0 371 | 0 372 | 1 373 | 0 374 | 0 375 | 1 376 | 1 377 | 1 378 | 0 379 | 0 380 | 0 381 | 0 382 | 0 383 | 0 384 | 1 385 | 0 386 | 1 387 | 0 388 | 0 389 | 0 390 | 0 391 | 1 392 | 1 393 | 0 394 | 0 395 | 0 396 | 1 397 | 0 398 | 1 399 | 0 400 | 0 401 | 1 402 | 0 403 | 1 404 | 1 405 | 0 406 | 0 407 | 0 408 | 0 409 | 1 410 | 1 411 | 1 412 | 1 413 | 1 414 | 0 415 | 1 416 | 0 417 | 0 418 | 0 419 | -------------------------------------------------------------------------------- /titanic/data/result3.csv: -------------------------------------------------------------------------------- 1 | 0.0 2 | 0.0 3 | 0.0 4 | 0.0 5 | 0.0 6 | 0.0 7 | 0.0 8 | 0.0 9 | 0.0 10 | 0.0 11 | 0.0 12 | 0.0 13 | 1.0 14 | 0.0 15 | 1.0 16 | 0.0 17 | 0.0 18 | 0.0 19 | 0.0 20 | 0.0 21 | 0.0 22 | 0.0 23 | 0.0 24 | 0.0 25 | 1.0 26 | 0.0 27 | 1.0 28 | 0.0 29 | 0.0 30 | 1.0 31 | 0.0 32 | 0.0 33 | 0.0 34 | 1.0 35 | 0.0 36 | 0.0 37 | 1.0 38 | 0.0 39 | 0.0 40 | 0.0 41 | 0.0 
42 | 0.0 43 | 0.0 44 | 0.0 45 | 1.0 46 | 0.0 47 | 0.0 48 | 0.0 49 | 1.0 50 | 0.0 51 | 0.0 52 | 0.0 53 | 0.0 54 | 1.0 55 | 1.0 56 | 0.0 57 | 0.0 58 | 0.0 59 | 1.0 60 | 1.0 61 | 0.0 62 | 0.0 63 | 0.0 64 | 0.0 65 | 0.0 66 | 0.0 67 | 0.0 68 | 0.0 69 | 0.0 70 | 1.0 71 | 0.0 72 | 0.0 73 | 0.0 74 | 0.0 75 | 1.0 76 | 0.0 77 | 0.0 78 | 1.0 79 | 0.0 80 | 0.0 81 | 0.0 82 | 0.0 83 | 0.0 84 | 0.0 85 | 0.0 86 | 0.0 87 | 0.0 88 | 0.0 89 | 1.0 90 | 0.0 91 | 0.0 92 | 0.0 93 | 1.0 94 | 0.0 95 | 0.0 96 | 0.0 97 | 1.0 98 | 0.0 99 | 0.0 100 | 0.0 101 | 1.0 102 | 0.0 103 | 0.0 104 | 0.0 105 | 0.0 106 | 0.0 107 | 0.0 108 | 0.0 109 | 0.0 110 | 0.0 111 | 0.0 112 | 0.0 113 | 1.0 114 | 0.0 115 | 1.0 116 | 0.0 117 | 1.0 118 | 0.0 119 | 0.0 120 | 0.0 121 | 0.0 122 | 0.0 123 | 1.0 124 | 0.0 125 | 0.0 126 | 0.0 127 | 0.0 128 | 1.0 129 | 0.0 130 | 0.0 131 | 0.0 132 | 0.0 133 | 1.0 134 | 1.0 135 | 0.0 136 | 0.0 137 | 0.0 138 | 0.0 139 | 0.0 140 | 0.0 141 | 0.0 142 | 1.0 143 | 0.0 144 | 0.0 145 | 0.0 146 | 0.0 147 | 0.0 148 | 0.0 149 | 0.0 150 | 0.0 151 | 1.0 152 | 0.0 153 | 0.0 154 | 0.0 155 | 0.0 156 | 0.0 157 | 1.0 158 | 0.0 159 | 0.0 160 | 0.0 161 | 0.0 162 | 0.0 163 | 0.0 164 | 0.0 165 | 0.0 166 | 0.0 167 | 0.0 168 | 0.0 169 | 0.0 170 | 0.0 171 | 0.0 172 | 0.0 173 | 0.0 174 | 0.0 175 | 0.0 176 | 0.0 177 | 0.0 178 | 0.0 179 | 0.0 180 | 1.0 181 | 0.0 182 | 0.0 183 | 1.0 184 | 0.0 185 | 1.0 186 | 0.0 187 | 0.0 188 | 0.0 189 | 0.0 190 | 0.0 191 | 0.0 192 | 0.0 193 | 0.0 194 | 0.0 195 | 0.0 196 | 0.0 197 | 0.0 198 | 0.0 199 | 0.0 200 | 1.0 201 | 0.0 202 | 1.0 203 | 0.0 204 | 0.0 205 | 0.0 206 | 0.0 207 | 0.0 208 | 0.0 209 | 1.0 210 | 0.0 211 | 0.0 212 | 0.0 213 | 0.0 214 | 0.0 215 | 0.0 216 | 0.0 217 | 1.0 218 | 0.0 219 | 1.0 220 | 0.0 221 | 0.0 222 | 0.0 223 | 0.0 224 | 0.0 225 | 1.0 226 | 0.0 227 | 0.0 228 | 1.0 229 | 0.0 230 | 0.0 231 | 0.0 232 | 1.0 233 | 0.0 234 | 0.0 235 | 0.0 236 | 0.0 237 | 0.0 238 | 0.0 239 | 0.0 240 | 1.0 241 | 1.0 242 | 0.0 243 | 0.0 244 | 0.0 245 | 1.0 246 | 0.0 247 | 0.0 248 | 0.0 249 | 0.0 250 | 0.0 251 | 0.0 252 | 0.0 253 | 0.0 254 | 0.0 255 | 0.0 256 | 0.0 257 | 0.0 258 | 0.0 259 | 0.0 260 | 0.0 261 | 0.0 262 | 0.0 263 | 0.0 264 | 0.0 265 | 0.0 266 | 0.0 267 | 1.0 268 | 0.0 269 | 1.0 270 | 0.0 271 | 0.0 272 | 0.0 273 | 1.0 274 | 0.0 275 | 1.0 276 | 0.0 277 | 0.0 278 | 0.0 279 | 0.0 280 | 0.0 281 | 0.0 282 | 0.0 283 | 1.0 284 | 0.0 285 | 0.0 286 | 0.0 287 | 1.0 288 | 0.0 289 | 0.0 290 | 0.0 291 | 0.0 292 | 0.0 293 | 0.0 294 | 0.0 295 | 0.0 296 | 0.0 297 | 0.0 298 | 1.0 299 | 0.0 300 | 0.0 301 | 0.0 302 | 1.0 303 | 0.0 304 | 0.0 305 | 1.0 306 | 1.0 307 | 0.0 308 | 0.0 309 | 0.0 310 | 0.0 311 | 0.0 312 | 0.0 313 | 0.0 314 | 0.0 315 | 1.0 316 | 0.0 317 | 0.0 318 | 0.0 319 | 0.0 320 | 0.0 321 | 0.0 322 | 0.0 323 | 0.0 324 | 0.0 325 | 1.0 326 | 0.0 327 | 0.0 328 | 0.0 329 | 0.0 330 | 0.0 331 | 0.0 332 | 0.0 333 | 1.0 334 | 0.0 335 | 0.0 336 | 0.0 337 | 0.0 338 | 0.0 339 | 0.0 340 | 0.0 341 | 0.0 342 | 0.0 343 | 0.0 344 | 1.0 345 | 0.0 346 | 0.0 347 | 0.0 348 | 0.0 349 | 0.0 350 | 0.0 351 | 1.0 352 | 0.0 353 | 0.0 354 | 0.0 355 | 0.0 356 | 0.0 357 | 1.0 358 | 0.0 359 | 0.0 360 | 0.0 361 | 0.0 362 | 0.0 363 | 0.0 364 | 0.0 365 | 1.0 366 | 0.0 367 | 0.0 368 | 0.0 369 | 1.0 370 | 0.0 371 | 0.0 372 | 1.0 373 | 0.0 374 | 0.0 375 | 1.0 376 | 1.0 377 | 0.0 378 | 0.0 379 | 0.0 380 | 0.0 381 | 0.0 382 | 0.0 383 | 0.0 384 | 0.0 385 | 0.0 386 | 0.0 387 | 0.0 388 | 0.0 389 | 0.0 390 | 0.0 391 | 0.0 392 | 1.0 393 | 0.0 394 | 0.0 395 | 0.0 396 | 1.0 397 | 0.0 398 | 1.0 399 | 0.0 400 | 0.0 401 | 1.0 402 | 0.0 
403 | 1.0 404 | 0.0 405 | 0.0 406 | 0.0 407 | 0.0 408 | 0.0 409 | 0.0 410 | 0.0 411 | 1.0 412 | 1.0 413 | 0.0 414 | 0.0 415 | 1.0 416 | 0.0 417 | 0.0 418 | 0.0 419 | -------------------------------------------------------------------------------- /titanic/data/result4.csv: -------------------------------------------------------------------------------- 1 | 1.0 2 | 0.0 3 | 1.0 4 | 1.0 5 | 0.0 6 | 1.0 7 | 0.0 8 | 1.0 9 | 0.0 10 | 1.0 11 | 1.0 12 | 1.0 13 | 1.0 14 | 1.0 15 | 1.0 16 | 0.0 17 | 1.0 18 | 1.0 19 | 0.0 20 | 0.0 21 | 1.0 22 | 1.0 23 | 1.0 24 | 1.0 25 | 1.0 26 | 1.0 27 | 1.0 28 | 1.0 29 | 1.0 30 | 1.0 31 | 1.0 32 | 1.0 33 | 0.0 34 | 0.0 35 | 1.0 36 | 1.0 37 | 1.0 38 | 0.0 39 | 1.0 40 | 1.0 41 | 1.0 42 | 0.0 43 | 1.0 44 | 0.0 45 | 1.0 46 | 1.0 47 | 1.0 48 | 0.0 49 | 1.0 50 | 0.0 51 | 1.0 52 | 1.0 53 | 0.0 54 | 1.0 55 | 1.0 56 | 1.0 57 | 1.0 58 | 1.0 59 | 0.0 60 | 1.0 61 | 1.0 62 | 1.0 63 | 1.0 64 | 0.0 65 | 1.0 66 | 0.0 67 | 0.0 68 | 1.0 69 | 1.0 70 | 1.0 71 | 0.0 72 | 1.0 73 | 0.0 74 | 1.0 75 | 1.0 76 | 1.0 77 | 0.0 78 | 1.0 79 | 1.0 80 | 0.0 81 | 1.0 82 | 1.0 83 | 1.0 84 | 1.0 85 | 0.0 86 | 0.0 87 | 0.0 88 | 0.0 89 | 1.0 90 | 1.0 91 | 0.0 92 | 1.0 93 | 1.0 94 | 0.0 95 | 1.0 96 | 1.0 97 | 1.0 98 | 1.0 99 | 0.0 100 | 1.0 101 | 1.0 102 | 1.0 103 | 0.0 104 | 1.0 105 | 0.0 106 | 1.0 107 | 1.0 108 | 0.0 109 | 0.0 110 | 1.0 111 | 1.0 112 | 0.0 113 | 1.0 114 | 0.0 115 | 1.0 116 | 1.0 117 | 0.0 118 | 0.0 119 | 1.0 120 | 0.0 121 | 0.0 122 | 0.0 123 | 1.0 124 | 1.0 125 | 0.0 126 | 0.0 127 | 1.0 128 | 0.0 129 | 1.0 130 | 1.0 131 | 1.0 132 | 1.0 133 | 0.0 134 | 0.0 135 | 1.0 136 | 1.0 137 | 1.0 138 | 1.0 139 | 0.0 140 | 1.0 141 | 0.0 142 | 1.0 143 | 1.0 144 | 1.0 145 | 1.0 146 | 1.0 147 | 0.0 148 | 1.0 149 | 0.0 150 | 1.0 151 | 1.0 152 | 1.0 153 | 1.0 154 | 0.0 155 | 1.0 156 | 1.0 157 | 1.0 158 | 0.0 159 | 1.0 160 | 0.0 161 | 1.0 162 | 1.0 163 | 0.0 164 | 1.0 165 | 1.0 166 | 0.0 167 | 1.0 168 | 1.0 169 | 0.0 170 | 0.0 171 | 1.0 172 | 1.0 173 | 1.0 174 | 0.0 175 | 1.0 176 | 0.0 177 | 0.0 178 | 1.0 179 | 0.0 180 | 1.0 181 | 1.0 182 | 1.0 183 | 1.0 184 | 0.0 185 | 1.0 186 | 1.0 187 | 0.0 188 | 1.0 189 | 0.0 190 | 1.0 191 | 1.0 192 | 0.0 193 | 1.0 194 | 1.0 195 | 1.0 196 | 1.0 197 | 1.0 198 | 0.0 199 | 1.0 200 | 1.0 201 | 1.0 202 | 1.0 203 | 1.0 204 | 0.0 205 | 1.0 206 | 0.0 207 | 0.0 208 | 1.0 209 | 1.0 210 | 1.0 211 | 1.0 212 | 0.0 213 | 1.0 214 | 0.0 215 | 0.0 216 | 1.0 217 | 0.0 218 | 1.0 219 | 1.0 220 | 0.0 221 | 0.0 222 | 1.0 223 | 0.0 224 | 1.0 225 | 1.0 226 | 0.0 227 | 1.0 228 | 1.0 229 | 1.0 230 | 1.0 231 | 1.0 232 | 1.0 233 | 1.0 234 | 1.0 235 | 1.0 236 | 1.0 237 | 1.0 238 | 1.0 239 | 0.0 240 | 1.0 241 | 1.0 242 | 0.0 243 | 1.0 244 | 1.0 245 | 1.0 246 | 1.0 247 | 0.0 248 | 1.0 249 | 0.0 250 | 0.0 251 | 0.0 252 | 1.0 253 | 1.0 254 | 1.0 255 | 1.0 256 | 1.0 257 | 0.0 258 | 1.0 259 | 0.0 260 | 1.0 261 | 1.0 262 | 1.0 263 | 0.0 264 | 0.0 265 | 1.0 266 | 1.0 267 | 0.0 268 | 1.0 269 | 1.0 270 | 1.0 271 | 1.0 272 | 0.0 273 | 1.0 274 | 1.0 275 | 0.0 276 | 0.0 277 | 1.0 278 | 1.0 279 | 1.0 280 | 1.0 281 | 0.0 282 | 0.0 283 | 1.0 284 | 0.0 285 | 0.0 286 | 1.0 287 | 0.0 288 | 1.0 289 | 0.0 290 | 0.0 291 | 0.0 292 | 0.0 293 | 0.0 294 | 1.0 295 | 1.0 296 | 1.0 297 | 0.0 298 | 1.0 299 | 1.0 300 | 1.0 301 | 1.0 302 | 1.0 303 | 1.0 304 | 1.0 305 | 1.0 306 | 1.0 307 | 1.0 308 | 0.0 309 | 1.0 310 | 0.0 311 | 1.0 312 | 1.0 313 | 1.0 314 | 0.0 315 | 1.0 316 | 0.0 317 | 1.0 318 | 1.0 319 | 1.0 320 | 1.0 321 | 1.0 322 | 1.0 323 | 1.0 324 | 1.0 325 | 1.0 326 | 1.0 327 | 0.0 328 | 1.0 329 | 1.0 330 | 1.0 331 | 
0.0 332 | 1.0 333 | 0.0 334 | 0.0 335 | 1.0 336 | 1.0 337 | 1.0 338 | 1.0 339 | 1.0 340 | 0.0 341 | 1.0 342 | 1.0 343 | 0.0 344 | 1.0 345 | 0.0 346 | 0.0 347 | 1.0 348 | 0.0 349 | 1.0 350 | 0.0 351 | 1.0 352 | 1.0 353 | 1.0 354 | 1.0 355 | 0.0 356 | 1.0 357 | 1.0 358 | 1.0 359 | 0.0 360 | 0.0 361 | 1.0 362 | 0.0 363 | 0.0 364 | 1.0 365 | 1.0 366 | 0.0 367 | 0.0 368 | 0.0 369 | 1.0 370 | 1.0 371 | 1.0 372 | 1.0 373 | 0.0 374 | 1.0 375 | 1.0 376 | 1.0 377 | 0.0 378 | 1.0 379 | 1.0 380 | 1.0 381 | 0.0 382 | 1.0 383 | 0.0 384 | 0.0 385 | 1.0 386 | 0.0 387 | 1.0 388 | 1.0 389 | 1.0 390 | 1.0 391 | 1.0 392 | 1.0 393 | 1.0 394 | 1.0 395 | 1.0 396 | 1.0 397 | 1.0 398 | 1.0 399 | 1.0 400 | 1.0 401 | 1.0 402 | 1.0 403 | 1.0 404 | 1.0 405 | 1.0 406 | 1.0 407 | 1.0 408 | 1.0 409 | 1.0 410 | 0.0 411 | 1.0 412 | 1.0 413 | 0.0 414 | 0.0 415 | 1.0 416 | 1.0 417 | 0.0 418 | 0.0 419 | -------------------------------------------------------------------------------- /titanic/data/test.csv: -------------------------------------------------------------------------------- 1 | pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked 2 | 3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q 3 | 3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S 4 | 2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q 5 | 3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S 6 | 3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S 7 | 3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S 8 | 3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q 9 | 2,"Caldwell, Mr. Albert Francis",male,26,1,1,248738,29,,S 10 | 3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C 11 | 3,"Davies, Mr. John Samuel",male,21,2,0,A/4 48871,24.15,,S 12 | 3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S 13 | 1,"Jones, Mr. Charles Cresson",male,46,0,0,694,26,,S 14 | 1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23,1,0,21228,82.2667,B45,S 15 | 2,"Howard, Mr. Benjamin",male,63,1,0,24065,26,,S 16 | 1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)",female,47,1,0,W.E.P. 5734,61.175,E31,S 17 | 2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",female,24,1,0,SC/PARIS 2167,27.7208,,C 18 | 2,"Keane, Mr. Daniel",male,35,0,0,233734,12.35,,Q 19 | 3,"Assaf, Mr. Gerios",male,21,0,0,2692,7.225,,C 20 | 3,"Ilmakangas, Miss. Ida Livija",female,27,1,0,STON/O2. 3101270,7.925,,S 21 | 3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45,0,0,2696,7.225,,C 22 | 1,"Rothschild, Mr. Martin",male,55,1,0,PC 17603,59.4,,C 23 | 3,"Olsen, Master. Artur Karl",male,9,0,1,C 17368,3.1708,,S 24 | 1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S 25 | 1,"Williams, Mr. Richard Norris II",male,21,0,1,PC 17597,61.3792,,C 26 | 1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48,1,3,PC 17608,262.375,B57 B59 B63 B66,C 27 | 3,"Robins, Mr. Alexander A",male,50,1,0,A/5. 3337,14.5,,S 28 | 1,"Ostby, Miss. Helene Ragnhild",female,22,0,1,113509,61.9792,B36,C 29 | 3,"Daher, Mr. Shedid",male,22.5,0,0,2698,7.225,,C 30 | 1,"Brady, Mr. John Bertram",male,41,0,0,113054,30.5,A21,S 31 | 3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C 32 | 2,"Louch, Mr. Charles Alexander",male,50,1,0,SC/AH 3085,26,,S 33 | 2,"Jefferys, Mr. Clifford Thomas",male,24,2,0,C.A. 31029,31.5,,S 34 | 3,"Dean, Mrs. Bertram (Eva Georgetta Light)",female,33,1,2,C.A. 2315,20.575,,S 35 | 3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S 36 | 1,"Mock, Mr. 
Philipp Edmund",male,30,1,0,13236,57.75,C78,C 37 | 3,"Katavelas, Mr. Vassilios (Catavelas Vassilios"")""",male,18.5,0,0,2682,7.2292,,C 38 | 3,"Roth, Miss. Sarah A",female,,0,0,342712,8.05,,S 39 | 3,"Cacic, Miss. Manda",female,21,0,0,315087,8.6625,,S 40 | 3,"Sap, Mr. Julius",male,25,0,0,345768,9.5,,S 41 | 3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S 42 | 3,"Karun, Mr. Franz",male,39,0,1,349256,13.4167,,C 43 | 1,"Franklin, Mr. Thomas Parham",male,,0,0,113778,26.55,D34,S 44 | 3,"Goldsmith, Mr. Nathan",male,41,0,0,SOTON/O.Q. 3101263,7.85,,S 45 | 2,"Corbett, Mrs. Walter H (Irene Colvin)",female,30,0,0,237249,13,,S 46 | 1,"Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)",female,45,1,0,11753,52.5542,D19,S 47 | 3,"Peltomaki, Mr. Nikolai Johannes",male,25,0,0,STON/O 2. 3101291,7.925,,S 48 | 1,"Chevre, Mr. Paul Romaine",male,45,0,0,PC 17594,29.7,A9,C 49 | 3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.75,,Q 50 | 1,"Bucknell, Mrs. William Robert (Emma Eliza Ward)",female,60,0,0,11813,76.2917,D15,C 51 | 3,"Coutts, Mrs. William (Winnie Minnie"" Treanor)""",female,36,0,2,C.A. 37671,15.9,,S 52 | 1,"Smith, Mr. Lucien Philip",male,24,1,0,13695,60,C31,S 53 | 2,"Pulbaum, Mr. Franz",male,27,0,0,SC/PARIS 2168,15.0333,,C 54 | 2,"Hocking, Miss. Ellen Nellie""""",female,20,2,1,29105,23,,S 55 | 1,"Fortune, Miss. Ethel Flora",female,28,3,2,19950,263,C23 C25 C27,S 56 | 2,"Mangiavacchi, Mr. Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C 57 | 3,"Rice, Master. Albert",male,10,4,1,382652,29.125,,Q 58 | 3,"Cor, Mr. Bartol",male,35,0,0,349230,7.8958,,S 59 | 3,"Abelseth, Mr. Olaus Jorgensen",male,25,0,0,348122,7.65,F G63,S 60 | 3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1,,S 61 | 1,"Chaudanson, Miss. Victorine",female,36,0,0,PC 17608,262.375,B61,C 62 | 3,"Dika, Mr. Mirko",male,17,0,0,349232,7.8958,,S 63 | 2,"McCrae, Mr. Arthur Gordon",male,32,0,0,237216,13.5,,S 64 | 3,"Bjorklund, Mr. Ernst Herbert",male,18,0,0,347090,7.75,,S 65 | 3,"Bradley, Miss. Bridget Delia",female,22,0,0,334914,7.725,,Q 66 | 1,"Ryerson, Master. John Borie",male,13,2,2,PC 17608,262.375,B57 B59 B63 B66,C 67 | 2,"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)",female,,0,0,F.C.C. 13534,21,,S 68 | 3,"Burns, Miss. Mary Delia",female,18,0,0,330963,7.8792,,Q 69 | 1,"Moore, Mr. Clarence Bloomfield",male,47,0,0,113796,42.4,,S 70 | 1,"Tucker, Mr. Gilbert Milligan Jr",male,31,0,0,2543,28.5375,C53,C 71 | 1,"Fortune, Mrs. Mark (Mary McDougald)",female,60,1,4,19950,263,C23 C25 C27,S 72 | 3,"Mulvihill, Miss. Bertha E",female,24,0,0,382653,7.75,,Q 73 | 3,"Minkoff, Mr. Lazar",male,21,0,0,349211,7.8958,,S 74 | 3,"Nieminen, Miss. Manta Josefina",female,29,0,0,3101297,7.925,,S 75 | 1,"Ovies y Rodriguez, Mr. Servando",male,28.5,0,0,PC 17562,27.7208,D43,C 76 | 1,"Geiger, Miss. Amalie",female,35,0,0,113503,211.5,C130,C 77 | 1,"Keeping, Mr. Edwin",male,32.5,0,0,113503,211.5,C132,C 78 | 3,"Miles, Mr. Frank",male,,0,0,359306,8.05,,S 79 | 1,"Cornell, Mrs. Robert Clifford (Malvina Helen Lamson)",female,55,2,0,11770,25.7,C101,S 80 | 2,"Aldworth, Mr. Charles Augustus",male,30,0,0,248744,13,,S 81 | 3,"Doyle, Miss. Elizabeth",female,24,0,0,368702,7.75,,Q 82 | 3,"Boulos, Master. Akar",male,6,1,1,2678,15.2458,,C 83 | 1,"Straus, Mr. Isidor",male,67,1,0,PC 17483,221.7792,C55 C57,S 84 | 1,"Case, Mr. Howard Brown",male,49,0,0,19924,26,,S 85 | 3,"Demetri, Mr. Marinko",male,,0,0,349238,7.8958,,S 86 | 2,"Lamb, Mr. John Joseph",male,,0,0,240261,10.7083,,Q 87 | 3,"Khalil, Mr. Betros",male,,1,0,2660,14.4542,,C 88 | 3,"Barry, Miss. 
Julia",female,27,0,0,330844,7.8792,,Q 89 | 3,"Badman, Miss. Emily Louisa",female,18,0,0,A/4 31416,8.05,,S 90 | 3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q 91 | 2,"Wells, Master. Ralph Lester",male,2,1,1,29103,23,,S 92 | 3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judith Andersson)",female,22,1,0,347072,13.9,,S 93 | 3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S 94 | 1,"Davidson, Mrs. Thornton (Orian Hays)",female,27,1,2,F.C. 12750,52,B71,S 95 | 3,"Guest, Mr. Robert",male,,0,0,376563,8.05,,S 96 | 1,"Birnbaum, Mr. Jakob",male,25,0,0,13905,26,,C 97 | 3,"Tenglin, Mr. Gunnar Isidor",male,25,0,0,350033,7.7958,,S 98 | 1,"Cavendish, Mrs. Tyrell William (Julia Florence Siegel)",female,76,1,0,19877,78.85,C46,S 99 | 3,"Makinen, Mr. Kalle Edvard",male,29,0,0,STON/O 2. 3101268,7.925,,S 100 | 3,"Braf, Miss. Elin Ester Maria",female,20,0,0,347471,7.8542,,S 101 | 3,"Nancarrow, Mr. William Henry",male,33,0,0,A./5. 3338,8.05,,S 102 | 1,"Stengel, Mrs. Charles Emil Henry (Annie May Morris)",female,43,1,0,11778,55.4417,C116,C 103 | 2,"Weisz, Mr. Leopold",male,27,1,0,228414,26,,S 104 | 3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q 105 | 3,"Johansson Palmquist, Mr. Oskar Leander",male,26,0,0,347070,7.775,,S 106 | 3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16,1,1,2625,8.5167,,C 107 | 3,"Holthen, Mr. Johan Martin",male,28,0,0,C 4001,22.525,,S 108 | 3,"Buckley, Mr. Daniel",male,21,0,0,330920,7.8208,,Q 109 | 3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q 110 | 3,"Willer, Mr. Aaron (Abi Weller"")""",male,,0,0,3410,8.7125,,S 111 | 2,"Swane, Mr. George",male,18.5,0,0,248734,13,F,S 112 | 2,"Stanton, Mr. Samuel Ward",male,41,0,0,237734,15.0458,,C 113 | 3,"Shine, Miss. Ellen Natalia",female,,0,0,330968,7.7792,,Q 114 | 1,"Evans, Miss. Edith Corse",female,36,0,0,PC 17531,31.6792,A29,C 115 | 3,"Buckley, Miss. Katherine",female,18.5,0,0,329944,7.2833,,Q 116 | 1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63,1,0,PC 17483,221.7792,C55 C57,S 117 | 3,"Chronopoulos, Mr. Demetrios",male,18,1,0,2680,14.4542,,C 118 | 3,"Thomas, Mr. John",male,,0,0,2681,6.4375,,C 119 | 3,"Sandstrom, Miss. Beatrice Irene",female,1,1,1,PP 9549,16.7,G6,S 120 | 1,"Beattie, Mr. Thomson",male,36,0,0,13050,75.2417,C6,C 121 | 2,"Chapman, Mrs. John Henry (Sara Elizabeth Lawry)",female,29,1,0,SC/AH 29037,26,,S 122 | 2,"Watt, Miss. Bertha J",female,12,0,0,C.A. 33595,15.75,,S 123 | 3,"Kiernan, Mr. John",male,,1,0,367227,7.75,,Q 124 | 1,"Schabert, Mrs. Paul (Emma Mock)",female,35,1,0,13236,57.75,C28,C 125 | 3,"Carver, Mr. Alfred John",male,28,0,0,392095,7.25,,S 126 | 3,"Kennedy, Mr. John",male,,0,0,368783,7.75,,Q 127 | 3,"Cribb, Miss. Laura Alice",female,17,0,1,371362,16.1,,S 128 | 3,"Brobeck, Mr. Karl Rudolf",male,22,0,0,350045,7.7958,,S 129 | 3,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q 130 | 2,"Bowenur, Mr. Solomon",male,42,0,0,211535,13,,S 131 | 3,"Petersen, Mr. Marius",male,24,0,0,342441,8.05,,S 132 | 3,"Spinner, Mr. Henry John",male,32,0,0,STON/OQ. 369943,8.05,,S 133 | 1,"Gracie, Col. Archibald IV",male,53,0,0,113780,28.5,C51,C 134 | 3,"Lefebre, Mrs. Frank (Frances)",female,,0,4,4133,25.4667,,S 135 | 3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C 136 | 3,"Dintcheff, Mr. Valtcho",male,43,0,0,349226,7.8958,,S 137 | 3,"Carlsson, Mr. Carl Robert",male,24,0,0,350409,7.8542,,S 138 | 3,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C 139 | 2,"Schmidt, Mr. August",male,26,0,0,248659,13,,S 140 | 3,"Drapkin, Miss. Jennie",female,23,0,0,SOTON/OQ 392083,8.05,,S 141 | 3,"Goodwin, Mr. 
Charles Frederick",male,40,1,6,CA 2144,46.9,,S 142 | 3,"Goodwin, Miss. Jessie Allis",female,10,5,2,CA 2144,46.9,,S 143 | 1,"Daniels, Miss. Sarah",female,33,0,0,113781,151.55,,S 144 | 1,"Ryerson, Mr. Arthur Larned",male,61,1,3,PC 17608,262.375,B57 B59 B63 B66,C 145 | 2,"Beauchamp, Mr. Henry James",male,28,0,0,244358,26,,S 146 | 1,"Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lingrey"")""",male,42,0,0,17475,26.55,,S 147 | 3,"Vander Planke, Mr. Julius",male,31,3,0,345763,18,,S 148 | 1,"Hilliard, Mr. Herbert Henry",male,,0,0,17463,51.8625,E46,S 149 | 3,"Davies, Mr. Evan",male,22,0,0,SC/A4 23568,8.05,,S 150 | 1,"Crafton, Mr. John Bertram",male,,0,0,113791,26.55,,S 151 | 2,"Lahtinen, Rev. William",male,30,1,1,250651,26,,S 152 | 1,"Earnshaw, Mrs. Boulton (Olive Potter)",female,23,0,1,11767,83.1583,C54,C 153 | 3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C 154 | 3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S 155 | 3,"Klasen, Mrs. (Hulda Kristina Eugenia Lofqvist)",female,36,0,2,350405,12.1833,,S 156 | 3,"Asplund, Master. Filip Oscar",male,13,4,2,347077,31.3875,,S 157 | 3,"Duquemin, Mr. Joseph",male,24,0,0,S.O./P.P. 752,7.55,,S 158 | 1,"Bird, Miss. Ellen",female,29,0,0,PC 17483,221.7792,C97,S 159 | 3,"Lundin, Miss. Olga Elida",female,23,0,0,347469,7.8542,,S 160 | 1,"Borebank, Mr. John James",male,42,0,0,110489,26.55,D22,S 161 | 3,"Peacock, Mrs. Benjamin (Edith Nile)",female,26,0,2,SOTON/O.Q. 3101315,13.775,,S 162 | 3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q 163 | 3,"Touma, Master. Georges Youssef",male,7,1,1,2650,15.2458,,C 164 | 2,"Wright, Miss. Marion",female,26,0,0,220844,13.5,,S 165 | 3,"Pearce, Mr. Ernest",male,,0,0,343271,7,,S 166 | 2,"Peruschitz, Rev. Joseph Maria",male,41,0,0,237393,13,,S 167 | 3,"Kink-Heilmann, Mrs. Anton (Luise Heilmann)",female,26,1,1,315153,22.025,,S 168 | 1,"Brandeis, Mr. Emil",male,48,0,0,PC 17591,50.4958,B10,C 169 | 3,"Ford, Mr. Edward Watson",male,18,2,2,W./C. 6608,34.375,,S 170 | 1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)",female,,0,0,17770,27.7208,,C 171 | 3,"Hellstrom, Miss. Hilda Maria",female,22,0,0,7548,8.9625,,S 172 | 3,"Lithman, Mr. Simon",male,,0,0,S.O./P.P. 251,7.55,,S 173 | 3,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,,C 174 | 3,"Dyker, Mr. Adolf Fredrik",male,23,1,0,347072,13.9,,S 175 | 3,"Torfa, Mr. Assad",male,,0,0,2673,7.2292,,C 176 | 3,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40,1,5,347077,31.3875,,S 177 | 2,"Brown, Miss. Edith Eileen",female,15,0,2,29750,39,,S 178 | 2,"Sincock, Miss. Maude",female,20,0,0,C.A. 33112,36.75,,S 179 | 1,"Stengel, Mr. Charles Emil Henry",male,54,1,0,11778,55.4417,C116,C 180 | 2,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36,0,3,230136,39,F4,S 181 | 1,"Compton, Mrs. Alexander Taylor (Mary Eliza Ingersoll)",female,64,0,2,PC 17756,83.1583,E45,C 182 | 2,"McCrie, Mr. James Matthew",male,30,0,0,233478,13,,S 183 | 1,"Compton, Mr. Alexander Taylor Jr",male,37,1,1,PC 17756,83.1583,E52,C 184 | 1,"Marvin, Mrs. Daniel Warner (Mary Graham Carmichael Farquarson)",female,18,1,0,113773,53.1,D30,S 185 | 3,"Lane, Mr. Patrick",male,,0,0,7935,7.75,,Q 186 | 1,"Douglas, Mrs. Frederick Charles (Mary Helene Baxter)",female,27,1,1,PC 17558,247.5208,B58 B60,C 187 | 2,"Maybery, Mr. Frank Hubert",male,40,0,0,239059,16,,S 188 | 2,"Phillips, Miss. Alice Frances Louisa",female,21,0,1,S.O./P.P. 2,21,,S 189 | 3,"Davies, Mr. Joseph",male,17,2,0,A/4 48873,8.05,,S 190 | 3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S 191 | 2,"Veal, Mr. James",male,40,0,0,28221,13,,S 192 | 2,"Angle, Mr. 
William A",male,34,1,0,226875,26,,S 193 | 1,"Salomon, Mr. Abraham L",male,,0,0,111163,26,,S 194 | 3,"van Billiard, Master. Walter John",male,11.5,1,1,A/5. 851,14.5,,S 195 | 2,"Lingane, Mr. John",male,61,0,0,235509,12.35,,Q 196 | 2,"Drew, Master. Marshall Brines",male,8,0,2,28220,32.5,,S 197 | 3,"Karlsson, Mr. Julius Konrad Eugen",male,33,0,0,347465,7.8542,,S 198 | 1,"Spedden, Master. Robert Douglas",male,6,0,2,16966,134.5,E34,C 199 | 3,"Nilsson, Miss. Berta Olivia",female,18,0,0,347066,7.775,,S 200 | 2,"Baimbrigge, Mr. Charles Robert",male,23,0,0,C.A. 31030,10.5,,S 201 | 3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S 202 | 3,"Murphy, Miss. Nora",female,,0,0,36568,15.5,,Q 203 | 3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S 204 | 1,"Astor, Col. John Jacob",male,47,1,0,PC 17757,227.525,C62 C64,C 205 | 2,"Quick, Miss. Winifred Vera",female,8,1,1,26360,26,,S 206 | 2,"Andrew, Mr. Frank Thomas",male,25,0,0,C.A. 34050,10.5,,S 207 | 1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C 208 | 3,"McGowan, Miss. Katherine",female,35,0,0,9232,7.75,,Q 209 | 2,"Collett, Mr. Sidney C Stuart",male,24,0,0,28034,10.5,,S 210 | 1,"Rosenbaum, Miss. Edith Louise",female,33,0,0,PC 17613,27.7208,A11,C 211 | 3,"Delalic, Mr. Redjo",male,25,0,0,349250,7.8958,,S 212 | 3,"Andersen, Mr. Albert Karvin",male,32,0,0,C 4001,22.525,,S 213 | 3,"Finoli, Mr. Luigi",male,,0,0,SOTON/O.Q. 3101308,7.05,,S 214 | 2,"Deacon, Mr. Percy William",male,17,0,0,S.O.C. 14879,73.5,,S 215 | 2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60,1,0,24065,26,,S 216 | 3,"Andersson, Miss. Ida Augusta Margareta",female,38,4,2,347091,7.775,,S 217 | 1,"Head, Mr. Christopher",male,42,0,0,113038,42.5,B11,S 218 | 3,"Mahon, Miss. Bridget Delia",female,,0,0,330924,7.8792,,Q 219 | 1,"Wick, Mr. George Dennick",male,57,1,1,36928,164.8667,,S 220 | 1,"Widener, Mrs. George Dunton (Eleanor Elkins)",female,50,1,1,113503,211.5,C80,C 221 | 3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S 222 | 2,"Duran y More, Miss. Florentina",female,30,1,0,SC/PARIS 2148,13.8583,,C 223 | 3,"Reynolds, Mr. Harold J",male,21,0,0,342684,8.05,,S 224 | 2,"Cook, Mrs. (Selena Rogers)",female,22,0,0,W./C. 14266,10.5,F33,S 225 | 3,"Karlsson, Mr. Einar Gervasius",male,21,0,0,350053,7.7958,,S 226 | 1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53,0,0,PC 17606,27.4458,,C 227 | 3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C 228 | 3,"Asplund, Mr. Johan Charles",male,23,0,0,350054,7.7958,,S 229 | 3,"McNeill, Miss. Bridget",female,,0,0,370368,7.75,,Q 230 | 3,"Everett, Mr. Thomas James",male,40.5,0,0,C.A. 6212,15.1,,S 231 | 2,"Hocking, Mr. Samuel James Metcalfe",male,36,0,0,242963,13,,S 232 | 2,"Sweet, Mr. George Frederick",male,14,0,0,220845,65,,S 233 | 1,"Willard, Miss. Constance",female,21,0,0,113795,26.55,,S 234 | 3,"Wiklund, Mr. Karl Johan",male,21,1,0,3101266,6.4958,,S 235 | 3,"Linehan, Mr. Michael",male,,0,0,330971,7.8792,,Q 236 | 1,"Cumings, Mr. John Bradley",male,39,1,0,PC 17599,71.2833,C85,C 237 | 3,"Vendel, Mr. Olof Edvin",male,20,0,0,350416,7.8542,,S 238 | 1,"Warren, Mr. Frank Manley",male,64,1,0,110813,75.25,D37,C 239 | 3,"Baccos, Mr. Raffull",male,20,0,0,2679,7.225,,C 240 | 2,"Hiltunen, Miss. Marta",female,18,1,1,250650,13,,S 241 | 1,"Douglas, Mrs. Walter Donald (Mahala Dutton)",female,48,1,0,PC 17761,106.425,C86,C 242 | 1,"Lindstrom, Mrs. Carl Johan (Sigrid Posse)",female,55,0,0,112377,27.7208,,C 243 | 2,"Christy, Mrs. 
(Alice Frances)",female,45,0,2,237789,30,,S 244 | 1,"Spedden, Mr. Frederic Oakley",male,45,1,1,16966,134.5,E34,C 245 | 3,"Hyman, Mr. Abraham",male,,0,0,3470,7.8875,,S 246 | 3,"Johnston, Master. William Arthur Willie""""",male,,1,2,W./C. 6607,23.45,,S 247 | 1,"Kenyon, Mr. Frederick R",male,41,1,0,17464,51.8625,D21,S 248 | 2,"Karnes, Mrs. J Frank (Claire Bennett)",female,22,0,0,F.C.C. 13534,21,,S 249 | 2,"Drew, Mr. James Vivian",male,42,1,1,28220,32.5,,S 250 | 2,"Hold, Mrs. Stephen (Annie Margaret Hill)",female,29,1,0,26707,26,,S 251 | 3,"Khalil, Mrs. Betros (Zahie Maria"" Elias)""",female,,1,0,2660,14.4542,,C 252 | 2,"West, Miss. Barbara J",female,0.92,1,2,C.A. 34651,27.75,,S 253 | 3,"Abrahamsson, Mr. Abraham August Johannes",male,20,0,0,SOTON/O2 3101284,7.925,,S 254 | 1,"Clark, Mr. Walter Miller",male,27,1,0,13508,136.7792,C89,C 255 | 3,"Salander, Mr. Karl Johan",male,24,0,0,7266,9.325,,S 256 | 3,"Wenzel, Mr. Linhart",male,32.5,0,0,345775,9.5,,S 257 | 3,"MacKay, Mr. George William",male,,0,0,C.A. 42795,7.55,,S 258 | 3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q 259 | 3,"Niklasson, Mr. Samuel",male,28,0,0,363611,8.05,,S 260 | 2,"Bentham, Miss. Lilian W",female,19,0,0,28404,13,,S 261 | 3,"Midtsjo, Mr. Karl Albert",male,21,0,0,345501,7.775,,S 262 | 3,"de Messemaeker, Mr. Guillaume Joseph",male,36.5,1,0,345572,17.4,,S 263 | 3,"Nilsson, Mr. August Ferdinand",male,21,0,0,350410,7.8542,,S 264 | 2,"Wells, Mrs. Arthur Henry (Addie"" Dart Trevaskis)""",female,29,0,2,29103,23,,S 265 | 3,"Klasen, Miss. Gertrud Emilia",female,1,1,1,350405,12.1833,,S 266 | 2,"Portaluppi, Mr. Emilio Ilario Giuseppe",male,30,0,0,C.A. 34644,12.7375,,C 267 | 3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S 268 | 1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0,,S 269 | 3,"Warren, Mr. Charles William",male,,0,0,C.A. 49867,7.55,,S 270 | 3,"Howard, Miss. May Elizabeth",female,,0,0,A. 2. 39186,8.05,,S 271 | 3,"Pokrnic, Mr. Mate",male,17,0,0,315095,8.6625,,S 272 | 1,"McCaffry, Mr. Thomas Francis",male,46,0,0,13050,75.2417,C6,C 273 | 3,"Fox, Mr. Patrick",male,,0,0,368573,7.75,,Q 274 | 1,"Clark, Mrs. Walter Miller (Virginia McDowell)",female,26,1,0,13508,136.7792,C89,C 275 | 3,"Lennon, Miss. Mary",female,,1,0,370371,15.5,,Q 276 | 3,"Saade, Mr. Jean Nassr",male,,0,0,2676,7.225,,C 277 | 2,"Bryhl, Miss. Dagmar Jenny Ingeborg ",female,20,1,0,236853,26,,S 278 | 2,"Parker, Mr. Clifford Richard",male,28,0,0,SC 14888,10.5,,S 279 | 2,"Faunthorpe, Mr. Harry",male,40,1,0,2926,26,,S 280 | 2,"Ware, Mr. John James",male,30,1,0,CA 31352,21,,S 281 | 2,"Oxenham, Mr. Percy Thomas",male,22,0,0,W./C. 14260,10.5,,S 282 | 3,"Oreskovic, Miss. Jelka",female,23,0,0,315085,8.6625,,S 283 | 3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.775,,S 284 | 3,"Fleming, Miss. Honora",female,,0,0,364859,7.75,,Q 285 | 3,"Touma, Miss. Maria Youssef",female,9,1,1,2650,15.2458,,C 286 | 3,"Rosblom, Miss. Salli Helena",female,2,1,1,370129,20.2125,,S 287 | 3,"Dennis, Mr. William",male,36,0,0,A/5 21175,7.25,,S 288 | 3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S 289 | 1,"Snyder, Mr. John Pillsbury",male,24,1,0,21228,82.2667,B45,S 290 | 3,"Mardirosian, Mr. Sarkis",male,,0,0,2655,7.2292,F E46,C 291 | 3,"Ford, Mr. Arthur",male,,0,0,A/5 1478,8.05,,S 292 | 1,"Rheims, Mr. George Alexander Lucien",male,,0,0,PC 17607,39.6,,S 293 | 3,"Daly, Miss. Margaret Marcella Maggie""""",female,30,0,0,382650,6.95,,Q 294 | 3,"Nasr, Mr. Mustafa",male,,0,0,2652,7.2292,,C 295 | 1,"Dodge, Dr. 
Washington",male,53,1,1,33638,81.8583,A34,S 296 | 3,"Wittevrongel, Mr. Camille",male,36,0,0,345771,9.5,,S 297 | 3,"Angheloff, Mr. Minko",male,26,0,0,349202,7.8958,,S 298 | 2,"Laroche, Miss. Louise",female,1,1,2,SC/Paris 2123,41.5792,,C 299 | 3,"Samaan, Mr. Hanna",male,,2,0,2662,21.6792,,C 300 | 1,"Loring, Mr. Joseph Holland",male,30,0,0,113801,45.5,,S 301 | 3,"Johansson, Mr. Nils",male,29,0,0,347467,7.8542,,S 302 | 3,"Olsson, Mr. Oscar Wilhelm",male,32,0,0,347079,7.775,,S 303 | 2,"Malachard, Mr. Noel",male,,0,0,237735,15.0458,D,C 304 | 2,"Phillips, Mr. Escott Robert",male,43,0,1,S.O./P.P. 2,21,,S 305 | 3,"Pokrnic, Mr. Tome",male,24,0,0,315092,8.6625,,S 306 | 3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q 307 | 1,"Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)",female,64,1,1,112901,26.55,B26,S 308 | 1,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S 309 | 3,"Aks, Master. Philip Frank",male,0.83,0,1,392091,9.35,,S 310 | 1,"Hays, Mr. Charles Melville",male,55,1,1,12749,93.5,B69,S 311 | 3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45,1,0,350026,14.1083,,S 312 | 3,"Cacic, Mr. Jego Grga",male,18,0,0,315091,8.6625,,S 313 | 3,"Vartanian, Mr. David",male,22,0,0,2658,7.225,,C 314 | 3,"Sadowitz, Mr. Harry",male,,0,0,LP 1588,7.575,,S 315 | 3,"Carr, Miss. Jeannie",female,37,0,0,368364,7.75,,Q 316 | 1,"White, Mrs. John Stuart (Ella Holmes)",female,55,0,0,PC 17760,135.6333,C32,C 317 | 3,"Hagardon, Miss. Kate",female,17,0,0,AQ/3. 30631,7.7333,,Q 318 | 1,"Spencer, Mr. William Augustus",male,57,1,0,PC 17569,146.5208,B78,C 319 | 2,"Rogers, Mr. Reginald Harry",male,19,0,0,28004,10.5,,S 320 | 3,"Jonsson, Mr. Nils Hilding",male,27,0,0,350408,7.8542,,S 321 | 2,"Jefferys, Mr. Ernest Wilfred",male,22,2,0,C.A. 31029,31.5,,S 322 | 3,"Andersson, Mr. Johan Samuel",male,26,0,0,347075,7.775,,S 323 | 3,"Krekorian, Mr. Neshan",male,25,0,0,2654,7.2292,F E57,C 324 | 2,"Nesson, Mr. Israel",male,26,0,0,244368,13,F2,S 325 | 1,"Rowe, Mr. Alfred G",male,33,0,0,113790,26.55,,S 326 | 1,"Kreuchen, Miss. Emilie",female,39,0,0,24160,211.3375,,S 327 | 3,"Assam, Mr. Ali",male,23,0,0,SOTON/O.Q. 3101309,7.05,,S 328 | 2,"Becker, Miss. Ruth Elizabeth",female,12,2,1,230136,39,F4,S 329 | 1,"Rosenshine, Mr. George (Mr George Thorne"")""",male,46,0,0,PC 17585,79.2,,C 330 | 2,"Clarke, Mr. Charles Valentine",male,29,1,0,2003,26,,S 331 | 2,"Enander, Mr. Ingvar",male,21,0,0,236854,13,,S 332 | 2,"Davies, Mrs. John Morgan (Elizabeth Agnes Mary White) ",female,48,0,2,C.A. 33112,36.75,,S 333 | 1,"Dulles, Mr. William Crothers",male,39,0,0,PC 17580,29.7,A18,C 334 | 3,"Thomas, Mr. Tannous",male,,0,0,2684,7.225,,C 335 | 3,"Nakid, Mrs. Said (Waika Mary"" Mowad)""",female,19,1,1,2653,15.7417,,C 336 | 3,"Cor, Mr. Ivan",male,27,0,0,349229,7.8958,,S 337 | 1,"Maguire, Mr. John Edward",male,30,0,0,110469,26,C106,S 338 | 2,"de Brito, Mr. Jose Joaquim",male,32,0,0,244360,13,,S 339 | 3,"Elias, Mr. Joseph",male,39,0,2,2675,7.2292,,C 340 | 2,"Denbury, Mr. Herbert",male,25,0,0,C.A. 31029,31.5,,S 341 | 3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C 342 | 2,"Fillbrook, Mr. Joseph Charles",male,18,0,0,C.A. 15185,10.5,,S 343 | 3,"Lundstrom, Mr. Thure Edvin",male,32,0,0,350403,7.5792,,S 344 | 3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S 345 | 1,"Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)",female,58,0,1,PC 17755,512.3292,B51 B53 B55,C 346 | 3,"van Billiard, Master. James William",male,,1,1,A/5. 851,14.5,,S 347 | 3,"Abelseth, Miss. 
Karen Marie",female,16,0,0,348125,7.65,,S 348 | 2,"Botsford, Mr. William Hull",male,26,0,0,237670,13,,S 349 | 3,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38,0,0,2688,7.2292,,C 350 | 2,"Giles, Mr. Ralph",male,24,0,0,248726,13.5,,S 351 | 2,"Walcroft, Miss. Nellie",female,31,0,0,F.C.C. 13528,21,,S 352 | 1,"Greenfield, Mrs. Leo David (Blanche Strouse)",female,45,0,1,PC 17759,63.3583,D10 D12,C 353 | 2,"Stokes, Mr. Philip Joseph",male,25,0,0,F.C.C. 13540,10.5,,S 354 | 2,"Dibden, Mr. William",male,18,0,0,S.O.C. 14879,73.5,,S 355 | 2,"Herman, Mr. Samuel",male,49,1,2,220845,65,,S 356 | 3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.575,,S 357 | 1,"Julian, Mr. Henry Forbes",male,50,0,0,113044,26,E60,S 358 | 1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59,2,0,11769,51.4792,C101,S 359 | 3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S 360 | 3,"O'Keefe, Mr. Patrick",male,,0,0,368402,7.75,,Q 361 | 3,"Lindell, Mrs. Edvard Bengtsson (Elin Gerda Persson)",female,30,1,0,349910,15.55,,S 362 | 3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S 363 | 2,"Mallet, Mrs. Albert (Antoinette Magnin)",female,24,1,1,S.C./PARIS 2079,37.0042,,C 364 | 2,"Ware, Mrs. John James (Florence Louise Long)",female,31,0,0,CA 31352,21,,S 365 | 3,"Strilic, Mr. Ivan",male,27,0,0,315083,8.6625,,S 366 | 1,"Harder, Mrs. George Achilles (Dorothy Annan)",female,25,1,0,11765,55.4417,E50,C 367 | 3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S 368 | 3,"Caram, Mr. Joseph",male,,1,0,2689,14.4583,,C 369 | 3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22,0,0,3101295,39.6875,,S 370 | 1,"Gibson, Mrs. Leonard (Pauline C Boeson)",female,45,0,1,112378,59.4,,C 371 | 2,"Pallas y Castello, Mr. Emilio",male,29,0,0,SC/PARIS 2147,13.8583,,C 372 | 2,"Giles, Mr. Edgar",male,21,1,0,28133,11.5,,S 373 | 1,"Wilson, Miss. Helen Alice",female,31,0,0,16966,134.5,E39 E41,C 374 | 1,"Ismay, Mr. Joseph Bruce",male,49,0,0,112058,0,B52 B54 B56,S 375 | 2,"Harbeck, Mr. William H",male,44,0,0,248746,13,,S 376 | 1,"Dodge, Mrs. Washington (Ruth Vidaver)",female,54,1,1,33638,81.8583,A34,S 377 | 1,"Bowen, Miss. Grace Scott",female,45,0,0,PC 17608,262.375,,C 378 | 3,"Kink, Miss. Maria",female,22,2,0,315152,8.6625,,S 379 | 2,"Cotterill, Mr. Henry Harry""""",male,21,0,0,29107,11.5,,S 380 | 1,"Hipkins, Mr. William Edward",male,55,0,0,680,50,C39,S 381 | 3,"Asplund, Master. Carl Edgar",male,5,4,2,347077,31.3875,,S 382 | 3,"O'Connor, Mr. Patrick",male,,0,0,366713,7.75,,Q 383 | 3,"Foley, Mr. Joseph",male,26,0,0,330910,7.8792,,Q 384 | 3,"Risien, Mrs. Samuel (Emma)",female,,0,0,364498,14.5,,S 385 | 3,"McNamee, Mrs. Neal (Eileen O'Leary)",female,19,1,0,376566,16.1,,S 386 | 2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S 387 | 2,"Herman, Miss. Kate",female,24,1,2,220845,65,,S 388 | 3,"Aronsson, Mr. Ernst Axel Algot",male,24,0,0,349911,7.775,,S 389 | 2,"Ashby, Mr. John",male,57,0,0,244346,13,,S 390 | 3,"Canavan, Mr. Patrick",male,21,0,0,364858,7.75,,Q 391 | 3,"Palsson, Master. Paul Folke",male,6,3,1,349909,21.075,,S 392 | 1,"Payne, Mr. Vivian Ponsonby",male,23,0,0,12749,93.5,B24,S 393 | 1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51,0,1,PC 17592,39.4,D28,S 394 | 3,"Abbott, Master. Eugene Joseph",male,13,0,2,C.A. 2673,20.25,,S 395 | 2,"Gilbert, Mr. William",male,47,0,0,C.A. 30769,10.5,,S 396 | 3,"Kink-Heilmann, Mr. Anton",male,29,3,1,315153,22.025,,S 397 | 1,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18,1,0,13695,60,C31,S 398 | 3,"Colbert, Mr. 
Patrick",male,24,0,0,371109,7.25,,Q 399 | 1,"Frolicher-Stehli, Mrs. Maxmillian (Margaretha Emerentia Stehli)",female,48,1,1,13567,79.2,B41,C 400 | 3,"Larsson-Rondberg, Mr. Edvard A",male,22,0,0,347065,7.775,,S 401 | 3,"Conlon, Mr. Thomas Henry",male,31,0,0,21332,7.7333,,Q 402 | 1,"Bonnell, Miss. Caroline",female,30,0,0,36928,164.8667,C7,S 403 | 2,"Gale, Mr. Harry",male,38,1,0,28664,21,,S 404 | 1,"Gibson, Miss. Dorothy Winifred",female,22,0,1,112378,59.4,,C 405 | 1,"Carrau, Mr. Jose Pedro",male,17,0,0,113059,47.1,,S 406 | 1,"Frauenthal, Mr. Isaac Gerald",male,43,1,0,17765,27.7208,D40,C 407 | 2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20,0,0,SC/PARIS 2166,13.8625,D38,C 408 | 2,"Ware, Mr. William Jeffery",male,23,1,0,28666,10.5,,S 409 | 1,"Widener, Mr. George Dunton",male,50,1,1,113503,211.5,C80,C 410 | 3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q 411 | 3,"Peacock, Miss. Treasteall",female,3,1,1,SOTON/O.Q. 3101315,13.775,,S 412 | 3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q 413 | 1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37,1,0,19928,90,C78,Q 414 | 3,"Henriksson, Miss. Jenny Lovisa",female,28,0,0,347086,7.775,,S 415 | 3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S 416 | 1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9,C105,C 417 | 3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S 418 | 3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S 419 | 3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C 420 | -------------------------------------------------------------------------------- /titanic/data/test2.csv: -------------------------------------------------------------------------------- 1 | 3,0,34.5,7.8292 2 | 3,1,47,7 3 | 2,0,62,9.6875 4 | 3,0,27,8.6625 5 | 3,1,22,12.2875 6 | 3,0,14,9.225 7 | 3,1,30,7.6292 8 | 2,0,26,29 9 | 3,1,18,7.2292 10 | 3,0,21,24.15 11 | 3,0,-1,7.8958 12 | 1,0,46,26 13 | 1,1,23,82.2667 14 | 2,0,63,26 15 | 1,1,47,61.175 16 | 2,1,24,27.7208 17 | 2,0,35,12.35 18 | 3,0,21,7.225 19 | 3,1,27,7.925 20 | 3,1,45,7.225 21 | 1,0,55,59.4 22 | 3,0,9,3.1708 23 | 1,1,-1,31.6833 24 | 1,0,21,61.3792 25 | 1,1,48,262.375 26 | 3,0,50,14.5 27 | 1,1,22,61.9792 28 | 3,0,22.5,7.225 29 | 1,0,41,30.5 30 | 3,0,-1,21.6792 31 | 2,0,50,26 32 | 2,0,24,31.5 33 | 3,1,33,20.575 34 | 3,1,-1,23.45 35 | 1,0,30,57.75 36 | 3,0,18.5,7.2292 37 | 3,1,-1,8.05 38 | 3,1,21,8.6625 39 | 3,0,25,9.5 40 | 3,0,-1,56.4958 41 | 3,0,39,13.4167 42 | 1,0,-1,26.55 43 | 3,0,41,7.85 44 | 2,1,30,13 45 | 1,1,45,52.5542 46 | 3,0,25,7.925 47 | 1,0,45,29.7 48 | 3,0,-1,7.75 49 | 1,1,60,76.2917 50 | 3,1,36,15.9 51 | 1,0,24,60 52 | 2,0,27,15.0333 53 | 2,1,20,23 54 | 1,1,28,263 55 | 2,0,-1,15.5792 56 | 3,0,10,29.125 57 | 3,0,35,7.8958 58 | 3,0,25,7.65 59 | 3,0,-1,16.1 60 | 1,1,36,262.375 61 | 3,0,17,7.8958 62 | 2,0,32,13.5 63 | 3,0,18,7.75 64 | 3,1,22,7.725 65 | 1,0,13,262.375 66 | 2,1,-1,21 67 | 3,1,18,7.8792 68 | 1,0,47,42.4 69 | 1,0,31,28.5375 70 | 1,1,60,263 71 | 3,1,24,7.75 72 | 3,0,21,7.8958 73 | 3,1,29,7.925 74 | 1,0,28.5,27.7208 75 | 1,1,35,211.5 76 | 1,0,32.5,211.5 77 | 3,0,-1,8.05 78 | 1,1,55,25.7 79 | 2,0,30,13 80 | 3,1,24,7.75 81 | 3,0,6,15.2458 82 | 1,0,67,221.7792 83 | 1,0,49,26 84 | 3,0,-1,7.8958 85 | 2,0,-1,10.7083 86 | 3,0,-1,14.4542 87 | 3,1,27,7.8792 88 | 3,1,18,8.05 89 | 3,1,-1,7.75 90 | 2,0,2,23 91 | 3,1,22,13.9 92 | 3,0,-1,7.775 93 | 1,1,27,52 94 | 3,0,-1,8.05 95 | 1,0,25,26 96 | 3,0,25,7.7958 97 | 1,1,76,78.85 98 | 3,0,29,7.925 99 | 3,1,20,7.8542 100 | 3,0,33,8.05 101 | 1,1,43,55.4417 102 | 2,0,27,26 103 | 
3,0,-1,7.75 104 | 3,0,26,7.775 105 | 3,1,16,8.5167 106 | 3,0,28,22.525 107 | 3,0,21,7.8208 108 | 3,0,-1,7.75 109 | 3,0,-1,8.7125 110 | 2,0,18.5,13 111 | 2,0,41,15.0458 112 | 3,1,-1,7.7792 113 | 1,1,36,31.6792 114 | 3,1,18.5,7.2833 115 | 1,1,63,221.7792 116 | 3,0,18,14.4542 117 | 3,0,-1,6.4375 118 | 3,1,1,16.7 119 | 1,0,36,75.2417 120 | 2,1,29,26 121 | 2,1,12,15.75 122 | 3,0,-1,7.75 123 | 1,1,35,57.75 124 | 3,0,28,7.25 125 | 3,0,-1,7.75 126 | 3,1,17,16.1 127 | 3,0,22,7.7958 128 | 3,1,-1,23.25 129 | 2,0,42,13 130 | 3,0,24,8.05 131 | 3,0,32,8.05 132 | 1,0,53,28.5 133 | 3,1,-1,25.4667 134 | 3,0,-1,6.4375 135 | 3,0,43,7.8958 136 | 3,0,24,7.8542 137 | 3,0,26.5,7.225 138 | 2,0,26,13 139 | 3,1,23,8.05 140 | 3,0,40,46.9 141 | 3,1,10,46.9 142 | 1,1,33,151.55 143 | 1,0,61,262.375 144 | 2,0,28,26 145 | 1,0,42,26.55 146 | 3,0,31,18 147 | 1,0,-1,51.8625 148 | 3,0,22,8.05 149 | 1,0,-1,26.55 150 | 2,0,30,26 151 | 1,1,23,83.1583 152 | 3,0,-1,7.8958 153 | 3,0,60.5,35 154 | 3,1,36,12.1833 155 | 3,0,13,31.3875 156 | 3,0,24,7.55 157 | 1,1,29,221.7792 158 | 3,1,23,7.8542 159 | 1,0,42,26.55 160 | 3,1,26,13.775 161 | 3,1,-1,7.7333 162 | 3,0,7,15.2458 163 | 2,1,26,13.5 164 | 3,0,-1,7 165 | 2,0,41,13 166 | 3,1,26,22.025 167 | 1,0,48,50.4958 168 | 3,0,18,34.375 169 | 1,1,-1,27.7208 170 | 3,1,22,8.9625 171 | 3,0,-1,7.55 172 | 3,0,27,7.225 173 | 3,0,23,13.9 174 | 3,0,-1,7.2292 175 | 3,0,40,31.3875 176 | 2,1,15,39 177 | 2,1,20,36.75 178 | 1,0,54,55.4417 179 | 2,1,36,39 180 | 1,1,64,83.1583 181 | 2,0,30,13 182 | 1,0,37,83.1583 183 | 1,1,18,53.1 184 | 3,0,-1,7.75 185 | 1,1,27,247.5208 186 | 2,0,40,16 187 | 2,1,21,21 188 | 3,0,17,8.05 189 | 3,1,-1,69.55 190 | 2,0,40,13 191 | 2,0,34,26 192 | 1,0,-1,26 193 | 3,0,11.5,14.5 194 | 2,0,61,12.35 195 | 2,0,8,32.5 196 | 3,0,33,7.8542 197 | 1,0,6,134.5 198 | 3,1,18,7.775 199 | 2,0,23,10.5 200 | 3,1,-1,8.1125 201 | 3,1,-1,15.5 202 | 3,0,0.33,14.4 203 | 1,0,47,227.525 204 | 2,1,8,26 205 | 2,0,25,10.5 206 | 1,0,-1,25.7417 207 | 3,1,35,7.75 208 | 2,0,24,10.5 209 | 1,1,33,27.7208 210 | 3,0,25,7.8958 211 | 3,0,32,22.525 212 | 3,0,-1,7.05 213 | 2,0,17,73.5 214 | 2,1,60,26 215 | 3,1,38,7.775 216 | 1,0,42,42.5 217 | 3,1,-1,7.8792 218 | 1,0,57,164.8667 219 | 1,1,50,211.5 220 | 3,0,-1,8.05 221 | 2,1,30,13.8583 222 | 3,0,21,8.05 223 | 2,1,22,10.5 224 | 3,0,21,7.7958 225 | 1,1,53,27.4458 226 | 3,1,-1,15.2458 227 | 3,0,23,7.7958 228 | 3,1,-1,7.75 229 | 3,0,40.5,15.1 230 | 2,0,36,13 231 | 2,0,14,65 232 | 1,1,21,26.55 233 | 3,0,21,6.4958 234 | 3,0,-1,7.8792 235 | 1,0,39,71.2833 236 | 3,0,20,7.8542 237 | 1,0,64,75.25 238 | 3,0,20,7.225 239 | 2,1,18,13 240 | 1,1,48,106.425 241 | 1,1,55,27.7208 242 | 2,1,45,30 243 | 1,0,45,134.5 244 | 3,0,-1,7.8875 245 | 3,0,-1,23.45 246 | 1,0,41,51.8625 247 | 2,1,22,21 248 | 2,0,42,32.5 249 | 2,1,29,26 250 | 3,1,-1,14.4542 251 | 2,1,0.92,27.75 252 | 3,0,20,7.925 253 | 1,0,27,136.7792 254 | 3,0,24,9.325 255 | 3,0,32.5,9.5 256 | 3,0,-1,7.55 257 | 3,0,-1,7.75 258 | 3,0,28,8.05 259 | 2,1,19,13 260 | 3,0,21,7.775 261 | 3,0,36.5,17.4 262 | 3,0,21,7.8542 263 | 2,1,29,23 264 | 3,1,1,12.1833 265 | 2,0,30,12.7375 266 | 3,0,-1,7.8958 267 | 1,0,-1,0 268 | 3,0,-1,7.55 269 | 3,1,-1,8.05 270 | 3,0,17,8.6625 271 | 1,0,46,75.2417 272 | 3,0,-1,7.75 273 | 1,1,26,136.7792 274 | 3,1,-1,15.5 275 | 3,0,-1,7.225 276 | 2,1,20,26 277 | 2,0,28,10.5 278 | 2,0,40,26 279 | 2,0,30,21 280 | 2,0,22,10.5 281 | 3,1,23,8.6625 282 | 3,0,0.75,13.775 283 | 3,1,-1,7.75 284 | 3,1,9,15.2458 285 | 3,1,2,20.2125 286 | 3,0,36,7.25 287 | 3,0,-1,7.25 288 | 1,0,24,82.2667 289 | 3,0,-1,7.2292 290 | 3,0,-1,8.05 291 
| 1,0,-1,39.6 292 | 3,1,30,6.95 293 | 3,0,-1,7.2292 294 | 1,0,53,81.8583 295 | 3,0,36,9.5 296 | 3,0,26,7.8958 297 | 2,1,1,41.5792 298 | 3,0,-1,21.6792 299 | 1,0,30,45.5 300 | 3,0,29,7.8542 301 | 3,0,32,7.775 302 | 2,0,-1,15.0458 303 | 2,0,43,21 304 | 3,0,24,8.6625 305 | 3,1,-1,7.75 306 | 1,1,64,26.55 307 | 1,0,30,151.55 308 | 3,0,0.83,9.35 309 | 1,0,55,93.5 310 | 3,1,45,14.1083 311 | 3,0,18,8.6625 312 | 3,0,22,7.225 313 | 3,0,-1,7.575 314 | 3,1,37,7.75 315 | 1,1,55,135.6333 316 | 3,1,17,7.7333 317 | 1,0,57,146.5208 318 | 2,0,19,10.5 319 | 3,0,27,7.8542 320 | 2,0,22,31.5 321 | 3,0,26,7.775 322 | 3,0,25,7.2292 323 | 2,0,26,13 324 | 1,0,33,26.55 325 | 1,1,39,211.3375 326 | 3,0,23,7.05 327 | 2,1,12,39 328 | 1,0,46,79.2 329 | 2,0,29,26 330 | 2,0,21,13 331 | 2,1,48,36.75 332 | 1,0,39,29.7 333 | 3,0,-1,7.225 334 | 3,1,19,15.7417 335 | 3,0,27,7.8958 336 | 1,0,30,26 337 | 2,0,32,13 338 | 3,0,39,7.2292 339 | 2,0,25,31.5 340 | 3,0,-1,7.2292 341 | 2,0,18,10.5 342 | 3,0,32,7.5792 343 | 3,0,-1,69.55 344 | 1,1,58,512.3292 345 | 3,0,-1,14.5 346 | 3,1,16,7.65 347 | 2,0,26,13 348 | 3,1,38,7.2292 349 | 2,0,24,13.5 350 | 2,1,31,21 351 | 1,1,45,63.3583 352 | 2,0,25,10.5 353 | 2,0,18,73.5 354 | 2,0,49,65 355 | 3,1,0.17,20.575 356 | 1,0,50,26 357 | 1,1,59,51.4792 358 | 3,0,-1,7.8792 359 | 3,0,-1,7.75 360 | 3,1,30,15.55 361 | 3,0,14.5,69.55 362 | 2,1,24,37.0042 363 | 2,1,31,21 364 | 3,0,27,8.6625 365 | 1,1,25,55.4417 366 | 3,1,-1,69.55 367 | 3,0,-1,14.4583 368 | 3,1,22,39.6875 369 | 1,1,45,59.4 370 | 2,0,29,13.8583 371 | 2,0,21,11.5 372 | 1,1,31,134.5 373 | 1,0,49,0 374 | 2,0,44,13 375 | 1,1,54,81.8583 376 | 1,1,45,262.375 377 | 3,1,22,8.6625 378 | 2,0,21,11.5 379 | 1,0,55,50 380 | 3,0,5,31.3875 381 | 3,0,-1,7.75 382 | 3,0,26,7.8792 383 | 3,1,-1,14.5 384 | 3,1,19,16.1 385 | 2,0,-1,12.875 386 | 2,1,24,65 387 | 3,0,24,7.775 388 | 2,0,57,13 389 | 3,0,21,7.75 390 | 3,0,6,21.075 391 | 1,0,23,93.5 392 | 1,1,51,39.4 393 | 3,0,13,20.25 394 | 2,0,47,10.5 395 | 3,0,29,22.025 396 | 1,1,18,60 397 | 3,0,24,7.25 398 | 1,1,48,79.2 399 | 3,0,22,7.775 400 | 3,0,31,7.7333 401 | 1,1,30,164.8667 402 | 2,0,38,21 403 | 1,1,22,59.4 404 | 1,0,17,47.1 405 | 1,0,43,27.7208 406 | 2,0,20,13.8625 407 | 2,0,23,10.5 408 | 1,0,50,211.5 409 | 3,1,-1,7.7208 410 | 3,1,3,13.775 411 | 3,1,-1,7.75 412 | 1,1,37,90 413 | 3,1,28,7.775 414 | 3,0,-1,8.05 415 | 1,1,39,108.9 416 | 3,0,38.5,7.25 417 | 3,0,-1,8.05 418 | 3,0,-1,22.3583 419 | -------------------------------------------------------------------------------- /titanic/logistic_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from numpy import * 3 | from scipy.optimize import fmin_bfgs 4 | 5 | class LogisticRegression: 6 | """ An implementation of logistic regression. 
""" 7 | def __init__ (self, x, y, lambda_=0): 8 | self.x = x 9 | self.y = atleast_2d(y).transpose() 10 | self._lambda = lambda_ 11 | 12 | def _sigmoid(self, x): 13 | res = 1 / (1 + exp(-x)) 14 | idx = res == 1 15 | res[idx] = .99 16 | return res 17 | 18 | def _compute_cost(self, theta): 19 | """ Calculate the cost function: 20 | J = -1 / m * (y' * log(sigmoid(X * theta)) + (1 .- y') * log(1 .- sigmoid(X * theta))) 21 | J += lambda / (2 * m) * theta(2 : end)' * theta(2 : end) 22 | """ 23 | m = self.x.shape[0] 24 | x_bias = hstack((ones((m, 1)), self.x)) 25 | theta = atleast_2d(theta).transpose() 26 | J = -1.0 / m * (dot(self.y.transpose(), log(self._sigmoid(dot(x_bias, theta)))) 27 | + dot(1 - self.y.transpose(), log(1 - self._sigmoid(dot(x_bias, theta))))) 28 | J += self._lambda / (2 * m) * sum(theta[1 : :] ** 2) 29 | return J[0, 0] 30 | 31 | def _compute_grad(self, theta): 32 | """ Calculate the gradient of J: 33 | grad = 1 / m * (X' * (sigmoid(X * theta) - y)) 34 | grad(2 : end) += lambda / m * theta(2 : end) 35 | """ 36 | m = self.x.shape[0] 37 | x_bias = hstack((ones((m, 1)), self.x)) 38 | theta = atleast_2d(theta).transpose() 39 | grad = 1.0 / m * (dot(x_bias.transpose(), self._sigmoid(dot(x_bias, theta)) - self.y)) 40 | grad[1 : :] += self._lambda / m * theta[1 : :] 41 | return grad.ravel() 42 | 43 | def learn(self, max_iter=300): 44 | """ Train theta from the dataset, return value is a 1-D array. 45 | """ 46 | initial_theta = [0] * (self.x.shape[1] + 1) 47 | args_ = () 48 | theta = fmin_bfgs(f=self._compute_cost, x0=initial_theta, 49 | fprime=self._compute_grad, args=args_, maxiter=max_iter) 50 | self._theta = atleast_2d(theta).transpose() 51 | 52 | def predict(self, x): 53 | m = x.shape[0] 54 | x_bias = hstack((ones((m, 1)), x)) 55 | p = zeros((m, 1)) 56 | prob = self._sigmoid(dot(x_bias, self._theta)) 57 | idx = prob >= 0.5 58 | p[idx] = 1 59 | return p.ravel() 60 | 61 | if __name__ == '__main__': 62 | pass 63 | -------------------------------------------------------------------------------- /titanic/readme: -------------------------------------------------------------------------------- 1 | Kaggle - Titanic 2 | Problem link: https://www.kaggle.com/c/titanic-gettingStarted 3 | This is a logistic regression solution for Kaggle Titanic prediction, achieving accuracy about 83%. The code contains a implementation of logistic regression with L2 regularization. 4 | The data preprocessing part is left out, we construct the training and testing data with only four features: pclass, sex, age and fare. -------------------------------------------------------------------------------- /titanic/titanic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | from numpy import * 4 | from logistic_regression import LogisticRegression 5 | 6 | def map_feature(x): 7 | """ Add polynomial features to x in order to reduce high bias. 8 | """ 9 | m, n = x.shape 10 | out = x 11 | 12 | # Add quodratic features. 13 | for i in range(n): 14 | for j in range(i, n): 15 | out = hstack((out, x[:, i].reshape(m, 1) * x[:, j].reshape(m, 1))) 16 | 17 | # Add cubic features. 18 | for i in range(n): 19 | for j in range(i, n): 20 | for k in range(j, n): 21 | out = hstack( 22 | (out, x[:, i].reshape(m, 1) * x[:, j].reshape(m, 1) * x[:, k].reshape(m, 1))) 23 | return out 24 | 25 | def scale_data(x): 26 | """ Scale data with zero mean and unit variance. 
27 | """ 28 | mu = x.mean(axis=0) 29 | sigma = x.std(axis=0) 30 | x = (x - mu) / sigma 31 | return (x, mu, sigma) 32 | 33 | def read_data(): 34 | # Data in the file has been preprocessed by eliminating rows with missing values. 35 | csv_file_object = csv.reader(open('./data/data.csv', 'rb')) 36 | header = csv_file_object.next() 37 | x = [] 38 | for row in csv_file_object: 39 | x.append(row) 40 | return array(x, dtype=float64) 41 | 42 | if __name__ == '__main__': 43 | x = read_data() 44 | 45 | # Generates training set and cross validation set. 46 | y = x[:, 0] 47 | x = x[:, 1 : :] 48 | x = map_feature(x) 49 | num = int(x.shape[0] * .7) 50 | x_cv = x[num : :, :] 51 | y_cv = y[num : :] 52 | x = x[0 : num, :] 53 | y = y[0 : num] 54 | 55 | # Feature scaling. 56 | x, mu, sigma = scale_data(x) 57 | x_cv = (x_cv - mu) / sigma 58 | 59 | # Use cross validation set to find the best lambda for regularization. 60 | C_candidates = [0, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30] 61 | lambda_ = 0 62 | best_accuracy = 0 63 | for C in C_candidates: 64 | clf = LogisticRegression(x, y, C) 65 | clf.learn() 66 | p_cv = clf.predict(x_cv) 67 | accuracy = (p_cv == y_cv).mean() 68 | if accuracy > best_accuracy: 69 | best_accuracy = accuracy 70 | lambda_ = C 71 | print 'Best regularization parameter lambda: %f' % lambda_ 72 | 73 | clf = LogisticRegression(x, y, lambda_) 74 | clf.learn() 75 | p = clf.predict(x) 76 | p_cv = clf.predict(x_cv) 77 | print 'Accuracy in training set: %f'% (p == y).mean() 78 | print 'Accuracy in cv: %f' % (p_cv == y_cv).mean() 79 | --------------------------------------------------------------------------------