├── README.md
├── V1
│   ├── averaging_methods.py
│   ├── construct_liblinear_b1.py
│   ├── construct_liblinear_multi.py
│   ├── construct_maxlabel.py
│   ├── construct_maxprob.py
│   ├── construct_maxprob_balance.py
│   ├── construct_maxprob_multi.py
│   ├── construct_semilda.py
│   ├── construct_session_prob.py
│   ├── forest.py
│   ├── markov_all.py
│   ├── markov_sessoin_label.py
│   ├── metric_F1.py
│   ├── metric_confusion.py
│   ├── prepare1.py
│   ├── prepare2.py
│   ├── prepare3.py
│   ├── prepare_lda_test.py
│   ├── prepare_lda_train.py
│   ├── prepare_liblinear_1vsA.py
│   ├── prepare_session.py
│   ├── refine_train_by_sesson_query.py
│   ├── run_average.sh
│   ├── run_forest_dog.sh
│   ├── run_liblinear_multi.sh
│   ├── run_liblinear_pig.sh
│   ├── run_semilda_pig.sh
│   ├── run_xgboost3_dog.sh
│   ├── run_xgboost3_pig.sh
│   ├── run_xgboost3_pig2.sh
│   ├── run_xgboost3_session.sh
│   ├── split_train.py
│   ├── trans_session.py
│   ├── trans_train1.py
│   ├── trans_train2.py
│   └── xgboost3.conf
└── V2
    ├── construct_liblinear_b1.py
    ├── construct_maxprob.py
    ├── construct_maxprob_multi.py
    ├── construct_semilda.py
    ├── construct_session_prob.py
    ├── markov_sessoin_label.py
    ├── metric_F1.py
    ├── prepare_ensemble_cat.py
    ├── prepare_ensemble_dog.py
    ├── prepare_ensemble_pig.py
    ├── prepare_feature_dog1.py
    ├── prepare_feature_pig1.py
    ├── prepare_lda_test.py
    ├── prepare_lda_train.py
    ├── prepare_session.py
    ├── refine_train_by_sesson_query.py
    ├── run_all.sh
    ├── run_ensemble.sh
    ├── run_liblinear_dog.sh
    ├── run_liblinear_pig.sh
    ├── run_prepare.sh
    ├── run_semilda.sh
    ├── run_session_label.sh
    ├── run_xgboost3_dog.sh
    ├── run_xgboost3_pig.sh
    ├── split_train.py
    ├── trans_session.py
    ├── trans_train0.py
    ├── trans_train1.py
    ├── trans_train2.py
    └── xgboost3.conf
/README.md:
--------------------------------------------------------------------------------
Fancyspeed's solution for the CIKM2014 Cup (5th place).
===================================================

## Background

The task is query classification, or query intent detection.

About the competition, please visit http://cikm2014.fudan.edu.cn/index.php/Index/index and http://openresearch.baidu.com/topic/71.jspx


## Challenges

* Multi-class, multi-label targets
* Short text
* Click and session structure
* Unlabelled data
* Unbalanced classes

## Ideas for each challenge

* Structured labels (frequent label combinations are treated as single classes)
* N-grams, word positions, and the aggregated query as one sample
* In-session queries and labels, keyword and entity detection
* Semi-supervised learning
* Sampling, post-processing

## Features

* query words (1-gram, 2-gram, word position)
* clicked title words (1-gram, 2-gram)
* words of the top 30 titles in the query's sessions
* words of the top 3 labels in the query's sessions
* labels in the query's sessions
* query length
* query frequency
* average length of clicked titles
* average number of searches in the query's sessions
* average number of clicks in the query's sessions
* average number of duplicated clicks in the query's sessions

## Methods and tools

* GBM: XGBoost with the softmax objective
* SVC: Liblinear
* Multi-class LR: sklearn.MultiTaskLasso
* Random Forest: sklearn.RandomForestClassifier
* Labelled LDA: modified PLDA
* Markov chain: query-query similarity from text and session co-occurrence

## Ensembles

* weighted averaging (see the sketch at the end of this README)
* linear model
* cascading: feed base-model predictions back into xgboost

## Post-processing

* Calibration: shift per-class scores until the predicted label distribution matches the training set
* Threshold: pick the second-label cutoff so the average number of labels per query matches the training set

## How to run

* Dependencies:
  1. XGBoost for GBM: https://github.com/tqchen/xgboost
  2. Liblinear for LR and SVC: http://www.csie.ntu.edu.tw/~cjlin/liblinear/

* Assumptions:
  1. XGBoost's path is ../../tools/xgboost3/
  2. Liblinear's path is ../../tools/liblinear/
  3. raw training data is in ../raw_data
  4. three folders ../trans_data, ../dataset, ../submit exist for intermediate data

* Run:
  1. `cd V2`
  2. `sh -x run_all.sh`

* Steps:
  1. split train.txt into dog/valid parts (for offline tuning): split_train.py
  2. merge information for each query: trans_train*.py
  3. generate features: prepare_feature_*.py
  4. train and predict with xgboost: run_xgboost3_dog.sh
  5. train and predict with liblinear: run_liblinear_dog.sh
  6. ensemble: run_ensemble.sh
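
## Sketch: weighted averaging and the label threshold

A minimal illustration of how the ensemble averaging and the multi-label
threshold fit together (illustrative code, not the pipeline itself; the real
file handling lives in `averaging_methods.py` and the 0.45 cutoff in
`construct_maxprob_multi.py`; the function and variable names below are made
up for this example):

```python
def average_predictions(model_scores, model_weights):
    # weighted sum of per-class score lists, one list per base model
    n_class = len(model_scores[0])
    avg = [0.0] * n_class
    for scores, w in zip(model_scores, model_weights):
        for c, s in enumerate(scores):
            avg[c] += s * w
    return avg

def pick_labels(avg, id2label, cutoff=0.45):
    # always keep the best class; keep the runner-up only above the cutoff,
    # which is tuned so the average labels per query matches the training set
    ranked = sorted(enumerate(avg), key=lambda kv: -kv[1])
    labels = [id2label[ranked[0][0]]]
    if ranked[1][1] > cutoff:
        labels.append(id2label[ranked[1][0]])
    return ' | '.join(labels)

# toy example: two models with weights 0.7 / 0.3
scores = average_predictions([[0.2, 0.5, 0.3], [0.1, 0.6, 0.3]], [0.7, 0.3])
print pick_labels(scores, {0: 'CLASS=A', 1: 'CLASS=B', 2: 'CLASS=C'})
# -> CLASS=B (runner-up score 0.3 stays below the 0.45 cutoff)
```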
--------------------------------------------------------------------------------
/V1/averaging_methods.py:
--------------------------------------------------------------------------------
n_sample = 39013  # number of queries in ../raw_data/test.txt

# label <-> class-id maps written by the prepare scripts
label_map = {}
label_map2 = {}
max_label = 0
for line in open('../dataset/label_map'):
    label, c = line.strip().split('\t')
    label_map[int(c)] = label
    label_map2[label] = int(c)
    max_label = max(max_label, int(c))
print 'max_label:', max_label

# accumulated per-class ensemble scores, one row per test sample
weights = []
for i in xrange(n_sample):
    weights.append([0]*(max_label+1))

if __name__ == '__main__':
    import sys
    if len(sys.argv) < 4:
        print 'usage: out in1 m1 w1 [in2 m2 w2 ...]'
        exit(1)

    i = 2
    while i < len(sys.argv):
        in_i = sys.argv[i]            # prediction file
        m_i = sys.argv[i+1]           # model type
        w_i = float(sys.argv[i+2])    # ensemble weight
        print in_i, m_i, w_i
        if m_i == 'xgboost':
            # xgboost softmax dump: one probability per line, nclass lines per sample
            nclass = int(sys.argv[i+3])
            fin = open(in_i)
            for isample in xrange(n_sample):
                for ipred in xrange(nclass):
                    pred = float(fin.readline().strip())
                    if ipred <= max_label:
                        weights[isample][ipred] += pred * w_i
            fin.close()
            i += 4
        elif m_i == 'liblinear':
            # liblinear -b 1 output: a header line, then "label p0 p1 ..." per sample
            fin = open(in_i)
            fin.readline()
            for isample in xrange(n_sample):
                preds = [float(v) for v in fin.readline().strip().split(' ')[1:]]
                for ipred, pred in enumerate(preds):
                    weights[isample][ipred] += pred * w_i
            fin.close()
            i += 3
        elif m_i == 'semilda':
            # LDA topics use their own ids; map them back through label_map_lda
            lda_map = {}
            for line in open('../dataset/label_map_lda'):
                label, c = line.strip().split('\t')
                lda_map[int(c)] = label
            feat_map = {}
            fin = open('../trans_data/test.simple')
            for line in open(in_i):
                preds = [float(v) for v in line.strip().split(' ')]
                tot = sum(preds) + 0.001
                preds = [v/tot for v in preds]
                feats = fin.readline().strip().split('\t')[1]
                feat_map[feats] = preds
            fin.close()
            fin = open('../raw_data/test.txt')
            for isample in xrange(n_sample):
                feats = fin.readline().strip()
                preds = feat_map[feats]
                for ipred, pred in enumerate(preds):
                    ipred2 = label_map2[lda_map[ipred]]
                    # preds are already normalized above
                    weights[isample][ipred2] += pred * w_i
            fin.close()
            i += 3
        elif m_i == 'sessionlabel':
            # "label:count || label:count" lines from markov_sessoin_label.py
            fin = open(in_i)
            for isample in xrange(n_sample):
                pairs = [pair.split(':') for pair in fin.readline().strip().split(' || ')]
                if pairs and pairs[0] and pairs[0][0]:
                    tot = sum([float(v[1]) for v in pairs])
                    if tot > 0:
                        for pair in pairs:
                            label, v = pair[0], float(pair[1])
                            label = ' | '.join(sorted(label.split(' | ')))
                            if label not in label_map2:
                                label = label.split(' | ')[0]
                            c = label_map2[label]
                            weights[isample][c] += v / tot * w_i
            fin.close()
            i += 3

    with open(sys.argv[1], 'w') as fo:
        for preds in weights:
            for pred in preds:
                fo.write('%s\n' % pred)

--------------------------------------------------------------------------------
/V1/construct_liblinear_b1.py:
--------------------------------------------------------------------------------
 1 | 2 | label_map = {} 3 | max_label = 0 4 | def load_label_map(p_in): 5 | global max_label 6 | for line in open(p_in): 7 | label, c = line.strip().split('\t') 8 | label_map[int(c)] = label 9 | max_label = max(max_label, int(c)) 10 
| def get_match(p_pred, p_test, p_out): 12 | fo = open(p_out, 'w') 13 | fl = open(p_test) 14 | fin = open(p_pred) 15 | fin.readline() 16 | for line in fin: 17 | c = int(line.split(' ')[0]) 18 | #c = max(0, min(max_label, int(float(line.strip())+0.5))) 19 | label = label_map[c] 20 | feats = fl.readline().strip() 21 | fo.write('%s\t%s\n' % (feats, label)) 22 | fin.close() 23 | fl.close() 24 | fo.close() 25 | 26 | if __name__ == '__main__': 27 | import sys 28 | if len(sys.argv) != 5: 29 | print ' pred test label out' 30 | exit(1) 31 | 32 | load_label_map(sys.argv[3]) 33 | print 'max_label:', max_label 34 | get_match(sys.argv[1], sys.argv[2], sys.argv[4]) 35 | -------------------------------------------------------------------------------- /V1/construct_liblinear_multi.py: -------------------------------------------------------------------------------- 1 | 2 | label_map = {} 3 | max_label = 0 4 | def load_label_map(p_in): 5 | global max_label 6 | for line in open(p_in): 7 | label, c = line.strip().split('\t') 8 | label_map[int(c)] = label 9 | max_label = max(max_label, int(c)) 10 | 11 | def get_match(p_pred, p_test, p_out): 12 | fin = [] 13 | for i in range(7): 14 | fin.append( open('%s_%s.txt' % (p_pred, i)) ) 15 | fin[i].readline() 16 | fo = open(p_out, 'w') 17 | 18 | for line in open(p_test): 19 | feats = line.strip() 20 | 21 | preds = [] 22 | for i in range(7): 23 | #preds.append( (label_map[i], int(fin[i].readline().split(' ')[0]))) 24 | preds.append( (label_map[i], float(fin[i].readline().strip().split(' ')[2]))) 25 | labels = sorted(preds, key=lambda d:-d[1]) 26 | if labels[1][1] > 0.5: 27 | label = labels[0][0] + ' | ' + labels[1][0] 28 | else: 29 | label = labels[0][0] 30 | fo.write('%s\t%s\n' % (feats, label)) 31 | fo.close() 32 | for i in range(7): 33 | fin[i].close() 34 | 35 | if __name__ == '__main__': 36 | import sys 37 | if len(sys.argv) != 5: 38 | print ' pred test label out' 39 | exit(1) 40 | 41 | load_label_map(sys.argv[3]) 42 | print 'max_label:', max_label 43 | 44 | get_match(sys.argv[1], sys.argv[2], sys.argv[4]) 45 | -------------------------------------------------------------------------------- /V1/construct_maxlabel.py: -------------------------------------------------------------------------------- 1 | 2 | label_map = {} 3 | max_label = 0 4 | def load_label_map(p_in): 5 | global max_label 6 | for line in open(p_in): 7 | label, c = line.strip().split('\t') 8 | label_map[int(c)] = label 9 | max_label = max(max_label, int(c)) 10 | 11 | def get_match(p_pred, p_test, p_out): 12 | fo = open(p_out, 'w') 13 | fl = open(p_test) 14 | for line in open(p_pred): 15 | c = max(0, min(max_label, int(float(line.strip())+0.5))) 16 | label = label_map[c] 17 | feats = fl.readline().strip() 18 | fo.write('%s\t%s\n' % (feats, label)) 19 | fl.close() 20 | fo.close() 21 | 22 | if __name__ == '__main__': 23 | import sys 24 | if len(sys.argv) != 5: 25 | print ' pred test label out' 26 | exit(1) 27 | 28 | load_label_map(sys.argv[3]) 29 | print 'max_label:', max_label 30 | get_match(sys.argv[1], sys.argv[2], sys.argv[4]) 31 | -------------------------------------------------------------------------------- /V1/construct_maxprob.py: -------------------------------------------------------------------------------- 1 | 2 | label_map = {} 3 | max_label = 0 4 | adjust = {} 5 | def load_label_map(p_in): 6 | global max_label 7 | for line in open(p_in): 8 | label, c = line.strip().split('\t') 9 | label_map[int(c)] = label 10 | max_label = max(max_label, int(c)) 11 | 12 | for i in range(max_label+1): 13 | 
adjust[i] = 0 14 | 15 | def get_match(p_pred, p_test, p_out): 16 | npred = len(open(p_pred).readlines()) / len(open(p_test).readlines()) 17 | fo = open(p_out, 'w') 18 | fp = open(p_pred) 19 | for line in open(p_test): 20 | feats = line.strip() 21 | 22 | pred = [] 23 | for i in range(npred): 24 | if i <= max_label: 25 | pred.append(float(fp.readline().strip())) 26 | else: 27 | fp.readline() 28 | c = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0] 29 | label = label_map[c] 30 | fo.write('%s\t%s\n' % (feats, label)) 31 | fp.close() 32 | fo.close() 33 | 34 | if __name__ == '__main__': 35 | import sys 36 | if len(sys.argv) < 5: 37 | print ' pred test label out adjust' 38 | exit(1) 39 | 40 | load_label_map(sys.argv[3]) 41 | print 'max_label:', max_label 42 | if len(sys.argv) >= 6: 43 | for line in open(sys.argv[5]): 44 | cid, v = line.strip().split('\t') 45 | adjust[int(cid)] = float(v) 46 | get_match(sys.argv[1], sys.argv[2], sys.argv[4]) 47 | -------------------------------------------------------------------------------- /V1/construct_maxprob_balance.py: -------------------------------------------------------------------------------- 1 | 2 | label_map = {} 3 | label_map2 = {} 4 | max_label = 0 5 | adjust = {} 6 | def load_label_map(p_in): 7 | global max_label 8 | for line in open(p_in): 9 | label, c = line.strip().split('\t') 10 | label_map[int(c)] = label 11 | label_map2[label] = int(c) 12 | max_label = max(max_label, int(c)) 13 | print label_map 14 | print label_map2 15 | 16 | for c in range(max_label+1): 17 | adjust[c] = 0. 18 | 19 | truth_dict = {} 20 | def load_truth(p_truth): 21 | tot = 0. 22 | for line in open(p_truth): 23 | label = line.strip().split('\t')[1] 24 | label = ' | '.join(sorted(label.split(' | '))) 25 | if label not in label_map: label = label.split(' | ')[0] 26 | c = label_map2[label] 27 | truth_dict[c] = truth_dict.get(c, 0) + 1 28 | tot += 1. 
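    # the loop below turns the raw counts into the target label distribution;
    # learn() then nudges the per-class offsets in `adjust` until the
    # predicted distribution roughly matches it -- the "Calibration"
    # post-processing step from the README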
    for c in range(max_label+1):
        truth_dict[c] = truth_dict.get(c, 0) / tot
    print truth_dict

def learn(p_pred, npred):
    # parse the flat probability file into one score list per sample
    i = 0
    preds = []
    pred = []
    for line in open(p_pred):
        j = i % npred
        if j <= max_label:
            pred.append(float(line.strip()))
        if j == npred-1:
            preds.append(pred)
            pred = []
        i += 1
    ite = 0
    while ite < 20:
        # measure the predicted class shares under the current offsets
        cids = {}
        tot = len(preds)
        for pred in preds:
            c = sorted([(k, v+adjust[k]) for (k, v) in enumerate(pred)], key=lambda d:-d[1])[0][0]
            cids[c] = cids.get(c, 0) + 1./tot
        for c in cids:
            # larger steps for larger deviations; the tight bounds are
            # tested first so the big steps can actually fire
            if cids[c] < truth_dict[c] * 0.2: adjust[c] += 0.015
            elif cids[c] < truth_dict[c] * 0.8: adjust[c] += 0.005
            elif cids[c] > truth_dict[c] * 5: adjust[c] -= 0.015
            elif cids[c] > truth_dict[c] * 1.2: adjust[c] -= 0.005
        ite += 1


def get_match(p_pred, p_test, p_out):
    npred = len(open(p_pred).readlines()) / len(open(p_test).readlines())

    learn(p_pred, npred)

    fo = open(p_out, 'w')
    fp = open(p_pred)
    for line in open(p_test):
        feats = line.strip()

        pred = []
        for c in range(npred):
            if c <= max_label:
                pred.append(float(fp.readline().strip()))
            else:
                fp.readline()
        sort_list = sorted([(c, v+adjust[c]) for c, v in enumerate(pred)], key=lambda d:-d[1])
        c = sort_list[0][0]
        label = label_map[c]
        if sort_list[1][1] > 0.45:
            label = label + ' | ' + label_map[sort_list[1][0]]
            label = ' | '.join(label.split(' | ')[:2])
        fo.write('%s\t%s\n' % (feats, label))
    fp.close()
    fo.close()

if __name__ == '__main__':
    import sys
    if len(sys.argv) < 5:
        print 'usage: pred test label out [adjust]'
        exit(1)

    load_label_map(sys.argv[3])
    load_truth('../trans_data/valid.label')
    print 'max_label:', max_label
    if len(sys.argv) >= 6:
        for line in open(sys.argv[5]):
            cid, v = line.strip().split('\t')
            adjust[int(cid)] = float(v)
    get_match(sys.argv[1], sys.argv[2], sys.argv[4])

--------------------------------------------------------------------------------
/V1/construct_maxprob_multi.py:
--------------------------------------------------------------------------------

label_map = {}
max_label = 0
adjust = {}
def load_label_map(p_in):
    global max_label
    for line in open(p_in):
        label, c = line.strip().split('\t')
        label_map[int(c)] = label
        max_label = max(max_label, int(c))

    for i in range(max_label+1):
        adjust[i] = 0

def get_match(p_pred, p_test, p_out):
    # probability lines per sample = total prediction lines / test lines
    npred = len(open(p_pred).readlines()) / len(open(p_test).readlines())
    fo = open(p_out, 'w')
    fp = open(p_pred)
    for line in open(p_test):
        feats = line.strip()

        pred = []
        for i in range(npred):
            if i <= max_label:
                pred.append(float(fp.readline().strip()))
            else:
                fp.readline()
        sort_list = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])
        c = sort_list[0][0]
        label = label_map[c]
        if sort_list[1][1] > 0.45:
            label = label + ' | ' + label_map[sort_list[1][0]]
            label = ' | '.join(label.split(' | ')[:2])
        fo.write('%s\t%s\n' % (feats, label))
    fp.close()
    fo.close()

if __name__ == '__main__':
    import sys
    if len(sys.argv) < 5:
        print 'usage: pred test label out [adjust]'
        exit(1)

    load_label_map(sys.argv[3])
    print 'max_label:', max_label
    if len(sys.argv) >= 
6: 47 | for line in open(sys.argv[5]): 48 | cid, v = line.strip().split('\t') 49 | adjust[int(cid)] = float(v) 50 | get_match(sys.argv[1], sys.argv[2], sys.argv[4]) 51 | -------------------------------------------------------------------------------- /V1/construct_semilda.py: -------------------------------------------------------------------------------- 1 | 2 | label_map = {} 3 | max_label = 0 4 | def load_label_map(p_in): 5 | global max_label 6 | for line in open(p_in): 7 | label, c = line.strip().split('\t') 8 | label_map[int(c)] = label 9 | max_label = max(max_label, int(c)) 10 | 11 | def get_match(p_pred, p_test_simple, p_test, p_out): 12 | feat_map = {} 13 | fl = open(p_test_simple) 14 | fin = open(p_pred) 15 | for line in fin: 16 | pred = [float(v) for v in line.strip().split(' ')] 17 | tot = sum(pred) + 0.001 18 | pred = [v/tot for v in pred] 19 | c = sorted([(k, v) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0] 20 | label = label_map[c] 21 | feats = fl.readline().strip().split('\t')[1] 22 | feat_map[feats] = label 23 | fin.close() 24 | fl.close() 25 | fo = open(p_out, 'w') 26 | for line in open(p_test): 27 | feats = line.strip() 28 | fo.write('%s\t%s\n' % (feats, feat_map[feats])) 29 | fo.close() 30 | 31 | if __name__ == '__main__': 32 | import sys 33 | if len(sys.argv) != 6: 34 | print ' pred test.simple test label out' 35 | exit(1) 36 | 37 | load_label_map(sys.argv[4]) 38 | print 'max_label:', max_label 39 | get_match(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[5]) 40 | -------------------------------------------------------------------------------- /V1/construct_session_prob.py: -------------------------------------------------------------------------------- 1 | 2 | label_map = {} 3 | max_label = 0 4 | def load_label_map(p_in): 5 | global max_label 6 | for line in open(p_in): 7 | label, c = line.strip().split('\t') 8 | label_map[int(c)] = label 9 | max_label = max(max_label, int(c)) 10 | 11 | 12 | def get_match(p_pred, p_session, p_test, p_out): 13 | npred = len(open(p_pred).readlines()) / len(open(p_session).readlines()) 14 | query_dict = {} 15 | query_num = {} 16 | for line in open(p_test): 17 | query = line.strip() 18 | query_dict[query] = [0]*npred 19 | query_num[query] = 0 20 | 21 | fp = open(p_pred) 22 | for line in open(p_session): 23 | query_list = line.strip().split('\t')[1].split(';') 24 | 25 | pred = [] 26 | for i in range(npred): 27 | if i <= max_label: 28 | pred.append(float(fp.readline().strip())) 29 | else: 30 | fp.readline() 31 | for query in query_list: 32 | if query in query_dict: 33 | for i, v in enumerate(pred): 34 | query_dict[query][i] += v 35 | query_num[query] += 1 36 | fp.close() 37 | print 'query_dict', len(query_dict) 38 | 39 | not_in_test = 0 40 | fo = open(p_out, 'w') 41 | for line in open(p_test): 42 | query = line.strip() 43 | #c = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0] 44 | #label = label_map[c] 45 | if query_num[query] > 0: 46 | for i in range(npred): 47 | fo.write('%s\n' % (query_dict[query][i]/query_num[query])) 48 | else: 49 | for i in range(npred): 50 | fo.write('0\n') 51 | not_in_test += 1 52 | fo.close() 53 | print 'not in session:', not_in_test 54 | 55 | if __name__ == '__main__': 56 | import sys 57 | if len(sys.argv) < 6: 58 | print ' pred session testid label out' 59 | exit(1) 60 | 61 | load_label_map(sys.argv[4]) 62 | print 'max_label:', max_label 63 | 64 | get_match(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[5]) 65 | 
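
# Worked example (illustrative numbers): if a test query appears in two
# sessions whose session-level class probabilities are [0.2, 0.8] and
# [0.4, 0.6], get_match() writes the per-class averages [0.3, 0.7] for that
# query; a query that appears in no session gets all zeros and is counted
# in the "not in session" total printed at the end.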
--------------------------------------------------------------------------------
/V1/forest.py:
--------------------------------------------------------------------------------
import sys
# random forest from scikit-learn, features in svmlight/libsvm format
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_svmlight_file

if len(sys.argv) < 5:
    print 'usage: p_train p_test n_tree depth'
    exit(1)
p_train = sys.argv[1]
p_test = sys.argv[2]
n_tree = int(sys.argv[3])
depth = int(sys.argv[4])
print p_train, p_test, n_tree, depth

# load the sparse features; force the test matrix to the training width
X_train, y_train = load_svmlight_file(p_train)
X_train = X_train.toarray()
X_test, y_test = load_svmlight_file(p_test, n_features=X_train.shape[1])
X_test = X_test.toarray()

# fit a random forest on the training queries and predict a class id
# for every test query
forest = RandomForestClassifier(n_estimators=n_tree, criterion='gini', max_depth=depth, n_jobs=4)
forest = forest.fit(X_train, y_train)
output = forest.predict(X_test)

with open('pred_forest.txt', 'w') as fo:
    for o in output:
        fo.write('%s\n' % o)

--------------------------------------------------------------------------------
/V1/markov_all.py:
--------------------------------------------------------------------------------
import gc

p_train = '../raw_data/train.txt'
p_test = '../raw_data/test.txt'
p_dog = '../trans_data/dog.txt'
p_valid = '../trans_data/valid.txt'

def norm(a2b):
    # row-normalize a nested dict so each source's outgoing weights sum to 1
    gc.disable()
    for a in a2b:
        tot = sum([a2b[a][b] for b in a2b[a]])
        for b in a2b[a]: a2b[a][b] /= tot
    gc.enable()

def multiple(a2b, b2c):
    # sparse "matrix product" of two transition dicts: a -> b -> c
    gc.disable()
    a2c = {}
    for a in a2b:
        if a not in a2c: a2c[a] = {}
        for b in a2b[a]:
            if b in b2c:
                for c in b2c[b]:
                    if c not in a2c[a]: a2c[a][c] = 0
                    a2c[a][c] += a2b[a][b] * b2c[b][c]
    gc.enable()
    return a2c

class Converter(object):
    # interns strings as integer ids (and back) to save memory
    def __init__(self):
        self.max_idx = 0
        self.s_dict = {}
    def str2id(self, s):
        if s not in self.s_dict:
            self.s_dict[s] = self.max_idx
            self.s_dict[self.max_idx] = s
            self.max_idx += 1
        return self.s_dict[s]
    def id2str(self, i):
        return self.s_dict.get(i, '')

def build(p_in):
    convert = Converter()

    all_to_class = {}   # query/title id -> label counts
    query_to_all = {}   # query id -> co-occurring query/title counts

    print 'loading from', p_in

    session_query = set()
    session_all = set()
    for line in open(p_in):
        if not line.strip():
            # session boundary: record in-session co-occurrences
            for q in session_query:
                for q2 in session_all:
                    if q != q2:
                        if q not in query_to_all: query_to_all[q] = {}
                        query_to_all[q][q2] = query_to_all[q].get(q2, 0) + 1.
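            # e.g. a session with queries {q1, q2} and a clicked title t
            # yields counts q1->q2, q1->t, q2->q1, q2->t; norm() later turns
            # these counts into transition probabilities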
            session_query = set()
            session_all = set()
            continue

        try:
            labels, query, title = line.strip().split('\t')
        except:
            labels, query = line.strip().split('\t')
            title = '-'

        query = convert.str2id(query)
        session_query.add(query)
        session_all.add(query)
        if title and title != '-':
            title = convert.str2id('t_' + title)
            session_all.add(title)

        if labels != 'CLASS=TEST' and labels != 'CLASS=UNKNOWN':
            label_list = labels.split(' | ')
            for label in label_list:
                label = convert.str2id(label)
                if query not in all_to_class: all_to_class[query] = {}
                all_to_class[query][label] = all_to_class[query].get(label, 0) + 1.
                if title and title != '-':
                    if title not in all_to_class: all_to_class[title] = {}
                    all_to_class[title][label] = all_to_class[title].get(label, 0) + 1.
    print 'load finished'

    norm(all_to_class)
    norm(query_to_all)
    print 'normalize finished'

    return all_to_class, query_to_all, convert

def markov(p_in, p_query, p_out1, p_out2):
    all_to_class, query_to_all, convert = build(p_in)

    # two hops of the chain: query -> session neighbour -> class,
    # then one extra query hop for a smoother estimate
    query_to_class1 = multiple(query_to_all, all_to_class)
    print 'round 1 finished'
    query_to_class2 = multiple(query_to_all, query_to_class1)
    print 'round 2 finished'

    fo1 = open(p_out1, 'w')
    fo2 = open(p_out2, 'w')
    for line in open(p_query):
        qid = convert.str2id(line.strip())
        if qid not in query_to_class1:
            # query never seen in a session: emit empty predictions
            fo1.write('\n')
            fo2.write('\n')
            continue
        rs = ['%s:%s' % (convert.id2str(k), v) for k, v in query_to_class1[qid].items()]
        rs2 = ['%s:%s' % (convert.id2str(k), v) for k, v in query_to_class2.get(qid, {}).items()]
        fo1.write('%s\n' % (' || '.join(rs)))
        fo2.write('%s\n' % (' || '.join(rs2)))
    fo1.close()
    fo2.close()
    print 'write to file finished'

markov(p_train, p_test, 'pred_markov1', 'pred_markov2')
#markov(p_dog, p_valid, 'dog_pred1', 'dog_pred2')

--------------------------------------------------------------------------------
/V1/markov_sessoin_label.py:
--------------------------------------------------------------------------------
 1 | 2 | p_train = '../raw_data/train.txt' 3 | p_test = '../raw_data/test.txt' 4 | p_dog = '../trans_data/dog_refine.txt' 5 | p_dog = '../trans_data/dog.txt' 6 | p_valid = '../trans_data/valid.txt' 7 | 8 | def markov(p_in, p_query, p_out): 9 | test_label = {} 10 | unknown_label = {} 11 | test_unknown = {} 12 | 13 | label_query = {} 14 | unknown_query = {} 15 | test_query = {} 16 | session = [] 17 | for line in open(p_in): 18 | if not line.strip(): 19 | n_query = len(label_query) + len(unknown_query) + len(test_query) 20 | label_dict = {} 21 | for query, label in label_query.items(): 22 | label_dict[label] = label_dict.get(label, 0) + 1 23 | if len(label_dict) <= 1 or (len(label_dict) == 2 and (label_dict.keys()[0].find(label_dict.keys()[1])==0 or label_dict.keys()[1].find(label_dict.keys()[0])==0)): 24 | for query in test_query: 25 | if query not in test_label: 26 | test_label[query] = {} 27 | for query2, label in label_query.items(): 28 | test_label[query][label] = test_label[query].get(label, 0) + 1 29 | if query not in test_unknown: 30 | test_unknown[query] = {} 31 | for query2 in unknown_query: 32 | test_unknown[query][query2] = test_unknown[query].get(query2, 0) + 1 33 | for query in unknown_query: 34 | if query 
not in unknown_label: 35 | unknown_label[query] = {} 36 | for query2, label in label_query.items(): 37 | unknown_label[query][label] = unknown_label[query].get(label, 0) + 1 38 | else: 39 | #print session 40 | pass 41 | label_query = {} 42 | unknown_query = {} 43 | test_query = {} 44 | session = [] 45 | continue 46 | label, query = line.strip().split('\t')[:2] 47 | label = ' | '.join(sorted(label.split(' | '))) 48 | if not session or query != session[-1][1]: 49 | session.append( (label, query) ) 50 | if label=='CLASS=TEST': 51 | test_query[query] = 1 52 | elif label=='CLASS=UNKNOWN': 53 | if query.count(' ') > 1: 54 | unknown_query[query] = 1 55 | else: 56 | label_query[query] = label 57 | 58 | with open(p_out, 'w') as fo: 59 | for line in open(p_query): 60 | query = line.strip() 61 | if query in test_label and test_label[query]: 62 | s = ['%s:%s' % (k, v) for k, v in test_label[query].items()] 63 | fo.write('%s\n' % (' || '.join(s))) 64 | elif query in test_unknown: 65 | label_dict = {} 66 | for query2, v1 in test_unknown[query].items(): 67 | if query2 in unknown_label: 68 | for label, v2 in unknown_label[query2].items(): 69 | label_dict[label] = label_dict.get(label, 0) + v1*v2 70 | if label_dict: 71 | s = ['%s:%s' % (k, v) for k, v in label_dict.items()] 72 | fo.write('%s\n' % (' || '.join(s))) 73 | else: 74 | fo.write('\n') 75 | else: 76 | fo.write('\n') 77 | 78 | #markov(p_dog, p_valid, 'pred.txt') 79 | markov(p_train, p_test, 'pred_session_label.txt') 80 | -------------------------------------------------------------------------------- /V1/metric_F1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Evaluation metric for the CIKM CUP 2014 4 | F-score 5 | 6 | @author: Michael Liu 7 | Created: Thu July 22 2014 8 | """ 9 | 10 | import os 11 | import csv 12 | import math 13 | 14 | def create_solution_dictionary(solution): 15 | """ 16 | """ 17 | 18 | solnDict = {} 19 | with open(solution, 'rb') as f: 20 | for line in f: 21 | query, labels = line.strip().split('\t') 22 | label_list = labels.split(' | ') 23 | solnDict[query] = label_list 24 | return solnDict 25 | 26 | def check_submission(submission, solutionDict): 27 | """ 28 | """ 29 | 30 | submissionDict = {} 31 | with open(submission, 'rb') as f: 32 | for line in f: 33 | query, labels = line.strip('\n').split('\t') 34 | if query in submissionDict: 35 | print 'duplicate id in submission' 36 | return False 37 | if query not in solutionDict: 38 | print 'submission id must in solution' 39 | return False 40 | label_list = labels.split(' | ') 41 | submissionDict[query] = label_list 42 | 43 | if len(submissionDict) != len(solutionDict): 44 | print 'size of submission and solution must be the same' 45 | return False 46 | return submissionDict 47 | 48 | def F1_metric(solution, submission): 49 | """ 50 | """ 51 | 52 | solutionDict = create_solution_dictionary(solution) 53 | submissionDict = check_submission(submission, solutionDict) 54 | 55 | if submissionDict: 56 | true_positive = {} 57 | all_positive = {} 58 | groundtruth = {} 59 | 60 | for query in solutionDict: 61 | label_list = set(submissionDict[query]) 62 | truth_list = set(solutionDict[query]) 63 | for label in label_list: 64 | if label in truth_list: 65 | true_positive[label] = true_positive.get(label, 0) + 1. 
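                # all_positive (next line) counts every predicted label, so
                # per-label precision = TP / all_positive and per-label
                # recall = TP / groundtruth; these are macro-averaged into
                # ap and ar and combined as F1 = 2*ap*ar / (ap + ar) below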
66 | all_positive[label] = all_positive.get(label, 0) + 1 67 | for label in truth_list: 68 | groundtruth[label] = groundtruth.get(label, 0) + 1 69 | 70 | precision_list = [] 71 | recall_list = [] 72 | for label in groundtruth: 73 | precision = 0 74 | if label in all_positive: 75 | precision = true_positive.get(label, 0) / all_positive.get(label, 0) 76 | print label, 'precision', precision 77 | 78 | recall = true_positive[label] / groundtruth[label] 79 | print label, 'recall', recall 80 | 81 | precision_list.append(precision) 82 | recall_list.append(recall) 83 | 84 | ap = sum(precision_list) / len(recall_list) 85 | ar = sum(recall_list) / len(recall_list) 86 | F1 = 2*ap*ar / (ap + ar) 87 | print 'ap', ap 88 | print 'ar', ar 89 | print 'F1', F1 90 | 91 | if __name__ == "__main__": 92 | solutionFile = "" 93 | submissionFile = "" 94 | 95 | import sys 96 | if len(sys.argv) < 3: 97 | print ' solution submission' 98 | exit(-1) 99 | solutionFile = sys.argv[1] 100 | submissionFile = sys.argv[2] 101 | 102 | F1_metric(solutionFile, submissionFile) 103 | 104 | 105 | -------------------------------------------------------------------------------- /V1/metric_confusion.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Evaluation metric for the CIKM CUP 2014 4 | F-score 5 | 6 | @author: Michael Liu 7 | Created: Thu July 22 2014 8 | """ 9 | 10 | import os 11 | import csv 12 | import math 13 | 14 | def create_solution_dictionary(solution): 15 | """ 16 | """ 17 | 18 | solnDict = {} 19 | with open(solution, 'rb') as f: 20 | for line in f: 21 | query, labels = line.strip().split('\t') 22 | solnDict[query] = labels 23 | return solnDict 24 | 25 | def check_submission(submission, solutionDict): 26 | """ 27 | """ 28 | 29 | submissionDict = {} 30 | with open(submission, 'rb') as f: 31 | for line in f: 32 | query, labels = line.strip('\n').split('\t') 33 | if query in submissionDict: 34 | print 'duplicate id in submission' 35 | return False 36 | if query not in solutionDict: 37 | print 'submission id must in solution' 38 | return False 39 | submissionDict[query] = labels 40 | 41 | if len(submissionDict) != len(solutionDict): 42 | print 'size of submission and solution must be the same' 43 | return False 44 | return submissionDict 45 | 46 | def confusion(solution, submission): 47 | """ 48 | """ 49 | 50 | solutionDict = create_solution_dictionary(solution) 51 | submissionDict = check_submission(submission, solutionDict) 52 | 53 | if submissionDict: 54 | matrix = {} 55 | for query in solutionDict: 56 | label = submissionDict[query] 57 | truth = solutionDict[query] 58 | matrix[truth] = matrix.get(truth, {}) 59 | matrix[truth][label] = matrix[truth].get(label, 0) + 1 60 | 61 | confusion = [] 62 | for truth in matrix: 63 | label_list = sorted(matrix[truth].items(), key=lambda d:-d[1]) 64 | confusion.append( (truth, label_list) ) 65 | for truth, label_list in sorted(confusion, key=lambda d:-d[1][1][1] if len(d[1])>=2 else 0): 66 | label_list_str = ['%s:%s' % (k, v) for k, v in label_list] 67 | print truth, ' ==>> ', ' '.join(label_list_str) 68 | 69 | if __name__ == "__main__": 70 | solutionFile = "" 71 | submissionFile = "" 72 | 73 | import sys 74 | if len(sys.argv) < 3: 75 | print ' solution submission' 76 | exit(-1) 77 | solutionFile = sys.argv[1] 78 | submissionFile = sys.argv[2] 79 | 80 | confusion(solutionFile, submissionFile) 81 | 82 | 83 | -------------------------------------------------------------------------------- /V1/prepare1.py: 
-------------------------------------------------------------------------------- 1 | 2 | p_dog_train_feat = '../trans_data/dog.simple' 3 | p_dog_valid_feat = '../trans_data/valid.simple' 4 | p_dog_valid_id = '../trans_data/valid.txt' 5 | 6 | p_pig_train_feat = '../trans_data/train.simple' 7 | p_pig_valid_feat = '../trans_data/test.simple' 8 | p_pig_valid_id = '../raw_data/test.txt' 9 | 10 | p_label_map = '../dataset/label_map' 11 | p_dog_train = '../dataset/dog_train' 12 | p_dog_test = '../dataset/dog_test' 13 | p_pig_train = '../dataset/pig_train' 14 | p_pig_test = '../dataset/pig_test' 15 | 16 | min_word_df = 5 17 | min_title_df = 10 18 | 19 | label_map = {} 20 | cur_label = 0 21 | word_map = {} 22 | cur_word = 1 23 | 24 | pig_train = [] 25 | test_dict = {} 26 | pig_test = [] 27 | 28 | label_df = {} 29 | word_df = {} 30 | 31 | def get_df(p_in): 32 | for line in open(p_in): 33 | row = line.strip().split('\t') 34 | label = row[0] 35 | label = ' | '.join(sorted(label.split(' | '))) 36 | label_df[label] = label_df.get(label, 0) + 1 37 | query = row[1] 38 | titles = row[2] if len(row)>=3 else '' 39 | session_queries = row[5] if len(row)>=6 else '' 40 | session_titles = row[6] if len(row)>=7 else '' 41 | 42 | feat_list = query.split(' ') 43 | for i, word in enumerate(feat_list): 44 | #if not word: continue 45 | word_df[word] = word_df.get(word, 0) + 1 46 | if i>=1: 47 | word = ' '.join(feat_list[i-1:i+1]) 48 | word_df[word] = word_df.get(word, 0) + 1 49 | word = '%s_%s' % (i, feat_list[i]) 50 | word_df[word] = word_df.get(word, 0) + 1 51 | if i >= len(feat_list)/2: 52 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 53 | word_df[word] = word_df.get(word, 0) + 1 54 | 55 | for pair in titles.split(';'): 56 | if not pair: continue 57 | title, freq = pair.split(':') 58 | feat_list = title.split(' ') 59 | for i, word in enumerate(feat_list): 60 | #if not word: continue 61 | word = 't_' + word 62 | word_df[word] = word_df.get(word, 0) + 1 63 | if i>=1: 64 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 65 | word_df[word] = word_df.get(word, 0) + 1 66 | 67 | for pair in session_queries.split(';'): 68 | if not pair: continue 69 | title, freq = pair.split(':') 70 | feat_list = title.split(' ') 71 | for i, word in enumerate(feat_list): 72 | #if not word: continue 73 | word = 'sq_' + word 74 | word_df[word] = word_df.get(word, 0) + 1 75 | for pair in session_titles.split(';'): 76 | if not pair: continue 77 | title, freq = pair.split(':') 78 | feat_list = title.split(' ') 79 | for i, word in enumerate(feat_list): 80 | #if not word: continue 81 | word = 'st_' + word 82 | word_df[word] = word_df.get(word, 0) + 1 83 | if i>=1: 84 | word = 'st_' + ' '.join(feat_list[i-1:i+1]) 85 | word_df[word] = word_df.get(word, 0) + 1 86 | 87 | def prepare(p_in, p_out, isTrain, p_in2): 88 | global cur_label 89 | global cur_word 90 | 91 | if isTrain: fo = open(p_out, 'w') 92 | fin2 = open(p_in2) 93 | 94 | for line in open(p_in): 95 | row = line.strip().split('\t') 96 | label = row[0] 97 | query = row[1] 98 | titles = row[2] if len(row)>=3 else '' 99 | labels = row[3] if len(row)>=4 else '' 100 | session_queries = row[5] if len(row)>=6 else '' 101 | session_titles = row[6] if len(row)>=7 else '' 102 | 103 | row2 = fin2.readline().split('\t') 104 | if row2[0] == query: 105 | stats = row2[1] 106 | stats2 = row2[2].strip() 107 | else: 108 | print 'query mismatch' 109 | exit(1) 110 | 111 | if isTrain: 112 | label = ' | '.join(sorted(label.split(' | '))) 113 | if label_df[label] < 200: 114 | label = label.split(' ')[0] 115 | if 
label not in label_map: 116 | label_map[label] = cur_label 117 | cur_label += 1 118 | 119 | feat_list = query.split(' ') 120 | word_tf = {} 121 | for i, word in enumerate(feat_list): 122 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 123 | word_map[word] = cur_word 124 | cur_word += 1 125 | if word in word_map: 126 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 127 | if i>=1: 128 | word = ' '.join(feat_list[i-1:i+1]) 129 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 130 | word_map[word] = cur_word 131 | cur_word += 1 132 | if word in word_map: 133 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 134 | word = '%s_%s' % (i, feat_list[i]) 135 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 136 | word_map[word] = cur_word 137 | cur_word += 1 138 | if word in word_map: 139 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 140 | if i >= len(feat_list) / 2: 141 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 142 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 143 | word_map[word] = cur_word 144 | cur_word += 1 145 | if word in word_map: 146 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 147 | 148 | tot_freq = 0 149 | for pair in titles.split(';'): 150 | if not pair: continue 151 | title, freq = pair.split(':') 152 | tot_freq += float(freq) 153 | word_tf2 = {} 154 | for pair in titles.split(';'): 155 | if not pair: continue 156 | title, freq = pair.split(':') 157 | freq = float(freq) 158 | feat_list = title.split(' ') 159 | for i, word in enumerate(feat_list): 160 | word = 't_' + word 161 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 162 | word_map[word] = cur_word 163 | cur_word += 1 164 | if word in word_map: 165 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 166 | if word in word_map and word_map[word] in word_tf: 167 | word = 'qt_' + word 168 | if isTrain and word not in word_map: 169 | word_map[word] = cur_word 170 | cur_word += 1 171 | if word in word_map: 172 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1.*freq*1./tot_freq 173 | if i>=1: 174 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 175 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 176 | word_map[word] = cur_word 177 | cur_word += 1 178 | if word in word_map: 179 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 180 | 181 | tot_freq = 0 182 | for pair in session_queries.split(';'): 183 | if not pair: continue 184 | title, freq = pair.split(':') 185 | tot_freq += float(freq) 186 | for pair in session_queries.split(';'): 187 | if not pair: continue 188 | title, freq = pair.split(':') 189 | freq = float(freq) 190 | feat_list = title.split(' ') 191 | for i, word in enumerate(feat_list): 192 | word = 'sq_' + word 193 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 194 | word_map[word] = cur_word 195 | cur_word += 1 196 | if word in word_map: 197 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 198 | 199 | tot_freq = 0 200 | for pair in session_titles.split(';'): 201 | if not pair: continue 202 | title, freq = pair.split(':') 203 | tot_freq += float(freq) 204 | for pair in session_titles.split(';'): 205 | if not pair: continue 206 | title, freq = pair.split(':') 207 | freq = float(freq) 208 
| feat_list = title.split(' ') 209 | for i, word in enumerate(feat_list): 210 | word = 'st_' + word 211 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 212 | word_map[word] = cur_word 213 | cur_word += 1 214 | if word in word_map: 215 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 216 | if i>=1: 217 | word = 'st_' + ' '.join(feat_list[i-1:i+1]) 218 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 219 | word_map[word] = cur_word 220 | cur_word += 1 221 | if word in word_map: 222 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 223 | 224 | for pair in labels.split(';'): 225 | if not pair: continue 226 | word, freq = pair.split(':') 227 | freq = float(freq) 228 | if isTrain and word not in word_map: 229 | word_map[word] = cur_word 230 | cur_word += 1 231 | if word in word_map: 232 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 233 | 234 | for pair in stats.split(';') + stats2.split(';'): 235 | if not pair: continue 236 | word, freq = pair.split(':') 237 | freq = float(freq) 238 | if isTrain and word not in word_map: 239 | word_map[word] = cur_word 240 | cur_word += 1 241 | if word in word_map: 242 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 243 | 244 | if isTrain: 245 | label, F = (label_map[label], word_tf.items() + word_tf2.items()) 246 | F = sorted(F, key=lambda d:d[0]) 247 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 248 | fo.write('%s %s\n' % (label, f_str)) 249 | else: 250 | test_dict[query] = word_tf.items() + word_tf2.items() 251 | 252 | if isTrain: fo.close() 253 | fin2.close() 254 | 255 | # save label map 256 | with open(p_label_map, 'w') as fo: 257 | for label in label_map: 258 | fo.write('%s\t%s\n' % (label, label_map[label])) 259 | 260 | 261 | def save_test(p_in, p_out): 262 | with open(p_out, 'w') as fo: 263 | for line in open(p_in): 264 | query = line.strip() 265 | F = test_dict[query] 266 | F = sorted(F, key=lambda d:d[0]) 267 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 268 | fo.write('%s %s\n' % (0, f_str)) 269 | 270 | #for dog 271 | #get_df(p_dog_train_feat) 272 | #prepare(p_dog_train_feat, p_dog_train, True, '../trans_data/dog.simple2') 273 | #for pig 274 | get_df(p_pig_train_feat) 275 | prepare(p_pig_train_feat, p_pig_train, True, '../trans_data/train.simple2') 276 | 277 | #prepare(p_dog_valid_feat, '', False, '../trans_data/valid.simple2') 278 | prepare(p_pig_valid_feat, '', False, '../trans_data/test.simple2') 279 | #save_test(p_dog_valid_id, p_dog_test) 280 | save_test(p_pig_valid_id, p_pig_test) 281 | 282 | -------------------------------------------------------------------------------- /V1/prepare2.py: -------------------------------------------------------------------------------- 1 | 2 | p_dog_train_feat = '../trans_data/dog.simple' 3 | p_dog_valid_feat = '../trans_data/valid.simple' 4 | p_dog_valid_id = '../trans_data/valid.txt' 5 | 6 | p_pig_train_feat = '../trans_data/train.simple' 7 | p_pig_valid_feat = '../trans_data/test.simple' 8 | p_pig_valid_id = '../raw_data/test.txt' 9 | 10 | p_label_map = '../dataset/label_map' 11 | p_dog_train = '../dataset/dog_train' 12 | p_dog_test = '../dataset/dog_test' 13 | p_pig_train = '../dataset/pig_train2' 14 | p_pig_test = '../dataset/pig_test2' 15 | 16 | min_word_df = 5 17 | min_title_df = 10 18 | 19 | label_map = {} 20 | cur_label = 0 21 | word_map = {} 22 | cur_word = 1 23 | 24 | pig_train = [] 25 | test_dict = {} 26 | 
pig_test = [] 27 | 28 | label_df = {} 29 | word_df = {} 30 | 31 | def get_df(p_in): 32 | for line in open(p_in): 33 | row = line.strip().split('\t') 34 | label = row[0] 35 | label = ' | '.join(sorted(label.split(' | '))) 36 | label_df[label] = label_df.get(label, 0) + 1 37 | query = row[1] 38 | titles = row[2] if len(row)>=3 else '' 39 | session_queries = row[5] if len(row)>=6 else '' 40 | session_titles = row[6] if len(row)>=7 else '' 41 | 42 | feat_list = query.split(' ') 43 | for i, word in enumerate(feat_list): 44 | #if not word: continue 45 | word_df[word] = word_df.get(word, 0) + 1 46 | if i>=1: 47 | word = ' '.join(feat_list[i-1:i+1]) 48 | word_df[word] = word_df.get(word, 0) + 1 49 | word = '%s_%s' % (i, feat_list[i]) 50 | word_df[word] = word_df.get(word, 0) + 1 51 | if i >= len(feat_list)/2: 52 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 53 | word_df[word] = word_df.get(word, 0) + 1 54 | 55 | for pair in titles.split(';'): 56 | if not pair: continue 57 | title, freq = pair.split(':') 58 | feat_list = title.split(' ') 59 | for i, word in enumerate(feat_list): 60 | #if not word: continue 61 | word = 't_' + word 62 | word_df[word] = word_df.get(word, 0) + 1 63 | if i>=1: 64 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 65 | word_df[word] = word_df.get(word, 0) + 1 66 | 67 | for pair in session_queries.split(';'): 68 | if not pair: continue 69 | title, freq = pair.split(':') 70 | feat_list = title.split(' ') 71 | for i, word in enumerate(feat_list): 72 | #if not word: continue 73 | word = 'sq_' + word 74 | word_df[word] = word_df.get(word, 0) + 1 75 | for pair in session_titles.split(';'): 76 | if not pair: continue 77 | title, freq = pair.split(':') 78 | feat_list = title.split(' ') 79 | for i, word in enumerate(feat_list): 80 | #if not word: continue 81 | word = 'st_' + word 82 | word_df[word] = word_df.get(word, 0) + 1 83 | if i>=1: 84 | word = 'st_' + ' '.join(feat_list[i-1:i+1]) 85 | word_df[word] = word_df.get(word, 0) + 1 86 | 87 | def prepare(p_in, p_out, isTrain, p_in2): 88 | global cur_label 89 | global cur_word 90 | 91 | if isTrain: fo = open(p_out, 'w') 92 | fin2 = open(p_in2) 93 | 94 | for line in open(p_in): 95 | row = line.strip().split('\t') 96 | label = row[0] 97 | query = row[1] 98 | titles = row[2] if len(row)>=3 else '' 99 | labels = row[3] if len(row)>=4 else '' 100 | session_queries = row[5] if len(row)>=6 else '' 101 | session_titles = row[6] if len(row)>=7 else '' 102 | 103 | row2 = fin2.readline().split('\t') 104 | if row2[0] == query: 105 | stats = row2[1] 106 | stats2 = row2[2].strip() 107 | else: 108 | print 'query mismatch' 109 | exit(1) 110 | 111 | if isTrain: 112 | label = ' | '.join(sorted(label.split(' | '))) 113 | if label_df[label] < 200: 114 | label = label.split(' ')[0] 115 | if label not in label_map: 116 | label_map[label] = cur_label 117 | cur_label += 1 118 | 119 | feat_list = query.split(' ') 120 | word_tf = {} 121 | for i, word in enumerate(feat_list): 122 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 123 | word_map[word] = cur_word 124 | cur_word += 1 125 | if word in word_map: 126 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 127 | if i>=1: 128 | word = ' '.join(feat_list[i-1:i+1]) 129 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 130 | word_map[word] = cur_word 131 | cur_word += 1 132 | if word in word_map: 133 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 134 | word = '%s_%s' % (i, feat_list[i]) 135 | if isTrain 
and word_df[word] >= min_word_df and word not in word_map: 136 | word_map[word] = cur_word 137 | cur_word += 1 138 | if word in word_map: 139 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 140 | if i >= len(feat_list) / 2: 141 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 142 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 143 | word_map[word] = cur_word 144 | cur_word += 1 145 | if word in word_map: 146 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 147 | 148 | tot_freq = 0 149 | for pair in titles.split(';'): 150 | if not pair: continue 151 | title, freq = pair.split(':') 152 | tot_freq += float(freq) 153 | word_tf2 = {} 154 | for pair in titles.split(';'): 155 | if not pair: continue 156 | title, freq = pair.split(':') 157 | freq = float(freq) 158 | feat_list = title.split(' ') 159 | for i, word in enumerate(feat_list): 160 | #word = 't_' + word 161 | #if isTrain and word_df[word] >= min_title_df and word not in word_map: 162 | # word_map[word] = cur_word 163 | # cur_word += 1 164 | #if word in word_map: 165 | # word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 166 | if i>=1: 167 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 168 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 169 | word_map[word] = cur_word 170 | cur_word += 1 171 | if word in word_map: 172 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 173 | 174 | ''' 175 | tot_freq = 0 176 | for pair in session_queries.split(';'): 177 | if not pair: continue 178 | title, freq = pair.split(':') 179 | tot_freq += float(freq) 180 | for pair in session_queries.split(';'): 181 | if not pair: continue 182 | title, freq = pair.split(':') 183 | freq = float(freq) 184 | feat_list = title.split(' ') 185 | for i, word in enumerate(feat_list): 186 | word = 'sq_' + word 187 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 188 | word_map[word] = cur_word 189 | cur_word += 1 190 | if word in word_map: 191 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 192 | ''' 193 | 194 | tot_freq = 0 195 | for pair in session_titles.split(';'): 196 | if not pair: continue 197 | title, freq = pair.split(':') 198 | tot_freq += float(freq) 199 | for pair in session_titles.split(';'): 200 | if not pair: continue 201 | title, freq = pair.split(':') 202 | freq = float(freq) 203 | feat_list = title.split(' ') 204 | for i, word in enumerate(feat_list): 205 | #word = 'st_' + word 206 | #if isTrain and word_df[word] >= min_title_df and word not in word_map: 207 | # word_map[word] = cur_word 208 | # cur_word += 1 209 | #if word in word_map: 210 | # word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 211 | if i>=1: 212 | word = 'st_' + ' '.join(feat_list[i-1:i+1]) 213 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 214 | word_map[word] = cur_word 215 | cur_word += 1 216 | if word in word_map: 217 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 218 | 219 | for pair in labels.split(';'): 220 | if not pair: continue 221 | word, freq = pair.split(':') 222 | freq = float(freq) 223 | if isTrain and word not in word_map: 224 | word_map[word] = cur_word 225 | cur_word += 1 226 | if word in word_map: 227 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 228 | 229 | for pair in 
stats.split(';') + stats2.split(';'): 230 | if not pair: continue 231 | word, freq = pair.split(':') 232 | freq = float(freq) 233 | if isTrain and word not in word_map: 234 | word_map[word] = cur_word 235 | cur_word += 1 236 | if word in word_map: 237 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 238 | 239 | if isTrain: 240 | label, F = (label_map[label], word_tf.items() + word_tf2.items()) 241 | F = sorted(F, key=lambda d:d[0]) 242 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 243 | fo.write('%s %s\n' % (label, f_str)) 244 | else: 245 | test_dict[query] = word_tf.items() + word_tf2.items() 246 | 247 | if isTrain: fo.close() 248 | fin2.close() 249 | 250 | # save label map 251 | with open(p_label_map, 'w') as fo: 252 | for label in label_map: 253 | fo.write('%s\t%s\n' % (label, label_map[label])) 254 | 255 | 256 | def save_test(p_in, p_out): 257 | with open(p_out, 'w') as fo: 258 | for line in open(p_in): 259 | query = line.strip() 260 | F = test_dict[query] 261 | F = sorted(F, key=lambda d:d[0]) 262 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 263 | fo.write('%s %s\n' % (0, f_str)) 264 | 265 | #for dog 266 | #get_df(p_dog_train_feat) 267 | #prepare(p_dog_train_feat, p_dog_train, True, '../trans_data/dog.simple2') 268 | #for pig 269 | get_df(p_pig_train_feat) 270 | prepare(p_pig_train_feat, p_pig_train, True, '../trans_data/train.simple2') 271 | 272 | #prepare(p_dog_valid_feat, '', False, '../trans_data/valid.simple2') 273 | prepare(p_pig_valid_feat, '', False, '../trans_data/test.simple2') 274 | #save_test(p_dog_valid_id, p_dog_test) 275 | save_test(p_pig_valid_id, p_pig_test) 276 | 277 | -------------------------------------------------------------------------------- /V1/prepare3.py: -------------------------------------------------------------------------------- 1 | 2 | p_dog_train_feat = '../trans_data/dog.simple' 3 | p_dog_valid_feat = '../trans_data/valid.simple' 4 | p_dog_valid_id = '../trans_data/valid.txt' 5 | 6 | p_pig_train_feat = '../trans_data/train.simple' 7 | p_pig_valid_feat = '../trans_data/test.simple' 8 | p_pig_valid_id = '../raw_data/test.txt' 9 | 10 | p_label_map = '../dataset/label_map' 11 | p_dog_train = '../dataset/dog_train' 12 | p_dog_test = '../dataset/dog_test' 13 | p_pig_train = '../dataset/pig_train' 14 | p_pig_test = '../dataset/pig_test' 15 | 16 | min_word_df = 5 17 | min_title_df = 10 18 | 19 | label_map = {} 20 | cur_label = 0 21 | word_map = {} 22 | cur_word = 1 23 | 24 | pig_train = [] 25 | test_dict = {} 26 | pig_test = [] 27 | 28 | label_df = {} 29 | word_df = {} 30 | 31 | def get_df(p_in): 32 | for line in open(p_in): 33 | row = line.strip().split('\t') 34 | label = row[0] 35 | label = ' | '.join(sorted(label.split(' | '))) 36 | label_df[label] = label_df.get(label, 0) + 1 37 | query = row[1] 38 | titles = row[2] if len(row)>=3 else '' 39 | session_queries = row[5] if len(row)>=6 else '' 40 | session_titles = row[6] if len(row)>=7 else '' 41 | 42 | feat_list = query.split(' ') 43 | for i, word in enumerate(feat_list): 44 | #if not word: continue 45 | word_df[word] = word_df.get(word, 0) + 1 46 | if i>=1: 47 | word = ' '.join(feat_list[i-1:i+1]) 48 | word_df[word] = word_df.get(word, 0) + 1 49 | word = '%s_%s' % (i-1, ' '.join(feat_list[i-1:i+1])) 50 | word_df[word] = word_df.get(word, 0) + 1 51 | word = '%s_%s' % (i, feat_list[i]) 52 | word_df[word] = word_df.get(word, 0) + 1 53 | if i >= len(feat_list)/2: 54 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 55 | word_df[word] = word_df.get(word, 0) + 1 56 | 57 | 
for pair in titles.split(';'): 58 | if not pair: continue 59 | title, freq = pair.split(':') 60 | feat_list = title.split(' ') 61 | for i, word in enumerate(feat_list): 62 | #if not word: continue 63 | word = 't_' + word 64 | word_df[word] = word_df.get(word, 0) + 1 65 | if i>=1: 66 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 67 | word_df[word] = word_df.get(word, 0) + 1 68 | 69 | for pair in session_queries.split(';'): 70 | if not pair: continue 71 | title, freq = pair.split(':') 72 | feat_list = title.split(' ') 73 | for i, word in enumerate(feat_list): 74 | #if not word: continue 75 | word = 'sq_' + word 76 | word_df[word] = word_df.get(word, 0) + 1 77 | for pair in session_titles.split(';'): 78 | if not pair: continue 79 | title, freq = pair.split(':') 80 | feat_list = title.split(' ') 81 | for i, word in enumerate(feat_list): 82 | #if not word: continue 83 | word = 'st_' + word 84 | word_df[word] = word_df.get(word, 0) + 1 85 | 86 | def prepare(p_in, p_out, isTrain, p_in2): 87 | global cur_label 88 | global cur_word 89 | 90 | if isTrain: fo = open(p_out, 'w') 91 | fin2 = open(p_in2) 92 | 93 | for line in open(p_in): 94 | row = line.strip().split('\t') 95 | label = row[0] 96 | query = row[1] 97 | titles = row[2] if len(row)>=3 else '' 98 | labels = row[3] if len(row)>=4 else '' 99 | session_queries = row[5] if len(row)>=6 else '' 100 | session_titles = row[6] if len(row)>=7 else '' 101 | 102 | row2 = fin2.readline().split('\t') 103 | if row2[0] == query: 104 | stats = row2[1] 105 | stats2 = row2[2].strip() 106 | else: 107 | print 'query mismatch' 108 | exit(1) 109 | 110 | if isTrain: 111 | label = ' | '.join(sorted(label.split(' | '))) 112 | if label_df[label] < 200: 113 | label = label.split(' ')[0] 114 | if label not in label_map: 115 | label_map[label] = cur_label 116 | cur_label += 1 117 | 118 | feat_list = query.split(' ') 119 | word_tf = {} 120 | for i, word in enumerate(feat_list): 121 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 122 | word_map[word] = cur_word 123 | cur_word += 1 124 | if word in word_map: 125 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 126 | if i>=1: 127 | word = ' '.join(feat_list[i-1:i+1]) 128 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 129 | word_map[word] = cur_word 130 | cur_word += 1 131 | if word in word_map: 132 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 133 | word = '%s_%s' % (i-1, ' '.join(feat_list[i-1:i+1])) 134 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 135 | word_map[word] = cur_word 136 | cur_word += 1 137 | if word in word_map: 138 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 139 | word = '%s_%s' % (i, feat_list[i]) 140 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 141 | word_map[word] = cur_word 142 | cur_word += 1 143 | if word in word_map: 144 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 145 | if i >= len(feat_list) / 2: 146 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 147 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 148 | word_map[word] = cur_word 149 | cur_word += 1 150 | if word in word_map: 151 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 152 | 153 | tot_freq = 0 154 | for pair in titles.split(';'): 155 | if not pair: continue 156 | title, freq = pair.split(':') 157 | tot_freq += float(freq) 158 | word_tf2 = {} 159 | for pair 
in titles.split(';'): 160 | if not pair: continue 161 | title, freq = pair.split(':') 162 | freq = float(freq) 163 | feat_list = title.split(' ') 164 | for i, word in enumerate(feat_list): 165 | word = 't_' + word 166 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 167 | word_map[word] = cur_word 168 | cur_word += 1 169 | if word in word_map: 170 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 171 | if i>=1: 172 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 173 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 174 | word_map[word] = cur_word 175 | cur_word += 1 176 | if word in word_map: 177 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 178 | 179 | tot_freq = 0 180 | for pair in session_queries.split(';'): 181 | if not pair: continue 182 | title, freq = pair.split(':') 183 | tot_freq += float(freq) 184 | for pair in session_queries.split(';'): 185 | if not pair: continue 186 | title, freq = pair.split(':') 187 | freq = float(freq) 188 | feat_list = title.split(' ') 189 | for i, word in enumerate(feat_list): 190 | word = 'sq_' + word 191 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 192 | word_map[word] = cur_word 193 | cur_word += 1 194 | if word in word_map: 195 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 196 | 197 | tot_freq = 0 198 | for pair in session_titles.split(';'): 199 | if not pair: continue 200 | title, freq = pair.split(':') 201 | tot_freq += float(freq) 202 | for pair in session_titles.split(';'): 203 | if not pair: continue 204 | title, freq = pair.split(':') 205 | freq = float(freq) 206 | feat_list = title.split(' ') 207 | for i, word in enumerate(feat_list): 208 | word = 'st_' + word 209 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 210 | word_map[word] = cur_word 211 | cur_word += 1 212 | if word in word_map: 213 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 214 | 215 | for pair in labels.split(';'): 216 | if not pair: continue 217 | word, freq = pair.split(':') 218 | freq = float(freq) 219 | if isTrain and word not in word_map: 220 | word_map[word] = cur_word 221 | cur_word += 1 222 | if word in word_map: 223 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 224 | 225 | for pair in stats.split(';') + stats2.split(';'): 226 | if not pair: continue 227 | word, freq = pair.split(':') 228 | freq = float(freq) 229 | if isTrain and word not in word_map: 230 | word_map[word] = cur_word 231 | cur_word += 1 232 | if word in word_map: 233 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 234 | 235 | if isTrain: 236 | label, F = (label_map[label], word_tf.items() + word_tf2.items()) 237 | F = sorted(F, key=lambda d:d[0]) 238 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 239 | fo.write('%s %s\n' % (label, f_str)) 240 | else: 241 | test_dict[query] = word_tf.items() + word_tf2.items() 242 | 243 | if isTrain: fo.close() 244 | fin2.close() 245 | 246 | # save label map 247 | with open(p_label_map, 'w') as fo: 248 | for label in label_map: 249 | fo.write('%s\t%s\n' % (label, label_map[label])) 250 | 251 | 252 | def save_test(p_in, p_out): 253 | with open(p_out, 'w') as fo: 254 | for line in open(p_in): 255 | query = line.strip() 256 | F = test_dict[query] 257 | F = sorted(F, key=lambda d:d[0]) 258 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 259 
| fo.write('%s %s\n' % (0, f_str))
260 |
261 | #for dog
262 | #get_df(p_dog_train_feat)
263 | #prepare(p_dog_train_feat, p_dog_train, True, '../trans_data/dog.simple2')
264 | #for pig
265 | get_df(p_pig_train_feat)
266 | prepare(p_pig_train_feat, p_pig_train, True, '../trans_data/train.simple2')
267 |
268 | #prepare(p_dog_valid_feat, '', False, '../trans_data/valid.simple2')
269 | prepare(p_pig_valid_feat, '', False, '../trans_data/test.simple2')
270 | #save_test(p_dog_valid_id, p_dog_test)
271 | save_test(p_pig_valid_id, p_pig_test)
272 |
273 |
--------------------------------------------------------------------------------
/V1/prepare_lda_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # coding: utf-8
3 | # @author: zuotaoliu@126.com
4 | # @created: 2014-08-29
5 | import os
6 | import sys
7 | import re
8 |
9 | def do_word_index(p_in, p_out):
10 |     label_map = {}
11 |     cur_label = 0
12 |     word_count = {}
13 |     cur_idx = 0
14 |     fo = open(p_out, 'w')
15 |     for line in open(p_in):
16 |         row = line.rstrip().split('\t')
17 |
18 |         if len(row) >= 3:
19 |             feats = row[1] + ':1;' + ';'.join(row[2].split(';')[:5])
20 |         else:
21 |             feats = row[1] + ':1'
22 |         # labels are parsed for parity with prepare_lda_train.py; inference docs are written without the [cid] block
23 |         labels = row[0].split(' | ')
24 |         cids = []
25 |         for label in labels:
26 |             if label not in label_map:
27 |                 label_map[label] = cur_label
28 |                 cur_label += 1
29 |             cids.append(str(label_map[label]))
30 |         wc = {}
31 |         for pair in feats.split(';'):
32 |             if not pair: continue
33 |             words, freq = pair.split(':')
34 |             freq = min(1, int(freq))
35 |             for word in words.split(' '):
36 |                 if not word: continue
37 |                 wc[word] = wc.get(word, 0) + freq
38 |                 word_count[word] = word_count.get(word, 0) + freq
39 |         fo.write('%s\n' % (' '.join(['%s %s' % (k, v) for k, v in wc.items()])))
40 |     fo.close()
41 |
42 |     return word_count
43 |
44 | if __name__ == "__main__":
45 |     if len(sys.argv) < 3:
46 |         print 'usage: %s inputfile outputfile' % sys.argv[0]
47 |         exit(-1)
48 |     word_count = do_word_index(sys.argv[1], sys.argv[2])
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/V1/prepare_lda_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # coding: utf-8
3 | # @author: zuotaoliu@126.com
4 | # @created: 2014-08-29
5 | import os
6 | import sys
7 | import re
8 |
9 | def do_word_index(p_in, p_out):
10 |     label_map = {}
11 |     cur_label = 0
12 |     word_count = {}
13 |     cur_idx = 0
14 |     fo = open(p_out, 'w')
15 |     for line in open(p_in):
16 |         row = line.rstrip().split('\t')
17 |
18 |         if len(row) >= 3:
19 |             feats = row[1] + ':1;' + ';'.join(row[2].split(';')[:5])
20 |         else:
21 |             feats = row[1] + ':1'
22 |
23 |         labels = row[0].split(' | ')
24 |         cids = []
25 |         for label in labels:
26 |             if label not in label_map:
27 |                 label_map[label] = cur_label
28 |                 cur_label += 1
29 |             cids.append(str(label_map[label]))
30 |         wc = {}
31 |         for pair in feats.split(';'):
32 |             if not pair: continue
33 |             words, freq = pair.split(':')
34 |             freq = min(1, int(freq))
35 |             for word in words.split(' '):
36 |                 if not word: continue
37 |                 wc[word] = wc.get(word, 0) + freq
38 |                 word_count[word] = word_count.get(word, 0) + freq
39 |         fo.write('[%s] %s\n' % (' '.join(cids), ' '.join(['%s %s' % (k, v) for k, v in wc.items()])))
40 |     fo.close()
41 |     with open('../dataset/label_map_lda', 'w') as fo:
42 |         for label in label_map:
43 |             fo.write('%s\t%s\n' % (label, label_map[label]))
44 |
45 |     return word_count
46 |
47 | if __name__ == "__main__":
48 |     if len(sys.argv) < 4:
49 |         print 'usage: %s inputfile outputfile wordindex' % sys.argv[0]
50 |         exit(-1)
51 |     word_count = do_word_index(sys.argv[1], sys.argv[2])
52 |
53 |     sort_list = sorted(word_count.items(), key=lambda d:d[1], reverse=True)
54 |     with open(sys.argv[3], 'w') as fo:
55 |         for id, pair in enumerate(sort_list):
56 |             word, num = pair
57 |             if num >= 5:
58 |                 fo.write('%s %s\n' % (id, word))
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/V1/prepare_liblinear_1vsA.py:
--------------------------------------------------------------------------------
1 | p_simple = '../trans_data/train.simple'
2 | p_raw_train = '../dataset/pig_train'
3 | p_svm_train = '../dataset/svm_train'
4 |
5 | label_map = {}
6 | cur_label = 0
7 | for line in open('../dataset/label_map'):
8 |     label = line.split('\t')[0]
9 |     if label.count(' | ') == 0:
10 |         if label not in label_map:
11 |             label_map[label] = cur_label
12 |             cur_label += 1
13 |             print label
14 | with open('../dataset/label_map_svm', 'w') as fo:
15 |     for label in label_map:
16 |         fo.write('%s\t%s\n' % (label, label_map[label]))
17 |
18 | fout = []
19 | for c in range(7):
20 |     fout.append( open('%s%s' % (p_svm_train, c), 'w') )
21 |
22 | with open(p_simple) as fin:
23 |     for line in open(p_raw_train):
24 |         arr = line.split(' ')
25 |         labels = fin.readline().split('\t')[0].split(' | ')
26 |         c_dict = {label_map[label]:1 for label in labels}
27 |         for c in range(7):
28 |             if c in c_dict:
29 |                 fout[c].write('%s %s' % (1, ' '.join(arr[1:])))
30 |             else:
31 |                 fout[c].write('%s %s' % (0, ' '.join(arr[1:])))
32 |
33 |
34 | for c in range(7):
35 |     fout[c].close()
36 |
--------------------------------------------------------------------------------
/V1/prepare_session.py:
--------------------------------------------------------------------------------
1 |
2 | p_dog_train_feat = '../trans_data/dog.simple5'
3 | p_dog_valid_feat = '../trans_data/valid.simple5'
4 | p_dog_valid_id = '../trans_data/valid.txt'
5 |
6 | p_pig_train_feat = '../trans_data/pig.simple5'
7 | p_pig_valid_feat = '../trans_data/test.simple5'
8 | p_pig_valid_id = '../raw_data/test.txt'
9 |
10 | p_label_map = '../dataset/label_map_session'
11 | p_dog_train = '../dataset/dog_train_session'
12 | p_dog_test = '../dataset/dog_test_session'
13 | p_pig_train = '../dataset/pig_train_session'
14 | p_pig_test = '../dataset/pig_test_session'
15 |
16 | min_word_df = 5
17 | min_title_df = 10
18 |
19 | label_map = {}
20 | cur_label = 0
21 | word_map = {}
22 | cur_word = 1
23 |
24 | pig_train = []
25 | test_dict = {}
26 | pig_test = []
27 |
28 | label_df = {}
29 | word_df = {}
30 |
31 | def get_df(p_in):
32 |     for line in open(p_in):
33 |         row = line.strip().split('\t')
34 |         label = row[0]
35 |         label = ' | '.join(sorted(label.split(' | ')))
36 |         label_df[label] = label_df.get(label, 0) + 1
37 |         queries = row[1]
38 |         titles = row[2] if len(row)>=3 else ''
39 |
40 |         for query in queries.split(';'):
41 |             if not query: continue
42 |             feat_list = query.split(' ')
43 |             for i, word in enumerate(feat_list):
44 |                 if not word: continue
45 |                 word_df[word] = word_df.get(word, 0) + 1
46 |                 #if i>=1:
47 |                 #    word = ' '.join(feat_list[i-1:i+1])
48 |                 #    word_df[word] = word_df.get(word, 0) + 1
49 |                 #word = '%s_%s' % (i, feat_list[i])
50 |                 #word_df[word] = word_df.get(word, 0) + 1
51 |                 #if i >= len(feat_list)/2:
52 |                 #    word = '%s_%s' % (i-len(feat_list), feat_list[i])
53 |                 #    word_df[word] = word_df.get(word, 0) + 1
54 |
55 |         for title in titles.split(';'):
56 |             if not title: continue
57 |             feat_list = title.split(' ')
58 |             for 
i, word in enumerate(feat_list): 59 | if not word: continue 60 | word = 't_' + word 61 | word_df[word] = word_df.get(word, 0) + 1 62 | #if i>=1: 63 | # word = 't_' + ' '.join(feat_list[i-1:i+1]) 64 | # word_df[word] = word_df.get(word, 0) + 1 65 | 66 | 67 | def prepare(p_in, p_out, isTrain): 68 | global cur_label 69 | global cur_word 70 | 71 | fo = open(p_out, 'w') 72 | 73 | for line in open(p_in): 74 | row = line.strip().split('\t') 75 | label = row[0] 76 | queries = row[1] 77 | titles = row[2] if len(row)>=3 else '' 78 | 79 | if isTrain: 80 | label = ' | '.join(sorted(label.split(' | '))) 81 | if label_df[label] < 50: 82 | label = label.split(' ')[0] 83 | if label not in label_map: 84 | label_map[label] = cur_label 85 | cur_label += 1 86 | 87 | word_tf = {} 88 | query_list = queries.split(';') 89 | for query in query_list: 90 | if not query: continue 91 | feat_list = query.split(' ') 92 | for i, word in enumerate(feat_list): 93 | if not word: continue 94 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 95 | word_map[word] = cur_word 96 | cur_word += 1 97 | if word in word_map: 98 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)*1./len(query_list) 99 | #if i>=1: 100 | # word = ' '.join(feat_list[i-1:i+1]) 101 | # if isTrain and word_df[word] >= min_word_df and word not in word_map: 102 | # word_map[word] = cur_word 103 | # cur_word += 1 104 | # if word in word_map: 105 | # word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 106 | #word = '%s_%s' % (i, feat_list[i]) 107 | #if isTrain and word_df[word] >= min_word_df and word not in word_map: 108 | # word_map[word] = cur_word 109 | # cur_word += 1 110 | #if word in word_map: 111 | # word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 112 | #if i >= len(feat_list) / 2: 113 | # word = '%s_%s' % (i-len(feat_list), feat_list[i]) 114 | # if isTrain and word_df[word] >= min_word_df and word not in word_map: 115 | # word_map[word] = cur_word 116 | # cur_word += 1 117 | # if word in word_map: 118 | # word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 119 | 120 | word_tf2 = {} 121 | title_list = titles.split(';') 122 | for title in title_list: 123 | if not title: continue 124 | feat_list = title.split(' ') 125 | for i, word in enumerate(feat_list): 126 | word = 't_' + word 127 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 128 | word_map[word] = cur_word 129 | cur_word += 1 130 | if word in word_map: 131 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*1./len(title_list) 132 | #if i>=1: 133 | # word = 't_' + ' '.join(feat_list[i-1:i+1]) 134 | # if isTrain and word_df[word] >= min_title_df and word not in word_map: 135 | # word_map[word] = cur_word 136 | # cur_word += 1 137 | # if word in word_map: 138 | # word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 139 | 140 | 141 | if isTrain: 142 | label, F = (label_map[label], word_tf.items() + word_tf2.items()) 143 | F = sorted(F, key=lambda d:d[0]) 144 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 145 | fo.write('%s %s\n' % (label, f_str)) 146 | else: 147 | label, F = 0, word_tf.items() + word_tf2.items() 148 | F = sorted(F, key=lambda d:d[0]) 149 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 150 | fo.write('%s %s\n' % (label, f_str)) 151 | 152 | if isTrain: fo.close() 153 | 154 | # save label map 155 | with open(p_label_map, 'w') as fo: 156 | for label in 
label_map: 157 | fo.write('%s\t%s\n' % (label, label_map[label])) 158 | 159 | 160 | #for dog 161 | #get_df(p_dog_train_feat) 162 | #prepare(p_dog_train_feat, p_dog_train, True) 163 | #for pig 164 | get_df(p_pig_train_feat) 165 | prepare(p_pig_train_feat, p_pig_train, True) 166 | 167 | #prepare(p_dog_valid_feat, p_dog_test, False) 168 | prepare(p_pig_valid_feat, p_pig_test, False) 169 | 170 | -------------------------------------------------------------------------------- /V1/refine_train_by_sesson_query.py: -------------------------------------------------------------------------------- 1 | 2 | p_train = '../raw_data/train.txt' 3 | p_test = '../raw_data/test.txt' 4 | p_dog = '../trans_data/dog.txt' 5 | p_valid = '../trans_data/valid.txt' 6 | 7 | def refine(p_in, p_out): 8 | with open(p_out, 'w') as fo: 9 | last_query = None 10 | has_known = False 11 | session_lines = [] 12 | for line in open(p_in): 13 | if not line.strip(): 14 | fo.write('\n') 15 | if has_known: 16 | for l in session_lines: 17 | fo.write(l) 18 | last_query = None 19 | has_known = False 20 | session_lines = [] 21 | else: 22 | label, query = line.strip().split('\t')[:2] 23 | query_set = set(query.split(' ')) 24 | if not last_query or (last_query & query_set): 25 | session_lines.append(line) 26 | if label != 'CLASS=UNKNOWN': 27 | has_known = True 28 | last_query = query_set 29 | else: 30 | fo.write('\n') 31 | if has_known: 32 | for l in session_lines: 33 | fo.write(l) 34 | last_query = None 35 | has_known = False 36 | session_lines = [] 37 | 38 | refine(p_dog, '../trans_data/dog_refine.txt') 39 | refine(p_train, '../trans_data/train_refine.txt') 40 | 41 | 42 | -------------------------------------------------------------------------------- /V1/run_average.sh: -------------------------------------------------------------------------------- 1 | #python averaging_methods.py pred_average.txt pred_linear.txt liblinear 0.2 pred_xgboost.txt xgboost 0.8 18 2 | python averaging_methods.py pred_average.txt pred_xgboost.txt xgboost 0.65 18 pred_linear.txt liblinear 0.15 pred_semilda.txt semilda 0.0 pred_session2.txt xgboost 0.1 18 pred_session_label.txt sessionlabel 0.1 3 | 4 | python construct_maxprob.py pred_average.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_average.txt 5 | python construct_maxprob_multi.py pred_average.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_average2.txt 6 | #python construct_maxprob_balance.py pred_average.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_average3.txt 7 | -------------------------------------------------------------------------------- /V1/run_forest_dog.sh: -------------------------------------------------------------------------------- 1 | python forest.py ../dataset/pig_train ../dataset/pig_test 20 5 2 | 3 | python construct_maxlabel.py pred_forest.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_forest.txt 4 | -------------------------------------------------------------------------------- /V1/run_liblinear_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | TRAIN_BIN=../../tools/liblinear/train 3 | TEST_BIN=../../tools/liblinear/predict 4 | 5 | for ((i=0; i<=6; i++)); do 6 | $TRAIN_BIN -s 6 -c 10 -e 0.001 ../dataset/svm_train${i} svm.model 7 | $TEST_BIN -b 1 ../dataset/pig_test svm.model pred_linear_${i}.txt 8 | done 9 | 10 | #python construct_liblinear_b1.py pred_linear.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_linear.txt 11 | 12 | #$TEST_BIN 
../dataset/pig_test pig.model pred_linear.txt 13 | #python construct_maxlabel.py pred_linear.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_linear.txt 14 | 15 | 16 | -------------------------------------------------------------------------------- /V1/run_liblinear_pig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | TRAIN_BIN=../../tools/liblinear/train 3 | TEST_BIN=../../tools/liblinear/predict 4 | 5 | #$TRAIN_BIN -s 6 -c 10 -e 0.001 -w0 0.5 -w5 0.6 ../dataset/pig_train pig.model 6 | $TRAIN_BIN -s 6 -c 10 -e 0.001 ../dataset/pig_train pig.model 7 | 8 | $TEST_BIN -b 1 ../dataset/pig_test pig.model pred_linear.txt 9 | python construct_liblinear_b1.py pred_linear.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_linear.txt 10 | 11 | #$TEST_BIN ../dataset/pig_test pig.model pred_linear.txt 12 | #python construct_maxlabel.py pred_linear.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_linear.txt 13 | 14 | 15 | -------------------------------------------------------------------------------- /V1/run_semilda_pig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -x 2 | ldapath=../cpp_lda/src 3 | 4 | train_file=../trans_data/dog.simple 5 | test_file=../trans_data/test.simple 6 | 7 | ldatrain_file=../dataset/train_semilda.train 8 | ldatest_file=../dataset/test_semilda.test 9 | ldapred_file=pred_semilda.txt 10 | 11 | index_file=../dataset/word_index 12 | seed_file=lda_seed_words 13 | model_file=lda.model 14 | 15 | num_topic=7 16 | alpha=0.5 17 | beta=0.05 18 | 19 | python prepare_lda_train.py $train_file $ldatrain_file $index_file 20 | 21 | time /Users/zuotaoliu/install/mpich2/bin/mpiexec -n 4 $ldapath/mpi_slda \ 22 | --num_topics $num_topic \ 23 | --alpha $alpha --beta $beta \ 24 | --training_data_file $ldatrain_file \ 25 | --model_file $model_file \ 26 | --word_index_file $index_file \ 27 | --compute_likelihood true \ 28 | --burn_in_iterations 50 --total_iterations 120 29 | 30 | 31 | python prepare_lda_test.py $test_file $ldatest_file 32 | 33 | args="--alpha ${alpha} \ 34 | --beta ${beta} \ 35 | --inference_data_file ${ldatest_file} \ 36 | --inference_result_file ${ldapred_file} \ 37 | --model_file ${model_file} \ 38 | --burn_in_iterations 50 \ 39 | --total_iterations 120 \ 40 | --file_type 0 41 | " 42 | 43 | time $ldapath/infer $args 44 | 45 | python construct_semilda.py pred_semilda.txt $test_file ../raw_data/test.txt ../dataset/label_map_lda ../submit/predict_semilda.txt 46 | -------------------------------------------------------------------------------- /V1/run_xgboost3_dog.sh: -------------------------------------------------------------------------------- 1 | 2 | BIN=../../tools/xgboost3/xgboost 3 | 4 | $BIN ../boost/xgboost3.conf num_round=120 num_class=18 bst:max_depth=7 data=../dataset/dog_train eval[test]=../dataset/dog_train 5 | 6 | $BIN ../boost/xgboost3.conf task=pred num_class=18 model_in=0120.model test:data=../dataset/dog_test 7 | #cp pred.txt pred5.txt 8 | python ../boost/construct_maxprob.py pred.txt ../trans_data/valid.txt ../dataset/label_map ../submit/dog.txt 9 | python ../evaluate/metric_F1.py ../trans_data/valid.label ../submit/dog.txt 10 | python ../evaluate/metric_confusion.py ../trans_data/valid.label ../submit/dog.txt 11 | #python ../evaluate/construct_maxprob_multi.py pred.txt ../trans_data/valid.txt ../dataset/label_map ../submit/dog.txt 12 | #python ../evaluate/metric_F1.py ../trans_data/valid.label ../submit/dog.txt 13 
| 14 | $BIN ../boost/xgboost3.conf task=pred num_class=18 model_in=0120.model test:data=../dataset/pig_test 15 | #cp pred.txt pred5_2.txt 16 | python ../boost/construct_maxprob.py pred.txt ../raw_data/test.txt ../dataset/label_map ../submit/pig.txt 17 | python ../evaluate/construct_maxprob_multi.py pred.txt ../raw_data/test.txt ../dataset/label_map ../submit/pig2.txt 18 | -------------------------------------------------------------------------------- /V1/run_xgboost3_pig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BIN=../../tools/xgboost3/xgboost 3 | 4 | $BIN xgboost3.conf num_round=200 num_class=18 bst:max_depth=7 data=../dataset/pig_train eval[test]=../dataset/pig_train 5 | $BIN xgboost3.conf task=pred num_class=18 model_in=0200.model test:data=../dataset/pig_test 6 | mv pred.txt pred_xgboost.txt 7 | 8 | python construct_maxprob.py pred_xgboost.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_xgboost.txt 9 | python construct_maxprob_multi.py pred_xgboost.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_xgboost2.txt 10 | -------------------------------------------------------------------------------- /V1/run_xgboost3_pig2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BIN=../../tools/xgboost3/xgboost 3 | 4 | $BIN xgboost3.conf num_round=200 num_class=18 bst:max_depth=7 data=../dataset/pig_train2 eval[test]=../dataset/pig_train2 5 | $BIN xgboost3.conf task=pred num_class=18 model_in=0200.model test:data=../dataset/pig_test2 6 | mv pred.txt pred_xgboost.txt 7 | 8 | python construct_maxprob.py pred_xgboost.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_xgboost.txt 9 | python construct_maxprob_multi.py pred_xgboost.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_xgboost2.txt 10 | -------------------------------------------------------------------------------- /V1/run_xgboost3_session.sh: -------------------------------------------------------------------------------- 1 | 2 | BIN=../../tools/xgboost3/xgboost 3 | 4 | #$BIN xgboost3.conf num_round=90 num_class=18 bst:max_depth=7 data=../dataset/pig_train_session eval[test]=../dataset/pig_train_session 5 | #$BIN ../boost/xgboost3.conf task=pred num_class=18 model_in=0090.model test:data=../dataset/pig_test_session 6 | #mv pred.txt pred_session.txt 7 | #python construct_session_prob.py pred_session.txt ../trans_data/test.simple5 ../raw_data/test.txt ../dataset/label_map_session pred_session2.txt 8 | python construct_maxprob.py pred_session2.txt ../raw_data/test.txt ../dataset/label_map_session ../submit/predict_session.txt 9 | -------------------------------------------------------------------------------- /V1/split_train.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | p_train = '../raw_data/train.txt' 4 | p_test = '../raw_data/test.txt' 5 | 6 | p_dog = '../trans_data/dog.txt' 7 | p_valid = '../trans_data/valid.txt' 8 | p_label = '../trans_data/valid.label' 9 | 10 | n_fold = 3 11 | 12 | train_dict = {} 13 | test_dict = {} 14 | unknown_dict = {} 15 | 16 | for line in open(p_train): 17 | if not line.strip(): continue 18 | try: 19 | label, query, title = line.strip().split('\t') 20 | except: 21 | label, query = line.strip().split('\t') 22 | title = '-' 23 | 24 | if query not in train_dict: 25 | train_dict[query] = {} 26 | train_dict[query][label] = train_dict[query].get(label, 0) + 1 27 | if 
label.startswith('CLASS=TEST'): 28 | test_dict[query] = 1 29 | if label.startswith('CLASS=UNKNOWN'): 30 | unknown_dict[query] = 1 31 | 32 | valid_dict = {} 33 | fv2 = open('../trans_data/valid.txt', 'w') 34 | fv3 = open('../trans_data/valid.label', 'w') 35 | for query in train_dict: 36 | if query in test_dict: continue 37 | if query in unknown_dict: continue 38 | if random.randint(0, n_fold-1) == 1: 39 | valid_dict[query] = 1 40 | label = sorted(train_dict[query].items(), key=lambda d:-d[1])[0][0] 41 | fv2.write('%s\n' % query) 42 | fv3.write('%s\t%s\n' % (query, label)) 43 | fv2.close() 44 | fv3.close() 45 | 46 | fv1 = open('../trans_data/dog.txt', 'w') 47 | for line in open(p_train): 48 | if not line.strip(): 49 | fv1.write(line) 50 | continue 51 | 52 | try: 53 | label, query, title = line.strip().split('\t') 54 | except: 55 | label, query = line.strip().split('\t') 56 | title = '-' 57 | if query in test_dict: continue 58 | if query in train_dict: 59 | if query in valid_dict: 60 | label = 'CLASS=TEST' 61 | fv1.write('%s\t%s\t%s\n' % (label, query, title)) 62 | else: 63 | fv1.write(line) 64 | fv1.close() 65 | 66 | -------------------------------------------------------------------------------- /V1/trans_session.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | p_pig_train = '../raw_data/train.txt' 4 | p_dog_train = '../trans_data/dog.txt' 5 | p_pig_out = '../trans_data/pig.simple5' 6 | p_dog_out = '../trans_data/dog.simple5' 7 | p_pig_valid = '../trans_data/test.simple5' 8 | p_dog_valid = '../trans_data/valid.simple5' 9 | 10 | rates = {'CLASS=VIDEO' : 0.5} 11 | 12 | 13 | def stat(p_in, p_out): 14 | session = [set(), set()] 15 | labels = [] 16 | 17 | fo = open(p_out, 'w') 18 | tot_line = 0 19 | for line in open(p_in): 20 | if not line.strip(): 21 | session_label = '' 22 | session_flag = True 23 | positive_count = 0 24 | for key in labels: 25 | label, query = key.split('\t') 26 | if label.find('TEST')>=0 or label.find('KNOWN')>=0: 27 | if positive_count < 1: session_flag = False 28 | #elif positive_count == 1: positive_count = 0 29 | #else: positive_count = 0.5 30 | else: positive_count = 0 31 | elif session_flag: 32 | if not session_label: 33 | session_label = label 34 | positive_count += 1 35 | elif session_label != label: 36 | session_flag = False 37 | else: 38 | positive_count += 1 39 | if session[0] and session_label and session_flag and positive_count > 0: 40 | if session[1] and len(session[1])<=10 and len(session[0])<=5: 41 | rate = rates.get(session_label, 1.0) 42 | if random.random()<=rate: 43 | fo.write('%s\t%s\t%s\n' % (session_label, ';'.join(session[0]), ';'.join(session[1]))) 44 | session = [set(), set()] 45 | labels = [] 46 | continue 47 | 48 | try: 49 | label, query, title = line.strip().split('\t') 50 | except: 51 | label, query = line.strip().split('\t') 52 | title = '-' 53 | 54 | key = label + '\t' + query 55 | if not labels or labels[-1] != key: 56 | labels.append(key) 57 | session[0].add(query) 58 | if title and title!='-': 59 | session[1].add(title) 60 | 61 | tot_line += 1 62 | #if tot_line == 1000000: break 63 | fo.close() 64 | 65 | def valid(p_in, p_out): 66 | session = [set(), set()] 67 | has_test = False 68 | 69 | fo = open(p_out, 'w') 70 | tot_line = 0 71 | for line in open(p_in): 72 | if not line.strip(): 73 | if has_test and session[1] and len(session[1])<=10 and len(session[0])<=5: 74 | fo.write('%s\t%s\t%s\n' % (0, ';'.join(session[0]), ';'.join(session[1]))) 75 | session = [set(), set()] 76 | 
has_test = False 77 | continue 78 | try: 79 | label, query, title = line.strip().split('\t') 80 | except: 81 | label, query = line.strip().split('\t') 82 | title = '-' 83 | 84 | if label.find('TEST') >= 0: 85 | has_test = True 86 | session[0].add(query) 87 | if title and title!='-': 88 | session[1].add(title) 89 | 90 | tot_line += 1 91 | #if tot_line == 10000: break 92 | fo.close() 93 | 94 | stat(p_pig_train, p_pig_out) 95 | #stat(p_dog_train, p_dog_out) 96 | valid(p_pig_train, p_pig_valid) 97 | #valid(p_dog_train, p_dog_valid) 98 | -------------------------------------------------------------------------------- /V1/trans_train1.py: -------------------------------------------------------------------------------- 1 | 2 | p_train = '../trans_data/train_refine.txt' 3 | p_dog = '../trans_data/dog_refine.txt' 4 | #p_train = '../raw_data/train.txt' 5 | #p_dog = '../trans_data/dog.txt' 6 | p_test = '../raw_data/test.txt' 7 | p_valid = '../trans_data/valid.txt' 8 | 9 | s_train = '../trans_data/train.simple' 10 | s_test = '../trans_data/test.simple' 11 | s_dog = '../trans_data/dog.simple' 12 | s_valid = '../trans_data/valid.simple' 13 | 14 | def trans(in1, in2, out1, out2): 15 | train_dict = {} 16 | test_dict = {} 17 | 18 | # for session 19 | session_train_query = {} 20 | session_test_query = {} 21 | session_labels = {} 22 | session_click = {} 23 | 24 | 25 | for line in open(in1): 26 | if not line.strip(): 27 | #session end 28 | for query in session_train_query: 29 | #if len(session_labels) == 1: 30 | for q2 in session_train_query: 31 | if query != q2: 32 | label = session_train_query[q2] 33 | train_dict[query][2][label] = train_dict[query][2].get(label, 0) + 1 34 | train_dict[query][3][q2] = train_dict[query][3].get(q2, 0) + 1 35 | for title in session_click: 36 | train_dict[query][4][title] = train_dict[query][4].get(title, 0) + 1 37 | for query in session_test_query: 38 | #if len(session_labels) == 1: 39 | for q2 in session_train_query: 40 | if query != q2: 41 | label = session_train_query[q2] 42 | test_dict[query][2][label] = test_dict[query][2].get(label, 0) + 1 43 | test_dict[query][3][q2] = test_dict[query][3].get(q2, 0) + 1 44 | for title in session_click: 45 | test_dict[query][4][title] = test_dict[query][4].get(title, 0) + 1 46 | session_train_query = {} 47 | session_test_query = {} 48 | session_labels = {} 49 | session_click = {} 50 | continue 51 | 52 | try: 53 | label, query, title = line.strip().split('\t') 54 | except: 55 | label, query = line.strip().split('\t') 56 | title = '-' 57 | #label = ' | '.join(sorted(label.split(' | '))) 58 | 59 | if title and title != '-': 60 | session_click[title] = session_click.get(title, 0) + 1 61 | 62 | if label.startswith('CLASS=TEST'): 63 | if query not in test_dict: 64 | test_dict[query] = [label, {}, {}, {}, {}] 65 | if title and title != '-': 66 | test_dict[query][1][title] = test_dict[query][1].get(title, 0) + 1 67 | session_test_query[query] = 1 68 | elif not label.startswith('CLASS=UNKNOWN'): 69 | if query not in train_dict: 70 | train_dict[query] = [{}, {}, {}, {}, {}] 71 | train_dict[query][0][label] = train_dict[query][0].get(label, 0) + 1 72 | if title and title != '-': 73 | train_dict[query][1][title] = train_dict[query][1].get(title, 0) + 1 74 | session_labels[label] = 1 75 | session_train_query[query] = label.replace(' ', '') 76 | 77 | n_top_title = 30 78 | n_top_label = 3 79 | n_top_query = 10 80 | n_top_session_title = 30 81 | 82 | with open(out1, 'w') as ft: 83 | for query in train_dict: 84 | label = 
sorted(train_dict[query][0].items(), key=lambda d:-d[1])[0][0] 85 | 86 | titles = sorted(train_dict[query][1].items(), key=lambda d:-d[1]) 87 | title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:n_top_title+1]] 88 | 89 | labels = sorted(train_dict[query][2].items(), key=lambda d:-d[1]) 90 | label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]] 91 | tot_label = float(sum(train_dict[query][2].values())) 92 | label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]] 93 | 94 | queries = sorted(train_dict[query][3].items(), key=lambda d:-d[1]) 95 | query_pairs = [] 96 | query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]] 97 | 98 | stitles = sorted(train_dict[query][4].items(), key=lambda d:-d[1]) 99 | stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]] 100 | 101 | stat_pairs = [] 102 | 103 | ft.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs))) 104 | 105 | with open(out2, 'w') as fo: 106 | for query in test_dict: 107 | label = test_dict[query][0] 108 | 109 | titles = sorted(test_dict[query][1].items(), key=lambda d:-d[1]) 110 | title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:n_top_title+1]] 111 | 112 | labels = sorted(test_dict[query][2].items(), key=lambda d:-d[1]) 113 | label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]] 114 | tot_label = float(sum(test_dict[query][2].values())) 115 | label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]] 116 | 117 | queries = sorted(test_dict[query][3].items(), key=lambda d:-d[1]) 118 | query_pairs = [] 119 | query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]] 120 | 121 | stitles = sorted(test_dict[query][4].items(), key=lambda d:-d[1]) 122 | stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]] 123 | 124 | stat_pairs = [] 125 | 126 | fo.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs))) 127 | 128 | trans(p_train, p_test, s_train, s_test) 129 | trans(p_dog, p_valid, s_dog, s_valid) 130 | 131 | -------------------------------------------------------------------------------- /V1/trans_train2.py: -------------------------------------------------------------------------------- 1 | 2 | p_train = '../trans_data/train_refine.txt' 3 | p_dog = '../trans_data/dog_refine.txt' 4 | #p_train = '../raw_data/train.txt' 5 | #p_dog = '../trans_data/dog.txt' 6 | p_test = '../raw_data/test.txt' 7 | p_valid = '../trans_data/valid.txt' 8 | 9 | s_train = '../trans_data/train.simple2' 10 | s_test = '../trans_data/test.simple2' 11 | s_dog = '../trans_data/dog.simple2' 12 | s_valid = '../trans_data/valid.simple2' 13 | 14 | def trans(in1, in2, out1, out2): 15 | train_dict = {} 16 | test_dict = {} 17 | 18 | query_freq = {} 19 | query_titles = {} 20 | 21 | query_session = {} 22 | query_search = {} 23 | query_click = {} 24 | query_dupclick = {} 25 | 26 | query_session_search = {} 27 | query_session_click = {} 28 | query_session_dupclick = {} 29 | 30 | # for session 31 | session_train_query = {} 32 | session_test_query = {} 33 | session_labels = {} 34 | session_search = 0 35 | session_click = {} 36 | session_query_search = {} 37 | session_query_click = {} 38 | 39 | 40 | for line in open(in1): 41 | if not line.strip(): 42 | #session end 43 | session_dupclick = 0 
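            # Session boundary: fold this session's activity into per-query
            # aggregates. A title clicked more than once within one session
            # counts as a "duplicated click"; divided later by
            # query_session[query] (the number of sessions a query appears in),
            # these sums become the README's "average search / click /
            # duplicated-click times in query's same sessions" features.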
44 | for title in session_click: 45 | if session_click[title] > 1: session_dupclick += 1 46 | for query in session_train_query: 47 | query_session[query] = query_session.get(query, 0) + 1. 48 | query_session_search[query] = query_session_search.get(query, 0) + session_search 49 | query_session_click[query] = query_session_click.get(query, 0) + len(session_click) 50 | query_session_dupclick[query] = query_session_dupclick.get(query, 0) + session_dupclick 51 | query_search[query] = query_search.get(query, 0) + session_query_search.get(query, 0) 52 | query_click[query] = query_click.get(query, 0) + len(session_query_click.get(query, {})) 53 | session_query_dup = 0 54 | for title in session_query_click.get(query, {}): 55 | if session_query_click[query][title] > 1: session_query_dup += 1 56 | query_dupclick[query] = query_dupclick.get(query, 0) + session_query_dup 57 | for query in session_test_query: 58 | query_session[query] = query_session.get(query, 0) + 1. 59 | query_session_search[query] = query_session_search.get(query, 0) + session_search 60 | query_session_click[query] = query_session_click.get(query, 0) + len(session_click) 61 | query_session_dupclick[query] = query_session_dupclick.get(query, 0) + session_dupclick 62 | query_search[query] = query_search.get(query, 0) + session_query_search.get(query, 0) 63 | query_click[query] = query_click.get(query, 0) + len(session_query_click.get(query, {})) 64 | session_query_dup = 0 65 | for title in session_query_click.get(query, {}): 66 | if session_query_click[query][title] > 1: session_query_dup += 1 67 | query_dupclick[query] = query_dupclick.get(query, 0) + session_query_dup 68 | session_train_query = {} 69 | session_test_query = {} 70 | session_labels = {} 71 | session_search = 0 72 | session_click = {} 73 | session_query_search = {} 74 | session_query_click = {} 75 | continue 76 | 77 | try: 78 | label, query, title = line.strip().split('\t') 79 | except: 80 | label, query = line.strip().split('\t') 81 | title = '-' 82 | #label = ' | '.join(sorted(label.split(' | '))) 83 | 84 | query_freq[query] = query_freq.get(query, 0) + 1 85 | if title and title != '-': 86 | if query not in query_titles: query_titles[query] = [0., 0.] 
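            # query_titles[query] accumulates [number of clicked titles,
            # total words across clicked titles]; their ratio is emitted below
            # as the "average length of clicked titles" feature, but only once
            # a query has at least 3 clicked titles.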
87 | query_titles[query][0] += 1 88 | query_titles[query][1] += len(title.split(' ')) 89 | session_click[title] = session_click.get(title, 0) + 1 90 | if query not in session_query_click: session_query_click[query] = {} 91 | session_query_click[query][title] = session_query_click[query].get(title, 0) + 1 92 | else: 93 | session_search += 1 94 | if query not in session_query_search: session_query_search[query] = 0 95 | session_query_search[query] += 1 96 | 97 | if label.startswith('CLASS=TEST'): 98 | if query not in test_dict: 99 | test_dict[query] = [label, {}, {}, {}] 100 | session_test_query[query] = 1 101 | elif not label.startswith('CLASS=UNKNOWN'): 102 | if query not in train_dict: 103 | train_dict[query] = [{}, {}, {}, {}] 104 | session_train_query[query] = label.replace(' ', '') 105 | 106 | 107 | with open(out1, 'w') as ft: 108 | for query in train_dict: 109 | stat_pairs = [] 110 | stat_pairs.append( '%s:%s' % ('query_len', len(query.split(' '))) ) 111 | stat_pairs.append( '%s:%s' % ('query_freq', query_freq[query]) ) 112 | if query_titles.get(query, [0, 0])[0] >= 3: 113 | stat_pairs.append( '%s:%s' % ('title_len', query_titles[query][1]/query_titles[query][0]) ) 114 | 115 | stat_pairs2 = [] 116 | if query_session.get(query, 0) >= 5: 117 | stat_pairs2.append( '%s:%s' % ('query_search', query_search[query]/query_session[query]) ) 118 | stat_pairs2.append( '%s:%s' % ('query_click', query_click[query]/query_session[query]) ) 119 | stat_pairs2.append( '%s:%s' % ('query_dupclick', query_dupclick[query]/query_session[query]) ) 120 | stat_pairs2.append( '%s:%s' % ('query_session_search', query_session_search[query]/query_session[query]) ) 121 | stat_pairs2.append( '%s:%s' % ('query_session_click', query_session_click[query]/query_session[query]) ) 122 | stat_pairs2.append( '%s:%s' % ('query_session_dupclick', query_session_dupclick[query]/query_session[query]) ) 123 | 124 | ft.write('%s\t%s\t%s\n' % (query, ';'.join(stat_pairs), ';'.join(stat_pairs2))) 125 | 126 | with open(out2, 'w') as fo: 127 | for query in test_dict: 128 | stat_pairs = [] 129 | stat_pairs.append( '%s:%s' % ('query_len', len(query.split(' '))) ) 130 | stat_pairs.append( '%s:%s' % ('query_freq', query_freq[query]) ) 131 | if query_titles.get(query, [0, 0])[0] >= 3: 132 | stat_pairs.append( '%s:%s' % ('title_len', query_titles[query][1]/query_titles[query][0]) ) 133 | 134 | stat_pairs2 = [] 135 | if query_session.get(query, 0) >= 5: 136 | stat_pairs2.append( '%s:%s' % ('query_search', query_search[query]/query_session[query]) ) 137 | stat_pairs2.append( '%s:%s' % ('query_click', query_click[query]/query_session[query]) ) 138 | stat_pairs2.append( '%s:%s' % ('query_dupclick', query_dupclick[query]/query_session[query]) ) 139 | stat_pairs2.append( '%s:%s' % ('query_session_search', query_session_search[query]/query_session[query]) ) 140 | stat_pairs2.append( '%s:%s' % ('query_session_click', query_session_click[query]/query_session[query]) ) 141 | stat_pairs2.append( '%s:%s' % ('query_session_dupclick', query_session_dupclick[query]/query_session[query]) ) 142 | 143 | fo.write('%s\t%s\t%s\n' % (query, ';'.join(stat_pairs), ';'.join(stat_pairs2))) 144 | 145 | trans(p_train, p_test, s_train, s_test) 146 | trans(p_dog, p_valid, s_dog, s_valid) 147 | 148 | -------------------------------------------------------------------------------- /V1/xgboost3.conf: -------------------------------------------------------------------------------- 1 | ### General Parameters, see comment for each definition 2 | # choose the tree booster, 0: 
tree, 1: linear
3 | booster_type = 0
4 | # this is the only difference with classification, use 0: linear regression
5 | # when labels are in [0,1] we can also use 1: logistic regression
6 | loss_type = 0
7 | # evaluation metrics for validation data
8 | eval_metric=merror
9 | #eval_metric=error
10 | #eval_metric=auc
11 | #eval_metric=map
12 | #eval_metric=rmse
13 | #eval_metric=ndcg
14 | # silent=1 means no running messages are printed
15 | #silent = 1
16 | # do not use binary buffer
17 | use_buffer = 0
18 | nthread = 8
19 | #num_class = 18
20 |
21 | ### Tree Booster Parameters
22 | # step size shrinkage used in update to prevent overfitting.
23 | bst:eta = 0.1
24 | #bst:eta = 0.05
25 | # minimum loss reduction required to make a further partition. larger -> more conservative
26 | bst:gamma = 1.0
27 | # minimum sum of instance weight(hessian) needed in a child. larger -> more conservative
28 | bst:min_child_weight = 1.0
29 | # maximum depth of a tree
30 | #bst:max_depth = 5
31 | # constructing method to build a tree, 0: svdfeature, 1: column major expansion, 2: row major expansion
32 | #bst:tree_maker = 1
33 |
34 | ### Linear Booster Parameters
35 | # L2 regularization term on weights
36 | bst:lambda = 0
37 | # L1 regularization term on weights
38 | bst:alpha = 0
39 | # L2 regularization term on bias
40 | bst:lambda_bias = 0
41 |
42 | ### Task parameters
43 | # specify the learning task and the corresponding learning objective
44 | #objective = multi:softmax
45 | objective = multi:softprob
46 | #objective = reg:linear
47 | #objective = reg:linear
48 | #objective = reg:logistic
49 | #objective = binary:logistic
50 | #objective = binary:logitraw
51 | # the number of rounds to do boosting
52 | num_round = 10
53 | # 0 means do not save any model except the final round model
54 | save_period = 0
55 | # the initial prediction score
56 | base_score = 0.0
57 | # feature map
58 | #fmap = "../dataset/feat_map.txt"
59 | # name for dump model
60 | name_dump = "dump.nice.txt"
61 | # The path of training data
62 | #data = "../dataset/train_dog.svm"
63 | # The path of validation data, used to monitor training process, here [test] sets name of the validation set
64 | #eval[test] = "../dataset/train_dog.svm"
65 | # The path of test data
66 | #test:data = "../dataset/test_dog.svm"
67 |
68 |
69 |
--------------------------------------------------------------------------------
/V2/construct_liblinear_b1.py:
--------------------------------------------------------------------------------
1 |
2 | label_map = {}
3 | max_label = 0
4 | def load_label_map(p_in):
5 |     global max_label
6 |     for line in open(p_in):
7 |         label, c = line.strip().split('\t')
8 |         label_map[int(c)] = label
9 |         max_label = max(max_label, int(c))
10 |
11 | def get_match(p_pred, p_test, p_out):
12 |     fo = open(p_out, 'w')
13 |     fl = open(p_test)
14 |     fin = open(p_pred)
15 |     fin.readline()
16 |     for line in fin:
17 |         c = int(line.split(' ')[0])
18 |         #c = max(0, min(max_label, int(float(line.strip())+0.5)))
19 |         label = label_map[c]
20 |         feats = fl.readline().strip()
21 |         fo.write('%s\t%s\n' % (feats, label))
22 |     fin.close()
23 |     fl.close()
24 |     fo.close()
25 |
26 | if __name__ == '__main__':
27 |     import sys
28 |     if len(sys.argv) != 5:
29 |         print 'usage: %s pred test label out' % sys.argv[0]
30 |         exit(1)
31 |
32 |     load_label_map(sys.argv[3])
33 |     print 'max_label:', max_label
34 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
--------------------------------------------------------------------------------
/V2/construct_maxprob.py:
--------------------------------------------------------------------------------
1 |
2 | label_map = {}
3 | max_label = 0
4 | adjust = {}
5 | def load_label_map(p_in):
6 |     global max_label
7 |     for line in open(p_in):
8 |         label, c = line.strip().split('\t')
9 |         label_map[int(c)] = label
10 |         max_label = max(max_label, int(c))
11 |
12 |     for i in range(max_label+1):
13 |         adjust[i] = 0
14 |
15 | def get_match(p_pred, p_test, p_out):
16 |     npred = len(open(p_pred).readlines()) / len(open(p_test).readlines())
17 |     fo = open(p_out, 'w')
18 |     fp = open(p_pred)
19 |     for line in open(p_test):
20 |         feats = line.strip()
21 |
22 |         pred = []
23 |         for i in range(npred):
24 |             if i <= max_label:
25 |                 pred.append(float(fp.readline().strip()))
26 |             else:
27 |                 fp.readline()
28 |         c = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0]
29 |         label = label_map[c]
30 |         fo.write('%s\t%s\n' % (feats, label))
31 |     fp.close()
32 |     fo.close()
33 |
34 | if __name__ == '__main__':
35 |     import sys
36 |     if len(sys.argv) < 5:
37 |         print 'usage: %s pred test label out [adjust]' % sys.argv[0]
38 |         exit(1)
39 |
40 |     load_label_map(sys.argv[3])
41 |     print 'max_label:', max_label
42 |     if len(sys.argv) >= 6:
43 |         for line in open(sys.argv[5]):
44 |             cid, v = line.strip().split('\t')
45 |             adjust[int(cid)] = float(v)
46 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
--------------------------------------------------------------------------------
/V2/construct_maxprob_multi.py:
--------------------------------------------------------------------------------
1 |
2 | label_map = {}
3 | max_label = 0
4 | adjust = {}
5 | def load_label_map(p_in):
6 |     global max_label
7 |     for line in open(p_in):
8 |         label, c = line.strip().split('\t')
9 |         label_map[int(c)] = label
10 |         max_label = max(max_label, int(c))
11 |
12 |     for i in range(max_label+1):
13 |         adjust[i] = 0
14 |
15 | def get_match(p_pred, p_test, p_out):
16 |     npred = len(open(p_pred).readlines()) / len(open(p_test).readlines())
17 |     fo = open(p_out, 'w')
18 |     fp = open(p_pred)
19 |     for line in open(p_test):
20 |         feats = line.strip()
21 |
22 |         pred = []
23 |         for i in range(npred):
24 |             if i <= max_label:
25 |                 pred.append(float(fp.readline().strip()))
26 |             else:
27 |                 fp.readline()
28 |         sort_list = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])
29 |         c = sort_list[0][0]
30 |         label = label_map[c]
31 |         if sort_list[1][1] > 0.45:
32 |             label = label + ' | ' + label_map[sort_list[1][0]]
33 |         label = ' | '.join(label.split(' | ')[:2])
34 |         fo.write('%s\t%s\n' % (feats, label))
35 |     fp.close()
36 |     fo.close()
37 |
38 | if __name__ == '__main__':
39 |     import sys
40 |     if len(sys.argv) < 5:
41 |         print 'usage: %s pred test label out [adjust]' % sys.argv[0]
42 |         exit(1)
43 |
44 |     load_label_map(sys.argv[3])
45 |     print 'max_label:', max_label
46 |     if len(sys.argv) >= 6:
47 |         for line in open(sys.argv[5]):
48 |             cid, v = line.strip().split('\t')
49 |             adjust[int(cid)] = float(v)
50 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
--------------------------------------------------------------------------------
/V2/construct_semilda.py:
--------------------------------------------------------------------------------
1 |
2 | label_map = {}
3 | max_label = 0
4 | def load_label_map(p_in):
5 |     global max_label
6 |     for line in open(p_in):
7 |         label, c = line.strip().split('\t')
8 |         label_map[int(c)] = label
9 |         max_label = max(max_label, int(c))
10 |
11 | def get_match(p_pred, p_test_simple, p_test, p_out):
12 |     feat_map = {}
13 |     fl = open(p_test_simple)
14 |     fin = open(p_pred)
15 |     for line in fin:
16 |         pred = [float(v) for v in line.strip().split(' ')]
17 |         tot = sum(pred) + 0.001
18 |         pred = [v/tot for v in pred]
19 |         c = sorted([(k, v) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0]
20 |         label = label_map[c]
21 |         feats = fl.readline().strip().split('\t')[1]
22 |         feat_map[feats] = label
23 |     fin.close()
24 |     fl.close()
25 |     fo = open(p_out, 'w')
26 |     for line in open(p_test):
27 |         feats = line.strip()
28 |         fo.write('%s\t%s\n' % (feats, feat_map[feats]))
29 |     fo.close()
30 |
31 | if __name__ == '__main__':
32 |     import sys
33 |     if len(sys.argv) != 6:
34 |         print 'usage: %s pred test.simple test label out' % sys.argv[0]
35 |         exit(1)
36 |
37 |     load_label_map(sys.argv[4])
38 |     print 'max_label:', max_label
39 |     get_match(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[5])
--------------------------------------------------------------------------------
/V2/construct_session_prob.py:
--------------------------------------------------------------------------------
1 |
2 | label_map = {}
3 | max_label = 0
4 | def load_label_map(p_in):
5 |     global max_label
6 |     for line in open(p_in):
7 |         label, c = line.strip().split('\t')
8 |         label_map[int(c)] = label
9 |         max_label = max(max_label, int(c))
10 |
11 |
12 | def get_match(p_pred, p_session, p_test, p_out):
13 |     npred = len(open(p_pred).readlines()) / len(open(p_session).readlines())
14 |     query_dict = {}
15 |     query_num = {}
16 |     for line in open(p_test):
17 |         query = line.strip()
18 |         query_dict[query] = [0]*npred
19 |         query_num[query] = 0
20 |
21 |     fp = open(p_pred)
22 |     for line in open(p_session):
23 |         query_list = line.strip().split('\t')[1].split(';')
24 |
25 |         pred = []
26 |         for i in range(npred):
27 |             if i <= max_label:
28 |                 pred.append(float(fp.readline().strip()))
29 |             else:
30 |                 fp.readline()
31 |         for query in query_list:
32 |             if query in query_dict:
33 |                 for i, v in enumerate(pred):
34 |                     query_dict[query][i] += v
35 |                 query_num[query] += 1
36 |     fp.close()
37 |     print 'query_dict', len(query_dict)
38 |
39 |     not_in_test = 0
40 |     fo = open(p_out, 'w')
41 |     for line in open(p_test):
42 |         query = line.strip()
43 |         #c = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0]
44 |         #label = label_map[c]
45 |         if query_num[query] > 0:
46 |             for i in range(npred):
47 |                 fo.write('%s\n' % (query_dict[query][i]/query_num[query]))
48 |         else:
49 |             for i in range(npred):
50 |                 fo.write('0\n')
51 |             not_in_test += 1
52 |     fo.close()
53 |     print 'not in session:', not_in_test
54 |
55 | if __name__ == '__main__':
56 |     import sys
57 |     if len(sys.argv) < 6:
58 |         print 'usage: %s pred session testid label out' % sys.argv[0]
59 |         exit(1)
60 |
61 |     load_label_map(sys.argv[4])
62 |     print 'max_label:', max_label
63 |
64 |     get_match(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[5])
--------------------------------------------------------------------------------
/V2/markov_sessoin_label.py:
--------------------------------------------------------------------------------
1 |
2 | p_train = '../raw_data/train.txt'
3 | p_test = '../raw_data/test.txt'
4 | p_dog = '../trans_data/dog.txt'
5 | p_valid = '../trans_data/valid.txt'
6 |
7 | def markov(p_in, p_query, p_out):
8 |     test_label = {}
9 |     unknown_label = {}
10 |     test_unknown = {}
11 |
12 |     label_query = {}
13 |     unknown_query = {}
14 |     test_query = {}
15 |     session = []
16 |     for line in open(p_in):
17 |         if not line.strip():
18 |             n_query = len(label_query) + len(unknown_query) + len(test_query)
19 |             label_dict = {}
20 |             for query, label in label_query.items():
21 |                 label_dict[label] = label_dict.get(label, 0) + 1
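            # A session is treated as label-consistent when its labelled
            # queries carry at most one distinct label, or two labels where
            # one is a prefix of the other (e.g. "CLASS=X" vs "CLASS=X | CLASS=Y").
            # Only then are labels propagated: each labelled query votes into
            # test_label for every TEST query and into unknown_label for every
            # (long enough) UNKNOWN query, while test_unknown records
            # TEST-UNKNOWN co-occurrence so that UNKNOWN queries can relay
            # votes one hop further when a TEST query never co-occurred with a
            # labelled query directly.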
22 | if len(label_dict) <= 1 or (len(label_dict) == 2 and (label_dict.keys()[0].find(label_dict.keys()[1])==0 or label_dict.keys()[1].find(label_dict.keys()[0])==0)): 23 | for query in test_query: 24 | if query not in test_label: 25 | test_label[query] = {} 26 | for query2, label in label_query.items(): 27 | test_label[query][label] = test_label[query].get(label, 0) + 1 28 | if query not in test_unknown: 29 | test_unknown[query] = {} 30 | for query2 in unknown_query: 31 | test_unknown[query][query2] = test_unknown[query].get(query2, 0) + 1 32 | for query in unknown_query: 33 | if query not in unknown_label: 34 | unknown_label[query] = {} 35 | for query2, label in label_query.items(): 36 | unknown_label[query][label] = unknown_label[query].get(label, 0) + 1 37 | else: 38 | #print session 39 | pass 40 | label_query = {} 41 | unknown_query = {} 42 | test_query = {} 43 | session = [] 44 | continue 45 | label, query = line.strip().split('\t')[:2] 46 | label = ' | '.join(sorted(label.split(' | '))) 47 | if not session or query != session[-1][1]: 48 | session.append( (label, query) ) 49 | if label=='CLASS=TEST': 50 | test_query[query] = 1 51 | elif label=='CLASS=UNKNOWN': 52 | if query.count(' ') > 1: 53 | unknown_query[query] = 1 54 | else: 55 | label_query[query] = label 56 | 57 | with open(p_out, 'w') as fo: 58 | for line in open(p_query): 59 | query = line.strip() 60 | if query in test_label and test_label[query]: 61 | s = ['%s:%s' % (k, v) for k, v in test_label[query].items()] 62 | fo.write('%s\n' % (' || '.join(s))) 63 | elif query in test_unknown: 64 | label_dict = {} 65 | for query2, v1 in test_unknown[query].items(): 66 | if query2 in unknown_label: 67 | for label, v2 in unknown_label[query2].items(): 68 | label_dict[label] = label_dict.get(label, 0) + v1*v2 69 | if label_dict: 70 | s = ['%s:%s' % (k, v) for k, v in label_dict.items()] 71 | fo.write('%s\n' % (' || '.join(s))) 72 | else: 73 | fo.write('\n') 74 | else: 75 | fo.write('\n') 76 | 77 | markov(p_dog, p_valid, 'pred_session_dog.txt') 78 | markov(p_train, p_test, 'pred_session_pig.txt') 79 | 80 | -------------------------------------------------------------------------------- /V2/metric_F1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Evaluation metric for the CIKM CUP 2014 4 | F-score 5 | 6 | @author: Michael Liu 7 | Created: Thu July 22 2014 8 | """ 9 | 10 | import os 11 | import csv 12 | import math 13 | 14 | def create_solution_dictionary(solution): 15 | """ 16 | """ 17 | 18 | solnDict = {} 19 | with open(solution, 'rb') as f: 20 | for line in f: 21 | query, labels = line.strip().split('\t') 22 | label_list = labels.split(' | ') 23 | solnDict[query] = label_list 24 | return solnDict 25 | 26 | def check_submission(submission, solutionDict): 27 | """ 28 | """ 29 | 30 | submissionDict = {} 31 | with open(submission, 'rb') as f: 32 | for line in f: 33 | query, labels = line.strip('\n').split('\t') 34 | if query in submissionDict: 35 | print 'duplicate id in submission' 36 | return False 37 | if query not in solutionDict: 38 | print 'submission id must in solution' 39 | return False 40 | label_list = labels.split(' | ') 41 | submissionDict[query] = label_list 42 | 43 | if len(submissionDict) != len(solutionDict): 44 | print 'size of submission and solution must be the same' 45 | return False 46 | return submissionDict 47 | 48 | def F1_metric(solution, submission): 49 | """ 50 | """ 51 | 52 | solutionDict = create_solution_dictionary(solution) 53 | 
submissionDict = check_submission(submission, solutionDict) 54 | 55 | if submissionDict: 56 | true_positive = {} 57 | all_positive = {} 58 | groundtruth = {} 59 | 60 | for query in solutionDict: 61 | label_list = set(submissionDict[query]) 62 | truth_list = set(solutionDict[query]) 63 | for label in label_list: 64 | if label in truth_list: 65 | true_positive[label] = true_positive.get(label, 0) + 1. 66 | all_positive[label] = all_positive.get(label, 0) + 1 67 | for label in truth_list: 68 | groundtruth[label] = groundtruth.get(label, 0) + 1 69 | 70 | precision_list = [] 71 | recall_list = [] 72 | for label in groundtruth: 73 | precision = 0 74 | if label in all_positive: 75 | precision = true_positive.get(label, 0) / all_positive.get(label, 0) 76 | print label, 'precision', precision 77 | 78 | recall = true_positive[label] / groundtruth[label] 79 | print label, 'recall', recall 80 | 81 | precision_list.append(precision) 82 | recall_list.append(recall) 83 | 84 | ap = sum(precision_list) / len(recall_list) 85 | ar = sum(recall_list) / len(recall_list) 86 | F1 = 2*ap*ar / (ap + ar) 87 | print 'ap', ap 88 | print 'ar', ar 89 | print 'F1', F1 90 | 91 | if __name__ == "__main__": 92 | solutionFile = "" 93 | submissionFile = "" 94 | 95 | import sys 96 | if len(sys.argv) < 3: 97 | print ' solution submission' 98 | exit(-1) 99 | solutionFile = sys.argv[1] 100 | submissionFile = sys.argv[2] 101 | 102 | F1_metric(solutionFile, submissionFile) 103 | 104 | 105 | -------------------------------------------------------------------------------- /V2/prepare_ensemble_cat.py: -------------------------------------------------------------------------------- 1 | n_sample = 6000 2 | 3 | label_map = {} 4 | label_map2 = {} 5 | max_label = 0 6 | for line in open('../dataset/label_map_dog'): 7 | label, c = line.strip().split('\t') 8 | label_map[int(c)] = label 9 | label_map2[label] = int(c) 10 | max_label = max(max_label, int(c)) 11 | print 'max_label:', max_label 12 | 13 | weights = [] 14 | for i in xrange(n_sample): 15 | weights.append([0]*((max_label+1)*8)) 16 | 17 | if __name__ == '__main__': 18 | import sys 19 | if len(sys.argv) < 4: 20 | print ' out label in1 m1 [in2 m2 ...]' 21 | exit(1) 22 | 23 | i = 3 24 | while i < len(sys.argv): 25 | in_i = sys.argv[i] 26 | m_i = sys.argv[i+1] 27 | print in_i, m_i 28 | if m_i == 'xgboost': 29 | nclass = int(sys.argv[i+2]) 30 | fin = open(in_i) 31 | for isample in xrange(n_sample): 32 | for ipred in xrange(nclass): 33 | pred = float(fin.readline().strip()) 34 | if ipred <= max_label: 35 | weights[isample][ipred] = pred 36 | fin.close() 37 | i += 3 38 | elif m_i == 'liblinear': 39 | fin = open(in_i) 40 | fin.readline() 41 | for isample in xrange(n_sample): 42 | preds = [float(v) for v in fin.readline().strip().split(' ')[1:]] 43 | for ipred, pred in enumerate(preds): 44 | weights[isample][max_label+1+ipred] = pred 45 | i += 2 46 | elif m_i == 'semilda': 47 | lda_map = {} 48 | for line in open('../dataset/label_map_lda'): 49 | label, c = line.strip().split('\t') 50 | lda_map[int(c)] = label 51 | feat_map = {} 52 | fin = open('../trans_data/valid.simple1') 53 | for line in open(in_i): 54 | preds = [float(v) for v in line.strip().split(' ')] 55 | tot = sum(preds) + 0.001 56 | preds = [v/tot for v in preds] 57 | feats = fin.readline().strip().split('\t')[1] 58 | feat_map[feats] = preds 59 | fin.close() 60 | fin = open('../trans_data/cat.txt') 61 | for isample in xrange(n_sample): 62 | feats = fin.readline().strip() 63 | preds = feat_map[feats] 64 | for ipred, pred in 
1 | n_sample = 5961
2 |
3 | label_map = {}
4 | label_map2 = {}
5 | max_label = 0
6 | for line in open('../dataset/label_map_dog'):
7 |     label, c = line.strip().split('\t')
8 |     label_map[int(c)] = label
9 |     label_map2[label] = int(c)
10 |     max_label = max(max_label, int(c))
11 | print 'max_label:', max_label
12 |
13 | weights = []
14 | for i in xrange(n_sample):
15 |     weights.append([0]*((max_label+1)*8))
16 |
17 | if __name__ == '__main__':
18 |     import sys
19 |     if len(sys.argv) < 4:
20 |         print 'usage: python prepare_ensemble_dog.py <out> <label> <in1> <m1> [<in2> <m2> ...]'
21 |         exit(1)
22 |
23 |     i = 3
24 |     while i < len(sys.argv):
25 |         in_i = sys.argv[i]
26 |         m_i = sys.argv[i+1]
27 |         print in_i, m_i
28 |         if m_i == 'xgboost':
29 |             nclass = int(sys.argv[i+2])
30 |             fin = open(in_i)
31 |             for isample in xrange(n_sample):
32 |                 for ipred in xrange(nclass):
33 |                     pred = float(fin.readline().strip())
34 |                     if ipred <= max_label:
35 |                         weights[isample][ipred] = pred
36 |             fin.close()
37 |             i += 3
38 |         elif m_i == 'liblinear':
39 |             fin = open(in_i)
40 |             fin.readline()
41 |             for isample in xrange(n_sample):
42 |                 preds = [float(v) for v in fin.readline().strip().split(' ')[1:]]
43 |                 for ipred, pred in enumerate(preds):
44 |                     weights[isample][max_label+1+ipred] = pred
45 |             i += 2
46 |         elif m_i == 'semilda':
47 |             lda_map = {}
48 |             for line in open('../dataset/label_map_lda'):
49 |                 label, c = line.strip().split('\t')
50 |                 lda_map[int(c)] = label
51 |             feat_map = {}
52 |             fin = open('../trans_data/valid.simple1')
53 |             for line in open(in_i):
54 |                 preds = [float(v) for v in line.strip().split(' ')]
55 |                 tot = sum(preds) + 0.001
56 |                 preds = [v/tot for v in preds]
57 |                 feats = fin.readline().strip().split('\t')[1]
58 |                 feat_map[feats] = preds
59 |             fin.close()
60 |             fin = open('../trans_data/valid2.txt')
61 |             for isample in xrange(n_sample):
62 |                 feats = fin.readline().strip()
63 |                 preds = feat_map[feats]
64 |                 for ipred, pred in enumerate(preds):
65 |                     ipred2 = label_map2[lda_map[ipred]]
66 |                     weights[isample][(max_label+1)*2+ipred2] = pred  # preds are already normalized above
67 |             fin.close()
68 |             i += 2
69 |         elif m_i == 'sessionlabel':
70 |             fin = open(in_i)
71 |             for isample in xrange(n_sample):
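                # each line holds ' || '-separated 'label:score' pairs
                # (e.g. 'CLASS=VIDEO:3'), as written by the markov session-label
                # step (markov_sessoin_label.py); a blank line parses to [['']]
                # and is skipped by the pairs[0][0] check below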
72 |                 try:
73 |                     pairs = [pair.split(':') for pair in fin.readline().strip().split(' || ')]
74 |                     if pairs and pairs[0] and pairs[0][0]:
75 |                         tot = sum([float(v[1]) for v in pairs])
76 |                         if tot > 0:
77 |                             for pair in pairs:
78 |                                 label, v = pair[0], float(pair[1])
79 |                                 label = ' | '.join(sorted(label.split(' | ')))
80 |                                 if label not in label_map2: label = label.split(' | ')[0]
81 |                                 c = label_map2[label]
82 |                                 weights[isample][(max_label+1)*3+c] = v / tot
83 |                 except:
84 |                     print pairs
85 |                     exit(1)
86 |             i += 2
87 |
88 |
89 |     with open(sys.argv[1], 'w') as fo:
90 |         idx = 0
91 |         for line in open(sys.argv[2]):
92 |             arr = line.strip().split('\t')
93 |             if len(arr) >= 2:
94 |                 label = arr[1]
95 |                 label = ' | '.join(sorted(label.split(' | ')))
96 |                 if label not in label_map2: label = label.split(' | ')[0]
97 |                 label = label_map2[label]
98 |             else:
99 |                 label = 0
100 |             fo.write('%s %s\n' % (label, ' '.join(['%s:%s' % (i+1, pred) for i, pred in enumerate(weights[idx])])))
101 |             idx += 1
102 |
103 |
104 |
-------------------------------------------------------------------------------- /V2/prepare_ensemble_pig.py: --------------------------------------------------------------------------------
1 | n_sample = 39013
2 |
3 | label_map = {}
4 | label_map2 = {}
5 | max_label = 0
6 | for line in open('../dataset/label_map_dog'):
7 |     label, c = line.strip().split('\t')
8 |     label_map[int(c)] = label
9 |     label_map2[label] = int(c)
10 |     max_label = max(max_label, int(c))
11 | print 'max_label:', max_label
12 |
13 | weights = []
14 | for i in xrange(n_sample):
15 |     weights.append([0]*((max_label+1)*8))
16 |
17 | if __name__ == '__main__':
18 |     import sys
19 |     if len(sys.argv) < 4:
20 |         print 'usage: python prepare_ensemble_pig.py <out> <label> <in1> <m1> [<in2> <m2> ...]'
21 |         exit(1)
22 |
23 |     i = 3
24 |     while i < len(sys.argv):
25 |         in_i = sys.argv[i]
26 |         m_i = sys.argv[i+1]
27 |         print in_i, m_i
28 |         if m_i == 'xgboost':
29 |             nclass = int(sys.argv[i+2])
30 |             fin = open(in_i)
31 |             for isample in xrange(n_sample):
32 |                 for ipred in xrange(nclass):
33 |                     pred = float(fin.readline().strip())
34 |                     if ipred <= max_label:
35 |                         weights[isample][ipred] = pred
36 |             fin.close()
37 |             i += 3
38 |         elif m_i == 'liblinear':
39 |             fin = open(in_i)
40 |             fin.readline()
41 |             for isample in xrange(n_sample):
42 |                 preds = [float(v) for v in fin.readline().strip().split(' ')[1:]]
43 |                 for ipred, pred in enumerate(preds):
44 |                     weights[isample][max_label+1+ipred] = pred
45 |             i += 2
46 |         elif m_i == 'semilda':
47 |             lda_map = {}
48 |             for line in open('../dataset/label_map_lda'):
49 |                 label, c = line.strip().split('\t')
50 |                 lda_map[int(c)] = label
51 |             feat_map = {}
52 |             fin = open('../trans_data/test.simple1')
53 |             for line in open(in_i):
54 |                 preds = [float(v) for v in line.strip().split(' ')]
55 |                 tot = sum(preds) + 0.001
56 |                 preds = [v/tot for v in preds]
57 |                 feats = fin.readline().strip().split('\t')[1]
58 |                 feat_map[feats] = preds
59 |             fin.close()
60 |             fin = open('../raw_data/test.txt')
61 |             for isample in xrange(n_sample):
62 |                 feats = fin.readline().strip()
63 |                 preds = feat_map[feats]
64 |                 for ipred, pred in enumerate(preds):
65 |                     ipred2 = label_map2[lda_map[ipred]]
66 |                     weights[isample][(max_label+1)*2+ipred2] = pred  # preds are already normalized above
67 |             fin.close()
68 |             i += 2
69 |         elif m_i == 'sessionlabel':
70 |             fin = open(in_i)
71 |             for isample in xrange(n_sample):
72 |                 try:
73 |                     pairs = [pair.split(':') for pair in fin.readline().strip().split(' || ')]
74 |                     if pairs and pairs[0] and pairs[0][0]:
75 |                         tot = sum([float(v[1]) for v in pairs])
76 |                         if tot > 0:
77 |                             for pair in pairs:
78 |                                 label, v = pair[0],
float(pair[1]) 79 | label = ' | '.join(sorted(label.split(' | '))) 80 | if label not in label_map2: label = label.split(' | ')[0] 81 | c = label_map2[label] 82 | weights[isample][(max_label+1)*3+c] = v / tot 83 | except: 84 | print pairs 85 | exit(1) 86 | i += 2 87 | 88 | 89 | with open(sys.argv[1], 'w') as fo: 90 | idx = 0 91 | for line in open(sys.argv[2]): 92 | arr = line.strip().split('\t') 93 | if len(arr) >= 2: 94 | label = arr[1] 95 | label = ' | '.join(sorted(label.split(' | '))) 96 | if label not in label_map2: label = label.split(' | ')[0] 97 | label = label_map2[label] 98 | else: 99 | label = 0 100 | fo.write('%s %s\n' % (label, ' '.join(['%s:%s' % (i+1, pred) for i, pred in enumerate(weights[idx])]))) 101 | idx += 1 102 | 103 | 104 | -------------------------------------------------------------------------------- /V2/prepare_feature_dog1.py: -------------------------------------------------------------------------------- 1 | 2 | p_dog_train_feat = '../trans_data/dog.simple1' 3 | p_dog_valid_feat = '../trans_data/valid.simple1' 4 | p_dog_valid_id = '../trans_data/valid.txt' 5 | 6 | p_pig_train_feat = '../trans_data/train.simple1' 7 | p_pig_valid_feat = '../trans_data/test.simple1' 8 | p_pig_valid_id = '../raw_data/test.txt' 9 | 10 | p_label_map = '../dataset/label_map_dog' 11 | p_dog_train = '../dataset/dog_train' 12 | p_dog_test = '../dataset/dog_test' 13 | p_pig_train = '../dataset/pig_train' 14 | p_pig_test = '../dataset/pig_test' 15 | 16 | min_word_df = 5 17 | min_title_df = 10 18 | 19 | label_map = {} 20 | cur_label = 0 21 | word_map = {} 22 | cur_word = 1 23 | 24 | pig_train = [] 25 | test_dict = {} 26 | pig_test = [] 27 | 28 | label_df = {} 29 | word_df = {} 30 | 31 | def get_df(p_in): 32 | for line in open(p_in): 33 | row = line.strip().split('\t') 34 | label = row[0] 35 | label = ' | '.join(sorted(label.split(' | '))) 36 | label_df[label] = label_df.get(label, 0) + 1 37 | query = row[1] 38 | titles = row[2] if len(row)>=3 else '' 39 | session_queries = row[5] if len(row)>=6 else '' 40 | session_titles = row[6] if len(row)>=7 else '' 41 | 42 | feat_list = query.split(' ') 43 | for i, word in enumerate(feat_list): 44 | if not word: continue 45 | word_df[word] = word_df.get(word, 0) + 1 46 | if i>=1: 47 | word = ' '.join(feat_list[i-1:i+1]) 48 | word_df[word] = word_df.get(word, 0) + 1 49 | word = '%s_%s' % (i, feat_list[i]) 50 | word_df[word] = word_df.get(word, 0) + 1 51 | if i >= len(feat_list)/2: 52 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 53 | word_df[word] = word_df.get(word, 0) + 1 54 | 55 | for pair in titles.split(';'): 56 | if not pair: continue 57 | title, freq = pair.split(':') 58 | feat_list = title.split(' ') 59 | for i, word in enumerate(feat_list): 60 | if not word: continue 61 | word = 't_' + word 62 | word_df[word] = word_df.get(word, 0) + 1 63 | if i>=1: 64 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 65 | word_df[word] = word_df.get(word, 0) + 1 66 | 67 | for pair in session_queries.split(';'): 68 | if not pair: continue 69 | title, freq = pair.split(':') 70 | feat_list = title.split(' ') 71 | for i, word in enumerate(feat_list): 72 | if not word: continue 73 | word = 'sq_' + word 74 | word_df[word] = word_df.get(word, 0) + 1 75 | if i>=1: 76 | word = 'sq_' + ' '.join(feat_list[i-1:i+1]) 77 | word_df[word] = word_df.get(word, 0) + 1 78 | 79 | for pair in session_titles.split(';'): 80 | if not pair: continue 81 | title, freq = pair.split(':') 82 | feat_list = title.split(' ') 83 | for i, word in enumerate(feat_list): 84 | if not word: continue 
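# session-title tokens get an 'st_' prefix so their document frequencies are
# counted in a separate namespace from query tokens, clicked-title tokens
# ('t_') and session-query tokens ('sq_')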
85 | word = 'st_' + word 86 | word_df[word] = word_df.get(word, 0) + 1 87 | if i>=1: 88 | word = 'st_' + ' '.join(feat_list[i-1:i+1]) 89 | word_df[word] = word_df.get(word, 0) + 1 90 | 91 | def prepare(p_in, p_out, isTrain, p_in2): 92 | global cur_label 93 | global cur_word 94 | 95 | if isTrain: fo = open(p_out, 'w') 96 | fin2 = open(p_in2) 97 | 98 | for line in open(p_in): 99 | row = line.strip().split('\t') 100 | label = row[0] 101 | query = row[1] 102 | titles = row[2] if len(row)>=3 else '' 103 | labels = row[3] if len(row)>=4 else '' 104 | session_queries = row[5] if len(row)>=6 else '' 105 | session_titles = row[6] if len(row)>=7 else '' 106 | 107 | row2 = fin2.readline().split('\t') 108 | if row2[0] == query: 109 | stats = row2[1] 110 | stats2 = row2[2].strip() 111 | else: 112 | print 'query mismatch' 113 | exit(1) 114 | 115 | if isTrain: 116 | label = ' | '.join(sorted(label.split(' | '))) 117 | if label_df[label] < 200: 118 | label = label.split(' ')[0] 119 | if label not in label_map: 120 | label_map[label] = cur_label 121 | cur_label += 1 122 | 123 | feat_list = query.split(' ') 124 | word_tf = {} 125 | for i, word in enumerate(feat_list): 126 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 127 | word_map[word] = cur_word 128 | cur_word += 1 129 | if word in word_map: 130 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 131 | if i>=1: 132 | word = ' '.join(feat_list[i-1:i+1]) 133 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 134 | word_map[word] = cur_word 135 | cur_word += 1 136 | if word in word_map: 137 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 138 | word = '%s_%s' % (i, feat_list[i]) 139 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 140 | word_map[word] = cur_word 141 | cur_word += 1 142 | if word in word_map: 143 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 144 | if i >= len(feat_list) / 2: 145 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 146 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 147 | word_map[word] = cur_word 148 | cur_word += 1 149 | if word in word_map: 150 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 151 | 152 | tot_freq = 0 153 | for pair in titles.split(';'): 154 | if not pair: continue 155 | title, freq = pair.split(':') 156 | tot_freq += float(freq) 157 | word_tf2 = {} 158 | for pair in titles.split(';'): 159 | if not pair: continue 160 | title, freq = pair.split(':') 161 | freq = float(freq) 162 | feat_list = title.split(' ') 163 | for i, word in enumerate(feat_list): 164 | word = 't_' + word 165 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 166 | word_map[word] = cur_word 167 | cur_word += 1 168 | if word in word_map: 169 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 170 | word = feat_list[i] 171 | ''' 172 | if word in word_map and word_map[word] in word_tf: 173 | word = 'qt_' + word 174 | if isTrain and word not in word_map: 175 | word_map[word] = cur_word 176 | cur_word += 1 177 | if word in word_map: 178 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 179 | ''' 180 | if i>=1: 181 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 182 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 183 | word_map[word] = cur_word 184 | cur_word += 1 185 | if word in word_map: 186 | 
word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 187 | 188 | tot_freq = 0 189 | for pair in session_queries.split(';'): 190 | if not pair: continue 191 | title, freq = pair.split(':') 192 | tot_freq += float(freq) 193 | for pair in session_queries.split(';'): 194 | if not pair: continue 195 | title, freq = pair.split(':') 196 | freq = float(freq) 197 | feat_list = title.split(' ') 198 | for i, word in enumerate(feat_list): 199 | word = 'sq_' + word 200 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 201 | word_map[word] = cur_word 202 | cur_word += 1 203 | if word in word_map: 204 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 205 | ''' 206 | if i>=1: 207 | word = 'sq_' + ' '.join(feat_list[i-1:i+1]) 208 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 209 | word_map[word] = cur_word 210 | cur_word += 1 211 | if word in word_map: 212 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 213 | ''' 214 | 215 | tot_freq = 0 216 | for pair in session_titles.split(';'): 217 | if not pair: continue 218 | title, freq = pair.split(':') 219 | tot_freq += float(freq) 220 | for pair in session_titles.split(';'): 221 | if not pair: continue 222 | title, freq = pair.split(':') 223 | freq = float(freq) 224 | feat_list = title.split(' ') 225 | for i, word in enumerate(feat_list): 226 | word = 'st_' + word 227 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 228 | word_map[word] = cur_word 229 | cur_word += 1 230 | if word in word_map: 231 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 232 | if i>=1: 233 | word = 'st_' + ' '.join(feat_list[i-1:i+1]) 234 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 235 | word_map[word] = cur_word 236 | cur_word += 1 237 | if word in word_map: 238 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 239 | 240 | for pair in labels.split(';'): 241 | if not pair: continue 242 | word, freq = pair.split(':') 243 | freq = float(freq) 244 | if isTrain and word not in word_map: 245 | word_map[word] = cur_word 246 | cur_word += 1 247 | if word in word_map: 248 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 249 | 250 | for pair in stats.split(';') + stats2.split(';'): 251 | if not pair: continue 252 | word, freq = pair.split(':') 253 | freq = float(freq) 254 | if isTrain and word not in word_map: 255 | word_map[word] = cur_word 256 | cur_word += 1 257 | if word in word_map: 258 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 259 | 260 | if isTrain: 261 | label, F = (label_map[label], word_tf.items() + word_tf2.items()) 262 | F = sorted(F, key=lambda d:d[0]) 263 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 264 | fo.write('%s %s\n' % (label, f_str)) 265 | else: 266 | test_dict[query] = word_tf.items() + word_tf2.items() 267 | 268 | if isTrain: fo.close() 269 | fin2.close() 270 | 271 | # save label map 272 | with open(p_label_map, 'w') as fo: 273 | for label in label_map: 274 | fo.write('%s\t%s\n' % (label, label_map[label])) 275 | 276 | 277 | def save_test(p_in, p_out): 278 | with open(p_out, 'w') as fo: 279 | for line in open(p_in): 280 | query = line.strip() 281 | F = test_dict[query] 282 | F = sorted(F, key=lambda d:d[0]) 283 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 284 | fo.write('%s %s\n' % (0, f_str)) 
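# save_test rows are in LIBSVM/LIBLINEAR sparse format -- '<label> <id>:<value> ...'
# with feature ids ascending; test rows get placeholder label 0, so a
# hypothetical output line looks like '0 12:0.25 97:0.04 1031:1.0'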
285 | 286 | get_df(p_dog_train_feat) 287 | prepare(p_dog_train_feat, p_dog_train, True, '../trans_data/dog.simple2') 288 | prepare(p_dog_valid_feat, '', False, '../trans_data/valid.simple2') 289 | save_test('../trans_data/cat.txt', '../dataset/cat_test') 290 | save_test('../trans_data/valid2.txt', '../dataset/dog_test') 291 | 292 | -------------------------------------------------------------------------------- /V2/prepare_feature_pig1.py: -------------------------------------------------------------------------------- 1 | 2 | p_dog_train_feat = '../trans_data/dog.simple1' 3 | p_dog_valid_feat = '../trans_data/valid.simple1' 4 | p_dog_valid_id = '../trans_data/valid.txt' 5 | 6 | p_pig_train_feat = '../trans_data/train.simple1' 7 | p_pig_valid_feat = '../trans_data/test.simple1' 8 | p_pig_valid_id = '../raw_data/test.txt' 9 | 10 | p_label_map = '../dataset/label_map_pig' 11 | p_dog_train = '../dataset/dog_train' 12 | p_dog_test = '../dataset/dog_test' 13 | p_pig_train = '../dataset/pig_train' 14 | p_pig_test = '../dataset/pig_test' 15 | 16 | min_word_df = 5 17 | min_title_df = 10 18 | 19 | label_map = {} 20 | cur_label = 0 21 | word_map = {} 22 | cur_word = 1 23 | 24 | pig_train = [] 25 | test_dict = {} 26 | pig_test = [] 27 | 28 | label_df = {} 29 | word_df = {} 30 | 31 | def get_df(p_in): 32 | for line in open(p_in): 33 | row = line.strip().split('\t') 34 | label = row[0] 35 | label = ' | '.join(sorted(label.split(' | '))) 36 | label_df[label] = label_df.get(label, 0) + 1 37 | query = row[1] 38 | titles = row[2] if len(row)>=3 else '' 39 | session_queries = row[5] if len(row)>=6 else '' 40 | session_titles = row[6] if len(row)>=7 else '' 41 | 42 | feat_list = query.split(' ') 43 | for i, word in enumerate(feat_list): 44 | if not word: continue 45 | word_df[word] = word_df.get(word, 0) + 1 46 | if i>=1: 47 | word = ' '.join(feat_list[i-1:i+1]) 48 | word_df[word] = word_df.get(word, 0) + 1 49 | word = '%s_%s' % (i, feat_list[i]) 50 | word_df[word] = word_df.get(word, 0) + 1 51 | if i >= len(feat_list)/2: 52 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 53 | word_df[word] = word_df.get(word, 0) + 1 54 | 55 | for pair in titles.split(';'): 56 | if not pair: continue 57 | title, freq = pair.split(':') 58 | feat_list = title.split(' ') 59 | for i, word in enumerate(feat_list): 60 | if not word: continue 61 | word = 't_' + word 62 | word_df[word] = word_df.get(word, 0) + 1 63 | if i>=1: 64 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 65 | word_df[word] = word_df.get(word, 0) + 1 66 | 67 | for pair in session_queries.split(';'): 68 | if not pair: continue 69 | title, freq = pair.split(':') 70 | feat_list = title.split(' ') 71 | for i, word in enumerate(feat_list): 72 | if not word: continue 73 | word = 'sq_' + word 74 | word_df[word] = word_df.get(word, 0) + 1 75 | if i>=1: 76 | word = 'sq_' + ' '.join(feat_list[i-1:i+1]) 77 | word_df[word] = word_df.get(word, 0) + 1 78 | 79 | for pair in session_titles.split(';'): 80 | if not pair: continue 81 | title, freq = pair.split(':') 82 | feat_list = title.split(' ') 83 | for i, word in enumerate(feat_list): 84 | if not word: continue 85 | word = 'st_' + word 86 | word_df[word] = word_df.get(word, 0) + 1 87 | if i>=1: 88 | word = 'st_' + ' '.join(feat_list[i-1:i+1]) 89 | word_df[word] = word_df.get(word, 0) + 1 90 | 91 | def prepare(p_in, p_out, isTrain, p_in2): 92 | global cur_label 93 | global cur_word 94 | 95 | if isTrain: fo = open(p_out, 'w') 96 | fin2 = open(p_in2) 97 | 98 | for line in open(p_in): 99 | row = line.strip().split('\t') 100 | 
label = row[0] 101 | query = row[1] 102 | titles = row[2] if len(row)>=3 else '' 103 | labels = row[3] if len(row)>=4 else '' 104 | session_queries = row[5] if len(row)>=6 else '' 105 | session_titles = row[6] if len(row)>=7 else '' 106 | 107 | row2 = fin2.readline().split('\t') 108 | if row2[0] == query: 109 | stats = row2[1] 110 | stats2 = row2[2].strip() 111 | else: 112 | print 'query mismatch' 113 | exit(1) 114 | 115 | if isTrain: 116 | label = ' | '.join(sorted(label.split(' | '))) 117 | if label_df[label] < 200: 118 | label = label.split(' ')[0] 119 | if label not in label_map: 120 | label_map[label] = cur_label 121 | cur_label += 1 122 | 123 | feat_list = query.split(' ') 124 | word_tf = {} 125 | for i, word in enumerate(feat_list): 126 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 127 | word_map[word] = cur_word 128 | cur_word += 1 129 | if word in word_map: 130 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 131 | if i>=1: 132 | word = ' '.join(feat_list[i-1:i+1]) 133 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 134 | word_map[word] = cur_word 135 | cur_word += 1 136 | if word in word_map: 137 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 138 | word = '%s_%s' % (i, feat_list[i]) 139 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 140 | word_map[word] = cur_word 141 | cur_word += 1 142 | if word in word_map: 143 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 144 | if i >= len(feat_list) / 2: 145 | word = '%s_%s' % (i-len(feat_list), feat_list[i]) 146 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 147 | word_map[word] = cur_word 148 | cur_word += 1 149 | if word in word_map: 150 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 151 | 152 | tot_freq = 0 153 | for pair in titles.split(';'): 154 | if not pair: continue 155 | title, freq = pair.split(':') 156 | tot_freq += float(freq) 157 | word_tf2 = {} 158 | for pair in titles.split(';'): 159 | if not pair: continue 160 | title, freq = pair.split(':') 161 | freq = float(freq) 162 | feat_list = title.split(' ') 163 | for i, word in enumerate(feat_list): 164 | word = 't_' + word 165 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 166 | word_map[word] = cur_word 167 | cur_word += 1 168 | if word in word_map: 169 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 170 | ''' 171 | word = feat_list[i] 172 | if word in word_map and word_map[word] in word_tf: 173 | word = 'qt_' + word 174 | if isTrain and word not in word_map: 175 | word_map[word] = cur_word 176 | cur_word += 1 177 | if word in word_map: 178 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 179 | ''' 180 | if i>=1: 181 | word = 't_' + ' '.join(feat_list[i-1:i+1]) 182 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 183 | word_map[word] = cur_word 184 | cur_word += 1 185 | if word in word_map: 186 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 187 | 188 | tot_freq = 0 189 | for pair in session_queries.split(';'): 190 | if not pair: continue 191 | title, freq = pair.split(':') 192 | tot_freq += float(freq) 193 | for pair in session_queries.split(';'): 194 | if not pair: continue 195 | title, freq = pair.split(':') 196 | freq = float(freq) 197 | feat_list = title.split(' ') 
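# second pass over session_queries: tot_freq, summed in the pass above, lets
# each token contribute freq/tot_freq, i.e. a frequency-weighted average over
# the queries seen in the same sessions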
198 | for i, word in enumerate(feat_list): 199 | word = 'sq_' + word 200 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 201 | word_map[word] = cur_word 202 | cur_word += 1 203 | if word in word_map: 204 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 205 | ''' 206 | if i>=1: 207 | word = 'sq_' + ' '.join(feat_list[i-1:i+1]) 208 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 209 | word_map[word] = cur_word 210 | cur_word += 1 211 | if word in word_map: 212 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 213 | ''' 214 | 215 | tot_freq = 0 216 | for pair in session_titles.split(';'): 217 | if not pair: continue 218 | title, freq = pair.split(':') 219 | tot_freq += float(freq) 220 | for pair in session_titles.split(';'): 221 | if not pair: continue 222 | title, freq = pair.split(':') 223 | freq = float(freq) 224 | feat_list = title.split(' ') 225 | for i, word in enumerate(feat_list): 226 | word = 'st_' + word 227 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 228 | word_map[word] = cur_word 229 | cur_word += 1 230 | if word in word_map: 231 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 232 | if i>=1: 233 | word = 'st_' + ' '.join(feat_list[i-1:i+1]) 234 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 235 | word_map[word] = cur_word 236 | cur_word += 1 237 | if word in word_map: 238 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 239 | 240 | for pair in labels.split(';'): 241 | if not pair: continue 242 | word, freq = pair.split(':') 243 | freq = float(freq) 244 | if isTrain and word not in word_map: 245 | word_map[word] = cur_word 246 | cur_word += 1 247 | if word in word_map: 248 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 249 | 250 | for pair in stats.split(';') + stats2.split(';'): 251 | if not pair: continue 252 | word, freq = pair.split(':') 253 | freq = float(freq) 254 | if isTrain and word not in word_map: 255 | word_map[word] = cur_word 256 | cur_word += 1 257 | if word in word_map: 258 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq 259 | 260 | if isTrain: 261 | label, F = (label_map[label], word_tf.items() + word_tf2.items()) 262 | F = sorted(F, key=lambda d:d[0]) 263 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 264 | fo.write('%s %s\n' % (label, f_str)) 265 | else: 266 | test_dict[query] = word_tf.items() + word_tf2.items() 267 | 268 | if isTrain: fo.close() 269 | fin2.close() 270 | 271 | # save label map 272 | with open(p_label_map, 'w') as fo: 273 | for label in label_map: 274 | fo.write('%s\t%s\n' % (label, label_map[label])) 275 | 276 | 277 | def save_test(p_in, p_out): 278 | with open(p_out, 'w') as fo: 279 | for line in open(p_in): 280 | query = line.strip() 281 | F = test_dict[query] 282 | F = sorted(F, key=lambda d:d[0]) 283 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 284 | fo.write('%s %s\n' % (0, f_str)) 285 | 286 | get_df(p_pig_train_feat) 287 | prepare(p_pig_train_feat, p_pig_train, True, '../trans_data/train.simple2') 288 | prepare(p_pig_valid_feat, '', False, '../trans_data/test.simple2') 289 | save_test(p_pig_valid_id, p_pig_test) 290 | 291 | -------------------------------------------------------------------------------- /V2/prepare_lda_test.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding: utf-8 3 | # @author: zuotaoliu@126.com 4 | # @created: 2014-08-29 5 | import os 6 | import sys 7 | import re 8 | 9 | def do_word_index(p_in, p_out): 10 | label_map = {} 11 | cur_label = 0 12 | word_count = {} 13 | cur_idx = 0 14 | fo = open(p_out, 'w') 15 | for line in open(p_in): 16 | row = line.rstrip().split('\t') 17 | 18 | if len(row) >= 3: 19 | feats = row[1] + ':1;' + ';'.join(row[2].split(';')[:5]) 20 | else: 21 | feats = row[1] + ':1' 22 | 23 | labels = row[0].split(' | ') 24 | cids = [] 25 | for label in labels: 26 | if label not in label_map: 27 | label_map[label] = cur_label 28 | cur_label += 1 29 | cids.append(str(label_map[label])) 30 | wc = {} 31 | for pair in feats.split(';'): 32 | if not pair: continue 33 | words, freq = pair.split(':') 34 | freq = min(1, int(freq)) 35 | for word in words.split(' '): 36 | if not word: continue 37 | wc[word] = wc.get(word, 0) + freq 38 | word_count[word] = word_count.get(word, 0) + freq 39 | fo.write('%s\n' % (' '.join(['%s %s' % (k, v) for k, v in wc.items()]))) 40 | fo.close() 41 | 42 | return word_count 43 | 44 | if __name__ == "__main__": 45 | if len(sys.argv) < 3: 46 | print ' inputfile outputfile' 47 | exit(-1) 48 | word_count = do_word_index(sys.argv[1], sys.argv[2]) 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /V2/prepare_lda_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding: utf-8 3 | # @author: zuotaoliu@126.com 4 | # @created: 2014-08-29 5 | import os 6 | import sys 7 | import re 8 | 9 | def do_word_index(p_in, p_out): 10 | label_map = {} 11 | cur_label = 0 12 | word_count = {} 13 | cur_idx = 0 14 | fo = open(p_out, 'w') 15 | for line in open(p_in): 16 | row = line.rstrip().split('\t') 17 | 18 | if len(row) >= 3: 19 | feats = row[1] + ':1;' + ';'.join(row[2].split(';')[:5]) 20 | else: 21 | feats = row[1] + ':1' 22 | 23 | labels = row[0].split(' | ') 24 | cids = [] 25 | for label in labels: 26 | if label not in label_map: 27 | label_map[label] = cur_label 28 | cur_label += 1 29 | cids.append(str(label_map[label])) 30 | wc = {} 31 | for pair in feats.split(';'): 32 | if not pair: continue 33 | words, freq = pair.split(':') 34 | freq = min(1, int(freq)) 35 | for word in words.split(' '): 36 | if not word: continue 37 | wc[word] = wc.get(word, 0) + freq 38 | word_count[word] = word_count.get(word, 0) + freq 39 | fo.write('[%s] %s\n' % (' '.join(cids), ' '.join(['%s %s' % (k, v) for k, v in wc.items()]))) 40 | fo.close() 41 | with open('../dataset/label_map_lda', 'w') as fo: 42 | for label in label_map: 43 | fo.write('%s\t%s\n' % (label, label_map[label])) 44 | 45 | return word_count 46 | 47 | if __name__ == "__main__": 48 | if len(sys.argv) < 4: 49 | print ' inputfile outputfile wordindex' 50 | exit(-1) 51 | word_count = do_word_index(sys.argv[1], sys.argv[2]) 52 | 53 | sort_list = sorted(word_count.items(), key=lambda d:d[1], reverse=True) 54 | with open(sys.argv[3], 'w') as fo: 55 | for id, pair in enumerate(sort_list): 56 | word, num = pair 57 | if num >= 5: 58 | fo.write('%s %s\n' % (id, word)) 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /V2/prepare_session.py: -------------------------------------------------------------------------------- 1 | 2 | p_dog_train_feat = '../trans_data/dog.simple5' 3 | p_dog_valid_feat = 
'../trans_data/valid.simple5' 4 | p_dog_valid_id = '../trans_data/valid.txt' 5 | 6 | p_pig_train_feat = '../trans_data/pig.simple5' 7 | p_pig_valid_feat = '../trans_data/test.simple5' 8 | p_pig_valid_id = '../raw_data/test.txt' 9 | 10 | p_label_map = '../dataset/label_map_session' 11 | p_dog_train = '../dataset/dog_train_session' 12 | p_dog_test = '../dataset/dog_test_session' 13 | p_pig_train = '../dataset/pig_train_session' 14 | p_pig_test = '../dataset/pig_test_session' 15 | 16 | min_word_df = 5 17 | min_title_df = 10 18 | 19 | label_map = {} 20 | cur_label = 0 21 | word_map = {} 22 | cur_word = 1 23 | 24 | pig_train = [] 25 | test_dict = {} 26 | pig_test = [] 27 | 28 | label_df = {} 29 | word_df = {} 30 | 31 | def get_df(p_in): 32 | for line in open(p_in): 33 | row = line.strip().split('\t') 34 | label = row[0] 35 | label = ' | '.join(sorted(label.split(' | '))) 36 | label_df[label] = label_df.get(label, 0) + 1 37 | queries = row[1] 38 | titles = row[2] if len(row)>=3 else '' 39 | 40 | for query in queries.split(';'): 41 | if not query: continue 42 | feat_list = query.split(' ') 43 | for i, word in enumerate(feat_list): 44 | if not word: continue 45 | word_df[word] = word_df.get(word, 0) + 1 46 | #if i>=1: 47 | # word = ' '.join(feat_list[i-1:i+1]) 48 | # word_df[word] = word_df.get(word, 0) + 1 49 | #word = '%s_%s' % (i, feat_list[i]) 50 | #word_df[word] = word_df.get(word, 0) + 1 51 | #if i >= len(feat_list)/2: 52 | # word = '%s_%s' % (i-len(feat_list), feat_list[i]) 53 | # word_df[word] = word_df.get(word, 0) + 1 54 | 55 | for title in titles.split(';'): 56 | if not title: continue 57 | feat_list = title.split(' ') 58 | for i, word in enumerate(feat_list): 59 | if not word: continue 60 | word = 't_' + word 61 | word_df[word] = word_df.get(word, 0) + 1 62 | #if i>=1: 63 | # word = 't_' + ' '.join(feat_list[i-1:i+1]) 64 | # word_df[word] = word_df.get(word, 0) + 1 65 | 66 | 67 | def prepare(p_in, p_out, isTrain): 68 | global cur_label 69 | global cur_word 70 | 71 | fo = open(p_out, 'w') 72 | 73 | for line in open(p_in): 74 | row = line.strip().split('\t') 75 | label = row[0] 76 | queries = row[1] 77 | titles = row[2] if len(row)>=3 else '' 78 | 79 | if isTrain: 80 | label = ' | '.join(sorted(label.split(' | '))) 81 | if label_df[label] < 50: 82 | label = label.split(' ')[0] 83 | if label not in label_map: 84 | label_map[label] = cur_label 85 | cur_label += 1 86 | 87 | word_tf = {} 88 | query_list = queries.split(';') 89 | for query in query_list: 90 | if not query: continue 91 | feat_list = query.split(' ') 92 | for i, word in enumerate(feat_list): 93 | if not word: continue 94 | if isTrain and word_df[word] >= min_word_df and word not in word_map: 95 | word_map[word] = cur_word 96 | cur_word += 1 97 | if word in word_map: 98 | word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)*1./len(query_list) 99 | #if i>=1: 100 | # word = ' '.join(feat_list[i-1:i+1]) 101 | # if isTrain and word_df[word] >= min_word_df and word not in word_map: 102 | # word_map[word] = cur_word 103 | # cur_word += 1 104 | # if word in word_map: 105 | # word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 106 | #word = '%s_%s' % (i, feat_list[i]) 107 | #if isTrain and word_df[word] >= min_word_df and word not in word_map: 108 | # word_map[word] = cur_word 109 | # cur_word += 1 110 | #if word in word_map: 111 | # word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 112 | #if i >= len(feat_list) / 2: 113 | # word = '%s_%s' % (i-len(feat_list), 
feat_list[i]) 114 | # if isTrain and word_df[word] >= min_word_df and word not in word_map: 115 | # word_map[word] = cur_word 116 | # cur_word += 1 117 | # if word in word_map: 118 | # word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list) 119 | 120 | word_tf2 = {} 121 | title_list = titles.split(';') 122 | for title in title_list: 123 | if not title: continue 124 | feat_list = title.split(' ') 125 | for i, word in enumerate(feat_list): 126 | word = 't_' + word 127 | if isTrain and word_df[word] >= min_title_df and word not in word_map: 128 | word_map[word] = cur_word 129 | cur_word += 1 130 | if word in word_map: 131 | word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*1./len(title_list) 132 | #if i>=1: 133 | # word = 't_' + ' '.join(feat_list[i-1:i+1]) 134 | # if isTrain and word_df[word] >= min_title_df and word not in word_map: 135 | # word_map[word] = cur_word 136 | # cur_word += 1 137 | # if word in word_map: 138 | # word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq 139 | 140 | 141 | if isTrain: 142 | label, F = (label_map[label], word_tf.items() + word_tf2.items()) 143 | F = sorted(F, key=lambda d:d[0]) 144 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 145 | fo.write('%s %s\n' % (label, f_str)) 146 | else: 147 | label, F = 0, word_tf.items() + word_tf2.items() 148 | F = sorted(F, key=lambda d:d[0]) 149 | f_str = ' '.join(['%s:%s' % (k, v) for k, v in F]) 150 | fo.write('%s %s\n' % (label, f_str)) 151 | 152 | if isTrain: fo.close() 153 | 154 | # save label map 155 | with open(p_label_map, 'w') as fo: 156 | for label in label_map: 157 | fo.write('%s\t%s\n' % (label, label_map[label])) 158 | 159 | 160 | #for dog 161 | #get_df(p_dog_train_feat) 162 | #prepare(p_dog_train_feat, p_dog_train, True) 163 | #for pig 164 | get_df(p_pig_train_feat) 165 | prepare(p_pig_train_feat, p_pig_train, True) 166 | 167 | #prepare(p_dog_valid_feat, p_dog_test, False) 168 | prepare(p_pig_valid_feat, p_pig_test, False) 169 | 170 | -------------------------------------------------------------------------------- /V2/refine_train_by_sesson_query.py: -------------------------------------------------------------------------------- 1 | 2 | p_train = '../raw_data/train.txt' 3 | p_test = '../raw_data/test.txt' 4 | p_dog = '../trans_data/dog.txt' 5 | p_valid = '../trans_data/valid.txt' 6 | 7 | def refine(p_in, p_out): 8 | with open(p_out, 'w') as fo: 9 | last_query = None 10 | has_known = False 11 | session_lines = [] 12 | for line in open(p_in): 13 | if not line.strip(): 14 | fo.write('\n') 15 | if has_known: 16 | for l in session_lines: 17 | fo.write(l) 18 | last_query = None 19 | has_known = False 20 | session_lines = [] 21 | else: 22 | label, query = line.strip().split('\t')[:2] 23 | query_set = set(query.split(' ')) 24 | if not last_query or (last_query & query_set): 25 | session_lines.append(line) 26 | if label != 'CLASS=UNKNOWN': 27 | has_known = True 28 | last_query = query_set 29 | else: 30 | fo.write('\n') 31 | if has_known: 32 | for l in session_lines: 33 | fo.write(l) 34 | last_query = None 35 | has_known = False 36 | session_lines = [] 37 | 38 | refine(p_dog, '../trans_data/dog_refine.txt') 39 | refine(p_train, '../trans_data/train_refine.txt') 40 | 41 | 42 | -------------------------------------------------------------------------------- /V2/run_all.sh: -------------------------------------------------------------------------------- 1 | 2 | sh -x run_prepare.sh 3 | 4 | sh -x run_liblinear_dog.sh 5 | sh -x 
run_liblinear_pig.sh
6 | sh -x run_xgboost3_dog.sh
7 | sh -x run_xgboost3_pig.sh
8 |
9 | sh -x run_ensemble.sh
10 |
-------------------------------------------------------------------------------- /V2/run_ensemble.sh: --------------------------------------------------------------------------------
1 | BIN=../../tools/xgboost3/xgboost
2 |
3 | python prepare_ensemble_cat.py ../dataset/cat_ensemble ../trans_data/cat.label pred_xgboost_cat.txt xgboost 10 pred_linear_cat.txt liblinear
4 | python prepare_ensemble_dog.py ../dataset/dog_ensemble ../trans_data/valid2.label pred_xgboost_dog.txt xgboost 10 pred_linear_dog.txt liblinear
5 | python prepare_ensemble_pig.py ../dataset/pig_ensemble ../raw_data/test.txt pred_xgboost_pig.txt xgboost 10 pred_linear_pig.txt liblinear
6 |
7 |
8 | $BIN xgboost3.conf num_round=200 num_class=10 bst:max_depth=7 data=../dataset/cat_ensemble eval[test]=../dataset/cat_ensemble
9 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/dog_ensemble
10 | mv pred.txt pred_ensemble_dog.txt
11 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/pig_ensemble
12 | mv pred.txt pred_ensemble_pig.txt
13 |
14 | python construct_maxprob.py pred_ensemble_dog.txt ../trans_data/valid2.txt ../dataset/label_map_dog ../submit/dog_ensemble.txt
15 | python metric_F1.py ../trans_data/valid2.label ../submit/dog_ensemble.txt
16 |
17 | python construct_maxprob.py pred_ensemble_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_ensemble.txt
18 | python construct_maxprob_multi.py pred_ensemble_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_ensemble2.txt
-------------------------------------------------------------------------------- /V2/run_liblinear_dog.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | TRAIN_BIN=../../tools/liblinear/train
3 | TEST_BIN=../../tools/liblinear/predict
4 |
5 | $TRAIN_BIN -s 6 -c 10 -e 0.001 ../dataset/dog_train dog.model
6 | $TEST_BIN -b 1 ../dataset/cat_test dog.model pred_linear_cat.txt
7 | $TEST_BIN -b 1 ../dataset/dog_test dog.model pred_linear_dog.txt
8 |
9 | python construct_liblinear_b1.py pred_linear_dog.txt ../trans_data/valid2.txt ../dataset/label_map_dog ../submit/dog_linear.txt
10 | python metric_F1.py ../trans_data/valid2.label ../submit/dog_linear.txt
11 |
12 |
13 |
14 |
-------------------------------------------------------------------------------- /V2/run_liblinear_pig.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | TRAIN_BIN=../../tools/liblinear/train
3 | TEST_BIN=../../tools/liblinear/predict
4 |
5 | $TRAIN_BIN -s 6 -c 10 -e 0.001 ../dataset/pig_train pig.model
6 | $TEST_BIN -b 1 ../dataset/pig_test pig.model pred_linear_pig.txt
7 |
8 | python construct_liblinear_b1.py pred_linear_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_linear.txt
9 |
10 |
11 |
12 |
-------------------------------------------------------------------------------- /V2/run_prepare.sh: --------------------------------------------------------------------------------
1 | # split train to dog/valid/pig
2 | # dog/valid are for offline tuning, pig/test are for leaderboard submission
3 | python split_train.py
4 |
5 | # aggregate titles and clicks for each query
6 | python trans_train1.py
7 | # get statistics data for each query
8 | python trans_train2.py
9 |
10 | # prepare features
11 | python prepare_feature_dog1.py
12 | python prepare_feature_pig1.py
13 |
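After run_prepare.sh finishes, every downstream script assumes its intermediate files exist. A quick sanity check -- a minimal sketch, assuming the relative paths used by the scripts above (adjust if your layout differs):

import os

expected = ['../trans_data/dog.txt', '../trans_data/valid.txt',
            '../trans_data/valid.label',
            '../trans_data/dog.simple1', '../trans_data/valid.simple1',
            '../trans_data/train.simple1', '../trans_data/test.simple1',
            '../trans_data/dog.simple2', '../trans_data/valid.simple2',
            '../trans_data/train.simple2', '../trans_data/test.simple2']
for p in expected:
    ok = os.path.exists(p) and os.path.getsize(p) > 0
    print '%-34s %s' % (p, 'ok' if ok else 'MISSING')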
-------------------------------------------------------------------------------- /V2/run_semilda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -x 2 | ldapath=../../../cpp_workspace/cpp-semilda/src 3 | 4 | train_file=../trans_data/dog.simple1 5 | ldatrain_file=../dataset/train_semilda.train 6 | 7 | index_file=../dataset/word_index 8 | seed_file=lda_seed_words 9 | model_file=lda.model 10 | 11 | num_topic=7 12 | alpha=0.5 13 | beta=0.05 14 | 15 | python prepare_lda_train.py $train_file $ldatrain_file $index_file 16 | 17 | time /Users/zuotaoliu/install/mpich2/bin/mpiexec -n 4 $ldapath/mpi_slda \ 18 | --num_topics $num_topic \ 19 | --alpha $alpha --beta $beta \ 20 | --training_data_file $ldatrain_file \ 21 | --model_file $model_file \ 22 | --word_index_file $index_file \ 23 | --compute_likelihood true \ 24 | --burn_in_iterations 50 --total_iterations 120 25 | 26 | 27 | test_file=../trans_data/valid.simple1 28 | ldatest_file=../dataset/test_semilda.test 29 | ldapred_file=pred_semilda_dog.txt 30 | 31 | python prepare_lda_test.py $test_file $ldatest_file 32 | 33 | args="--alpha ${alpha} \ 34 | --beta ${beta} \ 35 | --inference_data_file ${ldatest_file} \ 36 | --inference_result_file ${ldapred_file} \ 37 | --model_file ${model_file} \ 38 | --burn_in_iterations 50 \ 39 | --total_iterations 120 \ 40 | --file_type 0 41 | " 42 | 43 | time $ldapath/infer $args 44 | python construct_semilda.py pred_semilda_dog.txt $test_file ../trans_data/valid.txt ../dataset/label_map_lda ../submit/dog_semilda.txt 45 | python metric_F1.py ../trans_data/valid.label ../submit/dog_semilda.txt 46 | 47 | test_file=../trans_data/test.simple1 48 | ldatest_file=../dataset/test_semilda.test 49 | ldapred_file=pred_semilda_pig.txt 50 | 51 | python prepare_lda_test.py $test_file $ldatest_file 52 | 53 | args="--alpha ${alpha} \ 54 | --beta ${beta} \ 55 | --inference_data_file ${ldatest_file} \ 56 | --inference_result_file ${ldapred_file} \ 57 | --model_file ${model_file} \ 58 | --burn_in_iterations 50 \ 59 | --total_iterations 120 \ 60 | --file_type 0 61 | " 62 | 63 | time $ldapath/infer $args 64 | python construct_semilda.py pred_semilda_pig.txt $test_file ../raw_data/test.txt ../dataset/label_map_lda ../submit/pig_semilda.txt 65 | 66 | -------------------------------------------------------------------------------- /V2/run_session_label.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -x 2 | python trans_session.py 3 | python prepare_session.py 4 | python markov_sessoin_label.py 5 | 6 | #python construct_semilda.py pred_semilda_dog.txt $test_file ../trans_data/valid.txt ../dataset/label_map_lda ../submit/dog_semilda.txt 7 | #python metric_F1.py ../trans_data/valid.label ../submit/dog_semilda.txt 8 | 9 | #python construct_semilda.py pred_semilda_pig.txt $test_file ../raw_data/test.txt ../dataset/label_map_lda ../submit/pig_semilda.txt 10 | 11 | -------------------------------------------------------------------------------- /V2/run_xgboost3_dog.sh: -------------------------------------------------------------------------------- 1 | BIN=../../tools/xgboost3/xgboost 2 | 3 | $BIN xgboost3.conf num_round=200 num_class=10 bst:max_depth=7 data=../dataset/dog_train eval[test]=../dataset/dog_train 4 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/cat_test 5 | mv pred.txt pred_xgboost_cat.txt 6 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/dog_test 7 | mv pred.txt 
pred_xgboost_dog.txt 8 | 9 | python construct_maxprob.py pred_xgboost_dog.txt ../trans_data/valid2.txt ../dataset/label_map_dog ../submit/dog_xgboost.txt 10 | python metric_F1.py ../trans_data/valid2.label ../submit/dog_xgboost.txt 11 | python construct_maxprob_multi.py pred_xgboost_dog.txt ../trans_data/valid2.txt ../dataset/label_map_dog ../submit/dog_xgboost2.txt 12 | python metric_F1.py ../trans_data/valid2.label ../submit/dog_xgboost2.txt 13 | 14 | -------------------------------------------------------------------------------- /V2/run_xgboost3_pig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BIN=../../tools/xgboost3/xgboost 3 | 4 | $BIN xgboost3.conf num_round=200 num_class=10 bst:max_depth=7 data=../dataset/pig_train eval[test]=../dataset/pig_train 5 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/pig_test 6 | mv pred.txt pred_xgboost_pig.txt 7 | 8 | python construct_maxprob.py pred_xgboost_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_xgboost.txt 9 | python construct_maxprob_multi.py pred_xgboost_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_xgboost2.txt 10 | -------------------------------------------------------------------------------- /V2/split_train.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | p_train = '../raw_data/train.txt' 4 | p_test = '../raw_data/test.txt' 5 | 6 | p_dog = '../trans_data/dog.txt' 7 | p_valid = '../trans_data/valid.txt' 8 | p_label = '../trans_data/valid.label' 9 | 10 | n_fold = 3 11 | 12 | train_dict = {} 13 | test_dict = {} 14 | unknown_dict = {} 15 | 16 | for line in open(p_train): 17 | if not line.strip(): continue 18 | try: 19 | label, query, title = line.strip().split('\t') 20 | except: 21 | label, query = line.strip().split('\t') 22 | title = '-' 23 | 24 | if query not in train_dict: 25 | train_dict[query] = {} 26 | train_dict[query][label] = train_dict[query].get(label, 0) + 1 27 | if label.startswith('CLASS=TEST'): 28 | test_dict[query] = 1 29 | if label.startswith('CLASS=UNKNOWN'): 30 | unknown_dict[query] = 1 31 | 32 | valid_dict = {} 33 | fv2 = open('../trans_data/valid.txt', 'w') 34 | fv3 = open('../trans_data/valid.label', 'w') 35 | for query in train_dict: 36 | if query in test_dict: continue 37 | if query in unknown_dict: continue 38 | if random.randint(0, n_fold-1) == 1: 39 | valid_dict[query] = 1 40 | label = sorted(train_dict[query].items(), key=lambda d:-d[1])[0][0] 41 | fv2.write('%s\n' % query) 42 | fv3.write('%s\t%s\n' % (query, label)) 43 | fv2.close() 44 | fv3.close() 45 | 46 | fv1 = open('../trans_data/dog.txt', 'w') 47 | for line in open(p_train): 48 | if not line.strip(): 49 | fv1.write(line) 50 | continue 51 | 52 | try: 53 | label, query, title = line.strip().split('\t') 54 | except: 55 | label, query = line.strip().split('\t') 56 | title = '-' 57 | if query in test_dict: continue 58 | if query in train_dict: 59 | if query in valid_dict: 60 | label = 'CLASS=TEST' 61 | fv1.write('%s\t%s\t%s\n' % (label, query, title)) 62 | else: 63 | fv1.write(line) 64 | fv1.close() 65 | 66 | -------------------------------------------------------------------------------- /V2/trans_session.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | p_pig_train = '../raw_data/train.txt' 4 | p_dog_train = '../trans_data/dog.txt' 5 | p_pig_out = '../trans_data/pig.simple5' 6 | p_dog_out = 
'../trans_data/dog.simple5' 7 | p_pig_valid = '../trans_data/test.simple5' 8 | p_dog_valid = '../trans_data/valid.simple5' 9 | 10 | rates = {'CLASS=VIDEO' : 0.5} 11 | 12 | 13 | def stat(p_in, p_out): 14 | session = [set(), set()] 15 | labels = [] 16 | 17 | fo = open(p_out, 'w') 18 | tot_line = 0 19 | for line in open(p_in): 20 | if not line.strip(): 21 | session_label = '' 22 | session_flag = True 23 | positive_count = 0 24 | for key in labels: 25 | label, query = key.split('\t') 26 | if label.find('TEST')>=0 or label.find('KNOWN')>=0: 27 | if positive_count < 1: session_flag = False 28 | #elif positive_count == 1: positive_count = 0 29 | #else: positive_count = 0.5 30 | else: positive_count = 0 31 | elif session_flag: 32 | if not session_label: 33 | session_label = label 34 | positive_count += 1 35 | elif session_label != label: 36 | session_flag = False 37 | else: 38 | positive_count += 1 39 | if session[0] and session_label and session_flag and positive_count > 0: 40 | if session[1] and len(session[1])<=10 and len(session[0])<=5: 41 | rate = rates.get(session_label, 1.0) 42 | if random.random()<=rate: 43 | fo.write('%s\t%s\t%s\n' % (session_label, ';'.join(session[0]), ';'.join(session[1]))) 44 | session = [set(), set()] 45 | labels = [] 46 | continue 47 | 48 | try: 49 | label, query, title = line.strip().split('\t') 50 | except: 51 | label, query = line.strip().split('\t') 52 | title = '-' 53 | 54 | key = label + '\t' + query 55 | if not labels or labels[-1] != key: 56 | labels.append(key) 57 | session[0].add(query) 58 | if title and title!='-': 59 | session[1].add(title) 60 | 61 | tot_line += 1 62 | #if tot_line == 1000000: break 63 | fo.close() 64 | 65 | def valid(p_in, p_out): 66 | session = [set(), set()] 67 | has_test = False 68 | 69 | fo = open(p_out, 'w') 70 | tot_line = 0 71 | for line in open(p_in): 72 | if not line.strip(): 73 | if has_test and session[1] and len(session[1])<=10 and len(session[0])<=5: 74 | fo.write('%s\t%s\t%s\n' % (0, ';'.join(session[0]), ';'.join(session[1]))) 75 | session = [set(), set()] 76 | has_test = False 77 | continue 78 | try: 79 | label, query, title = line.strip().split('\t') 80 | except: 81 | label, query = line.strip().split('\t') 82 | title = '-' 83 | 84 | if label.find('TEST') >= 0: 85 | has_test = True 86 | session[0].add(query) 87 | if title and title!='-': 88 | session[1].add(title) 89 | 90 | tot_line += 1 91 | #if tot_line == 10000: break 92 | fo.close() 93 | 94 | stat(p_pig_train, p_pig_out) 95 | stat(p_dog_train, p_dog_out) 96 | valid(p_pig_train, p_pig_valid) 97 | valid(p_dog_train, p_dog_valid) 98 | -------------------------------------------------------------------------------- /V2/trans_train0.py: -------------------------------------------------------------------------------- 1 | 2 | p_train = '../raw_data/train.txt' 3 | p_dog = '../trans_data/dog.txt' 4 | p_test = '../raw_data/test.txt' 5 | p_valid = '../trans_data/valid.txt' 6 | 7 | s_train = '../trans_data/train.simple0' 8 | s_test = '../trans_data/test.simple0' 9 | s_dog = '../trans_data/dog.simple0' 10 | s_valid = '../trans_data/valid.simple0' 11 | 12 | def trans(in1, in2, out1, out2): 13 | train_dict = {} 14 | test_dict = {} 15 | 16 | # for session 17 | session_train_query = {} 18 | session_test_query = {} 19 | session_labels = {} 20 | session_query = {} 21 | session_click = {} 22 | 23 | for line in open(in1): 24 | if not line.strip(): 25 | #session end 26 | for query in session_train_query: 27 | #if len(session_labels) == 1: 28 | for q2 in session_train_query: 29 | if query 
!= q2: 30 | label = session_train_query[q2] 31 | train_dict[query][2][label] = train_dict[query][2].get(label, 0) + 1 32 | for q2 in session_query: 33 | if query != q2: 34 | train_dict[query][3][q2] = train_dict[query][3].get(q2, 0) + 1 35 | for title in session_click: 36 | train_dict[query][4][title] = train_dict[query][4].get(title, 0) + 1 37 | for query in session_test_query: 38 | #if len(session_labels) == 1: 39 | for q2 in session_train_query: 40 | if query != q2: 41 | label = session_train_query[q2] 42 | test_dict[query][2][label] = test_dict[query][2].get(label, 0) + 1 43 | for q2 in session_query: 44 | if query != q2: 45 | test_dict[query][3][q2] = test_dict[query][3].get(q2, 0) + 1 46 | for title in session_click: 47 | test_dict[query][4][title] = test_dict[query][4].get(title, 0) + 1 48 | session_train_query = {} 49 | session_test_query = {} 50 | session_labels = {} 51 | session_query = {} 52 | session_click = {} 53 | continue 54 | 55 | try: 56 | label, query, title = line.strip().split('\t') 57 | except: 58 | label, query = line.strip().split('\t') 59 | title = '-' 60 | #label = ' | '.join(sorted(label.split(' | '))) 61 | 62 | if title and title != '-': 63 | session_click[title] = 1 64 | session_query[query] = 1 65 | 66 | if label.startswith('CLASS=TEST'): 67 | if query not in test_dict: 68 | test_dict[query] = [label, {}, {}, {}, {}] 69 | if title and title != '-': 70 | test_dict[query][1][title] = test_dict[query][1].get(title, 0) + 1 71 | session_test_query[query] = 1 72 | elif not label.startswith('CLASS=UNKNOWN'): 73 | if query not in train_dict: 74 | train_dict[query] = [{}, {}, {}, {}, {}] 75 | train_dict[query][0][label] = train_dict[query][0].get(label, 0) + 1 76 | if title and title != '-': 77 | train_dict[query][1][title] = train_dict[query][1].get(title, 0) + 1 78 | session_labels[label] = 1 79 | session_train_query[query] = ''.join(sorted(label.split(' | '))) 80 | 81 | n_top_title = -1 82 | n_top_label = 3 83 | n_top_query = 10 84 | n_top_session_title = 30 85 | 86 | with open(out1, 'w') as ft: 87 | for query in train_dict: 88 | label = sorted(train_dict[query][0].items(), key=lambda d:-d[1])[0][0] 89 | 90 | titles = sorted(train_dict[query][1].items(), key=lambda d:-d[1]) 91 | title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:]] 92 | 93 | labels = sorted(train_dict[query][2].items(), key=lambda d:-d[1]) 94 | label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]] 95 | tot_label = float(sum(train_dict[query][2].values())) 96 | label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]] 97 | 98 | queries = sorted(train_dict[query][3].items(), key=lambda d:-d[1]) 99 | query_pairs = [] 100 | query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]] 101 | 102 | stitles = sorted(train_dict[query][4].items(), key=lambda d:-d[1]) 103 | stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]] 104 | 105 | stat_pairs = [] 106 | 107 | ft.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs))) 108 | 109 | with open(out2, 'w') as fo: 110 | for query in test_dict: 111 | label = test_dict[query][0] 112 | 113 | titles = sorted(test_dict[query][1].items(), key=lambda d:-d[1]) 114 | title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:]] 115 | 116 | labels = sorted(test_dict[query][2].items(), key=lambda d:-d[1]) 117 | label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]] 
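# in addition to the raw counts above, the lines below append
# frequency-normalized copies prefixed with 'f' (e.g. 'fCLASS=VIDEO:0.6'),
# so downstream feature code sees both absolute and relative
# session-label evidence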
118 | tot_label = float(sum(test_dict[query][2].values())) 119 | label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]] 120 | 121 | queries = sorted(test_dict[query][3].items(), key=lambda d:-d[1]) 122 | query_pairs = [] 123 | query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]] 124 | 125 | stitles = sorted(test_dict[query][4].items(), key=lambda d:-d[1]) 126 | stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]] 127 | 128 | stat_pairs = [] 129 | 130 | fo.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs))) 131 | 132 | trans(p_train, p_test, s_train, s_test) 133 | trans(p_dog, p_valid, s_dog, s_valid) 134 | 135 | -------------------------------------------------------------------------------- /V2/trans_train1.py: -------------------------------------------------------------------------------- 1 | 2 | p_train = '../raw_data/train.txt' 3 | p_dog = '../trans_data/dog.txt' 4 | p_test = '../raw_data/test.txt' 5 | p_valid = '../trans_data/valid.txt' 6 | 7 | s_train = '../trans_data/train.simple1' 8 | s_test = '../trans_data/test.simple1' 9 | s_dog = '../trans_data/dog.simple1' 10 | s_valid = '../trans_data/valid.simple1' 11 | 12 | def trans(in1, in2, out1, out2): 13 | train_dict = {} 14 | test_dict = {} 15 | 16 | # for session 17 | session_train_query = {} 18 | session_test_query = {} 19 | session_labels = {} 20 | session_query = {} 21 | session_click = {} 22 | 23 | for line in open(in1): 24 | if not line.strip(): 25 | #session end 26 | for query in session_train_query: 27 | #if len(session_labels) == 1: 28 | for q2 in session_train_query: 29 | if query != q2: 30 | label = session_train_query[q2] 31 | train_dict[query][2][label] = train_dict[query][2].get(label, 0) + 1 32 | for q2 in session_query: 33 | if query != q2: 34 | train_dict[query][3][q2] = train_dict[query][3].get(q2, 0) + 1 35 | for title in session_click: 36 | train_dict[query][4][title] = train_dict[query][4].get(title, 0) + 1 37 | for query in session_test_query: 38 | #if len(session_labels) == 1: 39 | for q2 in session_train_query: 40 | if query != q2: 41 | label = session_train_query[q2] 42 | test_dict[query][2][label] = test_dict[query][2].get(label, 0) + 1 43 | for q2 in session_query: 44 | if query != q2: 45 | test_dict[query][3][q2] = test_dict[query][3].get(q2, 0) + 1 46 | for title in session_click: 47 | test_dict[query][4][title] = test_dict[query][4].get(title, 0) + 1 48 | session_train_query = {} 49 | session_test_query = {} 50 | session_labels = {} 51 | session_query = {} 52 | session_click = {} 53 | continue 54 | 55 | try: 56 | label, query, title = line.strip().split('\t') 57 | except: 58 | label, query = line.strip().split('\t') 59 | title = '-' 60 | #label = ' | '.join(sorted(label.split(' | '))) 61 | 62 | if title and title != '-': 63 | session_click[title] = 1 64 | session_query[query] = 1 65 | 66 | if label.startswith('CLASS=TEST'): 67 | if query not in test_dict: 68 | test_dict[query] = [label, {}, {}, {}, {}] 69 | if title and title != '-': 70 | test_dict[query][1][title] = test_dict[query][1].get(title, 0) + 1 71 | session_test_query[query] = 1 72 | elif not label.startswith('CLASS=UNKNOWN'): 73 | if query not in train_dict: 74 | train_dict[query] = [{}, {}, {}, {}, {}] 75 | train_dict[query][0][label] = train_dict[query][0].get(label, 0) + 1 76 | if title and title != '-': 77 | train_dict[query][1][title] = 
--------------------------------------------------------------------------------
/V2/trans_train1.py:
--------------------------------------------------------------------------------
1 |
2 | p_train = '../raw_data/train.txt'
3 | p_dog = '../trans_data/dog.txt'
4 | p_test = '../raw_data/test.txt'
5 | p_valid = '../trans_data/valid.txt'
6 |
7 | s_train = '../trans_data/train.simple1'
8 | s_test = '../trans_data/test.simple1'
9 | s_dog = '../trans_data/dog.simple1'
10 | s_valid = '../trans_data/valid.simple1'
11 |
12 | def trans(in1, in2, out1, out2):  # note: in2 is never read; the CLASS=TEST rows inside in1 provide the test queries
13 |     train_dict = {}
14 |     test_dict = {}
15 |
16 |     # for session
17 |     session_train_query = {}
18 |     session_test_query = {}
19 |     session_labels = {}
20 |     session_query = {}
21 |     session_click = {}
22 |
23 |     for line in open(in1):
24 |         if not line.strip():
25 |             # session end: credit each in-session query with the others' labels, queries, and clicks
26 |             for query in session_train_query:
27 |                 #if len(session_labels) == 1:
28 |                 for q2 in session_train_query:
29 |                     if query != q2:
30 |                         label = session_train_query[q2]
31 |                         train_dict[query][2][label] = train_dict[query][2].get(label, 0) + 1
32 |                 for q2 in session_query:
33 |                     if query != q2:
34 |                         train_dict[query][3][q2] = train_dict[query][3].get(q2, 0) + 1
35 |                 for title in session_click:
36 |                     train_dict[query][4][title] = train_dict[query][4].get(title, 0) + 1
37 |             for query in session_test_query:
38 |                 #if len(session_labels) == 1:
39 |                 for q2 in session_train_query:
40 |                     if query != q2:
41 |                         label = session_train_query[q2]
42 |                         test_dict[query][2][label] = test_dict[query][2].get(label, 0) + 1
43 |                 for q2 in session_query:
44 |                     if query != q2:
45 |                         test_dict[query][3][q2] = test_dict[query][3].get(q2, 0) + 1
46 |                 for title in session_click:
47 |                     test_dict[query][4][title] = test_dict[query][4].get(title, 0) + 1
48 |             session_train_query = {}
49 |             session_test_query = {}
50 |             session_labels = {}
51 |             session_query = {}
52 |             session_click = {}
53 |             continue
54 |
55 |         try:
56 |             label, query, title = line.strip().split('\t')
57 |         except ValueError:  # rows without a clicked title have only two fields
58 |             label, query = line.strip().split('\t')
59 |             title = '-'
60 |         #label = ' | '.join(sorted(label.split(' | ')))
61 |
62 |         if title and title != '-':
63 |             session_click[title] = 1
64 |         session_query[query] = 1
65 |
66 |         if label.startswith('CLASS=TEST'):
67 |             if query not in test_dict:
68 |                 test_dict[query] = [label, {}, {}, {}, {}]  # [label, clicked titles, co-session labels, co-session queries, co-session titles]
69 |             if title and title != '-':
70 |                 test_dict[query][1][title] = test_dict[query][1].get(title, 0) + 1
71 |             session_test_query[query] = 1
72 |         elif not label.startswith('CLASS=UNKNOWN'):
73 |             if query not in train_dict:
74 |                 train_dict[query] = [{}, {}, {}, {}, {}]  # [labels, clicked titles, co-session labels, co-session queries, co-session titles]
75 |             train_dict[query][0][label] = train_dict[query][0].get(label, 0) + 1
76 |             if title and title != '-':
77 |                 train_dict[query][1][title] = train_dict[query][1].get(title, 0) + 1
78 |             session_labels[label] = 1
79 |             session_train_query[query] = ''.join(sorted(label.split(' | ')))  # canonical form of a (possibly multi-) label
80 |
81 |     n_top_title = 30
82 |     n_top_label = 3
83 |     n_top_query = 10
84 |     n_top_session_title = 30
85 |
86 |     with open(out1, 'w') as ft:
87 |         for query in train_dict:
88 |             label = sorted(train_dict[query][0].items(), key=lambda d: -d[1])[0][0]  # most frequent label wins
89 |
90 |             titles = sorted(train_dict[query][1].items(), key=lambda d: -d[1])
91 |             title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:n_top_title+1]]
92 |
93 |             labels = sorted(train_dict[query][2].items(), key=lambda d: -d[1])
94 |             label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]]
95 |             tot_label = float(sum(train_dict[query][2].values()))
96 |             label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]]
97 |
98 |             queries = sorted(train_dict[query][3].items(), key=lambda d: -d[1])
99 |             query_pairs = []
100 |             query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]]
101 |
102 |             stitles = sorted(train_dict[query][4].items(), key=lambda d: -d[1])
103 |             stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]]
104 |
105 |             stat_pairs = []
106 |
107 |             ft.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs)))
108 |
109 |     with open(out2, 'w') as fo:
110 |         for query in test_dict:
111 |             label = test_dict[query][0]
112 |
113 |             titles = sorted(test_dict[query][1].items(), key=lambda d: -d[1])
114 |             title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:n_top_title+1]]
115 |
116 |             labels = sorted(test_dict[query][2].items(), key=lambda d: -d[1])
117 |             label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]]
118 |             tot_label = float(sum(test_dict[query][2].values()))
119 |             label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]]
120 |
121 |             queries = sorted(test_dict[query][3].items(), key=lambda d: -d[1])
122 |             query_pairs = []
123 |             query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]]
124 |
125 |             stitles = sorted(test_dict[query][4].items(), key=lambda d: -d[1])
126 |             stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]]
127 |
128 |             stat_pairs = []
129 |
130 |             fo.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs)))
131 |
132 | trans(p_train, p_test, s_train, s_test)
133 | trans(p_dog, p_valid, s_dog, s_valid)
134 |
135 |
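Each `.simple1` record written above is tab-separated: majority label, query, then five `;`-joined fields of `key:value` pairs (clicked titles, in-session labels plus `f`-prefixed fractions, an empty stats slot, co-session queries, co-session titles). Note that the slices keep `n_top_*+1` items, i.e. the top k+1 rather than top k. A hedged reader sketch for this format (`parse_simple1` is our name, not a repo function):

```python
# Hypothetical reader for the .simple1 records; not part of the repo.
def parse_simple1(line):
    label, query, titles, labels, stats, queries, stitles = line.rstrip('\n').split('\t')

    def pairs(field):
        # ';'-joined 'key:value' items; keys may contain ':' so split on the last one
        out = {}
        for item in field.split(';'):
            if item:
                k, _, v = item.rpartition(':')
                out[k] = float(v)
        return out

    return {'label': label, 'query': query,
            'titles': pairs(titles), 'session_labels': pairs(labels),
            'stats': pairs(stats),            # always empty in trans_train1
            'session_queries': pairs(queries), 'session_titles': pairs(stitles)}

with open('../trans_data/train.simple1') as f:
    first_record = parse_simple1(next(f))
```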
--------------------------------------------------------------------------------
/V2/trans_train2.py:
--------------------------------------------------------------------------------
1 |
2 | p_train = '../raw_data/train.txt'
3 | p_dog = '../trans_data/dog.txt'
4 | p_test = '../raw_data/test.txt'
5 | p_valid = '../trans_data/valid.txt'
6 |
7 | s_train = '../trans_data/train.simple2'
8 | s_test = '../trans_data/test.simple2'
9 | s_dog = '../trans_data/dog.simple2'
10 | s_valid = '../trans_data/valid.simple2'
11 |
12 | def trans(in1, in2, out1, out2):  # note: in2 is never read; the CLASS=TEST rows inside in1 provide the test queries
13 |     train_dict = {}
14 |     test_dict = {}
15 |
16 |     query_freq = {}
17 |     query_titles = {}
18 |
19 |     query_session = {}
20 |     query_search = {}
21 |     query_click = {}
22 |     query_dupclick = {}
23 |
24 |     query_session_search = {}
25 |     query_session_click = {}
26 |     query_session_dupclick = {}
27 |
28 |     # for session
29 |     session_train_query = {}
30 |     session_test_query = {}
31 |     session_labels = {}
32 |     session_search = 0
33 |     session_click = {}
34 |     session_query_search = {}
35 |     session_query_click = {}
36 |
37 |     for line in open(in1):
38 |         if not line.strip():
39 |             # session end: flush the per-session counters into the per-query aggregates
40 |             session_dupclick = 0
41 |             for title in session_click:
42 |                 if session_click[title] > 1: session_dupclick += 1
43 |             for query in session_train_query:
44 |                 query_session[query] = query_session.get(query, 0) + 1.
45 |                 query_session_search[query] = query_session_search.get(query, 0) + session_search
46 |                 query_session_click[query] = query_session_click.get(query, 0) + len(session_click)
47 |                 query_session_dupclick[query] = query_session_dupclick.get(query, 0) + session_dupclick
48 |                 query_search[query] = query_search.get(query, 0) + session_query_search.get(query, 0)
49 |                 query_click[query] = query_click.get(query, 0) + len(session_query_click.get(query, {}))
50 |                 session_query_dup = 0
51 |                 for title in session_query_click.get(query, {}):
52 |                     if session_query_click[query][title] > 1: session_query_dup += 1
53 |                 query_dupclick[query] = query_dupclick.get(query, 0) + session_query_dup
54 |             for query in session_test_query:
55 |                 query_session[query] = query_session.get(query, 0) + 1.
56 |                 query_session_search[query] = query_session_search.get(query, 0) + session_search
57 |                 query_session_click[query] = query_session_click.get(query, 0) + len(session_click)
58 |                 query_session_dupclick[query] = query_session_dupclick.get(query, 0) + session_dupclick
59 |                 query_search[query] = query_search.get(query, 0) + session_query_search.get(query, 0)
60 |                 query_click[query] = query_click.get(query, 0) + len(session_query_click.get(query, {}))
61 |                 session_query_dup = 0
62 |                 for title in session_query_click.get(query, {}):
63 |                     if session_query_click[query][title] > 1: session_query_dup += 1
64 |                 query_dupclick[query] = query_dupclick.get(query, 0) + session_query_dup
65 |
66 |             session_train_query = {}
67 |             session_test_query = {}
68 |             session_labels = {}
69 |             session_search = 0
70 |             session_click = {}
71 |             session_query_search = {}
72 |             session_query_click = {}
73 |             continue
74 |
75 |         try:
76 |             label, query, title = line.strip().split('\t')
77 |         except ValueError:  # rows without a clicked title have only two fields
78 |             label, query = line.strip().split('\t')
79 |             title = '-'
80 |         #label = ' | '.join(sorted(label.split(' | ')))
81 |
82 |         query_freq[query] = query_freq.get(query, 0) + 1
83 |         if title and title != '-':
84 |             if query not in query_titles: query_titles[query] = [0., 0.]  # [click count, total title words]
85 |             query_titles[query][0] += 1
86 |             query_titles[query][1] += len(title.split(' '))
87 |             session_click[title] = session_click.get(title, 0) + 1
88 |             if query not in session_query_click: session_query_click[query] = {}
89 |             session_query_click[query][title] = session_query_click[query].get(title, 0) + 1
90 |         else:
91 |             session_search += 1  # no click recorded: count the row as a search
92 |             if query not in session_query_search: session_query_search[query] = 0
93 |             session_query_search[query] += 1
94 |
95 |         if label.startswith('CLASS=TEST'):
96 |             if query not in test_dict:
97 |                 test_dict[query] = [label, {}, {}, {}]  # only membership is used here; the stats live in the dicts above
98 |             session_test_query[query] = 1
99 |         elif not label.startswith('CLASS=UNKNOWN'):
100 |             if query not in train_dict:
101 |                 train_dict[query] = [{}, {}, {}, {}]  # only membership is used here; the stats live in the dicts above
102 |             session_train_query[query] = label.replace(' ', '')
103 |
104 |
105 |     with open(out1, 'w') as ft:
106 |         for query in train_dict:
107 |             stat_pairs = []
108 |             stat_pairs.append('%s:%s' % ('query_len', len(query.split(' '))))
109 |             stat_pairs.append('%s:%s' % ('query_freq', query_freq[query]))
110 |             if query_titles.get(query, [0, 0])[0] >= 3:
111 |                 stat_pairs.append('%s:%s' % ('title_len', query_titles[query][1]/query_titles[query][0]))
112 |
113 |             stat_pairs2 = []
114 |             if query_session.get(query, 0) >= 5:
115 |                 stat_pairs2.append('%s:%s' % ('query_search', query_search[query]/query_session[query]))
116 |                 stat_pairs2.append('%s:%s' % ('query_click', query_click[query]/query_session[query]))
117 |                 stat_pairs2.append('%s:%s' % ('query_dupclick', query_dupclick[query]/query_session[query]))
118 |                 stat_pairs2.append('%s:%s' % ('query_session_search', query_session_search[query]/query_session[query]))
119 |                 stat_pairs2.append('%s:%s' % ('query_session_click', query_session_click[query]/query_session[query]))
120 |                 stat_pairs2.append('%s:%s' % ('query_session_dupclick', query_session_dupclick[query]/query_session[query]))
121 |
122 |             ft.write('%s\t%s\t%s\n' % (query, ';'.join(stat_pairs), ';'.join(stat_pairs2)))
123 |
124 |     with open(out2, 'w') as fo:
125 |         for query in test_dict:
126 |             stat_pairs = []
127 |             stat_pairs.append('%s:%s' % ('query_len', len(query.split(' '))))
128 |             stat_pairs.append('%s:%s' % ('query_freq', query_freq[query]))
129 |             if query_titles.get(query, [0, 0])[0] >= 3:
130 |                 stat_pairs.append('%s:%s' % ('title_len', query_titles[query][1]/query_titles[query][0]))
131 |
132 |             stat_pairs2 = []
133 |             if query_session.get(query, 0) >= 5:
134 |                 stat_pairs2.append('%s:%s' % ('query_search', query_search[query]/query_session[query]))
135 |                 stat_pairs2.append('%s:%s' % ('query_click', query_click[query]/query_session[query]))
136 |                 stat_pairs2.append('%s:%s' % ('query_dupclick', query_dupclick[query]/query_session[query]))
137 |                 stat_pairs2.append('%s:%s' % ('query_session_search', query_session_search[query]/query_session[query]))
138 |                 stat_pairs2.append('%s:%s' % ('query_session_click', query_session_click[query]/query_session[query]))
139 |                 stat_pairs2.append('%s:%s' % ('query_session_dupclick', query_session_dupclick[query]/query_session[query]))
140 |
141 |             fo.write('%s\t%s\t%s\n' % (query, ';'.join(stat_pairs), ';'.join(stat_pairs2)))
142 |
143 | trans(p_train, p_test, s_train, s_test)
144 | trans(p_dog, p_valid, s_dog, s_valid)
145 |
146 |
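trans_train2.py therefore emits only the aggregate statistics listed in the README: query length and frequency always; average clicked-title length once a query has at least 3 clicks; and six per-session averages once a query has appeared in at least 5 sessions (each cumulative counter divided by `query_session`). An illustrative `.simple2` record for a query passing both thresholds, with invented values:

```python
# Shape of one train.simple2 line; the numeric values here are made up.
line = ('some query\t'
        'query_len:2;query_freq:17;title_len:6.33\t'
        'query_search:1.2;query_click:2.4;query_dupclick:0.2;'
        'query_session_search:3.8;query_session_click:5.6;query_session_dupclick:0.6\n')
query, stat_pairs, stat_pairs2 = line.rstrip('\n').split('\t')
stats = dict(p.split(':') for p in stat_pairs.split(';'))
assert stats['query_freq'] == '17'
```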
--------------------------------------------------------------------------------
/V2/xgboost3.conf:
--------------------------------------------------------------------------------
1 | ### General Parameters, see comment for each definition
2 | # choose the booster type, 0: tree, 1: linear
3 | booster_type = 0
4 | # loss type, 0: linear regression
5 | # when labels are in [0,1] we can also use 1: logistic regression
6 | loss_type = 0
7 | # evaluation metrics for validation data
8 | eval_metric=merror
9 | #eval_metric=error
10 | #eval_metric=auc
11 | #eval_metric=map
12 | #eval_metric=rmse
13 | #eval_metric=ndcg
14 | # silent=1 suppresses the running messages
15 | #silent = 1
16 | # do not use the binary buffer
17 | use_buffer = 0
18 | nthread = 8
19 | #num_class = 18
20 |
21 | ### Tree Booster Parameters
22 | # step size shrinkage used in update to prevent overfitting
23 | bst:eta = 0.1
24 | #bst:eta = 0.05
25 | # minimum loss reduction required to make a further partition; larger -> more conservative
26 | bst:gamma = 1.0
27 | # minimum sum of instance weight (hessian) needed in a child; larger -> more conservative
28 | bst:min_child_weight = 1.0
29 | # maximum depth of a tree
30 | #bst:max_depth = 5
31 | # constructing method to build a tree, 0: svdfeature, 1: column major expansion, 2: row major expansion
32 | #bst:tree_maker = 1
33 |
34 | ### Linear Booster Parameters
35 | # L2 regularization term on weights
36 | bst:lambda = 0
37 | # L1 regularization term on weights
38 | bst:alpha = 0
39 | # L2 regularization term on bias
40 | bst:lambda_bias = 0
41 |
42 | ### Task parameters
43 | # specify the learning task and the corresponding learning objective
44 | #objective = multi:softmax
45 | objective = multi:softprob
46 | #objective = reg:linear
47 | #objective = reg:logistic
48 | #objective = binary:logistic
49 | #objective = binary:logitraw
50 | # the number of boosting rounds
51 | num_round = 10
52 | # 0 means do not save any model except the final-round model
53 | save_period = 0
54 | # the initial prediction score
55 | base_score = 0.0
56 | # feature map
57 | #fmap = "../dataset/feat_map.txt"
58 | # file name for the model dump
59 | name_dump = "dump.nice.txt"
60 | # the path of the training data
61 | #data = "../dataset/train_dog.svm"
62 | # the path of the validation data used to monitor training; [test] sets the name of the validation set
63 | #eval[test] = "../dataset/train_dog.svm"
64 | # the path of the test data
65 | #test:data = "../dataset/test_dog.svm"
66 |
67 |
--------------------------------------------------------------------------------
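For orientation, a config like the one above is consumed by the xgboost command-line binary referenced in the README (`../../tools/xgboost3/`); any `key=value` argument passed after the config file overrides it, which is how the run_xgboost3_*.sh scripts can swap in per-task values such as `num_class` and the data paths. A sketch of that invocation pattern from Python follows; the binary name, argument set, and file paths are assumptions in the style of the 2014-era xgboost demos, not copied from the repo's scripts:

```python
# Assumed invocation pattern for the 2014-era xgboost CLI; paths are hypothetical.
import subprocess

XGB = '../../tools/xgboost3/xgboost'  # binary location assumed in the README

# Train: key=value pairs on the command line override xgboost3.conf.
subprocess.check_call([XGB, 'xgboost3.conf',
                       'num_class=18',
                       'data=../dataset/train_dog.svm',
                       'eval[test]=../dataset/valid_dog.svm',
                       'model_out=xgb_dog.model'])

# Predict: reuse the config, load the saved model, write class probabilities.
subprocess.check_call([XGB, 'xgboost3.conf',
                       'task=pred',
                       'model_in=xgb_dog.model',
                       'test:data=../dataset/test_dog.svm',
                       'name_pred=pred_dog.txt'])
```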