├── README.md
├── V1
    ├── averaging_methods.py
    ├── construct_liblinear_b1.py
    ├── construct_liblinear_multi.py
    ├── construct_maxlabel.py
    ├── construct_maxprob.py
    ├── construct_maxprob_balance.py
    ├── construct_maxprob_multi.py
    ├── construct_semilda.py
    ├── construct_session_prob.py
    ├── forest.py
    ├── markov_all.py
    ├── markov_sessoin_label.py
    ├── metric_F1.py
    ├── metric_confusion.py
    ├── prepare1.py
    ├── prepare2.py
    ├── prepare3.py
    ├── prepare_lda_test.py
    ├── prepare_lda_train.py
    ├── prepare_liblinear_1vsA.py
    ├── prepare_session.py
    ├── refine_train_by_sesson_query.py
    ├── run_average.sh
    ├── run_forest_dog.sh
    ├── run_liblinear_multi.sh
    ├── run_liblinear_pig.sh
    ├── run_semilda_pig.sh
    ├── run_xgboost3_dog.sh
    ├── run_xgboost3_pig.sh
    ├── run_xgboost3_pig2.sh
    ├── run_xgboost3_session.sh
    ├── split_train.py
    ├── trans_session.py
    ├── trans_train1.py
    ├── trans_train2.py
    └── xgboost3.conf
└── V2
    ├── construct_liblinear_b1.py
    ├── construct_maxprob.py
    ├── construct_maxprob_multi.py
    ├── construct_semilda.py
    ├── construct_session_prob.py
    ├── markov_sessoin_label.py
    ├── metric_F1.py
    ├── prepare_ensemble_cat.py
    ├── prepare_ensemble_dog.py
    ├── prepare_ensemble_pig.py
    ├── prepare_feature_dog1.py
    ├── prepare_feature_pig1.py
    ├── prepare_lda_test.py
    ├── prepare_lda_train.py
    ├── prepare_session.py
    ├── refine_train_by_sesson_query.py
    ├── run_all.sh
    ├── run_ensemble.sh
    ├── run_liblinear_dog.sh
    ├── run_liblinear_pig.sh
    ├── run_prepare.sh
    ├── run_semilda.sh
    ├── run_session_label.sh
    ├── run_xgboost3_dog.sh
    ├── run_xgboost3_pig.sh
    ├── split_train.py
    ├── trans_session.py
    ├── trans_train0.py
    ├── trans_train1.py
    ├── trans_train2.py
    └── xgboost3.conf


/README.md:
--------------------------------------------------------------------------------
 1 | Fancyspeed's solution for CIKM2014 Cup (the 5th place).
 2 | ===================================================
 3 | 
 4 | ## Background
 5 | 
 6 | The task is query classification, or query intent detection. 
 7 | 
 8 | About the competition, please visit http://cikm2014.fudan.edu.cn/index.php/Index/index and http://openresearch.baidu.com/topic/71.jspx
 9 | 
10 | 
11 | ## Challenges
12 | 
13 | * Multi-class multi-label
14 | * Short text
15 | * Click and session
16 | * Unlabelled data
17 | * Unbalanced data
18 | 
19 | ## Ideas for each challenge
20 | 
21 | * Structured labels
22 | * N-gram, word position, aggregated query as a sample
23 | * In-session queries and labels, keyword and entity detection
24 | * Semi-supervised learning
25 | * Sampling, post-processin
26 | 
27 | ## Features
28 | 
29 | * query words (1-gram, 2-gram, word position)
30 | * clicked title words (1-gram, 2-gram)
31 | * words of top 30 titles in query's same sessions
32 | * words of top 3 labels in query's same sessions
33 | * labels in query's same sessions
34 | * query length
35 | * query frequence
36 | * average length of clicked titles
37 | * average search times in query's same sessions
38 | * average click times in query's same sessions
39 | * averge duplicated clicks in query's same sessions
40 | 
41 | ## Methods and tools
42 | 
43 | * GBM: Xgboost with softmax-objective
44 | * SVC: Liblinear
45 | * Multi-class LR: Sklearn.MultiTaskLasso
46 | * Random Forest: Sklearn.RandomForestClassifier
47 | * Labelled LDA: modified PLDA
48 | * Markov Chain: query-query similarity by text and session co-occurrence
49 | 
50 | ## Ensembles
51 | 
52 | * weighted averaging
53 | * linear model
54 | * cascading: feed xgboost
55 | 
56 | ## Post-processing
57 | 
58 | * Calibration: same label distribution as training set
59 | * Threshold: same average labels as training set
60 | 
61 | ## How to run
62 | 
63 |   * Dependencies:
64 |     1. XGBoost for GBM: https://github.com/tqchen/xgboost
65 |     2. Liblinear for LR and SVC: http://www.csie.ntu.edu.tw/~cjlin/liblinear/ 
66 | 
67 |   * Assumpation:
68 |     1. XGboost's path is ../../tools/xgboost3/ 
69 |     2. Liblinear's path is ../../tools/liblinear/ 
70 |     3. raw training data is in ../raw_data
71 |     4. need 3 folds ../trans_data, ../dataset, ../submit for temporary data
72 | 
73 |   * Run:
74 |     1. `cd V2`
75 |     2. `sh -x run_all.sh`
76 | 
77 |   * Steps:
78 |     1. split train.txt to dog/valid (for offline tuning): split_train.py 
79 |     2. merge information for each query: trans_train.py
80 |     3. generate features: prepare_feature.py
81 |     4. train and predict by xgboost: run_xgboost3_dog.sh
82 |     5. train and predict by liblinear: run_liblinear_dog.sh
83 |     6. ensemble: run_ensemble.sh
84 | 
85 | 


--------------------------------------------------------------------------------
/V1/averaging_methods.py:
--------------------------------------------------------------------------------
 1 | n_sample = 39013
 2 | 
 3 | label_map = {}
 4 | label_map2 = {}
 5 | max_label = 0
 6 | for line in open('../dataset/label_map'):
 7 |     label, c = line.strip().split('\t')
 8 |     label_map[int(c)] = label
 9 |     label_map2[label] = int(c)
10 |     max_label = max(max_label, int(c))
11 | print 'max_label:', max_label 
12 | 
13 | weights = []
14 | for i in xrange(n_sample):
15 |     weights.append([0]*(max_label+1))
16 | 
17 | if __name__ == '__main__':
18 |     import sys
19 |     if len(sys.argv) < 4:
20 |         print '<usage> out in1 m1 w1 [in2 m2 w2 ...]'
21 |         exit(1)
22 | 
23 |     i = 2
24 |     while i < len(sys.argv):
25 |         in_i = sys.argv[i]
26 |         m_i = sys.argv[i+1]
27 |         w_i = float(sys.argv[i+2])
28 |         print in_i, m_i, w_i
29 |         if m_i == 'xgboost':
30 |             nclass = int(sys.argv[i+3])
31 |             fin = open(in_i)
32 |             for isample in xrange(n_sample):
33 |                 for ipred in xrange(nclass):
34 |                     pred = float(fin.readline().strip())
35 |                     if ipred <= max_label:
36 |                         weights[isample][ipred] += pred * w_i
37 |             fin.close()
38 |             i += 4
39 |         elif m_i == 'liblinear':
40 |             fin = open(in_i)
41 |             fin.readline()
42 |             for isample in xrange(n_sample):
43 |                 preds = [float(v) for v in fin.readline().strip().split(' ')[1:]] 
44 |                 for ipred, pred in enumerate(preds):
45 |                     weights[isample][ipred] += pred * w_i
46 |             i += 3
47 |         elif m_i == 'semilda':
48 |             lda_map = {}
49 |             for line in open('../dataset/label_map_lda'):
50 |                 label, c = line.strip().split('\t')
51 |                 lda_map[int(c)] = label
52 |             feat_map = {}
53 |             fin = open('../trans_data/test.simple')
54 |             for line in open(in_i):
55 |                 preds = [float(v) for v in line.strip().split(' ')] 
56 |                 tot = sum(preds) + 0.001
57 |                 preds = [v/tot for v in preds]
58 |                 feats = fin.readline().strip().split('\t')[1]
59 |                 feat_map[feats] = preds
60 |             fin.close()
61 |             fin = open('../raw_data/test.txt')
62 |             for isample in xrange(n_sample):
63 |                 feats = fin.readline().strip()
64 |                 preds = feat_map[feats]
65 |                 for ipred, pred in enumerate(preds):
66 |                     ipred2 = label_map2[lda_map[ipred]]
67 |                     weights[isample][ipred2] += pred / tot * w_i
68 |             fin.close()
69 |             i += 3
70 |         elif m_i == 'sessionlabel':
71 |             fin = open(in_i)
72 |             for isample in xrange(n_sample):
73 |               try:
74 |                 pairs = [pair.split(':') for pair in fin.readline().strip().split(' || ')]
75 |                 if pairs and pairs[0] and pairs[0][0]:
76 |                   tot = sum([float(v[1]) for v in pairs])
77 |                   if tot > 0:
78 |                     for pair in pairs:
79 |                         label, v = pair[0], float(pair[1])
80 |                         label = ' | '.join(sorted(label.split(' | ')))
81 |                         if label not in label_map2: label = label.split(' | ')[0]
82 |                         c = label_map2[label]
83 |                         weights[isample][c] += v / tot * w_i
84 |               except:
85 |                 print pairs
86 |                 exit(1)
87 |             i += 3
88 | 
89 | 
90 |     with open(sys.argv[1], 'w') as fo:
91 |         for preds in weights:
92 |             for pred in preds:
93 |                 fo.write('%s\n' % pred)
94 |             #label = sorted([(i, v) for i, v in enumerate(preds)], key=lambda d:-d[1])[0][0]
95 |             #fo.write('%s\n' % label)
96 | 
97 | 
98 | 


--------------------------------------------------------------------------------
/V1/construct_liblinear_b1.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | def load_label_map(p_in):
 5 |     global max_label
 6 |     for line in open(p_in):
 7 |         label, c = line.strip().split('\t')
 8 |         label_map[int(c)] = label
 9 |         max_label = max(max_label, int(c))
10 | 
11 | def get_match(p_pred, p_test, p_out):
12 |     fo = open(p_out, 'w')
13 |     fl = open(p_test)
14 |     fin = open(p_pred)
15 |     fin.readline()
16 |     for line in fin: 
17 |         c = int(line.split(' ')[0])
18 |         #c = max(0, min(max_label, int(float(line.strip())+0.5)))
19 |         label = label_map[c]
20 |         feats = fl.readline().strip()
21 |         fo.write('%s\t%s\n' % (feats, label))
22 |     fin.close()
23 |     fl.close()
24 |     fo.close()
25 | 
26 | if __name__ == '__main__':
27 |     import sys
28 |     if len(sys.argv) != 5:
29 |         print '<usage> pred test label out'
30 |         exit(1)
31 |     
32 |     load_label_map(sys.argv[3])
33 |     print 'max_label:', max_label
34 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
35 | 


--------------------------------------------------------------------------------
/V1/construct_liblinear_multi.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | def load_label_map(p_in):
 5 |     global max_label
 6 |     for line in open(p_in):
 7 |         label, c = line.strip().split('\t')
 8 |         label_map[int(c)] = label
 9 |         max_label = max(max_label, int(c))
10 | 
11 | def get_match(p_pred, p_test, p_out):
12 |     fin = []
13 |     for i in range(7):
14 |         fin.append( open('%s_%s.txt' % (p_pred, i)) )
15 |         fin[i].readline()
16 |     fo = open(p_out, 'w')
17 | 
18 |     for line in open(p_test):
19 |         feats = line.strip()
20 | 
21 |         preds = []
22 |         for i in range(7):
23 |             #preds.append( (label_map[i], int(fin[i].readline().split(' ')[0]))) 
24 |             preds.append( (label_map[i], float(fin[i].readline().strip().split(' ')[2]))) 
25 |         labels = sorted(preds, key=lambda d:-d[1])
26 |         if labels[1][1] > 0.5:
27 |             label = labels[0][0] + ' | ' + labels[1][0]
28 |         else:
29 |             label = labels[0][0]
30 |         fo.write('%s\t%s\n' % (feats, label))
31 |     fo.close()
32 |     for i in range(7):
33 |         fin[i].close()
34 | 
35 | if __name__ == '__main__':
36 |     import sys
37 |     if len(sys.argv) != 5:
38 |         print '<usage> pred test label out'
39 |         exit(1)
40 |     
41 |     load_label_map(sys.argv[3])
42 |     print 'max_label:', max_label
43 | 
44 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
45 | 


--------------------------------------------------------------------------------
/V1/construct_maxlabel.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | def load_label_map(p_in):
 5 |     global max_label
 6 |     for line in open(p_in):
 7 |         label, c = line.strip().split('\t')
 8 |         label_map[int(c)] = label
 9 |         max_label = max(max_label, int(c))
10 | 
11 | def get_match(p_pred, p_test, p_out):
12 |     fo = open(p_out, 'w')
13 |     fl = open(p_test)
14 |     for line in open(p_pred):
15 |         c = max(0, min(max_label, int(float(line.strip())+0.5)))
16 |         label = label_map[c]
17 |         feats = fl.readline().strip()
18 |         fo.write('%s\t%s\n' % (feats, label))
19 |     fl.close()
20 |     fo.close()
21 | 
22 | if __name__ == '__main__':
23 |     import sys
24 |     if len(sys.argv) != 5:
25 |         print '<usage> pred test label out'
26 |         exit(1)
27 |     
28 |     load_label_map(sys.argv[3])
29 |     print 'max_label:', max_label
30 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
31 | 


--------------------------------------------------------------------------------
/V1/construct_maxprob.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | adjust = {}
 5 | def load_label_map(p_in):
 6 |     global max_label
 7 |     for line in open(p_in):
 8 |         label, c = line.strip().split('\t')
 9 |         label_map[int(c)] = label
10 |         max_label = max(max_label, int(c))
11 | 
12 |     for i in range(max_label+1):
13 |         adjust[i] = 0
14 | 
15 | def get_match(p_pred, p_test, p_out):
16 |     npred = len(open(p_pred).readlines()) / len(open(p_test).readlines())
17 |     fo = open(p_out, 'w')
18 |     fp = open(p_pred)
19 |     for line in open(p_test):
20 |         feats = line.strip()
21 | 
22 |         pred = []
23 |         for i in range(npred):
24 |             if i <= max_label:
25 |                 pred.append(float(fp.readline().strip())) 
26 |             else:
27 |                 fp.readline()
28 |         c = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0]
29 |         label = label_map[c]
30 |         fo.write('%s\t%s\n' % (feats, label))
31 |     fp.close()
32 |     fo.close()
33 | 
34 | if __name__ == '__main__':
35 |     import sys
36 |     if len(sys.argv) < 5:
37 |         print '<usage> pred test label out adjust'
38 |         exit(1)
39 |     
40 |     load_label_map(sys.argv[3])
41 |     print 'max_label:', max_label
42 |     if len(sys.argv) >= 6:
43 |         for line in open(sys.argv[5]):
44 |             cid, v = line.strip().split('\t')
45 |             adjust[int(cid)] = float(v)
46 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
47 | 


--------------------------------------------------------------------------------
/V1/construct_maxprob_balance.py:
--------------------------------------------------------------------------------
  1 | 
  2 | label_map = {}
  3 | label_map2 = {}
  4 | max_label = 0
  5 | adjust = {}
  6 | def load_label_map(p_in):
  7 |     global max_label
  8 |     for line in open(p_in):
  9 |         label, c = line.strip().split('\t')
 10 |         label_map[int(c)] = label
 11 |         label_map2[label] = int(c)
 12 |         max_label = max(max_label, int(c))
 13 |     print label_map
 14 |     print label_map2
 15 | 
 16 |     for c in range(max_label+1):
 17 |         adjust[c] = 0.
 18 | 
 19 | truth_dict = {}
 20 | def load_truth(p_truth):
 21 |     tot = 0.
 22 |     for line in open(p_truth):
 23 |         label = line.strip().split('\t')[1]
 24 |         label = ' | '.join(sorted(label.split(' | ')))
 25 |         if label not in label_map: label = label.split(' | ')[0]
 26 |         c = label_map2[label]
 27 |         truth_dict[c] = truth_dict.get(c, 0) + 1
 28 |         tot += 1.
 29 |     for c in range(max_label+1):
 30 |         truth_dict[c] = truth_dict.get(c, 0) / tot
 31 |     print truth_dict
 32 | 
 33 | def learn(p_pred, npred):
 34 |     i = 0
 35 |     preds = []
 36 |     pred = []
 37 |     for line in open(p_pred):
 38 |         j = i % npred
 39 |         if j <= max_label:
 40 |             pred.append(float(line.strip()))
 41 |         if j == npred-1:
 42 |             preds.append(pred)
 43 |             pred = []
 44 |         i += 1
 45 |     ite = 0
 46 |     while ite < 20:
 47 |         cids = {}
 48 |         tot = len(preds)
 49 |         for pred in preds:
 50 |             c = sorted([(k, v+adjust[k]) for (k, v) in enumerate(pred)], key=lambda d:-d[1])[0][0]
 51 |             cids[c] = cids.get(c, 0) + 1./tot
 52 |         for c in cids:
 53 |             if cids[c] < truth_dict[c] * 0.8: adjust[c] += 0.005
 54 |             elif cids[c] < truth_dict[c] * 0.2: adjust[c] += 0.015
 55 |             elif cids[c] > truth_dict[c] * 1.2: adjust[c] -= 0.005
 56 |             elif cids[c] > truth_dict[c] * 5: adjust[c] -= 0.015
 57 |         ite += 1
 58 | 
 59 | 
 60 | def get_match(p_pred, p_test, p_out):
 61 |     npred = len(open(p_pred).readlines()) / len(open(p_test).readlines())
 62 | 
 63 |     learn(p_pred, npred)
 64 | 
 65 |     fo = open(p_out, 'w')
 66 |     fp = open(p_pred)
 67 |     for line in open(p_test):
 68 |         feats = line.strip()
 69 | 
 70 |         pred = []
 71 |         for c in range(npred):
 72 |             if c <= max_label:
 73 |                 pred.append(float(fp.readline().strip())) 
 74 |             else:
 75 |                 fp.readline()
 76 |         sort_list = sorted([(c, v+adjust[c]) for c, v in enumerate(pred)], key=lambda d:-d[1])
 77 |         c = sort_list[0][0]
 78 |         label = label_map[c]
 79 |         if sort_list[1][1] > 0.45: 
 80 |             label = label + ' | ' + label_map[sort_list[1][0]]  
 81 |         label = ' | '.join(label.split(' | ')[:2])
 82 |         fo.write('%s\t%s\n' % (feats, label))
 83 |     fp.close()
 84 |     fo.close()
 85 | 
 86 | if __name__ == '__main__':
 87 |     import sys
 88 |     if len(sys.argv) < 5:
 89 |         print '<usage> pred test label out adjust'
 90 |         exit(1)
 91 |     
 92 |     load_label_map(sys.argv[3])
 93 |     load_truth('../trans_data/valid.label')
 94 |     print 'max_label:', max_label
 95 |     if len(sys.argv) >= 6:
 96 |         for line in open(sys.argv[5]):
 97 |             cid, v = line.strip().split('\t')
 98 |             adjust[int(cid)] = float(v)
 99 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
100 | 
101 | 


--------------------------------------------------------------------------------
/V1/construct_maxprob_multi.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | adjust = {}
 5 | def load_label_map(p_in):
 6 |     global max_label
 7 |     for line in open(p_in):
 8 |         label, c = line.strip().split('\t')
 9 |         label_map[int(c)] = label
10 |         max_label = max(max_label, int(c))
11 | 
12 |     for i in range(max_label+1):
13 |         adjust[i] = 0
14 | 
15 | def get_match(p_pred, p_test, p_out):
16 |     npred = len(open(p_pred).readlines()) / len(open(p_test).readlines())
17 |     fo = open(p_out, 'w')
18 |     fp = open(p_pred)
19 |     for line in open(p_test):
20 |         feats = line.strip()
21 | 
22 |         pred = []
23 |         for i in range(npred):
24 |             if i <= max_label:
25 |                 pred.append(float(fp.readline().strip())) 
26 |             else:
27 |                 fp.readline()
28 |         sort_list = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])
29 |         c = sort_list[0][0]
30 |         label = label_map[c]
31 |         if sort_list[1][1] > 0.45: 
32 |             label = label + ' | ' + label_map[sort_list[1][0]]  
33 |         label = ' | '.join(label.split(' | ')[:2])
34 |         fo.write('%s\t%s\n' % (feats, label))
35 |     fp.close()
36 |     fo.close()
37 | 
38 | if __name__ == '__main__':
39 |     import sys
40 |     if len(sys.argv) < 5:
41 |         print '<usage> pred test label out adjust'
42 |         exit(1)
43 |     
44 |     load_label_map(sys.argv[3])
45 |     print 'max_label:', max_label
46 |     if len(sys.argv) >= 6:
47 |         for line in open(sys.argv[5]):
48 |             cid, v = line.strip().split('\t')
49 |             adjust[int(cid)] = float(v)
50 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
51 | 


--------------------------------------------------------------------------------
/V1/construct_semilda.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | def load_label_map(p_in):
 5 |     global max_label
 6 |     for line in open(p_in):
 7 |         label, c = line.strip().split('\t')
 8 |         label_map[int(c)] = label
 9 |         max_label = max(max_label, int(c))
10 | 
11 | def get_match(p_pred, p_test_simple, p_test, p_out):
12 |     feat_map = {}
13 |     fl = open(p_test_simple)
14 |     fin = open(p_pred)
15 |     for line in fin: 
16 |         pred = [float(v) for v in line.strip().split(' ')]
17 |         tot = sum(pred) + 0.001
18 |         pred = [v/tot for v in pred]
19 |         c = sorted([(k, v) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0]
20 |         label = label_map[c]
21 |         feats = fl.readline().strip().split('\t')[1]
22 |         feat_map[feats] = label
23 |     fin.close()
24 |     fl.close()
25 |     fo = open(p_out, 'w')
26 |     for line in open(p_test):
27 |         feats = line.strip()
28 |         fo.write('%s\t%s\n' % (feats, feat_map[feats]))
29 |     fo.close()
30 | 
31 | if __name__ == '__main__':
32 |     import sys
33 |     if len(sys.argv) != 6:
34 |         print '<usage> pred test.simple test label out'
35 |         exit(1)
36 |     
37 |     load_label_map(sys.argv[4])
38 |     print 'max_label:', max_label
39 |     get_match(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[5])
40 | 


--------------------------------------------------------------------------------
/V1/construct_session_prob.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | def load_label_map(p_in):
 5 |     global max_label
 6 |     for line in open(p_in):
 7 |         label, c = line.strip().split('\t')
 8 |         label_map[int(c)] = label
 9 |         max_label = max(max_label, int(c))
10 | 
11 | 
12 | def get_match(p_pred, p_session, p_test, p_out):
13 |     npred = len(open(p_pred).readlines()) / len(open(p_session).readlines())
14 |     query_dict = {} 
15 |     query_num = {}
16 |     for line in open(p_test):
17 |         query = line.strip()
18 |         query_dict[query] = [0]*npred     
19 |         query_num[query] = 0
20 | 
21 |     fp = open(p_pred)
22 |     for line in open(p_session):
23 |         query_list = line.strip().split('\t')[1].split(';')
24 | 
25 |         pred = []
26 |         for i in range(npred):
27 |             if i <= max_label:
28 |                 pred.append(float(fp.readline().strip())) 
29 |             else:
30 |                 fp.readline()
31 |         for query in query_list:
32 |             if query in query_dict: 
33 |                 for i, v in enumerate(pred):
34 |                     query_dict[query][i] += v
35 |                 query_num[query] += 1
36 |     fp.close()
37 |     print 'query_dict', len(query_dict)
38 | 
39 |     not_in_test = 0
40 |     fo = open(p_out, 'w')
41 |     for line in open(p_test):
42 |         query = line.strip()
43 |         #c = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0]
44 |         #label = label_map[c]
45 |         if query_num[query] > 0:
46 |             for i in range(npred):
47 |                 fo.write('%s\n' % (query_dict[query][i]/query_num[query]))
48 |         else:
49 |             for i in range(npred):
50 |                 fo.write('0\n')
51 |             not_in_test += 1
52 |     fo.close()
53 |     print 'not in session:', not_in_test
54 | 
55 | if __name__ == '__main__':
56 |     import sys
57 |     if len(sys.argv) < 6:
58 |         print '<usage> pred session testid label out'
59 |         exit(1)
60 |     
61 |     load_label_map(sys.argv[4])
62 |     print 'max_label:', max_label
63 | 
64 |     get_match(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[5])
65 | 


--------------------------------------------------------------------------------
/V1/forest.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | # Import the random forest package
 4 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 
 5 | from sklearn.datasets import load_svmlight_file, load_svmlight_files
 6 | 
 7 | if len(sys.argv) < 5:
 8 |     print '<usage> p_trian p_test n_tree depth'
 9 |     exit(1)
10 | p_train = sys.argv[1]
11 | p_test = sys.argv[2]
12 | n_tree = int(sys.argv[3])
13 | depth = int(sys.argv[4])
14 | print p_train, p_test, n_tree, depth
15 | 
16 | # How to load data?
17 | #dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
18 | #X_train, y_train, X_test, y_test = load_svmlight_files(p_train, p_test)
19 | X_train, y_train = load_svmlight_file(p_train)
20 | X_train = X_train.toarray()
21 | X_test, y_test = load_svmlight_file(p_test, n_features=X_train.shape[1])
22 | X_test = X_test.toarray()
23 | 
24 | # Create the random forest object which will include all the parameters
25 | # for the fit
26 | forest = RandomForestClassifier(n_estimators=n_tree, criterion='gini', max_depth=depth, n_jobs=4)
27 | 
28 | # Fit the training data to the Survived labels and create the decision trees
29 | forest = forest.fit(X_train, y_train)
30 | 
31 | # Take the same decision trees and run it on the test data
32 | output = forest.predict(X_test)
33 | 
34 | with open('pred_forest.txt', 'w') as fo:
35 |     for o in output:
36 |         fo.write('%s\n' % o)
37 | 


--------------------------------------------------------------------------------
/V1/markov_all.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import gc
  3 | 
  4 | p_train = '../raw_data/train.txt'
  5 | p_test = '../raw_data/test.txt'
  6 | p_dog = '../trans_data/dog.txt'
  7 | p_valid = '../trans_data/valid.txt'
  8 | 
  9 | def norm(a2b):
 10 |     gc.disable()
 11 |     for a in a2b:
 12 |         tot = sum([a2b[a][b] for b in a2b[a]]) 
 13 |         for b in a2b[a]: a2b[a][b] /= tot
 14 |     gc.enable()
 15 | 
 16 | def multiple(a2b, b2c):
 17 |     gc.disable()
 18 |     a2c = {}
 19 |     for a in a2b:
 20 |         if a not in a2c: a2c[a] = {}
 21 |         for b in a2b[a]:
 22 |             v1 = a2b[a][b]
 23 |             if b in b2c:
 24 |                 for c in b2c[b]:
 25 |                     if c not in a2c[a]: a2c[a][c] = 0
 26 |                     v2 = b2c[b][c]
 27 |                     a2c[a][c] += a2b[a][b] * b2c[b][c]
 28 |     gc.enable()
 29 |     return a2c
 30 | 
 31 | class Converter(object):
 32 |     def __init__(self):
 33 |         self.max_idx = 0
 34 |         self.s_dict = {}
 35 |     def str2id(self, s):
 36 |         if s not in self.s_dict: 
 37 |             self.s_dict[s] = self.max_idx
 38 |             self.s_dict[self.max_idx] = s
 39 |             self.max_idx += 1
 40 |         return self.s_dict[s]
 41 |     def id2str(self, i):
 42 |         return self.s_dict.get(i, '')
 43 | 
 44 | def build(p_in):
 45 |     #gc.disable()
 46 |     convert = Converter()
 47 |     
 48 |     all_to_class = {}
 49 |     query_to_all = {}
 50 | 
 51 |     print 'loading from', p_in 
 52 | 
 53 |     session_query = set()
 54 |     session_all = set()
 55 |     for line in open(p_in):
 56 |         if not line.strip():
 57 |             #session end
 58 |             for q in session_query:
 59 |                 for q2 in session_all:
 60 |                     if q!=q2:
 61 |                         if q not in query_to_all: query_to_all[q] = {}
 62 |                         query_to_all[q][q2] = query_to_all[q].get(q2, 0) + 1.
 63 | 
 64 |             session_query = set()
 65 |             session_all = set()
 66 |             continue
 67 | 
 68 |         try:
 69 |             labels, query, title = line.strip().split('\t')
 70 |         except:
 71 |             labels, query = line.strip().split('\t')
 72 |             title = '-'
 73 | 
 74 |         query = convert.str2id(query)
 75 |         session_query.add(query)
 76 |         session_all.add(query)
 77 |         if title and title != '-':
 78 |             title = convert.str2id('t_' + title)
 79 |             session_all.add(title)
 80 | 
 81 |         if labels!='CLASS=TEST' and labels!='CLASS=UNKNOWN':
 82 |             label_list = labels.split(' | ')
 83 |             for label in label_list:
 84 |                 label = convert.str2id(label)
 85 |                 if query not in all_to_class: all_to_class[query] = {}
 86 |                 all_to_class[query][label] = all_to_class[query].get(label, 0) + 1.
 87 |                 if title and title != '-':
 88 |                     if title not in all_to_class: all_to_class[title] = {}
 89 |                     all_to_class[title][label] = all_to_class[title].get(label, 0) + 1.
 90 |     print 'load finished'
 91 | 
 92 |     norm(all_to_class)
 93 |     norm(query_to_all)
 94 |     print 'normalize finished'
 95 | 
 96 |     return all_to_class, query_to_all, convert
 97 | 
 98 | def markov(p_in, p_query, p_out1, p_out2):
 99 |     all_to_class, query_to_all, convert = build(p_in)
100 | 
101 |     query_to_class1 = multiple(query_to_all, all_to_class)
102 |     print 'round 1 finished'
103 |     query_to_class2 = multiple(query_to_all, query_to_class1)
104 |     print 'round 2 finished'
105 | 
106 |     fo1 = open(p_out1, 'w') 
107 |     fo2 = open(p_out2, 'w')    
108 |     for line in open(p_query):
109 |         query = line.strip()
110 |         query = convert.str2id(query)
111 |         if query not in query_to_class1:
112 |             fo1.write('\n')
113 |             fo2.write('\n')
114 | 
115 |         query = convert.id2str(query)
116 |         rs = ['%s:%s' % (convert.id2str(k), v) for k, v in query_to_class1.get(query, {}).items()]
117 |         rs2 = ['%s:%s' % (convert.id2str(k), v) for k, v in query_to_class2.get(query, {}).items()]
118 |         fo1.write('%s\n' % (' || '.join(rs))) 
119 |         fo2.write('%s\n' % (' || '.join(rs2))) 
120 |     fo1.close()
121 |     fo2.close()
122 |     print 'write to file finished'
123 | 
124 | markov(p_train, p_test, 'pred_markov1', 'pred_markov2')
125 | #markov(p_dog, p_valid, 'dog_pred1', 'dog_pred2')
126 | 
127 | 


--------------------------------------------------------------------------------
/V1/markov_sessoin_label.py:
--------------------------------------------------------------------------------
 1 | 
 2 | p_train = '../raw_data/train.txt'
 3 | p_test = '../raw_data/test.txt'
 4 | p_dog = '../trans_data/dog_refine.txt'
 5 | p_dog = '../trans_data/dog.txt'
 6 | p_valid = '../trans_data/valid.txt'
 7 | 
 8 | def markov(p_in, p_query, p_out):
 9 |     test_label = {}
10 |     unknown_label = {}
11 |     test_unknown = {}
12 | 
13 |     label_query = {}
14 |     unknown_query = {}
15 |     test_query = {}
16 |     session = [] 
17 |     for line in open(p_in):
18 |         if not line.strip():
19 |             n_query = len(label_query) + len(unknown_query) + len(test_query) 
20 |             label_dict = {}
21 |             for query, label in label_query.items():
22 |                 label_dict[label] = label_dict.get(label, 0) + 1
23 |             if len(label_dict) <= 1 or (len(label_dict) == 2 and (label_dict.keys()[0].find(label_dict.keys()[1])==0 or label_dict.keys()[1].find(label_dict.keys()[0])==0)):
24 |               for query in test_query:
25 |                 if query not in test_label: 
26 |                     test_label[query] = {}
27 |                 for query2, label in label_query.items():
28 |                     test_label[query][label] = test_label[query].get(label, 0) + 1
29 |                 if query not in test_unknown:
30 |                     test_unknown[query] = {}
31 |                 for query2 in unknown_query:
32 |                     test_unknown[query][query2] = test_unknown[query].get(query2, 0) + 1
33 |               for query in unknown_query:
34 |                 if query not in unknown_label: 
35 |                     unknown_label[query] = {}
36 |                 for query2, label in label_query.items():
37 |                     unknown_label[query][label] = unknown_label[query].get(label, 0) + 1
38 |             else:
39 |                 #print session
40 |                 pass
41 |             label_query = {}
42 |             unknown_query = {}
43 |             test_query = {}
44 |             session = []
45 |             continue
46 |         label, query = line.strip().split('\t')[:2]
47 |         label = ' | '.join(sorted(label.split(' | ')))
48 |         if not session or query != session[-1][1]:
49 |             session.append( (label, query) )
50 |         if label=='CLASS=TEST':
51 |             test_query[query] = 1
52 |         elif label=='CLASS=UNKNOWN':
53 |             if query.count(' ') > 1:
54 |                 unknown_query[query] = 1
55 |         else:
56 |             label_query[query] = label
57 | 
58 |     with open(p_out, 'w') as fo:
59 |         for line in open(p_query):
60 |             query = line.strip()
61 |             if query in test_label and test_label[query]:
62 |                 s = ['%s:%s' % (k, v) for k, v in test_label[query].items()]
63 |                 fo.write('%s\n' % (' || '.join(s)))
64 |             elif query in test_unknown:
65 |                 label_dict = {}
66 |                 for query2, v1 in test_unknown[query].items():
67 |                     if query2 in unknown_label:
68 |                         for label, v2 in unknown_label[query2].items():
69 |                             label_dict[label] = label_dict.get(label, 0) + v1*v2
70 |                 if label_dict:
71 |                     s = ['%s:%s' % (k, v) for k, v in label_dict.items()]
72 |                     fo.write('%s\n' % (' || '.join(s)))
73 |                 else:
74 |                     fo.write('\n')
75 |             else:
76 |                 fo.write('\n')
77 | 
78 | #markov(p_dog, p_valid, 'pred.txt')
79 | markov(p_train, p_test, 'pred_session_label.txt')
80 | 


--------------------------------------------------------------------------------
/V1/metric_F1.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Evaluation metric for the CIKM CUP 2014 
  4 | F-score
  5 | 
  6 | @author: Michael Liu 
  7 | Created: Thu July 22 2014
  8 | """
  9 | 
 10 | import os
 11 | import csv
 12 | import math
 13 | 
 14 | def create_solution_dictionary(solution):
 15 |     """
 16 |     """
 17 |     
 18 |     solnDict = {}
 19 |     with open(solution, 'rb') as f:
 20 |         for line in f:
 21 |             query, labels = line.strip().split('\t')
 22 |             label_list = labels.split(' | ')
 23 |             solnDict[query] = label_list
 24 |     return solnDict
 25 | 
 26 | def check_submission(submission, solutionDict):
 27 |     """
 28 |     """
 29 | 
 30 |     submissionDict = {}
 31 |     with open(submission, 'rb') as f:
 32 |         for line in f:
 33 |             query, labels = line.strip('\n').split('\t')
 34 |             if query in submissionDict:
 35 |                 print 'duplicate id in submission'
 36 |                 return False
 37 |             if query not in solutionDict:
 38 |                 print 'submission id must in solution'
 39 |                 return False
 40 |             label_list = labels.split(' | ')
 41 |             submissionDict[query] = label_list
 42 | 
 43 |     if len(submissionDict) != len(solutionDict):
 44 |         print 'size of submission and solution must be the same'
 45 |         return False
 46 |     return submissionDict
 47 | 
 48 | def F1_metric(solution, submission):
 49 |     """
 50 |     """
 51 | 
 52 |     solutionDict = create_solution_dictionary(solution)
 53 |     submissionDict = check_submission(submission, solutionDict)
 54 | 
 55 |     if submissionDict:
 56 |         true_positive = {}
 57 |         all_positive = {}
 58 |         groundtruth = {}
 59 | 
 60 |         for query in solutionDict:
 61 |             label_list = set(submissionDict[query])
 62 |             truth_list = set(solutionDict[query])
 63 |             for label in label_list: 
 64 |                 if label in truth_list:
 65 |                     true_positive[label] = true_positive.get(label, 0) + 1.
 66 |                 all_positive[label] = all_positive.get(label, 0) + 1
 67 |             for label in truth_list:
 68 |                 groundtruth[label] = groundtruth.get(label, 0) + 1
 69 | 
 70 |         precision_list = []
 71 |         recall_list = [] 
 72 |         for label in groundtruth:
 73 |             precision = 0
 74 |             if label in all_positive:
 75 |                 precision = true_positive.get(label, 0) / all_positive.get(label, 0)
 76 |             print label, 'precision', precision 
 77 | 
 78 |             recall = true_positive[label] / groundtruth[label]
 79 |             print label, 'recall', recall 
 80 | 
 81 |             precision_list.append(precision)
 82 |             recall_list.append(recall)
 83 | 
 84 |         ap = sum(precision_list) / len(recall_list)
 85 |         ar = sum(recall_list) / len(recall_list)
 86 |         F1 = 2*ap*ar / (ap + ar)
 87 |         print 'ap', ap
 88 |         print 'ar', ar
 89 |         print 'F1', F1
 90 | 
 91 | if __name__ == "__main__":
 92 |     solutionFile = ""
 93 |     submissionFile = ""
 94 | 
 95 |     import sys
 96 |     if len(sys.argv) < 3:
 97 |         print '<usage> solution submission'
 98 |         exit(-1)
 99 |     solutionFile = sys.argv[1]
100 |     submissionFile = sys.argv[2]
101 |     
102 |     F1_metric(solutionFile, submissionFile)
103 |     
104 |     
105 | 


--------------------------------------------------------------------------------
/V1/metric_confusion.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Evaluation metric for the CIKM CUP 2014 
 4 | F-score
 5 | 
 6 | @author: Michael Liu 
 7 | Created: Thu July 22 2014
 8 | """
 9 | 
10 | import os
11 | import csv
12 | import math
13 | 
14 | def create_solution_dictionary(solution):
15 |     """
16 |     """
17 |     
18 |     solnDict = {}
19 |     with open(solution, 'rb') as f:
20 |         for line in f:
21 |             query, labels = line.strip().split('\t')
22 |             solnDict[query] = labels
23 |     return solnDict
24 | 
25 | def check_submission(submission, solutionDict):
26 |     """
27 |     """
28 | 
29 |     submissionDict = {}
30 |     with open(submission, 'rb') as f:
31 |         for line in f:
32 |             query, labels = line.strip('\n').split('\t')
33 |             if query in submissionDict:
34 |                 print 'duplicate id in submission'
35 |                 return False
36 |             if query not in solutionDict:
37 |                 print 'submission id must in solution'
38 |                 return False
39 |             submissionDict[query] = labels
40 | 
41 |     if len(submissionDict) != len(solutionDict):
42 |         print 'size of submission and solution must be the same'
43 |         return False
44 |     return submissionDict
45 | 
46 | def confusion(solution, submission):
47 |     """
48 |     """
49 | 
50 |     solutionDict = create_solution_dictionary(solution)
51 |     submissionDict = check_submission(submission, solutionDict)
52 | 
53 |     if submissionDict:
54 |         matrix = {}
55 |         for query in solutionDict:
56 |             label = submissionDict[query]
57 |             truth = solutionDict[query]
58 |             matrix[truth] = matrix.get(truth, {})
59 |             matrix[truth][label] = matrix[truth].get(label, 0) + 1
60 | 
61 |         confusion = []
62 |         for truth in matrix:
63 |             label_list = sorted(matrix[truth].items(), key=lambda d:-d[1])
64 |             confusion.append( (truth, label_list) )
65 |         for truth, label_list in sorted(confusion, key=lambda d:-d[1][1][1] if len(d[1])>=2 else 0):
66 |             label_list_str = ['%s:%s' % (k, v) for k, v in label_list]
67 |             print truth, ' ==>> ', '   '.join(label_list_str)
68 | 
69 | if __name__ == "__main__":
70 |     solutionFile = ""
71 |     submissionFile = ""
72 | 
73 |     import sys
74 |     if len(sys.argv) < 3:
75 |         print '<usage> solution submission'
76 |         exit(-1)
77 |     solutionFile = sys.argv[1]
78 |     submissionFile = sys.argv[2]
79 |     
80 |     confusion(solutionFile, submissionFile)
81 |     
82 |     
83 | 


--------------------------------------------------------------------------------
/V1/prepare1.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_dog_train_feat = '../trans_data/dog.simple'
  3 | p_dog_valid_feat = '../trans_data/valid.simple'
  4 | p_dog_valid_id = '../trans_data/valid.txt'
  5 | 
  6 | p_pig_train_feat = '../trans_data/train.simple'
  7 | p_pig_valid_feat = '../trans_data/test.simple'
  8 | p_pig_valid_id = '../raw_data/test.txt'
  9 | 
 10 | p_label_map = '../dataset/label_map'
 11 | p_dog_train = '../dataset/dog_train'
 12 | p_dog_test = '../dataset/dog_test'
 13 | p_pig_train = '../dataset/pig_train'
 14 | p_pig_test = '../dataset/pig_test'
 15 | 
 16 | min_word_df = 5
 17 | min_title_df = 10
 18 | 
 19 | label_map = {}
 20 | cur_label = 0
 21 | word_map = {}
 22 | cur_word = 1 
 23 | 
 24 | pig_train = []
 25 | test_dict = {}
 26 | pig_test = [] 
 27 | 
 28 | label_df = {}
 29 | word_df = {}
 30 | 
 31 | def get_df(p_in):
 32 |   for line in open(p_in):
 33 |     row = line.strip().split('\t')
 34 |     label = row[0]
 35 |     label = ' | '.join(sorted(label.split(' | ')))
 36 |     label_df[label] = label_df.get(label, 0) + 1
 37 |     query = row[1]
 38 |     titles = row[2] if len(row)>=3 else ''
 39 |     session_queries = row[5] if len(row)>=6 else ''
 40 |     session_titles = row[6] if len(row)>=7 else ''
 41 | 
 42 |     feat_list = query.split(' ')
 43 |     for i, word in enumerate(feat_list):
 44 |         #if not word: continue
 45 |         word_df[word] = word_df.get(word, 0) + 1
 46 |         if i>=1:
 47 |             word = ' '.join(feat_list[i-1:i+1])
 48 |             word_df[word] = word_df.get(word, 0) + 1
 49 |         word = '%s_%s' % (i, feat_list[i])
 50 |         word_df[word] = word_df.get(word, 0) + 1
 51 |         if i >= len(feat_list)/2:
 52 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
 53 |             word_df[word] = word_df.get(word, 0) + 1
 54 | 
 55 |     for pair in titles.split(';'):
 56 |         if not pair: continue
 57 |         title, freq = pair.split(':')
 58 |         feat_list = title.split(' ')
 59 |         for i, word in enumerate(feat_list):
 60 |             #if not word: continue
 61 |             word = 't_' + word
 62 |             word_df[word] = word_df.get(word, 0) + 1
 63 |             if i>=1:
 64 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
 65 |                 word_df[word] = word_df.get(word, 0) + 1
 66 | 
 67 |     for pair in session_queries.split(';'):
 68 |         if not pair: continue
 69 |         title, freq = pair.split(':')
 70 |         feat_list = title.split(' ')
 71 |         for i, word in enumerate(feat_list):
 72 |             #if not word: continue
 73 |             word = 'sq_' + word
 74 |             word_df[word] = word_df.get(word, 0) + 1
 75 |     for pair in session_titles.split(';'):
 76 |         if not pair: continue
 77 |         title, freq = pair.split(':')
 78 |         feat_list = title.split(' ')
 79 |         for i, word in enumerate(feat_list):
 80 |             #if not word: continue
 81 |             word = 'st_' + word
 82 |             word_df[word] = word_df.get(word, 0) + 1
 83 |             if i>=1:
 84 |                 word = 'st_' + ' '.join(feat_list[i-1:i+1])
 85 |                 word_df[word] = word_df.get(word, 0) + 1
 86 | 
 87 | def prepare(p_in, p_out, isTrain, p_in2):
 88 |   global cur_label
 89 |   global cur_word
 90 | 
 91 |   if isTrain: fo = open(p_out, 'w')
 92 |   fin2 = open(p_in2)
 93 | 
 94 |   for line in open(p_in):
 95 |     row = line.strip().split('\t')
 96 |     label = row[0]
 97 |     query = row[1]
 98 |     titles = row[2] if len(row)>=3 else ''
 99 |     labels = row[3] if len(row)>=4 else ''
100 |     session_queries = row[5] if len(row)>=6 else ''
101 |     session_titles = row[6] if len(row)>=7 else ''
102 | 
103 |     row2 = fin2.readline().split('\t')
104 |     if row2[0] == query:
105 |         stats = row2[1]
106 |         stats2 = row2[2].strip()
107 |     else:
108 |         print 'query mismatch'
109 |         exit(1)
110 | 
111 |     if isTrain:
112 |         label = ' | '.join(sorted(label.split(' | ')))
113 |         if label_df[label] < 200:
114 |             label = label.split(' ')[0]
115 |         if label not in label_map:
116 |             label_map[label] = cur_label
117 |             cur_label += 1
118 | 
119 |     feat_list = query.split(' ')
120 |     word_tf = {}
121 |     for i, word in enumerate(feat_list):
122 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
123 |             word_map[word] = cur_word
124 |             cur_word += 1
125 |         if word in word_map:
126 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
127 |         if i>=1:
128 |             word = ' '.join(feat_list[i-1:i+1])
129 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
130 |                 word_map[word] = cur_word
131 |                 cur_word += 1
132 |             if word in word_map:
133 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
134 |         word = '%s_%s' % (i, feat_list[i])
135 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
136 |             word_map[word] = cur_word
137 |             cur_word += 1
138 |         if word in word_map:
139 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
140 |         if i >= len(feat_list) / 2:
141 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
142 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
143 |                 word_map[word] = cur_word
144 |                 cur_word += 1
145 |             if word in word_map:
146 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
147 | 
148 |     tot_freq = 0
149 |     for pair in titles.split(';'):
150 |         if not pair: continue
151 |         title, freq = pair.split(':')
152 |         tot_freq += float(freq)
153 |     word_tf2 = {}
154 |     for pair in titles.split(';'):
155 |         if not pair: continue
156 |         title, freq = pair.split(':')
157 |         freq = float(freq)
158 |         feat_list = title.split(' ')
159 |         for i, word in enumerate(feat_list):
160 |             word = 't_' + word
161 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
162 |                 word_map[word] = cur_word
163 |                 cur_word += 1
164 |             if word in word_map:
165 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
166 |             if word in word_map and word_map[word] in word_tf:
167 |                 word = 'qt_' + word
168 |                 if isTrain and word not in word_map:
169 |                     word_map[word] = cur_word
170 |                     cur_word += 1
171 |                 if word in word_map:
172 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1.*freq*1./tot_freq
173 |             if i>=1:
174 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
175 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
176 |                     word_map[word] = cur_word
177 |                     cur_word += 1
178 |                 if word in word_map:
179 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
180 | 
181 |     tot_freq = 0
182 |     for pair in session_queries.split(';'):
183 |         if not pair: continue
184 |         title, freq = pair.split(':')
185 |         tot_freq += float(freq)
186 |     for pair in session_queries.split(';'):
187 |         if not pair: continue
188 |         title, freq = pair.split(':')
189 |         freq = float(freq)
190 |         feat_list = title.split(' ')
191 |         for i, word in enumerate(feat_list):
192 |             word = 'sq_' + word
193 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
194 |                 word_map[word] = cur_word
195 |                 cur_word += 1
196 |             if word in word_map:
197 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
198 | 
199 |     tot_freq = 0
200 |     for pair in session_titles.split(';'):
201 |         if not pair: continue
202 |         title, freq = pair.split(':')
203 |         tot_freq += float(freq)
204 |     for pair in session_titles.split(';'):
205 |         if not pair: continue
206 |         title, freq = pair.split(':')
207 |         freq = float(freq)
208 |         feat_list = title.split(' ')
209 |         for i, word in enumerate(feat_list):
210 |             word = 'st_' + word
211 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
212 |                 word_map[word] = cur_word
213 |                 cur_word += 1
214 |             if word in word_map:
215 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
216 |             if i>=1:
217 |                 word = 'st_' + ' '.join(feat_list[i-1:i+1])
218 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
219 |                     word_map[word] = cur_word
220 |                     cur_word += 1
221 |                 if word in word_map:
222 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
223 | 
224 |     for pair in labels.split(';'):
225 |         if not pair: continue
226 |         word, freq = pair.split(':')
227 |         freq = float(freq)
228 |         if isTrain and word not in word_map:
229 |             word_map[word] = cur_word
230 |             cur_word += 1
231 |         if word in word_map:
232 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
233 | 
234 |     for pair in stats.split(';') + stats2.split(';'):
235 |         if not pair: continue
236 |         word, freq = pair.split(':')
237 |         freq = float(freq)
238 |         if isTrain and word not in word_map:
239 |             word_map[word] = cur_word
240 |             cur_word += 1
241 |         if word in word_map:
242 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
243 | 
244 |     if isTrain:
245 |         label, F = (label_map[label], word_tf.items() + word_tf2.items())
246 |         F = sorted(F, key=lambda d:d[0])
247 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
248 |         fo.write('%s %s\n' % (label, f_str))
249 |     else:
250 |         test_dict[query] = word_tf.items() + word_tf2.items()
251 | 
252 |   if isTrain: fo.close()
253 |   fin2.close()
254 | 
255 |   # save label map
256 |   with open(p_label_map, 'w') as fo:
257 |     for label in label_map:
258 |         fo.write('%s\t%s\n' % (label, label_map[label]))
259 | 
260 | 
261 | def save_test(p_in, p_out):
262 |   with open(p_out, 'w') as fo:
263 |     for line in open(p_in):
264 |         query = line.strip()
265 |         F = test_dict[query]
266 |         F = sorted(F, key=lambda d:d[0])
267 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
268 |         fo.write('%s %s\n' % (0, f_str))
269 |         
270 | #for dog
271 | #get_df(p_dog_train_feat)
272 | #prepare(p_dog_train_feat, p_dog_train, True, '../trans_data/dog.simple2')
273 | #for pig
274 | get_df(p_pig_train_feat)
275 | prepare(p_pig_train_feat, p_pig_train, True, '../trans_data/train.simple2')
276 | 
277 | #prepare(p_dog_valid_feat, '', False, '../trans_data/valid.simple2')
278 | prepare(p_pig_valid_feat, '', False, '../trans_data/test.simple2')
279 | #save_test(p_dog_valid_id, p_dog_test)
280 | save_test(p_pig_valid_id, p_pig_test)
281 | 
282 | 


--------------------------------------------------------------------------------
/V1/prepare2.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_dog_train_feat = '../trans_data/dog.simple'
  3 | p_dog_valid_feat = '../trans_data/valid.simple'
  4 | p_dog_valid_id = '../trans_data/valid.txt'
  5 | 
  6 | p_pig_train_feat = '../trans_data/train.simple'
  7 | p_pig_valid_feat = '../trans_data/test.simple'
  8 | p_pig_valid_id = '../raw_data/test.txt'
  9 | 
 10 | p_label_map = '../dataset/label_map'
 11 | p_dog_train = '../dataset/dog_train'
 12 | p_dog_test = '../dataset/dog_test'
 13 | p_pig_train = '../dataset/pig_train2'
 14 | p_pig_test = '../dataset/pig_test2'
 15 | 
 16 | min_word_df = 5
 17 | min_title_df = 10
 18 | 
 19 | label_map = {}
 20 | cur_label = 0
 21 | word_map = {}
 22 | cur_word = 1 
 23 | 
 24 | pig_train = []
 25 | test_dict = {}
 26 | pig_test = [] 
 27 | 
 28 | label_df = {}
 29 | word_df = {}
 30 | 
 31 | def get_df(p_in):
 32 |   for line in open(p_in):
 33 |     row = line.strip().split('\t')
 34 |     label = row[0]
 35 |     label = ' | '.join(sorted(label.split(' | ')))
 36 |     label_df[label] = label_df.get(label, 0) + 1
 37 |     query = row[1]
 38 |     titles = row[2] if len(row)>=3 else ''
 39 |     session_queries = row[5] if len(row)>=6 else ''
 40 |     session_titles = row[6] if len(row)>=7 else ''
 41 | 
 42 |     feat_list = query.split(' ')
 43 |     for i, word in enumerate(feat_list):
 44 |         #if not word: continue
 45 |         word_df[word] = word_df.get(word, 0) + 1
 46 |         if i>=1:
 47 |             word = ' '.join(feat_list[i-1:i+1])
 48 |             word_df[word] = word_df.get(word, 0) + 1
 49 |         word = '%s_%s' % (i, feat_list[i])
 50 |         word_df[word] = word_df.get(word, 0) + 1
 51 |         if i >= len(feat_list)/2:
 52 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
 53 |             word_df[word] = word_df.get(word, 0) + 1
 54 | 
 55 |     for pair in titles.split(';'):
 56 |         if not pair: continue
 57 |         title, freq = pair.split(':')
 58 |         feat_list = title.split(' ')
 59 |         for i, word in enumerate(feat_list):
 60 |             #if not word: continue
 61 |             word = 't_' + word
 62 |             word_df[word] = word_df.get(word, 0) + 1
 63 |             if i>=1:
 64 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
 65 |                 word_df[word] = word_df.get(word, 0) + 1
 66 | 
 67 |     for pair in session_queries.split(';'):
 68 |         if not pair: continue
 69 |         title, freq = pair.split(':')
 70 |         feat_list = title.split(' ')
 71 |         for i, word in enumerate(feat_list):
 72 |             #if not word: continue
 73 |             word = 'sq_' + word
 74 |             word_df[word] = word_df.get(word, 0) + 1
 75 |     for pair in session_titles.split(';'):
 76 |         if not pair: continue
 77 |         title, freq = pair.split(':')
 78 |         feat_list = title.split(' ')
 79 |         for i, word in enumerate(feat_list):
 80 |             #if not word: continue
 81 |             word = 'st_' + word
 82 |             word_df[word] = word_df.get(word, 0) + 1
 83 |             if i>=1:
 84 |                 word = 'st_' + ' '.join(feat_list[i-1:i+1])
 85 |                 word_df[word] = word_df.get(word, 0) + 1
 86 | 
 87 | def prepare(p_in, p_out, isTrain, p_in2):
 88 |   global cur_label
 89 |   global cur_word
 90 | 
 91 |   if isTrain: fo = open(p_out, 'w')
 92 |   fin2 = open(p_in2)
 93 | 
 94 |   for line in open(p_in):
 95 |     row = line.strip().split('\t')
 96 |     label = row[0]
 97 |     query = row[1]
 98 |     titles = row[2] if len(row)>=3 else ''
 99 |     labels = row[3] if len(row)>=4 else ''
100 |     session_queries = row[5] if len(row)>=6 else ''
101 |     session_titles = row[6] if len(row)>=7 else ''
102 | 
103 |     row2 = fin2.readline().split('\t')
104 |     if row2[0] == query:
105 |         stats = row2[1]
106 |         stats2 = row2[2].strip()
107 |     else:
108 |         print 'query mismatch'
109 |         exit(1)
110 | 
111 |     if isTrain:
112 |         label = ' | '.join(sorted(label.split(' | ')))
113 |         if label_df[label] < 200:
114 |             label = label.split(' ')[0]
115 |         if label not in label_map:
116 |             label_map[label] = cur_label
117 |             cur_label += 1
118 | 
119 |     feat_list = query.split(' ')
120 |     word_tf = {}
121 |     for i, word in enumerate(feat_list):
122 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
123 |             word_map[word] = cur_word
124 |             cur_word += 1
125 |         if word in word_map:
126 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
127 |         if i>=1:
128 |             word = ' '.join(feat_list[i-1:i+1])
129 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
130 |                 word_map[word] = cur_word
131 |                 cur_word += 1
132 |             if word in word_map:
133 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
134 |         word = '%s_%s' % (i, feat_list[i])
135 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
136 |             word_map[word] = cur_word
137 |             cur_word += 1
138 |         if word in word_map:
139 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
140 |         if i >= len(feat_list) / 2:
141 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
142 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
143 |                 word_map[word] = cur_word
144 |                 cur_word += 1
145 |             if word in word_map:
146 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
147 | 
148 |     tot_freq = 0
149 |     for pair in titles.split(';'):
150 |         if not pair: continue
151 |         title, freq = pair.split(':')
152 |         tot_freq += float(freq)
153 |     word_tf2 = {}
154 |     for pair in titles.split(';'):
155 |         if not pair: continue
156 |         title, freq = pair.split(':')
157 |         freq = float(freq)
158 |         feat_list = title.split(' ')
159 |         for i, word in enumerate(feat_list):
160 |             #word = 't_' + word
161 |             #if isTrain and word_df[word] >= min_title_df and word not in word_map:
162 |             #    word_map[word] = cur_word
163 |             #    cur_word += 1
164 |             #if word in word_map:
165 |             #    word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
166 |             if i>=1:
167 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
168 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
169 |                     word_map[word] = cur_word
170 |                     cur_word += 1
171 |                 if word in word_map:
172 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
173 | 
174 |     '''
175 |     tot_freq = 0
176 |     for pair in session_queries.split(';'):
177 |         if not pair: continue
178 |         title, freq = pair.split(':')
179 |         tot_freq += float(freq)
180 |     for pair in session_queries.split(';'):
181 |         if not pair: continue
182 |         title, freq = pair.split(':')
183 |         freq = float(freq)
184 |         feat_list = title.split(' ')
185 |         for i, word in enumerate(feat_list):
186 |             word = 'sq_' + word
187 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
188 |                 word_map[word] = cur_word
189 |                 cur_word += 1
190 |             if word in word_map:
191 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
192 |     '''
193 | 
194 |     tot_freq = 0
195 |     for pair in session_titles.split(';'):
196 |         if not pair: continue
197 |         title, freq = pair.split(':')
198 |         tot_freq += float(freq)
199 |     for pair in session_titles.split(';'):
200 |         if not pair: continue
201 |         title, freq = pair.split(':')
202 |         freq = float(freq)
203 |         feat_list = title.split(' ')
204 |         for i, word in enumerate(feat_list):
205 |             #word = 'st_' + word
206 |             #if isTrain and word_df[word] >= min_title_df and word not in word_map:
207 |             #    word_map[word] = cur_word
208 |             #    cur_word += 1
209 |             #if word in word_map:
210 |             #    word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
211 |             if i>=1:
212 |                 word = 'st_' + ' '.join(feat_list[i-1:i+1])
213 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
214 |                     word_map[word] = cur_word
215 |                     cur_word += 1
216 |                 if word in word_map:
217 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
218 | 
219 |     for pair in labels.split(';'):
220 |         if not pair: continue
221 |         word, freq = pair.split(':')
222 |         freq = float(freq)
223 |         if isTrain and word not in word_map:
224 |             word_map[word] = cur_word
225 |             cur_word += 1
226 |         if word in word_map:
227 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
228 | 
229 |     for pair in stats.split(';') + stats2.split(';'):
230 |         if not pair: continue
231 |         word, freq = pair.split(':')
232 |         freq = float(freq)
233 |         if isTrain and word not in word_map:
234 |             word_map[word] = cur_word
235 |             cur_word += 1
236 |         if word in word_map:
237 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
238 | 
239 |     if isTrain:
240 |         label, F = (label_map[label], word_tf.items() + word_tf2.items())
241 |         F = sorted(F, key=lambda d:d[0])
242 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
243 |         fo.write('%s %s\n' % (label, f_str))
244 |     else:
245 |         test_dict[query] = word_tf.items() + word_tf2.items()
246 | 
247 |   if isTrain: fo.close()
248 |   fin2.close()
249 | 
250 |   # save label map
251 |   with open(p_label_map, 'w') as fo:
252 |     for label in label_map:
253 |         fo.write('%s\t%s\n' % (label, label_map[label]))
254 | 
255 | 
256 | def save_test(p_in, p_out):
257 |   with open(p_out, 'w') as fo:
258 |     for line in open(p_in):
259 |         query = line.strip()
260 |         F = test_dict[query]
261 |         F = sorted(F, key=lambda d:d[0])
262 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
263 |         fo.write('%s %s\n' % (0, f_str))
264 |         
265 | #for dog
266 | #get_df(p_dog_train_feat)
267 | #prepare(p_dog_train_feat, p_dog_train, True, '../trans_data/dog.simple2')
268 | #for pig
269 | get_df(p_pig_train_feat)
270 | prepare(p_pig_train_feat, p_pig_train, True, '../trans_data/train.simple2')
271 | 
272 | #prepare(p_dog_valid_feat, '', False, '../trans_data/valid.simple2')
273 | prepare(p_pig_valid_feat, '', False, '../trans_data/test.simple2')
274 | #save_test(p_dog_valid_id, p_dog_test)
275 | save_test(p_pig_valid_id, p_pig_test)
276 | 
277 | 


--------------------------------------------------------------------------------
/V1/prepare3.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_dog_train_feat = '../trans_data/dog.simple'
  3 | p_dog_valid_feat = '../trans_data/valid.simple'
  4 | p_dog_valid_id = '../trans_data/valid.txt'
  5 | 
  6 | p_pig_train_feat = '../trans_data/train.simple'
  7 | p_pig_valid_feat = '../trans_data/test.simple'
  8 | p_pig_valid_id = '../raw_data/test.txt'
  9 | 
 10 | p_label_map = '../dataset/label_map'
 11 | p_dog_train = '../dataset/dog_train'
 12 | p_dog_test = '../dataset/dog_test'
 13 | p_pig_train = '../dataset/pig_train'
 14 | p_pig_test = '../dataset/pig_test'
 15 | 
 16 | min_word_df = 5
 17 | min_title_df = 10
 18 | 
 19 | label_map = {}
 20 | cur_label = 0
 21 | word_map = {}
 22 | cur_word = 1 
 23 | 
 24 | pig_train = []
 25 | test_dict = {}
 26 | pig_test = [] 
 27 | 
 28 | label_df = {}
 29 | word_df = {}
 30 | 
 31 | def get_df(p_in):
 32 |   for line in open(p_in):
 33 |     row = line.strip().split('\t')
 34 |     label = row[0]
 35 |     label = ' | '.join(sorted(label.split(' | ')))
 36 |     label_df[label] = label_df.get(label, 0) + 1
 37 |     query = row[1]
 38 |     titles = row[2] if len(row)>=3 else ''
 39 |     session_queries = row[5] if len(row)>=6 else ''
 40 |     session_titles = row[6] if len(row)>=7 else ''
 41 | 
 42 |     feat_list = query.split(' ')
 43 |     for i, word in enumerate(feat_list):
 44 |         #if not word: continue
 45 |         word_df[word] = word_df.get(word, 0) + 1
 46 |         if i>=1:
 47 |             word = ' '.join(feat_list[i-1:i+1])
 48 |             word_df[word] = word_df.get(word, 0) + 1
 49 |             word = '%s_%s' % (i-1, ' '.join(feat_list[i-1:i+1]))
 50 |             word_df[word] = word_df.get(word, 0) + 1
 51 |         word = '%s_%s' % (i, feat_list[i])
 52 |         word_df[word] = word_df.get(word, 0) + 1
 53 |         if i >= len(feat_list)/2:
 54 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
 55 |             word_df[word] = word_df.get(word, 0) + 1
 56 | 
 57 |     for pair in titles.split(';'):
 58 |         if not pair: continue
 59 |         title, freq = pair.split(':')
 60 |         feat_list = title.split(' ')
 61 |         for i, word in enumerate(feat_list):
 62 |             #if not word: continue
 63 |             word = 't_' + word
 64 |             word_df[word] = word_df.get(word, 0) + 1
 65 |             if i>=1:
 66 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
 67 |                 word_df[word] = word_df.get(word, 0) + 1
 68 | 
 69 |     for pair in session_queries.split(';'):
 70 |         if not pair: continue
 71 |         title, freq = pair.split(':')
 72 |         feat_list = title.split(' ')
 73 |         for i, word in enumerate(feat_list):
 74 |             #if not word: continue
 75 |             word = 'sq_' + word
 76 |             word_df[word] = word_df.get(word, 0) + 1
 77 |     for pair in session_titles.split(';'):
 78 |         if not pair: continue
 79 |         title, freq = pair.split(':')
 80 |         feat_list = title.split(' ')
 81 |         for i, word in enumerate(feat_list):
 82 |             #if not word: continue
 83 |             word = 'st_' + word
 84 |             word_df[word] = word_df.get(word, 0) + 1
 85 | 
 86 | def prepare(p_in, p_out, isTrain, p_in2):
 87 |   global cur_label
 88 |   global cur_word
 89 | 
 90 |   if isTrain: fo = open(p_out, 'w')
 91 |   fin2 = open(p_in2)
 92 | 
 93 |   for line in open(p_in):
 94 |     row = line.strip().split('\t')
 95 |     label = row[0]
 96 |     query = row[1]
 97 |     titles = row[2] if len(row)>=3 else ''
 98 |     labels = row[3] if len(row)>=4 else ''
 99 |     session_queries = row[5] if len(row)>=6 else ''
100 |     session_titles = row[6] if len(row)>=7 else ''
101 | 
102 |     row2 = fin2.readline().split('\t')
103 |     if row2[0] == query:
104 |         stats = row2[1]
105 |         stats2 = row2[2].strip()
106 |     else:
107 |         print 'query mismatch'
108 |         exit(1)
109 | 
110 |     if isTrain:
111 |         label = ' | '.join(sorted(label.split(' | ')))
112 |         if label_df[label] < 200:
113 |             label = label.split(' ')[0]
114 |         if label not in label_map:
115 |             label_map[label] = cur_label
116 |             cur_label += 1
117 | 
118 |     feat_list = query.split(' ')
119 |     word_tf = {}
120 |     for i, word in enumerate(feat_list):
121 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
122 |             word_map[word] = cur_word
123 |             cur_word += 1
124 |         if word in word_map:
125 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
126 |         if i>=1:
127 |             word = ' '.join(feat_list[i-1:i+1])
128 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
129 |                 word_map[word] = cur_word
130 |                 cur_word += 1
131 |             if word in word_map:
132 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
133 |             word = '%s_%s' % (i-1, ' '.join(feat_list[i-1:i+1]))
134 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
135 |                 word_map[word] = cur_word
136 |                 cur_word += 1
137 |             if word in word_map:
138 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
139 |         word = '%s_%s' % (i, feat_list[i])
140 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
141 |             word_map[word] = cur_word
142 |             cur_word += 1
143 |         if word in word_map:
144 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
145 |         if i >= len(feat_list) / 2:
146 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
147 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
148 |                 word_map[word] = cur_word
149 |                 cur_word += 1
150 |             if word in word_map:
151 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
152 | 
153 |     tot_freq = 0
154 |     for pair in titles.split(';'):
155 |         if not pair: continue
156 |         title, freq = pair.split(':')
157 |         tot_freq += float(freq)
158 |     word_tf2 = {}
159 |     for pair in titles.split(';'):
160 |         if not pair: continue
161 |         title, freq = pair.split(':')
162 |         freq = float(freq)
163 |         feat_list = title.split(' ')
164 |         for i, word in enumerate(feat_list):
165 |             word = 't_' + word
166 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
167 |                 word_map[word] = cur_word
168 |                 cur_word += 1
169 |             if word in word_map:
170 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
171 |             if i>=1:
172 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
173 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
174 |                     word_map[word] = cur_word
175 |                     cur_word += 1
176 |                 if word in word_map:
177 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
178 | 
179 |     tot_freq = 0
180 |     for pair in session_queries.split(';'):
181 |         if not pair: continue
182 |         title, freq = pair.split(':')
183 |         tot_freq += float(freq)
184 |     for pair in session_queries.split(';'):
185 |         if not pair: continue
186 |         title, freq = pair.split(':')
187 |         freq = float(freq)
188 |         feat_list = title.split(' ')
189 |         for i, word in enumerate(feat_list):
190 |             word = 'sq_' + word
191 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
192 |                 word_map[word] = cur_word
193 |                 cur_word += 1
194 |             if word in word_map:
195 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
196 | 
197 |     tot_freq = 0
198 |     for pair in session_titles.split(';'):
199 |         if not pair: continue
200 |         title, freq = pair.split(':')
201 |         tot_freq += float(freq)
202 |     for pair in session_titles.split(';'):
203 |         if not pair: continue
204 |         title, freq = pair.split(':')
205 |         freq = float(freq)
206 |         feat_list = title.split(' ')
207 |         for i, word in enumerate(feat_list):
208 |             word = 'st_' + word
209 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
210 |                 word_map[word] = cur_word
211 |                 cur_word += 1
212 |             if word in word_map:
213 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
214 | 
215 |     for pair in labels.split(';'):
216 |         if not pair: continue
217 |         word, freq = pair.split(':')
218 |         freq = float(freq)
219 |         if isTrain and word not in word_map:
220 |             word_map[word] = cur_word
221 |             cur_word += 1
222 |         if word in word_map:
223 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
224 | 
225 |     for pair in stats.split(';') + stats2.split(';'):
226 |         if not pair: continue
227 |         word, freq = pair.split(':')
228 |         freq = float(freq)
229 |         if isTrain and word not in word_map:
230 |             word_map[word] = cur_word
231 |             cur_word += 1
232 |         if word in word_map:
233 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
234 | 
235 |     if isTrain:
236 |         label, F = (label_map[label], word_tf.items() + word_tf2.items())
237 |         F = sorted(F, key=lambda d:d[0])
238 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
239 |         fo.write('%s %s\n' % (label, f_str))
240 |     else:
241 |         test_dict[query] = word_tf.items() + word_tf2.items()
242 | 
243 |   if isTrain: fo.close()
244 |   fin2.close()
245 | 
246 |   # save label map
247 |   with open(p_label_map, 'w') as fo:
248 |     for label in label_map:
249 |         fo.write('%s\t%s\n' % (label, label_map[label]))
250 | 
251 | 
252 | def save_test(p_in, p_out):
253 |   with open(p_out, 'w') as fo:
254 |     for line in open(p_in):
255 |         query = line.strip()
256 |         F = test_dict[query]
257 |         F = sorted(F, key=lambda d:d[0])
258 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
259 |         fo.write('%s %s\n' % (0, f_str))
260 |         
261 | #for dog
262 | #get_df(p_dog_train_feat)
263 | #prepare(p_dog_train_feat, p_dog_train, True, '../trans_data/dog.simple2')
264 | #for pig
265 | get_df(p_pig_train_feat)
266 | prepare(p_pig_train_feat, p_pig_train, True, '../trans_data/train.simple2')
267 | 
268 | #prepare(p_dog_valid_feat, '', False, '../trans_data/valid.simple2')
269 | prepare(p_pig_valid_feat, '', False, '../trans_data/test.simple2')
270 | #save_test(p_dog_valid_id, p_dog_test)
271 | save_test(p_pig_valid_id, p_pig_test)
272 | 
273 | 


--------------------------------------------------------------------------------
/V1/prepare_lda_test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # coding: utf-8
 3 | # @author: zuotaoliu@126.com
 4 | # @created: 2014-08-29
 5 | import os
 6 | import sys
 7 | import re
 8 | 
 9 | def do_word_index(p_in, p_out):
10 |     label_map = {}
11 |     cur_label = 0
12 |     word_count = {}
13 |     cur_idx = 0
14 |     fo = open(p_out, 'w')
15 |     for line in open(p_in):
16 |         row = line.rstrip().split('\t')
17 | 
18 |         if len(row) >= 3:
19 |             feats = row[1] + ':1;' + ';'.join(row[2].split(';')[:5])
20 |         else:
21 |             feats = row[1] + ':1'
22 | 
23 |         labels = row[0].split(' | ')
24 |         cids = []
25 |         for label in labels:
26 |             if label not in label_map: 
27 |                 label_map[label] = cur_label
28 |                 cur_label += 1
29 |             cids.append(str(label_map[label]))
30 |         wc = {}
31 |         for pair in feats.split(';'):
32 |             if not pair: continue
33 |             words, freq = pair.split(':')
34 |             freq = min(1, int(freq))
35 |             for word in words.split(' '):
36 |                 if not word: continue
37 |                 wc[word] = wc.get(word, 0) + freq
38 |                 word_count[word] = word_count.get(word, 0) + freq
39 |         fo.write('%s\n' % (' '.join(['%s %s' % (k, v) for k, v in wc.items()]))) 
40 |     fo.close()
41 |         
42 |     return word_count
43 |         
44 | if __name__ == "__main__":
45 |     if len(sys.argv) < 3:
46 |         print '<usage> inputfile outputfile'
47 |         exit(-1)
48 |     word_count = do_word_index(sys.argv[1], sys.argv[2])
49 | 
50 | 
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/V1/prepare_lda_train.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # coding: utf-8
 3 | # @author: zuotaoliu@126.com
 4 | # @created: 2014-08-29
 5 | import os
 6 | import sys
 7 | import re
 8 | 
 9 | def do_word_index(p_in, p_out):
10 |     label_map = {}
11 |     cur_label = 0
12 |     word_count = {}
13 |     cur_idx = 0
14 |     fo = open(p_out, 'w')
15 |     for line in open(p_in):
16 |         row = line.rstrip().split('\t')
17 | 
18 |         if len(row) >= 3:
19 |             feats = row[1] + ':1;' + ';'.join(row[2].split(';')[:5])
20 |         else:
21 |             feats = row[1] + ':1'
22 | 
23 |         labels = row[0].split(' | ')
24 |         cids = []
25 |         for label in labels:
26 |             if label not in label_map: 
27 |                 label_map[label] = cur_label
28 |                 cur_label += 1
29 |             cids.append(str(label_map[label]))
30 |         wc = {}
31 |         for pair in feats.split(';'):
32 |             if not pair: continue
33 |             words, freq = pair.split(':')
34 |             freq = min(1, int(freq))
35 |             for word in words.split(' '):
36 |                 if not word: continue
37 |                 wc[word] = wc.get(word, 0) + freq
38 |                 word_count[word] = word_count.get(word, 0) + freq
39 |         fo.write('[%s] %s\n' % (' '.join(cids), ' '.join(['%s %s' % (k, v) for k, v in wc.items()]))) 
40 |     fo.close()
41 |     with open('../dataset/label_map_lda', 'w') as fo:
42 |         for label in label_map:
43 |             fo.write('%s\t%s\n' % (label, label_map[label]))
44 |         
45 |     return word_count
46 |         
47 | if __name__ == "__main__":
48 |     if len(sys.argv) < 4:
49 |         print '<usage> inputfile outputfile wordindex'
50 |         exit(-1)
51 |     word_count = do_word_index(sys.argv[1], sys.argv[2])
52 | 
53 |     sort_list = sorted(word_count.items(), key=lambda d:d[1], reverse=True)
54 |     with open(sys.argv[3], 'w') as fo:
55 |         for id, pair in enumerate(sort_list):
56 |             word, num = pair
57 |             if num >= 5:
58 |                 fo.write('%s %s\n' % (id, word))
59 | 
60 | 
61 | 
62 | 


--------------------------------------------------------------------------------
/V1/prepare_liblinear_1vsA.py:
--------------------------------------------------------------------------------
 1 | p_simple = '../trans_data/train.simple'
 2 | p_raw_train = '../dataset/pig_train'
 3 | p_svm_train = '../dataset/svm_train'
 4 | 
 5 | label_map = {}
 6 | cur_label = 0
 7 | for line in open('../dataset/label_map'):
 8 |     label = line.split('\t')[0]
 9 |     if label.count(' | ') == 0:
10 |         if label not in label_map:
11 |                 label_map[label] = cur_label
12 |                 cur_label += 1
13 |                 print label
14 | with open('../dataset/label_map_svm', 'w') as fo:
15 |     for label in label_map:
16 |         fo.write('%s\t%s\n' % (label, label_map[label]))
17 | 
18 | fout = []
19 | for c in range(7):
20 |     fout.append( open('%s%s' % (p_svm_train, c), 'w') )
21 | 
22 | with open(p_simple) as fin:
23 |     for line in open(p_raw_train):
24 |         arr = line.split(' ')
25 |         labels = fin.readline().split('\t')[0].split(' | ')
26 |         c_dict = {label_map[label]:1 for label in labels}
27 |         for c in range(7):
28 |             if c in c_dict:
29 |                 fout[c].write('%s %s' % (1, ' '.join(arr[1:])))
30 |             else:
31 |                 fout[c].write('%s %s' % (0, ' '.join(arr[1:])))
32 |                 
33 |             
34 | for c in range(7):
35 |     fout[c].close()
36 | 


--------------------------------------------------------------------------------
/V1/prepare_session.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_dog_train_feat = '../trans_data/dog.simple5'
  3 | p_dog_valid_feat = '../trans_data/valid.simple5'
  4 | p_dog_valid_id = '../trans_data/valid.txt'
  5 | 
  6 | p_pig_train_feat = '../trans_data/pig.simple5'
  7 | p_pig_valid_feat = '../trans_data/test.simple5'
  8 | p_pig_valid_id = '../raw_data/test.txt'
  9 | 
 10 | p_label_map = '../dataset/label_map_session'
 11 | p_dog_train = '../dataset/dog_train_session'
 12 | p_dog_test = '../dataset/dog_test_session'
 13 | p_pig_train = '../dataset/pig_train_session'
 14 | p_pig_test = '../dataset/pig_test_session'
 15 | 
 16 | min_word_df = 5
 17 | min_title_df = 10
 18 | 
 19 | label_map = {}
 20 | cur_label = 0
 21 | word_map = {}
 22 | cur_word = 1 
 23 | 
 24 | pig_train = []
 25 | test_dict = {}
 26 | pig_test = [] 
 27 | 
 28 | label_df = {}
 29 | word_df = {}
 30 | 
 31 | def get_df(p_in):
 32 |   for line in open(p_in):
 33 |     row = line.strip().split('\t')
 34 |     label = row[0]
 35 |     label = ' | '.join(sorted(label.split(' | ')))
 36 |     label_df[label] = label_df.get(label, 0) + 1
 37 |     queries = row[1]
 38 |     titles = row[2] if len(row)>=3 else ''
 39 | 
 40 |     for query in queries.split(';'):
 41 |         if not query: continue
 42 |         feat_list = query.split(' ')
 43 |         for i, word in enumerate(feat_list):
 44 |             if not word: continue
 45 |             word_df[word] = word_df.get(word, 0) + 1
 46 |             #if i>=1:
 47 |             #    word = ' '.join(feat_list[i-1:i+1])
 48 |             #    word_df[word] = word_df.get(word, 0) + 1
 49 |             #word = '%s_%s' % (i, feat_list[i])
 50 |             #word_df[word] = word_df.get(word, 0) + 1
 51 |             #if i >= len(feat_list)/2:
 52 |             #    word = '%s_%s' % (i-len(feat_list), feat_list[i])
 53 |             #    word_df[word] = word_df.get(word, 0) + 1
 54 | 
 55 |     for title in titles.split(';'):
 56 |         if not title: continue
 57 |         feat_list = title.split(' ')
 58 |         for i, word in enumerate(feat_list):
 59 |             if not word: continue
 60 |             word = 't_' + word
 61 |             word_df[word] = word_df.get(word, 0) + 1
 62 |             #if i>=1:
 63 |             #    word = 't_' + ' '.join(feat_list[i-1:i+1])
 64 |             #    word_df[word] = word_df.get(word, 0) + 1
 65 | 
 66 | 
 67 | def prepare(p_in, p_out, isTrain):
 68 |   global cur_label
 69 |   global cur_word
 70 | 
 71 |   fo = open(p_out, 'w')
 72 | 
 73 |   for line in open(p_in):
 74 |     row = line.strip().split('\t')
 75 |     label = row[0]
 76 |     queries = row[1]
 77 |     titles = row[2] if len(row)>=3 else ''
 78 | 
 79 |     if isTrain:
 80 |         label = ' | '.join(sorted(label.split(' | ')))
 81 |         if label_df[label] < 50:
 82 |             label = label.split(' ')[0]
 83 |         if label not in label_map:
 84 |             label_map[label] = cur_label
 85 |             cur_label += 1
 86 | 
 87 |     word_tf = {}
 88 |     query_list = queries.split(';')
 89 |     for query in query_list:
 90 |         if not query: continue
 91 |         feat_list = query.split(' ')
 92 |         for i, word in enumerate(feat_list):
 93 |             if not word: continue
 94 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
 95 |                 word_map[word] = cur_word
 96 |                 cur_word += 1
 97 |             if word in word_map:
 98 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)*1./len(query_list)
 99 |             #if i>=1:
100 |             #    word = ' '.join(feat_list[i-1:i+1])
101 |             #    if isTrain and word_df[word] >= min_word_df and word not in word_map:
102 |             #        word_map[word] = cur_word
103 |             #        cur_word += 1
104 |             #    if word in word_map:
105 |             #        word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
106 |             #word = '%s_%s' % (i, feat_list[i])
107 |             #if isTrain and word_df[word] >= min_word_df and word not in word_map:
108 |             #    word_map[word] = cur_word
109 |             #    cur_word += 1
110 |             #if word in word_map:
111 |             #    word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
112 |             #if i >= len(feat_list) / 2:
113 |             #    word = '%s_%s' % (i-len(feat_list), feat_list[i])
114 |             #    if isTrain and word_df[word] >= min_word_df and word not in word_map:
115 |             #        word_map[word] = cur_word
116 |             #        cur_word += 1
117 |             #    if word in word_map:
118 |             #        word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
119 | 
120 |     word_tf2 = {}
121 |     title_list = titles.split(';')
122 |     for title in title_list:
123 |         if not title: continue
124 |         feat_list = title.split(' ')
125 |         for i, word in enumerate(feat_list):
126 |             word = 't_' + word
127 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
128 |                 word_map[word] = cur_word
129 |                 cur_word += 1
130 |             if word in word_map:
131 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*1./len(title_list)
132 |             #if i>=1:
133 |             #    word = 't_' + ' '.join(feat_list[i-1:i+1])
134 |             #    if isTrain and word_df[word] >= min_title_df and word not in word_map:
135 |             #        word_map[word] = cur_word
136 |             #        cur_word += 1
137 |             #    if word in word_map:
138 |             #        word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
139 | 
140 | 
141 |     if isTrain:
142 |         label, F = (label_map[label], word_tf.items() + word_tf2.items())
143 |         F = sorted(F, key=lambda d:d[0])
144 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
145 |         fo.write('%s %s\n' % (label, f_str))
146 |     else:
147 |         label, F = 0, word_tf.items() + word_tf2.items()
148 |         F = sorted(F, key=lambda d:d[0])
149 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
150 |         fo.write('%s %s\n' % (label, f_str))
151 | 
152 |   if isTrain: fo.close()
153 | 
154 |   # save label map
155 |   with open(p_label_map, 'w') as fo:
156 |     for label in label_map:
157 |         fo.write('%s\t%s\n' % (label, label_map[label]))
158 | 
159 | 
160 | #for dog
161 | #get_df(p_dog_train_feat)
162 | #prepare(p_dog_train_feat, p_dog_train, True)
163 | #for pig
164 | get_df(p_pig_train_feat)
165 | prepare(p_pig_train_feat, p_pig_train, True)
166 | 
167 | #prepare(p_dog_valid_feat, p_dog_test, False)
168 | prepare(p_pig_valid_feat, p_pig_test, False)
169 | 
170 | 


--------------------------------------------------------------------------------
/V1/refine_train_by_sesson_query.py:
--------------------------------------------------------------------------------
 1 | 
 2 | p_train = '../raw_data/train.txt'
 3 | p_test = '../raw_data/test.txt'
 4 | p_dog = '../trans_data/dog.txt'
 5 | p_valid = '../trans_data/valid.txt'
 6 | 
 7 | def refine(p_in, p_out):
 8 |     with open(p_out, 'w') as fo:
 9 |         last_query = None
10 |         has_known = False
11 |         session_lines = []
12 |         for line in open(p_in):
13 |             if not line.strip():
14 |                 fo.write('\n')
15 |                 if has_known:
16 |                     for l in session_lines:
17 |                         fo.write(l)
18 |                 last_query = None
19 |                 has_known = False
20 |                 session_lines = []
21 |             else:
22 |                 label, query = line.strip().split('\t')[:2]
23 |                 query_set = set(query.split(' '))
24 |                 if not last_query or (last_query & query_set):
25 |                     session_lines.append(line)
26 |                     if label != 'CLASS=UNKNOWN':
27 |                         has_known = True
28 |                     last_query = query_set
29 |                 else:
30 |                     fo.write('\n')
31 |                     if has_known:
32 |                         for l in session_lines:
33 |                             fo.write(l)
34 |                     last_query = None
35 |                     has_known = False
36 |                     session_lines = []
37 | 
38 | refine(p_dog, '../trans_data/dog_refine.txt')
39 | refine(p_train, '../trans_data/train_refine.txt')
40 |                 
41 |             
42 | 


--------------------------------------------------------------------------------
/V1/run_average.sh:
--------------------------------------------------------------------------------
1 | #python averaging_methods.py pred_average.txt pred_linear.txt liblinear 0.2 pred_xgboost.txt xgboost 0.8 18
2 | python averaging_methods.py pred_average.txt pred_xgboost.txt xgboost 0.65 18 pred_linear.txt liblinear 0.15 pred_semilda.txt semilda 0.0 pred_session2.txt xgboost 0.1 18 pred_session_label.txt sessionlabel 0.1
3 | 
4 | python construct_maxprob.py pred_average.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_average.txt
5 | python construct_maxprob_multi.py pred_average.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_average2.txt
6 | #python construct_maxprob_balance.py pred_average.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_average3.txt
7 | 


--------------------------------------------------------------------------------
/V1/run_forest_dog.sh:
--------------------------------------------------------------------------------
1 | python forest.py ../dataset/pig_train ../dataset/pig_test 20 5
2 | 
3 | python construct_maxlabel.py pred_forest.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_forest.txt 
4 | 


--------------------------------------------------------------------------------
/V1/run_liblinear_multi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | TRAIN_BIN=../../tools/liblinear/train
 3 | TEST_BIN=../../tools/liblinear/predict
 4 | 
 5 | for ((i=0; i<=6; i++)); do
 6 |   $TRAIN_BIN -s 6 -c 10 -e 0.001 ../dataset/svm_train${i} svm.model
 7 |   $TEST_BIN -b 1 ../dataset/pig_test svm.model pred_linear_${i}.txt 
 8 | done
 9 | 
10 | #python construct_liblinear_b1.py pred_linear.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_linear.txt
11 | 
12 | #$TEST_BIN ../dataset/pig_test pig.model pred_linear.txt 
13 | #python construct_maxlabel.py pred_linear.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_linear.txt
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/V1/run_liblinear_pig.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | TRAIN_BIN=../../tools/liblinear/train
 3 | TEST_BIN=../../tools/liblinear/predict
 4 | 
 5 | #$TRAIN_BIN -s 6 -c 10 -e 0.001 -w0 0.5 -w5 0.6 ../dataset/pig_train pig.model
 6 | $TRAIN_BIN -s 6 -c 10 -e 0.001 ../dataset/pig_train pig.model
 7 | 
 8 | $TEST_BIN -b 1 ../dataset/pig_test pig.model pred_linear.txt 
 9 | python construct_liblinear_b1.py pred_linear.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_linear.txt
10 | 
11 | #$TEST_BIN ../dataset/pig_test pig.model pred_linear.txt 
12 | #python construct_maxlabel.py pred_linear.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_linear.txt
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/V1/run_semilda_pig.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh -x
 2 | ldapath=../cpp_lda/src
 3 | 
 4 | train_file=../trans_data/dog.simple
 5 | test_file=../trans_data/test.simple
 6 | 
 7 | ldatrain_file=../dataset/train_semilda.train
 8 | ldatest_file=../dataset/test_semilda.test
 9 | ldapred_file=pred_semilda.txt
10 | 
11 | index_file=../dataset/word_index
12 | seed_file=lda_seed_words
13 | model_file=lda.model
14 | 
15 | num_topic=7
16 | alpha=0.5
17 | beta=0.05
18 | 
19 | python prepare_lda_train.py $train_file $ldatrain_file $index_file
20 | 
21 | time /Users/zuotaoliu/install/mpich2/bin/mpiexec -n 4 $ldapath/mpi_slda \
22 | --num_topics $num_topic \
23 | --alpha $alpha --beta $beta \
24 | --training_data_file $ldatrain_file \
25 | --model_file $model_file \
26 | --word_index_file $index_file \
27 | --compute_likelihood true \
28 | --burn_in_iterations 50 --total_iterations 120
29 | 
30 | 
31 | python prepare_lda_test.py $test_file $ldatest_file
32 | 
33 | args="--alpha ${alpha} \
34 |       --beta ${beta} \
35 |       --inference_data_file ${ldatest_file} \
36 |       --inference_result_file ${ldapred_file} \
37 |       --model_file ${model_file} \
38 |       --burn_in_iterations 50 \
39 |       --total_iterations 120 \
40 |       --file_type 0
41 |       "
42 | 
43 | time $ldapath/infer $args
44 | 
45 | python construct_semilda.py pred_semilda.txt $test_file ../raw_data/test.txt ../dataset/label_map_lda ../submit/predict_semilda.txt
46 | 


--------------------------------------------------------------------------------
/V1/run_xgboost3_dog.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | BIN=../../tools/xgboost3/xgboost
 3 | 
 4 | $BIN ../boost/xgboost3.conf num_round=120 num_class=18 bst:max_depth=7 data=../dataset/dog_train eval[test]=../dataset/dog_train
 5 | 
 6 | $BIN ../boost/xgboost3.conf task=pred num_class=18 model_in=0120.model test:data=../dataset/dog_test
 7 | #cp pred.txt pred5.txt
 8 | python ../boost/construct_maxprob.py pred.txt ../trans_data/valid.txt ../dataset/label_map ../submit/dog.txt
 9 | python ../evaluate/metric_F1.py ../trans_data/valid.label ../submit/dog.txt 
10 | python ../evaluate/metric_confusion.py ../trans_data/valid.label ../submit/dog.txt
11 | #python ../evaluate/construct_maxprob_multi.py pred.txt ../trans_data/valid.txt ../dataset/label_map ../submit/dog.txt
12 | #python ../evaluate/metric_F1.py ../trans_data/valid.label ../submit/dog.txt 
13 | 
14 | $BIN ../boost/xgboost3.conf task=pred num_class=18 model_in=0120.model test:data=../dataset/pig_test
15 | #cp pred.txt pred5_2.txt
16 | python ../boost/construct_maxprob.py pred.txt ../raw_data/test.txt ../dataset/label_map ../submit/pig.txt
17 | python ../evaluate/construct_maxprob_multi.py pred.txt ../raw_data/test.txt ../dataset/label_map ../submit/pig2.txt
18 | 


--------------------------------------------------------------------------------
/V1/run_xgboost3_pig.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | BIN=../../tools/xgboost3/xgboost
 3 | 
 4 | $BIN xgboost3.conf num_round=200 num_class=18 bst:max_depth=7 data=../dataset/pig_train eval[test]=../dataset/pig_train
 5 | $BIN xgboost3.conf task=pred num_class=18 model_in=0200.model test:data=../dataset/pig_test
 6 | mv pred.txt pred_xgboost.txt
 7 | 
 8 | python construct_maxprob.py pred_xgboost.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_xgboost.txt
 9 | python construct_maxprob_multi.py pred_xgboost.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_xgboost2.txt
10 | 


--------------------------------------------------------------------------------
/V1/run_xgboost3_pig2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | BIN=../../tools/xgboost3/xgboost
 3 | 
 4 | $BIN xgboost3.conf num_round=200 num_class=18 bst:max_depth=7 data=../dataset/pig_train2 eval[test]=../dataset/pig_train2
 5 | $BIN xgboost3.conf task=pred num_class=18 model_in=0200.model test:data=../dataset/pig_test2
 6 | mv pred.txt pred_xgboost.txt
 7 | 
 8 | python construct_maxprob.py pred_xgboost.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_xgboost.txt
 9 | python construct_maxprob_multi.py pred_xgboost.txt ../raw_data/test.txt ../dataset/label_map ../submit/predict_xgboost2.txt
10 | 


--------------------------------------------------------------------------------
/V1/run_xgboost3_session.sh:
--------------------------------------------------------------------------------
1 | 
2 | BIN=../../tools/xgboost3/xgboost
3 | 
4 | #$BIN xgboost3.conf num_round=90 num_class=18 bst:max_depth=7 data=../dataset/pig_train_session eval[test]=../dataset/pig_train_session
5 | #$BIN ../boost/xgboost3.conf task=pred num_class=18 model_in=0090.model test:data=../dataset/pig_test_session
6 | #mv pred.txt pred_session.txt
7 | #python construct_session_prob.py pred_session.txt ../trans_data/test.simple5 ../raw_data/test.txt ../dataset/label_map_session pred_session2.txt
8 | python construct_maxprob.py pred_session2.txt ../raw_data/test.txt ../dataset/label_map_session ../submit/predict_session.txt
9 | 


--------------------------------------------------------------------------------
/V1/split_train.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | p_train = '../raw_data/train.txt'
 4 | p_test = '../raw_data/test.txt'
 5 | 
 6 | p_dog = '../trans_data/dog.txt'
 7 | p_valid = '../trans_data/valid.txt'
 8 | p_label = '../trans_data/valid.label'
 9 | 
10 | n_fold = 3
11 | 
12 | train_dict = {}
13 | test_dict = {}
14 | unknown_dict = {}
15 | 
16 | for line in open(p_train):
17 |     if not line.strip(): continue
18 |     try:
19 |         label, query, title = line.strip().split('\t')
20 |     except:
21 |         label, query = line.strip().split('\t')
22 |         title = '-'
23 | 
24 |     if query not in train_dict:
25 |         train_dict[query] = {} 
26 |     train_dict[query][label] = train_dict[query].get(label, 0) + 1
27 |     if label.startswith('CLASS=TEST'):
28 |         test_dict[query] = 1
29 |     if label.startswith('CLASS=UNKNOWN'):
30 |         unknown_dict[query] = 1
31 | 
32 | valid_dict = {}
33 | fv2 = open('../trans_data/valid.txt', 'w')
34 | fv3 = open('../trans_data/valid.label', 'w')
35 | for query in train_dict:
36 |     if query in test_dict: continue
37 |     if query in unknown_dict: continue
38 |     if random.randint(0, n_fold-1) == 1:
39 |         valid_dict[query] = 1
40 |         label = sorted(train_dict[query].items(), key=lambda d:-d[1])[0][0]
41 |         fv2.write('%s\n' % query)
42 |         fv3.write('%s\t%s\n' % (query, label))
43 | fv2.close()
44 | fv3.close()
45 | 
46 | fv1 = open('../trans_data/dog.txt', 'w')
47 | for line in open(p_train):
48 |     if not line.strip():
49 |         fv1.write(line)
50 |         continue
51 | 
52 |     try:
53 |         label, query, title = line.strip().split('\t')
54 |     except:
55 |         label, query = line.strip().split('\t')
56 |         title = '-'
57 |     if query in test_dict: continue
58 |     if query in train_dict:
59 |         if query in valid_dict:
60 |             label = 'CLASS=TEST'
61 |             fv1.write('%s\t%s\t%s\n' % (label, query, title))
62 |         else:
63 |             fv1.write(line)
64 | fv1.close()
65 | 
66 | 


--------------------------------------------------------------------------------
/V1/trans_session.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | p_pig_train = '../raw_data/train.txt'
 4 | p_dog_train = '../trans_data/dog.txt'
 5 | p_pig_out = '../trans_data/pig.simple5'
 6 | p_dog_out = '../trans_data/dog.simple5'
 7 | p_pig_valid = '../trans_data/test.simple5'
 8 | p_dog_valid = '../trans_data/valid.simple5'
 9 | 
10 | rates = {'CLASS=VIDEO' : 0.5}
11 | 
12 | 
13 | def stat(p_in, p_out):
14 |     session = [set(), set()]
15 |     labels = []
16 |     
17 |     fo = open(p_out, 'w')
18 |     tot_line = 0
19 |     for line in open(p_in):
20 |         if not line.strip():
21 |             session_label = ''
22 |             session_flag = True
23 |             positive_count = 0
24 |             for key in labels:
25 |                 label, query = key.split('\t')
26 |                 if label.find('TEST')>=0 or label.find('KNOWN')>=0:
27 |                     if positive_count < 1: session_flag = False
28 |                     #elif positive_count == 1: positive_count = 0 
29 |                     #else: positive_count = 0.5
30 |                     else: positive_count = 0
31 |                 elif session_flag:
32 |                     if not session_label: 
33 |                         session_label = label
34 |                         positive_count += 1
35 |                     elif session_label != label: 
36 |                         session_flag = False
37 |                     else:
38 |                         positive_count += 1
39 |             if session[0] and session_label and session_flag and positive_count > 0:
40 |                 if session[1] and len(session[1])<=10 and len(session[0])<=5:
41 |                     rate = rates.get(session_label, 1.0)
42 |                     if random.random()<=rate:
43 |                         fo.write('%s\t%s\t%s\n' % (session_label, ';'.join(session[0]), ';'.join(session[1])))
44 |             session = [set(), set()]
45 |             labels = []
46 |             continue
47 | 
48 |         try:
49 |             label, query, title = line.strip().split('\t')
50 |         except:
51 |             label, query = line.strip().split('\t')
52 |             title = '-'
53 | 
54 |         key = label + '\t' + query
55 |         if not labels or labels[-1] != key:
56 |             labels.append(key)
57 |         session[0].add(query)
58 |         if title and title!='-':
59 |             session[1].add(title)
60 | 
61 |         tot_line += 1
62 |         #if tot_line == 1000000: break
63 |     fo.close()
64 |             
65 | def valid(p_in, p_out):
66 |     session = [set(), set()]
67 |     has_test = False
68 | 
69 |     fo = open(p_out, 'w')
70 |     tot_line = 0
71 |     for line in open(p_in):
72 |         if not line.strip():
73 |             if has_test and session[1] and len(session[1])<=10 and len(session[0])<=5:
74 |                 fo.write('%s\t%s\t%s\n' % (0, ';'.join(session[0]), ';'.join(session[1])))
75 |             session = [set(), set()]
76 |             has_test = False
77 |             continue
78 |         try:
79 |             label, query, title = line.strip().split('\t')
80 |         except:
81 |             label, query = line.strip().split('\t')
82 |             title = '-'
83 | 
84 |         if label.find('TEST') >= 0:
85 |             has_test = True
86 |         session[0].add(query)
87 |         if title and title!='-':
88 |             session[1].add(title)
89 | 
90 |         tot_line += 1
91 |         #if tot_line == 10000: break
92 |     fo.close()
93 | 
94 | stat(p_pig_train, p_pig_out)
95 | #stat(p_dog_train, p_dog_out)
96 | valid(p_pig_train, p_pig_valid)
97 | #valid(p_dog_train, p_dog_valid)
98 | 


--------------------------------------------------------------------------------
/V1/trans_train1.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_train = '../trans_data/train_refine.txt'
  3 | p_dog = '../trans_data/dog_refine.txt'
  4 | #p_train = '../raw_data/train.txt'
  5 | #p_dog = '../trans_data/dog.txt'
  6 | p_test = '../raw_data/test.txt'
  7 | p_valid = '../trans_data/valid.txt'
  8 | 
  9 | s_train = '../trans_data/train.simple'
 10 | s_test = '../trans_data/test.simple'
 11 | s_dog = '../trans_data/dog.simple'
 12 | s_valid = '../trans_data/valid.simple'
 13 | 
 14 | def trans(in1, in2, out1, out2):
 15 |     train_dict = {}
 16 |     test_dict = {}
 17 | 
 18 |     # for session
 19 |     session_train_query = {}
 20 |     session_test_query = {}
 21 |     session_labels = {}
 22 |     session_click = {}
 23 | 
 24 | 
 25 |     for line in open(in1):
 26 |         if not line.strip():
 27 |             #session end
 28 |             for query in session_train_query:
 29 |                 #if len(session_labels) == 1:
 30 |                 for q2 in session_train_query:
 31 |                     if query != q2:
 32 |                         label = session_train_query[q2]
 33 |                         train_dict[query][2][label] = train_dict[query][2].get(label, 0) + 1
 34 |                         train_dict[query][3][q2] = train_dict[query][3].get(q2, 0) + 1
 35 |                 for title in session_click:
 36 |                     train_dict[query][4][title] = train_dict[query][4].get(title, 0) + 1
 37 |             for query in session_test_query:
 38 |                 #if len(session_labels) == 1:
 39 |                 for q2 in session_train_query:
 40 |                     if query != q2:
 41 |                         label = session_train_query[q2]
 42 |                         test_dict[query][2][label] = test_dict[query][2].get(label, 0) + 1
 43 |                         test_dict[query][3][q2] = test_dict[query][3].get(q2, 0) + 1
 44 |                 for title in session_click:
 45 |                     test_dict[query][4][title] = test_dict[query][4].get(title, 0) + 1
 46 |             session_train_query = {}
 47 |             session_test_query = {}
 48 |             session_labels = {}
 49 |             session_click = {}
 50 |             continue
 51 | 
 52 |         try:
 53 |             label, query, title = line.strip().split('\t')
 54 |         except:
 55 |             label, query = line.strip().split('\t')
 56 |             title = '-'
 57 |         #label = ' | '.join(sorted(label.split(' | ')))
 58 | 
 59 |         if title and title != '-':
 60 |             session_click[title] = session_click.get(title, 0) + 1
 61 | 
 62 |         if label.startswith('CLASS=TEST'):
 63 |             if query not in test_dict: 
 64 |                 test_dict[query] = [label, {}, {}, {}, {}]
 65 |             if title and title != '-':
 66 |                 test_dict[query][1][title] = test_dict[query][1].get(title, 0) + 1
 67 |             session_test_query[query] = 1
 68 |         elif not label.startswith('CLASS=UNKNOWN'):
 69 |             if query not in train_dict: 
 70 |                 train_dict[query] = [{}, {}, {}, {}, {}]
 71 |             train_dict[query][0][label] = train_dict[query][0].get(label, 0) + 1
 72 |             if title and title != '-':
 73 |                 train_dict[query][1][title] = train_dict[query][1].get(title, 0) + 1
 74 |             session_labels[label] = 1
 75 |             session_train_query[query] = label.replace(' ', '')
 76 | 
 77 |     n_top_title = 30
 78 |     n_top_label = 3
 79 |     n_top_query = 10 
 80 |     n_top_session_title = 30 
 81 | 
 82 |     with open(out1, 'w') as ft:
 83 |         for query in train_dict:
 84 |             label = sorted(train_dict[query][0].items(), key=lambda d:-d[1])[0][0]
 85 | 
 86 |             titles = sorted(train_dict[query][1].items(), key=lambda d:-d[1])
 87 |             title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:n_top_title+1]]
 88 | 
 89 |             labels = sorted(train_dict[query][2].items(), key=lambda d:-d[1])
 90 |             label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]]
 91 |             tot_label = float(sum(train_dict[query][2].values()))
 92 |             label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]]
 93 | 
 94 |             queries = sorted(train_dict[query][3].items(), key=lambda d:-d[1])
 95 |             query_pairs = []
 96 |             query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]]
 97 | 
 98 |             stitles = sorted(train_dict[query][4].items(), key=lambda d:-d[1])
 99 |             stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]]
100 | 
101 |             stat_pairs = []
102 | 
103 |             ft.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs)))
104 | 
105 |     with open(out2, 'w') as fo:
106 |         for query in test_dict:
107 |             label = test_dict[query][0]
108 | 
109 |             titles = sorted(test_dict[query][1].items(), key=lambda d:-d[1])
110 |             title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:n_top_title+1]]
111 | 
112 |             labels = sorted(test_dict[query][2].items(), key=lambda d:-d[1])
113 |             label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]]
114 |             tot_label = float(sum(test_dict[query][2].values()))
115 |             label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]]
116 | 
117 |             queries = sorted(test_dict[query][3].items(), key=lambda d:-d[1])
118 |             query_pairs = []
119 |             query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]]
120 | 
121 |             stitles = sorted(test_dict[query][4].items(), key=lambda d:-d[1])
122 |             stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]]
123 | 
124 |             stat_pairs = []
125 | 
126 |             fo.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs)))
127 | 
128 | trans(p_train, p_test, s_train, s_test)
129 | trans(p_dog, p_valid, s_dog, s_valid)
130 | 
131 | 


--------------------------------------------------------------------------------
/V1/trans_train2.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_train = '../trans_data/train_refine.txt'
  3 | p_dog = '../trans_data/dog_refine.txt'
  4 | #p_train = '../raw_data/train.txt'
  5 | #p_dog = '../trans_data/dog.txt'
  6 | p_test = '../raw_data/test.txt'
  7 | p_valid = '../trans_data/valid.txt'
  8 | 
  9 | s_train = '../trans_data/train.simple2'
 10 | s_test = '../trans_data/test.simple2'
 11 | s_dog = '../trans_data/dog.simple2'
 12 | s_valid = '../trans_data/valid.simple2'
 13 | 
 14 | def trans(in1, in2, out1, out2):
 15 |     train_dict = {}
 16 |     test_dict = {}
 17 | 
 18 |     query_freq = {}
 19 |     query_titles = {}
 20 | 
 21 |     query_session = {}
 22 |     query_search = {}
 23 |     query_click = {}
 24 |     query_dupclick = {}
 25 | 
 26 |     query_session_search = {}
 27 |     query_session_click = {}
 28 |     query_session_dupclick = {}
 29 | 
 30 |     # for session
 31 |     session_train_query = {}
 32 |     session_test_query = {}
 33 |     session_labels = {}
 34 |     session_search = 0
 35 |     session_click = {}
 36 |     session_query_search = {}
 37 |     session_query_click = {}
 38 | 
 39 | 
 40 |     for line in open(in1):
 41 |         if not line.strip():
 42 |             #session end
 43 |             session_dupclick = 0
 44 |             for title in session_click:
 45 |                 if session_click[title] > 1: session_dupclick += 1
 46 |             for query in session_train_query:
 47 |                 query_session[query] = query_session.get(query, 0) + 1.
 48 |                 query_session_search[query] = query_session_search.get(query, 0) + session_search
 49 |                 query_session_click[query] = query_session_click.get(query, 0) + len(session_click)
 50 |                 query_session_dupclick[query] = query_session_dupclick.get(query, 0) + session_dupclick
 51 |                 query_search[query] = query_search.get(query, 0) + session_query_search.get(query, 0)
 52 |                 query_click[query] = query_click.get(query, 0) + len(session_query_click.get(query, {}))
 53 |                 session_query_dup = 0
 54 |                 for title in session_query_click.get(query, {}):
 55 |                     if session_query_click[query][title] > 1: session_query_dup += 1
 56 |                 query_dupclick[query] = query_dupclick.get(query, 0) + session_query_dup
 57 |             for query in session_test_query:
 58 |                 query_session[query] = query_session.get(query, 0) + 1.
 59 |                 query_session_search[query] = query_session_search.get(query, 0) + session_search
 60 |                 query_session_click[query] = query_session_click.get(query, 0) + len(session_click)
 61 |                 query_session_dupclick[query] = query_session_dupclick.get(query, 0) + session_dupclick
 62 |                 query_search[query] = query_search.get(query, 0) + session_query_search.get(query, 0)
 63 |                 query_click[query] = query_click.get(query, 0) + len(session_query_click.get(query, {}))
 64 |                 session_query_dup = 0
 65 |                 for title in session_query_click.get(query, {}):
 66 |                     if session_query_click[query][title] > 1: session_query_dup += 1
 67 |                 query_dupclick[query] = query_dupclick.get(query, 0) + session_query_dup
 68 |             session_train_query = {}
 69 |             session_test_query = {}
 70 |             session_labels = {}
 71 |             session_search = 0
 72 |             session_click = {}
 73 |             session_query_search = {}
 74 |             session_query_click = {}
 75 |             continue
 76 | 
 77 |         try:
 78 |             label, query, title = line.strip().split('\t')
 79 |         except:
 80 |             label, query = line.strip().split('\t')
 81 |             title = '-'
 82 |         #label = ' | '.join(sorted(label.split(' | ')))
 83 | 
 84 |         query_freq[query] = query_freq.get(query, 0) + 1
 85 |         if title and title != '-':
 86 |             if query not in query_titles: query_titles[query] = [0., 0.]
 87 |             query_titles[query][0] += 1
 88 |             query_titles[query][1] += len(title.split(' '))
 89 |             session_click[title] = session_click.get(title, 0) + 1
 90 |             if query not in session_query_click: session_query_click[query] = {}
 91 |             session_query_click[query][title] = session_query_click[query].get(title, 0) + 1
 92 |         else:
 93 |             session_search += 1
 94 |             if query not in session_query_search: session_query_search[query] = 0
 95 |             session_query_search[query] += 1
 96 | 
 97 |         if label.startswith('CLASS=TEST'):
 98 |             if query not in test_dict: 
 99 |                 test_dict[query] = [label, {}, {}, {}]
100 |             session_test_query[query] = 1
101 |         elif not label.startswith('CLASS=UNKNOWN'):
102 |             if query not in train_dict: 
103 |                 train_dict[query] = [{}, {}, {}, {}]
104 |             session_train_query[query] = label.replace(' ', '')
105 | 
106 | 
107 |     with open(out1, 'w') as ft:
108 |         for query in train_dict:
109 |             stat_pairs = []
110 |             stat_pairs.append( '%s:%s' % ('query_len', len(query.split(' '))) )
111 |             stat_pairs.append( '%s:%s' % ('query_freq', query_freq[query]) )
112 |             if query_titles.get(query, [0, 0])[0] >= 3:
113 |                 stat_pairs.append( '%s:%s' % ('title_len', query_titles[query][1]/query_titles[query][0]) )
114 | 
115 |             stat_pairs2 = []
116 |             if query_session.get(query, 0) >= 5:
117 |                 stat_pairs2.append( '%s:%s' % ('query_search', query_search[query]/query_session[query]) )
118 |                 stat_pairs2.append( '%s:%s' % ('query_click', query_click[query]/query_session[query]) )
119 |                 stat_pairs2.append( '%s:%s' % ('query_dupclick', query_dupclick[query]/query_session[query]) )
120 |                 stat_pairs2.append( '%s:%s' % ('query_session_search', query_session_search[query]/query_session[query]) )
121 |                 stat_pairs2.append( '%s:%s' % ('query_session_click', query_session_click[query]/query_session[query]) )
122 |                 stat_pairs2.append( '%s:%s' % ('query_session_dupclick', query_session_dupclick[query]/query_session[query]) )
123 | 
124 |             ft.write('%s\t%s\t%s\n' % (query, ';'.join(stat_pairs), ';'.join(stat_pairs2)))
125 | 
126 |     with open(out2, 'w') as fo:
127 |         for query in test_dict:
128 |             stat_pairs = []
129 |             stat_pairs.append( '%s:%s' % ('query_len', len(query.split(' '))) )
130 |             stat_pairs.append( '%s:%s' % ('query_freq', query_freq[query]) )
131 |             if query_titles.get(query, [0, 0])[0] >= 3:
132 |                 stat_pairs.append( '%s:%s' % ('title_len', query_titles[query][1]/query_titles[query][0]) )
133 | 
134 |             stat_pairs2 = []
135 |             if query_session.get(query, 0) >= 5:
136 |                 stat_pairs2.append( '%s:%s' % ('query_search', query_search[query]/query_session[query]) )
137 |                 stat_pairs2.append( '%s:%s' % ('query_click', query_click[query]/query_session[query]) )
138 |                 stat_pairs2.append( '%s:%s' % ('query_dupclick', query_dupclick[query]/query_session[query]) )
139 |                 stat_pairs2.append( '%s:%s' % ('query_session_search', query_session_search[query]/query_session[query]) )
140 |                 stat_pairs2.append( '%s:%s' % ('query_session_click', query_session_click[query]/query_session[query]) )
141 |                 stat_pairs2.append( '%s:%s' % ('query_session_dupclick', query_session_dupclick[query]/query_session[query]) )
142 | 
143 |             fo.write('%s\t%s\t%s\n' % (query, ';'.join(stat_pairs), ';'.join(stat_pairs2)))
144 | 
145 | trans(p_train, p_test, s_train, s_test)
146 | trans(p_dog, p_valid, s_dog, s_valid)
147 | 
148 | 


--------------------------------------------------------------------------------
/V1/xgboost3.conf:
--------------------------------------------------------------------------------
 1 | ### General Parameters, see comment for each definition
 2 | # choose the tree booster, 0: tree, 1: linear
 3 | booster_type = 0 
 4 | # this is the only difference with classification, use 0: linear regression
 5 | # when labels are in [0,1] we can also use 1: logistic regression
 6 | loss_type = 0
 7 | # evaluation metrics for validation data
 8 | eval_metric=merror
 9 | #eval_metric=error
10 | #eval_metric=auc
11 | #eval_metric=map
12 | #eval_metric=rmse
13 | #eval_metric=ndcg
14 | # silent=1 means printing running messages
15 | #silent = 1
16 | # do not use binary buffer
17 | use_buffer = 0
18 | nthread = 8
19 | #num_class = 18
20 | 
21 | ### Tree Booster Parameters
22 | # step size shrinkage used in update to prevent overfitting. 
23 | bst:eta = 0.1
24 | #bst:eta = 0.05
25 | # minimum loss reduction required to make a further partition. larger -> more conservative
26 | bst:gamma = 1.0 
27 | # minimum sum of instance weight(hessian) needed in a child. larger -> more conservative
28 | bst:min_child_weight = 1.0
29 | # maximum depth of a tree
30 | #bst:max_depth = 5 
31 | # constructing method to build a tree, 0: svdfeature, 1: column major expansion, 2: row major expansion
32 | #bst:tree_maker = 1
33 | 
34 | ### Linear Booster Parameters
35 | # L2 regularization term on weights
36 | bst:lambda = 0
37 | # L1 regularization term on weights
38 | bst:alpha = 0
39 | # L2 regularization term on bias
40 | bst:lambda_bias = 0
41 | 
42 | ### Task parameters
43 | # specify the learning task and the corresponding learning objective
44 | #objective = multi:softmax
45 | objective = multi:softprob
46 | #objective = reg:linear
47 | #objective = reg:linear
48 | #objective = reg:logistic
49 | #objective = binary:logistic
50 | #objective = binary:logitraw
51 | # the number of round to do boosting
52 | num_round = 10
53 | # 0 means do not save any model except the final round model
54 | save_period = 0 
55 | # the initial prediction score
56 | base_score = 0.0
57 | # fature map
58 | #fmap = "../dataset/feat_map.txt"
59 | # name for dump model
60 | name_dump = "dump.nice.txt"
61 | # The path of training data
62 | #data = "../dataset/train_dog.svm" 
63 | # The path of validation data, used to monitor training process, here [test] sets name of the validation set
64 | #eval[test] = "../dataset/train_dog.svm" 
65 | # The path of test data 
66 | #test:data = "../dataset/test_dog.svm"      
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/V2/construct_liblinear_b1.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | def load_label_map(p_in):
 5 |     global max_label
 6 |     for line in open(p_in):
 7 |         label, c = line.strip().split('\t')
 8 |         label_map[int(c)] = label
 9 |         max_label = max(max_label, int(c))
10 | 
11 | def get_match(p_pred, p_test, p_out):
12 |     fo = open(p_out, 'w')
13 |     fl = open(p_test)
14 |     fin = open(p_pred)
15 |     fin.readline()
16 |     for line in fin: 
17 |         c = int(line.split(' ')[0])
18 |         #c = max(0, min(max_label, int(float(line.strip())+0.5)))
19 |         label = label_map[c]
20 |         feats = fl.readline().strip()
21 |         fo.write('%s\t%s\n' % (feats, label))
22 |     fin.close()
23 |     fl.close()
24 |     fo.close()
25 | 
26 | if __name__ == '__main__':
27 |     import sys
28 |     if len(sys.argv) != 5:
29 |         print '<usage> pred test label out'
30 |         exit(1)
31 |     
32 |     load_label_map(sys.argv[3])
33 |     print 'max_label:', max_label
34 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
35 | 


--------------------------------------------------------------------------------
/V2/construct_maxprob.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | adjust = {}
 5 | def load_label_map(p_in):
 6 |     global max_label
 7 |     for line in open(p_in):
 8 |         label, c = line.strip().split('\t')
 9 |         label_map[int(c)] = label
10 |         max_label = max(max_label, int(c))
11 | 
12 |     for i in range(max_label+1):
13 |         adjust[i] = 0
14 | 
15 | def get_match(p_pred, p_test, p_out):
16 |     npred = len(open(p_pred).readlines()) / len(open(p_test).readlines())
17 |     fo = open(p_out, 'w')
18 |     fp = open(p_pred)
19 |     for line in open(p_test):
20 |         feats = line.strip()
21 | 
22 |         pred = []
23 |         for i in range(npred):
24 |             if i <= max_label:
25 |                 pred.append(float(fp.readline().strip())) 
26 |             else:
27 |                 fp.readline()
28 |         c = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0]
29 |         label = label_map[c]
30 |         fo.write('%s\t%s\n' % (feats, label))
31 |     fp.close()
32 |     fo.close()
33 | 
34 | if __name__ == '__main__':
35 |     import sys
36 |     if len(sys.argv) < 5:
37 |         print '<usage> pred test label out adjust'
38 |         exit(1)
39 |     
40 |     load_label_map(sys.argv[3])
41 |     print 'max_label:', max_label
42 |     if len(sys.argv) >= 6:
43 |         for line in open(sys.argv[5]):
44 |             cid, v = line.strip().split('\t')
45 |             adjust[int(cid)] = float(v)
46 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
47 | 


--------------------------------------------------------------------------------
/V2/construct_maxprob_multi.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | adjust = {}
 5 | def load_label_map(p_in):
 6 |     global max_label
 7 |     for line in open(p_in):
 8 |         label, c = line.strip().split('\t')
 9 |         label_map[int(c)] = label
10 |         max_label = max(max_label, int(c))
11 | 
12 |     for i in range(max_label+1):
13 |         adjust[i] = 0
14 | 
15 | def get_match(p_pred, p_test, p_out):
16 |     npred = len(open(p_pred).readlines()) / len(open(p_test).readlines())
17 |     fo = open(p_out, 'w')
18 |     fp = open(p_pred)
19 |     for line in open(p_test):
20 |         feats = line.strip()
21 | 
22 |         pred = []
23 |         for i in range(npred):
24 |             if i <= max_label:
25 |                 pred.append(float(fp.readline().strip())) 
26 |             else:
27 |                 fp.readline()
28 |         sort_list = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])
29 |         c = sort_list[0][0]
30 |         label = label_map[c]
31 |         if sort_list[1][1] > 0.45: 
32 |             label = label + ' | ' + label_map[sort_list[1][0]]  
33 |         label = ' | '.join(label.split(' | ')[:2])
34 |         fo.write('%s\t%s\n' % (feats, label))
35 |     fp.close()
36 |     fo.close()
37 | 
38 | if __name__ == '__main__':
39 |     import sys
40 |     if len(sys.argv) < 5:
41 |         print '<usage> pred test label out adjust'
42 |         exit(1)
43 |     
44 |     load_label_map(sys.argv[3])
45 |     print 'max_label:', max_label
46 |     if len(sys.argv) >= 6:
47 |         for line in open(sys.argv[5]):
48 |             cid, v = line.strip().split('\t')
49 |             adjust[int(cid)] = float(v)
50 |     get_match(sys.argv[1], sys.argv[2], sys.argv[4])
51 | 


--------------------------------------------------------------------------------
/V2/construct_semilda.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | def load_label_map(p_in):
 5 |     global max_label
 6 |     for line in open(p_in):
 7 |         label, c = line.strip().split('\t')
 8 |         label_map[int(c)] = label
 9 |         max_label = max(max_label, int(c))
10 | 
11 | def get_match(p_pred, p_test_simple, p_test, p_out):
12 |     feat_map = {}
13 |     fl = open(p_test_simple)
14 |     fin = open(p_pred)
15 |     for line in fin: 
16 |         pred = [float(v) for v in line.strip().split(' ')]
17 |         tot = sum(pred) + 0.001
18 |         pred = [v/tot for v in pred]
19 |         c = sorted([(k, v) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0]
20 |         label = label_map[c]
21 |         feats = fl.readline().strip().split('\t')[1]
22 |         feat_map[feats] = label
23 |     fin.close()
24 |     fl.close()
25 |     fo = open(p_out, 'w')
26 |     for line in open(p_test):
27 |         feats = line.strip()
28 |         fo.write('%s\t%s\n' % (feats, feat_map[feats]))
29 |     fo.close()
30 | 
31 | if __name__ == '__main__':
32 |     import sys
33 |     if len(sys.argv) != 6:
34 |         print '<usage> pred test.simple test label out'
35 |         exit(1)
36 |     
37 |     load_label_map(sys.argv[4])
38 |     print 'max_label:', max_label
39 |     get_match(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[5])
40 | 


--------------------------------------------------------------------------------
/V2/construct_session_prob.py:
--------------------------------------------------------------------------------
 1 | 
 2 | label_map = {}
 3 | max_label = 0
 4 | def load_label_map(p_in):
 5 |     global max_label
 6 |     for line in open(p_in):
 7 |         label, c = line.strip().split('\t')
 8 |         label_map[int(c)] = label
 9 |         max_label = max(max_label, int(c))
10 | 
11 | 
12 | def get_match(p_pred, p_session, p_test, p_out):
13 |     npred = len(open(p_pred).readlines()) / len(open(p_session).readlines())
14 |     query_dict = {} 
15 |     query_num = {}
16 |     for line in open(p_test):
17 |         query = line.strip()
18 |         query_dict[query] = [0]*npred     
19 |         query_num[query] = 0
20 | 
21 |     fp = open(p_pred)
22 |     for line in open(p_session):
23 |         query_list = line.strip().split('\t')[1].split(';')
24 | 
25 |         pred = []
26 |         for i in range(npred):
27 |             if i <= max_label:
28 |                 pred.append(float(fp.readline().strip())) 
29 |             else:
30 |                 fp.readline()
31 |         for query in query_list:
32 |             if query in query_dict: 
33 |                 for i, v in enumerate(pred):
34 |                     query_dict[query][i] += v
35 |                 query_num[query] += 1
36 |     fp.close()
37 |     print 'query_dict', len(query_dict)
38 | 
39 |     not_in_test = 0
40 |     fo = open(p_out, 'w')
41 |     for line in open(p_test):
42 |         query = line.strip()
43 |         #c = sorted([(k, v+adjust[k]) for k, v in enumerate(pred)], key=lambda d:-d[1])[0][0]
44 |         #label = label_map[c]
45 |         if query_num[query] > 0:
46 |             for i in range(npred):
47 |                 fo.write('%s\n' % (query_dict[query][i]/query_num[query]))
48 |         else:
49 |             for i in range(npred):
50 |                 fo.write('0\n')
51 |             not_in_test += 1
52 |     fo.close()
53 |     print 'not in session:', not_in_test
54 | 
55 | if __name__ == '__main__':
56 |     import sys
57 |     if len(sys.argv) < 6:
58 |         print '<usage> pred session testid label out'
59 |         exit(1)
60 |     
61 |     load_label_map(sys.argv[4])
62 |     print 'max_label:', max_label
63 | 
64 |     get_match(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[5])
65 | 


--------------------------------------------------------------------------------
/V2/markov_sessoin_label.py:
--------------------------------------------------------------------------------
 1 | 
 2 | p_train = '../raw_data/train.txt'
 3 | p_test = '../raw_data/test.txt'
 4 | p_dog = '../trans_data/dog.txt'
 5 | p_valid = '../trans_data/valid.txt'
 6 | 
 7 | def markov(p_in, p_query, p_out):
 8 |     test_label = {}
 9 |     unknown_label = {}
10 |     test_unknown = {}
11 | 
12 |     label_query = {}
13 |     unknown_query = {}
14 |     test_query = {}
15 |     session = [] 
16 |     for line in open(p_in):
17 |         if not line.strip():
18 |             n_query = len(label_query) + len(unknown_query) + len(test_query) 
19 |             label_dict = {}
20 |             for query, label in label_query.items():
21 |                 label_dict[label] = label_dict.get(label, 0) + 1
22 |             if len(label_dict) <= 1 or (len(label_dict) == 2 and (label_dict.keys()[0].find(label_dict.keys()[1])==0 or label_dict.keys()[1].find(label_dict.keys()[0])==0)):
23 |               for query in test_query:
24 |                 if query not in test_label: 
25 |                     test_label[query] = {}
26 |                 for query2, label in label_query.items():
27 |                     test_label[query][label] = test_label[query].get(label, 0) + 1
28 |                 if query not in test_unknown:
29 |                     test_unknown[query] = {}
30 |                 for query2 in unknown_query:
31 |                     test_unknown[query][query2] = test_unknown[query].get(query2, 0) + 1
32 |               for query in unknown_query:
33 |                 if query not in unknown_label: 
34 |                     unknown_label[query] = {}
35 |                 for query2, label in label_query.items():
36 |                     unknown_label[query][label] = unknown_label[query].get(label, 0) + 1
37 |             else:
38 |                 #print session
39 |                 pass
40 |             label_query = {}
41 |             unknown_query = {}
42 |             test_query = {}
43 |             session = []
44 |             continue
45 |         label, query = line.strip().split('\t')[:2]
46 |         label = ' | '.join(sorted(label.split(' | ')))
47 |         if not session or query != session[-1][1]:
48 |             session.append( (label, query) )
49 |         if label=='CLASS=TEST':
50 |             test_query[query] = 1
51 |         elif label=='CLASS=UNKNOWN':
52 |             if query.count(' ') > 1:
53 |                 unknown_query[query] = 1
54 |         else:
55 |             label_query[query] = label
56 | 
57 |     with open(p_out, 'w') as fo:
58 |         for line in open(p_query):
59 |             query = line.strip()
60 |             if query in test_label and test_label[query]:
61 |                 s = ['%s:%s' % (k, v) for k, v in test_label[query].items()]
62 |                 fo.write('%s\n' % (' || '.join(s)))
63 |             elif query in test_unknown:
64 |                 label_dict = {}
65 |                 for query2, v1 in test_unknown[query].items():
66 |                     if query2 in unknown_label:
67 |                         for label, v2 in unknown_label[query2].items():
68 |                             label_dict[label] = label_dict.get(label, 0) + v1*v2
69 |                 if label_dict:
70 |                     s = ['%s:%s' % (k, v) for k, v in label_dict.items()]
71 |                     fo.write('%s\n' % (' || '.join(s)))
72 |                 else:
73 |                     fo.write('\n')
74 |             else:
75 |                 fo.write('\n')
76 | 
77 | markov(p_dog, p_valid, 'pred_session_dog.txt')
78 | markov(p_train, p_test, 'pred_session_pig.txt')
79 | 
80 | 


--------------------------------------------------------------------------------
/V2/metric_F1.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Evaluation metric for the CIKM CUP 2014 
  4 | F-score
  5 | 
  6 | @author: Michael Liu 
  7 | Created: Thu July 22 2014
  8 | """
  9 | 
 10 | import os
 11 | import csv
 12 | import math
 13 | 
 14 | def create_solution_dictionary(solution):
 15 |     """
 16 |     """
 17 |     
 18 |     solnDict = {}
 19 |     with open(solution, 'rb') as f:
 20 |         for line in f:
 21 |             query, labels = line.strip().split('\t')
 22 |             label_list = labels.split(' | ')
 23 |             solnDict[query] = label_list
 24 |     return solnDict
 25 | 
 26 | def check_submission(submission, solutionDict):
 27 |     """
 28 |     """
 29 | 
 30 |     submissionDict = {}
 31 |     with open(submission, 'rb') as f:
 32 |         for line in f:
 33 |             query, labels = line.strip('\n').split('\t')
 34 |             if query in submissionDict:
 35 |                 print 'duplicate id in submission'
 36 |                 return False
 37 |             if query not in solutionDict:
 38 |                 print 'submission id must in solution'
 39 |                 return False
 40 |             label_list = labels.split(' | ')
 41 |             submissionDict[query] = label_list
 42 | 
 43 |     if len(submissionDict) != len(solutionDict):
 44 |         print 'size of submission and solution must be the same'
 45 |         return False
 46 |     return submissionDict
 47 | 
 48 | def F1_metric(solution, submission):
 49 |     """
 50 |     """
 51 | 
 52 |     solutionDict = create_solution_dictionary(solution)
 53 |     submissionDict = check_submission(submission, solutionDict)
 54 | 
 55 |     if submissionDict:
 56 |         true_positive = {}
 57 |         all_positive = {}
 58 |         groundtruth = {}
 59 | 
 60 |         for query in solutionDict:
 61 |             label_list = set(submissionDict[query])
 62 |             truth_list = set(solutionDict[query])
 63 |             for label in label_list: 
 64 |                 if label in truth_list:
 65 |                     true_positive[label] = true_positive.get(label, 0) + 1.
 66 |                 all_positive[label] = all_positive.get(label, 0) + 1
 67 |             for label in truth_list:
 68 |                 groundtruth[label] = groundtruth.get(label, 0) + 1
 69 | 
 70 |         precision_list = []
 71 |         recall_list = [] 
 72 |         for label in groundtruth:
 73 |             precision = 0
 74 |             if label in all_positive:
 75 |                 precision = true_positive.get(label, 0) / all_positive.get(label, 0)
 76 |             print label, 'precision', precision 
 77 | 
 78 |             recall = true_positive[label] / groundtruth[label]
 79 |             print label, 'recall', recall 
 80 | 
 81 |             precision_list.append(precision)
 82 |             recall_list.append(recall)
 83 | 
 84 |         ap = sum(precision_list) / len(recall_list)
 85 |         ar = sum(recall_list) / len(recall_list)
 86 |         F1 = 2*ap*ar / (ap + ar)
 87 |         print 'ap', ap
 88 |         print 'ar', ar
 89 |         print 'F1', F1
 90 | 
 91 | if __name__ == "__main__":
 92 |     solutionFile = ""
 93 |     submissionFile = ""
 94 | 
 95 |     import sys
 96 |     if len(sys.argv) < 3:
 97 |         print '<usage> solution submission'
 98 |         exit(-1)
 99 |     solutionFile = sys.argv[1]
100 |     submissionFile = sys.argv[2]
101 |     
102 |     F1_metric(solutionFile, submissionFile)
103 |     
104 |     
105 | 


--------------------------------------------------------------------------------
/V2/prepare_ensemble_cat.py:
--------------------------------------------------------------------------------
  1 | n_sample = 6000
  2 | 
  3 | label_map = {}
  4 | label_map2 = {}
  5 | max_label = 0
  6 | for line in open('../dataset/label_map_dog'):
  7 |     label, c = line.strip().split('\t')
  8 |     label_map[int(c)] = label
  9 |     label_map2[label] = int(c)
 10 |     max_label = max(max_label, int(c))
 11 | print 'max_label:', max_label 
 12 | 
 13 | weights = []
 14 | for i in xrange(n_sample):
 15 |     weights.append([0]*((max_label+1)*8))
 16 | 
 17 | if __name__ == '__main__':
 18 |     import sys
 19 |     if len(sys.argv) < 4:
 20 |         print '<usage> out label in1 m1 [in2 m2 ...]'
 21 |         exit(1)
 22 | 
 23 |     i = 3
 24 |     while i < len(sys.argv):
 25 |         in_i = sys.argv[i]
 26 |         m_i = sys.argv[i+1]
 27 |         print in_i, m_i
 28 |         if m_i == 'xgboost':
 29 |             nclass = int(sys.argv[i+2])
 30 |             fin = open(in_i)
 31 |             for isample in xrange(n_sample):
 32 |                 for ipred in xrange(nclass):
 33 |                     pred = float(fin.readline().strip())
 34 |                     if ipred <= max_label:
 35 |                         weights[isample][ipred] = pred 
 36 |             fin.close()
 37 |             i += 3
 38 |         elif m_i == 'liblinear':
 39 |             fin = open(in_i)
 40 |             fin.readline()
 41 |             for isample in xrange(n_sample):
 42 |                 preds = [float(v) for v in fin.readline().strip().split(' ')[1:]] 
 43 |                 for ipred, pred in enumerate(preds):
 44 |                     weights[isample][max_label+1+ipred] = pred
 45 |             i += 2
 46 |         elif m_i == 'semilda':
 47 |             lda_map = {}
 48 |             for line in open('../dataset/label_map_lda'):
 49 |                 label, c = line.strip().split('\t')
 50 |                 lda_map[int(c)] = label
 51 |             feat_map = {}
 52 |             fin = open('../trans_data/valid.simple1')
 53 |             for line in open(in_i):
 54 |                 preds = [float(v) for v in line.strip().split(' ')] 
 55 |                 tot = sum(preds) + 0.001
 56 |                 preds = [v/tot for v in preds]
 57 |                 feats = fin.readline().strip().split('\t')[1]
 58 |                 feat_map[feats] = preds
 59 |             fin.close()
 60 |             fin = open('../trans_data/cat.txt')
 61 |             for isample in xrange(n_sample):
 62 |                 feats = fin.readline().strip()
 63 |                 preds = feat_map[feats]
 64 |                 for ipred, pred in enumerate(preds):
 65 |                     ipred2 = label_map2[lda_map[ipred]]
 66 |                     weights[isample][(max_label+1)*2+ipred2] = pred / tot
 67 |             fin.close()
 68 |             i += 2
 69 |         elif m_i == 'sessionlabel':
 70 |             fin = open(in_i)
 71 |             for isample in xrange(n_sample):
 72 |               try:
 73 |                 pairs = [pair.split(':') for pair in fin.readline().strip().split(' || ')]
 74 |                 if pairs and pairs[0] and pairs[0][0]:
 75 |                   tot = sum([float(v[1]) for v in pairs])
 76 |                   if tot > 0:
 77 |                     for pair in pairs:
 78 |                         label, v = pair[0], float(pair[1])
 79 |                         label = ' | '.join(sorted(label.split(' | ')))
 80 |                         if label not in label_map2: label = label.split(' | ')[0]
 81 |                         c = label_map2[label]
 82 |                         weights[isample][(max_label+1)*3+c] = v / tot
 83 |               except:
 84 |                 print pairs
 85 |                 exit(1)
 86 |             i += 2
 87 | 
 88 | 
 89 |     with open(sys.argv[1], 'w') as fo:
 90 |         idx = 0
 91 |         for line in open(sys.argv[2]):
 92 |             arr = line.strip().split('\t')
 93 |             if len(arr) >= 2:
 94 |                 label = arr[1] 
 95 |                 label = ' | '.join(sorted(label.split(' | ')))
 96 |                 if label not in label_map2: label = label.split(' | ')[0]
 97 |                 label = label_map2[label]
 98 |             else:
 99 |                 label = 0
100 |             fo.write('%s %s\n' % (label, ' '.join(['%s:%s' % (i+1, pred) for i, pred in enumerate(weights[idx])])))
101 |             idx += 1
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/V2/prepare_ensemble_dog.py:
--------------------------------------------------------------------------------
  1 | n_sample = 5961
  2 | 
  3 | label_map = {}
  4 | label_map2 = {}
  5 | max_label = 0
  6 | for line in open('../dataset/label_map_dog'):
  7 |     label, c = line.strip().split('\t')
  8 |     label_map[int(c)] = label
  9 |     label_map2[label] = int(c)
 10 |     max_label = max(max_label, int(c))
 11 | print 'max_label:', max_label 
 12 | 
 13 | weights = []
 14 | for i in xrange(n_sample):
 15 |     weights.append([0]*((max_label+1)*8))
 16 | 
 17 | if __name__ == '__main__':
 18 |     import sys
 19 |     if len(sys.argv) < 4:
 20 |         print '<usage> out label in1 m1 [in2 m2 ...]'
 21 |         exit(1)
 22 | 
 23 |     i = 3
 24 |     while i < len(sys.argv):
 25 |         in_i = sys.argv[i]
 26 |         m_i = sys.argv[i+1]
 27 |         print in_i, m_i
 28 |         if m_i == 'xgboost':
 29 |             nclass = int(sys.argv[i+2])
 30 |             fin = open(in_i)
 31 |             for isample in xrange(n_sample):
 32 |                 for ipred in xrange(nclass):
 33 |                     pred = float(fin.readline().strip())
 34 |                     if ipred <= max_label:
 35 |                         weights[isample][ipred] = pred 
 36 |             fin.close()
 37 |             i += 3
 38 |         elif m_i == 'liblinear':
 39 |             fin = open(in_i)
 40 |             fin.readline()
 41 |             for isample in xrange(n_sample):
 42 |                 preds = [float(v) for v in fin.readline().strip().split(' ')[1:]] 
 43 |                 for ipred, pred in enumerate(preds):
 44 |                     weights[isample][max_label+1+ipred] = pred
 45 |             i += 2
 46 |         elif m_i == 'semilda':
 47 |             lda_map = {}
 48 |             for line in open('../dataset/label_map_lda'):
 49 |                 label, c = line.strip().split('\t')
 50 |                 lda_map[int(c)] = label
 51 |             feat_map = {}
 52 |             fin = open('../trans_data/valid.simple1')
 53 |             for line in open(in_i):
 54 |                 preds = [float(v) for v in line.strip().split(' ')] 
 55 |                 tot = sum(preds) + 0.001
 56 |                 preds = [v/tot for v in preds]
 57 |                 feats = fin.readline().strip().split('\t')[1]
 58 |                 feat_map[feats] = preds
 59 |             fin.close()
 60 |             fin = open('../trans_data/valid2.txt')
 61 |             for isample in xrange(n_sample):
 62 |                 feats = fin.readline().strip()
 63 |                 preds = feat_map[feats]
 64 |                 for ipred, pred in enumerate(preds):
 65 |                     ipred2 = label_map2[lda_map[ipred]]
 66 |                     weights[isample][(max_label+1)*2+ipred2] = pred / tot
 67 |             fin.close()
 68 |             i += 2
 69 |         elif m_i == 'sessionlabel':
 70 |             fin = open(in_i)
 71 |             for isample in xrange(n_sample):
 72 |               try:
 73 |                 pairs = [pair.split(':') for pair in fin.readline().strip().split(' || ')]
 74 |                 if pairs and pairs[0] and pairs[0][0]:
 75 |                   tot = sum([float(v[1]) for v in pairs])
 76 |                   if tot > 0:
 77 |                     for pair in pairs:
 78 |                         label, v = pair[0], float(pair[1])
 79 |                         label = ' | '.join(sorted(label.split(' | ')))
 80 |                         if label not in label_map2: label = label.split(' | ')[0]
 81 |                         c = label_map2[label]
 82 |                         weights[isample][(max_label+1)*3+c] = v / tot
 83 |               except:
 84 |                 print pairs
 85 |                 exit(1)
 86 |             i += 2
 87 | 
 88 | 
 89 |     with open(sys.argv[1], 'w') as fo:
 90 |         idx = 0
 91 |         for line in open(sys.argv[2]):
 92 |             arr = line.strip().split('\t')
 93 |             if len(arr) >= 2:
 94 |                 label = arr[1] 
 95 |                 label = ' | '.join(sorted(label.split(' | ')))
 96 |                 if label not in label_map2: label = label.split(' | ')[0]
 97 |                 label = label_map2[label]
 98 |             else:
 99 |                 label = 0
100 |             fo.write('%s %s\n' % (label, ' '.join(['%s:%s' % (i+1, pred) for i, pred in enumerate(weights[idx])])))
101 |             idx += 1
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/V2/prepare_ensemble_pig.py:
--------------------------------------------------------------------------------
  1 | n_sample = 39013
  2 | 
  3 | label_map = {}
  4 | label_map2 = {}
  5 | max_label = 0
  6 | for line in open('../dataset/label_map_dog'):
  7 |     label, c = line.strip().split('\t')
  8 |     label_map[int(c)] = label
  9 |     label_map2[label] = int(c)
 10 |     max_label = max(max_label, int(c))
 11 | print 'max_label:', max_label 
 12 | 
 13 | weights = []
 14 | for i in xrange(n_sample):
 15 |     weights.append([0]*((max_label+1)*8))
 16 | 
 17 | if __name__ == '__main__':
 18 |     import sys
 19 |     if len(sys.argv) < 4:
 20 |         print '<usage> out label in1 m1 [in2 m2 ...]'
 21 |         exit(1)
 22 | 
 23 |     i = 3
 24 |     while i < len(sys.argv):
 25 |         in_i = sys.argv[i]
 26 |         m_i = sys.argv[i+1]
 27 |         print in_i, m_i
 28 |         if m_i == 'xgboost':
 29 |             nclass = int(sys.argv[i+2])
 30 |             fin = open(in_i)
 31 |             for isample in xrange(n_sample):
 32 |                 for ipred in xrange(nclass):
 33 |                     pred = float(fin.readline().strip())
 34 |                     if ipred <= max_label:
 35 |                         weights[isample][ipred] = pred 
 36 |             fin.close()
 37 |             i += 3
 38 |         elif m_i == 'liblinear':
 39 |             fin = open(in_i)
 40 |             fin.readline()
 41 |             for isample in xrange(n_sample):
 42 |                 preds = [float(v) for v in fin.readline().strip().split(' ')[1:]] 
 43 |                 for ipred, pred in enumerate(preds):
 44 |                     weights[isample][max_label+1+ipred] = pred
 45 |             i += 2
 46 |         elif m_i == 'semilda':
 47 |             lda_map = {}
 48 |             for line in open('../dataset/label_map_lda'):
 49 |                 label, c = line.strip().split('\t')
 50 |                 lda_map[int(c)] = label
 51 |             feat_map = {}
 52 |             fin = open('../trans_data/test.simple1')
 53 |             for line in open(in_i):
 54 |                 preds = [float(v) for v in line.strip().split(' ')] 
 55 |                 tot = sum(preds) + 0.001
 56 |                 preds = [v/tot for v in preds]
 57 |                 feats = fin.readline().strip().split('\t')[1]
 58 |                 feat_map[feats] = preds
 59 |             fin.close()
 60 |             fin = open('../raw_data/test.txt')
 61 |             for isample in xrange(n_sample):
 62 |                 feats = fin.readline().strip()
 63 |                 preds = feat_map[feats]
 64 |                 for ipred, pred in enumerate(preds):
 65 |                     ipred2 = label_map2[lda_map[ipred]]
 66 |                     weights[isample][(max_label+1)*2+ipred2] = pred / tot
 67 |             fin.close()
 68 |             i += 2
 69 |         elif m_i == 'sessionlabel':
 70 |             fin = open(in_i)
 71 |             for isample in xrange(n_sample):
 72 |               try:
 73 |                 pairs = [pair.split(':') for pair in fin.readline().strip().split(' || ')]
 74 |                 if pairs and pairs[0] and pairs[0][0]:
 75 |                   tot = sum([float(v[1]) for v in pairs])
 76 |                   if tot > 0:
 77 |                     for pair in pairs:
 78 |                         label, v = pair[0], float(pair[1])
 79 |                         label = ' | '.join(sorted(label.split(' | ')))
 80 |                         if label not in label_map2: label = label.split(' | ')[0]
 81 |                         c = label_map2[label]
 82 |                         weights[isample][(max_label+1)*3+c] = v / tot
 83 |               except:
 84 |                 print pairs
 85 |                 exit(1)
 86 |             i += 2
 87 | 
 88 | 
 89 |     with open(sys.argv[1], 'w') as fo:
 90 |         idx = 0
 91 |         for line in open(sys.argv[2]):
 92 |             arr = line.strip().split('\t')
 93 |             if len(arr) >= 2:
 94 |                 label = arr[1] 
 95 |                 label = ' | '.join(sorted(label.split(' | ')))
 96 |                 if label not in label_map2: label = label.split(' | ')[0]
 97 |                 label = label_map2[label]
 98 |             else:
 99 |                 label = 0
100 |             fo.write('%s %s\n' % (label, ' '.join(['%s:%s' % (i+1, pred) for i, pred in enumerate(weights[idx])])))
101 |             idx += 1
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/V2/prepare_feature_dog1.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_dog_train_feat = '../trans_data/dog.simple1'
  3 | p_dog_valid_feat = '../trans_data/valid.simple1'
  4 | p_dog_valid_id = '../trans_data/valid.txt'
  5 | 
  6 | p_pig_train_feat = '../trans_data/train.simple1'
  7 | p_pig_valid_feat = '../trans_data/test.simple1'
  8 | p_pig_valid_id = '../raw_data/test.txt'
  9 | 
 10 | p_label_map = '../dataset/label_map_dog'
 11 | p_dog_train = '../dataset/dog_train'
 12 | p_dog_test = '../dataset/dog_test'
 13 | p_pig_train = '../dataset/pig_train'
 14 | p_pig_test = '../dataset/pig_test'
 15 | 
 16 | min_word_df = 5
 17 | min_title_df = 10
 18 | 
 19 | label_map = {}
 20 | cur_label = 0
 21 | word_map = {}
 22 | cur_word = 1 
 23 | 
 24 | pig_train = []
 25 | test_dict = {}
 26 | pig_test = [] 
 27 | 
 28 | label_df = {}
 29 | word_df = {}
 30 | 
 31 | def get_df(p_in):
 32 |   for line in open(p_in):
 33 |     row = line.strip().split('\t')
 34 |     label = row[0]
 35 |     label = ' | '.join(sorted(label.split(' | ')))
 36 |     label_df[label] = label_df.get(label, 0) + 1
 37 |     query = row[1]
 38 |     titles = row[2] if len(row)>=3 else ''
 39 |     session_queries = row[5] if len(row)>=6 else ''
 40 |     session_titles = row[6] if len(row)>=7 else ''
 41 | 
 42 |     feat_list = query.split(' ')
 43 |     for i, word in enumerate(feat_list):
 44 |         if not word: continue
 45 |         word_df[word] = word_df.get(word, 0) + 1
 46 |         if i>=1:
 47 |             word = ' '.join(feat_list[i-1:i+1])
 48 |             word_df[word] = word_df.get(word, 0) + 1
 49 |         word = '%s_%s' % (i, feat_list[i])
 50 |         word_df[word] = word_df.get(word, 0) + 1
 51 |         if i >= len(feat_list)/2:
 52 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
 53 |             word_df[word] = word_df.get(word, 0) + 1
 54 | 
 55 |     for pair in titles.split(';'):
 56 |         if not pair: continue
 57 |         title, freq = pair.split(':')
 58 |         feat_list = title.split(' ')
 59 |         for i, word in enumerate(feat_list):
 60 |             if not word: continue
 61 |             word = 't_' + word
 62 |             word_df[word] = word_df.get(word, 0) + 1
 63 |             if i>=1:
 64 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
 65 |                 word_df[word] = word_df.get(word, 0) + 1
 66 | 
 67 |     for pair in session_queries.split(';'):
 68 |         if not pair: continue
 69 |         title, freq = pair.split(':')
 70 |         feat_list = title.split(' ')
 71 |         for i, word in enumerate(feat_list):
 72 |             if not word: continue
 73 |             word = 'sq_' + word
 74 |             word_df[word] = word_df.get(word, 0) + 1
 75 |             if i>=1:
 76 |                 word = 'sq_' + ' '.join(feat_list[i-1:i+1])
 77 |                 word_df[word] = word_df.get(word, 0) + 1
 78 | 
 79 |     for pair in session_titles.split(';'):
 80 |         if not pair: continue
 81 |         title, freq = pair.split(':')
 82 |         feat_list = title.split(' ')
 83 |         for i, word in enumerate(feat_list):
 84 |             if not word: continue
 85 |             word = 'st_' + word
 86 |             word_df[word] = word_df.get(word, 0) + 1
 87 |             if i>=1:
 88 |                 word = 'st_' + ' '.join(feat_list[i-1:i+1])
 89 |                 word_df[word] = word_df.get(word, 0) + 1
 90 | 
 91 | def prepare(p_in, p_out, isTrain, p_in2):
 92 |   global cur_label
 93 |   global cur_word
 94 | 
 95 |   if isTrain: fo = open(p_out, 'w')
 96 |   fin2 = open(p_in2)
 97 | 
 98 |   for line in open(p_in):
 99 |     row = line.strip().split('\t')
100 |     label = row[0]
101 |     query = row[1]
102 |     titles = row[2] if len(row)>=3 else ''
103 |     labels = row[3] if len(row)>=4 else ''
104 |     session_queries = row[5] if len(row)>=6 else ''
105 |     session_titles = row[6] if len(row)>=7 else ''
106 | 
107 |     row2 = fin2.readline().split('\t')
108 |     if row2[0] == query:
109 |         stats = row2[1]
110 |         stats2 = row2[2].strip()
111 |     else:
112 |         print 'query mismatch'
113 |         exit(1)
114 | 
115 |     if isTrain:
116 |         label = ' | '.join(sorted(label.split(' | ')))
117 |         if label_df[label] < 200:
118 |             label = label.split(' ')[0]
119 |         if label not in label_map:
120 |             label_map[label] = cur_label
121 |             cur_label += 1
122 | 
123 |     feat_list = query.split(' ')
124 |     word_tf = {}
125 |     for i, word in enumerate(feat_list):
126 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
127 |             word_map[word] = cur_word
128 |             cur_word += 1
129 |         if word in word_map:
130 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
131 |         if i>=1:
132 |             word = ' '.join(feat_list[i-1:i+1])
133 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
134 |                 word_map[word] = cur_word
135 |                 cur_word += 1
136 |             if word in word_map:
137 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
138 |         word = '%s_%s' % (i, feat_list[i])
139 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
140 |             word_map[word] = cur_word
141 |             cur_word += 1
142 |         if word in word_map:
143 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
144 |         if i >= len(feat_list) / 2:
145 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
146 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
147 |                 word_map[word] = cur_word
148 |                 cur_word += 1
149 |             if word in word_map:
150 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
151 | 
152 |     tot_freq = 0
153 |     for pair in titles.split(';'):
154 |         if not pair: continue
155 |         title, freq = pair.split(':')
156 |         tot_freq += float(freq)
157 |     word_tf2 = {}
158 |     for pair in titles.split(';'):
159 |         if not pair: continue
160 |         title, freq = pair.split(':')
161 |         freq = float(freq)
162 |         feat_list = title.split(' ')
163 |         for i, word in enumerate(feat_list):
164 |             word = 't_' + word
165 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
166 |                 word_map[word] = cur_word
167 |                 cur_word += 1
168 |             if word in word_map:
169 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
170 |             word = feat_list[i]
171 |             '''
172 |             if word in word_map and word_map[word] in word_tf:
173 |                 word = 'qt_' + word
174 |                 if isTrain and word not in word_map:
175 |                     word_map[word] = cur_word
176 |                     cur_word += 1
177 |                 if word in word_map:
178 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
179 |             '''
180 |             if i>=1:
181 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
182 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
183 |                     word_map[word] = cur_word
184 |                     cur_word += 1
185 |                 if word in word_map:
186 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
187 | 
188 |     tot_freq = 0
189 |     for pair in session_queries.split(';'):
190 |         if not pair: continue
191 |         title, freq = pair.split(':')
192 |         tot_freq += float(freq)
193 |     for pair in session_queries.split(';'):
194 |         if not pair: continue
195 |         title, freq = pair.split(':')
196 |         freq = float(freq)
197 |         feat_list = title.split(' ')
198 |         for i, word in enumerate(feat_list):
199 |             word = 'sq_' + word
200 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
201 |                 word_map[word] = cur_word
202 |                 cur_word += 1
203 |             if word in word_map:
204 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
205 |             '''
206 |             if i>=1:
207 |                 word = 'sq_' + ' '.join(feat_list[i-1:i+1])
208 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
209 |                     word_map[word] = cur_word
210 |                     cur_word += 1
211 |                 if word in word_map:
212 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
213 |             '''
214 | 
215 |     tot_freq = 0
216 |     for pair in session_titles.split(';'):
217 |         if not pair: continue
218 |         title, freq = pair.split(':')
219 |         tot_freq += float(freq)
220 |     for pair in session_titles.split(';'):
221 |         if not pair: continue
222 |         title, freq = pair.split(':')
223 |         freq = float(freq)
224 |         feat_list = title.split(' ')
225 |         for i, word in enumerate(feat_list):
226 |             word = 'st_' + word
227 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
228 |                 word_map[word] = cur_word
229 |                 cur_word += 1
230 |             if word in word_map:
231 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
232 |             if i>=1:
233 |                 word = 'st_' + ' '.join(feat_list[i-1:i+1])
234 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
235 |                     word_map[word] = cur_word
236 |                     cur_word += 1
237 |                 if word in word_map:
238 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
239 | 
240 |     for pair in labels.split(';'):
241 |         if not pair: continue
242 |         word, freq = pair.split(':')
243 |         freq = float(freq)
244 |         if isTrain and word not in word_map:
245 |             word_map[word] = cur_word
246 |             cur_word += 1
247 |         if word in word_map:
248 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
249 | 
250 |     for pair in stats.split(';') + stats2.split(';'):
251 |         if not pair: continue
252 |         word, freq = pair.split(':')
253 |         freq = float(freq)
254 |         if isTrain and word not in word_map:
255 |             word_map[word] = cur_word
256 |             cur_word += 1
257 |         if word in word_map:
258 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
259 | 
260 |     if isTrain:
261 |         label, F = (label_map[label], word_tf.items() + word_tf2.items())
262 |         F = sorted(F, key=lambda d:d[0])
263 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
264 |         fo.write('%s %s\n' % (label, f_str))
265 |     else:
266 |         test_dict[query] = word_tf.items() + word_tf2.items()
267 | 
268 |   if isTrain: fo.close()
269 |   fin2.close()
270 | 
271 |   # save label map
272 |   with open(p_label_map, 'w') as fo:
273 |     for label in label_map:
274 |         fo.write('%s\t%s\n' % (label, label_map[label]))
275 | 
276 | 
277 | def save_test(p_in, p_out):
278 |   with open(p_out, 'w') as fo:
279 |     for line in open(p_in):
280 |         query = line.strip()
281 |         F = test_dict[query]
282 |         F = sorted(F, key=lambda d:d[0])
283 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
284 |         fo.write('%s %s\n' % (0, f_str))
285 |         
286 | get_df(p_dog_train_feat)
287 | prepare(p_dog_train_feat, p_dog_train, True, '../trans_data/dog.simple2')
288 | prepare(p_dog_valid_feat, '', False, '../trans_data/valid.simple2')
289 | save_test('../trans_data/cat.txt', '../dataset/cat_test')
290 | save_test('../trans_data/valid2.txt', '../dataset/dog_test')
291 | 
292 | 


--------------------------------------------------------------------------------
/V2/prepare_feature_pig1.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_dog_train_feat = '../trans_data/dog.simple1'
  3 | p_dog_valid_feat = '../trans_data/valid.simple1'
  4 | p_dog_valid_id = '../trans_data/valid.txt'
  5 | 
  6 | p_pig_train_feat = '../trans_data/train.simple1'
  7 | p_pig_valid_feat = '../trans_data/test.simple1'
  8 | p_pig_valid_id = '../raw_data/test.txt'
  9 | 
 10 | p_label_map = '../dataset/label_map_pig'
 11 | p_dog_train = '../dataset/dog_train'
 12 | p_dog_test = '../dataset/dog_test'
 13 | p_pig_train = '../dataset/pig_train'
 14 | p_pig_test = '../dataset/pig_test'
 15 | 
 16 | min_word_df = 5
 17 | min_title_df = 10
 18 | 
 19 | label_map = {}
 20 | cur_label = 0
 21 | word_map = {}
 22 | cur_word = 1 
 23 | 
 24 | pig_train = []
 25 | test_dict = {}
 26 | pig_test = [] 
 27 | 
 28 | label_df = {}
 29 | word_df = {}
 30 | 
 31 | def get_df(p_in):
 32 |   for line in open(p_in):
 33 |     row = line.strip().split('\t')
 34 |     label = row[0]
 35 |     label = ' | '.join(sorted(label.split(' | ')))
 36 |     label_df[label] = label_df.get(label, 0) + 1
 37 |     query = row[1]
 38 |     titles = row[2] if len(row)>=3 else ''
 39 |     session_queries = row[5] if len(row)>=6 else ''
 40 |     session_titles = row[6] if len(row)>=7 else ''
 41 | 
 42 |     feat_list = query.split(' ')
 43 |     for i, word in enumerate(feat_list):
 44 |         if not word: continue
 45 |         word_df[word] = word_df.get(word, 0) + 1
 46 |         if i>=1:
 47 |             word = ' '.join(feat_list[i-1:i+1])
 48 |             word_df[word] = word_df.get(word, 0) + 1
 49 |         word = '%s_%s' % (i, feat_list[i])
 50 |         word_df[word] = word_df.get(word, 0) + 1
 51 |         if i >= len(feat_list)/2:
 52 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
 53 |             word_df[word] = word_df.get(word, 0) + 1
 54 | 
 55 |     for pair in titles.split(';'):
 56 |         if not pair: continue
 57 |         title, freq = pair.split(':')
 58 |         feat_list = title.split(' ')
 59 |         for i, word in enumerate(feat_list):
 60 |             if not word: continue
 61 |             word = 't_' + word
 62 |             word_df[word] = word_df.get(word, 0) + 1
 63 |             if i>=1:
 64 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
 65 |                 word_df[word] = word_df.get(word, 0) + 1
 66 | 
 67 |     for pair in session_queries.split(';'):
 68 |         if not pair: continue
 69 |         title, freq = pair.split(':')
 70 |         feat_list = title.split(' ')
 71 |         for i, word in enumerate(feat_list):
 72 |             if not word: continue
 73 |             word = 'sq_' + word
 74 |             word_df[word] = word_df.get(word, 0) + 1
 75 |             if i>=1:
 76 |                 word = 'sq_' + ' '.join(feat_list[i-1:i+1])
 77 |                 word_df[word] = word_df.get(word, 0) + 1
 78 | 
 79 |     for pair in session_titles.split(';'):
 80 |         if not pair: continue
 81 |         title, freq = pair.split(':')
 82 |         feat_list = title.split(' ')
 83 |         for i, word in enumerate(feat_list):
 84 |             if not word: continue
 85 |             word = 'st_' + word
 86 |             word_df[word] = word_df.get(word, 0) + 1
 87 |             if i>=1:
 88 |                 word = 'st_' + ' '.join(feat_list[i-1:i+1])
 89 |                 word_df[word] = word_df.get(word, 0) + 1
 90 | 
 91 | def prepare(p_in, p_out, isTrain, p_in2):
 92 |   global cur_label
 93 |   global cur_word
 94 | 
 95 |   if isTrain: fo = open(p_out, 'w')
 96 |   fin2 = open(p_in2)
 97 | 
 98 |   for line in open(p_in):
 99 |     row = line.strip().split('\t')
100 |     label = row[0]
101 |     query = row[1]
102 |     titles = row[2] if len(row)>=3 else ''
103 |     labels = row[3] if len(row)>=4 else ''
104 |     session_queries = row[5] if len(row)>=6 else ''
105 |     session_titles = row[6] if len(row)>=7 else ''
106 | 
107 |     row2 = fin2.readline().split('\t')
108 |     if row2[0] == query:
109 |         stats = row2[1]
110 |         stats2 = row2[2].strip()
111 |     else:
112 |         print 'query mismatch'
113 |         exit(1)
114 | 
115 |     if isTrain:
116 |         label = ' | '.join(sorted(label.split(' | ')))
117 |         if label_df[label] < 200:
118 |             label = label.split(' ')[0]
119 |         if label not in label_map:
120 |             label_map[label] = cur_label
121 |             cur_label += 1
122 | 
123 |     feat_list = query.split(' ')
124 |     word_tf = {}
125 |     for i, word in enumerate(feat_list):
126 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
127 |             word_map[word] = cur_word
128 |             cur_word += 1
129 |         if word in word_map:
130 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
131 |         if i>=1:
132 |             word = ' '.join(feat_list[i-1:i+1])
133 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
134 |                 word_map[word] = cur_word
135 |                 cur_word += 1
136 |             if word in word_map:
137 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
138 |         word = '%s_%s' % (i, feat_list[i])
139 |         if isTrain and word_df[word] >= min_word_df and word not in word_map:
140 |             word_map[word] = cur_word
141 |             cur_word += 1
142 |         if word in word_map:
143 |             word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
144 |         if i >= len(feat_list) / 2:
145 |             word = '%s_%s' % (i-len(feat_list), feat_list[i])
146 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
147 |                 word_map[word] = cur_word
148 |                 cur_word += 1
149 |             if word in word_map:
150 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
151 | 
152 |     tot_freq = 0
153 |     for pair in titles.split(';'):
154 |         if not pair: continue
155 |         title, freq = pair.split(':')
156 |         tot_freq += float(freq)
157 |     word_tf2 = {}
158 |     for pair in titles.split(';'):
159 |         if not pair: continue
160 |         title, freq = pair.split(':')
161 |         freq = float(freq)
162 |         feat_list = title.split(' ')
163 |         for i, word in enumerate(feat_list):
164 |             word = 't_' + word
165 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
166 |                 word_map[word] = cur_word
167 |                 cur_word += 1
168 |             if word in word_map:
169 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
170 |             '''
171 |             word = feat_list[i]
172 |             if word in word_map and word_map[word] in word_tf:
173 |                 word = 'qt_' + word
174 |                 if isTrain and word not in word_map:
175 |                     word_map[word] = cur_word
176 |                     cur_word += 1
177 |                 if word in word_map:
178 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
179 |             '''
180 |             if i>=1:
181 |                 word = 't_' + ' '.join(feat_list[i-1:i+1])
182 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
183 |                     word_map[word] = cur_word
184 |                     cur_word += 1
185 |                 if word in word_map:
186 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
187 | 
188 |     tot_freq = 0
189 |     for pair in session_queries.split(';'):
190 |         if not pair: continue
191 |         title, freq = pair.split(':')
192 |         tot_freq += float(freq)
193 |     for pair in session_queries.split(';'):
194 |         if not pair: continue
195 |         title, freq = pair.split(':')
196 |         freq = float(freq)
197 |         feat_list = title.split(' ')
198 |         for i, word in enumerate(feat_list):
199 |             word = 'sq_' + word
200 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
201 |                 word_map[word] = cur_word
202 |                 cur_word += 1
203 |             if word in word_map:
204 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
205 |             '''
206 |             if i>=1:
207 |                 word = 'sq_' + ' '.join(feat_list[i-1:i+1])
208 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
209 |                     word_map[word] = cur_word
210 |                     cur_word += 1
211 |                 if word in word_map:
212 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
213 |             '''
214 | 
215 |     tot_freq = 0
216 |     for pair in session_titles.split(';'):
217 |         if not pair: continue
218 |         title, freq = pair.split(':')
219 |         tot_freq += float(freq)
220 |     for pair in session_titles.split(';'):
221 |         if not pair: continue
222 |         title, freq = pair.split(':')
223 |         freq = float(freq)
224 |         feat_list = title.split(' ')
225 |         for i, word in enumerate(feat_list):
226 |             word = 'st_' + word
227 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
228 |                 word_map[word] = cur_word
229 |                 cur_word += 1
230 |             if word in word_map:
231 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
232 |             if i>=1:
233 |                 word = 'st_' + ' '.join(feat_list[i-1:i+1])
234 |                 if isTrain and word_df[word] >= min_title_df and word not in word_map:
235 |                     word_map[word] = cur_word
236 |                     cur_word += 1
237 |                 if word in word_map:
238 |                     word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
239 | 
240 |     for pair in labels.split(';'):
241 |         if not pair: continue
242 |         word, freq = pair.split(':')
243 |         freq = float(freq)
244 |         if isTrain and word not in word_map:
245 |             word_map[word] = cur_word
246 |             cur_word += 1
247 |         if word in word_map:
248 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
249 | 
250 |     for pair in stats.split(';') + stats2.split(';'):
251 |         if not pair: continue
252 |         word, freq = pair.split(':')
253 |         freq = float(freq)
254 |         if isTrain and word not in word_map:
255 |             word_map[word] = cur_word
256 |             cur_word += 1
257 |         if word in word_map:
258 |             word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + freq
259 | 
260 |     if isTrain:
261 |         label, F = (label_map[label], word_tf.items() + word_tf2.items())
262 |         F = sorted(F, key=lambda d:d[0])
263 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
264 |         fo.write('%s %s\n' % (label, f_str))
265 |     else:
266 |         test_dict[query] = word_tf.items() + word_tf2.items()
267 | 
268 |   if isTrain: fo.close()
269 |   fin2.close()
270 | 
271 |   # save label map
272 |   with open(p_label_map, 'w') as fo:
273 |     for label in label_map:
274 |         fo.write('%s\t%s\n' % (label, label_map[label]))
275 | 
276 | 
277 | def save_test(p_in, p_out):
278 |   with open(p_out, 'w') as fo:
279 |     for line in open(p_in):
280 |         query = line.strip()
281 |         F = test_dict[query]
282 |         F = sorted(F, key=lambda d:d[0])
283 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
284 |         fo.write('%s %s\n' % (0, f_str))
285 |         
286 | get_df(p_pig_train_feat)
287 | prepare(p_pig_train_feat, p_pig_train, True, '../trans_data/train.simple2')
288 | prepare(p_pig_valid_feat, '', False, '../trans_data/test.simple2')
289 | save_test(p_pig_valid_id, p_pig_test)
290 | 
291 | 


--------------------------------------------------------------------------------
/V2/prepare_lda_test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # coding: utf-8
 3 | # @author: zuotaoliu@126.com
 4 | # @created: 2014-08-29
 5 | import os
 6 | import sys
 7 | import re
 8 | 
 9 | def do_word_index(p_in, p_out):
10 |     label_map = {}
11 |     cur_label = 0
12 |     word_count = {}
13 |     cur_idx = 0
14 |     fo = open(p_out, 'w')
15 |     for line in open(p_in):
16 |         row = line.rstrip().split('\t')
17 | 
18 |         if len(row) >= 3:
19 |             feats = row[1] + ':1;' + ';'.join(row[2].split(';')[:5])
20 |         else:
21 |             feats = row[1] + ':1'
22 | 
23 |         labels = row[0].split(' | ')
24 |         cids = []
25 |         for label in labels:
26 |             if label not in label_map: 
27 |                 label_map[label] = cur_label
28 |                 cur_label += 1
29 |             cids.append(str(label_map[label]))
30 |         wc = {}
31 |         for pair in feats.split(';'):
32 |             if not pair: continue
33 |             words, freq = pair.split(':')
34 |             freq = min(1, int(freq))
35 |             for word in words.split(' '):
36 |                 if not word: continue
37 |                 wc[word] = wc.get(word, 0) + freq
38 |                 word_count[word] = word_count.get(word, 0) + freq
39 |         fo.write('%s\n' % (' '.join(['%s %s' % (k, v) for k, v in wc.items()]))) 
40 |     fo.close()
41 |         
42 |     return word_count
43 |         
44 | if __name__ == "__main__":
45 |     if len(sys.argv) < 3:
46 |         print '<usage> inputfile outputfile'
47 |         exit(-1)
48 |     word_count = do_word_index(sys.argv[1], sys.argv[2])
49 | 
50 | 
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/V2/prepare_lda_train.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # coding: utf-8
 3 | # @author: zuotaoliu@126.com
 4 | # @created: 2014-08-29
 5 | import os
 6 | import sys
 7 | import re
 8 | 
 9 | def do_word_index(p_in, p_out):
10 |     label_map = {}
11 |     cur_label = 0
12 |     word_count = {}
13 |     cur_idx = 0
14 |     fo = open(p_out, 'w')
15 |     for line in open(p_in):
16 |         row = line.rstrip().split('\t')
17 | 
18 |         if len(row) >= 3:
19 |             feats = row[1] + ':1;' + ';'.join(row[2].split(';')[:5])
20 |         else:
21 |             feats = row[1] + ':1'
22 | 
23 |         labels = row[0].split(' | ')
24 |         cids = []
25 |         for label in labels:
26 |             if label not in label_map: 
27 |                 label_map[label] = cur_label
28 |                 cur_label += 1
29 |             cids.append(str(label_map[label]))
30 |         wc = {}
31 |         for pair in feats.split(';'):
32 |             if not pair: continue
33 |             words, freq = pair.split(':')
34 |             freq = min(1, int(freq))
35 |             for word in words.split(' '):
36 |                 if not word: continue
37 |                 wc[word] = wc.get(word, 0) + freq
38 |                 word_count[word] = word_count.get(word, 0) + freq
39 |         fo.write('[%s] %s\n' % (' '.join(cids), ' '.join(['%s %s' % (k, v) for k, v in wc.items()]))) 
40 |     fo.close()
41 |     with open('../dataset/label_map_lda', 'w') as fo:
42 |         for label in label_map:
43 |             fo.write('%s\t%s\n' % (label, label_map[label]))
44 |         
45 |     return word_count
46 |         
47 | if __name__ == "__main__":
48 |     if len(sys.argv) < 4:
49 |         print '<usage> inputfile outputfile wordindex'
50 |         exit(-1)
51 |     word_count = do_word_index(sys.argv[1], sys.argv[2])
52 | 
53 |     sort_list = sorted(word_count.items(), key=lambda d:d[1], reverse=True)
54 |     with open(sys.argv[3], 'w') as fo:
55 |         for id, pair in enumerate(sort_list):
56 |             word, num = pair
57 |             if num >= 5:
58 |                 fo.write('%s %s\n' % (id, word))
59 | 
60 | 
61 | 
62 | 


--------------------------------------------------------------------------------
/V2/prepare_session.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_dog_train_feat = '../trans_data/dog.simple5'
  3 | p_dog_valid_feat = '../trans_data/valid.simple5'
  4 | p_dog_valid_id = '../trans_data/valid.txt'
  5 | 
  6 | p_pig_train_feat = '../trans_data/pig.simple5'
  7 | p_pig_valid_feat = '../trans_data/test.simple5'
  8 | p_pig_valid_id = '../raw_data/test.txt'
  9 | 
 10 | p_label_map = '../dataset/label_map_session'
 11 | p_dog_train = '../dataset/dog_train_session'
 12 | p_dog_test = '../dataset/dog_test_session'
 13 | p_pig_train = '../dataset/pig_train_session'
 14 | p_pig_test = '../dataset/pig_test_session'
 15 | 
 16 | min_word_df = 5
 17 | min_title_df = 10
 18 | 
 19 | label_map = {}
 20 | cur_label = 0
 21 | word_map = {}
 22 | cur_word = 1 
 23 | 
 24 | pig_train = []
 25 | test_dict = {}
 26 | pig_test = [] 
 27 | 
 28 | label_df = {}
 29 | word_df = {}
 30 | 
 31 | def get_df(p_in):
 32 |   for line in open(p_in):
 33 |     row = line.strip().split('\t')
 34 |     label = row[0]
 35 |     label = ' | '.join(sorted(label.split(' | ')))
 36 |     label_df[label] = label_df.get(label, 0) + 1
 37 |     queries = row[1]
 38 |     titles = row[2] if len(row)>=3 else ''
 39 | 
 40 |     for query in queries.split(';'):
 41 |         if not query: continue
 42 |         feat_list = query.split(' ')
 43 |         for i, word in enumerate(feat_list):
 44 |             if not word: continue
 45 |             word_df[word] = word_df.get(word, 0) + 1
 46 |             #if i>=1:
 47 |             #    word = ' '.join(feat_list[i-1:i+1])
 48 |             #    word_df[word] = word_df.get(word, 0) + 1
 49 |             #word = '%s_%s' % (i, feat_list[i])
 50 |             #word_df[word] = word_df.get(word, 0) + 1
 51 |             #if i >= len(feat_list)/2:
 52 |             #    word = '%s_%s' % (i-len(feat_list), feat_list[i])
 53 |             #    word_df[word] = word_df.get(word, 0) + 1
 54 | 
 55 |     for title in titles.split(';'):
 56 |         if not title: continue
 57 |         feat_list = title.split(' ')
 58 |         for i, word in enumerate(feat_list):
 59 |             if not word: continue
 60 |             word = 't_' + word
 61 |             word_df[word] = word_df.get(word, 0) + 1
 62 |             #if i>=1:
 63 |             #    word = 't_' + ' '.join(feat_list[i-1:i+1])
 64 |             #    word_df[word] = word_df.get(word, 0) + 1
 65 | 
 66 | 
 67 | def prepare(p_in, p_out, isTrain):
 68 |   global cur_label
 69 |   global cur_word
 70 | 
 71 |   fo = open(p_out, 'w')
 72 | 
 73 |   for line in open(p_in):
 74 |     row = line.strip().split('\t')
 75 |     label = row[0]
 76 |     queries = row[1]
 77 |     titles = row[2] if len(row)>=3 else ''
 78 | 
 79 |     if isTrain:
 80 |         label = ' | '.join(sorted(label.split(' | ')))
 81 |         if label_df[label] < 50:
 82 |             label = label.split(' ')[0]
 83 |         if label not in label_map:
 84 |             label_map[label] = cur_label
 85 |             cur_label += 1
 86 | 
 87 |     word_tf = {}
 88 |     query_list = queries.split(';')
 89 |     for query in query_list:
 90 |         if not query: continue
 91 |         feat_list = query.split(' ')
 92 |         for i, word in enumerate(feat_list):
 93 |             if not word: continue
 94 |             if isTrain and word_df[word] >= min_word_df and word not in word_map:
 95 |                 word_map[word] = cur_word
 96 |                 cur_word += 1
 97 |             if word in word_map:
 98 |                 word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)*1./len(query_list)
 99 |             #if i>=1:
100 |             #    word = ' '.join(feat_list[i-1:i+1])
101 |             #    if isTrain and word_df[word] >= min_word_df and word not in word_map:
102 |             #        word_map[word] = cur_word
103 |             #        cur_word += 1
104 |             #    if word in word_map:
105 |             #        word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
106 |             #word = '%s_%s' % (i, feat_list[i])
107 |             #if isTrain and word_df[word] >= min_word_df and word not in word_map:
108 |             #    word_map[word] = cur_word
109 |             #    cur_word += 1
110 |             #if word in word_map:
111 |             #    word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
112 |             #if i >= len(feat_list) / 2:
113 |             #    word = '%s_%s' % (i-len(feat_list), feat_list[i])
114 |             #    if isTrain and word_df[word] >= min_word_df and word not in word_map:
115 |             #        word_map[word] = cur_word
116 |             #        cur_word += 1
117 |             #    if word in word_map:
118 |             #        word_tf[word_map[word]] = word_tf.get(word_map[word], 0) + 1./len(feat_list)
119 | 
120 |     word_tf2 = {}
121 |     title_list = titles.split(';')
122 |     for title in title_list:
123 |         if not title: continue
124 |         feat_list = title.split(' ')
125 |         for i, word in enumerate(feat_list):
126 |             word = 't_' + word
127 |             if isTrain and word_df[word] >= min_title_df and word not in word_map:
128 |                 word_map[word] = cur_word
129 |                 cur_word += 1
130 |             if word in word_map:
131 |                 word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*1./len(title_list)
132 |             #if i>=1:
133 |             #    word = 't_' + ' '.join(feat_list[i-1:i+1])
134 |             #    if isTrain and word_df[word] >= min_title_df and word not in word_map:
135 |             #        word_map[word] = cur_word
136 |             #        cur_word += 1
137 |             #    if word in word_map:
138 |             #        word_tf2[word_map[word]] = word_tf2.get(word_map[word], 0) + 1./len(feat_list)*freq/tot_freq
139 | 
140 | 
141 |     if isTrain:
142 |         label, F = (label_map[label], word_tf.items() + word_tf2.items())
143 |         F = sorted(F, key=lambda d:d[0])
144 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
145 |         fo.write('%s %s\n' % (label, f_str))
146 |     else:
147 |         label, F = 0, word_tf.items() + word_tf2.items()
148 |         F = sorted(F, key=lambda d:d[0])
149 |         f_str = ' '.join(['%s:%s' % (k, v) for k, v in F])
150 |         fo.write('%s %s\n' % (label, f_str))
151 | 
152 |   if isTrain: fo.close()
153 | 
154 |   # save label map
155 |   with open(p_label_map, 'w') as fo:
156 |     for label in label_map:
157 |         fo.write('%s\t%s\n' % (label, label_map[label]))
158 | 
159 | 
160 | #for dog
161 | #get_df(p_dog_train_feat)
162 | #prepare(p_dog_train_feat, p_dog_train, True)
163 | #for pig
164 | get_df(p_pig_train_feat)
165 | prepare(p_pig_train_feat, p_pig_train, True)
166 | 
167 | #prepare(p_dog_valid_feat, p_dog_test, False)
168 | prepare(p_pig_valid_feat, p_pig_test, False)
169 | 
170 | 


--------------------------------------------------------------------------------
/V2/refine_train_by_sesson_query.py:
--------------------------------------------------------------------------------
 1 | 
 2 | p_train = '../raw_data/train.txt'
 3 | p_test = '../raw_data/test.txt'
 4 | p_dog = '../trans_data/dog.txt'
 5 | p_valid = '../trans_data/valid.txt'
 6 | 
 7 | def refine(p_in, p_out):
 8 |     with open(p_out, 'w') as fo:
 9 |         last_query = None
10 |         has_known = False
11 |         session_lines = []
12 |         for line in open(p_in):
13 |             if not line.strip():
14 |                 fo.write('\n')
15 |                 if has_known:
16 |                     for l in session_lines:
17 |                         fo.write(l)
18 |                 last_query = None
19 |                 has_known = False
20 |                 session_lines = []
21 |             else:
22 |                 label, query = line.strip().split('\t')[:2]
23 |                 query_set = set(query.split(' '))
24 |                 if not last_query or (last_query & query_set):
25 |                     session_lines.append(line)
26 |                     if label != 'CLASS=UNKNOWN':
27 |                         has_known = True
28 |                     last_query = query_set
29 |                 else:
30 |                     fo.write('\n')
31 |                     if has_known:
32 |                         for l in session_lines:
33 |                             fo.write(l)
34 |                     last_query = None
35 |                     has_known = False
36 |                     session_lines = []
37 | 
38 | refine(p_dog, '../trans_data/dog_refine.txt')
39 | refine(p_train, '../trans_data/train_refine.txt')
40 |                 
41 |             
42 | 


--------------------------------------------------------------------------------
/V2/run_all.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | sh -x run_prepare.sh
 3 | 
 4 | sh -x run_liblinear_dog.sh
 5 | sh -x run_liblinear_pig.sh
 6 | sh -x run_xgboost3_dog.sh
 7 | sh -x run_xgboost3_pig.sh
 8 | 
 9 | sh -x run_ensemble.sh
10 | 


--------------------------------------------------------------------------------
/V2/run_ensemble.sh:
--------------------------------------------------------------------------------
 1 | BIN=../../tools/xgboost3/xgboost
 2 | 
 3 | python prepare_ensemble_cat.py ../dataset/cat_ensemble ../trans_data/cat.label pred_xgboost_cat.txt xgboost 10 pred_linear_cat.txt liblinear
 4 | python prepare_ensemble_dog.py ../dataset/dog_ensemble ../trans_data/valid2.label pred_xgboost_dog.txt xgboost 10 pred_linear_dog.txt liblinear
 5 | python prepare_ensemble_pig.py ../dataset/pig_ensemble ../raw_data/test.txt pred_xgboost_pig.txt xgboost 10 pred_linear_pig.txt liblinear
 6 | 
 7 | 
 8 | $BIN xgboost3.conf num_round=200 num_class=10 bst:max_depth=7 data=../dataset/cat_ensemble eval[test]=../dataset/cat_ensemble
 9 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/dog_ensemble
10 | mv pred.txt pred_ensemble_dog.txt
11 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/pig_ensemble
12 | mv pred.txt pred_ensemble_pig.txt
13 | 
14 | python construct_maxprob.py pred_ensemble_dog.txt ../trans_data/valid2.txt ../dataset/label_map_dog ../submit/dog_ensemble.txt
15 | python metric_F1.py ../trans_data/valid2.label ../submit/dog_ensemble.txt 
16 | 
17 | python construct_maxprob.py pred_ensemble_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_ensemble.txt
18 | python construct_maxprob_multi.py pred_ensemble_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_ensemble2.txt
19 | 


--------------------------------------------------------------------------------
/V2/run_liblinear_dog.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | TRAIN_BIN=../../tools/liblinear/train
 3 | TEST_BIN=../../tools/liblinear/predict
 4 | 
 5 | $TRAIN_BIN -s 6 -c 10 -e 0.001 ../dataset/dog_train dog.model
 6 | $TEST_BIN -b 1 ../dataset/cat_test dog.model pred_linear_cat.txt 
 7 | $TEST_BIN -b 1 ../dataset/dog_test dog.model pred_linear_dog.txt 
 8 | 
 9 | python construct_liblinear_b1.py pred_linear_dog.txt ../trans_data/valid2.txt ../dataset/label_map_dog ../submit/dog_linear.txt
10 | python metric_F1.py ../trans_data/valid2.label ../submit/dog_linear.txt 
11 | 
12 | 
13 | 
14 | 


--------------------------------------------------------------------------------
/V2/run_liblinear_pig.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | TRAIN_BIN=../../tools/liblinear/train
 3 | TEST_BIN=../../tools/liblinear/predict
 4 | 
 5 | $TRAIN_BIN -s 6 -c 10 -e 0.001 ../dataset/pig_train pig.model
 6 | $TEST_BIN -b 1 ../dataset/pig_test pig.model pred_linear_pig.txt 
 7 | 
 8 | python construct_liblinear_b1.py pred_linear_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_linear.txt
 9 | 
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/V2/run_prepare.sh:
--------------------------------------------------------------------------------
 1 | # split train to dog/valid/pig
 2 | # dog/valid are for offline tuning, pig/test are for leaderboard submission
 3 | python split_train.py
 4 | 
 5 | # aggregate titles and clicks for each query
 6 | python trans_train1.py
 7 | # get statistics data for each query
 8 | python trans_train2.py
 9 | 
10 | # prepare features
11 | prepare_dog1.py
12 | prepare_pig1.py
13 | 


--------------------------------------------------------------------------------
/V2/run_semilda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh -x
 2 | ldapath=../../../cpp_workspace/cpp-semilda/src
 3 | 
 4 | train_file=../trans_data/dog.simple1
 5 | ldatrain_file=../dataset/train_semilda.train
 6 | 
 7 | index_file=../dataset/word_index
 8 | seed_file=lda_seed_words
 9 | model_file=lda.model
10 | 
11 | num_topic=7
12 | alpha=0.5
13 | beta=0.05
14 | 
15 | python prepare_lda_train.py $train_file $ldatrain_file $index_file
16 | 
17 | time /Users/zuotaoliu/install/mpich2/bin/mpiexec -n 4 $ldapath/mpi_slda \
18 | --num_topics $num_topic \
19 | --alpha $alpha --beta $beta \
20 | --training_data_file $ldatrain_file \
21 | --model_file $model_file \
22 | --word_index_file $index_file \
23 | --compute_likelihood true \
24 | --burn_in_iterations 50 --total_iterations 120
25 | 
26 | 
27 | test_file=../trans_data/valid.simple1
28 | ldatest_file=../dataset/test_semilda.test
29 | ldapred_file=pred_semilda_dog.txt
30 | 
31 | python prepare_lda_test.py $test_file $ldatest_file
32 | 
33 | args="--alpha ${alpha} \
34 |       --beta ${beta} \
35 |       --inference_data_file ${ldatest_file} \
36 |       --inference_result_file ${ldapred_file} \
37 |       --model_file ${model_file} \
38 |       --burn_in_iterations 50 \
39 |       --total_iterations 120 \
40 |       --file_type 0
41 |       "
42 | 
43 | time $ldapath/infer $args
44 | python construct_semilda.py pred_semilda_dog.txt $test_file ../trans_data/valid.txt ../dataset/label_map_lda ../submit/dog_semilda.txt
45 | python metric_F1.py ../trans_data/valid.label ../submit/dog_semilda.txt 
46 | 
47 | test_file=../trans_data/test.simple1
48 | ldatest_file=../dataset/test_semilda.test
49 | ldapred_file=pred_semilda_pig.txt
50 | 
51 | python prepare_lda_test.py $test_file $ldatest_file
52 | 
53 | args="--alpha ${alpha} \
54 |       --beta ${beta} \
55 |       --inference_data_file ${ldatest_file} \
56 |       --inference_result_file ${ldapred_file} \
57 |       --model_file ${model_file} \
58 |       --burn_in_iterations 50 \
59 |       --total_iterations 120 \
60 |       --file_type 0
61 |       "
62 | 
63 | time $ldapath/infer $args
64 | python construct_semilda.py pred_semilda_pig.txt $test_file ../raw_data/test.txt ../dataset/label_map_lda ../submit/pig_semilda.txt
65 | 
66 | 


--------------------------------------------------------------------------------
/V2/run_session_label.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh -x
 2 | python trans_session.py
 3 | python prepare_session.py
 4 | python markov_sessoin_label.py
 5 | 
 6 | #python construct_semilda.py pred_semilda_dog.txt $test_file ../trans_data/valid.txt ../dataset/label_map_lda ../submit/dog_semilda.txt
 7 | #python metric_F1.py ../trans_data/valid.label ../submit/dog_semilda.txt 
 8 | 
 9 | #python construct_semilda.py pred_semilda_pig.txt $test_file ../raw_data/test.txt ../dataset/label_map_lda ../submit/pig_semilda.txt
10 | 
11 | 


--------------------------------------------------------------------------------
/V2/run_xgboost3_dog.sh:
--------------------------------------------------------------------------------
 1 | BIN=../../tools/xgboost3/xgboost
 2 | 
 3 | $BIN xgboost3.conf num_round=200 num_class=10 bst:max_depth=7 data=../dataset/dog_train eval[test]=../dataset/dog_train
 4 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/cat_test
 5 | mv pred.txt pred_xgboost_cat.txt
 6 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/dog_test
 7 | mv pred.txt pred_xgboost_dog.txt
 8 | 
 9 | python construct_maxprob.py pred_xgboost_dog.txt ../trans_data/valid2.txt ../dataset/label_map_dog ../submit/dog_xgboost.txt
10 | python metric_F1.py ../trans_data/valid2.label ../submit/dog_xgboost.txt 
11 | python construct_maxprob_multi.py pred_xgboost_dog.txt ../trans_data/valid2.txt ../dataset/label_map_dog ../submit/dog_xgboost2.txt
12 | python metric_F1.py ../trans_data/valid2.label ../submit/dog_xgboost2.txt 
13 | 
14 | 


--------------------------------------------------------------------------------
/V2/run_xgboost3_pig.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | BIN=../../tools/xgboost3/xgboost
 3 | 
 4 | $BIN xgboost3.conf num_round=200 num_class=10 bst:max_depth=7 data=../dataset/pig_train eval[test]=../dataset/pig_train
 5 | $BIN xgboost3.conf task=pred num_class=10 model_in=0200.model test:data=../dataset/pig_test
 6 | mv pred.txt pred_xgboost_pig.txt
 7 | 
 8 | python construct_maxprob.py pred_xgboost_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_xgboost.txt
 9 | python construct_maxprob_multi.py pred_xgboost_pig.txt ../raw_data/test.txt ../dataset/label_map_pig ../submit/pig_xgboost2.txt
10 | 


--------------------------------------------------------------------------------
/V2/split_train.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | p_train = '../raw_data/train.txt'
 4 | p_test = '../raw_data/test.txt'
 5 | 
 6 | p_dog = '../trans_data/dog.txt'
 7 | p_valid = '../trans_data/valid.txt'
 8 | p_label = '../trans_data/valid.label'
 9 | 
10 | n_fold = 3
11 | 
12 | train_dict = {}
13 | test_dict = {}
14 | unknown_dict = {}
15 | 
16 | for line in open(p_train):
17 |     if not line.strip(): continue
18 |     try:
19 |         label, query, title = line.strip().split('\t')
20 |     except:
21 |         label, query = line.strip().split('\t')
22 |         title = '-'
23 | 
24 |     if query not in train_dict:
25 |         train_dict[query] = {} 
26 |     train_dict[query][label] = train_dict[query].get(label, 0) + 1
27 |     if label.startswith('CLASS=TEST'):
28 |         test_dict[query] = 1
29 |     if label.startswith('CLASS=UNKNOWN'):
30 |         unknown_dict[query] = 1
31 | 
32 | valid_dict = {}
33 | fv2 = open('../trans_data/valid.txt', 'w')
34 | fv3 = open('../trans_data/valid.label', 'w')
35 | for query in train_dict:
36 |     if query in test_dict: continue
37 |     if query in unknown_dict: continue
38 |     if random.randint(0, n_fold-1) == 1:
39 |         valid_dict[query] = 1
40 |         label = sorted(train_dict[query].items(), key=lambda d:-d[1])[0][0]
41 |         fv2.write('%s\n' % query)
42 |         fv3.write('%s\t%s\n' % (query, label))
43 | fv2.close()
44 | fv3.close()
45 | 
46 | fv1 = open('../trans_data/dog.txt', 'w')
47 | for line in open(p_train):
48 |     if not line.strip():
49 |         fv1.write(line)
50 |         continue
51 | 
52 |     try:
53 |         label, query, title = line.strip().split('\t')
54 |     except:
55 |         label, query = line.strip().split('\t')
56 |         title = '-'
57 |     if query in test_dict: continue
58 |     if query in train_dict:
59 |         if query in valid_dict:
60 |             label = 'CLASS=TEST'
61 |             fv1.write('%s\t%s\t%s\n' % (label, query, title))
62 |         else:
63 |             fv1.write(line)
64 | fv1.close()
65 | 
66 | 


--------------------------------------------------------------------------------
/V2/trans_session.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | p_pig_train = '../raw_data/train.txt'
 4 | p_dog_train = '../trans_data/dog.txt'
 5 | p_pig_out = '../trans_data/pig.simple5'
 6 | p_dog_out = '../trans_data/dog.simple5'
 7 | p_pig_valid = '../trans_data/test.simple5'
 8 | p_dog_valid = '../trans_data/valid.simple5'
 9 | 
10 | rates = {'CLASS=VIDEO' : 0.5}
11 | 
12 | 
13 | def stat(p_in, p_out):
14 |     session = [set(), set()]
15 |     labels = []
16 |     
17 |     fo = open(p_out, 'w')
18 |     tot_line = 0
19 |     for line in open(p_in):
20 |         if not line.strip():
21 |             session_label = ''
22 |             session_flag = True
23 |             positive_count = 0
24 |             for key in labels:
25 |                 label, query = key.split('\t')
26 |                 if label.find('TEST')>=0 or label.find('KNOWN')>=0:
27 |                     if positive_count < 1: session_flag = False
28 |                     #elif positive_count == 1: positive_count = 0 
29 |                     #else: positive_count = 0.5
30 |                     else: positive_count = 0
31 |                 elif session_flag:
32 |                     if not session_label: 
33 |                         session_label = label
34 |                         positive_count += 1
35 |                     elif session_label != label: 
36 |                         session_flag = False
37 |                     else:
38 |                         positive_count += 1
39 |             if session[0] and session_label and session_flag and positive_count > 0:
40 |                 if session[1] and len(session[1])<=10 and len(session[0])<=5:
41 |                     rate = rates.get(session_label, 1.0)
42 |                     if random.random()<=rate:
43 |                         fo.write('%s\t%s\t%s\n' % (session_label, ';'.join(session[0]), ';'.join(session[1])))
44 |             session = [set(), set()]
45 |             labels = []
46 |             continue
47 | 
48 |         try:
49 |             label, query, title = line.strip().split('\t')
50 |         except:
51 |             label, query = line.strip().split('\t')
52 |             title = '-'
53 | 
54 |         key = label + '\t' + query
55 |         if not labels or labels[-1] != key:
56 |             labels.append(key)
57 |         session[0].add(query)
58 |         if title and title!='-':
59 |             session[1].add(title)
60 | 
61 |         tot_line += 1
62 |         #if tot_line == 1000000: break
63 |     fo.close()
64 |             
65 | def valid(p_in, p_out):
66 |     session = [set(), set()]
67 |     has_test = False
68 | 
69 |     fo = open(p_out, 'w')
70 |     tot_line = 0
71 |     for line in open(p_in):
72 |         if not line.strip():
73 |             if has_test and session[1] and len(session[1])<=10 and len(session[0])<=5:
74 |                 fo.write('%s\t%s\t%s\n' % (0, ';'.join(session[0]), ';'.join(session[1])))
75 |             session = [set(), set()]
76 |             has_test = False
77 |             continue
78 |         try:
79 |             label, query, title = line.strip().split('\t')
80 |         except:
81 |             label, query = line.strip().split('\t')
82 |             title = '-'
83 | 
84 |         if label.find('TEST') >= 0:
85 |             has_test = True
86 |         session[0].add(query)
87 |         if title and title!='-':
88 |             session[1].add(title)
89 | 
90 |         tot_line += 1
91 |         #if tot_line == 10000: break
92 |     fo.close()
93 | 
94 | stat(p_pig_train, p_pig_out)
95 | stat(p_dog_train, p_dog_out)
96 | valid(p_pig_train, p_pig_valid)
97 | valid(p_dog_train, p_dog_valid)
98 | 


--------------------------------------------------------------------------------
/V2/trans_train0.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_train = '../raw_data/train.txt'
  3 | p_dog = '../trans_data/dog.txt'
  4 | p_test = '../raw_data/test.txt'
  5 | p_valid = '../trans_data/valid.txt'
  6 | 
  7 | s_train = '../trans_data/train.simple0'
  8 | s_test = '../trans_data/test.simple0'
  9 | s_dog = '../trans_data/dog.simple0'
 10 | s_valid = '../trans_data/valid.simple0'
 11 | 
 12 | def trans(in1, in2, out1, out2):
 13 |     train_dict = {}
 14 |     test_dict = {}
 15 | 
 16 |     # for session
 17 |     session_train_query = {}
 18 |     session_test_query = {}
 19 |     session_labels = {}
 20 |     session_query = {}
 21 |     session_click = {}
 22 | 
 23 |     for line in open(in1):
 24 |         if not line.strip():
 25 |             #session end
 26 |             for query in session_train_query:
 27 |                 #if len(session_labels) == 1:
 28 |                 for q2 in session_train_query:
 29 |                     if query != q2:
 30 |                         label = session_train_query[q2]
 31 |                         train_dict[query][2][label] = train_dict[query][2].get(label, 0) + 1
 32 |                 for q2 in session_query:
 33 |                     if query != q2:
 34 |                         train_dict[query][3][q2] = train_dict[query][3].get(q2, 0) + 1
 35 |                 for title in session_click:
 36 |                     train_dict[query][4][title] = train_dict[query][4].get(title, 0) + 1
 37 |             for query in session_test_query:
 38 |                 #if len(session_labels) == 1:
 39 |                 for q2 in session_train_query:
 40 |                     if query != q2:
 41 |                         label = session_train_query[q2]
 42 |                         test_dict[query][2][label] = test_dict[query][2].get(label, 0) + 1
 43 |                 for q2 in session_query:
 44 |                     if query != q2:
 45 |                         test_dict[query][3][q2] = test_dict[query][3].get(q2, 0) + 1
 46 |                 for title in session_click:
 47 |                     test_dict[query][4][title] = test_dict[query][4].get(title, 0) + 1
 48 |             session_train_query = {}
 49 |             session_test_query = {}
 50 |             session_labels = {}
 51 |             session_query = {}
 52 |             session_click = {}
 53 |             continue
 54 | 
 55 |         try:
 56 |             label, query, title = line.strip().split('\t')
 57 |         except:
 58 |             label, query = line.strip().split('\t')
 59 |             title = '-'
 60 |         #label = ' | '.join(sorted(label.split(' | ')))
 61 | 
 62 |         if title and title != '-':
 63 |             session_click[title] = 1
 64 |         session_query[query] = 1
 65 | 
 66 |         if label.startswith('CLASS=TEST'):
 67 |             if query not in test_dict: 
 68 |                 test_dict[query] = [label, {}, {}, {}, {}]
 69 |             if title and title != '-':
 70 |                 test_dict[query][1][title] = test_dict[query][1].get(title, 0) + 1
 71 |             session_test_query[query] = 1
 72 |         elif not label.startswith('CLASS=UNKNOWN'):
 73 |             if query not in train_dict: 
 74 |                 train_dict[query] = [{}, {}, {}, {}, {}]
 75 |             train_dict[query][0][label] = train_dict[query][0].get(label, 0) + 1
 76 |             if title and title != '-':
 77 |                 train_dict[query][1][title] = train_dict[query][1].get(title, 0) + 1
 78 |             session_labels[label] = 1
 79 |             session_train_query[query] = ''.join(sorted(label.split(' | '))) 
 80 | 
 81 |     n_top_title = -1
 82 |     n_top_label = 3
 83 |     n_top_query = 10 
 84 |     n_top_session_title = 30 
 85 | 
 86 |     with open(out1, 'w') as ft:
 87 |         for query in train_dict:
 88 |             label = sorted(train_dict[query][0].items(), key=lambda d:-d[1])[0][0]
 89 | 
 90 |             titles = sorted(train_dict[query][1].items(), key=lambda d:-d[1])
 91 |             title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:]]
 92 | 
 93 |             labels = sorted(train_dict[query][2].items(), key=lambda d:-d[1])
 94 |             label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]]
 95 |             tot_label = float(sum(train_dict[query][2].values()))
 96 |             label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]]
 97 | 
 98 |             queries = sorted(train_dict[query][3].items(), key=lambda d:-d[1])
 99 |             query_pairs = []
100 |             query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]]
101 | 
102 |             stitles = sorted(train_dict[query][4].items(), key=lambda d:-d[1])
103 |             stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]]
104 | 
105 |             stat_pairs = []
106 | 
107 |             ft.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs)))
108 | 
109 |     with open(out2, 'w') as fo:
110 |         for query in test_dict:
111 |             label = test_dict[query][0]
112 | 
113 |             titles = sorted(test_dict[query][1].items(), key=lambda d:-d[1])
114 |             title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:]]
115 | 
116 |             labels = sorted(test_dict[query][2].items(), key=lambda d:-d[1])
117 |             label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]]
118 |             tot_label = float(sum(test_dict[query][2].values()))
119 |             label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]]
120 | 
121 |             queries = sorted(test_dict[query][3].items(), key=lambda d:-d[1])
122 |             query_pairs = []
123 |             query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]]
124 | 
125 |             stitles = sorted(test_dict[query][4].items(), key=lambda d:-d[1])
126 |             stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]]
127 | 
128 |             stat_pairs = []
129 | 
130 |             fo.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs)))
131 | 
132 | trans(p_train, p_test, s_train, s_test)
133 | trans(p_dog, p_valid, s_dog, s_valid)
134 | 
135 | 


--------------------------------------------------------------------------------
/V2/trans_train1.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_train = '../raw_data/train.txt'
  3 | p_dog = '../trans_data/dog.txt'
  4 | p_test = '../raw_data/test.txt'
  5 | p_valid = '../trans_data/valid.txt'
  6 | 
  7 | s_train = '../trans_data/train.simple1'
  8 | s_test = '../trans_data/test.simple1'
  9 | s_dog = '../trans_data/dog.simple1'
 10 | s_valid = '../trans_data/valid.simple1'
 11 | 
 12 | def trans(in1, in2, out1, out2):
 13 |     train_dict = {}
 14 |     test_dict = {}
 15 | 
 16 |     # for session
 17 |     session_train_query = {}
 18 |     session_test_query = {}
 19 |     session_labels = {}
 20 |     session_query = {}
 21 |     session_click = {}
 22 | 
 23 |     for line in open(in1):
 24 |         if not line.strip():
 25 |             #session end
 26 |             for query in session_train_query:
 27 |                 #if len(session_labels) == 1:
 28 |                 for q2 in session_train_query:
 29 |                     if query != q2:
 30 |                         label = session_train_query[q2]
 31 |                         train_dict[query][2][label] = train_dict[query][2].get(label, 0) + 1
 32 |                 for q2 in session_query:
 33 |                     if query != q2:
 34 |                         train_dict[query][3][q2] = train_dict[query][3].get(q2, 0) + 1
 35 |                 for title in session_click:
 36 |                     train_dict[query][4][title] = train_dict[query][4].get(title, 0) + 1
 37 |             for query in session_test_query:
 38 |                 #if len(session_labels) == 1:
 39 |                 for q2 in session_train_query:
 40 |                     if query != q2:
 41 |                         label = session_train_query[q2]
 42 |                         test_dict[query][2][label] = test_dict[query][2].get(label, 0) + 1
 43 |                 for q2 in session_query:
 44 |                     if query != q2:
 45 |                         test_dict[query][3][q2] = test_dict[query][3].get(q2, 0) + 1
 46 |                 for title in session_click:
 47 |                     test_dict[query][4][title] = test_dict[query][4].get(title, 0) + 1
 48 |             session_train_query = {}
 49 |             session_test_query = {}
 50 |             session_labels = {}
 51 |             session_query = {}
 52 |             session_click = {}
 53 |             continue
 54 | 
 55 |         try:
 56 |             label, query, title = line.strip().split('\t')
 57 |         except:
 58 |             label, query = line.strip().split('\t')
 59 |             title = '-'
 60 |         #label = ' | '.join(sorted(label.split(' | ')))
 61 | 
 62 |         if title and title != '-':
 63 |             session_click[title] = 1
 64 |         session_query[query] = 1
 65 | 
 66 |         if label.startswith('CLASS=TEST'):
 67 |             if query not in test_dict: 
 68 |                 test_dict[query] = [label, {}, {}, {}, {}]
 69 |             if title and title != '-':
 70 |                 test_dict[query][1][title] = test_dict[query][1].get(title, 0) + 1
 71 |             session_test_query[query] = 1
 72 |         elif not label.startswith('CLASS=UNKNOWN'):
 73 |             if query not in train_dict: 
 74 |                 train_dict[query] = [{}, {}, {}, {}, {}]
 75 |             train_dict[query][0][label] = train_dict[query][0].get(label, 0) + 1
 76 |             if title and title != '-':
 77 |                 train_dict[query][1][title] = train_dict[query][1].get(title, 0) + 1
 78 |             session_labels[label] = 1
 79 |             session_train_query[query] = ''.join(sorted(label.split(' | '))) 
 80 | 
 81 |     n_top_title = 30
 82 |     n_top_label = 3
 83 |     n_top_query = 10 
 84 |     n_top_session_title = 30 
 85 | 
 86 |     with open(out1, 'w') as ft:
 87 |         for query in train_dict:
 88 |             label = sorted(train_dict[query][0].items(), key=lambda d:-d[1])[0][0]
 89 | 
 90 |             titles = sorted(train_dict[query][1].items(), key=lambda d:-d[1])
 91 |             title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:n_top_title+1]]
 92 | 
 93 |             labels = sorted(train_dict[query][2].items(), key=lambda d:-d[1])
 94 |             label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]]
 95 |             tot_label = float(sum(train_dict[query][2].values()))
 96 |             label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]]
 97 | 
 98 |             queries = sorted(train_dict[query][3].items(), key=lambda d:-d[1])
 99 |             query_pairs = []
100 |             query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]]
101 | 
102 |             stitles = sorted(train_dict[query][4].items(), key=lambda d:-d[1])
103 |             stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]]
104 | 
105 |             stat_pairs = []
106 | 
107 |             ft.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs)))
108 | 
109 |     with open(out2, 'w') as fo:
110 |         for query in test_dict:
111 |             label = test_dict[query][0]
112 | 
113 |             titles = sorted(test_dict[query][1].items(), key=lambda d:-d[1])
114 |             title_pairs = ['%s:%s' % (v[0], v[1]) for v in titles[:n_top_title+1]]
115 | 
116 |             labels = sorted(test_dict[query][2].items(), key=lambda d:-d[1])
117 |             label_pairs = ['%s:%s' % (v[0], v[1]) for v in labels[:n_top_label+1]]
118 |             tot_label = float(sum(test_dict[query][2].values()))
119 |             label_pairs += ['f%s:%s' % (v[0], v[1]/tot_label) for v in labels[:n_top_label+1]]
120 | 
121 |             queries = sorted(test_dict[query][3].items(), key=lambda d:-d[1])
122 |             query_pairs = []
123 |             query_pairs = ['%s:%s' % (v[0], v[1]) for v in queries[:n_top_query+1]]
124 | 
125 |             stitles = sorted(test_dict[query][4].items(), key=lambda d:-d[1])
126 |             stitle_pairs = ['%s:%s' % (v[0], v[1]) for v in stitles[:n_top_session_title+1]]
127 | 
128 |             stat_pairs = []
129 | 
130 |             fo.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (label, query, ';'.join(title_pairs), ';'.join(label_pairs), ';'.join(stat_pairs), ';'.join(query_pairs), ';'.join(stitle_pairs)))
131 | 
132 | trans(p_train, p_test, s_train, s_test)
133 | trans(p_dog, p_valid, s_dog, s_valid)
134 | 
135 | 


--------------------------------------------------------------------------------
/V2/trans_train2.py:
--------------------------------------------------------------------------------
  1 | 
  2 | p_train = '../raw_data/train.txt'
  3 | p_dog = '../trans_data/dog.txt'
  4 | p_test = '../raw_data/test.txt'
  5 | p_valid = '../trans_data/valid.txt'
  6 | 
  7 | s_train = '../trans_data/train.simple2'
  8 | s_test = '../trans_data/test.simple2'
  9 | s_dog = '../trans_data/dog.simple2'
 10 | s_valid = '../trans_data/valid.simple2'
 11 | 
 12 | def trans(in1, in2, out1, out2):
 13 |     train_dict = {}
 14 |     test_dict = {}
 15 | 
 16 |     query_freq = {}
 17 |     query_titles = {}
 18 | 
 19 |     query_session = {}
 20 |     query_search = {}
 21 |     query_click = {}
 22 |     query_dupclick = {}
 23 | 
 24 |     query_session_search = {}
 25 |     query_session_click = {}
 26 |     query_session_dupclick = {}
 27 | 
 28 |     # for session
 29 |     session_train_query = {}
 30 |     session_test_query = {}
 31 |     session_labels = {}
 32 |     session_search = 0
 33 |     session_click = {}
 34 |     session_query_search = {}
 35 |     session_query_click = {}
 36 | 
 37 |     for line in open(in1):
 38 |         if not line.strip():
 39 |             #session end
 40 |             session_dupclick = 0
 41 |             for title in session_click:
 42 |                 if session_click[title] > 1: session_dupclick += 1
 43 |             for query in session_train_query:
 44 |                 query_session[query] = query_session.get(query, 0) + 1.
 45 |                 query_session_search[query] = query_session_search.get(query, 0) + session_search
 46 |                 query_session_click[query] = query_session_click.get(query, 0) + len(session_click)
 47 |                 query_session_dupclick[query] = query_session_dupclick.get(query, 0) + session_dupclick
 48 |                 query_search[query] = query_search.get(query, 0) + session_query_search.get(query, 0)
 49 |                 query_click[query] = query_click.get(query, 0) + len(session_query_click.get(query, {}))
 50 |                 session_query_dup = 0
 51 |                 for title in session_query_click.get(query, {}):
 52 |                     if session_query_click[query][title] > 1: session_query_dup += 1
 53 |                 query_dupclick[query] = query_dupclick.get(query, 0) + session_query_dup
 54 |             for query in session_test_query:
 55 |                 query_session[query] = query_session.get(query, 0) + 1.
 56 |                 query_session_search[query] = query_session_search.get(query, 0) + session_search
 57 |                 query_session_click[query] = query_session_click.get(query, 0) + len(session_click)
 58 |                 query_session_dupclick[query] = query_session_dupclick.get(query, 0) + session_dupclick
 59 |                 query_search[query] = query_search.get(query, 0) + session_query_search.get(query, 0)
 60 |                 query_click[query] = query_click.get(query, 0) + len(session_query_click.get(query, {}))
 61 |                 session_query_dup = 0
 62 |                 for title in session_query_click.get(query, {}):
 63 |                     if session_query_click[query][title] > 1: session_query_dup += 1
 64 |                 query_dupclick[query] = query_dupclick.get(query, 0) + session_query_dup
 65 | 
 66 |             session_train_query = {}
 67 |             session_test_query = {}
 68 |             session_labels = {}
 69 |             session_search = 0
 70 |             session_click = {}
 71 |             session_query_search = {}
 72 |             session_query_click = {}
 73 |             continue
 74 | 
 75 |         try:
 76 |             label, query, title = line.strip().split('\t')
 77 |         except:
 78 |             label, query = line.strip().split('\t')
 79 |             title = '-'
 80 |         #label = ' | '.join(sorted(label.split(' | ')))
 81 | 
 82 |         query_freq[query] = query_freq.get(query, 0) + 1
 83 |         if title and title != '-':
 84 |             if query not in query_titles: query_titles[query] = [0., 0.]
 85 |             query_titles[query][0] += 1
 86 |             query_titles[query][1] += len(title.split(' '))
 87 |             session_click[title] = session_click.get(title, 0) + 1
 88 |             if query not in session_query_click: session_query_click[query] = {}
 89 |             session_query_click[query][title] = session_query_click[query].get(title, 0) + 1
 90 |         else:
 91 |             session_search += 1
 92 |             if query not in session_query_search: session_query_search[query] = 0
 93 |             session_query_search[query] += 1
 94 | 
 95 |         if label.startswith('CLASS=TEST'):
 96 |             if query not in test_dict: 
 97 |                 test_dict[query] = [label, {}, {}, {}]
 98 |             session_test_query[query] = 1
 99 |         elif not label.startswith('CLASS=UNKNOWN'):
100 |             if query not in train_dict: 
101 |                 train_dict[query] = [{}, {}, {}, {}]
102 |             session_train_query[query] = label.replace(' ', '')
103 | 
104 | 
105 |     with open(out1, 'w') as ft:
106 |         for query in train_dict:
107 |             stat_pairs = []
108 |             stat_pairs.append( '%s:%s' % ('query_len', len(query.split(' '))) )
109 |             stat_pairs.append( '%s:%s' % ('query_freq', query_freq[query]) )
110 |             if query_titles.get(query, [0, 0])[0] >= 3:
111 |                 stat_pairs.append( '%s:%s' % ('title_len', query_titles[query][1]/query_titles[query][0]) )
112 | 
113 |             stat_pairs2 = []
114 |             if query_session.get(query, 0) >= 5:
115 |                 stat_pairs2.append( '%s:%s' % ('query_search', query_search[query]/query_session[query]) )
116 |                 stat_pairs2.append( '%s:%s' % ('query_click', query_click[query]/query_session[query]) )
117 |                 stat_pairs2.append( '%s:%s' % ('query_dupclick', query_dupclick[query]/query_session[query]) )
118 |                 stat_pairs2.append( '%s:%s' % ('query_session_search', query_session_search[query]/query_session[query]) )
119 |                 stat_pairs2.append( '%s:%s' % ('query_session_click', query_session_click[query]/query_session[query]) )
120 |                 stat_pairs2.append( '%s:%s' % ('query_session_dupclick', query_session_dupclick[query]/query_session[query]) )
121 | 
122 |             ft.write('%s\t%s\t%s\n' % (query, ';'.join(stat_pairs), ';'.join(stat_pairs2)))
123 | 
124 |     with open(out2, 'w') as fo:
125 |         for query in test_dict:
126 |             stat_pairs = []
127 |             stat_pairs.append( '%s:%s' % ('query_len', len(query.split(' '))) )
128 |             stat_pairs.append( '%s:%s' % ('query_freq', query_freq[query]) )
129 |             if query_titles.get(query, [0, 0])[0] >= 3:
130 |                 stat_pairs.append( '%s:%s' % ('title_len', query_titles[query][1]/query_titles[query][0]) )
131 | 
132 |             stat_pairs2 = []
133 |             if query_session.get(query, 0) >= 5:
134 |                 stat_pairs2.append( '%s:%s' % ('query_search', query_search[query]/query_session[query]) )
135 |                 stat_pairs2.append( '%s:%s' % ('query_click', query_click[query]/query_session[query]) )
136 |                 stat_pairs2.append( '%s:%s' % ('query_dupclick', query_dupclick[query]/query_session[query]) )
137 |                 stat_pairs2.append( '%s:%s' % ('query_session_search', query_session_search[query]/query_session[query]) )
138 |                 stat_pairs2.append( '%s:%s' % ('query_session_click', query_session_click[query]/query_session[query]) )
139 |                 stat_pairs2.append( '%s:%s' % ('query_session_dupclick', query_session_dupclick[query]/query_session[query]) )
140 | 
141 |             fo.write('%s\t%s\t%s\n' % (query, ';'.join(stat_pairs), ';'.join(stat_pairs2)))
142 | 
143 | trans(p_train, p_test, s_train, s_test)
144 | trans(p_dog, p_valid, s_dog, s_valid)
145 | 
146 | 


--------------------------------------------------------------------------------
/V2/xgboost3.conf:
--------------------------------------------------------------------------------
 1 | ### General Parameters, see comment for each definition
 2 | # choose the tree booster, 0: tree, 1: linear
 3 | booster_type = 0 
 4 | # this is the only difference with classification, use 0: linear regression
 5 | # when labels are in [0,1] we can also use 1: logistic regression
 6 | loss_type = 0
 7 | # evaluation metrics for validation data
 8 | eval_metric=merror
 9 | #eval_metric=error
10 | #eval_metric=auc
11 | #eval_metric=map
12 | #eval_metric=rmse
13 | #eval_metric=ndcg
14 | # silent=1 means printing running messages
15 | #silent = 1
16 | # do not use binary buffer
17 | use_buffer = 0
18 | nthread = 8
19 | #num_class = 18
20 | 
21 | ### Tree Booster Parameters
22 | # step size shrinkage used in update to prevent overfitting. 
23 | bst:eta = 0.1
24 | #bst:eta = 0.05
25 | # minimum loss reduction required to make a further partition. larger -> more conservative
26 | bst:gamma = 1.0 
27 | # minimum sum of instance weight(hessian) needed in a child. larger -> more conservative
28 | bst:min_child_weight = 1.0
29 | # maximum depth of a tree
30 | #bst:max_depth = 5 
31 | # constructing method to build a tree, 0: svdfeature, 1: column major expansion, 2: row major expansion
32 | #bst:tree_maker = 1
33 | 
34 | ### Linear Booster Parameters
35 | # L2 regularization term on weights
36 | bst:lambda = 0
37 | # L1 regularization term on weights
38 | bst:alpha = 0
39 | # L2 regularization term on bias
40 | bst:lambda_bias = 0
41 | 
42 | ### Task parameters
43 | # specify the learning task and the corresponding learning objective
44 | #objective = multi:softmax
45 | objective = multi:softprob
46 | #objective = reg:linear
47 | #objective = reg:linear
48 | #objective = reg:logistic
49 | #objective = binary:logistic
50 | #objective = binary:logitraw
51 | # the number of round to do boosting
52 | num_round = 10
53 | # 0 means do not save any model except the final round model
54 | save_period = 0 
55 | # the initial prediction score
56 | base_score = 0.0
57 | # fature map
58 | #fmap = "../dataset/feat_map.txt"
59 | # name for dump model
60 | name_dump = "dump.nice.txt"
61 | # The path of training data
62 | #data = "../dataset/train_dog.svm" 
63 | # The path of validation data, used to monitor training process, here [test] sets name of the validation set
64 | #eval[test] = "../dataset/train_dog.svm" 
65 | # The path of test data 
66 | #test:data = "../dataset/test_dog.svm"      
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------