├── README.md
├── feat
│   └── README.md
├── input
│   └── README.md
├── output
│   ├── README.md
│   ├── m1
│   │   ├── catboost03
│   │   │   └── .gitkeep
│   │   ├── inferSent1
│   │   │   └── .gitkeep
│   │   └── nn02
│   │       └── .gitkeep
│   └── m3
│       ├── lgb_m3_32-50-0
│       │   └── .gitkeep
│       ├── lgb_m3_37-0
│       │   └── .gitkeep
│       └── lgb_m3_38-0
│           └── .gitkeep
├── src
│   ├── ensemble
│   │   └── .gitkeep
│   ├── feature
│   │   ├── .gitkeep
│   │   ├── .ipynb_checkpoints
│   │   │   └── gen_dict-checkpoint.ipynb
│   │   ├── data_preprocess.py
│   │   ├── feat30-50.py
│   │   ├── feat31-50.py
│   │   ├── feat32-50.py
│   │   ├── feat37-pairwise.py
│   │   ├── feat38-stk.py
│   │   ├── feat40.py
│   │   ├── gen_dict.ipynb
│   │   ├── gen_samples.py
│   │   └── tfidf_recall_30.py
│   ├── rank
│   │   ├── m1
│   │   │   ├── catboost03.py
│   │   │   ├── glove
│   │   │   │   ├── .gitignore
│   │   │   │   ├── .travis.yml
│   │   │   │   ├── LICENSE
│   │   │   │   ├── Makefile
│   │   │   │   ├── README.md
│   │   │   │   ├── demo.sh
│   │   │   │   ├── eval
│   │   │   │   │   ├── matlab
│   │   │   │   │   │   ├── WordLookup.m
│   │   │   │   │   │   ├── evaluate_vectors.m
│   │   │   │   │   │   └── read_and_evaluate.m
│   │   │   │   │   ├── octave
│   │   │   │   │   │   ├── WordLookup_octave.m
│   │   │   │   │   │   ├── evaluate_vectors_octave.m
│   │   │   │   │   │   └── read_and_evaluate_octave.m
│   │   │   │   │   ├── python
│   │   │   │   │   │   ├── distance.py
│   │   │   │   │   │   ├── evaluate.py
│   │   │   │   │   │   └── word_analogy.py
│   │   │   │   │   └── question-data
│   │   │   │   │       ├── capital-common-countries.txt
│   │   │   │   │       ├── capital-world.txt
│   │   │   │   │       ├── city-in-state.txt
│   │   │   │   │       ├── currency.txt
│   │   │   │   │       ├── family.txt
│   │   │   │   │       ├── gram1-adjective-to-adverb.txt
│   │   │   │   │       ├── gram2-opposite.txt
│   │   │   │   │       ├── gram3-comparative.txt
│   │   │   │   │       ├── gram4-superlative.txt
│   │   │   │   │       ├── gram5-present-participle.txt
│   │   │   │   │       ├── gram6-nationality-adjective.txt
│   │   │   │   │       ├── gram7-past-tense.txt
│   │   │   │   │       ├── gram8-plural.txt
│   │   │   │   │       └── gram9-plural-verbs.txt
│   │   │   │   └── src
│   │   │   │       ├── README.md
│   │   │   │       ├── cooccur.c
│   │   │   │       ├── glove.c
│   │   │   │       ├── shuffle.c
│   │   │   │       └── vocab_count.c
│   │   │   ├── inferSent1-5-fold_predict.py
│   │   │   ├── inferSent1-5-fold_train.py
│   │   │   ├── nn02_predict.py
│   │   │   ├── nn02_train.py
│   │   │   ├── prepare_rank_train.py
│   │   │   ├── run.sh
│   │   │   └── w2v_training.py
│   │   ├── m2
│   │   │   ├── bert_5_fold_predict.py
│   │   │   ├── bert_5_fold_train.py
│   │   │   ├── bert_preprocessing.py
│   │   │   ├── change_formatting4stk.py
│   │   │   ├── final_blend.py
│   │   │   ├── fold_result_integration.py
│   │   │   ├── gen_w2v.sh
│   │   │   ├── mk_submission.py
│   │   │   ├── model.py
│   │   │   ├── nn_5_fold_predict.py
│   │   │   ├── nn_5_fold_train.py
│   │   │   ├── nn_preprocessing.py
│   │   │   ├── preprocessing.py
│   │   │   ├── run.sh
│   │   │   └── utils.py
│   │   └── m3
│   │       ├── convert.py
│   │       ├── eval.py
│   │       ├── flow.py
│   │       ├── kfold_merge.py
│   │       ├── lgb_train_32-50-0.py
│   │       ├── lgb_train_37-0.py
│   │       ├── lgb_train_38-0.py
│   │       ├── lgb_train_38-1.py
│   │       └── lgb_train_40-0.py
│   ├── recall
│   │   └── tfidf_recall_30.py
│   └── utils
│       └── .gitkeep
├── stk_feat
│   └── README.md
└── tools
    ├── __pycache__
    │   ├── basic_learner.cpython-37.pyc
    │   ├── custom_bm25.cpython-37.pyc
    │   ├── custom_metrics.cpython-37.pyc
    │   ├── feat_utils.cpython-37.pyc
    │   ├── lgb_learner.cpython-37.pyc
    │   ├── loader.cpython-37.pyc
    │   ├── nlp_preprocess.cpython-37.pyc
    │   └── pandas_util.cpython-37.pyc
    ├── basic_learner.py
    ├── basic_learner.pyc
    ├── custom_bm25.py
    ├── custom_bm25.pyc
    ├── custom_metrics.py
    ├── custom_metrics.pyc
    ├── feat_utils.py
    ├── lgb_learner.py
    ├── lgb_learner.pyc
    ├── loader.py
    ├── loader.pyc
    ├── nlp_preprocess.py
    ├── pandas_util.py
    └── pandas_util.pyc
/README.md:
--------------------------------------------------------------------------------
1 | # WSDM2020-solution
2 | ## Team Name: funny
3 | Team Members: just4fun, greedisgood, slowdown, funny
4 | ## No Data Leak
5 | We achieved a MAP@3 score of 0.37458 on part 1 and 0.38020 on part 2 without using any data leak in the competition. During the recall stage we retrieve related papers from the whole dataset, without any tricky data screening.
6 |
7 | ## Our Basic Solution
8 | data preprocess -> recall by text similarity -> single model (LGB + NN) -> model stacking -> linear ensemble -> final result
9 |
10 |
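For reference, the MAP@3 metric above can be computed as follows. This is a minimal sketch assuming exactly one ground-truth paper per description; the training scripts in this repository use `ml_metrics.mapk` for the same purpose.

```python
def map_at_3(true_ids, ranked_pred_ids):
    """MAP@3 with a single relevant paper per description.

    true_ids[i]        -- the ground-truth paper_id for description i
    ranked_pred_ids[i] -- up to 3 predicted paper_ids, best first
    """
    score = 0.0
    for true_id, preds in zip(true_ids, ranked_pred_ids):
        for rank, pid in enumerate(preds[:3], start=1):
            if pid == true_id:
                score += 1.0 / rank
                break
    return score / len(true_ids)

# map_at_3(['p1'], [['p3', 'p1', 'p2']]) -> 0.5
```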
--------------------------------------------------------------------------------
/feat/README.md:
--------------------------------------------------------------------------------
1 | ## Dir of generated features
2 |
--------------------------------------------------------------------------------
/input/README.md:
--------------------------------------------------------------------------------
1 | ## Dir of input
2 |
--------------------------------------------------------------------------------
/output/README.md:
--------------------------------------------------------------------------------
1 | ## Dir of cv results and test results.
2 |
--------------------------------------------------------------------------------
/output/m1/catboost03/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m1/catboost03/.gitkeep
--------------------------------------------------------------------------------
/output/m1/inferSent1/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m1/inferSent1/.gitkeep
--------------------------------------------------------------------------------
/output/m1/nn02/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m1/nn02/.gitkeep
--------------------------------------------------------------------------------
/output/m3/lgb_m3_32-50-0/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m3/lgb_m3_32-50-0/.gitkeep
--------------------------------------------------------------------------------
/output/m3/lgb_m3_37-0/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m3/lgb_m3_37-0/.gitkeep
--------------------------------------------------------------------------------
/output/m3/lgb_m3_38-0/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m3/lgb_m3_38-0/.gitkeep
--------------------------------------------------------------------------------
/src/ensemble/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/src/ensemble/.gitkeep
--------------------------------------------------------------------------------
/src/feature/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/src/feature/.gitkeep
--------------------------------------------------------------------------------
/src/feature/data_preprocess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Basic modules
5 | import os
6 | import sys
7 | import time
8 | from tqdm import tqdm
9 | from datetime import datetime
10 |
11 | # Data processing
12 | import re
13 | import pickle
14 | import numpy as np
15 | import pandas as pd
16 | from multiprocessing import Pool
17 |
18 | # Custom utility packages
19 | sys.path.append('../../tools/')
20 | import loader
21 | import pandas_util
22 | from nlp_preprocess import preprocess
23 |
24 | # Set random seed
25 | SEED = 2020
26 | PROCESS_NUM, PARTITION_NUM = 32, 32
27 |
28 | input_root_path = '../../input/'
29 | output_root_path = '../../input/'
30 |
31 | postfix = 'final_all'
32 | file_type = 'ftr'
33 |
34 | tr_out_path = output_root_path + 'tr_input_{}.{}'.format(postfix, file_type)
35 | te_out_path = output_root_path + 'te_input_{}.{}'.format(postfix, file_type)
36 | paper_out_path = output_root_path + 'paper_input_{}.{}'.format(postfix, file_type)
37 |
38 | # Extract the key sentences of a description: keep only the sentences containing the citation placeholder [**##**] (short fragments are merged into the preceding sentence) and strip the placeholder
39 | def digest(text):
40 | backup = text[:]
41 | text = text.replace('al.', '').split('. ')
42 | t=''
43 | pre_text=[]
44 | len_text=len(text)-1
45 | add=True
46 | pre=''
47 | while len_text>=0:
48 | index=text[len_text]
49 | index+=pre
50 | if len(index.split(' '))<=3 :
51 | add=False
52 | pre=index+pre
53 | else:
54 | add=True
55 | pre=''
56 | if add:
57 | pre_text.append(index)
58 | len_text-=1
59 | if len(pre_text)==0:
60 | pre_text=text
61 | pre_text.reverse()
62 | for index in pre_text:
63 | if index.find('[**##**]') != -1:
64 | index = re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+','',index)
65 | index+='. '
66 | t+=index
67 | return t
68 |
69 | def partition(df, num):
70 | df_partitions, step = [], int(np.ceil(df.shape[0]/num))
71 | for i in range(0, df.shape[0], step):
72 | df_partitions.append(df.iloc[i:i+step])
73 | return df_partitions
74 |
75 | def tr_single_process(params=None):
76 | (tr, i) = params
77 | print (i, 'start', datetime.now())
78 | tr['quer_key'] = tr['description_text'].fillna('').progress_apply(lambda s: preprocess(digest(s)))
79 | tr['quer_all'] = tr['description_text'].fillna('').progress_apply(lambda s: preprocess(s))
80 | print (i, 'completed', datetime.now())
81 | return tr
82 |
83 | def paper_single_process(params=None):
84 | (df, i) = params
85 | print (i, 'start', datetime.now())
86 | df['titl'] = df['title'].fillna('').progress_apply(lambda s: preprocess(s))
87 | df['abst'] = df['abstract'].fillna('').progress_apply(lambda s: preprocess(s))
88 | print (i, 'completed', datetime.now())
89 | return df
90 |
91 | def multi_text_process(df, task, process_num=30):
92 | pool = Pool(process_num)
93 | df_parts = partition(df, process_num)
94 | print ('{} processes init and partition to {} parts' \
95 | .format(process_num, process_num))
96 | param_list = [(df_parts[i], i) for i in range(process_num)]
97 | if task in ['tr', 'te']:
98 | dfs = pool.map(tr_single_process, param_list)
99 | elif task in ['paper']:
100 | dfs = pool.map(paper_single_process, param_list)
101 | df = pd.concat(dfs, axis=0)
102 | print (task, 'multi process completed')
103 | print (df.columns)
104 | return df
105 |
106 | if __name__ == "__main__":
107 |
108 | ts = time.time()
109 | tqdm.pandas()
110 | print('start time: %s' % datetime.now())
111 | # load data
112 | df = loader.load_df(input_root_path + 'candidate_paper_for_wsdm2020.ftr')
113 | tr = loader.load_df(input_root_path + 'train_release.csv')
114 | te = loader.load_df(input_root_path + 'test.csv')
115 | cv = loader.load_df(input_root_path + 'cv_ids_0109.csv')
116 |
117 | # Filter duplicate & abnormal records
118 | tr = tr[tr['description_id'].isin(cv['description_id'].tolist())]
119 | tr = tr[tr.description_id != '6.45E+04']
120 |
121 | df = df[~pd.isnull(df['paper_id'])]
122 | tr = tr[~pd.isnull(tr['description_id'])]
123 | print ('pre', te.shape)
124 | te = te[~pd.isnull(te['description_id'])]
125 | print ('post', te.shape)
126 |
127 | #df = df.head(1000)
128 | #tr = tr.head(1000)
129 | #te = te.head(1000)
130 |
131 | tr = multi_text_process(tr, task='tr')
132 | te = multi_text_process(te, task='te')
133 | df = multi_text_process(df, task='paper')
134 |
135 | tr.drop(['description_text'], axis=1, inplace=True)
136 | te.drop(['description_text'], axis=1, inplace=True)
137 | df.drop(['abstract', 'title'], axis=1, inplace=True)
138 | print ('text preprocess completed')
139 |
140 | loader.save_df(tr, tr_out_path)
141 | print (tr.columns)
142 | print (tr.head())
143 |
144 | loader.save_df(te, te_out_path)
145 | print (te.columns)
146 | print (te.head())
147 |
148 | loader.save_df(df, paper_out_path)
149 | print (df.columns)
150 | print (df.head())
151 |
152 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
153 |
154 |
155 |
156 |
--------------------------------------------------------------------------------
/src/feature/feat31-50.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Generate tfidf vector distance features
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import pickle
12 | from datetime import datetime
13 | from tqdm import tqdm
14 |
15 | # Data processing
16 | import numpy as np
17 | import pandas as pd
18 | from tqdm import tqdm
19 | from multiprocessing import Pool
20 |
21 | # Custom utility packages
22 | sys.path.append('../../tools/')
23 | import loader
24 | import pandas_util
25 | import custom_bm25 as bm25
26 | from feat_utils import try_divide, dump_feat_name
27 |
28 | # Open-source packages
29 | import nltk
30 | import gensim
31 | from gensim.models import Word2Vec
32 | from gensim.models.word2vec import LineSentence
33 | from gensim import corpora, models, similarities
34 | from gensim.similarities import SparseMatrixSimilarity
35 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
36 |
37 | # Set random seed
38 | SEED = 2020
39 |
40 | input_root_path = '../../input/'
41 | output_root_path = '../../feat/'
42 |
43 | postfix = '31-50'
44 | file_type = 'ftr'
45 |
46 | # Output paths for the features generated by this script
47 | tr_fea_out_path = output_root_path + 'tr_fea_{}.{}'.format(postfix, file_type)
48 | te_fea_out_path = output_root_path + 'te_fea_{}.{}'.format(postfix, file_type)
49 |
50 | # Full training data after merging the current features with the previous features
51 | tr_out_path = output_root_path + 'tr_s0_{}.{}'.format(postfix, file_type)
52 | te_out_path = output_root_path + 'te_s0_{}.{}'.format(postfix, file_type)
53 |
54 | ID_NAMES = ['description_id', 'paper_id']
55 | PROCESS_NUM = 15
56 |
57 | # load data
58 | ts = time.time()
59 | dictionary = corpora.Dictionary.load('../../feat/corpus.dict')
60 | tfidf = models.TfidfModel.load('../../feat/tfidf.model')
61 |
62 | print ('load data completed, cost {}s'.format(np.round(time.time() - ts, 2)))
63 |
64 | def sum_score(x, y):
65 | return max(x, 0) + max(y, 0)
66 |
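# The distance functions below operate on gensim-style sparse vectors, i.e. lists of
# (token_id, weight) tuples as returned by tfidf[dictionary.doc2bow(...)];
# each function returns -1 when either vector is missing.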
67 | def cos_dis(vec_x, vec_y, norm=False):
68 | if vec_x == None or vec_y == None:
69 | return -1
70 | dic_x = {v[0]: v[1] for v in vec_x}
71 | dic_y = {v[0]: v[1] for v in vec_y}
72 |
73 | dot_prod = 0
74 | for k, x in dic_x.items():
75 | y = dic_y.get(k, 0)
76 | dot_prod += x * y
77 | norm_x = np.linalg.norm([v[1] for v in vec_x])
78 | norm_y = np.linalg.norm([v[1] for v in vec_y])
79 |
80 | cos = dot_prod / (norm_x * norm_y)
81 | return 0.5 * cos + 0.5 if norm else cos # normalize to the [0, 1] range
82 |
83 | def eucl_dis(vec_x, vec_y):
84 | if vec_x == None or vec_y == None:
85 | return -1
86 | dic_x = {v[0]: v[1] for v in vec_x}
87 | dic_y = {v[0]: v[1] for v in vec_y}
88 | lis_i = list(set(list(dic_x.keys()) + list(dic_y.keys())))
89 | squa_sum = 0
90 | for i in lis_i:
91 | x, y = dic_x.get(i, 0), dic_y.get(i, 0)
92 | squa_sum += np.square(x - y)
93 | return np.sqrt(squa_sum)
94 |
95 | def manh_dis(vec_x, vec_y):
96 | if vec_x == None or vec_y == None:
97 | return -1
98 | dic_x = {v[0]: v[1] for v in vec_x}
99 | dic_y = {v[0]: v[1] for v in vec_y}
100 | lis_i = list(set(list(dic_x.keys()) + list(dic_y.keys())))
101 | abs_sum = 0
102 | for i in lis_i:
103 | x, y = dic_x.get(i, 0), dic_y.get(i, 0)
104 | abs_sum += np.abs(x - y)
105 | return abs_sum
106 |
107 | def get_bm25_corp(quer, paper_id):
108 | quer_vec = dictionary.doc2bow(quer.split(' '))
109 | corp_score = bm25_corp.get_score(quer_vec, paper_ids.index(paper_id))
110 | return corp_score
111 |
112 | def get_bm25_abst(quer, paper_id):
113 | quer_vec = dictionary.doc2bow(quer.split(' '))
114 | abst_score = bm25_abst.get_score(quer_vec, paper_ids.index(paper_id))
115 | return abst_score
116 |
117 | def get_bm25_titl(quer, paper_id):
118 | quer_vec = dictionary.doc2bow(quer.split(' '))
119 | titl_score = bm25_titl.get_score(quer_vec, paper_ids.index(paper_id))
120 | return titl_score
121 |
122 | def single_process_feat(params=None):
123 | ts = time.time()
124 | (df, i) = params
125 |
126 | ts = time.time()
127 | print (i, 'start', datetime.now())
128 | # tfidf vec dis
129 | df['quer_key_vec'] = df['quer_key'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
130 | df['quer_all_vec'] = df['quer_all'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
131 | df['titl_vec'] = df['titl'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
132 | df['abst_vec'] = df['abst'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
133 | df['corp_vec'] = df['corp'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
134 | print (i, 'load vec completed, cost {}s'.format(np.round(time.time() - ts, 2)))
135 |
136 | ts = time.time()
137 | vec_type = 'tfidf'
138 | for vec_x in ['quer_key', 'quer_all']:
139 | for vec_y in ['abst', 'titl', 'corp']:
140 | df['{}_{}_{}_cos_dis'.format(vec_x, vec_type, vec_y)] = df.progress_apply(lambda row: \
141 | cos_dis(row['{}_vec'.format(vec_x)], row['{}_vec'.format(vec_y)]), axis=1)
142 | df['{}_{}_{}_eucl_dis'.format(vec_x, vec_type, vec_y)] = df.progress_apply(lambda row: \
143 | eucl_dis(row['{}_vec'.format(vec_x)], row['{}_vec'.format(vec_y)]), axis=1)
144 | df['{}_{}_{}_manh_dis'.format(vec_x, vec_type, vec_y)] = df.progress_apply(lambda row: \
145 | manh_dis(row['{}_vec'.format(vec_x)], row['{}_vec'.format(vec_y)]), axis=1)
146 |
147 | print (i, vec_x, 'tfidf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
148 |
149 | del_cols = [col for col in df.columns if df[col].dtype == 'O' and col not in ID_NAMES]
150 | print ('del cols', del_cols)
151 | df.drop(del_cols, axis=1, inplace=True)
152 | return df
153 |
154 | def partition(df, num):
155 | df_partitions, step = [], int(np.ceil(df.shape[0]/num))
156 | for i in range(0, df.shape[0], step):
157 | df_partitions.append(df.iloc[i:i+step])
158 | return df_partitions
159 |
160 | def multi_process_feat(df):
161 | pool = Pool(PROCESS_NUM)
162 | df = df[ID_NAMES + ['quer_key', 'quer_all', 'abst', 'titl', 'corp']]
163 | df_parts = partition(df, PROCESS_NUM)
164 | print ('{} processes init and partition to {} parts' \
165 | .format(PROCESS_NUM, PROCESS_NUM))
166 | ts = time.time()
167 |
168 | param_list = [(df_parts[i], i) \
169 | for i in range(PROCESS_NUM)]
170 | dfs = pool.map(single_process_feat, param_list)
171 | df_out = pd.concat(dfs, axis=0)
172 | return df_out
173 |
174 | def gen_samples(paper, tr_desc_path, tr_recall_path, fea_out_path):
175 | tr_desc = loader.load_df(tr_desc_path)
176 | tr = loader.load_df(tr_recall_path)
177 | # tr = tr.head(1000)
178 |
179 | tr = tr.merge(paper, on=['paper_id'], how='left')
180 | tr = tr.merge(tr_desc[['description_id', 'quer_key', 'quer_all']], on=['description_id'], how='left')
181 |
182 | print (tr.columns)
183 | print (tr.head())
184 |
185 | tr_feat = multi_process_feat(tr)
186 | loader.save_df(tr_feat, fea_out_path)
187 |
188 | tr = tr.merge(tr_feat, on=ID_NAMES, how='left')
189 | del_cols = [col for col in tr.columns if tr[col].dtype == 'O' and col not in ID_NAMES]
190 | print ('tr del cols', del_cols)
191 | return tr.drop(del_cols, axis=1)
192 |
193 |
194 | # Add vector similarity features
195 |
196 | if __name__ == "__main__":
197 |
198 | ts = time.time()
199 | tqdm.pandas()
200 | print('start time: %s' % datetime.now())
201 | paper = loader.load_df('../../input/paper_input_final.ftr')
202 | paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
203 | paper['corp'] = paper['abst'] + ' ' + paper['titl'] + ' ' + paper['keywords'].fillna('').replace(';', ' ')
204 |
205 | tr_desc_path = '../../input/tr_input_final.ftr'
206 | te_desc_path = '../../input/te_input_final.ftr'
207 |
208 | tr_recall_path = '../../feat/tr_s0_30-50.ftr'
209 | te_recall_path = '../../feat/te_s0_30-50.ftr'
210 |
211 | tr = gen_samples(paper, tr_desc_path, tr_recall_path, tr_fea_out_path)
212 | print (tr.columns)
213 | print ([col for col in tr.columns if tr[col].dtype == 'O'])
214 | loader.save_df(tr, tr_out_path)
215 |
216 | te = gen_samples(paper, te_desc_path, te_recall_path, te_fea_out_path)
217 | print (te.columns)
218 | loader.save_df(te, te_out_path)
219 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
220 |
221 |
222 |
223 |
224 |
--------------------------------------------------------------------------------
/src/feature/feat37-pairwise.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Generate pairwise ranking features from the first-stage LGB predictions
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import pickle
12 | from datetime import datetime
13 | from tqdm import tqdm
14 |
15 | # Data processing
16 | import numpy as np
17 | import pandas as pd
18 | from tqdm import tqdm
19 | from multiprocessing import Pool
20 |
21 | # Custom utility packages
22 | sys.path.append('../../tools/')
23 | import loader
24 | import pandas_util
25 | import custom_bm25 as bm25
26 | from feat_utils import try_divide, dump_feat_name
27 |
28 | # Open-source packages
29 | import nltk
30 | import gensim
31 | from gensim.models import Word2Vec
32 | from gensim.models.word2vec import LineSentence
33 | from gensim import corpora, models, similarities
34 | from gensim.similarities import SparseMatrixSimilarity
35 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
36 |
37 | # Set random seed
38 | SEED = 2020
39 |
40 | input_root_path = '../../input/'
41 | output_root_path = '../../feat/'
42 |
43 | FEA_NUM = '37'
44 | postfix = 's0_{}'.format(FEA_NUM)
45 | file_type = 'ftr'
46 |
47 | # Output paths for the features generated by this script
48 | tr_fea_out_path = output_root_path + 'tr_fea_{}.{}'.format(postfix, file_type)
49 | te_fea_out_path = output_root_path + 'te_fea_{}.{}'.format(postfix, file_type)
50 |
51 | # Full training data after merging the current features with the previous features
52 | tr_out_path = output_root_path + 'tr_{}.{}'.format(postfix, file_type)
53 | te_out_path = output_root_path + 'te_{}.{}'.format(postfix, file_type)
54 |
55 | ID_NAMES = ['description_id', 'paper_id']
56 | PROCESS_NUM = 20
57 |
58 | # load data
59 | ts = time.time()
60 |
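# Pairwise features built from the first-stage LGB predictions (lgb_m3_32-50-0):
# per-description prediction rank, top-1/2/3/5 score statistics, and the gap
# between each candidate's score and those statistics.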
61 | def feat_extract(df, is_te=False):
62 | if is_te:
63 | df_pred = loader.load_df('../../output/m3/lgb_m3_32-50-0/lgb_m3_32-50-0.ftr')
64 | else:
65 | df_pred = loader.load_df('../../output/m3/lgb_m3_32-50-0/lgb_m3_32-50-0_cv.ftr')
66 | df_pred = df_pred[ID_NAMES + ['target']]
67 |
68 | df_pred = df_pred.sort_values(by=['target'], ascending=False)
69 | df_pred['pred_rank'] = df_pred.groupby(['description_id']).cumcount().values
70 | df_pred = df_pred.sort_values(by=['description_id', 'target'])
71 | print (df_pred.shape)
72 | print (df_pred.head(10))
73 |
74 | pred_top1 = df_pred[df_pred['pred_rank'] == 0] \
75 | .drop_duplicates(subset='description_id', keep='first')
76 | pred_top1 = pred_top1[['description_id', 'target']]
77 | pred_top1.columns = ['description_id', 'top1_pred']
78 |
79 | pred_top2 = df_pred[df_pred['pred_rank'] < 2]
80 | pred_top2['top2_pred_avg'] = pred_top2.groupby('description_id')['target'].transform('mean')
81 | pred_top2['top2_pred_std'] = pred_top2.groupby('description_id')['target'].transform('std')
82 | pred_top2 = pred_top2[['description_id', 'top2_pred_avg', \
83 | 'top2_pred_std']].drop_duplicates(subset=['description_id'])
84 |
85 | pred_top3 = df_pred[df_pred['pred_rank'] < 3]
86 | pred_top3['top3_pred_avg'] = pred_top3.groupby('description_id')['target'].transform('mean')
87 | pred_top3['top3_pred_std'] = pred_top3.groupby('description_id')['target'].transform('std')
88 | pred_top3 = pred_top3[['description_id', 'top3_pred_avg', \
89 | 'top3_pred_std']].drop_duplicates(subset=['description_id'])
90 |
91 | pred_top5 = df_pred[df_pred['pred_rank'] < 5]
92 | pred_top5['top5_pred_avg'] = pred_top5.groupby('description_id')['target'].transform('mean')
93 | pred_top5['top5_pred_std'] = pred_top5.groupby('description_id')['target'].transform('std')
94 | pred_top5 = pred_top5[['description_id', 'top5_pred_avg', \
95 | 'top5_pred_std']].drop_duplicates(subset=['description_id'])
96 |
97 | df_pred.rename(columns={'target': 'pred'}, inplace=True)
98 | df = df.merge(df_pred, on=ID_NAMES, how='left')
99 | df = df.merge(pred_top1, on=['description_id'], how='left')
100 | df = df.merge(pred_top2, on=['description_id'], how='left')
101 | df = df.merge(pred_top3, on=['description_id'], how='left')
102 | df = df.merge(pred_top5, on=['description_id'], how='left')
103 |
104 | df['pred_sub_top1'] = df['pred'] - df['top1_pred']
105 | df['pred_sub_top2_avg'] = df['pred'] - df['top2_pred_avg']
106 | df['pred_sub_top3_avg'] = df['pred'] - df['top3_pred_avg']
107 | df['pred_sub_top5_avg'] = df['pred'] - df['top5_pred_avg']
108 |
109 | del_cols = ['paper_id', 'pred', 'pred_rank']
110 | df.drop(del_cols, axis=1, inplace=True)
111 | df_feat = df.drop_duplicates(subset=['description_id'])
112 |
113 | print ('df_feat info')
114 | print (df_feat.shape)
115 | print (df_feat.head())
116 | print (df_feat.columns.tolist())
117 |
118 | return df_feat
119 |
120 | def output_fea(tr, te):
121 | print (tr.head())
122 | print (te.head())
123 |
124 | loader.save_df(tr, tr_fea_out_path)
125 | loader.save_df(te, te_fea_out_path)
126 |
127 | def gen_fea():
128 | tr = loader.load_df('../../feat/tr_s0_32-50.ftr')
129 | te = loader.load_df('../../feat/te_s0_32-50.ftr')
130 |
131 | tr_feat = feat_extract(tr[ID_NAMES])
132 | te_feat = feat_extract(te[ID_NAMES], is_te=True)
133 |
134 | tr = tr[ID_NAMES].merge(tr_feat, on=['description_id'], how='left')
135 | te = te[ID_NAMES].merge(te_feat, on=['description_id'], how='left')
136 |
137 | print (tr.shape, te.shape)
138 | print (tr.head())
139 | print (te.head())
140 | print (tr.columns)
141 |
142 | output_fea(tr, te)
143 |
144 | # Merge existing features
145 | def merge_fea(tr_list, te_list):
146 | tr = loader.merge_fea(tr_list, primary_keys=ID_NAMES)
147 | te = loader.merge_fea(te_list, primary_keys=ID_NAMES)
148 |
149 | print (tr.head())
150 | print (te.head())
151 | print (tr.columns.tolist())
152 |
153 | loader.save_df(tr, tr_out_path)
154 | loader.save_df(te, te_out_path)
155 |
156 | if __name__ == "__main__":
157 |
158 | print('start time: %s' % datetime.now())
159 | root_path = '../../feat/'
160 | base_tr_path = root_path + 'tr_s0_32-50.ftr'
161 | base_te_path = root_path + 'te_s0_32-50.ftr'
162 |
163 | gen_fea()
164 |
165 | # merge fea
166 | prefix = 's0'
167 | fea_list = [FEA_NUM]
168 |
169 | tr_list = [base_tr_path] + \
170 | [root_path + 'tr_fea_{}_{}.ftr'.format(prefix, i) for i in fea_list]
171 | te_list = [base_te_path] + \
172 | [root_path + 'te_fea_{}_{}.ftr'.format(prefix, i) for i in fea_list]
173 |
174 | merge_fea(tr_list, te_list)
175 |
176 | print('all completed: %s' % datetime.now())
177 |
178 |
179 |
--------------------------------------------------------------------------------
/src/feature/feat38-stk.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Generate stacking features from the base models' prediction probabilities
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import pickle
12 | from datetime import datetime
13 | from tqdm import tqdm
14 |
15 | # Data processing
16 | import numpy as np
17 | import pandas as pd
18 | from tqdm import tqdm
19 | from multiprocessing import Pool
20 |
21 | # Custom utility packages
22 | sys.path.append('../../tools/')
23 | import loader
24 | import pandas_util
25 | import custom_bm25 as bm25
26 | from nlp_preprocess import preprocess
27 | from feat_utils import try_divide, dump_feat_name
28 |
29 | # Open-source packages
30 | import nltk
31 | import gensim
32 | from gensim.models import Word2Vec
33 | from gensim.models.word2vec import LineSentence
34 | from gensim import corpora, models, similarities
35 | from gensim.similarities import SparseMatrixSimilarity
36 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
37 |
38 | # Set random seed
39 | SEED = 2020
40 |
41 | input_root_path = '../../input/'
42 | output_root_path = '../../feat/'
43 |
44 | FEA_NUM = 38
45 |
46 | postfix = 's0_{}'.format(FEA_NUM)
47 | file_type = 'ftr'
48 |
49 | # Output paths for the features generated by this script
50 | tr_fea_out_path = output_root_path + 'tr_fea_{}.{}'.format(postfix, file_type)
51 | te_fea_out_path = output_root_path + 'te_fea_{}.{}'.format(postfix, file_type)
52 |
53 | # Full training data after merging the current features with the previous features
54 | tr_out_path = output_root_path + 'tr_{}.{}'.format(postfix, file_type)
55 | te_out_path = output_root_path + 'te_{}.{}'.format(postfix, file_type)
56 |
57 | ID_NAMES = ['description_id', 'paper_id']
58 | PROCESS_NUM = 20
59 |
60 | # load data
61 | ts = time.time()
62 |
63 | def feat_extract(tr_path, te_path, prefix):
64 | tr_sample = loader.load_df('../../feat/tr_s0_37.ftr')
65 | te_sample = loader.load_df('../../feat/te_s0_37.ftr')
66 |
67 | tr = loader.load_df(tr_path)
68 | te = loader.load_df(te_path)
69 |
70 | del_cols = ['label']
71 | del_cols = [col for col in tr.columns if col in del_cols]
72 | tr.drop(del_cols, axis=1, inplace=True)
73 |
74 | tr = tr_sample[ID_NAMES].merge(tr, on=ID_NAMES, how='left')
75 | te = te_sample[ID_NAMES].merge(te, on=ID_NAMES, how='left')
76 |
77 | tr.columns = ID_NAMES + [prefix]
78 | te.columns = ID_NAMES + [prefix]
79 |
80 | print (prefix)
81 | print (tr.shape, te.shape)
82 | print (tr.head())
83 |
84 | tr = tr[prefix]
85 | te = te[prefix]
86 |
87 | return tr, te
88 |
89 | def output_fea(tr, te):
90 | print (tr.head())
91 | print (te.head())
92 |
93 | loader.save_df(tr, tr_fea_out_path)
94 | loader.save_df(te, te_fea_out_path)
95 |
96 | # Generate stacking features: collect out-of-fold / test probabilities from the base models listed below
97 | def gen_fea(base_tr_path=None, base_te_path=None):
98 |
99 | tr_sample = loader.load_df('../../feat/tr_s0_37.ftr')
100 | te_sample = loader.load_df('../../feat/te_s0_37.ftr')
101 |
102 | prefixs = ['m1_cat_03', 'm1_infesent_simple', 'm1_nn_02', \
103 | 'm2_ESIM_001', 'm2_ESIMplus_001', 'lgb_m3_37-0']
104 |
105 | tr_paths = ['{}_tr.ftr'.format(prefix) for prefix in prefixs]
106 | te_paths = ['final_{}_te.ftr'.format(prefix) for prefix in prefixs]
107 |
108 | tr_paths = ['../../stk_feat/{}'.format(p) for p in tr_paths]
109 | te_paths = ['../../stk_feat/{}'.format(p) for p in te_paths]
110 |
111 |
112 | trs, tes = [], []
113 | for i, prefix in enumerate(prefixs):
114 | tr, te = feat_extract(tr_paths[i], te_paths[i], prefix + '_prob')
115 | trs.append(tr)
116 | tes.append(te)
117 | tr = pd.concat([tr_sample[ID_NAMES]] + trs, axis=1)
118 | te = pd.concat([te_sample[ID_NAMES]] + tes, axis=1)
119 |
120 | float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
121 | tr[float_cols] = tr[float_cols].astype('float32')
122 | te[float_cols] = te[float_cols].astype('float32')
123 |
124 | print (tr.shape, te.shape)
125 | print (tr.head())
126 | print (te.head())
127 | print (tr.columns)
128 |
129 | output_fea(tr, te)
130 |
131 | # Merge existing features
132 | def merge_fea(tr_list, te_list):
133 | tr = loader.merge_fea(tr_list, primary_keys=ID_NAMES)
134 | te = loader.merge_fea(te_list, primary_keys=ID_NAMES)
135 |
136 | print (tr.head())
137 | print (te.head())
138 | print (tr.columns.tolist())
139 |
140 | loader.save_df(tr, tr_out_path)
141 | loader.save_df(te, te_out_path)
142 |
143 | if __name__ == "__main__":
144 |
145 | print('start time: %s' % datetime.now())
146 | root_path = '../../feat/'
147 | base_tr_path = root_path + 'tr_s0_37.ftr'
148 | base_te_path = root_path + 'te_s0_37.ftr'
149 |
150 | gen_fea()
151 |
152 | # merge fea
153 | prefix = 's0'
154 | fea_list = [FEA_NUM]
155 |
156 | tr_list = [base_tr_path] + \
157 | [root_path + 'tr_fea_{}_{}.ftr'.format(prefix, i) for i in fea_list]
158 | te_list = [base_te_path] + \
159 | [root_path + 'te_fea_{}_{}.ftr'.format(prefix, i) for i in fea_list]
160 |
161 | merge_fea(tr_list, te_list)
162 |
163 | print('all completed: %s' % datetime.now())
164 |
165 |
166 |
167 |
--------------------------------------------------------------------------------
/src/feature/gen_samples.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | import warnings
5 | warnings.filterwarnings('always')
6 | warnings.filterwarnings('ignore')
7 |
8 | # Basic modules
9 | import os
10 | import sys
11 | import time
12 | from datetime import datetime
13 | from tqdm import tqdm
14 |
15 | # Data processing
16 | import numpy as np
17 | import pandas as pd
18 |
19 | # Custom utility packages
20 | sys.path.append('../../tools/')
21 | import loader
22 | import pandas_util
23 |
24 | # Open-source packages
25 | from gensim.models import Word2Vec
26 | from gensim.models.word2vec import LineSentence
27 | from gensim import corpora, models, similarities
28 | from gensim.similarities import SparseMatrixSimilarity
29 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
30 |
31 | # Set random seed
32 | SEED = 2020
33 |
34 | def topk_lines(df, k):
35 | df.loc[:, 'rank'] = df.groupby(['description_id']).cumcount().values
36 | df = df[df['rank'] < k]
37 | df.drop(['rank'], axis=1, inplace=True)
38 | return df
39 |
40 | def process(in_path, k):
41 | ID_NAMES = ['description_id', 'paper_id']
42 |
43 | df = loader.load_df(in_path)
44 | df = topk_lines(df, k)
45 | df['sim_score'] = df['sim_score'].astype('float')
46 | df.rename(columns={'sim_score': 'corp_sim_score'}, inplace=True)
47 | return df
48 |
49 |
50 | if __name__ == "__main__":
51 |
52 | ts = time.time()
53 | tr_path = '../../feat/tr_tfidf_30.ftr'
54 | te_path = '../../feat/te_tfidf_30.ftr'
55 |
56 | cv = loader.load_df('../../input/cv_ids_0109.csv')[['description_id', 'cv']]
57 |
58 | tr = process(tr_path, k=50)
59 | tr = tr.merge(cv, on=['description_id'], how='left')
60 |
61 | te = process(te_path, k=50)
62 | te['cv'] = 0
63 |
64 | loader.save_df(tr, '../../feat/tr_samples_30-50.ftr')
65 | loader.save_df(te, '../../feat/te_samples_30-50.ftr')
66 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
67 |
68 |
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/src/feature/tfidf_recall_30.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # bm25 recall
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import functools
12 | from tqdm import tqdm
13 | from six import iteritems
14 | from datetime import datetime
15 |
16 | # Data processing
17 | import re
18 | import math
19 | import pickle
20 | import numpy as np
21 | import pandas as pd
22 | from multiprocessing import Pool
23 |
24 | # Custom utility packages
25 | sys.path.append('../../tools/')
26 | import loader
27 | import pandas_util
28 | import custom_bm25 as bm25
29 |
30 | # Open-source packages
31 | from gensim.models import Word2Vec
32 | from gensim.models.word2vec import LineSentence
33 | from gensim import corpora, models, similarities
34 | from gensim.similarities import SparseMatrixSimilarity
35 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
36 |
37 | # Set random seed
38 | SEED = 2020
39 | PROCESS_NUM, PARTITION_NUM = 18, 18
40 |
41 | input_root_path = '../../input/'
42 | output_root_path = '../../feat/'
43 |
44 | postfix = '30'
45 | file_type = 'ftr'
46 |
47 | train_out_path = output_root_path + 'tr_tfidf_{}.{}'.format(postfix, file_type)
48 | test_out_path = output_root_path + 'te_tfidf_{}.{}'.format(postfix, file_type)
49 |
50 | def topk_sim_samples(desc, desc_ids, paper_ids, bm25_model, k=10):
51 | desc_id2papers = {}
52 | for desc_i in tqdm(range(len(desc))):
53 | query_vec, query_desc_id = desc[desc_i], desc_ids[desc_i]
54 | sims = bm25_model.get_scores(query_vec)
55 | sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
56 | sim_papers = [paper_ids[val[0]] for val in sort_sims[:k]]
57 | sim_scores = [str(val[1]) for val in sort_sims[:k]]
58 | desc_id2papers[query_desc_id] = ['|'.join(sim_papers), '|'.join(sim_scores)]
59 | sim_df = pd.DataFrame.from_dict(desc_id2papers, orient='index', columns=['paper_id', 'sim_score'])
60 | sim_df = sim_df.reset_index().rename(columns={'index':'description_id'})
61 | return sim_df
62 |
63 | def partition(queries, num):
64 | queries_partitions, step = [], int(np.ceil(len(queries)/num))
65 | for i in range(0, len(queries), step):
66 | queries_partitions.append(queries[i:i+step])
67 | return queries_partitions
68 |
69 | def single_process_search(params=None):
70 | (query_vecs, desc_ids, paper_ids, bm25_model, k, i) = params
71 | print (i, 'start', datetime.now())
72 | gc.collect()
73 | sim_df = topk_sim_samples(query_vecs, desc_ids, paper_ids, bm25_model, k)
74 | print (i, 'completed', datetime.now())
75 | return sim_df
76 |
77 | def multi_process_search(query_vecs, desc_ids, paper_ids, bm25_model, k):
78 | pool = Pool(PROCESS_NUM)
79 | queries_parts = partition(query_vecs, PARTITION_NUM)
80 | desc_ids_parts = partition(desc_ids, PARTITION_NUM)
81 | print ('{} processes init and partition to {} parts' \
82 | .format(PROCESS_NUM, PARTITION_NUM))
83 |
84 | param_list = [(queries_parts[i], desc_ids_parts[i], \
85 | paper_ids, bm25_model, k, i) for i in range(PARTITION_NUM)]
86 | sim_dfs = pool.map(single_process_search, param_list)
87 | sim_df = pd.concat(sim_dfs, axis=0)
88 | return sim_df
89 |
90 | def gen_samples(df, desc, desc_ids, corpus_list, paper_ids_list, k):
91 | df_samples_list = []
92 | for i, corpus in enumerate(corpus_list):
93 | bm25_model = bm25.BM25(corpus[0])
94 | cur_df_sample = multi_process_search(desc, desc_ids, \
95 | paper_ids_list[i], bm25_model, k)
96 | cur_df_sample_out = pandas_util.explode(cur_df_sample, ['paper_id', 'sim_score'])
97 | cur_df_sample_out['type'] = corpus[1] # recall_name
98 | df_samples_list.append(cur_df_sample_out)
99 | df_samples = pd.concat(df_samples_list, axis=0)
100 | df_samples.drop_duplicates(subset=['description_id', 'paper_id'], inplace=True)
101 | df_samples['target'] = 0
102 | return df_samples
103 |
104 | if __name__ == "__main__":
105 |
106 | ts = time.time()
107 | tqdm.pandas()
108 | print('start time: %s' % datetime.now())
109 | # load data
110 | df = loader.load_df(input_root_path + 'paper_input_final.ftr')
111 | df = df[~pd.isnull(df['paper_id'])]
112 |
113 | # gen tfidf vecs
114 | dictionary = pickle.load(open('../../feat/corpus.dict', 'rb'))
115 | print ('dic len', len(dictionary))
116 |
117 | df['corp'] = df['abst'] + ' ' + df['titl'] + ' ' + df['keywords'].fillna('').replace(';', ' ')
118 | df_corp, corp_paper_ids = [dictionary.doc2bow(line.split(' ')) for line in df['corp'].tolist()], \
119 | df['paper_id'].tolist()
120 |
121 | # gen topk sim samples
122 | paper_ids_list = [corp_paper_ids]
123 | corpus_list = [(df_corp, 'corp_bm25')]
124 | out_cols = ['description_id', 'paper_id', 'sim_score', 'target', 'type']
125 |
126 | if sys.argv[1] in ['tr']:
127 | # for tr ins
128 | tr = loader.load_df(input_root_path + 'tr_input_final.ftr')
129 | tr = tr[~pd.isnull(tr['description_id'])]
130 |
131 | # tr = tr.head(1000)
132 | tr_desc, tr_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in tr['quer_all'].tolist()], \
133 | tr['description_id'].tolist()
134 | print ('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
135 |
136 | tr_samples = gen_samples(tr, tr_desc, tr_desc_ids, \
137 | corpus_list, paper_ids_list, k=50)
138 | tr_samples = tr.rename(columns={'paper_id': 'target_paper_id'}) \
139 | .merge(tr_samples, on='description_id', how='left')
140 | tr_samples.loc[tr_samples['target_paper_id'] == tr_samples['paper_id'], 'target'] = 1
141 | loader.save_df(tr_samples[out_cols], train_out_path)
142 | print ('recall succ {} from {}'.format(tr_samples['target'].sum(), tr.shape[0]))
143 | print (tr.shape, tr_samples.shape)
144 |
145 | if sys.argv[1] in ['te']:
146 | # for te ins
147 | te = loader.load_df(input_root_path + 'te_input_final.ftr')
148 | te = te[~pd.isnull(te['description_id'])]
149 |
150 | # te = te.head(1000)
151 | te_desc, te_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in te['quer_all'].tolist()], \
152 | te['description_id'].tolist()
153 | print ('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
154 |
155 | te_samples = gen_samples(te, te_desc, te_desc_ids, \
156 | corpus_list, paper_ids_list, k=50)
157 | te_samples = te.merge(te_samples, on='description_id', how='left')
158 | loader.save_df(te_samples[out_cols], test_out_path)
159 | print (te.shape, te_samples.shape)
160 |
161 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
162 |
163 |
164 |
165 |
--------------------------------------------------------------------------------
/src/rank/m1/catboost03.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import datetime
10 | from catboost import CatBoostClassifier
11 | from time import time
12 | from tqdm import tqdm_notebook as tqdm
13 |
14 |
15 | # In[2]:
16 |
17 |
18 | feat_dir = "../../../feat/"
19 | input_dir = "../../../input/"
20 | cv_id = pd.read_csv("../../../input/cv_ids_0109.csv")
21 |
22 |
23 | # In[3]:
24 |
25 |
26 | train = pd.read_feather(f'{feat_dir}/tr_s0_32-50.ftr')
27 | train.drop(columns=['cv'],axis=1,inplace=True)
28 | train = train.merge(cv_id,on=['description_id'],how='left')
29 | train = train.dropna(subset=['cv']).reset_index(drop=True)
30 | # test = pd.read_feather(f'{feat_dir}/te_s0_20-50.ftr')
31 | test = pd.read_feather(f'{feat_dir}/te_s0_32-50.ftr')
32 |
33 |
34 | # In[4]:
35 |
36 |
37 | ID_NAMES = ['description_id', 'paper_id']
38 | TARGET_NAME = 'target'
39 |
40 |
41 | # In[5]:
42 |
43 |
44 | def get_feas(data):
45 | cols = data.columns.tolist()
46 | del_cols = ID_NAMES + ['target', 'cv']
47 | #sub_cols = ['year', 'corp_cos', 'corp_eucl', 'corp_manh', 'quer_all']
48 | sub_cols = ['year', 'corp_sim_score']
49 | sub_cols = ['year', 'pos_of_corp', 'pos_of_abst', 'pos_of_titl']
50 | for col in data.columns:
51 | for sub_col in sub_cols:
52 | if sub_col in col:
53 | del_cols.append(col)
54 |
55 | cols = [val for val in cols if val not in del_cols]
56 | print ('del_cols', del_cols)
57 | return cols
58 |
59 |
60 | # In[6]:
61 |
62 |
63 | feas = get_feas(train)
64 |
65 |
66 | # In[7]:
67 |
68 |
69 | def make_classifier():
70 | clf = CatBoostClassifier(
71 | loss_function='Logloss',
72 | eval_metric="AUC",
73 | # task_type="CPU",
74 | learning_rate=0.1, ###0.01
75 | iterations=2500, ###2000
76 | od_type="Iter",
77 | # depth=8,
78 | thread_count=10,
79 | early_stopping_rounds=100, ###100
80 | # l2_leaf_reg=1,
81 | # border_count=96,
82 | random_seed=42
83 | )
84 |
85 | return clf
86 |
87 |
88 | # In[8]:
89 |
90 |
91 | # Open-source packages
92 | import ml_metrics as metrics
93 | def cal_map(pred_valid,cv,train_df,tr_data):
94 | df_pred = train_df[train_df['cv']==cv].copy()
95 | df_pred['pred'] = pred_valid
96 | df_pred = df_pred[['description_id','paper_id','pred']]
97 | sort_df_pred = df_pred.sort_values(['description_id', 'pred'], ascending=False)
98 | df_pred = df_pred[['description_id']].drop_duplicates() .merge(sort_df_pred, on=['description_id'], how='left')
99 | df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
100 | df_pred = df_pred[df_pred['rank'] < 3]
101 | df_pred = df_pred.groupby(['description_id'])['paper_id'] .apply(lambda s : ','.join((s))).reset_index()
102 | df_pred = df_pred.merge(tr_data, on=['description_id'], how='left')
103 | df_pred.rename(columns={'paper_id': 'paper_ids'}, inplace=True)
104 | df_pred['paper_ids'] = df_pred['paper_ids'].apply(lambda s: s.split(','))
105 | df_pred['target_id'] = df_pred['target_id'].apply(lambda s: [s])
106 | return metrics.mapk(df_pred['target_id'].tolist(), df_pred['paper_ids'].tolist(), 3)
107 |
108 |
109 | # In[9]:
110 |
111 |
112 | import os
113 | model_dir = "./m1_model/catboost03"
114 | if not os.path.exists(model_dir):
115 | os.makedirs(model_dir)
116 |
117 |
118 | # In[10]:
119 |
120 |
121 | tr_data = pd.read_csv(f'{input_dir}/train_release.csv')
122 | tr_data = tr_data[['description_id', 'paper_id']].rename(columns={'paper_id': 'target_id'})
123 |
124 |
125 | # In[13]:
126 |
127 |
128 | for fea in feas:
129 | if fea not in test.columns:
130 | print(fea)
131 |
132 |
133 | # In[14]:
134 |
135 |
136 | CV_RESULT_OUT=True
137 |
138 |
139 | # In[15]:
140 |
141 |
142 | def train_one_fold(type_train_df,type_test_df,model_dir,cv,pi=False):
143 | print(" fold " + str(cv))
144 | train_data = type_train_df[(type_train_df['cv']!=cv)]
145 | valid_data = type_train_df[(type_train_df['cv']==cv)]
146 |
147 | des_id = valid_data['description_id']
148 | paper_id = valid_data['paper_id']
149 |
150 | idx_train = train_data.index
151 | idx_val = valid_data.index
152 | des_id = valid_data['description_id']
153 | paper_id = valid_data['paper_id']
154 | model_name = "fold_{}_cbt_best.model".format(str(cv))
155 | model_name_wrt = os.path.join(model_dir,model_name)
156 | clf = make_classifier()
157 | imp=pd.DataFrame()
158 | if not os.path.exists(model_name_wrt):
159 | clf.fit(train_data[feas], train_data[['target']], eval_set=(valid_data[feas],valid_data[['target']]),
160 | use_best_model=True, verbose=100)
161 | clf.save_model(model_name_wrt)
162 | fea_ = clf.feature_importances_
163 | fea_name = clf.feature_names_
164 | imp = pd.DataFrame({'name':fea_name,'imp':fea_})
165 | else:
166 | clf.load_model(model_name_wrt)
167 | cv_predict=clf.predict_proba(valid_data[feas])[:,1]
168 | # print(cv_predict.shape)
169 | cv_score_fold = cal_map(cv_predict,cv,type_train_df,tr_data)
170 | if CV_RESULT_OUT:
171 | cv_preds = cv_predict
172 | rdf = pd.DataFrame()
173 | rdf = rdf.reindex(columns=['description_id','paper_id','pred'])
174 | rdf['description_id'] = des_id
175 | rdf['paper_id'] = paper_id
176 | rdf['pred'] = cv_preds
177 | test_des_id = type_test_df['description_id']
178 | test_paper_id = type_test_df['paper_id']
179 | test_preds = clf.predict_proba(type_test_df[feas])[:,1]
180 | test_df = pd.DataFrame()
181 | test_df = test_df.reindex(columns=['description_id','paper_id','pred'])
182 | test_df['description_id'] = test_des_id
183 | test_df['paper_id'] = test_paper_id
184 | test_df['pred'] = test_preds
185 | return rdf,test_df,cv_score_fold,imp
186 |
187 |
188 | # In[16]:
189 |
190 |
191 | kfold = 5
192 | type_scores = []
193 | type_cv_results = []
194 | type_test_results = []
195 | model_name = '../../../output/m1/catboost03/'
196 | fold_scores = []
197 | fold_cv_results = []
198 | fold_test_results = []
199 | imps=[]
200 | # test_preds = np.zeros(len(test))
201 | for cv in range(1,kfold+1):  # cv fold ids run from 1 to 5
202 | cv_df,test_df,cv_score,imp = train_one_fold(train,test,model_dir,cv)
203 | # fold_cv_results.append(cv_df)
204 | # fold_test_results.append(test_df)
205 | cv_df.to_csv(f"{model_name}_cv_{cv}.csv",index=False)
206 | test_df.to_csv(f"{model_name}_result_{cv}.csv",index=False)
207 | imp.to_csv(f"{model_name}_imp_{cv}.csv",index=False)
208 | print("fold {} finished".format(cv))
209 | print(cv_score)
210 | fold_scores.append(cv_score)
211 | imps.append(imp)
212 |
213 |
214 | # In[1]:
215 |
216 |
217 | np.mean(fold_scores)
218 |
219 | #0.35309347230573923
220 | #0.3522860689007414
221 | #0.3585175465159315
222 | #0.35720084429290466
223 | #0.34729405401751007
224 |
225 |
226 | # In[ ]:
227 |
228 |
229 | result = []
230 | for i in range(1,6):
231 | re_csv = f"{model_name}_result_{i}.csv"
232 | test_df = pd.read_csv(re_csv)
233 | result.append(test_df)
234 |
235 |
236 | # In[ ]:
237 |
238 |
239 | final_test = result[0].copy()
240 |
241 |
242 | # In[ ]:
243 |
244 |
245 | for i in range(1,5):
246 | final_test['pred']+=result[i]['pred']
247 |
248 |
249 | # In[ ]:
250 |
251 |
252 | final_test['pred'] = final_test['pred']/5
253 |
254 |
255 | # In[ ]:
256 |
257 |
258 | final_test.to_csv("../../../output/m1/nn02/te_catboost03newtest.csv",index=False)
259 |
260 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/.gitignore:
--------------------------------------------------------------------------------
1 | # Object files
2 | *.o
3 | *.ko
4 | *.obj
5 | *.elf
6 |
7 | # Precompiled Headers
8 | *.gch
9 | *.pch
10 |
11 | # Libraries
12 | *.lib
13 | *.a
14 | *.la
15 | *.lo
16 |
17 | # Shared objects (inc. Windows DLLs)
18 | *.dll
19 | *.so
20 | *.so.*
21 | *.dylib
22 |
23 | # Executables
24 | *.exe
25 | *.out
26 | *.app
27 | *.i*86
28 | *.x86_64
29 | *.hex
30 |
31 | # Debug files
32 | *.dSYM/
33 |
34 |
35 | build/*
36 | *.swp
37 |
38 | # OS X stuff
39 | ._*
40 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/.travis.yml:
--------------------------------------------------------------------------------
1 | language: c
2 | dist: trusty
3 | sudo: required
4 | before_install:
5 | - sudo apt-get install python2.7 python-numpy python-pip
6 | script: pip install numpy && ./demo.sh | tee results.txt && [[ `cat results.txt | egrep "Total accuracy. 2[23]" | wc -l` = "1" ]] && echo test-passed
7 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/Makefile:
--------------------------------------------------------------------------------
1 | CC = gcc
2 | #For older gcc, use -O3 or -O2 instead of -Ofast
3 | # CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wno-unused-result
4 | CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic
5 | BUILDDIR := build
6 | SRCDIR := src
7 |
8 | all: dir glove shuffle cooccur vocab_count
9 |
10 | dir :
11 | mkdir -p $(BUILDDIR)
12 | glove : $(SRCDIR)/glove.c
13 | $(CC) $(SRCDIR)/glove.c -o $(BUILDDIR)/glove $(CFLAGS)
14 | shuffle : $(SRCDIR)/shuffle.c
15 | $(CC) $(SRCDIR)/shuffle.c -o $(BUILDDIR)/shuffle $(CFLAGS)
16 | cooccur : $(SRCDIR)/cooccur.c
17 | $(CC) $(SRCDIR)/cooccur.c -o $(BUILDDIR)/cooccur $(CFLAGS)
18 | vocab_count : $(SRCDIR)/vocab_count.c
19 | $(CC) $(SRCDIR)/vocab_count.c -o $(BUILDDIR)/vocab_count $(CFLAGS)
20 |
21 | clean:
22 | rm -rf glove shuffle cooccur vocab_count build
23 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/README.md:
--------------------------------------------------------------------------------
1 | ## GloVe: Global Vectors for Word Representation
2 |
3 |
4 | | nearest neighbors of frog | Litoria | Leptodactylidae | Rana | Eleutherodactylus |
5 | | --- | --- | --- | --- | --- |
6 | | Pictures | (image) | (image) | (image) | (image) |
7 |
8 | | Comparisons | man -> woman | city -> zip | comparative -> superlative |
9 | | --- | --- | --- | --- |
10 | | GloVe Geometry | (image) | (image) | (image) |
11 |
12 | We provide an implementation of the GloVe model for learning word representations, and describe how to download web-dataset vectors or train your own. See the [project page](http://nlp.stanford.edu/projects/glove/) or the [paper](http://nlp.stanford.edu/pubs/glove.pdf) for more information on glove vectors.
13 |
14 | ## Download pre-trained word vectors
15 | The links below contain word vectors obtained from the respective corpora. If you want word vectors trained on massive web datasets, you need only download one of these text files! Pre-trained word vectors are made available under the Public Domain Dedication and License.
16 |
17 |
18 | - Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): glove.42B.300d.zip
19 | - Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): glove.840B.300d.zip
20 | - Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download): glove.6B.zip
21 | - Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB download): glove.twitter.27B.zip
22 |
23 |
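Once unzipped, each vector file is plain whitespace-separated text: a word followed by its vector components. A minimal loading sketch in Python (illustrative only, not part of the GloVe tools; assumes `glove.6B.300d.txt` is in the working directory):

    import numpy as np

    embeddings = {}
    with open('glove.6B.300d.txt', encoding='utf-8') as f:
        for line in f:
            word, *values = line.rstrip().split(' ')
            embeddings[word] = np.asarray(values, dtype='float32')  # 300-d vector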
24 |
25 | ## Train word vectors on a new corpus
26 |
27 |
28 |
29 | If the web datasets above don't match the semantics of your end use case, you can train word vectors on your own corpus.
30 |
31 | $ git clone http://github.com/stanfordnlp/glove
32 | $ cd glove && make
33 | $ ./demo.sh
34 |
35 | The demo.sh script downloads a small corpus, consisting of the first 100M characters of Wikipedia. It collects unigram counts, constructs and shuffles cooccurrence data, and trains a simple version of the GloVe model. It also runs a word analogy evaluation script in python to verify word vector quality. More details about training on your own corpus can be found by reading [demo.sh](https://github.com/stanfordnlp/GloVe/blob/master/demo.sh) or the [src/README.md](https://github.com/stanfordnlp/GloVe/tree/master/src)
36 |
37 | ### License
38 | All work contained in this package is licensed under the Apache License, Version 2.0. See the included LICENSE file.
39 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/demo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
5 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python
6 |
7 | make
8 | if [ ! -e text8 ]; then
9 | if hash wget 2>/dev/null; then
10 | wget http://mattmahoney.net/dc/text8.zip
11 | else
12 | curl -O http://mattmahoney.net/dc/text8.zip
13 | fi
14 | unzip text8.zip
15 | rm text8.zip
16 | fi
17 |
18 | CORPUS=../corpus.txt
19 | VOCAB_FILE=vocab.txt
20 | COOCCURRENCE_FILE=cooccurrence.bin
21 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
22 | BUILDDIR=build
23 | SAVE_FILE=vectors
24 | VERBOSE=2
25 | MEMORY=4.0
26 | VOCAB_MIN_COUNT=2
27 | VECTOR_SIZE=300
28 | MAX_ITER=15
29 | WINDOW_SIZE=15
30 | BINARY=2
31 | NUM_THREADS=8
32 | X_MAX=10
33 |
34 | echo
35 | echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
36 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE
37 | echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE"
38 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE
39 | echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
40 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
41 | echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE"
42 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE
43 | if [ "$CORPUS" = 'text8' ]; then
44 | if [ "$1" = 'matlab' ]; then
45 | matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2
46 | elif [ "$1" = 'octave' ]; then
47 | octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
48 | else
49 | echo "$ python eval/python/evaluate.py"
50 | python eval/python/evaluate.py
51 | fi
52 | fi
53 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/matlab/WordLookup.m:
--------------------------------------------------------------------------------
1 | function index = WordLookup(InputString)
2 | global wordMap
3 | if wordMap.isKey(InputString)
4 | index = wordMap(InputString);
5 | elseif wordMap.isKey('')
6 | index = wordMap('');
7 | else
8 | index = 0;
9 | end
10 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/matlab/evaluate_vectors.m:
--------------------------------------------------------------------------------
1 | function [BB] = evaluate_vectors(W)
2 |
3 | global wordMap
4 |
5 | filenames = {'capital-common-countries' 'capital-world' 'currency' 'city-in-state' 'family' 'gram1-adjective-to-adverb' ...
6 | 'gram2-opposite' 'gram3-comparative' 'gram4-superlative' 'gram5-present-participle' 'gram6-nationality-adjective' ...
7 | 'gram7-past-tense' 'gram8-plural' 'gram9-plural-verbs'};
8 | path = './eval/question-data/';
9 |
10 | split_size = 100; %to avoid memory overflow, could be increased/decreased depending on system and vocab size
11 |
12 | correct_sem = 0; %count correct semantic questions
13 | correct_syn = 0; %count correct syntactic questions
14 | correct_tot = 0; %count correct questions
15 | count_sem = 0; %count all semantic questions
16 | count_syn = 0; %count all syntactic questions
17 | count_tot = 0; %count all questions
18 | full_count = 0; %count all questions, including those with unknown words
19 |
20 | if wordMap.isKey('')
21 | unkkey = wordMap('');
22 | else
23 | unkkey = 0;
24 | end
25 |
26 | for j=1:length(filenames);
27 |
28 | clear dist;
29 |
30 | fid=fopen([path filenames{j} '.txt']);
31 | temp=textscan(fid,'%s%s%s%s');
32 | fclose(fid);
33 | ind1 = cellfun(@WordLookup,temp{1}); %indices of first word in analogy
34 | ind2 = cellfun(@WordLookup,temp{2}); %indices of second word in analogy
35 | ind3 = cellfun(@WordLookup,temp{3}); %indices of third word in analogy
36 | ind4 = cellfun(@WordLookup,temp{4}); %indices of answer word in analogy
37 | full_count = full_count + length(ind1);
38 | ind = (ind1 ~= unkkey) & (ind2 ~= unkkey) & (ind3 ~= unkkey) & (ind4 ~= unkkey); %only look at those questions which have no unknown words
39 | ind1 = ind1(ind);
40 | ind2 = ind2(ind);
41 | ind3 = ind3(ind);
42 | ind4 = ind4(ind);
43 | disp([filenames{j} ':']);
44 | mx = zeros(1,length(ind1));
45 | num_iter = ceil(length(ind1)/split_size);
46 | for jj=1:num_iter
47 | range = (jj-1)*split_size+1:min(jj*split_size,length(ind1));
48 | dist = full(W * (W(ind2(range),:)' - W(ind1(range),:)' + W(ind3(range),:)')); %cosine similarity if input W has been normalized
49 | for i=1:length(range)
50 | dist(ind1(range(i)),i) = -Inf;
51 | dist(ind2(range(i)),i) = -Inf;
52 | dist(ind3(range(i)),i) = -Inf;
53 | end
54 | [~, mx(range)] = max(dist); %predicted word index
55 | end
56 |
57 | val = (ind4 == mx'); %correct predictions
58 | count_tot = count_tot + length(ind1);
59 | correct_tot = correct_tot + sum(val);
60 | disp(['ACCURACY TOP1: ' num2str(mean(val)*100,'%-2.2f') '% (' num2str(sum(val)) '/' num2str(length(val)) ')']);
61 | if j < 6
62 | count_sem = count_sem + length(ind1);
63 | correct_sem = correct_sem + sum(val);
64 | else
65 | count_syn = count_syn + length(ind1);
66 | correct_syn = correct_syn + sum(val);
67 | end
68 |
69 | disp(['Total accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% Semantic accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% Syntactic accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '%']);
70 |
71 | end
72 | disp('________________________________________________________________________________');
73 | disp(['Questions seen/total: ' num2str(100*count_tot/full_count,'%-2.2f') '% (' num2str(count_tot) '/' num2str(full_count) ')']);
74 | disp(['Semantic Accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% (' num2str(correct_sem) '/' num2str(count_sem) ')']);
75 | disp(['Syntactic Accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '% (' num2str(correct_syn) '/' num2str(count_syn) ')']);
76 | disp(['Total Accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% (' num2str(correct_tot) '/' num2str(count_tot) ')']);
77 | BB = [100*correct_sem/count_sem 100*correct_syn/count_syn 100*correct_tot/count_tot];
78 |
79 | end
80 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/matlab/read_and_evaluate.m:
--------------------------------------------------------------------------------
1 | addpath('./eval/matlab');
2 | if(~exist('vocab_file'))
3 | vocab_file = 'vocab.txt';
4 | end
5 | if(~exist('vectors_file'))
6 | vectors_file = 'vectors.bin';
7 | end
8 |
9 | fid = fopen(vocab_file, 'r');
10 | words = textscan(fid, '%s %f');
11 | fclose(fid);
12 | words = words{1};
13 | vocab_size = length(words);
14 | global wordMap
15 | wordMap = containers.Map(words(1:vocab_size),1:vocab_size);
16 |
17 | fid = fopen(vectors_file,'r');
18 | fseek(fid,0,'eof');
19 | vector_size = ftell(fid)/16/vocab_size - 1;
20 | frewind(fid);
21 | WW = fread(fid, [vector_size+1 2*vocab_size], 'double')';
22 | fclose(fid);
23 |
24 | W1 = WW(1:vocab_size, 1:vector_size); % word vectors
25 | W2 = WW(vocab_size+1:end, 1:vector_size); % context (tilde) word vectors
26 |
27 | W = W1 + W2; %Evaluate on sum of word vectors
28 | W = bsxfun(@rdivide,W,sqrt(sum(W.*W,2))); %normalize vectors before evaluation
29 | evaluate_vectors(W);
30 | exit
31 |
32 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/octave/WordLookup_octave.m:
--------------------------------------------------------------------------------
1 | function index = WordLookup_octave(InputString)
2 | global wordMap
3 |
4 | if isfield(wordMap, InputString)
5 | index = wordMap.(InputString);
6 | elseif isfield(wordMap, '')
7 | index = wordMap.('');
8 | else
9 | index = 0;
10 | end
11 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/octave/evaluate_vectors_octave.m:
--------------------------------------------------------------------------------
1 | function [BB] = evaluate_vectors_octave(W)
2 |
3 | global wordMap
4 |
5 | filenames = {'capital-common-countries' 'capital-world' 'currency' 'city-in-state' 'family' 'gram1-adjective-to-adverb' ...
6 | 'gram2-opposite' 'gram3-comparative' 'gram4-superlative' 'gram5-present-participle' 'gram6-nationality-adjective' ...
7 | 'gram7-past-tense' 'gram8-plural' 'gram9-plural-verbs'};
8 | path = './eval/question-data/';
9 |
10 | split_size = 100; %to avoid memory overflow, could be increased/decreased depending on system and vocab size
11 |
12 | correct_sem = 0; %count correct semantic questions
13 | correct_syn = 0; %count correct syntactic questions
14 | correct_tot = 0; %count correct questions
15 | count_sem = 0; %count all semantic questions
16 | count_syn = 0; %count all syntactic questions
17 | count_tot = 0; %count all questions
18 | full_count = 0; %count all questions, including those with unknown words
19 |
20 |
21 | if isfield(wordMap, '')
22 | unkkey = wordMap.('');
23 | else
24 | unkkey = 0;
25 | end
26 |
27 | for j=1:length(filenames);
28 |
29 | clear dist;
30 |
31 | fid=fopen([path filenames{j} '.txt']);
32 | temp=textscan(fid,'%s%s%s%s');
33 | fclose(fid);
34 | ind1 = cellfun(@WordLookup_octave,temp{1}); %indices of first word in analogy
35 | ind2 = cellfun(@WordLookup_octave,temp{2}); %indices of second word in analogy
36 | ind3 = cellfun(@WordLookup_octave,temp{3}); %indices of third word in analogy
37 | ind4 = cellfun(@WordLookup_octave,temp{4}); %indices of answer word in analogy
38 | full_count = full_count + length(ind1);
39 | ind = (ind1 ~= unkkey) & (ind2 ~= unkkey) & (ind3 ~= unkkey) & (ind4 ~= unkkey); %only look at those questions which have no unknown words
40 | ind1 = ind1(ind);
41 | ind2 = ind2(ind);
42 | ind3 = ind3(ind);
43 | ind4 = ind4(ind);
44 | disp([filenames{j} ':']);
45 | mx = zeros(1,length(ind1));
46 | num_iter = ceil(length(ind1)/split_size);
47 | for jj=1:num_iter
48 | range = (jj-1)*split_size+1:min(jj*split_size,length(ind1));
49 | dist = full(W * (W(ind2(range),:)' - W(ind1(range),:)' + W(ind3(range),:)')); %cosine similarity if input W has been normalized
50 | for i=1:length(range)
51 | dist(ind1(range(i)),i) = -Inf;
52 | dist(ind2(range(i)),i) = -Inf;
53 | dist(ind3(range(i)),i) = -Inf;
54 | end
55 | [~, mx(range)] = max(dist); %predicted word index
56 | end
57 |
58 | val = (ind4 == mx'); %correct predictions
59 | count_tot = count_tot + length(ind1);
60 | correct_tot = correct_tot + sum(val);
61 | disp(['ACCURACY TOP1: ' num2str(mean(val)*100,'%-2.2f') '% (' num2str(sum(val)) '/' num2str(length(val)) ')']);
62 | if j < 6
63 | count_sem = count_sem + length(ind1);
64 | correct_sem = correct_sem + sum(val);
65 | else
66 | count_syn = count_syn + length(ind1);
67 | correct_syn = correct_syn + sum(val);
68 | end
69 |
70 | disp(['Total accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% Semantic accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% Syntactic accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '%']);
71 |
72 | end
73 | disp('________________________________________________________________________________');
74 | disp(['Questions seen/total: ' num2str(100*count_tot/full_count,'%-2.2f') '% (' num2str(count_tot) '/' num2str(full_count) ')']);
75 | disp(['Semantic Accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% (' num2str(correct_sem) '/' num2str(count_sem) ')']);
76 | disp(['Syntactic Accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '% (' num2str(correct_syn) '/' num2str(count_syn) ')']);
77 | disp(['Total Accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% (' num2str(correct_tot) '/' num2str(count_tot) ')']);
78 | BB = [100*correct_sem/count_sem 100*correct_syn/count_syn 100*correct_tot/count_tot];
79 |
80 | end
81 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/octave/read_and_evaluate_octave.m:
--------------------------------------------------------------------------------
1 | addpath('./eval/octave');
2 | if(~exist('vocab_file'))
3 | vocab_file = 'vocab.txt';
4 | end
5 | if(~exist('vectors_file'))
6 | vectors_file = 'vectors.bin';
7 | end
8 |
9 | fid = fopen(vocab_file, 'r');
10 | words = textscan(fid, '%s %f');
11 | fclose(fid);
12 | words = words{1};
13 | vocab_size = length(words);
14 | global wordMap
15 |
16 | wordMap = struct();
17 | for i=1:numel(words)
18 | wordMap.(words{i}) = i;
19 | end
20 |
21 | fid = fopen(vectors_file,'r');
22 | fseek(fid,0,'eof');
23 | vector_size = ftell(fid)/16/vocab_size - 1;
24 | frewind(fid);
25 | WW = fread(fid, [vector_size+1 2*vocab_size], 'double')';
26 | fclose(fid);
27 |
28 | W1 = WW(1:vocab_size, 1:vector_size); % word vectors
29 | W2 = WW(vocab_size+1:end, 1:vector_size); % context (tilde) word vectors
30 |
31 | W = W1 + W2; %Evaluate on sum of word vectors
32 | W = bsxfun(@rdivide,W,sqrt(sum(W.*W,2))); %normalize vectors before evaluation
33 | evaluate_vectors_octave(W);
34 | exit
35 |
36 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/python/distance.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 | import sys
4 |
5 | def generate():
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str)
8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str)
9 | args = parser.parse_args()
10 |
11 | with open(args.vocab_file, 'r') as f:
12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()]
13 | with open(args.vectors_file, 'r') as f:
14 | vectors = {}
15 | for line in f:
16 | vals = line.rstrip().split(' ')
17 | vectors[vals[0]] = [float(x) for x in vals[1:]]
18 |
19 | vocab_size = len(words)
20 | vocab = {w: idx for idx, w in enumerate(words)}
21 | ivocab = {idx: w for idx, w in enumerate(words)}
22 |
23 | vector_dim = len(vectors[ivocab[0]])
24 | W = np.zeros((vocab_size, vector_dim))
25 | for word, v in vectors.items():
26 | if word == '':
27 | continue
28 | W[vocab[word], :] = v
29 |
30 | # normalize each word vector to unit length
31 | W_norm = np.zeros(W.shape)
32 | d = (np.sum(W ** 2, 1) ** (0.5))
33 | W_norm = (W.T / d).T
34 | return (W_norm, vocab, ivocab)
35 |
36 |
37 | def distance(W, vocab, ivocab, input_term):
38 | for idx, term in enumerate(input_term.split(' ')):
39 | if term in vocab:
40 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term]))
41 | if idx == 0:
42 | vec_result = np.copy(W[vocab[term], :])
43 | else:
44 | vec_result += W[vocab[term], :]
45 | else:
46 | print('Word: %s Out of dictionary!\n' % term)
47 | return
48 |
49 | vec_norm = np.zeros(vec_result.shape)
50 | d = (np.sum(vec_result ** 2,) ** (0.5))
51 | vec_norm = (vec_result.T / d).T
52 |
53 | dist = np.dot(W, vec_norm.T)
54 |
55 | for term in input_term.split(' '):
56 | index = vocab[term]
57 | dist[index] = -np.Inf
58 |
59 | a = np.argsort(-dist)[:N]
60 |
61 | print("\n Word Cosine distance\n")
62 | print("---------------------------------------------------------\n")
63 | for x in a:
64 | print("%35s\t\t%f\n" % (ivocab[x], dist[x]))
65 |
66 |
67 | if __name__ == "__main__":
68 | N = 100; # number of closest words that will be shown
69 | W, vocab, ivocab = generate()
70 | while True:
71 | input_term = raw_input("\nEnter word or sentence (EXIT to break): ")
72 | if input_term == 'EXIT':
73 | break
74 | else:
75 | distance(W, vocab, ivocab, input_term)
76 |
77 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/python/evaluate.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 |
4 | def main():
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument('--vocab_file', default='vocab.txt', type=str)
7 | parser.add_argument('--vectors_file', default='vectors.txt', type=str)
8 | args = parser.parse_args()
9 |
10 | with open(args.vocab_file, 'r') as f:
11 | words = [x.rstrip().split(' ')[0] for x in f.readlines()]
12 | with open(args.vectors_file, 'r') as f:
13 | vectors = {}
14 | for line in f:
15 | vals = line.rstrip().split(' ')
16 | vectors[vals[0]] = [float(x) for x in vals[1:]]
17 |
18 | vocab_size = len(words)
19 | vocab = {w: idx for idx, w in enumerate(words)}
20 | ivocab = {idx: w for idx, w in enumerate(words)}
21 |
22 | vector_dim = len(vectors[ivocab[0]])
23 | W = np.zeros((vocab_size, vector_dim))
24 | for word, v in vectors.items():
25 | if word == '':
26 | continue
27 | W[vocab[word], :] = v
28 |
29 | # normalize each word vector to unit length
30 | W_norm = np.zeros(W.shape)
31 | d = (np.sum(W ** 2, 1) ** (0.5))
32 | W_norm = (W.T / d).T
33 | evaluate_vectors(W_norm, vocab, ivocab)
34 |
35 | def evaluate_vectors(W, vocab, ivocab):
36 | """Evaluate the trained word vectors on a variety of tasks"""
37 |
38 | filenames = [
39 | 'capital-common-countries.txt', 'capital-world.txt', 'currency.txt',
40 | 'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt',
41 | 'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt',
42 | 'gram5-present-participle.txt', 'gram6-nationality-adjective.txt',
43 | 'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt',
44 | ]
45 | prefix = './eval/question-data/'
46 |
47 | # to avoid memory overflow, could be increased/decreased
48 | # depending on system and vocab size
49 | split_size = 100
50 |
51 | correct_sem = 0; # count correct semantic questions
52 | correct_syn = 0; # count correct syntactic questions
53 | correct_tot = 0 # count correct questions
54 | count_sem = 0; # count all semantic questions
55 | count_syn = 0; # count all syntactic questions
56 | count_tot = 0 # count all questions
57 | full_count = 0 # count all questions, including those with unknown words
58 |
59 | for i in range(len(filenames)):
60 | with open('%s/%s' % (prefix, filenames[i]), 'r') as f:
61 | full_data = [line.rstrip().split(' ') for line in f]
62 | full_count += len(full_data)
63 | data = [x for x in full_data if all(word in vocab for word in x)]
64 |
65 | indices = np.array([[vocab[word] for word in row] for row in data])
66 | ind1, ind2, ind3, ind4 = indices.T
67 |
68 | predictions = np.zeros((len(indices),))
69 | num_iter = int(np.ceil(len(indices) / float(split_size)))
70 | for j in range(num_iter):
71 | subset = np.arange(j*split_size, min((j + 1)*split_size, len(ind1)))
72 |
73 | pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
74 | + W[ind3[subset], :])
75 | #cosine similarity if input W has been normalized
76 | dist = np.dot(W, pred_vec.T)
77 |
78 | for k in range(len(subset)):
79 | dist[ind1[subset[k]], k] = -np.Inf
80 | dist[ind2[subset[k]], k] = -np.Inf
81 | dist[ind3[subset[k]], k] = -np.Inf
82 |
83 | # predicted word index
84 | predictions[subset] = np.argmax(dist, 0).flatten()
85 |
86 | val = (ind4 == predictions) # correct predictions
87 | count_tot = count_tot + len(ind1)
88 | correct_tot = correct_tot + sum(val)
89 | if i < 5:
90 | count_sem = count_sem + len(ind1)
91 | correct_sem = correct_sem + sum(val)
92 | else:
93 | count_syn = count_syn + len(ind1)
94 | correct_syn = correct_syn + sum(val)
95 |
96 | print("%s:" % filenames[i])
97 | print('ACCURACY TOP1: %.2f%% (%d/%d)' %
98 | (np.mean(val) * 100, np.sum(val), len(val)))
99 |
100 | print('Questions seen/total: %.2f%% (%d/%d)' %
101 | (100 * count_tot / float(full_count), count_tot, full_count))
102 | print('Semantic accuracy: %.2f%% (%i/%i)' %
103 | (100 * correct_sem / float(count_sem), correct_sem, count_sem))
104 | print('Syntactic accuracy: %.2f%% (%i/%i)' %
105 | (100 * correct_syn / float(count_syn), correct_syn, count_syn))
106 | print('Total accuracy: %.2f%% (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot))
107 |
108 |
109 | if __name__ == "__main__":
110 | main()
111 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/python/word_analogy.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 | import sys
4 |
5 | def generate():
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str)
8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str)
9 | args = parser.parse_args()
10 |
11 | with open(args.vocab_file, 'r') as f:
12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()]
13 | with open(args.vectors_file, 'r') as f:
14 | vectors = {}
15 | for line in f:
16 | vals = line.rstrip().split(' ')
17 | vectors[vals[0]] = [float(x) for x in vals[1:]]
18 |
19 | vocab_size = len(words)
20 | vocab = {w: idx for idx, w in enumerate(words)}
21 | ivocab = {idx: w for idx, w in enumerate(words)}
22 |
23 | vector_dim = len(vectors[ivocab[0]])
24 | W = np.zeros((vocab_size, vector_dim))
25 | for word, v in vectors.items():
26 | if word == '':
27 | continue
28 | W[vocab[word], :] = v
29 |
30 | # normalize each word vector to unit length
31 | W_norm = np.zeros(W.shape)
32 | d = (np.sum(W ** 2, 1) ** (0.5))
33 | W_norm = (W.T / d).T
34 | return (W_norm, vocab, ivocab)
35 |
36 |
37 | def distance(W, vocab, ivocab, input_term):
38 | vecs = {}
39 | if len(input_term.split(' ')) < 3:
40 | print("Only %i words were entered.. three words are needed at the input to perform the calculation\n" % len(input_term.split(' ')))
41 | return
42 | else:
43 | for idx, term in enumerate(input_term.split(' ')):
44 | if term in vocab:
45 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term]))
46 | vecs[idx] = W[vocab[term], :]
47 | else:
48 | print('Word: %s Out of dictionary!\n' % term)
49 | return
50 |
51 | vec_result = vecs[1] - vecs[0] + vecs[2]
52 |
53 | vec_norm = np.zeros(vec_result.shape)
54 | d = (np.sum(vec_result ** 2,) ** (0.5))
55 | vec_norm = (vec_result.T / d).T
56 |
57 | dist = np.dot(W, vec_norm.T)
58 |
59 | for term in input_term.split(' '):
60 | index = vocab[term]
61 | dist[index] = -np.Inf
62 |
63 | a = np.argsort(-dist)[:N]
64 |
65 | print("\n Word Cosine distance\n")
66 | print("---------------------------------------------------------\n")
67 | for x in a:
68 | print("%35s\t\t%f\n" % (ivocab[x], dist[x]))
69 |
70 |
71 | if __name__ == "__main__":
72 | N = 100; # number of closest words that will be shown
73 | W, vocab, ivocab = generate()
74 | while True:
75 | input_term = raw_input("\nEnter three words (EXIT to break): ")
76 | if input_term == 'EXIT':
77 | break
78 | else:
79 | distance(W, vocab, ivocab, input_term)
80 |
81 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/src/README.md:
--------------------------------------------------------------------------------
1 | ### Package Contents
2 |
3 | To train your own GloVe vectors, first you'll need to prepare your corpus as a single text file with all words separated by one or more spaces or tabs. If your corpus has multiple documents, the documents (only) should be separated by new line characters. Cooccurrence contexts for words do not extend past newline characters. Once you create your corpus, you can train GloVe vectors using the following 4 tools. An example is included in `demo.sh`, which you can modify as necessary.
4 |
5 | The four main tools in this package are:
6 |
7 | #### 1) vocab_count
8 | This tool requires an input corpus that should already consist of whitespace-separated tokens. Use something like the [Stanford Tokenizer](https://nlp.stanford.edu/software/tokenizer.html) first on raw text. From the corpus, it constructs unigram counts, and optionally thresholds the resulting vocabulary based on total vocabulary size or minimum frequency count.
9 |
10 | #### 2) cooccur
11 | Constructs word-word cooccurrence statistics from a corpus. The user should supply a vocabulary file, as produced by `vocab_count`, and may specify a variety of parameters, as described by running `./build/cooccur`.
12 |
13 | #### 3) shuffle
14 | Shuffles the binary file of cooccurrence statistics produced by `cooccur`. For large files, the file is automatically split into chunks, each of which is shuffled and stored on disk before being merged and shuffled together. The user may specify a number of parameters, as described by running `./build/shuffle`.
15 |
16 | #### 4) glove
17 | Train the GloVe model on the specified cooccurrence data, which typically will be the output of the `shuffle` tool. The user should supply a vocabulary file, as given by `vocab_count`, and may specify a number of other parameters, which are described by running `./build/glove`.
18 |
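19 | As a minimal sketch of how the four tools chain together (mirroring the repository's `demo.sh`; `corpus.txt` and the output file names below are placeholders, and the flag values are the ones `demo.sh` uses):
20 | 
21 | ```bash
22 | # 1) build the vocabulary from a whitespace-tokenized corpus
23 | build/vocab_count -min-count 2 -verbose 2 < corpus.txt > vocab.txt
24 | # 2) accumulate word-word cooccurrence statistics within a 15-token window
25 | build/cooccur -memory 4.0 -vocab-file vocab.txt -verbose 2 -window-size 15 < corpus.txt > cooccurrence.bin
26 | # 3) shuffle the binary cooccurrence records before training
27 | build/shuffle -memory 4.0 -verbose 2 < cooccurrence.bin > cooccurrence.shuf.bin
28 | # 4) train 300-dimensional GloVe vectors on the shuffled cooccurrence data
29 | build/glove -save-file vectors -threads 8 -input-file cooccurrence.shuf.bin -x-max 10 -iter 15 -vector-size 300 -binary 2 -vocab-file vocab.txt -verbose 2
30 | ```
31 | 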
--------------------------------------------------------------------------------
/src/rank/m1/prepare_rank_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from tqdm import tqdm
10 |
11 |
12 | # In[2]:
13 |
14 |
15 | paper = pd.read_feather("../../../input/paper_input_final.ftr")
16 |
17 |
18 | # In[3]:
19 |
20 |
21 | paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
22 | paper['corp'] = paper['titl']+' '+paper['keywords'].fillna('').replace(';', ' ')+paper['abst']
23 |
24 |
25 | # In[4]:
26 |
27 |
28 | df_train = pd.read_feather("../../../input/tr_input_final.ftr")
29 |
30 |
31 | # In[5]:
32 |
33 |
34 | df_train.head()
35 |
36 |
37 | # In[6]:
38 |
39 |
40 | df_test = pd.read_feather("../../../input/te_input_final.ftr")
41 |
42 |
43 | # In[7]:
44 |
45 |
46 | df_test.head()
47 |
48 |
49 | # In[8]:
50 |
51 |
52 | #####reduce mem
53 | import datetime
54 | def pandas_reduce_mem_usage(df):
55 | start_mem=df.memory_usage().sum() / 1024**2
56 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
57 | starttime = datetime.datetime.now()
58 | for col in df.columns:
59 | col_type=df[col].dtype # dtype of the column
60 | if col_type !=object: # non-object (numeric) column
61 | c_min=df[col].min()
62 | c_max=df[col].max()
63 | # print('{} column dtype is {} and begin convert to others'.format(col,col_type))
64 | if str(col_type)[:3]=='int':
65 | # signed integer
66 | if c_min<0:
67 | if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
68 | df[col] = df[col].astype(np.int8)
69 | elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
70 | df[col] = df[col].astype(np.int16)
71 | elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
72 | df[col] = df[col].astype(np.int32)
73 | else:
74 | df[col] = df[col].astype(np.int64)
75 | else:
76 | if c_min >= np.iinfo(np.uint8).min and c_max<=np.iinfo(np.uint8).max:
77 | df[col]=df[col].astype(np.uint8)
78 | elif c_min >= np.iinfo(np.uint16).min and c_max <= np.iinfo(np.uint16).max:
79 | df[col] = df[col].astype(np.uint16)
80 | elif c_min >= np.iinfo(np.uint32).min and c_max <= np.iinfo(np.uint32).max:
81 | df[col] = df[col].astype(np.uint32)
82 | else:
83 | df[col] = df[col].astype(np.uint64)
84 | # floating point
85 | else:
86 | if c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
87 | df[col] = df[col].astype(np.float32)
88 | else:
89 | df[col] = df[col].astype(np.float64)
90 | # print('\t\tcolumn dtype is {}'.format(df[col].dtype))
91 |
92 | # object dtype (e.g. str)
93 | else:
94 | # print('\t\tcolumns dtype is object and will convert to category')
95 | df[col] = df[col].astype('category')
96 | end_mem = df.memory_usage().sum() / 1024 ** 2
97 | endtime = datetime.datetime.now()
98 | print('consume times: {:.4f}'.format((endtime - starttime).seconds))
99 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
100 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
101 | return df
102 |
103 |
104 | # In[9]:
105 |
106 |
107 | recall_train = pd.read_feather('../../../input/tr_s0_32-50.ftr')
108 | recall_test = pd.read_feather('../../../input/te_s0_32-50.ftr')
109 |
110 |
111 | # In[10]:
112 |
113 |
114 | recall_train = pandas_reduce_mem_usage(recall_train)
115 |
116 |
117 | # In[11]:
118 |
119 |
120 | recall_test = pandas_reduce_mem_usage(recall_test)
121 |
122 |
123 | # In[12]:
124 |
125 |
126 | recall_train.shape
127 |
128 |
129 | # In[13]:
130 |
131 |
132 | cv_id = pd.read_csv("../../../input/cv_ids_0109.csv")
133 | recall_train.drop(columns=['cv'],axis=1,inplace=True)
134 | recall_train = recall_train.merge(cv_id,on=['description_id'],how='left')
135 |
136 |
137 | # In[14]:
138 |
139 |
140 | recall_train = recall_train.dropna(subset=['cv']).reset_index(drop=True)
141 |
142 |
143 | # In[15]:
144 |
145 |
146 | recall_train.shape,recall_test.shape
147 |
148 |
149 | # In[16]:
150 |
151 |
152 | recall_train = recall_train.merge(paper[['paper_id','corp']],on=['paper_id'],how='left')
153 | recall_test = recall_test.merge(paper[['paper_id','corp']],on=['paper_id'],how='left')
154 |
155 |
156 | # In[17]:
157 |
158 |
159 | recall_train = recall_train.merge(df_train[['description_id','quer_key','quer_all']],on=['description_id'],how='left')
160 | recall_test = recall_test.merge(df_test[['description_id','quer_key','quer_all']],on=['description_id'],how='left')
161 |
162 |
163 | # In[18]:
164 |
165 |
166 | recall_train = recall_train.sort_values(['description_id', 'corp_sim_score'], ascending=False)
167 | recall_train['rank'] = recall_train.groupby('description_id').cumcount().values
168 | recall_test = recall_test.sort_values(['description_id', 'corp_sim_score'], ascending=False)
169 | recall_test['rank'] = recall_test.groupby('description_id').cumcount().values
170 |
171 |
172 | # In[19]:
173 |
174 |
175 | keep_columns = ['description_id','paper_id','corp','quer_key','quer_all','corp_sim_score','cv','rank','target']
176 | recall_train = recall_train[keep_columns].reset_index(drop=True)
177 | recall_test = recall_test[keep_columns].reset_index(drop=True)
178 |
179 |
180 | # In[20]:
181 |
182 |
183 | recall_train.head()
184 |
185 |
186 | # In[22]:
187 |
188 |
189 | recall_train.to_csv('recall_train.csv',index=False)
190 |
191 |
192 | # In[23]:
193 |
194 |
195 | recall_test.to_csv('recall_test.csv',index=False)
196 |
197 |
198 | # In[ ]:
199 |
200 |
201 | # recall_train.shape
202 |
203 |
--------------------------------------------------------------------------------
/src/rank/m1/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | #### depends on paper_input_1.ftr and te_input_1.ftr
4 | #### write out the training corpus and train the word2vec word vectors
5 | python3 w2v_training.py ### placeholder for the jupyter notebook for now
6 |
7 | ### train the GloVe word vectors
8 | cd glove && make
9 | bash demo.sh
10 | #### return to the parent directory
11 | cd ..
12 |
13 | ### serialize the word vectors
14 |
15 |
16 |
17 | ### prepare the training data
18 | python3 prepare_rank_train.py ### placeholder for the jupyter notebook for now
19 |
20 | ### inferSent-simple 5-fold training
21 | python3 inferSent1-5-fold_train.py ### placeholder for the jupyter notebook for now
22 |
23 | ### inferSent-simple 5-fold prediction
24 | python3 inferSent1-5-fold_predict.py ### placeholder for the jupyter notebook for now
25 |
26 | ### catboost model training & prediction
27 | python3 catboost03.py ### placeholder for the jupyter notebook for now
28 |
29 | ### nn02 model training & prediction
30 | python3 nn02_train.py ### placeholder for the jupyter notebook for now
31 | python3 nn02_predict.py ### placeholder for the jupyter notebook for now
32 |
--------------------------------------------------------------------------------
/src/rank/m1/w2v_training.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | # external vec
8 | import warnings
9 | warnings.filterwarnings('always')
10 | warnings.filterwarnings('ignore')
11 |
12 | import os
13 | import sys
14 | import numpy as np
15 | import pandas as pd
16 | from tqdm import tqdm
17 |
18 | import time
19 | from datetime import datetime
20 | from gensim.models import Word2Vec
21 | from gensim.models.word2vec import LineSentence
22 | from gensim import corpora, models, similarities
23 | from gensim.similarities import SparseMatrixSimilarity
24 | from gensim.similarities import MatrixSimilarity
25 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
26 |
27 |
28 | # In[3]:
29 |
30 |
31 | paper = pd.read_feather("../../../input/paper_input_final.ftr")
32 |
33 |
34 | # In[4]:
35 |
36 |
37 | paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
38 | paper['corp'] = paper['titl']+' '+paper['keywords'].fillna('').replace(';', ' ')+paper['abst']
39 |
40 |
41 | # In[5]:
42 |
43 |
44 | paper.head()
45 |
46 |
47 | # In[6]:
48 |
49 |
50 | paper['len'] = paper['corp'].apply(len)
51 |
52 |
53 | # In[7]:
54 |
55 |
56 | paper['len'].describe()
57 |
58 |
59 | # In[8]:
60 |
61 |
62 | df_train = pd.read_feather("../../../input/tr_input_final.ftr")
63 |
64 |
65 | # In[9]:
66 |
67 |
68 | df_train.head()
69 |
70 |
71 | # In[10]:
72 |
73 |
74 | df_train['len'] = df_train['quer_key'].apply(len)
75 | df_train['len'].describe()
76 |
77 |
78 | # In[16]:
79 |
80 |
81 | df_test = pd.read_feather("../../../input/te_input_final.ftr")
82 |
83 |
84 | # In[17]:
85 |
86 |
87 | df_test.head()
88 |
89 |
90 | # In[18]:
91 |
92 |
93 | # df_train[df_train['quer_all'].str.contains("[##]")]
94 |
95 |
96 | # In[19]:
97 |
98 |
99 | from tqdm import tqdm
100 | ### prepare the training corpus
101 | with open("corpus.txt","w+") as f:
102 | for i in tqdm(range(len(paper))):
103 | abst = paper.iloc[i]['abst']
104 | if abst!='no_content' and abst!="none":
105 | f.write(abst+"\n")
106 | title = paper.iloc[i]['titl']
107 | if title!='no_content' and title!="none":
108 | f.write(title+"\n")
109 | for i in tqdm(range(len(df_train))):
110 | quer_all = df_train.iloc[i]['quer_all']
111 | f.write(quer_all+"\n")
112 | for i in tqdm(range(len(df_test))):
113 | quer_all = df_test.iloc[i]['quer_all']
114 | f.write(quer_all+"\n")
115 |
116 |
117 | # In[23]:
118 |
119 |
120 | ####word2vector
121 | from gensim.models import word2vec
122 | sentences = word2vec.LineSentence('./corpus.txt')
123 | model = word2vec.Word2Vec(sentences, sg=1,min_count=2,window=8,size=300,iter=6,sample=1e-4, hs=1, workers=12)
124 |
125 |
126 | # In[24]:
127 |
128 |
129 | model.save("word2vec.model")
130 |
131 |
132 | # In[34]:
133 |
134 |
135 | model.wv.save_word2vec_format("word2vec.txt",binary=False)
136 |
137 |
138 | # In[26]:
139 |
140 |
141 | # the GloVe vectors are already available (trained by glove/demo.sh)
142 | from gensim.test.utils import datapath, get_tmpfile
143 | from gensim.models import KeyedVectors
144 |
145 |
146 | # In[31]:
147 |
148 |
149 | # input file
150 | glove_file = datapath('glove/vectors.txt')
151 | # output file
152 | tmp_file = get_tmpfile("glove_vec.txt")
153 |
154 |
--------------------------------------------------------------------------------
/src/rank/m2/bert_5_fold_predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gc
3 | from tqdm import tqdm
4 | import numpy as np
5 | import pandas as pd
6 |
7 | import torch
8 | from pytorch_transformers import AdamW, WarmupLinearSchedule
9 | import matchzoo as mz
10 | from matchzoo.preprocessors.units.truncated_length import TruncatedLength
11 | from utils import MAP, build_matrix, topk_lines, predict, Logger
12 |
13 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"
14 |
15 | import argparse
16 |
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument('--model_id', type=str, default='bert_002')
19 | args = parser.parse_args()
20 |
21 | model_id = args.model_id
22 |
23 | if model_id=="bert_002":
24 | test_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_final_test_processed_query_key.dp")
25 | bst_epochs = {1:1, 2:1, 3:2, 4:1, 5:1}
26 | if model_id=="bert_003":
27 | test_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_test_processed_query_all.dp")
28 | bst_epochs = {1:2, 2:1, 3:1, 4:2, 5:1}
29 | if model_id=="bert_004":
30 | test_processed = mz.data_pack.data_pack.load_data_pack(
31 | "bert_data/bert_final_test_processed_query_all_nopreprocessing.dp/")
32 | bst_epochs = {1:2, 2:2, 3:1, 4:1, 5:1}
33 |
34 | padding_callback = mz.models.Bert.get_default_padding_callback()
35 | testset = mz.dataloader.Dataset(
36 | data_pack=test_processed,
37 | batch_size=128,
38 | sort=False,
39 | shuffle=False
40 | )
41 | testloader = mz.dataloader.DataLoader(
42 | dataset=testset,
43 | stage='dev',
44 | callback=padding_callback
45 | )
46 |
47 |
48 | num_dup = 1
49 | num_neg = 7
50 |
51 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
52 | padding_callback = mz.models.Bert.get_default_padding_callback()
53 | task = mz.tasks.Ranking(losses=losses)
54 | task.metrics = [
55 | mz.metrics.MeanAveragePrecision(),
56 | MAP()
57 | ]
58 |
59 | model = mz.models.Bert()
60 |
61 | model.params['task'] = task
62 | model.params['mode'] = 'bert-base-uncased'
63 | model.params['dropout_rate'] = 0.2
64 |
65 | model.build()
66 |
67 | print('Trainable params: ', sum(p.numel() for p in model.parameters() if p.requires_grad))
68 |
69 | no_decay = ['bias', 'LayerNorm.weight']
70 | optimizer_grouped_parameters = [
71 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 5e-5},
72 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
73 | ]
74 |
75 |
76 | optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, betas=(0.9, 0.98), eps=1e-8)
77 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=6, t_total=-1)
78 |
79 | trainer = mz.trainers.Trainer(
80 | model=model,
81 | optimizer=optimizer,
82 | scheduler=scheduler,
83 | trainloader=testloader,
84 | validloader=testloader,
85 | validate_interval=None,
86 | epochs=1
87 | )
88 |
89 |
90 | for fold in range(1,6):
91 | i = bst_epochs[fold]
92 | trainer.restore_model("save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, i))
93 |
94 | score = predict(trainer, testloader)
95 | X, y = test_processed.unpack()
96 | result = pd.DataFrame(data={
97 | 'description_id': X['id_left'],
98 | 'paper_id': X['id_right'],
99 | 'score': score[:,0]})
100 | # result.to_csv("result/{}/{}_fold_{}_test.csv".format(model_id, model_id, fold), index=False)
101 | result.to_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold), index=False)
102 |
103 |
104 |
--------------------------------------------------------------------------------
/src/rank/m2/bert_5_fold_train.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"
3 |
4 | import gc
5 | from tqdm import tqdm
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import torch
10 | from pytorch_transformers import AdamW, WarmupLinearSchedule
11 | import matchzoo as mz
12 | from matchzoo.preprocessors.units.truncated_length import TruncatedLength
13 | from utils import MAP, build_matrix, topk_lines, predict, Logger
14 |
15 | from matchzoo.data_pack import DataPack
16 |
17 | import argparse
18 |
19 | parser = argparse.ArgumentParser()
20 | parser.add_argument('--model_id', type=str, default='bert_002')
21 | args = parser.parse_args()
22 |
23 | model_id = args.model_id
24 |
25 | num_dup = 1
26 | num_neg = 7
27 |
28 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
29 | padding_callback = mz.models.Bert.get_default_padding_callback()
30 | task = mz.tasks.Ranking(losses=losses)
31 | task.metrics = [
32 | mz.metrics.MeanAveragePrecision(),
33 | MAP()
34 | ]
35 |
36 | with Logger(log_filename = '{}.log'.format(model_id)):
37 | for fold in range(1,6):
38 | if model_id=='bert_002':
39 | train_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_train_processed_{}.dp".format(fold))
40 | val_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_val_processed_{}.dp".format(fold))
41 | if model_id=='bert_003':
42 | train_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_train_processed_query_all_{}.dp".format(fold))
43 | val_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_val_processed_query_all_{}.dp".format(fold))
44 | if model_id=='bert_004':
45 | train_processed = mz.data_pack.data_pack.load_data_pack(
46 | "bert_data/bert_train_processed_query_all_nopreprocessing_{}.dp".format(fold))
47 | val_processed = mz.data_pack.data_pack.load_data_pack(
48 | "bert_data/bert_val_processed_query_all_nopreprocessing_{}.dp".format(fold))
49 |
50 | model = mz.models.Bert()
51 |
52 | model.params['task'] = task
53 | model.params['mode'] = 'bert-base-uncased'
54 | model.params['dropout_rate'] = 0.2
55 |
56 | model.build()
57 |
58 | print('Trainable params: ', sum(p.numel() for p in model.parameters() if p.requires_grad))
59 |
60 |
61 | trainset = mz.dataloader.Dataset(
62 | data_pack=train_processed,
63 | mode='pair',
64 | num_dup=num_dup,
65 | num_neg=num_neg,
66 | batch_size=1,
67 | resample=True,
68 | sort=False,
69 | shuffle=True
70 | )
71 | trainloader = mz.dataloader.DataLoader(
72 | dataset=trainset,
73 | stage='train',
74 | callback=padding_callback
75 | )
76 |
77 | valset = mz.dataloader.Dataset(
78 | data_pack=val_processed,
79 | batch_size=32,
80 | sort=False,
81 | shuffle=False
82 | )
83 | valloader = mz.dataloader.DataLoader(
84 | dataset=valset,
85 | stage='dev',
86 | callback=padding_callback
87 | )
88 |
89 |
90 | no_decay = ['bias', 'LayerNorm.weight']
91 | optimizer_grouped_parameters = [
92 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 5e-5},
93 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
94 | ]
95 |
96 | optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, betas=(0.9, 0.98), eps=1e-8)
97 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=6, t_total=-1)
98 |
99 | trainer = mz.trainers.Trainer(
100 | model=model,
101 | optimizer=optimizer,
102 | scheduler=scheduler,
103 | trainloader=trainloader,
104 | validloader=valloader,
105 | validate_interval=None,
106 | epochs=1
107 | )
108 |
109 | for i in range(0,8):
110 | print("="*10+" epoch: "+str(i)+" "+"="*10)
111 | trainer.run()
112 | trainer.save_model()
113 | os.rename("save/model.pt", "save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, i))
114 |
115 |
116 |
--------------------------------------------------------------------------------
/src/rank/m2/bert_preprocessing.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gc
3 | from tqdm import tqdm
4 | import numpy as np
5 | import pandas as pd
6 |
7 | import torch
8 | import matchzoo as mz
9 | from matchzoo.preprocessors.units.truncated_length import TruncatedLength
10 | from utils import MAP, build_matrix, topk_lines, predict
11 |
12 | import argparse
13 |
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('--preprocessing_type', type=str, default='fine')
16 | parser.add_argument('--left_truncated_length', type=int, default=64)
17 | parser.add_argument('--query_type', type=str, default='query_key')
18 | args = parser.parse_args()
19 |
20 | preprocessing_type = args.preprocessing_type
21 | left_truncated_length = args.left_truncated_length
22 | dp_type = args.query_type
23 |
24 | num_neg = 7
25 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
26 | task = mz.tasks.Ranking(losses=losses)
27 | task.metrics = [
28 | mz.metrics.MeanAveragePrecision(),
29 | MAP()
30 | ]
31 |
32 | preprocessor = mz.models.Bert.get_default_preprocessor(mode='bert-base-uncased')
33 |
34 |
35 | if preprocessing_type == 'fine':
36 | candidate_dic = pd.read_feather('data/candidate_dic.ftr')
37 | train_description = pd.read_feather('data/train_description_{}.ftr'.format(dp_type))
38 | else:
39 | candidate_dic = pd.read_csv('../../../input/candidate_paper_for_wsdm2020.csv')
40 | candidate_dic.loc[candidate_dic['keywords'].isna(),'keywords'] = ''
41 | candidate_dic.loc[candidate_dic['title'].isna(),'title'] = ''
42 | candidate_dic.loc[candidate_dic['abstract'].isna(),'abstract'] = ''
43 | candidate_dic['text_right'] = candidate_dic['abstract'].str.cat(
44 | candidate_dic['keywords'], sep=' ').str.cat(
45 | candidate_dic['title'], sep=' ')
46 | candidate_dic = candidate_dic.rename(columns={'paper_id': 'id_right'})[['id_right', 'text_right']]
47 |
48 | train_description = pd.read_csv('../../../input/train_release.csv')
49 | train_description = train_description.rename(
50 | columns={'description_id': 'id_left',
51 | 'description_text': 'text_left'})[['id_left', 'text_left']]
52 | dp_type = 'query_all_nopreprocessing'
53 |
54 | train_recall = pd.read_feather('data/train_recall.ftr')[['id_left', 'id_right', 'label', 'cv']]
55 | train_recall = pd.merge(train_recall, train_description, how='left', on='id_left')
56 | train_recall = pd.merge(train_recall, candidate_dic, how='left', on='id_right')
57 | train_recall = train_recall.drop_duplicates().reset_index(drop=True)
58 | train_recall = train_recall[['id_left', 'text_left', 'id_right', 'text_right', 'label', 'cv']]
59 | del train_description
60 | gc.collect()
61 |
62 |
63 |
64 | for i in range(1,6):
65 | print("="*20, i, "="*20)
66 | train_df = train_recall[train_recall.cv!=i][
67 | ['id_left', 'text_left', 'id_right', 'text_right', 'label']].reset_index(drop=True)
68 | val_df = train_recall[train_recall.cv==i][
69 | ['id_left', 'text_left', 'id_right', 'text_right', 'label']].reset_index(drop=True)
70 |
71 | train_raw = mz.pack(train_df, task)
72 | train_processed = preprocessor.transform(train_raw)
73 | train_processed.apply_on_text(TruncatedLength(left_truncated_length, 'pre').transform,
74 | mode='left', inplace=True, verbose=1)
75 | train_processed.apply_on_text(TruncatedLength(256, 'pre').transform, mode='right', inplace=True, verbose=1)
76 | train_processed.append_text_length(inplace=True, verbose=1)
77 | train_processed.save("bert_data/bert_train_processed_{}_{}.dp".format(dp_type, i))
78 |
79 | val_raw = mz.pack(val_df, task)
80 | val_processed = preprocessor.transform(val_raw)
81 | val_processed.apply_on_text(TruncatedLength(left_truncated_length, 'pre').transform,
82 | mode='left', inplace=True, verbose=1)
83 | val_processed.apply_on_text(TruncatedLength(256, 'pre').transform, mode='right', inplace=True, verbose=1)
84 | val_processed.append_text_length(inplace=True, verbose=1)
85 | val_processed.save("bert_data/bert_val_processed_{}_{}.dp".format(dp_type, i))
86 |
87 |
88 | if preprocessing_type == 'fine':
89 | test_description = pd.read_feather('data/test_description_quer_all.ftr')
90 | else:
91 | test_description = pd.read_csv('../../../input/test.csv')
92 | test_description = test_description.rename(
93 | columns={'description_id': 'id_left',
94 | 'description_text': 'text_left'})[['id_left', 'text_left']]
95 |
96 |
97 | test_recall = pd.read_feather('data/test_recall.ftr')[['id_left', 'id_right', 'label']]
98 | test_recall = pd.merge(test_recall, test_description, how='left', on='id_left')
99 | test_recall = pd.merge(test_recall, candidate_dic, how='left', on='id_right')
100 | del test_description, candidate_dic
101 | gc.collect()
102 |
103 | test_raw = mz.pack(test_recall, task)
104 | test_processed = preprocessor.transform(test_raw)
105 | test_processed.apply_on_text(TruncatedLength(left_truncated_length, 'pre').transform,
106 | mode='left', inplace=True, verbose=1)
107 | test_processed.apply_on_text(TruncatedLength(256, 'pre').transform, mode='right', inplace=True, verbose=1)
108 | test_processed.append_text_length(inplace=True, verbose=1)
109 | test_processed.save("bert_data/bert_test_processed_{}.dp".format(dp_type))
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/src/rank/m2/change_formatting4stk.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | import argparse
6 |
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
9 | args = parser.parse_args()
10 |
11 | model_id = args.model_id
12 |
13 | stk_path = "../../../stk_feat"
14 |
15 | df = pd.read_csv("oof_m2_{}_5cv.csv".format(model_id))
16 | df = df.rename(columns={"target": "pred"})
17 | df.to_feather("{}/m2_{}_tr.ftr".format(stk_path, model_id))
18 |
19 | df = pd.read_csv("result_m2_{}_5cv.csv".format(model_id))
20 | df = df.rename(columns={"target": "pred"})
21 | df.to_feather("{}/final_m2_{}_te.ftr".format(stk_path, model_id))
22 |
23 |
--------------------------------------------------------------------------------
/src/rank/m2/final_blend.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tqdm import tqdm
4 |
5 | np.set_printoptions(precision=4)
6 |
7 | def map3_func(df, topk = 50, verbose=0):
8 | ids = df[df.label==1].description_id.values
9 | df_recalled = df[df.description_id.isin(ids)].reset_index(drop=True)
10 | df_recalled = df_recalled.sort_values(
11 | by=['description_id', 'label'], ascending=False).reset_index(drop=True)
12 | result = df_recalled.score.values.reshape([-1,topk])
13 | ranks = topk-result.argsort(axis=1).argsort(axis=1)
14 | map3_sum = sum(((1/ranks[:,0])*(ranks[:,0]<4)))
15 | if verbose>0:
16 | print("recall rate: "+str((df_recalled.shape[0]/topk)/(df.shape[0]/topk)))
17 | print("map@3 in recall: "+str(map3_sum/(df_recalled.shape[0]/topk)))
18 | print("map@3 in all: "+str(map3_sum/(df.shape[0]/topk)))
19 |
20 |
21 | m2_path = "../../model/"
22 |
23 | res = pd.read_feather('{}/lgb_s0_m2_33-0/lgb_s0_m3_33.ftr'.format(m2_path))
24 | res['score'] = res['target'].apply(lambda x:np.log(x/(1-x)))
25 | res.loc[res['score']<-12, 'score'] = -12
26 | res = res[['description_id', 'paper_id', 'score']]
27 | res.head()
28 |
29 | res1 = pd.read_feather('{}/lgb_s0_m2_33-1/lgb_s0_m3_33.ftr'.format(m2_path))
30 | res1['score'] = res1['target'].apply(lambda x:np.log(x/(1-x)))
31 | res1.loc[res1['score']<-12, 'score'] = -12
32 | res1 = res1[['description_id', 'paper_id', 'score']]
33 | res1.head()
34 |
35 |
36 | res2 = pd.read_feather('{}/lgb_s0_m3_34-0/lgb_s0_m3_34.ftr'.format(m2_path))
37 | res2['score'] = res2['target'].apply(lambda x:np.log(x/(1-x)))
38 | res2.loc[res2['score']<-12, 'score'] = -12
39 | res2 = res2[['description_id', 'paper_id', 'score']]
40 | res2.head()
41 |
42 |
43 | res3 = pd.read_feather('{}/lgb_s0_m3_34-1/lgb_s0_m3_34.ftr'.format(m2_path))
44 | res3['score'] = res3['target'].apply(lambda x:np.log(x/(1-x)))
45 | res3.loc[res3['score']<-12, 'score'] = -12
46 | res3 = res3[['description_id', 'paper_id', 'score']]
47 | res3.head()
48 |
49 |
50 | res4 = pd.read_feather('{}/lgb_s0_m3_35-0/lgb_s0_m3_35.ftr'.format(m2_path))
51 | res4['score'] = res4['target'].apply(lambda x:np.log(x/(1-x)))
52 | res4.loc[res4['score']<-12, 'score'] = -12
53 | res4 = res4[['description_id', 'paper_id', 'score']]
54 | res4.head()
55 |
56 |
57 | res5 = pd.read_feather('{}/lgb_s0_m3_35-1/lgb_s0_m3_35.ftr'.format(m2_path))
58 | res5['score'] = res5['target'].apply(lambda x:np.log(x/(1-x)))
59 | res5.loc[res5['score']<-12, 'score'] = -12
60 | res5 = res5[['description_id', 'paper_id', 'score']]
61 | res5.head()
62 |
63 |
64 | res6 = pd.read_feather('{}/lgb_s0_m3_38-0/lgb_s0_m3_38.ftr'.format(m2_path))
65 | res6['score'] = res6['target'].apply(lambda x:np.log(x/(1-x)))
66 | res6.loc[res6['score']<-12, 'score'] = -12
67 | res6 = res6[['description_id', 'paper_id', 'score']]
68 | res6.head()
69 |
70 |
71 | res7 = pd.read_feather('{}/lgb_s0_m3_38-1/lgb_s0_m3_38.ftr'.format(m2_path))
72 | res7['score'] = res7['target'].apply(lambda x:np.log(x/(1-x)))
73 | res7.loc[res7['score']<-12, 'score'] = -12
74 | res7 = res7[['description_id', 'paper_id', 'score']]
75 | res7.head()
76 |
77 |
78 | res8 = pd.read_feather('{}/lgb_s0_m3_40-0/lgb_s0_m3_40.ftr'.format(m2_path))
79 | res8['score'] = res8['target'].apply(lambda x:np.log(x/(1-x)))
80 | res8.loc[res8['score']<-12, 'score'] = -12
81 | res8 = res8[['description_id', 'paper_id', 'score']]
82 | res8.head()
83 |
84 |
85 | res9 = pd.read_feather('{}/model/m1/m1_catboost13.ftr'.format(m2_path))
86 | res9['score'] = res9['pred'].apply(lambda x:np.log(x/(1-x)))
87 | res9.loc[res9['score']<-12, 'score'] = -12
88 | res9 = res9[['description_id', 'paper_id', 'score']]
89 | res9.head()
90 |
91 |
92 | model_id = 'bert_002'
93 | res_b1 = pd.read_csv("final_result_m2_{}_5cv.csv".format(model_id))
94 | res_b1['score'] = res_b1['target'].apply(lambda x:np.log(x/(1-x)))
95 | res_b1.loc[res_b1['score']<-12, 'score'] = -12
96 | res_b1 = res_b1[['description_id', 'paper_id', 'score']]
97 | res_b1.head()
98 |
99 |
100 | model_id = 'bert_003'
101 | res_b2 = pd.read_csv("final_result_m2_{}_5cv.csv".format(model_id))
102 | res_b2['score'] = res_b2['target'].apply(lambda x:np.log(x/(1-x)))
103 | res_b2.loc[res_b2['score']<-12, 'score'] = -12
104 | res_b2 = res_b2[['description_id', 'paper_id', 'score']]
105 | res_b2.head()
106 |
107 |
108 | model_id = 'bert_004'
109 | res_b3 = pd.read_csv("final_result_m2_{}_5cv.csv".format(model_id))
110 | res_b3['score'] = res_b3['target'].apply(lambda x:np.log(x/(1-x)))
111 | res_b3.loc[res_b3['score']<-12, 'score'] = -12
112 | res_b3 = res_b3[['description_id', 'paper_id', 'score']]
113 | res_b3.head()
114 |
115 | model_id = 'bert_year_test'
116 | res_b4 = pd.read_csv("final_result_m2_{}_5cv.csv".format(model_id))
117 | res_b4['score'] = res_b4['target'].apply(lambda x:np.log(x/(1-x)))
118 | res_b4.loc[res_b4['score']<-12, 'score'] = -12
119 | res_b4 = res_b4[['description_id', 'paper_id', 'score']]
120 | res_b4.head()
121 |
122 |
123 | res_all = res.rename(columns={'score': 'score_0'}).merge(
124 | res1.rename(columns={'score': 'score_1'}), how='outer', on=['description_id', 'paper_id']).merge(
125 | res2.rename(columns={'score': 'score_2'}), how='outer', on=['description_id', 'paper_id']).merge(
126 | res3.rename(columns={'score': 'score_3'}), how='outer', on=['description_id', 'paper_id']).merge(
127 | res4.rename(columns={'score': 'score_4'}), how='outer', on=['description_id', 'paper_id']).merge(
128 | res5.rename(columns={'score': 'score_5'}), how='outer', on=['description_id', 'paper_id']).merge(
129 | res6.rename(columns={'score': 'score_6'}), how='outer', on=['description_id', 'paper_id']).merge(
130 | res7.rename(columns={'score': 'score_7'}), how='outer', on=['description_id', 'paper_id']).merge(
131 | res8.rename(columns={'score': 'score_8'}), how='outer', on=['description_id', 'paper_id']).merge(
132 | res9.rename(columns={'score': 'score_9'}), how='outer', on=['description_id', 'paper_id']).merge(
133 | res_b1.rename(columns={'score': 'score_b1'}), how='outer', on=['description_id', 'paper_id']).merge(
134 | res_b2.rename(columns={'score': 'score_b2'}), how='outer', on=['description_id', 'paper_id']).merge(
135 | res_b3.rename(columns={'score': 'score_b3'}), how='outer', on=['description_id', 'paper_id']).merge(
136 | res_b4.rename(columns={'score': 'score_b4'}), how='outer', on=['description_id', 'paper_id'])
137 | res_all = res_all.fillna(0.0)
138 | res_all.head()
139 |
140 |
141 | cols = ['score_0', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5',
142 | 'score_6', 'score_7', 'score_8', 'score_9',
143 | 'score_b1', 'score_b2', 'score_b3']
144 |
145 | corr_matrix = []
146 | for description_id, df_tmp in tqdm(res_all.groupby('description_id')):
147 | corr_matrix.append(
148 | df_tmp[cols].corr().values[:,:,np.newaxis])
149 | corr_matrix = np.concatenate(corr_matrix, axis=2)
150 | corr_matrix[np.isnan(corr_matrix)] = 0
151 | pd.DataFrame(data=corr_matrix.mean(axis=2), columns=cols, index=cols)
152 |
153 | res_all['score'] = (
154 | (
155 | res_all['score_0'] + res_all['score_1'] + res_all['score_2'] + res_all['score_3'] +
156 | res_all['score_4'] + res_all['score_5'] + res_all['score_6'] + res_all['score_7']
157 | )/8 +
158 | (
159 | res_all['score_8'] + res_all['score_9']
160 | )/2 +
161 | (
162 | res_all['score_b1'] + 1.5*res_all['score_b2']
163 | )/2.5*5 +
164 | (
165 | res_all['score_b2'] + 3*res_all['score_b3']
166 | )/4
167 | )
168 |
169 |
170 | result = res_all.sort_values(by=['description_id', 'score'], na_position='first').groupby(
171 | 'description_id').tail(3)
172 |
173 |
174 | model_id = 'all_model'
175 |
176 | description_id_list = []
177 | paper_id_list_1 = []
178 | paper_id_list_2 = []
179 | paper_id_list_3 = []
180 | for description_id, df_tmp in tqdm(result.groupby('description_id')):
181 | description_id_list.append(description_id)
182 | paper_id_list_1.append(df_tmp.iloc[2,1])
183 | paper_id_list_2.append(df_tmp.iloc[1,1])
184 | paper_id_list_3.append(df_tmp.iloc[0,1])
185 |
186 | sub = pd.DataFrame(data={'description_id':description_id_list,
187 | 'paper_id_1': paper_id_list_1,
188 | 'paper_id_2': paper_id_list_2,
189 | 'paper_id_3': paper_id_list_3})
190 | sub.to_csv("blend_{}.csv".format(model_id), header=False, index=False)
191 | print("blend_{}.csv".format(model_id))
192 |
193 |
--------------------------------------------------------------------------------
/src/rank/m2/fold_result_integration.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tqdm import tqdm
4 |
5 |
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
10 | args = parser.parse_args()
11 | model_id = args.model_id
12 |
13 |
14 | def map3_func(df, topk = 50, verbose=0):
15 | ids = df[df.label==1].description_id.values
16 | df_recalled = df[df.description_id.isin(ids)].reset_index(drop=True)
17 | df_recalled = df_recalled.sort_values(
18 | by=['description_id', 'label'], ascending=False).reset_index(drop=True)
19 | result = df_recalled.score.values.reshape([-1,topk])
20 | ranks = topk-result.argsort(axis=1).argsort(axis=1)
21 | map3_sum = sum(((1/ranks[:,0])*(ranks[:,0]<4)))
22 | if verbose>1:
23 | print("recall rate: "+str((df_recalled.shape[0]/topk)/(df.shape[0]/topk)))
24 | print("map@3 in recall: "+str(map3_sum/(df_recalled.shape[0]/topk)))
25 | if verbose>0:
26 | print("map@3 in all: "+str(map3_sum/(df.shape[0]/topk)))
27 | return map3_sum/(df.shape[0]/topk)
28 |
29 |
30 | fold = 1
31 | val_df = pd.read_csv("result/{}/{}_fold_{}_cv.csv".format(model_id, model_id, fold))
32 | test_df = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
33 | columns={'score':'score_1'})
34 |
35 | for fold in tqdm(range(2,6)):
36 | val_df_cv = pd.read_csv("result/{}/{}_fold_{}_cv.csv".format(model_id, model_id, fold))
37 | val_df = pd.concat([val_df, val_df_cv], ignore_index=True, sort=True)
38 |
39 | test_df_cv = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
40 | columns={'score':'score_{}'.format(fold)})
41 | test_df = test_df.merge(test_df_cv)
42 |
43 | val_df = val_df.merge(train_recall, how='left')  # NOTE: train_recall (the recall candidate table with labels/cv) is assumed to be loaded beforehand; it is not defined in this script
44 | val_df = val_df[val_df.description_id!='6.45E+04'].reset_index(drop=True)
45 | # assert val_df.description_id.nunique()==49945
46 | map3_func(val_df)
47 | val_df['target'] = val_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
48 | val_df.to_csv("oof_m2_{}_5cv.csv".format(model_id), index=False)
49 |
50 | score_cols = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
51 | test_df['score'] = test_df[score_cols].mean(axis=1)
52 | print(test_df[score_cols+['score']].corr(method='spearman'))
53 |
54 | test_df['target'] = test_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
55 | val_df['target'] = val_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
56 |
57 | test_df = test_recall.merge(  # NOTE: test_recall (the test recall candidate table) is assumed to be loaded beforehand
58 | test_df[['description_id', 'paper_id', 'score']], how='left', on=['description_id', 'paper_id'])
59 | test_df['target'] = test_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
60 | test_df['target'] = test_df['target'].fillna(0)
61 | test_df[['description_id', 'paper_id', 'target']].to_csv("result_m2_{}_5cv.csv".format(model_id), index=False)
62 |
63 |
64 |
--------------------------------------------------------------------------------
/src/rank/m2/gen_w2v.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # set -e
3 |
4 | BUILDDIR=build
5 | CORPUS=corpus.txt
6 | VOCAB_FILE=vocab.txt
7 | SAVE_FILE=glove.w2v
8 |
9 | VERBOSE=2
10 | MEMORY=4.0
11 |
12 | VOCAB_MIN_COUNT=5
13 |
14 | WINDOW_SIZE=5
15 | COOCCURRENCE_FILE=cooccurrence.bin
16 | WEIGHT=1
17 |
18 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
19 |
20 | VECTOR_SIZE=256
21 | MAX_ITER=25
22 | WINDOW_SIZE=2
23 | BINARY=0
24 | NUM_THREADS=8
25 | X_MAX=10
26 | HEADLINE=1
27 |
28 | echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
29 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE
30 |
31 | echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE -distance-weighting $WEIGHT < $CORPUS > $COOCCURRENCE_FILE"
32 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE -distance-weighting $WEIGHT < $CORPUS > $COOCCURRENCE_FILE
33 |
34 | echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
35 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
36 |
37 | echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE -write-header $HEADLINE"
38 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE -write-header $HEADLINE
39 |
40 |
41 |
--------------------------------------------------------------------------------
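Since the script passes -write-header 1, glove.w2v comes out in word2vec text format; it can be sanity-checked the same way nn_preprocessing.py loads it:

from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format("glove.w2v", binary=False)
print(w2v.vector_size)   # 256, matching VECTOR_SIZE above
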
/src/rank/m2/mk_submission.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tqdm import tqdm
4 |
5 | test_recall = pd.read_feather('../../feat/te_s0_32-50.ftr')[['description_id', 'paper_id', 'corp_sim_score']]
6 |
7 | import argparse
8 |
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
11 | args = parser.parse_args()
12 | model_id = args.model_id
13 |
14 | if '_pointwise' in model_id:
15 | fold = 1
16 | test_df = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
17 | columns={'target':'target_1'})
18 |
19 | for fold in tqdm(range(2,6)):
20 | test_df_cv = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
21 | columns={'target':'target_{}'.format(fold)})
22 | test_df = test_df.merge(test_df_cv)
23 |
24 | score_cols = ['target_1', 'target_2', 'target_3', 'target_4', 'target_5']
25 | test_df['target'] = test_df[score_cols].mean(axis=1)
26 | print(test_df[score_cols+['target']].corr(method='spearman'))
27 | else:
28 | fold = 1
29 | test_df = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
30 | columns={'score':'score_1'})
31 |
32 | for fold in tqdm(range(2,6)):
33 | test_df_cv = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
34 | columns={'score':'score_{}'.format(fold)})
35 | test_df = test_df.merge(test_df_cv)
36 |
37 | score_cols = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
38 | test_df['score'] = test_df[score_cols].mean(axis=1)
39 | print(test_df[score_cols+['score']].corr(method='spearman'))
40 |
41 |
42 | if 'target' not in test_df.columns:
43 | test_df['target'] = test_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
44 |
45 | test_df = test_recall.merge(
46 | test_df[['description_id', 'paper_id', 'target']], how='left', on=['description_id', 'paper_id'])
47 | test_df[['description_id', 'paper_id', 'target']].to_csv("final_result_m2_{}_5cv.csv".format(model_id), index=False)
48 |
49 | result = test_df.sort_values(by=['description_id', 'target', 'corp_sim_score'], na_position='first').groupby(
50 | 'description_id').tail(3)
51 |
52 | description_id_list = []
53 | paper_id_list_1 = []
54 | paper_id_list_2 = []
55 | paper_id_list_3 = []
56 | for description_id, df_tmp in tqdm(result.groupby('description_id')):
57 | description_id_list.append(description_id)
58 | paper_id_list_1.append(df_tmp.iloc[2,1])
59 | paper_id_list_2.append(df_tmp.iloc[1,1])
60 | paper_id_list_3.append(df_tmp.iloc[0,1])
61 |
62 | sub = pd.DataFrame(data={'description_id':description_id_list,
63 | 'paper_id_1': paper_id_list_1,
64 | 'paper_id_2': paper_id_list_2,
65 | 'paper_id_3': paper_id_list_3})
66 | sub.to_csv("final_{}_sub_5cv.csv".format(model_id), header=False, index=False)
67 | print("final_{}_sub_5cv.csv".format(model_id))
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
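The submission rows come from an ascending sort followed by tail(3), so iloc[2, 1] is the best paper_id and iloc[0, 1] the third; a toy check with the same column layout (hypothetical ids):

import pandas as pd

df = pd.DataFrame({'description_id': ['d1'] * 4,
                   'paper_id': ['p1', 'p2', 'p3', 'p4'],
                   'corp_sim_score': [0.1, 0.4, 0.3, 0.2],
                   'target': [0.2, 0.9, 0.7, 0.5]})
top3 = df.sort_values(by=['description_id', 'target', 'corp_sim_score'],
                      na_position='first').groupby('description_id').tail(3)
print(top3.paper_id.tolist())   # ['p4', 'p3', 'p2'] -> submission order p2, p3, p4
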
/src/rank/m2/model.py:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 | import torch
4 | import torch.nn as nn
5 | from torch.nn import functional as F
6 |
7 | import matchzoo as mz
8 | from matchzoo.engine.param_table import ParamTable
9 | from matchzoo.engine.param import Param
10 | from matchzoo.engine.base_model import BaseModel
11 | from matchzoo.modules import RNNDropout
12 | from matchzoo.modules import BidirectionalAttention
13 | from matchzoo.modules import StackedBRNN
14 |
15 |
16 | class ESIMplus(mz.models.ESIM):
17 | def set_feature_dim(self, feature_dim):
18 | self.feature_dim = feature_dim
19 |
20 | def build(self):
21 | """Instantiating layers."""
22 | rnn_mapping = {'lstm': nn.LSTM, 'gru': nn.GRU}
23 | self.embedding = self._make_default_embedding_layer()
24 | self.rnn_dropout = RNNDropout(p=self._params['dropout'])
25 | lstm_size = self._params['hidden_size']
26 | if self._params['concat_lstm']:
27 | lstm_size /= self._params['lstm_layer']
28 | self.input_encoding = StackedBRNN(
29 | self._params['embedding_output_dim'],
30 | int(lstm_size / 2),
31 | self._params['lstm_layer'],
32 | dropout_rate=self._params['dropout'],
33 | dropout_output=self._params['drop_lstm'],
34 | rnn_type=rnn_mapping[self._params['rnn_type'].lower()],
35 | concat_layers=self._params['concat_lstm'])
36 | self.attention = BidirectionalAttention()
37 | self.projection = nn.Sequential(
38 | nn.Linear(
39 | 4 * self._params['hidden_size'],
40 | self._params['hidden_size']),
41 | nn.ReLU())
42 | self.composition = StackedBRNN(
43 | self._params['hidden_size'],
44 | int(lstm_size / 2),
45 | self._params['lstm_layer'],
46 | dropout_rate=self._params['dropout'],
47 | dropout_output=self._params['drop_lstm'],
48 | rnn_type=rnn_mapping[self._params['rnn_type'].lower()],
49 | concat_layers=self._params['concat_lstm'])
50 | self.wide_net = nn.Sequential(
51 | nn.Linear(self.feature_dim, self._params['hidden_size']),
52 | nn.ReLU(),
53 | nn.Linear(self._params['hidden_size'], self._params['hidden_size']),
54 | nn.ReLU())
55 | self.classification = nn.Sequential(
56 | nn.Dropout(
57 | p=self._params['dropout']),
58 | nn.Linear(
59 | 4 * self._params['hidden_size']+self._params['hidden_size'],
60 | self._params['hidden_size']),
61 | nn.Tanh(),
62 | nn.Dropout(
63 | p=self._params['dropout']))
64 | self.out = self._make_output_layer(self._params['hidden_size'])
65 |
66 |
67 | def forward(self, inputs):
68 | """Forward."""
69 | # Scalar dimensions referenced here:
70 | # B = batch size (number of sequences)
71 | # D = embedding size
72 | # L = `input_left` sequence length
73 | # R = `input_right` sequence length
74 | # F = `feature` dim
75 | # H = hidden size
76 |
77 | # [B, L], [B, R]
78 |
79 | query, doc = inputs['text_left'].long(), inputs['text_right'].long()
80 |
81 | # [B, L]
82 | # [B, R]
83 | query_mask = (query == self._params['mask_value'])
84 | doc_mask = (doc == self._params['mask_value'])
85 |
86 | # [B, L, D]
87 | # [B, R, D]
88 | query = self.embedding(query)
89 | doc = self.embedding(doc)
90 |
91 | # [B, L, D]
92 | # [B, R, D]
93 | query = self.rnn_dropout(query)
94 | doc = self.rnn_dropout(doc)
95 |
96 | # [B, L, H]
97 | # [B, R, H]
98 | query = self.input_encoding(query, query_mask)
99 | doc = self.input_encoding(doc, doc_mask)
100 |
101 |         # [B, L, H], [B, R, H]
102 | attended_query, attended_doc = self.attention(
103 | query, query_mask, doc, doc_mask)
104 |
105 | # [B, L, 4 * H]
106 |         # [B, R, 4 * H]
107 | enhanced_query = torch.cat([query,
108 | attended_query,
109 | query - attended_query,
110 | query * attended_query],
111 | dim=-1)
112 | enhanced_doc = torch.cat([doc,
113 | attended_doc,
114 | doc - attended_doc,
115 | doc * attended_doc],
116 | dim=-1)
117 | # [B, L, H]
118 |         # [B, R, H]
119 | projected_query = self.projection(enhanced_query)
120 | projected_doc = self.projection(enhanced_doc)
121 |
122 | # [B, L, H]
123 |         # [B, R, H]
124 | query = self.composition(projected_query, query_mask)
125 | doc = self.composition(projected_doc, doc_mask)
126 |
127 | # [B, L]
128 | # [B, R]
129 | reverse_query_mask = 1. - query_mask.float()
130 | reverse_doc_mask = 1. - doc_mask.float()
131 |
132 | # [B, H]
133 | # [B, H]
134 | query_avg = torch.sum(query * reverse_query_mask.unsqueeze(2), dim=1)\
135 | / (torch.sum(reverse_query_mask, dim=1, keepdim=True) + 1e-8)
136 | doc_avg = torch.sum(doc * reverse_doc_mask.unsqueeze(2), dim=1)\
137 | / (torch.sum(reverse_doc_mask, dim=1, keepdim=True) + 1e-8)
138 |
139 | # [B, L, H]
140 |         # [B, R, H]
141 | query = query.masked_fill(query_mask.unsqueeze(2), -1e7)
142 | doc = doc.masked_fill(doc_mask.unsqueeze(2), -1e7)
143 |
144 | # [B, H]
145 | # [B, H]
146 | query_max, _ = query.max(dim=1)
147 | doc_max, _ = doc.max(dim=1)
148 |
149 | feature = inputs['feature'].float()
150 | feat_emb = self.wide_net(feature)
151 |
152 | # [B, 4 * H + H]
153 | v = torch.cat([query_avg, query_max, doc_avg, doc_max, feat_emb], dim=-1)
154 |
155 | # [B, H]
156 | hidden = self.classification(v)
157 |
158 | # [B, num_classes]
159 | out = self.out(hidden)
160 |
161 | return out
162 |
163 |
164 |
--------------------------------------------------------------------------------
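ESIMplus extends matchzoo's ESIM with a small "wide" MLP over the 30 hand-crafted features; a construction sketch mirroring how nn_5_fold_train.py wires it up:

import numpy as np
import matchzoo as mz
from model import ESIMplus

model = ESIMplus()
model.set_feature_dim(30)                        # width of inputs['feature']
model.params['task'] = mz.tasks.Ranking(losses=mz.losses.RankCrossEntropyLoss(num_neg=10))
model.params['embedding'] = np.load("data/embedding_matrix.npy")
for k, v in {'embedding_freeze': True, 'mask_value': 0,
             'lstm_layer': 2, 'hidden_size': 200, 'dropout': 0.2}.items():
    model.params[k] = v
model.build()
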
/src/rank/m2/nn_5_fold_predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
3 |
4 | import gc
5 | from tqdm import tqdm
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import torch
10 | import matchzoo as mz
11 | from model import ESIMplus
12 |
13 | from utils import MAP, build_matrix, topk_lines, predict, Logger
14 |
15 |
16 | import argparse
17 |
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
20 | args = parser.parse_args()
21 |
22 | model_id = args.model_id
23 |
24 | num_dup = 6
25 | num_neg = 10
26 | batch_size = 128
27 | add_lgb_feat = False
28 | debug = False
29 |
30 | if model_id == 'ESIMplus_001':
31 | bst_epochs = {1:0, 2:2, 3:4, 4:2, 5:1}
32 | Model = ESIMplus
33 | lr = 0.001
34 | add_lgb_feat = True
35 | params = {'embedding_freeze': True,
36 | 'mask_value': 0,
37 | 'lstm_layer': 2,
38 | 'hidden_size': 200,
39 | 'dropout': 0.2}
40 |
41 |
42 | if model_id == 'aNMM_001':
43 | bst_epochs = {1:4, 2:4, 3:3, 4:4, 5:9}
44 | Model = mz.models.aNMM
45 | lr = 0.001
46 | params = {'embedding_freeze': True,
47 | 'mask_value': 0,
48 | 'dropout_rate': 0.1}
49 |
50 | if model_id == 'ESIM_001':
51 | bst_epochs = {1:4, 2:4, 3:2, 4:2, 5:6}
52 | Model = mz.models.ESIM
53 | lr = 0.001
54 | params = {'embedding_freeze': True,
55 | 'mask_value': 0,
56 | 'lstm_layer': 2,
57 | 'hidden_size': 200,
58 | 'dropout': 0.2}
59 |
60 | if model_id == 'MatchLSTM_001':
61 | bst_epochs = {1:4, 2:2, 3:2, 4:4, 5:3}
62 | Model = mz.models.MatchLSTM
63 | lr = 0.001
64 | params = {'embedding_freeze': True,
65 | 'mask_value': 0}
66 |
67 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
68 | task = mz.tasks.Ranking(losses=losses)
69 | task.metrics = [
70 | mz.metrics.MeanAveragePrecision(),
71 | MAP()
72 | ]
73 |
74 | if model_id == 'ESIM_001_pointwise':
75 | bst_epochs = {1:4, 2:3, 3:7, 4:12, 5:5}
76 | Model = mz.models.ESIM
77 | lr = 0.001
78 | params = {'embedding_freeze': True,
79 | 'mask_value': 0,
80 | 'lstm_layer': 2,
81 | 'hidden_size': 200,
82 | 'dropout': 0.2}
83 |
84 | task = mz.tasks.Classification(num_classes=2)
85 | task.metrics = ['acc']
86 |
87 |
88 | padding_callback = Model.get_default_padding_callback()
89 | embedding_matrix = np.load("data/embedding_matrix.npy")
90 | # l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
91 | # embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
92 |
93 | test_processed = mz.data_pack.data_pack.load_data_pack("test_processed.dp")
94 | testset = mz.dataloader.Dataset(
95 | data_pack=test_processed,
96 | batch_size=batch_size,
97 | sort=False,
98 | shuffle=False
99 | )
100 |
101 | testloader = mz.dataloader.DataLoader(
102 | dataset=testset,
103 | stage='dev',
104 | callback=padding_callback
105 | )
106 |
107 |
108 |
109 | model = Model()
110 | if add_lgb_feat: model.set_feature_dim(30)
111 |
112 | model.params['task'] = task
113 | model.params['embedding'] = embedding_matrix
114 |
115 | for param in params:
116 | model.params[param] = params[param]
117 |
118 | model.build()
119 |
120 | optimizer = torch.optim.Adam(model.parameters(), lr=lr)
121 | trainer = mz.trainers.Trainer(
122 | model=model,
123 | optimizer=optimizer,
124 | trainloader=testloader,
125 | validloader=testloader,
126 | validate_interval=None,
127 | epochs=1
128 | )
129 |
130 |
131 | for fold in range(1,6):
132 | i = bst_epochs[fold]
133 | val_processed = mz.data_pack.data_pack.load_data_pack("5fold/val_processed_{}.dp".format(fold))
134 | valset = mz.dataloader.Dataset(
135 | data_pack=val_processed,
136 | batch_size=batch_size,
137 | sort=False,
138 | shuffle=False
139 | )
140 |
141 | valloader = mz.dataloader.DataLoader(
142 | dataset=valset,
143 | stage='dev',
144 | callback=padding_callback
145 | )
146 |
147 | trainer.restore_model("save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, i))
148 |
149 | score = predict(trainer, valloader)
150 | X, y = val_processed.unpack()
151 | result = pd.DataFrame(data={
152 | 'description_id': X['id_left'],
153 | 'paper_id': X['id_right'],
154 | 'score': score[:,0]})
155 | result.to_csv("result/{}/{}_fold_{}_cv.csv".format(model_id, model_id, fold), index=False)
156 |
157 | score = predict(trainer, testloader)
158 | X, y = test_processed.unpack()
159 | result = pd.DataFrame(data={
160 | 'description_id': X['id_left'],
161 | 'paper_id': X['id_right'],
162 | 'score': score[:,0]})
163 | result.to_csv("result/{}/{}_fold_{}_test.csv".format(model_id, model_id, fold), index=False)
164 |
165 |
166 |
--------------------------------------------------------------------------------
/src/rank/m2/nn_5_fold_train.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
3 |
4 | import gc
5 | from tqdm import tqdm
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import torch
10 | import matchzoo as mz
11 | from model import ESIMplus
12 |
13 | from utils import MAP, build_matrix, topk_lines, predict, Logger
14 |
15 | import argparse
16 |
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
19 | args = parser.parse_args()
20 | model_id = args.model_id
21 | num_dup = 6
22 | num_neg = 10
23 | batch_size = 128
24 | add_lgb_feat = False
25 | debug = False
26 |
27 | if model_id == 'ESIMplus_001':
28 | Model = ESIMplus
29 | lr = 0.001
30 | add_lgb_feat = True
31 | params = {'embedding_freeze': True,
32 | 'mask_value': 0,
33 | 'lstm_layer': 2,
34 | 'hidden_size': 200,
35 | 'dropout': 0.2}
36 |
37 |
38 | if model_id == 'aNMM_001':
39 | Model = mz.models.aNMM
40 | lr = 0.001
41 | params = {'embedding_freeze': True,
42 | 'mask_value': 0,
43 | 'dropout_rate': 0.1}
44 |
45 | if model_id == 'ESIM_001':
46 | Model = mz.models.ESIM
47 | lr = 0.001
48 | params = {'embedding_freeze': True,
49 | 'mask_value': 0,
50 | 'lstm_layer': 2,
51 | 'hidden_size': 200,
52 | 'dropout': 0.2}
53 |
54 | if model_id in ('MatchLSTM', 'MatchLSTM_001'):
55 |     model_id = 'MatchLSTM_001'
56 | Model = mz.models.MatchLSTM
57 | lr = 0.001
58 | params = {'embedding_freeze': True,
59 | 'mask_value': 0}
60 |
61 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
62 | padding_callback = Model.get_default_padding_callback()
63 | task = mz.tasks.Ranking(losses=losses)
64 | task.metrics = [
65 | mz.metrics.MeanAveragePrecision(),
66 | MAP()
67 | ]
68 |
69 | if model_id == 'ESIM_001_pointwise':
70 | Model = mz.models.ESIM
71 | lr = 0.001
72 | params = {'embedding_freeze': True,
73 | 'mask_value': 0,
74 | 'lstm_layer': 2,
75 | 'hidden_size': 200,
76 | 'dropout': 0.2}
77 |
78 | task = mz.tasks.Classification(num_classes=2)
79 | task.metrics = ['acc']
80 |
81 | embedding_matrix = np.load("data/embedding_matrix.npy")
82 |
83 |
84 | if not os.path.exists('result/{}'.format(model_id)):
85 | os.makedirs('result/{}'.format(model_id))
86 |
87 | with Logger(log_filename = '{}.log'.format(model_id)):
88 | for fold in range(1,5):
89 | print("="*10+" fold: "+str(fold)+" data_processed prepare "+"="*10)
90 | train_processed = mz.data_pack.data_pack.load_data_pack("5fold/train_processed_{}.dp".format(fold))
91 | val_processed = mz.data_pack.data_pack.load_data_pack("5fold/val_processed_{}.dp".format(fold))
92 |
93 | if model_id == 'ESIM_001_pointwise':
94 | train_processed.relation.label = train_processed.relation.label.astype(np.long)
95 | val_processed.relation.label = val_processed.relation.label.astype(np.long)
96 |
97 |
98 | print("="*10+" fold: "+str(fold)+" dataset prepare "+"="*10)
99 | trainset = mz.dataloader.Dataset(
100 | data_pack=train_processed,
101 | mode='pair',
102 | num_dup=num_dup,
103 | num_neg=num_neg,
104 | batch_size=batch_size,
105 | resample=True,
106 | sort=False,
107 | shuffle=True
108 | )
109 | valset = mz.dataloader.Dataset(
110 | data_pack=val_processed,
111 | batch_size=batch_size,
112 | sort=False,
113 | shuffle=False
114 | )
115 |
116 | print("="*10+" fold: "+str(fold)+" dataloader prepare "+"="*10)
117 | trainloader = mz.dataloader.DataLoader(
118 | dataset=trainset,
119 | stage='train',
120 | callback=padding_callback
121 | )
122 | valloader = mz.dataloader.DataLoader(
123 | dataset=valset,
124 | stage='dev',
125 | callback=padding_callback
126 | )
127 |
128 | print("="*10+" fold: "+str(fold)+" model build "+"="*10)
129 | model = Model()
130 | if add_lgb_feat: model.set_feature_dim(30)
131 |
132 | model.params['task'] = task
133 | model.params['embedding'] = embedding_matrix
134 |
135 | for param in params:
136 | model.params[param] = params[param]
137 |
138 | model.build()
139 | if debug: print(model)
140 |
141 | print("="*10+" fold: "+str(fold)+" trainers build "+"="*10)
142 | optimizer = torch.optim.Adam(model.parameters(), lr=lr)
143 |
144 | trainer = mz.trainers.Trainer(
145 | model=model,
146 | optimizer=optimizer,
147 | trainloader=trainloader,
148 | validloader=valloader,
149 | validate_interval=None,
150 | epochs=1
151 | )
152 |
153 | print("="*10+" fold: "+str(fold)+" training "+"="*10)
154 | trainer.restore_model("save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, 1))
155 | for i in range(2,6):
156 | trainer._model.embedding.requires_grad_(requires_grad=False)
157 | print("="*10+" fold: "+str(fold)+" epoch: "+str(i)+" "+"="*10)
158 | trainer.run()
159 | trainer.save_model()
160 | os.rename("save/model.pt", "save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, i))
161 |
162 |
163 |
--------------------------------------------------------------------------------
/src/rank/m2/nn_preprocessing.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gc
3 | from tqdm import tqdm
4 | import numpy as np
5 | import pandas as pd
6 |
7 | import torch
8 | import matchzoo as mz
9 | from model import ESIMplus
10 |
11 | from gensim.models import KeyedVectors
12 | from utils import MAP, build_matrix, topk_lines, predict
13 |
14 | pd.set_option('display.max_columns', None)
15 | pd.set_option('display.max_rows', 200)
16 | pd.set_option('max_colwidth',400)
17 |
18 |
19 | num_neg = 10
20 | fit_preprocessor = True
21 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
22 | feature = [
23 | 'quer_key_tfidf_corp_cos_dis',
24 | 'quer_key_tfidf_corp_eucl_dis',
25 | 'quer_key_corp_bm25_score',
26 | 'corp_sim_score',
27 | 'quer_all_tfidf_corp_eucl_dis',
28 | 'quer_all_corp_bm25_score',
29 | 'quer_key_tfidf_titl_manh_dis',
30 | 'quer_all_titl_bm25_score',
31 | 'quer_all_tfidf_corp_cos_dis',
32 | 'jaccard_coef_of_unigram_between_corp_quer_key',
33 | 'ratio_of_unique_corp_unigram',
34 | 'jaccard_coef_of_unigram_between_corp_quer_all',
35 | 'jaccard_coef_of_unigram_between_titl_quer_key',
36 | 'quer_key_tfidf_titl_cos_dis',
37 | 'jaccard_coef_of_unigram_between_abst_quer_key',
38 | 'quer_key_abst_bm25_score',
39 | 'quer_all_tfidf_titl_cos_dis',
40 | 'quer_key_tfidf_titl_eucl_dis',
41 | 'count_of_quer_key_unigram',
42 | 'quer_all_tfidf_titl_eucl_dis',
43 | 'ratio_of_unique_quer_all_unigram',
44 | 'quer_key_tfidf_abst_cos_dis',
45 | 'count_of_unique_corp_unigram',
46 | 'ratio_of_unique_abst_unigram',
47 | 'normalized_pos_of_corp_unigram_in_quer_all_max',
48 | 'quer_all_abst_bm25_score',
49 | 'normalized_pos_of_titl_unigram_in_quer_all_std',
50 | 'quer_all_tfidf_titl_manh_dis',
51 | 'jaccard_coef_of_unigram_between_abst_quer_all',
52 | 'dice_dist_of_unigram_between_corp_quer_key']
53 |
54 | task = mz.tasks.Ranking(losses=losses)
55 | task.metrics = [
56 | mz.metrics.MeanAveragePrecision(),
57 | MAP()
58 | ]
59 | print("task is", task)
60 | print("`task` initialized with metrics", task.metrics)
61 |
62 | if fit_preprocessor:
63 |     # NOTE: all_data_raw is a matchzoo DataPack over the combined train/test recall text; it is assumed to exist here (it is not built in this file)
64 | preprocessor = mz.models.ESIM.get_default_preprocessor(
65 | truncated_mode='pre',
66 | truncated_length_left=64,
67 | truncated_length_right=256,
68 | filter_mode='df',
69 | filter_low_freq=2)
70 |
71 | preprocessor = preprocessor.fit(all_data_raw)
72 | preprocessor.save("preprocessor.prep")
73 | else:
74 | preprocessor = mz.load_preprocessor("preprocessor.prep")
75 |
76 |
77 | candidate_dic = pd.read_feather('data/candidate_dic.ftr')
78 |
79 | train_recall = pd.read_feather('data/train_recall.ftr')
80 | train_description = pd.read_feather('data/train_description.ftr')
81 | train_recall = pd.merge(train_recall, train_description, how='left', on='id_left')
82 | train_recall = pd.merge(train_recall, candidate_dic, how='left', on='id_right')
83 | train_recall = train_recall.drop_duplicates().reset_index(drop=True)
84 | del train_description
85 | gc.collect()
86 |
87 |
88 | test_recall = pd.read_feather('data/test_recall.ftr')
89 | test_description = pd.read_feather('data/test_description.ftr')
90 | test_recall = pd.merge(test_recall, test_description, how='left', on='id_left')
91 | test_recall = pd.merge(test_recall, candidate_dic, how='left', on='id_right')
92 | del test_description, candidate_dic
93 | gc.collect()
94 |
95 | all_data_df = train_recall.copy()
96 | all_data_df.id_left = all_data_df.id_left+'_tr'
97 | all_data_df = pd.concat([all_data_df, test_recall]).reset_index(drop=True)
98 | norm_df = all_data_df[feature].quantile(q=0.99)
99 |
100 | del all_data_df  # keep train_recall / test_recall; they are rescaled and used below
101 | gc.collect()
102 |
103 | train_recall[feature] = train_recall[feature]/norm_df
104 | train_recall['feature'] = list(train_recall[feature].values)
105 | train_recall = train_recall[['id_left', 'text_left', 'id_right', 'text_right', 'label', 'feature']]
106 | cv_ids = pd.read_csv("../../input/cv_ids_0109.csv")
107 | train_recall = train_recall.merge(
108 | cv_ids.rename(columns={'description_id': 'id_left'}),
109 | how='left',
110 | on='id_left').fillna(5.0)
111 |
112 |
113 | for i in range(1,6):
114 | print("="*20, i, "="*20)
115 | train_df = train_recall[train_recall.cv!=i][
116 | ['id_left', 'text_left', 'id_right', 'text_right', 'label', 'feature']].reset_index(drop=True)
117 | val_df = train_recall[train_recall.cv==i][
118 | ['id_left', 'text_left', 'id_right', 'text_right', 'label', 'feature']].reset_index(drop=True)
119 |
120 | train_raw = mz.pack(train_df, task)
121 | val_raw = mz.pack(val_df, task)
122 |
123 | train_processed = preprocessor.transform(train_raw)
124 | val_processed = preprocessor.transform(val_raw)
125 |
126 | train_processed.save("5fold/train_processed_{}.dp".format(i))
127 | val_processed.save("5fold/val_processed_{}.dp".format(i))
128 |
129 |
130 | test_recall[feature] = test_recall[feature]/norm_df
131 | test_recall['feature'] = list(test_recall[feature].values)
132 | test_recall = test_recall[['id_left', 'text_left', 'id_right', 'text_right', 'feature']]
133 |
134 | test_raw = mz.pack(test_recall, task)
135 | test_processed = preprocessor.transform(test_raw)
136 | # test_processed.save("test_processed.dp")
137 | test_processed.save("final_test_processed.dp")
138 |
139 |
140 | from gensim.models import KeyedVectors
141 | w2v_path = "data/glove.w2v"
142 | w2v_model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
143 | term_index = preprocessor.context['vocab_unit'].state['term_index']
144 | embedding_matrix = build_matrix(term_index, w2v_model)
145 | del w2v_model, term_index
146 | gc.collect()
147 | np.save("data/embedding_matrix.npy", embedding_matrix)
148 |
149 |
150 |
--------------------------------------------------------------------------------
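The hand-crafted features are only rescaled by their 99th percentile over the combined train/test recall before being packed into a single list-valued column; a toy version of that step (two of the feature columns above):

import pandas as pd

feats = pd.DataFrame({'corp_sim_score': [0.1, 0.5, 2.0, 10.0],
                      'quer_key_corp_bm25_score': [1.0, 3.0, 5.0, 50.0]})
norm = feats.quantile(q=0.99)            # per-column 99th percentile
scaled = feats / norm                    # most values land roughly in [0, 1]
scaled['feature'] = list(scaled.values)  # one vector per candidate row, fed to the wide_net
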
/src/rank/m2/preprocessing.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import numpy as np
3 | import pandas as pd
4 | import feather
5 |
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--query_type', type=str, default='query_key')
10 | args = parser.parse_args()
11 |
12 | query_type = args.query_type
13 |
14 | def topk_lines(df, k):
15 | print(df.shape)
16 | df.loc[:, 'rank'] = df.groupby(['description_id', 'type']).cumcount().values
17 | df = df[df['rank'] < k]
18 | df.drop(['rank'], axis=1, inplace=True)
19 | print(df.shape)
20 | return df
21 |
22 |
23 | ## preprocess
24 | candidate_dic = feather.read_dataframe('../../../input/paper_input_final.ftr')
25 |
26 | candidate_dic.loc[candidate_dic['keywords'].isna(),'keywords'] = ''
27 | candidate_dic.loc[candidate_dic['titl'].isna(),'titl'] = ''
28 | candidate_dic.loc[candidate_dic['abst'].isna(),'abst'] = ''
29 |
30 | candidate_dic['text_right'] = candidate_dic['abst'].str.cat(
31 | candidate_dic['keywords'], sep=' ').str.cat(
32 | candidate_dic['titl'], sep=' ')
33 |
34 | candidate_dic = candidate_dic.rename(columns={'paper_id': 'id_right'})[['id_right', 'text_right']]
35 | candidate_dic.to_feather('data/candidate_dic.ftr')
36 |
37 | train_description = feather.read_dataframe('../../../input/tr_input_final.ftr')
38 |
39 | train_description = train_description.rename(
40 | columns={'description_id': 'id_left', query_type: 'text_left'})
41 | train_description[['id_left', 'text_left']].to_feather('data/train_description_{}.ftr'.format(query_type))
42 |
43 |
44 | test_description = feather.read_dataframe('../../../input/te_input_final.ftr')
45 |
46 | test_description = test_description.rename(
47 | columns={'description_id': 'id_left', query_type: 'text_left'})
48 |
49 | test_description[['id_left', 'text_left']].to_feather('data/test_description_{}.ftr'.format(query_type))
50 |
51 | train_recall = feather.read_dataframe('../../../feat/tr_s0_32-50.ftr')
52 |
53 | ## recall
54 | train_recall = train_recall.rename(
55 | columns={'description_id': 'id_left', 'paper_id': 'id_right', 'target': 'label'})
56 |
57 | train_recall = train_recall[train_recall.id_left.isin(train_description.id_left.values)].reset_index(drop=True)
58 | train_recall = train_recall.drop_duplicates()
59 | train_recall = train_recall.fillna(0)
60 | train_recall.to_feather('data/train_recall.ftr')
61 |
62 | test_recall = feather.read_dataframe('../../../feat/te_s0_32-50.ftr')
63 | test_recall = test_recall.reset_index(drop=True)
64 |
65 | test_recall = test_recall.rename(
66 | columns={'description_id': 'id_left',
67 | 'paper_id': 'id_right',
68 | 'target': 'label'})
69 |
70 | # test_recall[['id_left', 'id_right', 'label']].to_feather('data/test_recall.ftr')
71 | test_recall[['id_left', 'id_right', 'label']].to_feather('data/final_test_recall.ftr')
72 |
73 |
74 | ## corpus
75 | if query_type== 'query_key':
76 | candidate_dic = feather.read_dataframe('data/candidate_dic.ftr')
77 | train_description = feather.read_dataframe('data/train_description.ftr')
78 | test_description = feather.read_dataframe('data/test_description.ftr')
79 |
80 | with open('data/corpus.txt','a') as fid:
81 | for sent in tqdm(candidate_dic['text_right']):
82 | if type(sent)==str:
83 | fid.write(sent+'\n')
84 | for sent in tqdm(train_description['text_left']):
85 | if type(sent)==str:
86 | fid.write(sent+'\n')
87 | for sent in tqdm(test_description['text_left']):
88 | if type(sent)==str:
89 | fid.write(sent+'\n')
90 |
91 |
--------------------------------------------------------------------------------
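text_right is simply abstract, keywords and title joined with single spaces; a quick check of the str.cat chain used above:

import pandas as pd

row = pd.DataFrame({'abst': ['neural text matching'], 'keywords': ['ranking'], 'titl': ['esim']})
text_right = row['abst'].str.cat(row['keywords'], sep=' ').str.cat(row['titl'], sep=' ')
print(text_right.iloc[0])   # "neural text matching ranking esim"
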
/src/rank/m2/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | python3 preprocessing.py --query_type quer_key
4 | python3 preprocessing.py --query_type quer_all
5 |
6 | git clone https://github.com/stanfordnlp/glove
7 | cp gen_w2v.sh glove/
8 | cp data/corpus.txt glove/
9 | cd glove && make
10 | . gen_w2v.sh
11 | cd ..
12 | cp glove/glove.w2v data/
13 |
14 | python3 nn_preprocessing.py
15 | python3 bert_preprocessing.py --preprocessing_type fine --left_truncated_length 64 --query_type query_key
16 | python3 bert_preprocessing.py --preprocessing_type fine --left_truncated_length 200 --query_type query_all
17 | python3 bert_preprocessing.py --preprocessing_type coarse --left_truncated_length 200 --query_type query_all
18 |
19 | python3 nn_5_fold_train.py --model_id ESIM_001
20 | python3 nn_5_fold_train.py --model_id ESIMplus_001
21 | python3 nn_5_fold_train.py --model_id aNMM_001
22 | python3 nn_5_fold_train.py --model_id MatchLSTM_001
23 | python3 nn_5_fold_train.py --model_id ESIM_001_pointwise
24 |
25 | python3 bert_5_fold_train.py --model_id bert_002
26 | python3 bert_5_fold_train.py --model_id bert_003
27 | python3 bert_5_fold_train.py --model_id bert_004
28 |
29 | python3 nn_5_fold_predict.py --model_id ESIM_001
30 | python3 nn_5_fold_predict.py --model_id ESIMplus_001
31 | python3 nn_5_fold_predict.py --model_id aNMM_001
32 | python3 nn_5_fold_predict.py --model_id MatchLSTM_001
33 | python3 nn_5_fold_predict.py --model_id ESIM_001_pointwise
34 |
35 | python3 bert_5_fold_predict.py --model_id bert_002
36 | python3 bert_5_fold_predict.py --model_id bert_003
37 | python3 bert_5_fold_predict.py --model_id bert_004
38 |
39 | python3 fold_result_integration.py --model_id ESIM_001
40 | python3 fold_result_integration.py --model_id ESIMplus_001
41 | python3 fold_result_integration.py --model_id aNMM_001
42 | python3 fold_result_integration.py --model_id MatchLSTM_001
43 | python3 fold_result_integration.py --model_id ESIM_001_pointwise
44 | python3 fold_result_integration.py --model_id bert_002
45 | python3 fold_result_integration.py --model_id bert_003
46 | python3 fold_result_integration.py --model_id bert_004
47 |
48 | python3 mk_submission.py --model_id ESIM_001
49 | python3 mk_submission.py --model_id ESIMplus_001
50 | python3 mk_submission.py --model_id aNMM_001
51 | python3 mk_submission.py --model_id MatchLSTM_001
52 | python3 mk_submission.py --model_id ESIM_001_pointwise
53 | python3 mk_submission.py --model_id bert_002
54 | python3 mk_submission.py --model_id bert_003
55 | python3 mk_submission.py --model_id bert_004
56 |
57 | python3 change_formatting4stk.py --model_id ESIM_001
58 | python3 change_formatting4stk.py --model_id ESIMplus_001
59 | python3 change_formatting4stk.py --model_id aNMM_001
60 | python3 change_formatting4stk.py --model_id MatchLSTM_001
61 | python3 change_formatting4stk.py --model_id ESIM_001_pointwise
62 | python3 change_formatting4stk.py --model_id bert_002
63 | python3 change_formatting4stk.py --model_id bert_003
64 | python3 change_formatting4stk.py --model_id bert_004
65 |
66 | ###### finally #####
67 | python3 final_blend.py
68 |
69 |
--------------------------------------------------------------------------------
/src/rank/m2/utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import numpy as np
4 | from tqdm import tqdm
5 | import torch
6 | from matchzoo.engine.base_metric import sort_and_couple, RankingMetric
7 |
8 |
9 | def build_matrix(term_index, gv_model, dim=256):
10 |
11 | input_dim = len(term_index)
12 | matrix = np.empty((input_dim, dim))
13 |
14 | valid_keys = gv_model.vocab.keys()
15 | for term, index in term_index.items():
16 | if term in valid_keys:
17 | matrix[index] = gv_model.word_vec(term)
18 | else:
19 | if '' in gv_model.vocab.keys():
20 | matrix[index] = gv_model.word_vec("")
21 | else:
22 | matrix[index] = np.random.randn(dim).astype(dtype=np.float32)
23 | return matrix
24 |
25 | def topk_lines(df, k):
26 | print(df.shape)
27 | df.loc[:, 'rank'] = df.groupby(['description_id', 'type']).cumcount().values
28 | df = df[df['rank'] < k]
29 | df.drop(['rank'], axis=1, inplace=True)
30 | print(df.shape)
31 | return df
32 |
33 |
34 | class MAP(RankingMetric):
35 |
36 | def __init__(self, k = 3):
37 | self._k = k
38 |
39 | def __repr__(self) -> str:
40 | return 'mean_average_precision@{}'.format(self._k)
41 |
42 | def __call__(self, y_true, y_pred):
43 | coupled_pair = sort_and_couple(y_true, y_pred)
44 | for idx, (label, pred) in enumerate(coupled_pair):
45 | if idx+1>self._k:
46 | return 0
47 | if label > 0:
48 | return 1. / (idx + 1)
49 | return 0.
50 |
51 |
52 | def predict(trainer, testloader):
53 | with torch.no_grad():
54 | trainer._model.eval()
55 | predictions = []
56 | for batch in tqdm(testloader):
57 | inputs = batch[0]
58 | outputs = trainer._model(inputs).detach().cpu()
59 | predictions.append(outputs)
60 | trainer._model.train()
61 |
62 | return torch.cat(predictions, dim=0).numpy()
63 |
64 |
65 | class Logger:
66 | def __init__(self, log_filename="log.txt"):
67 | self.terminal = sys.stdout
68 | self.log = open(log_filename, "a")
69 | self.log.write("="*10+" Start Time:"+time.ctime()+" "+"="*10+"\n")
70 |
71 | def __enter__(self):
72 | sys.stdout = self
73 |
74 | def __exit__(self, e_t, e_v, t_b):
75 | sys.stdout = self.close()
76 |
77 | def stop_log(self):
78 | sys.stdout = self.close()
79 |
80 | def write(self, message):
81 | self.terminal.write(message)
82 | if message=="\n":
83 | self.log.write(message)
84 | else:
85 | self.log.write("["+time.ctime()+"]: "+message)
86 |
87 | def flush(self):
88 | self.terminal.flush()
89 | self.log.flush()
90 |
91 | def close(self):
92 |         self.log.write("="*10+" End Time:"+time.ctime()+" "+"="*10+"\n")
93 | self.log.close()
94 | return self.terminal
95 |
96 |
--------------------------------------------------------------------------------
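With a single relevant candidate per query, the MAP metric above returns the reciprocal rank if the positive lands in the top k, else 0; a quick check (assuming matchzoo's sort_and_couple sorts by prediction, descending):

import numpy as np
from utils import MAP

y_true = np.array([0, 1, 0, 0])
y_pred = np.array([0.9, 0.8, 0.2, 0.1])
print(MAP(k=3)(y_true, y_pred))   # 0.5: the positive is ranked 2nd
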
/src/rank/m3/convert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 |
17 | # Custom utilities
18 | sys.path.append('../../../tools/')
19 | import loader
20 |
21 | # Set random seed
22 | SEED = 2020
23 | np.random.seed (SEED)
24 |
25 | def val_convert(df_path, pred_path, out_path):
26 | tr_data = loader.load_df(df_path)
27 | df_pred = loader.load_df(pred_path)
28 |
29 | sort_df_pred = df_pred.sort_values(['description_id', 'target'], ascending=False)
30 | df_pred = df_pred[['description_id']].drop_duplicates() \
31 | .merge(sort_df_pred, on=['description_id'], how='left')
32 | df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
33 | df_pred = df_pred[df_pred['rank'] < 3]
34 | df_pred = df_pred.groupby(['description_id'])['paper_id'] \
35 | .apply(lambda s : ','.join((s))).reset_index()
36 |
37 | tr_data = tr_data[['description_id', 'paper_id']].rename(columns={'paper_id': 'target_id'})
38 | df_pred = df_pred.merge(tr_data, on=['description_id'], how='left')
39 | loader.save_df(df_pred, out_path)
40 |
41 | def output(df, out_path):
42 | fo = open(out_path, 'w')
43 | for i in range(df.shape[0]):
44 | desc_id = df.iloc[i]['description_id']
45 | paper_ids = df.iloc[i]['paper_id']
46 | print (desc_id + ',' + paper_ids, file=fo)
47 | fo.close()
48 |
49 | def sub_convert(df_path, pred_path, out_path1, out_path2):
50 | te_data = loader.load_df(df_path)
51 | df_pred = loader.load_df(pred_path)
52 |
53 | sort_df_pred = df_pred.sort_values(['description_id', 'target'], ascending=False)
54 | df_pred = df_pred[['description_id']].drop_duplicates() \
55 | .merge(sort_df_pred, on=['description_id'], how='left')
56 | df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
57 | df_pred = df_pred[df_pred['rank'] < 3]
58 | df_pred = df_pred.groupby(['description_id'])['paper_id'] \
59 | .apply(lambda s : ','.join((s))).reset_index()
60 |
61 | df_pred = te_data[['description_id']].merge(df_pred, on=['description_id'], how='left')
62 | loader.save_df(df_pred, out_path1)
63 | #output(df_pred, out_path2)
64 |
65 | if __name__ == "__main__":
66 |
67 | print('start time: %s' % datetime.now())
68 | root_path = '../../../feat/'
69 | base_tr_path = '../../../input/train_release.csv'
70 | base_te_path = '../../../input/test.csv'
71 |
72 | sub_file_path = sys.argv[1]
73 | sub_name = sys.argv[2]
74 |
75 | val_path = '{}/{}_cv.ftr'.format(sub_file_path, sub_name)
76 | val_out_path = '{}/r_{}_cv.csv'.format(sub_file_path, sub_name)
77 | val_convert(base_tr_path, val_path, val_out_path)
78 |
79 | sub_path = '{}/{}.ftr'.format(sub_file_path, sub_name)
80 | sub_out_pathA = '{}/r_{}.csv'.format(sub_file_path, sub_name)
81 | sub_out_pathB = '{}/s_{}.csv'.format(sub_file_path, sub_name)
82 | sub_out_pathA2 = '{}/r2_{}.csv'.format(sub_file_path, sub_name)
83 | sub_out_pathB2 = '{}/s2_{}.csv'.format(sub_file_path, sub_name)
84 | sub_convert(base_te_path, sub_path, sub_out_pathA, sub_out_pathB)
85 |
86 | print('all completed: %s' % datetime.now())
87 |
88 |
89 |
--------------------------------------------------------------------------------
/src/rank/m3/eval.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 |
17 | # Custom utilities
18 | sys.path.append('../../../tools/')
19 | import loader
20 |
21 | # Open-source packages
22 | import ml_metrics as metrics
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | def calc_map(df, k):
29 | df.rename(columns={'paper_id': 'paper_ids'}, inplace=True)
30 | df['paper_ids'] = df['paper_ids'].apply(lambda s: s.split(','))
31 | df['target_id'] = df['target_id'].apply(lambda s: [s])
32 | return metrics.mapk(df['target_id'].tolist(), df['paper_ids'].tolist(), k)
33 |
34 | if __name__ == "__main__":
35 |
36 | print('start time: %s' % datetime.now())
37 | in_path = sys.argv[1]
38 | df = loader.load_df(in_path)
39 | mapk = calc_map(df, k=3)
40 | print ('{} {}'.format(df.shape, round(mapk, 5)))
41 | print('all completed: %s' % datetime.now())
42 |
43 |
--------------------------------------------------------------------------------
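eval.py passes one true paper per description to ml_metrics.mapk, so MAP@3 here is effectively the mean reciprocal rank within the top 3; a toy check:

import ml_metrics as metrics

actual = [['p2'], ['p9']]
predicted = [['p1', 'p2', 'p3'],   # true paper ranked 2nd -> 1/2
             ['p4', 'p5', 'p6']]   # true paper missing    -> 0
print(metrics.mapk(actual, predicted, k=3))   # (0.5 + 0.0) / 2 = 0.25
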
/src/rank/m3/flow.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import time
8 |
9 | ts = time.time()
10 |
11 | num = sys.argv[1]
12 |
13 | sub_file_path = '../../../output/m3/lgb_m3_{}'.format(num)
14 | sub_name = 'lgb_m3_{}'.format(num)
15 |
16 | # lgb train
17 | print ('lgb_train-%s.py %s' % (num, num))
18 | os.system('python3 -u lgb_train_%s.py %s' % (num, num))
19 |
20 | # merge cv & sub
21 | print('\nkfold_merge')
22 | os.system('python3 -u kfold_merge.py %s %s' % (sub_file_path, sub_name))
23 |
24 | # convert cv & sub to list format
25 | print ('\nconvert')
26 | os.system('python3 -u convert.py %s %s' % (sub_file_path, sub_name))
27 |
28 | # evaluate map@3 on the merged CV predictions
29 | print ('\neval')
30 | os.system('python3 -u eval.py %s' % ('{}/r_{}_cv.csv'.format(sub_file_path, sub_name)))
31 |
32 | print ('all completed, cost {}s'.format(time.time() - ts))
33 |
--------------------------------------------------------------------------------
/src/rank/m3/kfold_merge.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 |
23 | # Set random seed
24 | SEED = 2020
25 | np.random.seed (SEED)
26 |
27 | TARGET_NAME = 'target'
28 | FOLD_NUM = 5
29 |
30 | def merge_val(file_path, sub_name, fold_num):
31 | file_list = os.listdir(file_path)
32 |
33 | paths = ['{}_cv_{}.csv'.format(sub_name, i) for i in range(1, fold_num + 1)]
34 | print (paths)
35 |
36 | dfs = []
37 | for path in paths:
38 | assert path in file_list, '{} not exist'.format(path)
39 | path = '{}/{}'.format(file_path, path)
40 | dfs.append(loader.load_df(path))
41 |
42 | df = pd.concat(dfs)
43 | print (df.head())
44 | print (df.describe())
45 | out_path = '{}/{}_cv.ftr'.format(file_path, sub_name)
46 | loader.save_df(df, out_path)
47 |
48 | def merge_sub(file_path, sub_name, fold_num):
49 | file_list = os.listdir(file_path)
50 |
51 | paths = ['{}_{}.csv'.format(sub_name, i) for i in range(1, fold_num + 1)]
52 | print (paths)
53 |
54 | df = pd.DataFrame()
55 | for i, path in enumerate(paths):
56 | assert path in file_list, '{} not exist'.format(path)
57 | path = '{}/{}'.format(file_path, path)
58 | if i == 0:
59 | df = loader.load_df(path)
60 | else:
61 | df[TARGET_NAME] += loader.load_df(path)[TARGET_NAME]
62 |
63 | df[TARGET_NAME] /= fold_num
64 | print (df.head())
65 | print (df.describe())
66 | out_path = '{}/{}.ftr'.format(file_path, sub_name)
67 | loader.save_df(df, out_path)
68 |
69 |
70 | if __name__ == '__main__':
71 |
72 | sub_file_path = sys.argv[1]
73 | sub_name = sys.argv[2]
74 |
75 | merge_val(sub_file_path, sub_name, FOLD_NUM)
76 | merge_sub(sub_file_path, sub_name, FOLD_NUM)
77 |
78 |
79 |
--------------------------------------------------------------------------------
/src/rank/m3/lgb_train_32-50-0.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '32-50'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | #if sub_col in col and col != 'year':
57 | if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[tr['journal'] != 'no-content'][~pd.isnull(tr['year'])]['description_id'].tolist()
131 | #train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
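lgbLearner lives under tools/ and is not reproduced here; assuming each fold reduces to a plain LightGBM binary classifier driven by the params above, one fold would look roughly like this sketch (tr / va stand for a single CV split of train_data):

import lightgbm as lgb

dtrain = lgb.Dataset(tr[fea_col_names], label=tr[TARGET_NAME])
dvalid = lgb.Dataset(va[fea_col_names], label=va[TARGET_NAME], reference=dtrain)
booster = lgb.train(params, dtrain, num_boost_round=2000, valid_sets=[dvalid])
va_pred = booster.predict(va[fea_col_names], num_iteration=booster.best_iteration)
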
/src/rank/m3/lgb_train_37-0.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '37'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | #if sub_col in col and col != 'year':
57 | if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[tr['journal'] != 'no-content'][~pd.isnull(tr['year'])]['description_id'].tolist()
131 | #train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/src/rank/m3/lgb_train_38-0.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '38'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | #if sub_col in col and col != 'year':
57 | if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[tr['journal'] != 'no-content'][~pd.isnull(tr['year'])]['description_id'].tolist()
131 | #train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/src/rank/m3/lgb_train_38-1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Basic modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '38'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | if sub_col in col and col != 'year':
57 | #if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[(tr['journal'] != 'no-content') & (~pd.isnull(tr['year']))]['description_id'].tolist()
131 | train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/src/rank/m3/lgb_train_40-0.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Basic modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '40'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | #if sub_col in col and col != 'year':
57 | if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[(tr['journal'] != 'no-content') & (~pd.isnull(tr['year']))]['description_id'].tolist()
131 | train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/src/recall/tfidf_recall_30.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # bm25 recall
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import functools
12 | from tqdm import tqdm
13 | from six import iteritems
14 | from datetime import datetime
15 |
16 | # Data processing
17 | import re
18 | import math
19 | import pickle
20 | import numpy as np
21 | import pandas as pd
22 | from multiprocessing import Pool
23 |
24 | # Custom utilities
25 | sys.path.append('../../tools/')
26 | import loader
27 | import pandas_util
28 | import custom_bm25 as bm25
29 |
30 | # Open-source packages
31 | from gensim.models import Word2Vec
32 | from gensim.models.word2vec import LineSentence
33 | from gensim import corpora, models, similarities
34 | from gensim.similarities import SparseMatrixSimilarity
35 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
36 |
37 | # Set random seed
38 | SEED = 2020
39 | PROCESS_NUM, PARTITION_NUM = 18, 18
40 |
41 | input_root_path = '../../input/'
42 | output_root_path = '../../feat/'
43 |
44 | postfix = '30'
45 | file_type = 'ftr'
46 |
47 | train_out_path = output_root_path + 'tr_tfidf_{}.{}'.format(postfix, file_type)
48 | test_out_path = output_root_path + 'te_tfidf_{}.{}'.format(postfix, file_type)
49 |
50 | def topk_sim_samples(desc, desc_ids, paper_ids, bm25_model, k=10):
51 | desc_id2papers = {}
52 | for desc_i in tqdm(range(len(desc))):
53 | query_vec, query_desc_id = desc[desc_i], desc_ids[desc_i]
54 | sims = bm25_model.get_scores(query_vec)
55 | sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
56 | sim_papers = [paper_ids[val[0]] for val in sort_sims[:k]]
57 | sim_scores = [str(val[1]) for val in sort_sims[:k]]
58 | desc_id2papers[query_desc_id] = ['|'.join(sim_papers), '|'.join(sim_scores)]
59 | sim_df = pd.DataFrame.from_dict(desc_id2papers, orient='index', columns=['paper_id', 'sim_score'])
60 | sim_df = sim_df.reset_index().rename(columns={'index':'description_id'})
61 | return sim_df
62 |
63 | def partition(queries, num):
64 | queries_partitions, step = [], int(np.ceil(len(queries)/num))
65 | for i in range(0, len(queries), step):
66 | queries_partitions.append(queries[i:i+step])
67 | return queries_partitions
68 |
69 | def single_process_search(params=None):
70 | (query_vecs, desc_ids, paper_ids, bm25_model, k, i) = params
71 | print (i, 'start', datetime.now())
72 | gc.collect()
73 | sim_df = topk_sim_samples(query_vecs, desc_ids, paper_ids, bm25_model, k)
74 | print (i, 'completed', datetime.now())
75 | return sim_df
76 |
77 | def multi_process_search(query_vecs, desc_ids, paper_ids, bm25_model, k):
78 | pool = Pool(PROCESS_NUM)
79 | queries_parts = partition(query_vecs, PARTITION_NUM)
80 | desc_ids_parts = partition(desc_ids, PARTITION_NUM)
81 | print ('{} processes init and partition to {} parts' \
82 | .format(PROCESS_NUM, PARTITION_NUM))
83 |
84 | param_list = [(queries_parts[i], desc_ids_parts[i], \
85 | paper_ids, bm25_model, k, i) for i in range(PARTITION_NUM)]
86 | sim_dfs = pool.map(single_process_search, param_list)
87 | sim_df = pd.concat(sim_dfs, axis=0)
88 | return sim_df
89 |
90 | def gen_samples(df, desc, desc_ids, corpus_list, paper_ids_list, k):
91 | df_samples_list = []
92 | for i, corpus in enumerate(corpus_list):
93 | bm25_model = bm25.BM25(corpus[0])
94 | cur_df_sample = multi_process_search(desc, desc_ids, \
95 | paper_ids_list[i], bm25_model, k)
96 | cur_df_sample_out = pandas_util.explode(cur_df_sample, ['paper_id', 'sim_score'])
97 | cur_df_sample_out['type'] = corpus[1] # recall_name
98 | df_samples_list.append(cur_df_sample_out)
99 | df_samples = pd.concat(df_samples_list, axis=0)
100 | df_samples.drop_duplicates(subset=['description_id', 'paper_id'], inplace=True)
101 | df_samples['target'] = 0
102 | return df_samples
103 |
104 | if __name__ == "__main__":
105 |
106 | ts = time.time()
107 | tqdm.pandas()
108 | print('start time: %s' % datetime.now())
109 | # load data
110 | df = loader.load_df(input_root_path + 'paper_input_final.ftr')
111 | df = df[~pd.isnull(df['paper_id'])]
112 |
113 | # gen tfidf vecs
114 | dictionary = pickle.load(open('../../feat/corpus.dict', 'rb'))
115 | print ('dic len', len(dictionary))
116 |
117 | df['corp'] = df['abst'] + ' ' + df['titl'] + ' ' + df['keywords'].fillna('').str.replace(';', ' ')
118 | df_corp, corp_paper_ids = [dictionary.doc2bow(line.split(' ')) for line in df['corp'].tolist()], \
119 | df['paper_id'].tolist()
120 |
121 | # gen topk sim samples
122 | paper_ids_list = [corp_paper_ids]
123 | corpus_list = [(df_corp, 'corp_bm25')]
124 | out_cols = ['description_id', 'paper_id', 'sim_score', 'target', 'type']
125 |
126 | if sys.argv[1] in ['tr']:
127 | # for tr ins
128 | tr = loader.load_df(input_root_path + 'tr_input_final.ftr')
129 | tr = tr[~pd.isnull(tr['description_id'])]
130 |
131 | # tr = tr.head(1000)
132 | tr_desc, tr_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in tr['quer_all'].tolist()], \
133 | tr['description_id'].tolist()
134 | print ('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
135 |
136 | tr_samples = gen_samples(tr, tr_desc, tr_desc_ids, \
137 | corpus_list, paper_ids_list, k=50)
138 | tr_samples = tr.rename(columns={'paper_id': 'target_paper_id'}) \
139 | .merge(tr_samples, on='description_id', how='left')
140 | tr_samples.loc[tr_samples['target_paper_id'] == tr_samples['paper_id'], 'target'] = 1
141 | loader.save_df(tr_samples[out_cols], train_out_path)
142 | print ('recall succ {} from {}'.format(tr_samples['target'].sum(), tr.shape[0]))
143 | print (tr.shape, tr_samples.shape)
144 |
145 | if sys.argv[1] in ['te']:
146 | # for te ins
147 | te = loader.load_df(input_root_path + 'te_input_final.ftr')
148 | te = te[~pd.isnull(te['description_id'])]
149 |
150 | # te = te.head(1000)
151 | te_desc, te_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in te['quer_all'].tolist()], \
152 | te['description_id'].tolist()
153 | print ('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
154 |
155 | te_samples = gen_samples(te, te_desc, te_desc_ids, \
156 | corpus_list, paper_ids_list, k=50)
157 | te_samples = te.merge(te_samples, on='description_id', how='left')
158 | loader.save_df(te_samples[out_cols], test_out_path)
159 | print (te.shape, te_samples.shape)
160 |
161 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
162 |
163 |
164 |
165 |
--------------------------------------------------------------------------------
/src/utils/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/src/utils/.gitkeep
--------------------------------------------------------------------------------
/stk_feat/README.md:
--------------------------------------------------------------------------------
1 | ## Dir of generated stacking features
2 |
--------------------------------------------------------------------------------
/tools/__pycache__/basic_learner.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/basic_learner.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/custom_bm25.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/custom_bm25.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/custom_metrics.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/custom_metrics.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/feat_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/feat_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/lgb_learner.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/lgb_learner.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/loader.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/loader.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/nlp_preprocess.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/nlp_preprocess.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/pandas_util.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/pandas_util.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/basic_learner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Basic modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | import loader
21 | import custom_metrics
22 |
23 | # Set random seed
24 | SEED = 2018
25 | np.random.seed (SEED)
26 |
27 | class BaseLearner(object):
28 |
29 | def __init__(self, train_data, test_data,
30 | fea_names, id_names, target_name, \
31 | params, fold_num, out_name, metric_names=['auc'], \
32 | model_postfix=''):
33 | # Deep-copy the raw data so modifications in the calling script cannot corrupt it
34 | self.train_data = train_data.copy(deep=True)
35 | self.test_data = test_data.copy(deep=True)
36 |
37 | # Basic data info
38 | self.fea_names = fea_names
39 | self.id_names = id_names
40 | self.target_name = target_name
41 |
42 | self.params = params
43 | self.fold_num = fold_num
44 | self.out_name = out_name
45 | self.root_path = '../../../output/m3/' + out_name + '/'
46 | self.metric_names = metric_names
47 | self.model_postfix = model_postfix
48 |
49 | # Get the model dump path
50 | def get_model_path(self, predicted_fold_index):
51 | model_path = self.root_path + 'model_' + str(predicted_fold_index)
52 | if self.model_postfix != '':
53 | model_path += '_' + self.model_postfix
54 | return model_path
55 |
56 | # Get the prediction output path
57 | def get_preds_outpath(self, predicted_fold_index):
58 | out_path = self.root_path + self.out_name
59 | if self.model_postfix != '':
60 | out_path += '_' + self.model_postfix
61 | if predicted_fold_index != 0:
62 | out_path += '_cv_' + str(predicted_fold_index)
63 | return out_path
64 |
65 | # Train/validation split interface; to be overridden by subclasses
66 | def extract_train_data(self, data, predicted_fold_index):
67 | pass
68 |
69 | # Single-fold training interface; to be overridden by subclasses
70 | def train(self, data, predicted_fold_index, model_dump_path=None):
71 | pass
72 |
73 | # Single-fold prediction interface; to be overridden by subclasses
74 | def predict(self, data, predicted_fold_index, model_load_path=None):
75 | pass
76 |
77 | # Multi-fold training
78 | def multi_fold_train(self, data, predicted_folds=[1,2,3,4,5], \
79 | need_predict_test=False):
80 | print ("multi_fold train start {}".format(datetime.now()))
81 | ts = time.time()
82 | for fold_index in predicted_folds:
83 | print ('training fold {}'.format(fold_index))
84 | self.train(data, fold_index)
85 | print ('fold {} completed, cost {}s'.format( \
86 | fold_index, time.time() - ts))
87 | self.multi_fold_predict(data, predicted_folds, need_predict_test)
88 |
89 | # Multi-fold prediction
90 | def multi_fold_predict(self, data, predicted_folds, \
91 | need_predict_test=False):
92 | print ("multi_fold predict start {}".format(datetime.now()))
93 |
94 | multi_fold_eval_lis = []
95 |
96 | for fold_index in predicted_folds:
97 | dtrain, dvalid, Xvalid = self.extract_train_data( \
98 | self.train_data, fold_index)
99 |
100 | ypreds = self.predict(Xvalid, fold_index)
101 | labels = Xvalid[self.target_name]
102 |
103 | eval_lis = custom_metrics.calc_metrics(labels, ypreds, \
104 | self.metric_names)
105 |
106 | multi_fold_eval_lis.append(eval_lis)
107 | print ('{} eval: {}'.format(fold_index, eval_lis))
108 | loader.out_preds(self.target_name, \
109 | Xvalid[self.id_names], ypreds, \
110 | '{}.csv'.format(self.get_preds_outpath(fold_index)), \
111 | labels.tolist())
112 |
113 | if need_predict_test:
114 | print ('predict test data')
115 | ypreds = self.predict(self.test_data, 0,
116 | model_load_path=self.get_model_path(fold_index))
117 | # output preds
118 | loader.out_preds(self.target_name, \
119 | self.test_data[self.id_names], ypreds, \
120 | '{}_{}.csv'.format(self.get_preds_outpath(0), fold_index))
121 |
122 | multi_fold_eval_avgs = []
123 | for i in range(len(self.metric_names)):
124 | eval_avg = np.array([val[i] for val in multi_fold_eval_lis]).mean()
125 | eval_avg = round(eval_avg, 5)
126 | multi_fold_eval_avgs.append(eval_avg)
127 | print ('multi fold eval mean: ', multi_fold_eval_avgs)
128 |
129 | return multi_fold_eval_avgs
130 |
131 |
132 |
--------------------------------------------------------------------------------
/tools/basic_learner.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/basic_learner.pyc
--------------------------------------------------------------------------------
/tools/custom_bm25.py:
--------------------------------------------------------------------------------
1 | import math
2 | from six import iteritems
3 | from six.moves import range
4 |
5 | PARAM_K1 = 1.5
6 | PARAM_B = 0.75
7 | EPSILON = 0.25
8 |
9 | class BM25(object):
10 | def __init__(self, corpus):
11 | """
12 | Parameters
13 | ----------
14 | corpus : list of list of (int, int)
15 | Corpus in bag-of-words form; each document is a list of (token_id, token_count) tuples, e.g. the output of gensim's Dictionary.doc2bow.
16 | """
17 | self.corpus_size = 0
18 | self.avgdl = 0
19 | self.doc_freqs = []
20 | self.idf = {}
21 | self.doc_len = []
22 | self._initialize(corpus)
23 |
24 | def _initialize(self, corpus):
25 | """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
26 | nd = {} # word -> number of documents with word
27 | num_doc = 0
28 |
29 | for document in corpus:
30 | self.corpus_size += 1
31 | cur_doc_len = 0
32 | frequencies = {}
33 |
34 | for word_tuple in document:
35 | word, word_cnt = word_tuple[0], word_tuple[1]
36 | if word not in frequencies:
37 | frequencies[word] = 0
38 | frequencies[word] += word_cnt
39 | cur_doc_len += word_cnt
40 | self.doc_freqs.append(frequencies)
41 | self.doc_len.append(cur_doc_len)
42 | num_doc += cur_doc_len
43 |
44 | for word, freq in iteritems(frequencies):
45 | if word not in nd:
46 | nd[word] = 0
47 | nd[word] += 1
48 |
49 | self.avgdl = float(num_doc) / self.corpus_size
50 | # collect idf sum to calculate an average idf for epsilon value
51 | idf_sum = 0
52 | # collect words with negative idf to set them a special epsilon value.
53 | # idf can be negative if word is contained in more than half of documents
54 | negative_idfs = []
55 | for word, freq in iteritems(nd):
56 | idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
57 | self.idf[word] = idf
58 | idf_sum += idf
59 | if idf < 0:
60 | negative_idfs.append(word)
61 | self.average_idf = float(idf_sum) / len(self.idf)
62 |
63 | eps = EPSILON * self.average_idf
64 | for word in negative_idfs:
65 | self.idf[word] = eps
66 |
67 | def get_score(self, document, index):
68 | """Computes BM25 score of given `document` in relation to item of corpus selected by `index`.
69 | Parameters
70 | ----------
71 | document : list of (int, int)
72 | Query document in bag-of-words form ((token_id, token_count) tuples) to be scored.
73 | index : int
74 | Index of document in corpus selected to score with `document`.
75 | Returns
76 | -------
77 | float
78 | BM25 score.
79 | """
80 | score = 0
81 | doc_freqs = self.doc_freqs[index]
82 | for word_tuple in document:
83 | word = word_tuple[0]
84 | if word not in doc_freqs:
85 | continue
86 | score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
87 | / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
88 | return score
89 |
90 | def get_scores(self, document):
91 | """Computes and returns BM25 scores of given `document` in relation to
92 | every item in corpus.
93 | Parameters
94 | ----------
95 | document : list of (int, int)
96 | Query document in bag-of-words form ((token_id, token_count) tuples) to be scored.
97 | Returns
98 | -------
99 | list of float
100 | BM25 scores.
101 | """
102 | scores = [self.get_score(document, index) for index in range(self.corpus_size)]
103 | return scores
104 |
105 | def get_scores_bow(self, document):
106 | """Computes and returns BM25 scores of given `document` in relation to
107 | every item in corpus.
108 | Parameters
109 | ----------
110 | document : list of (int, int)
111 | Query document in bag-of-words form ((token_id, token_count) tuples) to be scored.
112 | Returns
113 | -------
114 | list of float
115 | BM25 scores.
116 | """
117 | scores = []
118 | for index in range(self.corpus_size):
119 | score = self.get_score(document, index)
120 | if score > 0:
121 | scores.append((index, score))
122 | return scores
123 |
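124 | # Minimal usage sketch (illustrative only, not part of the original pipeline):
125 | # the corpus is expected in gensim-style bag-of-words form, i.e. each document
126 | # is a list of (token_id, token_count) tuples such as Dictionary.doc2bow returns.
127 | if __name__ == '__main__':
128 |     corpus = [
129 |         [(0, 2), (1, 1)],   # doc 0: token 0 appears twice, token 1 once
130 |         [(1, 1), (2, 3)],   # doc 1
131 |     ]
132 |     model = BM25(corpus)
133 |     query = [(1, 1), (2, 1)]        # query, also in bag-of-words form
134 |     print(model.get_scores(query))  # one BM25 score per corpus document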
--------------------------------------------------------------------------------
/tools/custom_bm25.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/custom_bm25.pyc
--------------------------------------------------------------------------------
/tools/custom_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import sys
6 | import gc
7 | import json
8 | import time
9 | import functools
10 | from datetime import datetime
11 |
12 | # Data processing
13 | import numpy as np
14 | import pandas as pd
15 | from math import sqrt
16 |
17 | # Evaluation metrics
18 | from sklearn.metrics import log_loss
19 | from sklearn.metrics import roc_auc_score
20 | from sklearn.metrics import accuracy_score
21 | from sklearn.metrics import mean_absolute_error
22 | from sklearn.metrics import mean_squared_error
23 |
24 | def _calc_auc(labels, ypreds):
25 | return roc_auc_score(labels, ypreds)
26 |
27 | def _calc_logloss(labels, ypreds):
28 | return log_loss(labels, ypreds)
29 |
30 | def _calc_mae(labels, ypreds):
31 | return mean_absolute_error(labels, ypreds)
32 |
33 | def _calc_rmse(labels, ypreds):
34 | return sqrt(mean_squared_error(labels, ypreds))
35 |
36 | # kappa
37 |
38 | # multi-logloss
39 |
40 | def _calc_metric(labels, ypreds, metric_name='auc'):
41 | if metric_name == 'auc':
42 | return _calc_auc(labels, ypreds)
43 | elif metric_name == 'logloss':
44 | return _calc_logloss(labels, ypreds)
45 | elif metric_name == 'mae':
46 | return _calc_mae(labels, ypreds)
47 | elif metric_name == 'rmse':
48 | return _calc_rmse(labels, ypreds)
49 |
50 | def calc_metrics(labels, ypreds, metric_names=['auc']):
51 | eval_lis = []
52 | for metric_name in metric_names:
53 | eval_val = _calc_metric(labels, ypreds, metric_name=metric_name)
54 | eval_val = round(eval_val, 5)
55 | eval_lis.append(eval_val)
56 | return eval_lis
57 |
58 |
59 |
60 |
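61 | # Minimal usage sketch (illustrative only): the learners call calc_metrics on
62 | # each validation fold with metric_names=['auc', 'logloss'].
63 | if __name__ == '__main__':
64 |     labels = [0, 1, 1, 0, 1]
65 |     ypreds = [0.1, 0.8, 0.6, 0.4, 0.9]
66 |     print(calc_metrics(labels, ypreds, metric_names=['auc', 'logloss']))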
--------------------------------------------------------------------------------
/tools/custom_metrics.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/custom_metrics.pyc
--------------------------------------------------------------------------------
/tools/feat_utils.py:
--------------------------------------------------------------------------------
1 | def try_divide(x, y, val=0.0):
2 | """
3 | Try to divide two numbers
4 | """
5 | if y != 0.0:
6 | val = float(x) / y
7 | return val
8 |
9 |
10 | def get_sample_indices_by_relevance(dfTrain, additional_key=None):
11 | """
12 | return a dict with
13 | key: (additional_key, median_relevance)
14 | val: list of sample indices
15 | """
16 | dfTrain["sample_index"] = range(dfTrain.shape[0])
17 | group_key = ["median_relevance"]
18 | if additional_key != None:
19 | group_key.insert(0, additional_key)
20 | agg = dfTrain.groupby(group_key, as_index=False).apply(lambda x: list(x["sample_index"]))
21 | d = dict(agg)
22 | dfTrain = dfTrain.drop("sample_index", axis=1)
23 | return d
24 |
25 |
26 | def dump_feat_name(feat_names, feat_name_file):
27 | """
28 | save feat_names to feat_name_file
29 | """
30 | with open(feat_name_file, "wb") as f:
31 | for i,feat_name in enumerate(feat_names):
32 | if feat_name.startswith("count") or feat_name.startswith("pos_of"):
33 | f.write("('%s', SimpleTransform(config.count_feat_transform)),\n" % feat_name)
34 | else:
35 | f.write("('%s', SimpleTransform()),\n" % feat_name)
36 |
--------------------------------------------------------------------------------
/tools/lgb_learner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import sys
6 | import gc
7 | import json
8 | import time
9 | import functools
10 | from datetime import datetime
11 |
12 | # Data processing
13 | import numpy as np
14 | import pandas as pd
15 |
16 | # Model-related
17 | import lightgbm as lgb
18 | from basic_learner import BaseLearner
19 |
20 | # Set random seed
21 | SEED = 2018
22 | np.random.seed (SEED)
23 |
24 | # Common model parameters
25 | EVAL_ROUND = 100
26 | PRINT_TRAIN_METRICS = False
27 |
28 |
29 | class lgbLearner(BaseLearner):
30 |
31 | def __init__(self, train_data, test_data, \
32 | fea_names, id_names, target_name, \
33 | params, num_trees, fold_num, out_name, \
34 | cv_name='cv', metric_names=['auc'], model_postfix=''):
35 | super(lgbLearner, self).__init__(train_data, test_data, fea_names, \
36 | id_names, target_name, params, fold_num, \
37 | out_name, metric_names, model_postfix)
38 | self.num_trees = num_trees
39 | self.cv_name = cv_name
40 |
41 | self.eval_round = EVAL_ROUND
42 | self.print_train_metrics = PRINT_TRAIN_METRICS
43 |
44 | def extract_train_data(self, data, predicted_fold_index):
45 |
46 | Xtrain = data[data[self.cv_name] != predicted_fold_index]
47 | Xvalid = data[data[self.cv_name] == predicted_fold_index]
48 |
49 | dtrain = lgb.Dataset(Xtrain[self.fea_names].values, \
50 | Xtrain[self.target_name])
51 | dvalid = lgb.Dataset(Xvalid[self.fea_names].values, \
52 | Xvalid[self.target_name])
53 |
54 | print ('train, valid', Xtrain.shape, Xvalid.shape)
55 | return dtrain, dvalid, Xvalid
56 |
57 | def train(self, data, predicted_fold_index, model_dump_path=None):
58 | if model_dump_path == None:
59 | model_dump_path = self.get_model_path(predicted_fold_index)
60 |
61 | dtrain, dvalid, Xvalid = self.extract_train_data(self.train_data,
62 | predicted_fold_index)
63 |
64 | if self.print_train_metrics:
65 | valid_sets = [dtrain, dvalid] \
66 | if predicted_fold_index != 0 else [dtrain]
67 | valid_names = ['train', 'valid'] \
68 | if predicted_fold_index != 0 else ['train']
69 | else:
70 | valid_sets = [dvalid] if predicted_fold_index != 0 else [dtrain]
71 | valid_names = ['valid'] if predicted_fold_index != 0 else ['train']
72 |
73 | params = self.params
74 |
75 | bst = lgb.train(params, dtrain, self.num_trees,
76 | valid_sets=valid_sets,
77 | valid_names=valid_names,
78 | verbose_eval=self.eval_round)
79 | bst.save_model(model_dump_path)
80 |
81 | def predict(self, data, predicted_fold_index, \
82 | model_load_path=None):
83 | if model_load_path is None:
84 | model_load_path = self.get_model_path(predicted_fold_index)
85 |
86 | bst = lgb.Booster(model_file=model_load_path)
87 | ypreds = bst.predict(data[self.fea_names], num_iteration=self.num_trees)
88 |
89 | if predicted_fold_index != 0:
90 | # output fea importance
91 | df = pd.DataFrame(self.fea_names, columns=['feature'])
92 | df['importance'] = list(bst.feature_importance('gain'))
93 | df['percent'] = np.round(df.importance * 100 / sum(df.importance), 2)
94 | df['percent'] = df.percent.apply(lambda x : str(x) + '%')
95 |
96 | df = df.sort_values(by='importance', ascending=False)
97 | imp_path = 'imp'
98 | if self.model_postfix != '':
99 | imp_path = 'imp-{}'.format(self.model_postfix)
100 | df.to_csv(self.root_path + imp_path, sep='\t')
101 | return ypreds
102 |
103 |
--------------------------------------------------------------------------------
/tools/lgb_learner.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/lgb_learner.pyc
--------------------------------------------------------------------------------
/tools/loader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Data processing
5 | import numpy as np
6 | import pandas as pd
7 | import feather
8 |
9 | # Basic file I/O
10 | def load_df(filename, nrows=None):
11 | if filename.endswith('csv'):
12 | return pd.read_csv(filename, nrows = nrows)
13 | elif filename.endswith('ftr'):
14 | return feather.read_dataframe(filename)
15 |
16 | def save_df(df, filename, index=False):
17 | if filename.endswith('csv'):
18 | df.to_csv(filename, index=index)
19 | elif filename.endswith('ftr'):
20 | df = df.reset_index(drop=True)
21 | df.columns = [str(col) for col in df.columns]
22 | df.to_feather(filename)
23 |
24 | # Merge feature files
25 | def merge_fea(df_list, primary_keys=[]):
26 | assert len(primary_keys) > 0, 'empty primary keys'
27 | print (df_list)
28 |
29 | df_base = load_df(df_list[0])
30 | for i in range(1, len(df_list)):
31 | print (df_list[i])
32 | cur_df = load_df(df_list[i])
33 | df_base = pd.concat([df_base, \
34 | cur_df.drop(primary_keys, axis=1)], axis=1)
35 | print ('merge completed, df shape', df_base.shape)
36 | return df_base
37 |
38 | # Output model predictions
39 | def out_preds(target_name, df_ids, ypreds, out_path, labels=[]):
40 | preds_df = pd.DataFrame(df_ids)
41 | preds_df[target_name] = ypreds
42 | if len(labels) == preds_df.shape[0]:
43 | preds_df['label'] = np.array(labels)
44 | elif len(labels) > 0:
45 | print ('labels length not match')
46 | preds_df.to_csv(out_path, float_format = '%.4f', index=False)
47 |
48 | #def out_preds(id_name, target_name, ids, ypreds, out_path, labels=[]):
49 | # preds_df = pd.DataFrame({id_name: np.array(ids)})
50 | # preds_df[target_name] = ypreds
51 | # if len(labels) == preds_df.shape[0]:
52 | # preds_df['label'] = np.array(labels)
53 | # elif len(labels) > 0:
54 | # print ('labels length not match')
55 | # preds_df.to_csv(out_path, float_format = '%.4f', index=False)
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/tools/loader.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/loader.pyc
--------------------------------------------------------------------------------
/tools/nlp_preprocess.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | import numpy as np
4 | import nltk
5 | # nltk.download('punkt')
6 | from nltk.corpus import stopwords
7 | from nltk import word_tokenize, pos_tag
8 | from nltk.stem import WordNetLemmatizer
9 |
10 | def tokenize(sentence):
11 | '''
12 | Strip extra whitespace, tokenize and POS-tag
13 | '''
14 | sentence = re.sub(r'\s+', ' ', sentence)
15 | token_words = word_tokenize(sentence) # result is a list of tokens
16 | token_words = pos_tag(token_words)
17 | return token_words
18 |
19 | def stem(token_words):
20 | '''
21 | Lemmatization (normalize word forms)
22 | '''
23 | wordnet_lematizer = WordNetLemmatizer() # convert words to their base form
24 | words_lematizer = []
25 | for word, tag in token_words:
26 | if tag.startswith('NN'):
27 | word_lematizer = wordnet_lematizer.lemmatize(word, pos='n') # 'n' stands for noun
28 | elif tag.startswith('VB'):
29 | word_lematizer = wordnet_lematizer.lemmatize(word, pos='v') # 'v' stands for verb
30 | elif tag.startswith('JJ'):
31 | word_lematizer = wordnet_lematizer.lemmatize(word, pos='a') # 'a' stands for adjective
32 | elif tag.startswith('R'):
33 | word_lematizer = wordnet_lematizer.lemmatize(word, pos='r') # 'r' stands for adverb
34 | else:
35 | word_lematizer = wordnet_lematizer.lemmatize(word)
36 | words_lematizer.append(word_lematizer)
37 | return words_lematizer
38 |
39 |
40 | sr = stopwords.words('english')
41 |
42 |
43 | def delete_stopwords(token_words):
44 | '''
45 | Remove stopwords
46 | '''
47 | cleaned_words = [word for word in token_words if word not in sr]
48 | return cleaned_words
49 |
50 |
51 | def is_number(s):
52 | '''
53 | Check whether a string is a number
54 | '''
55 | try:
56 | float(s)
57 | return True
58 | except ValueError:
59 | pass
60 |
61 | try:
62 | import unicodedata
63 | unicodedata.numeric(s)
64 | return True
65 | except (TypeError, ValueError):
66 | pass
67 |
68 | return False
69 |
70 |
71 | characters = [' ', ',', '.', 'DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '...',
72 | '^', '{', '}']
73 |
74 |
75 | def delete_characters(token_words):
76 | '''
77 | Remove special characters and numbers
78 | '''
79 | words_list = [word for word in token_words if word not in characters and not is_number(word)]
80 | return words_list
81 |
82 |
83 | def to_lower(token_words):
84 | '''
85 | Convert all tokens to lowercase
86 | '''
87 | words_lists = [x.lower() for x in token_words]
88 | return words_lists
89 |
90 | def replace_process(line):
91 | m = {
92 | 'α': 'alpha',
93 | 'β': 'beta',
94 | 'γ': 'gamma',
95 | 'δ': 'delta',
96 | 'ε': 'epsilon',
97 | 'ζ': 'zeta',
98 | 'η': 'eta',
99 | 'θ': 'theta',
100 | 'ι': 'iota',
101 | 'κ': 'kappa',
102 | 'λ': 'lambda',
103 | 'μ': 'mu',
104 | 'ν': 'nu',
105 | 'ξ': 'xi',
106 | 'ο': 'omicron',
107 | 'π': 'pi',
108 | 'ρ': 'rho',
109 | 'ς': 'sigma',
110 | 'σ': 'sigma',
111 | 'τ': 'tau',
112 | 'υ': 'upsilon',
113 | 'φ': 'phi',
114 | 'χ': 'chi',
115 | 'ψ': 'psi',
116 | 'ω': 'omega',
117 | 'ϑ': 'theta',
118 | 'ϒ': 'gamma',
119 | 'ϕ': 'phi',
120 | 'ϱ': 'rho',
121 | 'ϵ': 'epsilon',
122 | '𝛼': 'alpha',
123 | '𝛽': 'beta',
124 | '𝜀': 'epsilon',
125 | '𝜃': 'theta',
126 | '𝜏': 'tau',
127 | '𝜖': 'epsilon',
128 | '𝜷': 'beta',
129 | }
130 | empty_str = ['etc.','et al.','fig.','figure.','e.g.','(', ')','[', ']',';',':','!',',','.','?','"','\'', \
131 | '%','>','<','+','&']
132 | m.update({s: ' ' for s in empty_str})
133 |
134 | for k, v in m.items():
135 | line = line.replace(k, v)
136 | line = ' '.join([s.strip() for s in line.split(' ') if s != ''])
137 | return line
138 |
139 | def preprocess(line):
140 | '''
141 | Text preprocessing pipeline
142 | '''
143 | line = line.lower()
144 | line = replace_process(line)
145 | token_words = tokenize(line)
146 | token_words = stem(token_words)
147 | token_words = delete_stopwords(token_words)
148 | token_words = delete_characters(token_words)
149 | token_words = to_lower(token_words)
150 | return ' '.join(token_words)
151 |
152 | if __name__ == '__main__':
153 | text = 'This experiment was conducted to determine whether feeding meal and hulls derived from genetically modified soybeans to dairy cows affected production measures and sensory qualities of milk. The soybeans were genetically modified (Event DAS-444Ø6-6) to be resistant to multiple herbicides. Twenty-six Holstein cows (13/treatment) were fed a diet that contained meal and hulls derived from transgenic soybeans or a diet that contained meal and hulls from a nontransgenic near-isoline variety. Soybean products comprised approximately 21% of the diet dry matter, and diets were formulated to be nearly identical in crude protein, neutral detergent fiber, energy, and minerals and vitamins. The experimental design was a replicated 2×2 Latin square with a 28-d feeding period. Dry matter intake (21.3 vs. 21.4kg/d), milk yield (29.3 vs. 29.4kg/d), milk fat (3.70 vs. 3.68%), and milk protein (3.10 vs. 3.12%) did not differ between cows fed control or transgenic soybean products, respectively. Milk fatty acid profile was virtually identical between treatments. Somatic cell count was significantly lower for cows fed transgenic soybean products, but the difference was biologically trivial. Milk was collected from all cows in period 1 on d 0 (before treatment), 14, and 28 for sensory evaluation. On samples from all days (including d 0) judges could discriminate between treatments for perceived appearance of the milk. The presence of this difference at d 0 indicated that it was likely not a treatment effect but rather an initial bias in the cow population. No treatment differences were found for preference or acceptance of the milk. Overall, feeding soybean meal and hulls derived from this genetically modified soybean had essentially no effects on production or milk acceptance when fed to dairy cows. '
154 | text = 'Pyrvinium is a drug approved by the FDA and identified as a Wnt inhibitor by inhibiting Axin degradation and stabilizing 尾-catenin, which can increase Ki67+ cardiomyocytes in the peri-infarct area and alleviate cardiac remodeling in a mouse model of MI . UM206 is a peptide with a high homology to Wnt-3a/5a, and acts as an antagonist for Frizzled proteins to inhibit Wnt signaling pathway transduction. UM206 could reduce infarct size, increase the numbers of capillaries, decrease myofibroblasts in infarct area of post-MI heart, and ultimately suppress the development of heart failure . ICG-001, which specifically inhibits the interaction between 尾-catenin and CBP in the Wnt canonical signaling pathway, can promote the differentiation of epicardial progenitors, thereby contributing to myocardial regeneration and improving cardiac function in a rat model of MI . Small molecules invaliding Porcupine have been further studied, such as WNT-974, GNF-6231 and CGX-1321. WNT-974 decreases fibrosis in post-MI heart, with a mechanism of preventing collagen production in cardiomyocytes by blocking secretion of Wnt-3, a pro-fibrotic agonist, from cardiac fibroblasts and its signaling to cardiomyocytes . The phosphorylation of DVL protein is decreased in both the canonical and non-canonical Wnt signaling pathways by WNT-974 administration . GNF-6231 prevents adverse cardiac remodeling in a mouse model of MI by inhibiting the proliferation of interstitial cells, increasing the proliferation of Sca1+ cardiac progenitors and reducing the apoptosis of cardiomyocytes [[**##**]]. Similarly, we demonstrate that CGX-1321, which has also been applied in a phase I clinical trial to treat solid tumors ({"type":"clinical-trial","attrs":{"text":"NCT02675946","term_id":"NCT02675946"}}NCT02675946), inhibits both canonical and non-canonical Wnt signaling pathways in post-MI heart. CGX-1321 promotes cardiac function by reducing fibrosis and stimulating cardiomyocyte proliferation-mediated cardiac regeneration in a Hippo/YAP-independent manner . These reports implicate that Wnt pathway inhibitors are a class of potential drugs for treating MI through complex mechanisms, including reducing cardiomyocyte death, increasing angiogenesis, suppressing fibrosis and stimulating cardiac regeneration.'
155 | token_words = tokenize(text)
156 | print(token_words)
157 | token_words = stem(token_words) # lemmatize to base forms
158 | token_words = delete_stopwords(token_words) # remove stopwords
159 | token_words = delete_characters(token_words)
160 | token_words = to_lower(token_words)
161 | print(token_words)
162 |
--------------------------------------------------------------------------------
/tools/pandas_util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Basic modules
5 | import math
6 | import os
7 | import sys
8 | import time
9 | from datetime import datetime
10 | from tqdm import tqdm
11 |
12 | # Data processing
13 | import numpy as np
14 | import pandas as pd
15 |
16 | def string_to_array(s):
17 | """Convert pipe separated string to array."""
18 |
19 | if isinstance(s, str):
20 | out = s.split("|")
21 | elif math.isnan(s):
22 | out = []
23 | else:
24 | raise ValueError("Value must be either string of nan")
25 | return out
26 |
27 |
28 | def explode(df_in, col_expls):
29 | """Explode column col_expl of array type into multiple rows."""
30 |
31 | df = df_in.copy()
32 | for col_expl in col_expls:
33 | df.loc[:, col_expl] = df[col_expl].apply(string_to_array)
34 |
35 | base_cols = list(set(df.columns) - set(col_expls))
36 | df_out = pd.DataFrame(
37 | {col: np.repeat(df[col].values,
38 | df[col_expls[0]].str.len())
39 | for col in base_cols}
40 | )
41 |
42 | for col_expl in col_expls:
43 | df_out.loc[:, col_expl] = np.concatenate(df[col_expl].values)
44 | df_out.loc[:, col_expl] = df_out[col_expl]
45 | return df_out
46 |
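47 | # Minimal usage sketch (illustrative only): the recall scripts store the top-k
48 | # candidates as pipe-separated 'paper_id' / 'sim_score' strings and then call
49 | # explode to get one row per (description_id, paper_id) pair.
50 | if __name__ == '__main__':
51 |     demo = pd.DataFrame({
52 |         'description_id': ['d1', 'd2'],
53 |         'paper_id': ['p1|p2', 'p3'],
54 |         'sim_score': ['0.9|0.5', '0.7'],
55 |     })
56 |     print(explode(demo, ['paper_id', 'sim_score']))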
--------------------------------------------------------------------------------
/tools/pandas_util.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/pandas_util.pyc
--------------------------------------------------------------------------------