├── README.md
├── feat
│   └── README.md
├── input
│   └── README.md
├── output
│   ├── README.md
│   ├── m1
│   │   ├── catboost03
│   │   │   └── .gitkeep
│   │   ├── inferSent1
│   │   │   └── .gitkeep
│   │   └── nn02
│   │       └── .gitkeep
│   └── m3
│       ├── lgb_m3_32-50-0
│       │   └── .gitkeep
│       ├── lgb_m3_37-0
│       │   └── .gitkeep
│       └── lgb_m3_38-0
│           └── .gitkeep
├── src
│   ├── ensemble
│   │   └── .gitkeep
│   ├── feature
│   │   ├── .gitkeep
│   │   ├── .ipynb_checkpoints
│   │   │   └── gen_dict-checkpoint.ipynb
│   │   ├── data_preprocess.py
│   │   ├── feat30-50.py
│   │   ├── feat31-50.py
│   │   ├── feat32-50.py
│   │   ├── feat37-pairwise.py
│   │   ├── feat38-stk.py
│   │   ├── feat40.py
│   │   ├── gen_dict.ipynb
│   │   ├── gen_samples.py
│   │   └── tfidf_recall_30.py
│   ├── rank
│   │   ├── m1
│   │   │   ├── catboost03.py
│   │   │   ├── glove
│   │   │   │   ├── .gitignore
│   │   │   │   ├── .travis.yml
│   │   │   │   ├── LICENSE
│   │   │   │   ├── Makefile
│   │   │   │   ├── README.md
│   │   │   │   ├── demo.sh
│   │   │   │   ├── eval
│   │   │   │   │   ├── matlab
│   │   │   │   │   │   ├── WordLookup.m
│   │   │   │   │   │   ├── evaluate_vectors.m
│   │   │   │   │   │   └── read_and_evaluate.m
│   │   │   │   │   ├── octave
│   │   │   │   │   │   ├── WordLookup_octave.m
│   │   │   │   │   │   ├── evaluate_vectors_octave.m
│   │   │   │   │   │   └── read_and_evaluate_octave.m
│   │   │   │   │   ├── python
│   │   │   │   │   │   ├── distance.py
│   │   │   │   │   │   ├── evaluate.py
│   │   │   │   │   │   └── word_analogy.py
│   │   │   │   │   └── question-data
│   │   │   │   │       ├── capital-common-countries.txt
│   │   │   │   │       ├── capital-world.txt
│   │   │   │   │       ├── city-in-state.txt
│   │   │   │   │       ├── currency.txt
│   │   │   │   │       ├── family.txt
│   │   │   │   │       ├── gram1-adjective-to-adverb.txt
│   │   │   │   │       ├── gram2-opposite.txt
│   │   │   │   │       ├── gram3-comparative.txt
│   │   │   │   │       ├── gram4-superlative.txt
│   │   │   │   │       ├── gram5-present-participle.txt
│   │   │   │   │       ├── gram6-nationality-adjective.txt
│   │   │   │   │       ├── gram7-past-tense.txt
│   │   │   │   │       ├── gram8-plural.txt
│   │   │   │   │       └── gram9-plural-verbs.txt
│   │   │   │   └── src
│   │   │   │       ├── README.md
│   │   │   │       ├── cooccur.c
│   │   │   │       ├── glove.c
│   │   │   │       ├── shuffle.c
│   │   │   │       └── vocab_count.c
│   │   │   ├── inferSent1-5-fold_predict.py
│   │   │   ├── inferSent1-5-fold_train.py
│   │   │   ├── nn02_predict.py
│   │   │   ├── nn02_train.py
│   │   │   ├── prepare_rank_train.py
│   │   │   ├── run.sh
│   │   │   └── w2v_training.py
│   │   ├── m2
│   │   │   ├── bert_5_fold_predict.py
│   │   │   ├── bert_5_fold_train.py
│   │   │   ├── bert_preprocessing.py
│   │   │   ├── change_formatting4stk.py
│   │   │   ├── final_blend.py
│   │   │   ├── fold_result_integration.py
│   │   │   ├── gen_w2v.sh
│   │   │   ├── mk_submission.py
│   │   │   ├── model.py
│   │   │   ├── nn_5_fold_predict.py
│   │   │   ├── nn_5_fold_train.py
│   │   │   ├── nn_preprocessing.py
│   │   │   ├── preprocessing.py
│   │   │   ├── run.sh
│   │   │   └── utils.py
│   │   └── m3
│   │       ├── convert.py
│   │       ├── eval.py
│   │       ├── flow.py
│   │       ├── kfold_merge.py
│   │       ├── lgb_train_32-50-0.py
│   │       ├── lgb_train_37-0.py
│   │       ├── lgb_train_38-0.py
│   │       ├── lgb_train_38-1.py
│   │       └── lgb_train_40-0.py
│   ├── recall
│   │   └── tfidf_recall_30.py
│   └── utils
│       └── .gitkeep
├── stk_feat
│   └── README.md
└── tools
    ├── __pycache__
    │   ├── basic_learner.cpython-37.pyc
    │   ├── custom_bm25.cpython-37.pyc
    │   ├── custom_metrics.cpython-37.pyc
    │   ├── feat_utils.cpython-37.pyc
    │   ├── lgb_learner.cpython-37.pyc
    │   ├── loader.cpython-37.pyc
    │   ├── nlp_preprocess.cpython-37.pyc
    │   └── pandas_util.cpython-37.pyc
    ├── basic_learner.py
    ├── basic_learner.pyc
    ├── custom_bm25.py
    ├── custom_bm25.pyc
    ├── custom_metrics.py
    ├── custom_metrics.pyc
    ├── feat_utils.py
    ├── lgb_learner.py
    ├── lgb_learner.pyc
    ├── loader.py
    ├── loader.pyc
    ├── nlp_preprocess.py
    ├── pandas_util.py
    └── pandas_util.pyc
/README.md:
--------------------------------------------------------------------------------
1 | # WSDM2020-solution
2 | ## Team Name: funny
3 | Team Members: just4fun, greedisgood, slowdown, funny
4 | ## No Data Leak
5 | We achieved a MAP@3 score of 0.37458 on part 1 and 0.38020 on part 2 without using any data leak in the competition. During the recall stage we retrieve related papers from the whole dataset, without any tricky data screening.
6 |
7 | ## Our Basic Solution
8 | data preprocess -> recall by text similarity -> single model (LGB + NN) -> model stacking -> linear ensemble -> final result
9 |
10 |
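For reference, the MAP@3 metric above can be computed as follows. This is a minimal sketch assuming exactly one ground-truth paper per description; the training scripts in this repository use `ml_metrics.mapk` for the same purpose.

```python
def map_at_3(true_ids, ranked_pred_ids):
    """MAP@3 with a single relevant paper per description.

    true_ids[i]        -- the ground-truth paper_id for description i
    ranked_pred_ids[i] -- up to 3 predicted paper_ids, best first
    """
    score = 0.0
    for true_id, preds in zip(true_ids, ranked_pred_ids):
        for rank, pid in enumerate(preds[:3], start=1):
            if pid == true_id:
                score += 1.0 / rank
                break
    return score / len(true_ids)

# map_at_3(['p1'], [['p3', 'p1', 'p2']]) -> 0.5
```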
--------------------------------------------------------------------------------
/feat/README.md:
--------------------------------------------------------------------------------
1 | ## Dir of generated features
2 |
--------------------------------------------------------------------------------
/input/README.md:
--------------------------------------------------------------------------------
1 | ## Dir of input
2 |
--------------------------------------------------------------------------------
/output/README.md:
--------------------------------------------------------------------------------
1 | ## Dir of cv results and test results.
2 |
--------------------------------------------------------------------------------
/output/m1/catboost03/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m1/catboost03/.gitkeep
--------------------------------------------------------------------------------
/output/m1/inferSent1/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m1/inferSent1/.gitkeep
--------------------------------------------------------------------------------
/output/m1/nn02/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m1/nn02/.gitkeep
--------------------------------------------------------------------------------
/output/m3/lgb_m3_32-50-0/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m3/lgb_m3_32-50-0/.gitkeep
--------------------------------------------------------------------------------
/output/m3/lgb_m3_37-0/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m3/lgb_m3_37-0/.gitkeep
--------------------------------------------------------------------------------
/output/m3/lgb_m3_38-0/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/output/m3/lgb_m3_38-0/.gitkeep
--------------------------------------------------------------------------------
/src/ensemble/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/src/ensemble/.gitkeep
--------------------------------------------------------------------------------
/src/feature/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/src/feature/.gitkeep
--------------------------------------------------------------------------------
/src/feature/data_preprocess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Basic modules
5 | import os
6 | import sys
7 | import time
8 | from tqdm import tqdm
9 | from datetime import datetime
10 |
11 | # Data processing
12 | import re
13 | import pickle
14 | import numpy as np
15 | import pandas as pd
16 | from multiprocessing import Pool
17 |
18 | # Custom utility packages
19 | sys.path.append('../../tools/')
20 | import loader
21 | import pandas_util
22 | from nlp_preprocess import preprocess
23 |
24 | # Set random seed
25 | SEED = 2020
26 | PROCESS_NUM, PARTITION_NUM = 32, 32
27 |
28 | input_root_path = '../../input/'
29 | output_root_path = '../../input/'
30 |
31 | postfix = 'final_all'
32 | file_type = 'ftr'
33 |
34 | tr_out_path = output_root_path + 'tr_input_{}.{}'.format(postfix, file_type)
35 | te_out_path = output_root_path + 'te_input_{}.{}'.format(postfix, file_type)
36 | paper_out_path = output_root_path + 'paper_input_{}.{}'.format(postfix, file_type)
37 |
38 | # Extract the key sentences of a description: keep only the sentences containing the citation placeholder [**##**] (short fragments are merged into the preceding sentence) and strip the placeholder
39 | def digest(text):
40 | backup = text[:]
41 | text = text.replace('al.', '').split('. ')
42 | t=''
43 | pre_text=[]
44 | len_text=len(text)-1
45 | add=True
46 | pre=''
47 | while len_text>=0:
48 | index=text[len_text]
49 | index+=pre
50 | if len(index.split(' '))<=3 :
51 | add=False
52 | pre=index+pre
53 | else:
54 | add=True
55 | pre=''
56 | if add:
57 | pre_text.append(index)
58 | len_text-=1
59 | if len(pre_text)==0:
60 | pre_text=text
61 | pre_text.reverse()
62 | for index in pre_text:
63 | if index.find('[**##**]') != -1:
64 | index = re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+','',index)
65 | index+='. '
66 | t+=index
67 | return t
68 |
69 | def partition(df, num):
70 | df_partitions, step = [], int(np.ceil(df.shape[0]/num))
71 | for i in range(0, df.shape[0], step):
72 | df_partitions.append(df.iloc[i:i+step])
73 | return df_partitions
74 |
75 | def tr_single_process(params=None):
76 | (tr, i) = params
77 | print (i, 'start', datetime.now())
78 | tr['quer_key'] = tr['description_text'].fillna('').progress_apply(lambda s: preprocess(digest(s)))
79 | tr['quer_all'] = tr['description_text'].fillna('').progress_apply(lambda s: preprocess(s))
80 | print (i, 'completed', datetime.now())
81 | return tr
82 |
83 | def paper_single_process(params=None):
84 | (df, i) = params
85 | print (i, 'start', datetime.now())
86 | df['titl'] = df['title'].fillna('').progress_apply(lambda s: preprocess(s))
87 | df['abst'] = df['abstract'].fillna('').progress_apply(lambda s: preprocess(s))
88 | print (i, 'completed', datetime.now())
89 | return df
90 |
91 | def multi_text_process(df, task, process_num=30):
92 | pool = Pool(process_num)
93 | df_parts = partition(df, process_num)
94 | print ('{} processes init and partition to {} parts' \
95 | .format(process_num, process_num))
96 | param_list = [(df_parts[i], i) for i in range(process_num)]
97 | if task in ['tr', 'te']:
98 | dfs = pool.map(tr_single_process, param_list)
99 | elif task in ['paper']:
100 | dfs = pool.map(paper_single_process, param_list)
101 | df = pd.concat(dfs, axis=0)
102 | print (task, 'multi process completed')
103 | print (df.columns)
104 | return df
105 |
106 | if __name__ == "__main__":
107 |
108 | ts = time.time()
109 | tqdm.pandas()
110 | print('start time: %s' % datetime.now())
111 | # load data
112 | df = loader.load_df(input_root_path + 'candidate_paper_for_wsdm2020.ftr')
113 | tr = loader.load_df(input_root_path + 'train_release.csv')
114 | te = loader.load_df(input_root_path + 'test.csv')
115 | cv = loader.load_df(input_root_path + 'cv_ids_0109.csv')
116 |
117 | # Filter duplicate & abnormal records
118 | tr = tr[tr['description_id'].isin(cv['description_id'].tolist())]
119 | tr = tr[tr.description_id != '6.45E+04']
120 |
121 | df = df[~pd.isnull(df['paper_id'])]
122 | tr = tr[~pd.isnull(tr['description_id'])]
123 | print ('pre', te.shape)
124 | te = te[~pd.isnull(te['description_id'])]
125 | print ('post', te.shape)
126 |
127 | #df = df.head(1000)
128 | #tr = tr.head(1000)
129 | #te = te.head(1000)
130 |
131 | tr = multi_text_process(tr, task='tr')
132 | te = multi_text_process(te, task='te')
133 | df = multi_text_process(df, task='paper')
134 |
135 | tr.drop(['description_text'], axis=1, inplace=True)
136 | te.drop(['description_text'], axis=1, inplace=True)
137 | df.drop(['abstract', 'title'], axis=1, inplace=True)
138 | print ('text preprocess completed')
139 |
140 | loader.save_df(tr, tr_out_path)
141 | print (tr.columns)
142 | print (tr.head())
143 |
144 | loader.save_df(te, te_out_path)
145 | print (te.columns)
146 | print (te.head())
147 |
148 | loader.save_df(df, paper_out_path)
149 | print (df.columns)
150 | print (df.head())
151 |
152 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
153 |
154 |
155 |
156 |
--------------------------------------------------------------------------------
/src/feature/feat31-50.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Generate tfidf vector distance features
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import pickle
12 | from datetime import datetime
13 | from tqdm import tqdm
14 |
15 | # Data processing
16 | import numpy as np
17 | import pandas as pd
18 | from tqdm import tqdm
19 | from multiprocessing import Pool
20 |
21 | # Custom utility packages
22 | sys.path.append('../../tools/')
23 | import loader
24 | import pandas_util
25 | import custom_bm25 as bm25
26 | from feat_utils import try_divide, dump_feat_name
27 |
28 | # Open-source packages
29 | import nltk
30 | import gensim
31 | from gensim.models import Word2Vec
32 | from gensim.models.word2vec import LineSentence
33 | from gensim import corpora, models, similarities
34 | from gensim.similarities import SparseMatrixSimilarity
35 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
36 |
37 | # Set random seed
38 | SEED = 2020
39 |
40 | input_root_path = '../../input/'
41 | output_root_path = '../../feat/'
42 |
43 | postfix = '31-50'
44 | file_type = 'ftr'
45 |
46 | # Output paths for the features generated by this script
47 | tr_fea_out_path = output_root_path + 'tr_fea_{}.{}'.format(postfix, file_type)
48 | te_fea_out_path = output_root_path + 'te_fea_{}.{}'.format(postfix, file_type)
49 |
50 | # Full training data after merging the current features with the previous features
51 | tr_out_path = output_root_path + 'tr_s0_{}.{}'.format(postfix, file_type)
52 | te_out_path = output_root_path + 'te_s0_{}.{}'.format(postfix, file_type)
53 |
54 | ID_NAMES = ['description_id', 'paper_id']
55 | PROCESS_NUM = 15
56 |
57 | # load data
58 | ts = time.time()
59 | dictionary = corpora.Dictionary.load('../../feat/corpus.dict')
60 | tfidf = models.TfidfModel.load('../../feat/tfidf.model')
61 |
62 | print ('load data completed, cost {}s'.format(np.round(time.time() - ts, 2)))
63 |
64 | def sum_score(x, y):
65 | return max(x, 0) + max(y, 0)
66 |
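# The distance functions below operate on gensim-style sparse vectors, i.e. lists of
# (token_id, weight) tuples as returned by tfidf[dictionary.doc2bow(...)];
# each function returns -1 when either vector is missing.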
67 | def cos_dis(vec_x, vec_y, norm=False):
68 | if vec_x == None or vec_y == None:
69 | return -1
70 | dic_x = {v[0]: v[1] for v in vec_x}
71 | dic_y = {v[0]: v[1] for v in vec_y}
72 |
73 | dot_prod = 0
74 | for k, x in dic_x.items():
75 | y = dic_y.get(k, 0)
76 | dot_prod += x * y
77 | norm_x = np.linalg.norm([v[1] for v in vec_x])
78 | norm_y = np.linalg.norm([v[1] for v in vec_y])
79 |
80 | cos = dot_prod / (norm_x * norm_y)
81 | return 0.5 * cos + 0.5 if norm else cos # normalize to the [0, 1] range
82 |
83 | def eucl_dis(vec_x, vec_y):
84 | if vec_x == None or vec_y == None:
85 | return -1
86 | dic_x = {v[0]: v[1] for v in vec_x}
87 | dic_y = {v[0]: v[1] for v in vec_y}
88 | lis_i = list(set(list(dic_x.keys()) + list(dic_y.keys())))
89 | squa_sum = 0
90 | for i in lis_i:
91 | x, y = dic_x.get(i, 0), dic_y.get(i, 0)
92 | squa_sum += np.square(x - y)
93 | return np.sqrt(squa_sum)
94 |
95 | def manh_dis(vec_x, vec_y):
96 | if vec_x == None or vec_y == None:
97 | return -1
98 | dic_x = {v[0]: v[1] for v in vec_x}
99 | dic_y = {v[0]: v[1] for v in vec_y}
100 | lis_i = list(set(list(dic_x.keys()) + list(dic_y.keys())))
101 | abs_sum = 0
102 | for i in lis_i:
103 | x, y = dic_x.get(i, 0), dic_y.get(i, 0)
104 | abs_sum += np.abs(x - y)
105 | return abs_sum
106 |
107 | def get_bm25_corp(quer, paper_id):
108 | quer_vec = dictionary.doc2bow(quer.split(' '))
109 | corp_score = bm25_corp.get_score(quer_vec, paper_ids.index(paper_id))
110 | return corp_score
111 |
112 | def get_bm25_abst(quer, paper_id):
113 | quer_vec = dictionary.doc2bow(quer.split(' '))
114 | abst_score = bm25_abst.get_score(quer_vec, paper_ids.index(paper_id))
115 | return abst_score
116 |
117 | def get_bm25_titl(quer, paper_id):
118 | quer_vec = dictionary.doc2bow(quer.split(' '))
119 | titl_score = bm25_titl.get_score(quer_vec, paper_ids.index(paper_id))
120 | return titl_score
121 |
122 | def single_process_feat(params=None):
123 | ts = time.time()
124 | (df, i) = params
125 |
126 | ts = time.time()
127 | print (i, 'start', datetime.now())
128 | # tfidf vec dis
129 | df['quer_key_vec'] = df['quer_key'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
130 | df['quer_all_vec'] = df['quer_all'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
131 | df['titl_vec'] = df['titl'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
132 | df['abst_vec'] = df['abst'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
133 | df['corp_vec'] = df['corp'].progress_apply(lambda s: tfidf[dictionary.doc2bow(s.split(' '))])
134 | print (i, 'load vec completed, cost {}s'.format(np.round(time.time() - ts, 2)))
135 |
136 | ts = time.time()
137 | vec_type = 'tfidf'
138 | for vec_x in ['quer_key', 'quer_all']:
139 | for vec_y in ['abst', 'titl', 'corp']:
140 | df['{}_{}_{}_cos_dis'.format(vec_x, vec_type, vec_y)] = df.progress_apply(lambda row: \
141 | cos_dis(row['{}_vec'.format(vec_x)], row['{}_vec'.format(vec_y)]), axis=1)
142 | df['{}_{}_{}_eucl_dis'.format(vec_x, vec_type, vec_y)] = df.progress_apply(lambda row: \
143 | eucl_dis(row['{}_vec'.format(vec_x)], row['{}_vec'.format(vec_y)]), axis=1)
144 | df['{}_{}_{}_manh_dis'.format(vec_x, vec_type, vec_y)] = df.progress_apply(lambda row: \
145 | manh_dis(row['{}_vec'.format(vec_x)], row['{}_vec'.format(vec_y)]), axis=1)
146 |
147 | print (i, vec_x, 'tfidf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
148 |
149 | del_cols = [col for col in df.columns if df[col].dtype == 'O' and col not in ID_NAMES]
150 | print ('del cols', del_cols)
151 | df.drop(del_cols, axis=1, inplace=True)
152 | return df
153 |
154 | def partition(df, num):
155 | df_partitions, step = [], int(np.ceil(df.shape[0]/num))
156 | for i in range(0, df.shape[0], step):
157 | df_partitions.append(df.iloc[i:i+step])
158 | return df_partitions
159 |
160 | def multi_process_feat(df):
161 | pool = Pool(PROCESS_NUM)
162 | df = df[ID_NAMES + ['quer_key', 'quer_all', 'abst', 'titl', 'corp']]
163 | df_parts = partition(df, PROCESS_NUM)
164 | print ('{} processes init and partition to {} parts' \
165 | .format(PROCESS_NUM, PROCESS_NUM))
166 | ts = time.time()
167 |
168 | param_list = [(df_parts[i], i) \
169 | for i in range(PROCESS_NUM)]
170 | dfs = pool.map(single_process_feat, param_list)
171 | df_out = pd.concat(dfs, axis=0)
172 | return df_out
173 |
174 | def gen_samples(paper, tr_desc_path, tr_recall_path, fea_out_path):
175 | tr_desc = loader.load_df(tr_desc_path)
176 | tr = loader.load_df(tr_recall_path)
177 | # tr = tr.head(1000)
178 |
179 | tr = tr.merge(paper, on=['paper_id'], how='left')
180 | tr = tr.merge(tr_desc[['description_id', 'quer_key', 'quer_all']], on=['description_id'], how='left')
181 |
182 | print (tr.columns)
183 | print (tr.head())
184 |
185 | tr_feat = multi_process_feat(tr)
186 | loader.save_df(tr_feat, fea_out_path)
187 |
188 | tr = tr.merge(tr_feat, on=ID_NAMES, how='left')
189 | del_cols = [col for col in tr.columns if tr[col].dtype == 'O' and col not in ID_NAMES]
190 | print ('tr del cols', del_cols)
191 | return tr.drop(del_cols, axis=1)
192 |
193 |
194 | # Add vector similarity features
195 |
196 | if __name__ == "__main__":
197 |
198 | ts = time.time()
199 | tqdm.pandas()
200 | print('start time: %s' % datetime.now())
201 | paper = loader.load_df('../../input/paper_input_final.ftr')
202 | paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
203 | paper['corp'] = paper['abst'] + ' ' + paper['titl'] + ' ' + paper['keywords'].fillna('').replace(';', ' ')
204 |
205 | tr_desc_path = '../../input/tr_input_final.ftr'
206 | te_desc_path = '../../input/te_input_final.ftr'
207 |
208 | tr_recall_path = '../../feat/tr_s0_30-50.ftr'
209 | te_recall_path = '../../feat/te_s0_30-50.ftr'
210 |
211 | tr = gen_samples(paper, tr_desc_path, tr_recall_path, tr_fea_out_path)
212 | print (tr.columns)
213 | print ([col for col in tr.columns if tr[col].dtype == 'O'])
214 | loader.save_df(tr, tr_out_path)
215 |
216 | te = gen_samples(paper, te_desc_path, te_recall_path, te_fea_out_path)
217 | print (te.columns)
218 | loader.save_df(te, te_out_path)
219 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
220 |
221 |
222 |
223 |
224 |
--------------------------------------------------------------------------------
/src/feature/feat37-pairwise.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Generate pairwise ranking features from the first-stage LGB predictions
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import pickle
12 | from datetime import datetime
13 | from tqdm import tqdm
14 |
15 | # Data processing
16 | import numpy as np
17 | import pandas as pd
18 | from tqdm import tqdm
19 | from multiprocessing import Pool
20 |
21 | # Custom utility packages
22 | sys.path.append('../../tools/')
23 | import loader
24 | import pandas_util
25 | import custom_bm25 as bm25
26 | from feat_utils import try_divide, dump_feat_name
27 |
28 | # Open-source packages
29 | import nltk
30 | import gensim
31 | from gensim.models import Word2Vec
32 | from gensim.models.word2vec import LineSentence
33 | from gensim import corpora, models, similarities
34 | from gensim.similarities import SparseMatrixSimilarity
35 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
36 |
37 | # Set random seed
38 | SEED = 2020
39 |
40 | input_root_path = '../../input/'
41 | output_root_path = '../../feat/'
42 |
43 | FEA_NUM = '37'
44 | postfix = 's0_{}'.format(FEA_NUM)
45 | file_type = 'ftr'
46 |
47 | # Output paths for the features generated by this script
48 | tr_fea_out_path = output_root_path + 'tr_fea_{}.{}'.format(postfix, file_type)
49 | te_fea_out_path = output_root_path + 'te_fea_{}.{}'.format(postfix, file_type)
50 |
51 | # Full training data after merging the current features with the previous features
52 | tr_out_path = output_root_path + 'tr_{}.{}'.format(postfix, file_type)
53 | te_out_path = output_root_path + 'te_{}.{}'.format(postfix, file_type)
54 |
55 | ID_NAMES = ['description_id', 'paper_id']
56 | PROCESS_NUM = 20
57 |
58 | # load data
59 | ts = time.time()
60 |
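# Pairwise features built from the first-stage LGB predictions (lgb_m3_32-50-0):
# per-description prediction rank, top-1/2/3/5 score statistics, and the gap
# between each candidate's score and those statistics.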
61 | def feat_extract(df, is_te=False):
62 | if is_te:
63 | df_pred = loader.load_df('../../output/m3/lgb_m3_32-50-0/lgb_m3_32-50-0.ftr')
64 | else:
65 | df_pred = loader.load_df('../../output/m3/lgb_m3_32-50-0/lgb_m3_32-50-0_cv.ftr')
66 | df_pred = df_pred[ID_NAMES + ['target']]
67 |
68 | df_pred = df_pred.sort_values(by=['target'], ascending=False)
69 | df_pred['pred_rank'] = df_pred.groupby(['description_id']).cumcount().values
70 | df_pred = df_pred.sort_values(by=['description_id', 'target'])
71 | print (df_pred.shape)
72 | print (df_pred.head(10))
73 |
74 | pred_top1 = df_pred[df_pred['pred_rank'] == 0] \
75 | .drop_duplicates(subset='description_id', keep='first')
76 | pred_top1 = pred_top1[['description_id', 'target']]
77 | pred_top1.columns = ['description_id', 'top1_pred']
78 |
79 | pred_top2 = df_pred[df_pred['pred_rank'] < 2]
80 | pred_top2['top2_pred_avg'] = pred_top2.groupby('description_id')['target'].transform('mean')
81 | pred_top2['top2_pred_std'] = pred_top2.groupby('description_id')['target'].transform('std')
82 | pred_top2 = pred_top2[['description_id', 'top2_pred_avg', \
83 | 'top2_pred_std']].drop_duplicates(subset=['description_id'])
84 |
85 | pred_top3 = df_pred[df_pred['pred_rank'] < 3]
86 | pred_top3['top3_pred_avg'] = pred_top3.groupby('description_id')['target'].transform('mean')
87 | pred_top3['top3_pred_std'] = pred_top3.groupby('description_id')['target'].transform('std')
88 | pred_top3 = pred_top3[['description_id', 'top3_pred_avg', \
89 | 'top3_pred_std']].drop_duplicates(subset=['description_id'])
90 |
91 | pred_top5 = df_pred[df_pred['pred_rank'] < 5]
92 | pred_top5['top5_pred_avg'] = pred_top5.groupby('description_id')['target'].transform('mean')
93 | pred_top5['top5_pred_std'] = pred_top5.groupby('description_id')['target'].transform('std')
94 | pred_top5 = pred_top5[['description_id', 'top5_pred_avg', \
95 | 'top5_pred_std']].drop_duplicates(subset=['description_id'])
96 |
97 | df_pred.rename(columns={'target': 'pred'}, inplace=True)
98 | df = df.merge(df_pred, on=ID_NAMES, how='left')
99 | df = df.merge(pred_top1, on=['description_id'], how='left')
100 | df = df.merge(pred_top2, on=['description_id'], how='left')
101 | df = df.merge(pred_top3, on=['description_id'], how='left')
102 | df = df.merge(pred_top5, on=['description_id'], how='left')
103 |
104 | df['pred_sub_top1'] = df['pred'] - df['top1_pred']
105 | df['pred_sub_top2_avg'] = df['pred'] - df['top2_pred_avg']
106 | df['pred_sub_top3_avg'] = df['pred'] - df['top3_pred_avg']
107 | df['pred_sub_top5_avg'] = df['pred'] - df['top5_pred_avg']
108 |
109 | del_cols = ['paper_id', 'pred', 'pred_rank']
110 | df.drop(del_cols, axis=1, inplace=True)
111 | df_feat = df.drop_duplicates(subset=['description_id'])
112 |
113 | print ('df_feat info')
114 | print (df_feat.shape)
115 | print (df_feat.head())
116 | print (df_feat.columns.tolist())
117 |
118 | return df_feat
119 |
120 | def output_fea(tr, te):
121 | print (tr.head())
122 | print (te.head())
123 |
124 | loader.save_df(tr, tr_fea_out_path)
125 | loader.save_df(te, te_fea_out_path)
126 |
127 | def gen_fea():
128 | tr = loader.load_df('../../feat/tr_s0_32-50.ftr')
129 | te = loader.load_df('../../feat/te_s0_32-50.ftr')
130 |
131 | tr_feat = feat_extract(tr[ID_NAMES])
132 | te_feat = feat_extract(te[ID_NAMES], is_te=True)
133 |
134 | tr = tr[ID_NAMES].merge(tr_feat, on=['description_id'], how='left')
135 | te = te[ID_NAMES].merge(te_feat, on=['description_id'], how='left')
136 |
137 | print (tr.shape, te.shape)
138 | print (tr.head())
139 | print (te.head())
140 | print (tr.columns)
141 |
142 | output_fea(tr, te)
143 |
144 | # Merge existing features
145 | def merge_fea(tr_list, te_list):
146 | tr = loader.merge_fea(tr_list, primary_keys=ID_NAMES)
147 | te = loader.merge_fea(te_list, primary_keys=ID_NAMES)
148 |
149 | print (tr.head())
150 | print (te.head())
151 | print (tr.columns.tolist())
152 |
153 | loader.save_df(tr, tr_out_path)
154 | loader.save_df(te, te_out_path)
155 |
156 | if __name__ == "__main__":
157 |
158 | print('start time: %s' % datetime.now())
159 | root_path = '../../feat/'
160 | base_tr_path = root_path + 'tr_s0_32-50.ftr'
161 | base_te_path = root_path + 'te_s0_32-50.ftr'
162 |
163 | gen_fea()
164 |
165 | # merge fea
166 | prefix = 's0'
167 | fea_list = [FEA_NUM]
168 |
169 | tr_list = [base_tr_path] + \
170 | [root_path + 'tr_fea_{}_{}.ftr'.format(prefix, i) for i in fea_list]
171 | te_list = [base_te_path] + \
172 | [root_path + 'te_fea_{}_{}.ftr'.format(prefix, i) for i in fea_list]
173 |
174 | merge_fea(tr_list, te_list)
175 |
176 | print('all completed: %s' % datetime.now())
177 |
178 |
179 |
--------------------------------------------------------------------------------
/src/feature/feat38-stk.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Generate stacking features from the base models' prediction probabilities
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import pickle
12 | from datetime import datetime
13 | from tqdm import tqdm
14 |
15 | # Data processing
16 | import numpy as np
17 | import pandas as pd
18 | from tqdm import tqdm
19 | from multiprocessing import Pool
20 |
21 | # Custom utility packages
22 | sys.path.append('../../tools/')
23 | import loader
24 | import pandas_util
25 | import custom_bm25 as bm25
26 | from nlp_preprocess import preprocess
27 | from feat_utils import try_divide, dump_feat_name
28 |
29 | # Open-source packages
30 | import nltk
31 | import gensim
32 | from gensim.models import Word2Vec
33 | from gensim.models.word2vec import LineSentence
34 | from gensim import corpora, models, similarities
35 | from gensim.similarities import SparseMatrixSimilarity
36 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
37 |
38 | # Set random seed
39 | SEED = 2020
40 |
41 | input_root_path = '../../input/'
42 | output_root_path = '../../feat/'
43 |
44 | FEA_NUM = 38
45 |
46 | postfix = 's0_{}'.format(FEA_NUM)
47 | file_type = 'ftr'
48 |
49 | # Output paths for the features generated by this script
50 | tr_fea_out_path = output_root_path + 'tr_fea_{}.{}'.format(postfix, file_type)
51 | te_fea_out_path = output_root_path + 'te_fea_{}.{}'.format(postfix, file_type)
52 |
53 | # Full training data after merging the current features with the previous features
54 | tr_out_path = output_root_path + 'tr_{}.{}'.format(postfix, file_type)
55 | te_out_path = output_root_path + 'te_{}.{}'.format(postfix, file_type)
56 |
57 | ID_NAMES = ['description_id', 'paper_id']
58 | PROCESS_NUM = 20
59 |
60 | # load data
61 | ts = time.time()
62 |
63 | def feat_extract(tr_path, te_path, prefix):
64 | tr_sample = loader.load_df('../../feat/tr_s0_37.ftr')
65 | te_sample = loader.load_df('../../feat/te_s0_37.ftr')
66 |
67 | tr = loader.load_df(tr_path)
68 | te = loader.load_df(te_path)
69 |
70 | del_cols = ['label']
71 | del_cols = [col for col in tr.columns if col in del_cols]
72 | tr.drop(del_cols, axis=1, inplace=True)
73 |
74 | tr = tr_sample[ID_NAMES].merge(tr, on=ID_NAMES, how='left')
75 | te = te_sample[ID_NAMES].merge(te, on=ID_NAMES, how='left')
76 |
77 | tr.columns = ID_NAMES + [prefix]
78 | te.columns = ID_NAMES + [prefix]
79 |
80 | print (prefix)
81 | print (tr.shape, te.shape)
82 | print (tr.head())
83 |
84 | tr = tr[prefix]
85 | te = te[prefix]
86 |
87 | return tr, te
88 |
89 | def output_fea(tr, te):
90 | print (tr.head())
91 | print (te.head())
92 |
93 | loader.save_df(tr, tr_fea_out_path)
94 | loader.save_df(te, te_fea_out_path)
95 |
96 | # Generate stacking features: collect out-of-fold / test probabilities from the base models listed below
97 | def gen_fea(base_tr_path=None, base_te_path=None):
98 |
99 | tr_sample = loader.load_df('../../feat/tr_s0_37.ftr')
100 | te_sample = loader.load_df('../../feat/te_s0_37.ftr')
101 |
102 | prefixs = ['m1_cat_03', 'm1_infesent_simple', 'm1_nn_02', \
103 | 'm2_ESIM_001', 'm2_ESIMplus_001', 'lgb_m3_37-0']
104 |
105 | tr_paths = ['{}_tr.ftr'.format(prefix) for prefix in prefixs]
106 | te_paths = ['final_{}_te.ftr'.format(prefix) for prefix in prefixs]
107 |
108 | tr_paths = ['../../stk_feat/{}'.format(p) for p in tr_paths]
109 | te_paths = ['../../stk_feat/{}'.format(p) for p in te_paths]
110 |
111 |
112 | trs, tes = [], []
113 | for i, prefix in enumerate(prefixs):
114 | tr, te = feat_extract(tr_paths[i], te_paths[i], prefix + '_prob')
115 | trs.append(tr)
116 | tes.append(te)
117 | tr = pd.concat([tr_sample[ID_NAMES]] + trs, axis=1)
118 | te = pd.concat([te_sample[ID_NAMES]] + tes, axis=1)
119 |
120 | float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
121 | tr[float_cols] = tr[float_cols].astype('float32')
122 | te[float_cols] = te[float_cols].astype('float32')
123 |
124 | print (tr.shape, te.shape)
125 | print (tr.head())
126 | print (te.head())
127 | print (tr.columns)
128 |
129 | output_fea(tr, te)
130 |
131 | # Merge existing features
132 | def merge_fea(tr_list, te_list):
133 | tr = loader.merge_fea(tr_list, primary_keys=ID_NAMES)
134 | te = loader.merge_fea(te_list, primary_keys=ID_NAMES)
135 |
136 | print (tr.head())
137 | print (te.head())
138 | print (tr.columns.tolist())
139 |
140 | loader.save_df(tr, tr_out_path)
141 | loader.save_df(te, te_out_path)
142 |
143 | if __name__ == "__main__":
144 |
145 | print('start time: %s' % datetime.now())
146 | root_path = '../../feat/'
147 | base_tr_path = root_path + 'tr_s0_37.ftr'
148 | base_te_path = root_path + 'te_s0_37.ftr'
149 |
150 | gen_fea()
151 |
152 | # merge fea
153 | prefix = 's0'
154 | fea_list = [FEA_NUM]
155 |
156 | tr_list = [base_tr_path] + \
157 | [root_path + 'tr_fea_{}_{}.ftr'.format(prefix, i) for i in fea_list]
158 | te_list = [base_te_path] + \
159 | [root_path + 'te_fea_{}_{}.ftr'.format(prefix, i) for i in fea_list]
160 |
161 | merge_fea(tr_list, te_list)
162 |
163 | print('all completed: %s' % datetime.now())
164 |
165 |
166 |
167 |
--------------------------------------------------------------------------------
/src/feature/gen_samples.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | import warnings
5 | warnings.filterwarnings('always')
6 | warnings.filterwarnings('ignore')
7 |
8 | # Basic modules
9 | import os
10 | import sys
11 | import time
12 | from datetime import datetime
13 | from tqdm import tqdm
14 |
15 | # Data processing
16 | import numpy as np
17 | import pandas as pd
18 |
19 | # Custom utility packages
20 | sys.path.append('../../tools/')
21 | import loader
22 | import pandas_util
23 |
24 | # Open-source packages
25 | from gensim.models import Word2Vec
26 | from gensim.models.word2vec import LineSentence
27 | from gensim import corpora, models, similarities
28 | from gensim.similarities import SparseMatrixSimilarity
29 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
30 |
31 | # Set random seed
32 | SEED = 2020
33 |
34 | def topk_lines(df, k):
35 | df.loc[:, 'rank'] = df.groupby(['description_id']).cumcount().values
36 | df = df[df['rank'] < k]
37 | df.drop(['rank'], axis=1, inplace=True)
38 | return df
39 |
40 | def process(in_path, k):
41 | ID_NAMES = ['description_id', 'paper_id']
42 |
43 | df = loader.load_df(in_path)
44 | df = topk_lines(df, k)
45 | df['sim_score'] = df['sim_score'].astype('float')
46 | df.rename(columns={'sim_score': 'corp_sim_score'}, inplace=True)
47 | return df
48 |
49 |
50 | if __name__ == "__main__":
51 |
52 | ts = time.time()
53 | tr_path = '../../feat/tr_tfidf_30.ftr'
54 | te_path = '../../feat/te_tfidf_30.ftr'
55 |
56 | cv = loader.load_df('../../input/cv_ids_0109.csv')[['description_id', 'cv']]
57 |
58 | tr = process(tr_path, k=50)
59 | tr = tr.merge(cv, on=['description_id'], how='left')
60 |
61 | te = process(te_path, k=50)
62 | te['cv'] = 0
63 |
64 | loader.save_df(tr, '../../feat/tr_samples_30-50.ftr')
65 | loader.save_df(te, '../../feat/te_samples_30-50.ftr')
66 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
67 |
68 |
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/src/feature/tfidf_recall_30.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # bm25 recall
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import functools
12 | from tqdm import tqdm
13 | from six import iteritems
14 | from datetime import datetime
15 |
16 | # Data processing
17 | import re
18 | import math
19 | import pickle
20 | import numpy as np
21 | import pandas as pd
22 | from multiprocessing import Pool
23 |
24 | # Custom utility packages
25 | sys.path.append('../../tools/')
26 | import loader
27 | import pandas_util
28 | import custom_bm25 as bm25
29 |
30 | # Open-source packages
31 | from gensim.models import Word2Vec
32 | from gensim.models.word2vec import LineSentence
33 | from gensim import corpora, models, similarities
34 | from gensim.similarities import SparseMatrixSimilarity
35 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
36 |
37 | # Set random seed
38 | SEED = 2020
39 | PROCESS_NUM, PARTITION_NUM = 18, 18
40 |
41 | input_root_path = '../../input/'
42 | output_root_path = '../../feat/'
43 |
44 | postfix = '30'
45 | file_type = 'ftr'
46 |
47 | train_out_path = output_root_path + 'tr_tfidf_{}.{}'.format(postfix, file_type)
48 | test_out_path = output_root_path + 'te_tfidf_{}.{}'.format(postfix, file_type)
49 |
50 | def topk_sim_samples(desc, desc_ids, paper_ids, bm25_model, k=10):
51 | desc_id2papers = {}
52 | for desc_i in tqdm(range(len(desc))):
53 | query_vec, query_desc_id = desc[desc_i], desc_ids[desc_i]
54 | sims = bm25_model.get_scores(query_vec)
55 | sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
56 | sim_papers = [paper_ids[val[0]] for val in sort_sims[:k]]
57 | sim_scores = [str(val[1]) for val in sort_sims[:k]]
58 | desc_id2papers[query_desc_id] = ['|'.join(sim_papers), '|'.join(sim_scores)]
59 | sim_df = pd.DataFrame.from_dict(desc_id2papers, orient='index', columns=['paper_id', 'sim_score'])
60 | sim_df = sim_df.reset_index().rename(columns={'index':'description_id'})
61 | return sim_df
62 |
63 | def partition(queries, num):
64 | queries_partitions, step = [], int(np.ceil(len(queries)/num))
65 | for i in range(0, len(queries), step):
66 | queries_partitions.append(queries[i:i+step])
67 | return queries_partitions
68 |
69 | def single_process_search(params=None):
70 | (query_vecs, desc_ids, paper_ids, bm25_model, k, i) = params
71 | print (i, 'start', datetime.now())
72 | gc.collect()
73 | sim_df = topk_sim_samples(query_vecs, desc_ids, paper_ids, bm25_model, k)
74 | print (i, 'completed', datetime.now())
75 | return sim_df
76 |
77 | def multi_process_search(query_vecs, desc_ids, paper_ids, bm25_model, k):
78 | pool = Pool(PROCESS_NUM)
79 | queries_parts = partition(query_vecs, PARTITION_NUM)
80 | desc_ids_parts = partition(desc_ids, PARTITION_NUM)
81 | print ('{} processes init and partition to {} parts' \
82 | .format(PROCESS_NUM, PARTITION_NUM))
83 |
84 | param_list = [(queries_parts[i], desc_ids_parts[i], \
85 | paper_ids, bm25_model, k, i) for i in range(PARTITION_NUM)]
86 | sim_dfs = pool.map(single_process_search, param_list)
87 | sim_df = pd.concat(sim_dfs, axis=0)
88 | return sim_df
89 |
90 | def gen_samples(df, desc, desc_ids, corpus_list, paper_ids_list, k):
91 | df_samples_list = []
92 | for i, corpus in enumerate(corpus_list):
93 | bm25_model = bm25.BM25(corpus[0])
94 | cur_df_sample = multi_process_search(desc, desc_ids, \
95 | paper_ids_list[i], bm25_model, k)
96 | cur_df_sample_out = pandas_util.explode(cur_df_sample, ['paper_id', 'sim_score'])
97 | cur_df_sample_out['type'] = corpus[1] # recall_name
98 | df_samples_list.append(cur_df_sample_out)
99 | df_samples = pd.concat(df_samples_list, axis=0)
100 | df_samples.drop_duplicates(subset=['description_id', 'paper_id'], inplace=True)
101 | df_samples['target'] = 0
102 | return df_samples
103 |
104 | if __name__ == "__main__":
105 |
106 | ts = time.time()
107 | tqdm.pandas()
108 | print('start time: %s' % datetime.now())
109 | # load data
110 | df = loader.load_df(input_root_path + 'paper_input_final.ftr')
111 | df = df[~pd.isnull(df['paper_id'])]
112 |
113 | # gen tfidf vecs
114 | dictionary = pickle.load(open('../../feat/corpus.dict', 'rb'))
115 | print ('dic len', len(dictionary))
116 |
117 | df['corp'] = df['abst'] + ' ' + df['titl'] + ' ' + df['keywords'].fillna('').replace(';', ' ')
118 | df_corp, corp_paper_ids = [dictionary.doc2bow(line.split(' ')) for line in df['corp'].tolist()], \
119 | df['paper_id'].tolist()
120 |
121 | # gen topk sim samples
122 | paper_ids_list = [corp_paper_ids]
123 | corpus_list = [(df_corp, 'corp_bm25')]
124 | out_cols = ['description_id', 'paper_id', 'sim_score', 'target', 'type']
125 |
126 | if sys.argv[1] in ['tr']:
127 | # for tr ins
128 | tr = loader.load_df(input_root_path + 'tr_input_final.ftr')
129 | tr = tr[~pd.isnull(tr['description_id'])]
130 |
131 | # tr = tr.head(1000)
132 | tr_desc, tr_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in tr['quer_all'].tolist()], \
133 | tr['description_id'].tolist()
134 | print ('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
135 |
136 | tr_samples = gen_samples(tr, tr_desc, tr_desc_ids, \
137 | corpus_list, paper_ids_list, k=50)
138 | tr_samples = tr.rename(columns={'paper_id': 'target_paper_id'}) \
139 | .merge(tr_samples, on='description_id', how='left')
140 | tr_samples.loc[tr_samples['target_paper_id'] == tr_samples['paper_id'], 'target'] = 1
141 | loader.save_df(tr_samples[out_cols], train_out_path)
142 | print ('recall succ {} from {}'.format(tr_samples['target'].sum(), tr.shape[0]))
143 | print (tr.shape, tr_samples.shape)
144 |
145 | if sys.argv[1] in ['te']:
146 | # for te ins
147 | te = loader.load_df(input_root_path + 'te_input_final.ftr')
148 | te = te[~pd.isnull(te['description_id'])]
149 |
150 | # te = te.head(1000)
151 | te_desc, te_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in te['quer_all'].tolist()], \
152 | te['description_id'].tolist()
153 | print ('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
154 |
155 | te_samples = gen_samples(te, te_desc, te_desc_ids, \
156 | corpus_list, paper_ids_list, k=50)
157 | te_samples = te.merge(te_samples, on='description_id', how='left')
158 | loader.save_df(te_samples[out_cols], test_out_path)
159 | print (te.shape, te_samples.shape)
160 |
161 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
162 |
163 |
164 |
165 |
--------------------------------------------------------------------------------
/src/rank/m1/catboost03.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import datetime
10 | from catboost import CatBoostClassifier
11 | from time import time
12 | from tqdm import tqdm_notebook as tqdm
13 |
14 |
15 | # In[2]:
16 |
17 |
18 | feat_dir = "../../../feat/"
19 | input_dir = "../../../input/"
20 | cv_id = pd.read_csv("../../../input/cv_ids_0109.csv")
21 |
22 |
23 | # In[3]:
24 |
25 |
26 | train = pd.read_feather(f'{feat_dir}/tr_s0_32-50.ftr')
27 | train.drop(columns=['cv'],axis=1,inplace=True)
28 | train = train.merge(cv_id,on=['description_id'],how='left')
29 | train = train.dropna(subset=['cv']).reset_index(drop=True)
30 | # test = pd.read_feather(f'{feat_dir}/te_s0_20-50.ftr')
31 | test = pd.read_feather(f'{feat_dir}/te_s0_32-50.ftr')
32 |
33 |
34 | # In[4]:
35 |
36 |
37 | ID_NAMES = ['description_id', 'paper_id']
38 | TARGET_NAME = 'target'
39 |
40 |
41 | # In[5]:
42 |
43 |
44 | def get_feas(data):
45 | cols = data.columns.tolist()
46 | del_cols = ID_NAMES + ['target', 'cv']
47 | #sub_cols = ['year', 'corp_cos', 'corp_eucl', 'corp_manh', 'quer_all']
48 | sub_cols = ['year', 'corp_sim_score']
49 | sub_cols = ['year', 'pos_of_corp', 'pos_of_abst', 'pos_of_titl']
50 | for col in data.columns:
51 | for sub_col in sub_cols:
52 | if sub_col in col:
53 | del_cols.append(col)
54 |
55 | cols = [val for val in cols if val not in del_cols]
56 | print ('del_cols', del_cols)
57 | return cols
58 |
59 |
60 | # In[6]:
61 |
62 |
63 | feas = get_feas(train)
64 |
65 |
66 | # In[7]:
67 |
68 |
69 | def make_classifier():
70 | clf = CatBoostClassifier(
71 | loss_function='Logloss',
72 | eval_metric="AUC",
73 | # task_type="CPU",
74 | learning_rate=0.1, ###0.01
75 | iterations=2500, ###2000
76 | od_type="Iter",
77 | # depth=8,
78 | thread_count=10,
79 | early_stopping_rounds=100, ###100
80 | # l2_leaf_reg=1,
81 | # border_count=96,
82 | random_seed=42
83 | )
84 |
85 | return clf
86 |
87 |
88 | # In[8]:
89 |
90 |
91 | # Open-source packages
92 | import ml_metrics as metrics
93 | def cal_map(pred_valid,cv,train_df,tr_data):
94 | df_pred = train_df[train_df['cv']==cv].copy()
95 | df_pred['pred'] = pred_valid
96 | df_pred = df_pred[['description_id','paper_id','pred']]
97 | sort_df_pred = df_pred.sort_values(['description_id', 'pred'], ascending=False)
98 | df_pred = df_pred[['description_id']].drop_duplicates() .merge(sort_df_pred, on=['description_id'], how='left')
99 | df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
100 | df_pred = df_pred[df_pred['rank'] < 3]
101 | df_pred = df_pred.groupby(['description_id'])['paper_id'] .apply(lambda s : ','.join((s))).reset_index()
102 | df_pred = df_pred.merge(tr_data, on=['description_id'], how='left')
103 | df_pred.rename(columns={'paper_id': 'paper_ids'}, inplace=True)
104 | df_pred['paper_ids'] = df_pred['paper_ids'].apply(lambda s: s.split(','))
105 | df_pred['target_id'] = df_pred['target_id'].apply(lambda s: [s])
106 | return metrics.mapk(df_pred['target_id'].tolist(), df_pred['paper_ids'].tolist(), 3)
107 |
108 |
109 | # In[9]:
110 |
111 |
112 | import os
113 | model_dir = "./m1_model/catboost03"
114 | if not os.path.exists(model_dir):
115 | os.makedirs(model_dir)
116 |
117 |
118 | # In[10]:
119 |
120 |
121 | tr_data = pd.read_csv(f'{input_dir}/train_release.csv')
122 | tr_data = tr_data[['description_id', 'paper_id']].rename(columns={'paper_id': 'target_id'})
123 |
124 |
125 | # In[13]:
126 |
127 |
128 | for fea in feas:
129 | if fea not in test.columns:
130 | print(fea)
131 |
132 |
133 | # In[14]:
134 |
135 |
136 | CV_RESULT_OUT=True
137 |
138 |
139 | # In[15]:
140 |
141 |
142 | def train_one_fold(type_train_df,type_test_df,model_dir,cv,pi=False):
143 | print(" fold " + str(cv))
144 | train_data = type_train_df[(type_train_df['cv']!=cv)]
145 | valid_data = type_train_df[(type_train_df['cv']==cv)]
146 |
147 | des_id = valid_data['description_id']
148 | paper_id = valid_data['paper_id']
149 |
150 | idx_train = train_data.index
151 | idx_val = valid_data.index
152 | des_id = valid_data['description_id']
153 | paper_id = valid_data['paper_id']
154 | model_name = "fold_{}_cbt_best.model".format(str(cv))
155 | model_name_wrt = os.path.join(model_dir,model_name)
156 | clf = make_classifier()
157 | imp=pd.DataFrame()
158 | if not os.path.exists(model_name_wrt):
159 | clf.fit(train_data[feas], train_data[['target']], eval_set=(valid_data[feas],valid_data[['target']]),
160 | use_best_model=True, verbose=100)
161 | clf.save_model(model_name_wrt)
162 | fea_ = clf.feature_importances_
163 | fea_name = clf.feature_names_
164 | imp = pd.DataFrame({'name':fea_name,'imp':fea_})
165 | else:
166 | clf.load_model(model_name_wrt)
167 | cv_predict=clf.predict_proba(valid_data[feas])[:,1]
168 | # print(cv_predict.shape)
169 | cv_score_fold = cal_map(cv_predict,cv,type_train_df,tr_data)
170 | if CV_RESULT_OUT:
171 | cv_preds = cv_predict
172 | rdf = pd.DataFrame()
173 | rdf = rdf.reindex(columns=['description_id','paper_id','pred'])
174 | rdf['description_id'] = des_id
175 | rdf['paper_id'] = paper_id
176 | rdf['pred'] = cv_preds
177 | test_des_id = type_test_df['description_id']
178 | test_paper_id = type_test_df['paper_id']
179 | test_preds = clf.predict_proba(type_test_df[feas])[:,1]
180 | test_df = pd.DataFrame()
181 | test_df = test_df.reindex(columns=['description_id','paper_id','pred'])
182 | test_df['description_id'] = test_des_id
183 | test_df['paper_id'] = test_paper_id
184 | test_df['pred'] = test_preds
185 | return rdf,test_df,cv_score_fold,imp
186 |
187 |
188 | # In[16]:
189 |
190 |
191 | kfold = 5
192 | type_scores = []
193 | type_cv_results = []
194 | type_test_results = []
195 | model_name = '../../../output/m1/catboost03/'
196 | fold_scores = []
197 | fold_cv_results = []
198 | fold_test_results = []
199 | imps=[]
200 | # test_preds = np.zeros(len(test))
201 | for cv in range(1,kfold+1):  # cv fold ids run from 1 to 5
202 | cv_df,test_df,cv_score,imp = train_one_fold(train,test,model_dir,cv)
203 | # fold_cv_results.append(cv_df)
204 | # fold_test_results.append(test_df)
205 | cv_df.to_csv(f"{model_name}_cv_{cv}.csv",index=False)
206 | test_df.to_csv(f"{model_name}_result_{cv}.csv",index=False)
207 | imp.to_csv(f"{model_name}_imp_{cv}.csv",index=False)
208 | print("fold {} finished".format(cv))
209 | print(cv_score)
210 | fold_scores.append(cv_score)
211 | imps.append(imp)
212 |
213 |
214 | # In[1]:
215 |
216 |
217 | np.mean(fold_scores)
218 |
219 | #0.35309347230573923
220 | #0.3522860689007414
221 | #0.3585175465159315
222 | #0.35720084429290466
223 | #0.34729405401751007
224 |
225 |
226 | # In[ ]:
227 |
228 |
229 | result = []
230 | for i in range(1,6):
231 | re_csv = f"{model_name}_result_{i}.csv"
232 | test_df = pd.read_csv(re_csv)
233 | result.append(test_df)
234 |
235 |
236 | # In[ ]:
237 |
238 |
239 | final_test = result[0].copy()
240 |
241 |
242 | # In[ ]:
243 |
244 |
245 | for i in range(1,5):
246 | final_test['pred']+=result[i]['pred']
247 |
248 |
249 | # In[ ]:
250 |
251 |
252 | final_test['pred'] = final_test['pred']/5
253 |
254 |
255 | # In[ ]:
256 |
257 |
258 | final_test.to_csv("../../../output/m1/nn02/te_catboost03newtest.csv",index=False)
259 |
260 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/.gitignore:
--------------------------------------------------------------------------------
1 | # Object files
2 | *.o
3 | *.ko
4 | *.obj
5 | *.elf
6 |
7 | # Precompiled Headers
8 | *.gch
9 | *.pch
10 |
11 | # Libraries
12 | *.lib
13 | *.a
14 | *.la
15 | *.lo
16 |
17 | # Shared objects (inc. Windows DLLs)
18 | *.dll
19 | *.so
20 | *.so.*
21 | *.dylib
22 |
23 | # Executables
24 | *.exe
25 | *.out
26 | *.app
27 | *.i*86
28 | *.x86_64
29 | *.hex
30 |
31 | # Debug files
32 | *.dSYM/
33 |
34 |
35 | build/*
36 | *.swp
37 |
38 | # OS X stuff
39 | ._*
40 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/.travis.yml:
--------------------------------------------------------------------------------
1 | language: c
2 | dist: trusty
3 | sudo: required
4 | before_install:
5 | - sudo apt-get install python2.7 python-numpy python-pip
6 | script: pip install numpy && ./demo.sh | tee results.txt && [[ `cat results.txt | egrep "Total accuracy. 2[23]" | wc -l` = "1" ]] && echo test-passed
7 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/Makefile:
--------------------------------------------------------------------------------
1 | CC = gcc
2 | #For older gcc, use -O3 or -O2 instead of -Ofast
3 | # CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wno-unused-result
4 | CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic
5 | BUILDDIR := build
6 | SRCDIR := src
7 |
8 | all: dir glove shuffle cooccur vocab_count
9 |
10 | dir :
11 | mkdir -p $(BUILDDIR)
12 | glove : $(SRCDIR)/glove.c
13 | $(CC) $(SRCDIR)/glove.c -o $(BUILDDIR)/glove $(CFLAGS)
14 | shuffle : $(SRCDIR)/shuffle.c
15 | $(CC) $(SRCDIR)/shuffle.c -o $(BUILDDIR)/shuffle $(CFLAGS)
16 | cooccur : $(SRCDIR)/cooccur.c
17 | $(CC) $(SRCDIR)/cooccur.c -o $(BUILDDIR)/cooccur $(CFLAGS)
18 | vocab_count : $(SRCDIR)/vocab_count.c
19 | $(CC) $(SRCDIR)/vocab_count.c -o $(BUILDDIR)/vocab_count $(CFLAGS)
20 |
21 | clean:
22 | rm -rf glove shuffle cooccur vocab_count build
23 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/README.md:
--------------------------------------------------------------------------------
1 | ## GloVe: Global Vectors for Word Representation
2 |
3 |
4 | | nearest neighbors of frog | Litoria | Leptodactylidae | Rana | Eleutherodactylus |
5 | | --- | --- | --- | --- | --- |
6 | | Pictures | (image) | (image) | (image) | (image) |
7 |
8 | | Comparisons | man -> woman | city -> zip | comparative -> superlative |
9 | | --- | --- | --- | --- |
10 | | GloVe Geometry | (image) | (image) | (image) |
11 |
12 | We provide an implementation of the GloVe model for learning word representations, and describe how to download web-dataset vectors or train your own. See the [project page](http://nlp.stanford.edu/projects/glove/) or the [paper](http://nlp.stanford.edu/pubs/glove.pdf) for more information on glove vectors.
13 |
14 | ## Download pre-trained word vectors
15 | The links below contain word vectors obtained from the respective corpora. If you want word vectors trained on massive web datasets, you need only download one of these text files! Pre-trained word vectors are made available under the Public Domain Dedication and License.
16 |
17 |
18 | - Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): glove.42B.300d.zip
19 | - Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): glove.840B.300d.zip
20 | - Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download): glove.6B.zip
21 | - Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB download): glove.twitter.27B.zip
22 |
23 |
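Once unzipped, each vector file is plain whitespace-separated text: a word followed by its vector components. A minimal loading sketch in Python (illustrative only, not part of the GloVe tools; assumes `glove.6B.300d.txt` is in the working directory):

    import numpy as np

    embeddings = {}
    with open('glove.6B.300d.txt', encoding='utf-8') as f:
        for line in f:
            word, *values = line.rstrip().split(' ')
            embeddings[word] = np.asarray(values, dtype='float32')  # 300-d vector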
24 |
25 | ## Train word vectors on a new corpus
26 |
27 |
28 |
29 | If the web datasets above don't match the semantics of your end use case, you can train word vectors on your own corpus.
30 |
31 | $ git clone http://github.com/stanfordnlp/glove
32 | $ cd glove && make
33 | $ ./demo.sh
34 |
35 | The demo.sh script downloads a small corpus, consisting of the first 100M characters of Wikipedia. It collects unigram counts, constructs and shuffles cooccurrence data, and trains a simple version of the GloVe model. It also runs a word analogy evaluation script in python to verify word vector quality. More details about training on your own corpus can be found by reading [demo.sh](https://github.com/stanfordnlp/GloVe/blob/master/demo.sh) or the [src/README.md](https://github.com/stanfordnlp/GloVe/tree/master/src)
36 |
37 | ### License
38 | All work contained in this package is licensed under the Apache License, Version 2.0. See the included LICENSE file.
39 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/demo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
5 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python
6 |
7 | make
8 | if [ ! -e text8 ]; then
9 | if hash wget 2>/dev/null; then
10 | wget http://mattmahoney.net/dc/text8.zip
11 | else
12 | curl -O http://mattmahoney.net/dc/text8.zip
13 | fi
14 | unzip text8.zip
15 | rm text8.zip
16 | fi
17 |
18 | CORPUS=../corpus.txt
19 | VOCAB_FILE=vocab.txt
20 | COOCCURRENCE_FILE=cooccurrence.bin
21 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
22 | BUILDDIR=build
23 | SAVE_FILE=vectors
24 | VERBOSE=2
25 | MEMORY=4.0
26 | VOCAB_MIN_COUNT=2
27 | VECTOR_SIZE=300
28 | MAX_ITER=15
29 | WINDOW_SIZE=15
30 | BINARY=2
31 | NUM_THREADS=8
32 | X_MAX=10
33 |
34 | echo
35 | echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
36 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE
37 | echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE"
38 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE
39 | echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
40 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
41 | echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE"
42 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE
43 | if [ "$CORPUS" = 'text8' ]; then
44 | if [ "$1" = 'matlab' ]; then
45 | matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2
46 | elif [ "$1" = 'octave' ]; then
47 | octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
48 | else
49 | echo "$ python eval/python/evaluate.py"
50 | python eval/python/evaluate.py
51 | fi
52 | fi
53 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/matlab/WordLookup.m:
--------------------------------------------------------------------------------
1 | function index = WordLookup(InputString)
2 | global wordMap
3 | if wordMap.isKey(InputString)
4 | index = wordMap(InputString);
5 | elseif wordMap.isKey('')
6 | index = wordMap('');
7 | else
8 | index = 0;
9 | end
10 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/matlab/evaluate_vectors.m:
--------------------------------------------------------------------------------
1 | function [BB] = evaluate_vectors(W)
2 |
3 | global wordMap
4 |
5 | filenames = {'capital-common-countries' 'capital-world' 'currency' 'city-in-state' 'family' 'gram1-adjective-to-adverb' ...
6 | 'gram2-opposite' 'gram3-comparative' 'gram4-superlative' 'gram5-present-participle' 'gram6-nationality-adjective' ...
7 | 'gram7-past-tense' 'gram8-plural' 'gram9-plural-verbs'};
8 | path = './eval/question-data/';
9 |
10 | split_size = 100; %to avoid memory overflow, could be increased/decreased depending on system and vocab size
11 |
12 | correct_sem = 0; %count correct semantic questions
13 | correct_syn = 0; %count correct syntactic questions
14 | correct_tot = 0; %count correct questions
15 | count_sem = 0; %count all semantic questions
16 | count_syn = 0; %count all syntactic questions
17 | count_tot = 0; %count all questions
18 | full_count = 0; %count all questions, including those with unknown words
19 |
20 | if wordMap.isKey('')
21 | unkkey = wordMap('');
22 | else
23 | unkkey = 0;
24 | end
25 |
26 | for j=1:length(filenames);
27 |
28 | clear dist;
29 |
30 | fid=fopen([path filenames{j} '.txt']);
31 | temp=textscan(fid,'%s%s%s%s');
32 | fclose(fid);
33 | ind1 = cellfun(@WordLookup,temp{1}); %indices of first word in analogy
34 | ind2 = cellfun(@WordLookup,temp{2}); %indices of second word in analogy
35 | ind3 = cellfun(@WordLookup,temp{3}); %indices of third word in analogy
36 | ind4 = cellfun(@WordLookup,temp{4}); %indices of answer word in analogy
37 | full_count = full_count + length(ind1);
38 | ind = (ind1 ~= unkkey) & (ind2 ~= unkkey) & (ind3 ~= unkkey) & (ind4 ~= unkkey); %only look at those questions which have no unknown words
39 | ind1 = ind1(ind);
40 | ind2 = ind2(ind);
41 | ind3 = ind3(ind);
42 | ind4 = ind4(ind);
43 | disp([filenames{j} ':']);
44 | mx = zeros(1,length(ind1));
45 | num_iter = ceil(length(ind1)/split_size);
46 | for jj=1:num_iter
47 | range = (jj-1)*split_size+1:min(jj*split_size,length(ind1));
48 | dist = full(W * (W(ind2(range),:)' - W(ind1(range),:)' + W(ind3(range),:)')); %cosine similarity if input W has been normalized
49 | for i=1:length(range)
50 | dist(ind1(range(i)),i) = -Inf;
51 | dist(ind2(range(i)),i) = -Inf;
52 | dist(ind3(range(i)),i) = -Inf;
53 | end
54 | [~, mx(range)] = max(dist); %predicted word index
55 | end
56 |
57 | val = (ind4 == mx'); %correct predictions
58 | count_tot = count_tot + length(ind1);
59 | correct_tot = correct_tot + sum(val);
60 | disp(['ACCURACY TOP1: ' num2str(mean(val)*100,'%-2.2f') '% (' num2str(sum(val)) '/' num2str(length(val)) ')']);
61 | if j < 6
62 | count_sem = count_sem + length(ind1);
63 | correct_sem = correct_sem + sum(val);
64 | else
65 | count_syn = count_syn + length(ind1);
66 | correct_syn = correct_syn + sum(val);
67 | end
68 |
69 | disp(['Total accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% Semantic accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% Syntactic accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '%']);
70 |
71 | end
72 | disp('________________________________________________________________________________');
73 | disp(['Questions seen/total: ' num2str(100*count_tot/full_count,'%-2.2f') '% (' num2str(count_tot) '/' num2str(full_count) ')']);
74 | disp(['Semantic Accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% (' num2str(correct_sem) '/' num2str(count_sem) ')']);
75 | disp(['Syntactic Accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '% (' num2str(correct_syn) '/' num2str(count_syn) ')']);
76 | disp(['Total Accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% (' num2str(correct_tot) '/' num2str(count_tot) ')']);
77 | BB = [100*correct_sem/count_sem 100*correct_syn/count_syn 100*correct_tot/count_tot];
78 |
79 | end
80 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/matlab/read_and_evaluate.m:
--------------------------------------------------------------------------------
1 | addpath('./eval/matlab');
2 | if(~exist('vocab_file'))
3 | vocab_file = 'vocab.txt';
4 | end
5 | if(~exist('vectors_file'))
6 | vectors_file = 'vectors.bin';
7 | end
8 |
9 | fid = fopen(vocab_file, 'r');
10 | words = textscan(fid, '%s %f');
11 | fclose(fid);
12 | words = words{1};
13 | vocab_size = length(words);
14 | global wordMap
15 | wordMap = containers.Map(words(1:vocab_size),1:vocab_size);
16 |
17 | fid = fopen(vectors_file,'r');
18 | fseek(fid,0,'eof');
19 | vector_size = ftell(fid)/16/vocab_size - 1;
20 | frewind(fid);
21 | WW = fread(fid, [vector_size+1 2*vocab_size], 'double')';
22 | fclose(fid);
23 |
24 | W1 = WW(1:vocab_size, 1:vector_size); % word vectors
25 | W2 = WW(vocab_size+1:end, 1:vector_size); % context (tilde) word vectors
26 |
27 | W = W1 + W2; %Evaluate on sum of word vectors
28 | W = bsxfun(@rdivide,W,sqrt(sum(W.*W,2))); %normalize vectors before evaluation
29 | evaluate_vectors(W);
30 | exit
31 |
32 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/octave/WordLookup_octave.m:
--------------------------------------------------------------------------------
1 | function index = WordLookup_octave(InputString)
2 | global wordMap
3 |
4 | if isfield(wordMap, InputString)
5 | index = wordMap.(InputString);
6 | elseif isfield(wordMap, '')
7 | index = wordMap.('');
8 | else
9 | index = 0;
10 | end
11 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/octave/evaluate_vectors_octave.m:
--------------------------------------------------------------------------------
1 | function [BB] = evaluate_vectors_octave(W)
2 |
3 | global wordMap
4 |
5 | filenames = {'capital-common-countries' 'capital-world' 'currency' 'city-in-state' 'family' 'gram1-adjective-to-adverb' ...
6 | 'gram2-opposite' 'gram3-comparative' 'gram4-superlative' 'gram5-present-participle' 'gram6-nationality-adjective' ...
7 | 'gram7-past-tense' 'gram8-plural' 'gram9-plural-verbs'};
8 | path = './eval/question-data/';
9 |
10 | split_size = 100; %to avoid memory overflow, could be increased/decreased depending on system and vocab size
11 |
12 | correct_sem = 0; %count correct semantic questions
13 | correct_syn = 0; %count correct syntactic questions
14 | correct_tot = 0; %count correct questions
15 | count_sem = 0; %count all semantic questions
16 | count_syn = 0; %count all syntactic questions
17 | count_tot = 0; %count all questions
18 | full_count = 0; %count all questions, including those with unknown words
19 |
20 |
21 | if isfield(wordMap, '')
22 | unkkey = wordMap.('');
23 | else
24 | unkkey = 0;
25 | end
26 |
27 | for j=1:length(filenames);
28 |
29 | clear dist;
30 |
31 | fid=fopen([path filenames{j} '.txt']);
32 | temp=textscan(fid,'%s%s%s%s');
33 | fclose(fid);
34 | ind1 = cellfun(@WordLookup_octave,temp{1}); %indices of first word in analogy
35 | ind2 = cellfun(@WordLookup_octave,temp{2}); %indices of second word in analogy
36 | ind3 = cellfun(@WordLookup_octave,temp{3}); %indices of third word in analogy
37 | ind4 = cellfun(@WordLookup_octave,temp{4}); %indices of answer word in analogy
38 | full_count = full_count + length(ind1);
39 | ind = (ind1 ~= unkkey) & (ind2 ~= unkkey) & (ind3 ~= unkkey) & (ind4 ~= unkkey); %only look at those questions which have no unknown words
40 | ind1 = ind1(ind);
41 | ind2 = ind2(ind);
42 | ind3 = ind3(ind);
43 | ind4 = ind4(ind);
44 | disp([filenames{j} ':']);
45 | mx = zeros(1,length(ind1));
46 | num_iter = ceil(length(ind1)/split_size);
47 | for jj=1:num_iter
48 | range = (jj-1)*split_size+1:min(jj*split_size,length(ind1));
49 | dist = full(W * (W(ind2(range),:)' - W(ind1(range),:)' + W(ind3(range),:)')); %cosine similarity if input W has been normalized
50 | for i=1:length(range)
51 | dist(ind1(range(i)),i) = -Inf;
52 | dist(ind2(range(i)),i) = -Inf;
53 | dist(ind3(range(i)),i) = -Inf;
54 | end
55 | [~, mx(range)] = max(dist); %predicted word index
56 | end
57 |
58 | val = (ind4 == mx'); %correct predictions
59 | count_tot = count_tot + length(ind1);
60 | correct_tot = correct_tot + sum(val);
61 | disp(['ACCURACY TOP1: ' num2str(mean(val)*100,'%-2.2f') '% (' num2str(sum(val)) '/' num2str(length(val)) ')']);
62 | if j < 6
63 | count_sem = count_sem + length(ind1);
64 | correct_sem = correct_sem + sum(val);
65 | else
66 | count_syn = count_syn + length(ind1);
67 | correct_syn = correct_syn + sum(val);
68 | end
69 |
70 | disp(['Total accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% Semantic accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% Syntactic accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '%']);
71 |
72 | end
73 | disp('________________________________________________________________________________');
74 | disp(['Questions seen/total: ' num2str(100*count_tot/full_count,'%-2.2f') '% (' num2str(count_tot) '/' num2str(full_count) ')']);
75 | disp(['Semantic Accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% (' num2str(correct_sem) '/' num2str(count_sem) ')']);
76 | disp(['Syntactic Accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '% (' num2str(correct_syn) '/' num2str(count_syn) ')']);
77 | disp(['Total Accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% (' num2str(correct_tot) '/' num2str(count_tot) ')']);
78 | BB = [100*correct_sem/count_sem 100*correct_syn/count_syn 100*correct_tot/count_tot];
79 |
80 | end
81 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/octave/read_and_evaluate_octave.m:
--------------------------------------------------------------------------------
1 | addpath('./eval/octave');
2 | if(~exist('vocab_file'))
3 | vocab_file = 'vocab.txt';
4 | end
5 | if(~exist('vectors_file'))
6 | vectors_file = 'vectors.bin';
7 | end
8 |
9 | fid = fopen(vocab_file, 'r');
10 | words = textscan(fid, '%s %f');
11 | fclose(fid);
12 | words = words{1};
13 | vocab_size = length(words);
14 | global wordMap
15 |
16 | wordMap = struct();
17 | for i=1:numel(words)
18 | wordMap.(words{i}) = i;
19 | end
20 |
21 | fid = fopen(vectors_file,'r');
22 | fseek(fid,0,'eof');
23 | vector_size = ftell(fid)/16/vocab_size - 1;
24 | frewind(fid);
25 | WW = fread(fid, [vector_size+1 2*vocab_size], 'double')';
26 | fclose(fid);
27 |
28 | W1 = WW(1:vocab_size, 1:vector_size); % word vectors
29 | W2 = WW(vocab_size+1:end, 1:vector_size); % context (tilde) word vectors
30 |
31 | W = W1 + W2; %Evaluate on sum of word vectors
32 | W = bsxfun(@rdivide,W,sqrt(sum(W.*W,2))); %normalize vectors before evaluation
33 | evaluate_vectors_octave(W);
34 | exit
35 |
36 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/python/distance.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 | import sys
4 |
5 | def generate():
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str)
8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str)
9 | args = parser.parse_args()
10 |
11 | with open(args.vocab_file, 'r') as f:
12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()]
13 | with open(args.vectors_file, 'r') as f:
14 | vectors = {}
15 | for line in f:
16 | vals = line.rstrip().split(' ')
17 | vectors[vals[0]] = [float(x) for x in vals[1:]]
18 |
19 | vocab_size = len(words)
20 | vocab = {w: idx for idx, w in enumerate(words)}
21 | ivocab = {idx: w for idx, w in enumerate(words)}
22 |
23 | vector_dim = len(vectors[ivocab[0]])
24 | W = np.zeros((vocab_size, vector_dim))
25 | for word, v in vectors.items():
26 | if word == '':
27 | continue
28 | W[vocab[word], :] = v
29 |
30 | # normalize each word vector to unit length
31 | W_norm = np.zeros(W.shape)
32 | d = (np.sum(W ** 2, 1) ** (0.5))
33 | W_norm = (W.T / d).T
34 | return (W_norm, vocab, ivocab)
35 |
36 |
37 | def distance(W, vocab, ivocab, input_term):
38 | for idx, term in enumerate(input_term.split(' ')):
39 | if term in vocab:
40 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term]))
41 | if idx == 0:
42 | vec_result = np.copy(W[vocab[term], :])
43 | else:
44 | vec_result += W[vocab[term], :]
45 | else:
46 | print('Word: %s Out of dictionary!\n' % term)
47 | return
48 |
49 | vec_norm = np.zeros(vec_result.shape)
50 | d = (np.sum(vec_result ** 2,) ** (0.5))
51 | vec_norm = (vec_result.T / d).T
52 |
53 | dist = np.dot(W, vec_norm.T)
54 |
55 | for term in input_term.split(' '):
56 | index = vocab[term]
57 | dist[index] = -np.Inf
58 |
59 | a = np.argsort(-dist)[:N]
60 |
61 | print("\n Word Cosine distance\n")
62 | print("---------------------------------------------------------\n")
63 | for x in a:
64 | print("%35s\t\t%f\n" % (ivocab[x], dist[x]))
65 |
66 |
67 | if __name__ == "__main__":
68 | N = 100; # number of closest words that will be shown
69 | W, vocab, ivocab = generate()
70 | while True:
71 | input_term = raw_input("\nEnter word or sentence (EXIT to break): ")
72 | if input_term == 'EXIT':
73 | break
74 | else:
75 | distance(W, vocab, ivocab, input_term)
76 |
77 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/python/evaluate.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 |
4 | def main():
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument('--vocab_file', default='vocab.txt', type=str)
7 | parser.add_argument('--vectors_file', default='vectors.txt', type=str)
8 | args = parser.parse_args()
9 |
10 | with open(args.vocab_file, 'r') as f:
11 | words = [x.rstrip().split(' ')[0] for x in f.readlines()]
12 | with open(args.vectors_file, 'r') as f:
13 | vectors = {}
14 | for line in f:
15 | vals = line.rstrip().split(' ')
16 | vectors[vals[0]] = [float(x) for x in vals[1:]]
17 |
18 | vocab_size = len(words)
19 | vocab = {w: idx for idx, w in enumerate(words)}
20 | ivocab = {idx: w for idx, w in enumerate(words)}
21 |
22 | vector_dim = len(vectors[ivocab[0]])
23 | W = np.zeros((vocab_size, vector_dim))
24 | for word, v in vectors.items():
25 | if word == '':
26 | continue
27 | W[vocab[word], :] = v
28 |
29 | # normalize each word vector to unit length
30 | W_norm = np.zeros(W.shape)
31 | d = (np.sum(W ** 2, 1) ** (0.5))
32 | W_norm = (W.T / d).T
33 | evaluate_vectors(W_norm, vocab, ivocab)
34 |
35 | def evaluate_vectors(W, vocab, ivocab):
36 | """Evaluate the trained word vectors on a variety of tasks"""
37 |
38 | filenames = [
39 | 'capital-common-countries.txt', 'capital-world.txt', 'currency.txt',
40 | 'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt',
41 | 'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt',
42 | 'gram5-present-participle.txt', 'gram6-nationality-adjective.txt',
43 | 'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt',
44 | ]
45 | prefix = './eval/question-data/'
46 |
47 | # to avoid memory overflow, could be increased/decreased
48 | # depending on system and vocab size
49 | split_size = 100
50 |
51 | correct_sem = 0; # count correct semantic questions
52 | correct_syn = 0; # count correct syntactic questions
53 | correct_tot = 0 # count correct questions
54 | count_sem = 0; # count all semantic questions
55 | count_syn = 0; # count all syntactic questions
56 | count_tot = 0 # count all questions
57 | full_count = 0 # count all questions, including those with unknown words
58 |
59 | for i in range(len(filenames)):
60 | with open('%s/%s' % (prefix, filenames[i]), 'r') as f:
61 | full_data = [line.rstrip().split(' ') for line in f]
62 | full_count += len(full_data)
63 | data = [x for x in full_data if all(word in vocab for word in x)]
64 |
65 | indices = np.array([[vocab[word] for word in row] for row in data])
66 | ind1, ind2, ind3, ind4 = indices.T
67 |
68 | predictions = np.zeros((len(indices),))
69 | num_iter = int(np.ceil(len(indices) / float(split_size)))
70 | for j in range(num_iter):
71 | subset = np.arange(j*split_size, min((j + 1)*split_size, len(ind1)))
72 |
73 | pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
74 | + W[ind3[subset], :])
75 | #cosine similarity if input W has been normalized
76 | dist = np.dot(W, pred_vec.T)
77 |
78 | for k in range(len(subset)):
79 | dist[ind1[subset[k]], k] = -np.Inf
80 | dist[ind2[subset[k]], k] = -np.Inf
81 | dist[ind3[subset[k]], k] = -np.Inf
82 |
83 | # predicted word index
84 | predictions[subset] = np.argmax(dist, 0).flatten()
85 |
86 | val = (ind4 == predictions) # correct predictions
87 | count_tot = count_tot + len(ind1)
88 | correct_tot = correct_tot + sum(val)
89 | if i < 5:
90 | count_sem = count_sem + len(ind1)
91 | correct_sem = correct_sem + sum(val)
92 | else:
93 | count_syn = count_syn + len(ind1)
94 | correct_syn = correct_syn + sum(val)
95 |
96 | print("%s:" % filenames[i])
97 | print('ACCURACY TOP1: %.2f%% (%d/%d)' %
98 | (np.mean(val) * 100, np.sum(val), len(val)))
99 |
100 | print('Questions seen/total: %.2f%% (%d/%d)' %
101 | (100 * count_tot / float(full_count), count_tot, full_count))
102 | print('Semantic accuracy: %.2f%% (%i/%i)' %
103 | (100 * correct_sem / float(count_sem), correct_sem, count_sem))
104 | print('Syntactic accuracy: %.2f%% (%i/%i)' %
105 | (100 * correct_syn / float(count_syn), correct_syn, count_syn))
106 | print('Total accuracy: %.2f%% (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot))
107 |
108 |
109 | if __name__ == "__main__":
110 | main()
111 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/eval/python/word_analogy.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 | import sys
4 |
5 | def generate():
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str)
8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str)
9 | args = parser.parse_args()
10 |
11 | with open(args.vocab_file, 'r') as f:
12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()]
13 | with open(args.vectors_file, 'r') as f:
14 | vectors = {}
15 | for line in f:
16 | vals = line.rstrip().split(' ')
17 | vectors[vals[0]] = [float(x) for x in vals[1:]]
18 |
19 | vocab_size = len(words)
20 | vocab = {w: idx for idx, w in enumerate(words)}
21 | ivocab = {idx: w for idx, w in enumerate(words)}
22 |
23 | vector_dim = len(vectors[ivocab[0]])
24 | W = np.zeros((vocab_size, vector_dim))
25 | for word, v in vectors.items():
26 | if word == '':
27 | continue
28 | W[vocab[word], :] = v
29 |
30 | # normalize each word vector to unit length
31 | W_norm = np.zeros(W.shape)
32 | d = (np.sum(W ** 2, 1) ** (0.5))
33 | W_norm = (W.T / d).T
34 | return (W_norm, vocab, ivocab)
35 |
36 |
37 | def distance(W, vocab, ivocab, input_term):
38 | vecs = {}
39 | if len(input_term.split(' ')) < 3:
40 | print("Only %i words were entered.. three words are needed at the input to perform the calculation\n" % len(input_term.split(' ')))
41 | return
42 | else:
43 | for idx, term in enumerate(input_term.split(' ')):
44 | if term in vocab:
45 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term]))
46 | vecs[idx] = W[vocab[term], :]
47 | else:
48 | print('Word: %s Out of dictionary!\n' % term)
49 | return
50 |
51 | vec_result = vecs[1] - vecs[0] + vecs[2]
52 |
53 | vec_norm = np.zeros(vec_result.shape)
54 | d = (np.sum(vec_result ** 2,) ** (0.5))
55 | vec_norm = (vec_result.T / d).T
56 |
57 | dist = np.dot(W, vec_norm.T)
58 |
59 | for term in input_term.split(' '):
60 | index = vocab[term]
61 | dist[index] = -np.Inf
62 |
63 | a = np.argsort(-dist)[:N]
64 |
65 | print("\n Word Cosine distance\n")
66 | print("---------------------------------------------------------\n")
67 | for x in a:
68 | print("%35s\t\t%f\n" % (ivocab[x], dist[x]))
69 |
70 |
71 | if __name__ == "__main__":
72 | N = 100; # number of closest words that will be shown
73 | W, vocab, ivocab = generate()
74 | while True:
75 | input_term = raw_input("\nEnter three words (EXIT to break): ")
76 | if input_term == 'EXIT':
77 | break
78 | else:
79 | distance(W, vocab, ivocab, input_term)
80 |
81 |
--------------------------------------------------------------------------------
/src/rank/m1/glove/src/README.md:
--------------------------------------------------------------------------------
1 | ### Package Contents
2 |
3 | To train your own GloVe vectors, first you'll need to prepare your corpus as a single text file with all words separated by one or more spaces or tabs. If your corpus has multiple documents, the documents (only) should be separated by new line characters. Cooccurrence contexts for words do not extend past newline characters. Once you create your corpus, you can train GloVe vectors using the following 4 tools. An example is included in `demo.sh`, which you can modify as necessary.
4 |
5 | The four main tools in this package are:
6 |
7 | #### 1) vocab_count
8 | This tool requires an input corpus that should already consist of whitespace-separated tokens. Use something like the [Stanford Tokenizer](https://nlp.stanford.edu/software/tokenizer.html) first on raw text. From the corpus, it constructs unigram counts, and optionally thresholds the resulting vocabulary based on total vocabulary size or minimum frequency count.
9 |
10 | #### 2) cooccur
11 | Constructs word-word cooccurrence statistics from a corpus. The user should supply a vocabulary file, as produced by `vocab_count`, and may specify a variety of parameters, as described by running `./build/cooccur`.
12 |
13 | #### 3) shuffle
14 | Shuffles the binary file of cooccurrence statistics produced by `cooccur`. For large files, the file is automatically split into chunks, each of which is shuffled and stored on disk before being merged and shuffled together. The user may specify a number of parameters, as described by running `./build/shuffle`.
15 |
16 | #### 4) glove
17 | Train the GloVe model on the specified cooccurrence data, which typically will be the output of the `shuffle` tool. The user should supply a vocabulary file, as given by `vocab_count`, and may specify a number of other parameters, which are described by running `./build/glove`.
18 |
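19 | As a minimal sketch of how the four tools chain together (mirroring the repository's `demo.sh`; `corpus.txt` and the output file names below are placeholders, and the flag values are the ones `demo.sh` uses):
20 | 
21 | ```bash
22 | # 1) build the vocabulary from a whitespace-tokenized corpus
23 | build/vocab_count -min-count 2 -verbose 2 < corpus.txt > vocab.txt
24 | # 2) accumulate word-word cooccurrence statistics within a 15-token window
25 | build/cooccur -memory 4.0 -vocab-file vocab.txt -verbose 2 -window-size 15 < corpus.txt > cooccurrence.bin
26 | # 3) shuffle the binary cooccurrence records before training
27 | build/shuffle -memory 4.0 -verbose 2 < cooccurrence.bin > cooccurrence.shuf.bin
28 | # 4) train 300-dimensional GloVe vectors on the shuffled cooccurrence data
29 | build/glove -save-file vectors -threads 8 -input-file cooccurrence.shuf.bin -x-max 10 -iter 15 -vector-size 300 -binary 2 -vocab-file vocab.txt -verbose 2
30 | ```
31 | 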
--------------------------------------------------------------------------------
/src/rank/m1/prepare_rank_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from tqdm import tqdm
10 |
11 |
12 | # In[2]:
13 |
14 |
15 | paper = pd.read_feather("../../../input/paper_input_final.ftr")
16 |
17 |
18 | # In[3]:
19 |
20 |
21 | paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
22 | paper['corp'] = paper['titl']+' '+paper['keywords'].fillna('').replace(';', ' ')+paper['abst']
23 |
24 |
25 | # In[4]:
26 |
27 |
28 | df_train = pd.read_feather("../../../input/tr_input_final.ftr")
29 |
30 |
31 | # In[5]:
32 |
33 |
34 | df_train.head()
35 |
36 |
37 | # In[6]:
38 |
39 |
40 | df_test = pd.read_feather("../../../input/te_input_final.ftr")
41 |
42 |
43 | # In[7]:
44 |
45 |
46 | df_test.head()
47 |
48 |
49 | # In[8]:
50 |
51 |
52 | #####reduce mem
53 | import datetime
54 | def pandas_reduce_mem_usage(df):
55 | start_mem=df.memory_usage().sum() / 1024**2
56 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
57 | starttime = datetime.datetime.now()
58 | for col in df.columns:
59 | col_type=df[col].dtype # dtype of the column
60 | if col_type !=object: # non-object (numeric) column
61 | c_min=df[col].min()
62 | c_max=df[col].max()
63 | # print('{} column dtype is {} and begin convert to others'.format(col,col_type))
64 | if str(col_type)[:3]=='int':
65 | # signed integer
66 | if c_min<0:
67 | if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
68 | df[col] = df[col].astype(np.int8)
69 | elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
70 | df[col] = df[col].astype(np.int16)
71 | elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
72 | df[col] = df[col].astype(np.int32)
73 | else:
74 | df[col] = df[col].astype(np.int64)
75 | else:
76 | if c_min >= np.iinfo(np.uint8).min and c_max<=np.iinfo(np.uint8).max:
77 | df[col]=df[col].astype(np.uint8)
78 | elif c_min >= np.iinfo(np.uint16).min and c_max <= np.iinfo(np.uint16).max:
79 | df[col] = df[col].astype(np.uint16)
80 | elif c_min >= np.iinfo(np.uint32).min and c_max <= np.iinfo(np.uint32).max:
81 | df[col] = df[col].astype(np.uint32)
82 | else:
83 | df[col] = df[col].astype(np.uint64)
84 | # floating point
85 | else:
86 | if c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
87 | df[col] = df[col].astype(np.float32)
88 | else:
89 | df[col] = df[col].astype(np.float64)
90 | # print('\t\tcolumn dtype is {}'.format(df[col].dtype))
91 |
92 | # object dtype (e.g. str)
93 | else:
94 | # print('\t\tcolumns dtype is object and will convert to category')
95 | df[col] = df[col].astype('category')
96 | end_mem = df.memory_usage().sum() / 1024 ** 2
97 | endtime = datetime.datetime.now()
98 | print('consume times: {:.4f}'.format((endtime - starttime).seconds))
99 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
100 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
101 | return df
102 |
103 |
104 | # In[9]:
105 |
106 |
107 | recall_train = pd.read_feather('../../../input/tr_s0_32-50.ftr')
108 | recall_test = pd.read_feather('../../../input/te_s0_32-50.ftr')
109 |
110 |
111 | # In[10]:
112 |
113 |
114 | recall_train = pandas_reduce_mem_usage(recall_train)
115 |
116 |
117 | # In[11]:
118 |
119 |
120 | recall_test = pandas_reduce_mem_usage(recall_test)
121 |
122 |
123 | # In[12]:
124 |
125 |
126 | recall_train.shape
127 |
128 |
129 | # In[13]:
130 |
131 |
132 | cv_id = pd.read_csv("../../../input/cv_ids_0109.csv")
133 | recall_train.drop(columns=['cv'],axis=1,inplace=True)
134 | recall_train = recall_train.merge(cv_id,on=['description_id'],how='left')
135 |
136 |
137 | # In[14]:
138 |
139 |
140 | recall_train = recall_train.dropna(subset=['cv']).reset_index(drop=True)
141 |
142 |
143 | # In[15]:
144 |
145 |
146 | recall_train.shape,recall_test.shape
147 |
148 |
149 | # In[16]:
150 |
151 |
152 | recall_train = recall_train.merge(paper[['paper_id','corp']],on=['paper_id'],how='left')
153 | recall_test = recall_test.merge(paper[['paper_id','corp']],on=['paper_id'],how='left')
154 |
155 |
156 | # In[17]:
157 |
158 |
159 | recall_train = recall_train.merge(df_train[['description_id','quer_key','quer_all']],on=['description_id'],how='left')
160 | recall_test = recall_test.merge(df_test[['description_id','quer_key','quer_all']],on=['description_id'],how='left')
161 |
162 |
163 | # In[18]:
164 |
165 |
166 | recall_train = recall_train.sort_values(['description_id', 'corp_sim_score'], ascending=False)
167 | recall_train['rank'] = recall_train.groupby('description_id').cumcount().values
168 | recall_test = recall_test.sort_values(['description_id', 'corp_sim_score'], ascending=False)
169 | recall_test['rank'] = recall_test.groupby('description_id').cumcount().values
170 |
171 |
172 | # In[19]:
173 |
174 |
175 | keep_columns = ['description_id','paper_id','corp','quer_key','quer_all','corp_sim_score','cv','rank','target']
176 | recall_train = recall_train[keep_columns].reset_index(drop=True)
177 | recall_test = recall_test[keep_columns].reset_index(drop=True)
178 |
179 |
180 | # In[20]:
181 |
182 |
183 | recall_train.head()
184 |
185 |
186 | # In[22]:
187 |
188 |
189 | recall_train.to_csv('recall_train.csv',index=False)
190 |
191 |
192 | # In[23]:
193 |
194 |
195 | recall_test.to_csv('recall_test.csv',index=False)
196 |
197 |
198 | # In[ ]:
199 |
200 |
201 | # recall_train.shape
202 |
203 |
--------------------------------------------------------------------------------
/src/rank/m1/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | #### depends on paper_input_1.ftr and te_input_1.ftr
4 | #### write out the training corpus and train the word2vec word vectors
5 | python3 w2v_training.py ### placeholder for the jupyter notebook for now
6 |
7 | ### train the GloVe word vectors
8 | cd glove && make
9 | bash demo.sh
10 | #### return to the parent directory
11 | cd ..
12 |
13 | ### serialize the word vectors
14 |
15 |
16 |
17 | ### prepare the training data
18 | python3 prepare_rank_train.py ### placeholder for the jupyter notebook for now
19 |
20 | ### inferSent-simple 5-fold training
21 | python3 inferSent1-5-fold_train.py ### placeholder for the jupyter notebook for now
22 |
23 | ### inferSent-simple 5-fold prediction
24 | python3 inferSent1-5-fold_predict.py ### placeholder for the jupyter notebook for now
25 |
26 | ### catboost model training & prediction
27 | python3 catboost03.py ### placeholder for the jupyter notebook for now
28 |
29 | ### nn02 model training & prediction
30 | python3 nn02_train.py ### placeholder for the jupyter notebook for now
31 | python3 nn02_predict.py ### placeholder for the jupyter notebook for now
32 |
--------------------------------------------------------------------------------
/src/rank/m1/w2v_training.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | # external vec
8 | import warnings
9 | warnings.filterwarnings('always')
10 | warnings.filterwarnings('ignore')
11 |
12 | import os
13 | import sys
14 | import numpy as np
15 | import pandas as pd
16 | from tqdm import tqdm
17 |
18 | import time
19 | from datetime import datetime
20 | from gensim.models import Word2Vec
21 | from gensim.models.word2vec import LineSentence
22 | from gensim import corpora, models, similarities
23 | from gensim.similarities import SparseMatrixSimilarity
24 | from gensim.similarities import MatrixSimilarity
25 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
26 |
27 |
28 | # In[3]:
29 |
30 |
31 | paper = pd.read_feather("../../../input/paper_input_final.ftr")
32 |
33 |
34 | # In[4]:
35 |
36 |
37 | paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
38 | paper['corp'] = paper['titl']+' '+paper['keywords'].fillna('').replace(';', ' ')+paper['abst']
39 |
40 |
41 | # In[5]:
42 |
43 |
44 | paper.head()
45 |
46 |
47 | # In[6]:
48 |
49 |
50 | paper['len'] = paper['corp'].apply(len)
51 |
52 |
53 | # In[7]:
54 |
55 |
56 | paper['len'].describe()
57 |
58 |
59 | # In[8]:
60 |
61 |
62 | df_train = pd.read_feather("../../../input/tr_input_final.ftr")
63 |
64 |
65 | # In[9]:
66 |
67 |
68 | df_train.head()
69 |
70 |
71 | # In[10]:
72 |
73 |
74 | df_train['len'] = df_train['quer_key'].apply(len)
75 | df_train['len'].describe()
76 |
77 |
78 | # In[16]:
79 |
80 |
81 | df_test = pd.read_feather("../../../input/te_input_final.ftr")
82 |
83 |
84 | # In[17]:
85 |
86 |
87 | df_test.head()
88 |
89 |
90 | # In[18]:
91 |
92 |
93 | # df_train[df_train['quer_all'].str.contains("[##]")]
94 |
95 |
96 | # In[19]:
97 |
98 |
99 | from tqdm import tqdm
100 | ### prepare the training corpus
101 | with open("corpus.txt","w+") as f:
102 | for i in tqdm(range(len(paper))):
103 | abst = paper.iloc[i]['abst']
104 | if abst!='no_content' and abst!="none":
105 | f.write(abst+"\n")
106 | title = paper.iloc[i]['titl']
107 | if title!='no_content' and title!="none":
108 | f.write(title+"\n")
109 | for i in tqdm(range(len(df_train))):
110 | quer_all = df_train.iloc[i]['quer_all']
111 | f.write(quer_all+"\n")
112 | for i in tqdm(range(len(df_test))):
113 | quer_all = df_test.iloc[i]['quer_all']
114 | f.write(quer_all+"\n")
115 |
116 |
117 | # In[23]:
118 |
119 |
120 | ####word2vector
121 | from gensim.models import word2vec
122 | sentences = word2vec.LineSentence('./corpus.txt')
123 | model = word2vec.Word2Vec(sentences, sg=1,min_count=2,window=8,size=300,iter=6,sample=1e-4, hs=1, workers=12)
124 |
125 |
126 | # In[24]:
127 |
128 |
129 | model.save("word2vec.model")
130 |
131 |
132 | # In[34]:
133 |
134 |
135 | model.wv.save_word2vec_format("word2vec.txt",binary=False)
136 |
137 |
138 | # In[26]:
139 |
140 |
141 | # the GloVe vectors are already available (trained by glove/demo.sh)
142 | from gensim.test.utils import datapath, get_tmpfile
143 | from gensim.models import KeyedVectors
144 |
145 |
146 | # In[31]:
147 |
148 |
149 | # input file
150 | glove_file = datapath('glove/vectors.txt')
151 | # output file
152 | tmp_file = get_tmpfile("glove_vec.txt")
153 |
154 |
--------------------------------------------------------------------------------
/src/rank/m2/bert_5_fold_predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gc
3 | from tqdm import tqdm
4 | import numpy as np
5 | import pandas as pd
6 |
7 | import torch
8 | from pytorch_transformers import AdamW, WarmupLinearSchedule
9 | import matchzoo as mz
10 | from matchzoo.preprocessors.units.truncated_length import TruncatedLength
11 | from utils import MAP, build_matrix, topk_lines, predict, Logger
12 |
13 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"
14 |
15 | import argparse
16 |
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument('--model_id', type=str, default='bert_002')
19 | args = parser.parse_args()
20 |
21 | model_id = args.model_id
22 |
23 | if model_id=="bert_002":
24 | test_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_final_test_processed_query_key.dp")
25 | bst_epochs = {1:1, 2:1, 3:2, 4:1, 5:1}
26 | if model_id=="bert_003":
27 | test_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_test_processed_query_all.dp")
28 | bst_epochs = {1:2, 2:1, 3:1, 4:2, 5:1}
29 | if model_id=="bert_004":
30 | test_processed = mz.data_pack.data_pack.load_data_pack(
31 | "bert_data/bert_final_test_processed_query_all_nopreprocessing.dp/")
32 | bst_epochs = {1:2, 2:2, 3:1, 4:1, 5:1}
33 |
34 | padding_callback = mz.models.Bert.get_default_padding_callback()
35 | testset = mz.dataloader.Dataset(
36 | data_pack=test_processed,
37 | batch_size=128,
38 | sort=False,
39 | shuffle=False
40 | )
41 | testloader = mz.dataloader.DataLoader(
42 | dataset=testset,
43 | stage='dev',
44 | callback=padding_callback
45 | )
46 |
47 |
48 | num_dup = 1
49 | num_neg = 7
50 |
51 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
52 | padding_callback = mz.models.Bert.get_default_padding_callback()
53 | task = mz.tasks.Ranking(losses=losses)
54 | task.metrics = [
55 | mz.metrics.MeanAveragePrecision(),
56 | MAP()
57 | ]
58 |
59 | model = mz.models.Bert()
60 |
61 | model.params['task'] = task
62 | model.params['mode'] = 'bert-base-uncased'
63 | model.params['dropout_rate'] = 0.2
64 |
65 | model.build()
66 |
67 | print('Trainable params: ', sum(p.numel() for p in model.parameters() if p.requires_grad))
68 |
69 | no_decay = ['bias', 'LayerNorm.weight']
70 | optimizer_grouped_parameters = [
71 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 5e-5},
72 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
73 | ]
74 |
75 |
76 | optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, betas=(0.9, 0.98), eps=1e-8)
77 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=6, t_total=-1)
78 |
79 | trainer = mz.trainers.Trainer(
80 | model=model,
81 | optimizer=optimizer,
82 | scheduler=scheduler,
83 | trainloader=testloader,
84 | validloader=testloader,
85 | validate_interval=None,
86 | epochs=1
87 | )
88 |
89 |
90 | for fold in range(1,6):
91 | i = bst_epochs[fold]
92 | trainer.restore_model("save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, i))
93 |
94 | score = predict(trainer, testloader)
95 | X, y = test_processed.unpack()
96 | result = pd.DataFrame(data={
97 | 'description_id': X['id_left'],
98 | 'paper_id': X['id_right'],
99 | 'score': score[:,0]})
100 | # result.to_csv("result/{}/{}_fold_{}_test.csv".format(model_id, model_id, fold), index=False)
101 | result.to_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold), index=False)
102 |
103 |
104 |
--------------------------------------------------------------------------------
/src/rank/m2/bert_5_fold_train.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"
3 |
4 | import gc
5 | from tqdm import tqdm
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import torch
10 | from pytorch_transformers import AdamW, WarmupLinearSchedule
11 | import matchzoo as mz
12 | from matchzoo.preprocessors.units.truncated_length import TruncatedLength
13 | from utils import MAP, build_matrix, topk_lines, predict, Logger
14 |
15 | from matchzoo.data_pack import DataPack
16 |
17 | import argparse
18 |
19 | parser = argparse.ArgumentParser()
20 | parser.add_argument('--model_id', type=str, default='bert_002')
21 | args = parser.parse_args()
22 |
23 | model_id = args.model_id
24 |
25 | num_dup = 1
26 | num_neg = 7
27 |
28 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
29 | padding_callback = mz.models.Bert.get_default_padding_callback()
30 | task = mz.tasks.Ranking(losses=losses)
31 | task.metrics = [
32 | mz.metrics.MeanAveragePrecision(),
33 | MAP()
34 | ]
35 |
36 | with Logger(log_filename = '{}.log'.format(model_id)):
37 | for fold in range(1,6):
38 | if model_id=='bert_002':
39 | train_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_train_processed_{}.dp".format(fold))
40 | val_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_val_processed_{}.dp".format(fold))
41 | if model_id=='bert_003':
42 | train_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_train_processed_query_all_{}.dp".format(fold))
43 | val_processed = mz.data_pack.data_pack.load_data_pack("bert_data/bert_val_processed_query_all_{}.dp".format(fold))
44 | if model_id=='bert_004':
45 | train_processed = mz.data_pack.data_pack.load_data_pack(
46 | "bert_data/bert_train_processed_query_all_nopreprocessing_{}.dp".format(fold))
47 | val_processed = mz.data_pack.data_pack.load_data_pack(
48 | "bert_data/bert_val_processed_query_all_nopreprocessing_{}.dp".format(fold))
49 |
50 | model = mz.models.Bert()
51 |
52 | model.params['task'] = task
53 | model.params['mode'] = 'bert-base-uncased'
54 | model.params['dropout_rate'] = 0.2
55 |
56 | model.build()
57 |
58 | print('Trainable params: ', sum(p.numel() for p in model.parameters() if p.requires_grad))
59 |
60 |
61 | trainset = mz.dataloader.Dataset(
62 | data_pack=train_processed,
63 | mode='pair',
64 | num_dup=num_dup,
65 | num_neg=num_neg,
66 | batch_size=1,
67 | resample=True,
68 | sort=False,
69 | shuffle=True
70 | )
71 | trainloader = mz.dataloader.DataLoader(
72 | dataset=trainset,
73 | stage='train',
74 | callback=padding_callback
75 | )
76 |
77 | valset = mz.dataloader.Dataset(
78 | data_pack=val_processed,
79 | batch_size=32,
80 | sort=False,
81 | shuffle=False
82 | )
83 | valloader = mz.dataloader.DataLoader(
84 | dataset=valset,
85 | stage='dev',
86 | callback=padding_callback
87 | )
88 |
89 |
90 | no_decay = ['bias', 'LayerNorm.weight']
91 | optimizer_grouped_parameters = [
92 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 5e-5},
93 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
94 | ]
95 |
96 | optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, betas=(0.9, 0.98), eps=1e-8)
97 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=6, t_total=-1)
98 |
99 | trainer = mz.trainers.Trainer(
100 | model=model,
101 | optimizer=optimizer,
102 | scheduler=scheduler,
103 | trainloader=trainloader,
104 | validloader=valloader,
105 | validate_interval=None,
106 | epochs=1
107 | )
108 |
109 | for i in range(0,8):
110 | print("="*10+" epoch: "+str(i)+" "+"="*10)
111 | trainer.run()
112 | trainer.save_model()
113 | os.rename("save/model.pt", "save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, i))
114 |
115 |
116 |
--------------------------------------------------------------------------------
/src/rank/m2/bert_preprocessing.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gc
3 | from tqdm import tqdm
4 | import numpy as np
5 | import pandas as pd
6 |
7 | import torch
8 | import matchzoo as mz
9 | from matchzoo.preprocessors.units.truncated_length import TruncatedLength
10 | from utils import MAP, build_matrix, topk_lines, predict
11 |
12 | import argparse
13 |
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('--preprocessing_type', type=str, default='fine')
16 | parser.add_argument('--left_truncated_length', type=int, default=64)
17 | parser.add_argument('--query_type', type=str, default='query_key')
18 | args = parser.parse_args()
19 |
20 | preprocessing_type = args.preprocessing_type
21 | left_truncated_length = args.left_truncated_length
22 | dp_type = args.query_type
23 |
24 | num_neg = 7
25 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
26 | task = mz.tasks.Ranking(losses=losses)
27 | task.metrics = [
28 | mz.metrics.MeanAveragePrecision(),
29 | MAP()
30 | ]
31 |
32 | preprocessor = mz.models.Bert.get_default_preprocessor(mode='bert-base-uncased')
33 |
34 |
35 | if preprocessing_type == 'fine':
36 | candidate_dic = pd.read_feather('data/candidate_dic.ftr')
37 | train_description = pd.read_feather('data/train_description_{}.ftr'.format(dp_type))
38 | else:
39 | candidate_dic = pd.read_csv('../../../input/candidate_paper_for_wsdm2020.csv')
40 | candidate_dic.loc[candidate_dic['keywords'].isna(),'keywords'] = ''
41 | candidate_dic.loc[candidate_dic['title'].isna(),'title'] = ''
42 | candidate_dic.loc[candidate_dic['abstract'].isna(),'abstract'] = ''
43 | candidate_dic['text_right'] = candidate_dic['abstract'].str.cat(
44 | candidate_dic['keywords'], sep=' ').str.cat(
45 | candidate_dic['title'], sep=' ')
46 | candidate_dic = candidate_dic.rename(columns={'paper_id': 'id_right'})[['id_right', 'text_right']]
47 |
48 | train_description = pd.read_csv('../../../input/train_release.csv')
49 | train_description = train_description.rename(
50 | columns={'description_id': 'id_left',
51 | 'description_text': 'text_left'})[['id_left', 'text_left']]
52 | dp_type = 'query_all_nopreprocessing'
53 |
54 | train_recall = pd.read_feather('data/train_recall.ftr')[['id_left', 'id_right', 'label', 'cv']]
55 | train_recall = pd.merge(train_recall, train_description, how='left', on='id_left')
56 | train_recall = pd.merge(train_recall, candidate_dic, how='left', on='id_right')
57 | train_recall = train_recall.drop_duplicates().reset_index(drop=True)
58 | train_recall = train_recall[['id_left', 'text_left', 'id_right', 'text_right', 'label', 'cv']]
59 | del train_description
60 | gc.collect()
61 |
62 |
63 |
64 | for i in range(1,6):
65 | print("="*20, i, "="*20)
66 | train_df = train_recall[train_recall.cv!=i][
67 | ['id_left', 'text_left', 'id_right', 'text_right', 'label']].reset_index(drop=True)
68 | val_df = train_recall[train_recall.cv==i][
69 | ['id_left', 'text_left', 'id_right', 'text_right', 'label']].reset_index(drop=True)
70 |
71 | train_raw = mz.pack(train_df, task)
72 | train_processed = preprocessor.transform(train_raw)
73 | train_processed.apply_on_text(TruncatedLength(left_truncated_length, 'pre').transform,
74 | mode='left', inplace=True, verbose=1)
75 | train_processed.apply_on_text(TruncatedLength(256, 'pre').transform, mode='right', inplace=True, verbose=1)
76 | train_processed.append_text_length(inplace=True, verbose=1)
77 | train_processed.save("bert_data/bert_train_processed_{}_{}.dp".format(dp_type, i))
78 |
79 | val_raw = mz.pack(val_df, task)
80 | val_processed = preprocessor.transform(val_raw)
81 | val_processed.apply_on_text(TruncatedLength(left_truncated_length, 'pre').transform,
82 | mode='left', inplace=True, verbose=1)
83 | val_processed.apply_on_text(TruncatedLength(256, 'pre').transform, mode='right', inplace=True, verbose=1)
84 | val_processed.append_text_length(inplace=True, verbose=1)
85 | val_processed.save("bert_data/bert_val_processed_{}_{}.dp".format(dp_type, i))
86 |
87 |
88 | if preprocessing_type == 'fine':
89 | test_description = pd.read_feather('data/test_description_quer_all.ftr')
90 | else:
91 | test_description = pd.read_csv('../../../input/test.csv')
92 | test_description = test_description.rename(
93 | columns={'description_id': 'id_left',
94 | 'description_text': 'text_left'})[['id_left', 'text_left']]
95 |
96 |
97 | test_recall = pd.read_feather('data/test_recall.ftr')[['id_left', 'id_right', 'label']]
98 | test_recall = pd.merge(test_recall, test_description, how='left', on='id_left')
99 | test_recall = pd.merge(test_recall, candidate_dic, how='left', on='id_right')
100 | del test_description, candidate_dic
101 | gc.collect()
102 |
103 | test_raw = mz.pack(test_recall, task)
104 | test_processed = preprocessor.transform(test_raw)
105 | test_processed.apply_on_text(TruncatedLength(left_truncated_length, 'pre').transform,
106 | mode='left', inplace=True, verbose=1)
107 | test_processed.apply_on_text(TruncatedLength(256, 'pre').transform, mode='right', inplace=True, verbose=1)
108 | test_processed.append_text_length(inplace=True, verbose=1)
109 | test_processed.save("bert_data/bert_test_processed_{}.dp".format(dp_type))
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/src/rank/m2/change_formatting4stk.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | import argparse
6 |
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
9 | args = parser.parse_args()
10 |
11 | model_id = args.model_id
12 |
13 | stk_path = "../../../stk_feat"
14 |
15 | df = pd.read_csv("oof_m2_{}_5cv.csv".format(model_id))
16 | df = df.rename(columns={"target": "pred"})
17 | df.to_feather("{}/m2_{}_tr.ftr".format(stk_path, model_id))
18 |
19 | df = pd.read_csv("result_m2_{}_5cv.csv".format(model_id))
20 | df = df.rename(columns={"target": "pred"})
21 | df.to_feather("{}/final_m2_{}_te.ftr".format(stk_path, model_id))
22 |
23 |
--------------------------------------------------------------------------------
/src/rank/m2/final_blend.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tqdm import tqdm
4 |
5 | np.set_printoptions(precision=4)
6 |
7 | def map3_func(df, topk = 50, verbose=0):
8 | ids = df[df.label==1].description_id.values
9 | df_recalled = df[df.description_id.isin(ids)].reset_index(drop=True)
10 | df_recalled = df_recalled.sort_values(
11 | by=['description_id', 'label'], ascending=False).reset_index(drop=True)
12 | result = df_recalled.score.values.reshape([-1,topk])
13 | ranks = topk-result.argsort(axis=1).argsort(axis=1)
14 | map3_sum = sum(((1/ranks[:,0])*(ranks[:,0]<4)))
15 | if verbose>0:
16 | print("recall rate: "+str((df_recalled.shape[0]/topk)/(df.shape[0]/topk)))
17 | print("map@3 in recall: "+str(map3_sum/(df_recalled.shape[0]/topk)))
18 | print("map@3 in all: "+str(map3_sum/(df.shape[0]/topk)))
19 |
20 |
21 | m2_path = "../../model/"
22 |
23 | res = pd.read_feather('{}/lgb_s0_m2_33-0/lgb_s0_m3_33.ftr'.format(m2_path))
24 | res['score'] = res['target'].apply(lambda x:np.log(x/(1-x)))
25 | res.loc[res['score']<-12, 'score'] = -12
26 | res = res[['description_id', 'paper_id', 'score']]
27 | res.head()
28 |
29 | res1 = pd.read_feather('{}/lgb_s0_m2_33-1/lgb_s0_m3_33.ftr'.format(m2_path))
30 | res1['score'] = res1['target'].apply(lambda x:np.log(x/(1-x)))
31 | res1.loc[res1['score']<-12, 'score'] = -12
32 | res1 = res1[['description_id', 'paper_id', 'score']]
33 | res1.head()
34 |
35 |
36 | res2 = pd.read_feather('{}/lgb_s0_m3_34-0/lgb_s0_m3_34.ftr'.format(m2_path))
37 | res2['score'] = res2['target'].apply(lambda x:np.log(x/(1-x)))
38 | res2.loc[res2['score']<-12, 'score'] = -12
39 | res2 = res2[['description_id', 'paper_id', 'score']]
40 | res2.head()
41 |
42 |
43 | res3 = pd.read_feather('{}/lgb_s0_m3_34-1/lgb_s0_m3_34.ftr'.format(m2_path))
44 | res3['score'] = res3['target'].apply(lambda x:np.log(x/(1-x)))
45 | res3.loc[res3['score']<-12, 'score'] = -12
46 | res3 = res3[['description_id', 'paper_id', 'score']]
47 | res3.head()
48 |
49 |
50 | res4 = pd.read_feather('{}/lgb_s0_m3_35-0/lgb_s0_m3_35.ftr'.format(m2_path))
51 | res4['score'] = res4['target'].apply(lambda x:np.log(x/(1-x)))
52 | res4.loc[res4['score']<-12, 'score'] = -12
53 | res4 = res4[['description_id', 'paper_id', 'score']]
54 | res4.head()
55 |
56 |
57 | res5 = pd.read_feather('{}/lgb_s0_m3_35-1/lgb_s0_m3_35.ftr'.format(m2_path))
58 | res5['score'] = res5['target'].apply(lambda x:np.log(x/(1-x)))
59 | res5.loc[res5['score']<-12, 'score'] = -12
60 | res5 = res5[['description_id', 'paper_id', 'score']]
61 | res5.head()
62 |
63 |
64 | res6 = pd.read_feather('{}/lgb_s0_m3_38-0/lgb_s0_m3_38.ftr'.format(m2_path))
65 | res6['score'] = res6['target'].apply(lambda x:np.log(x/(1-x)))
66 | res6.loc[res6['score']<-12, 'score'] = -12
67 | res6 = res6[['description_id', 'paper_id', 'score']]
68 | res6.head()
69 |
70 |
71 | res7 = pd.read_feather('{}/lgb_s0_m3_38-1/lgb_s0_m3_38.ftr'.format(m2_path))
72 | res7['score'] = res7['target'].apply(lambda x:np.log(x/(1-x)))
73 | res7.loc[res7['score']<-12, 'score'] = -12
74 | res7 = res7[['description_id', 'paper_id', 'score']]
75 | res7.head()
76 |
77 |
78 | res8 = pd.read_feather('{}/lgb_s0_m3_40-0/lgb_s0_m3_40.ftr'.format(m2_path))
79 | res8['score'] = res8['target'].apply(lambda x:np.log(x/(1-x)))
80 | res8.loc[res8['score']<-12, 'score'] = -12
81 | res8 = res8[['description_id', 'paper_id', 'score']]
82 | res8.head()
83 |
84 |
85 | res9 = pd.read_feather('{}/model/m1/m1_catboost13.ftr'.format(m2_path))
86 | res9['score'] = res9['pred'].apply(lambda x:np.log(x/(1-x)))
87 | res9.loc[res9['score']<-12, 'score'] = -12
88 | res9 = res9[['description_id', 'paper_id', 'score']]
89 | res9.head()
90 |
91 |
92 | model_id = 'bert_002'
93 | res_b1 = pd.read_csv("final_result_m2_{}_5cv.csv".format(model_id))
94 | res_b1['score'] = res_b1['target'].apply(lambda x:np.log(x/(1-x)))
95 | res_b1.loc[res_b1['score']<-12, 'score'] = -12
96 | res_b1 = res_b1[['description_id', 'paper_id', 'score']]
97 | res_b1.head()
98 |
99 |
100 | model_id = 'bert_003'
101 | res_b2 = pd.read_csv("final_result_m2_{}_5cv.csv".format(model_id))
102 | res_b2['score'] = res_b2['target'].apply(lambda x:np.log(x/(1-x)))
103 | res_b2.loc[res_b2['score']<-12, 'score'] = -12
104 | res_b2 = res_b2[['description_id', 'paper_id', 'score']]
105 | res_b2.head()
106 |
107 |
108 | model_id = 'bert_004'
109 | res_b3 = pd.read_csv("final_result_m2_{}_5cv.csv".format(model_id))
110 | res_b3['score'] = res_b3['target'].apply(lambda x:np.log(x/(1-x)))
111 | res_b3.loc[res_b3['score']<-12, 'score'] = -12
112 | res_b3 = res_b3[['description_id', 'paper_id', 'score']]
113 | res_b3.head()
114 |
115 | model_id = 'bert_year_test'
116 | res_b4 = pd.read_csv("final_result_m2_{}_5cv.csv".format(model_id))
117 | res_b4['score'] = res_b4['target'].apply(lambda x:np.log(x/(1-x)))
118 | res_b4.loc[res_b4['score']<-12, 'score'] = -12
119 | res_b4 = res_b4[['description_id', 'paper_id', 'score']]
120 | res_b4.head()
121 |
122 |
123 | res_all = res.rename(columns={'score': 'score_0'}).merge(
124 | res1.rename(columns={'score': 'score_1'}), how='outer', on=['description_id', 'paper_id']).merge(
125 | res2.rename(columns={'score': 'score_2'}), how='outer', on=['description_id', 'paper_id']).merge(
126 | res3.rename(columns={'score': 'score_3'}), how='outer', on=['description_id', 'paper_id']).merge(
127 | res4.rename(columns={'score': 'score_4'}), how='outer', on=['description_id', 'paper_id']).merge(
128 | res5.rename(columns={'score': 'score_5'}), how='outer', on=['description_id', 'paper_id']).merge(
129 | res6.rename(columns={'score': 'score_6'}), how='outer', on=['description_id', 'paper_id']).merge(
130 | res7.rename(columns={'score': 'score_7'}), how='outer', on=['description_id', 'paper_id']).merge(
131 | res8.rename(columns={'score': 'score_8'}), how='outer', on=['description_id', 'paper_id']).merge(
132 | res9.rename(columns={'score': 'score_9'}), how='outer', on=['description_id', 'paper_id']).merge(
133 | res_b1.rename(columns={'score': 'score_b1'}), how='outer', on=['description_id', 'paper_id']).merge(
134 | res_b2.rename(columns={'score': 'score_b2'}), how='outer', on=['description_id', 'paper_id']).merge(
135 | res_b3.rename(columns={'score': 'score_b3'}), how='outer', on=['description_id', 'paper_id']).merge(
136 | res_b4.rename(columns={'score': 'score_b4'}), how='outer', on=['description_id', 'paper_id'])
137 | res_all = res_all.fillna(0.0)
138 | res_all.head()
139 |
140 |
141 | cols = ['score_0', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5',
142 | 'score_6', 'score_7', 'score_8', 'score_9',
143 | 'score_b1', 'score_b2', 'score_b3']
144 |
145 | corr_matrix = []
146 | for description_id, df_tmp in tqdm(res_all.groupby('description_id')):
147 | corr_matrix.append(
148 | df_tmp[cols].corr().values[:,:,np.newaxis])
149 | corr_matrix = np.concatenate(corr_matrix, axis=2)
150 | corr_matrix[np.isnan(corr_matrix)] = 0
151 | pd.DataFrame(data=corr_matrix.mean(axis=2), columns=cols, index=cols)
152 |
153 | res_all['score'] = (
154 | (
155 | res_all['score_0'] + res_all['score_1'] + res_all['score_2'] + res_all['score_3'] +
156 | res_all['score_4'] + res_all['score_5'] + res_all['score_6'] + res_all['score_7']
157 | )/8 +
158 | (
159 | res_all['score_8'] + res_all['score_9']
160 | )/2 +
161 | (
162 | res_all['score_b1'] + 1.5*res_all['score_b2']
163 | )/2.5*5 +
164 | (
165 | res_all['score_b2'] + 3*res_all['score_b3']
166 | )/4
167 | )
168 |
169 |
170 | result = res_all.sort_values(by=['description_id', 'score'], na_position='first').groupby(
171 | 'description_id').tail(3)
172 |
173 |
174 | model_id = 'all_model'
175 |
176 | description_id_list = []
177 | paper_id_list_1 = []
178 | paper_id_list_2 = []
179 | paper_id_list_3 = []
180 | for description_id, df_tmp in tqdm(result.groupby('description_id')):
181 | description_id_list.append(description_id)
182 | paper_id_list_1.append(df_tmp.iloc[2,1])
183 | paper_id_list_2.append(df_tmp.iloc[1,1])
184 | paper_id_list_3.append(df_tmp.iloc[0,1])
185 |
186 | sub = pd.DataFrame(data={'description_id':description_id_list,
187 | 'paper_id_1': paper_id_list_1,
188 | 'paper_id_2': paper_id_list_2,
189 | 'paper_id_3': paper_id_list_3})
190 | sub.to_csv("blend_{}.csv".format(model_id), header=False, index=False)
191 | print("blend_{}.csv".format(model_id))
192 |
193 |
--------------------------------------------------------------------------------
/src/rank/m2/fold_result_integration.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tqdm import tqdm
4 |
5 |
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
10 | args = parser.parse_args()
11 | model_id = args.model_id
12 |
13 |
14 | def map3_func(df, topk = 50, verbose=0):
15 | ids = df[df.label==1].description_id.values
16 | df_recalled = df[df.description_id.isin(ids)].reset_index(drop=True)
17 | df_recalled = df_recalled.sort_values(
18 | by=['description_id', 'label'], ascending=False).reset_index(drop=True)
19 | result = df_recalled.score.values.reshape([-1,topk])
20 | ranks = topk-result.argsort(axis=1).argsort(axis=1)
21 | map3_sum = sum(((1/ranks[:,0])*(ranks[:,0]<4)))
22 | if verbose>1:
23 | print("recall rate: "+str((df_recalled.shape[0]/topk)/(df.shape[0]/topk)))
24 | print("map@3 in recall: "+str(map3_sum/(df_recalled.shape[0]/topk)))
25 | if verbose>0:
26 | print("map@3 in all: "+str(map3_sum/(df.shape[0]/topk)))
27 | return map3_sum/(df.shape[0]/topk)
28 |
29 |
30 | fold = 1
31 | val_df = pd.read_csv("result/{}/{}_fold_{}_cv.csv".format(model_id, model_id, fold))
32 | test_df = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
33 | columns={'score':'score_1'})
34 |
35 | for fold in tqdm(range(2,6)):
36 | val_df_cv = pd.read_csv("result/{}/{}_fold_{}_cv.csv".format(model_id, model_id, fold))
37 | val_df = pd.concat([val_df, val_df_cv], ignore_index=True, sort=True)
38 |
39 | test_df_cv = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
40 | columns={'score':'score_{}'.format(fold)})
41 | test_df = test_df.merge(test_df_cv)
42 |
43 | val_df = val_df.merge(train_recall, how='left')  # NOTE: train_recall (the recall candidate table with labels/cv) is assumed to be loaded beforehand; it is not defined in this script
44 | val_df = val_df[val_df.description_id!='6.45E+04'].reset_index(drop=True)
45 | # assert val_df.description_id.nunique()==49945
46 | map3_func(val_df)
47 | val_df['target'] = val_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
48 | val_df.to_csv("oof_m2_{}_5cv.csv".format(model_id), index=False)
49 |
50 | score_cols = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
51 | test_df['score'] = test_df[score_cols].mean(axis=1)
52 | print(test_df[score_cols+['score']].corr(method='spearman'))
53 |
54 | test_df['target'] = test_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
55 | val_df['target'] = val_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
56 |
57 | test_df = test_recall.merge(  # NOTE: test_recall (the test recall candidate table) is assumed to be loaded beforehand
58 | test_df[['description_id', 'paper_id', 'score']], how='left', on=['description_id', 'paper_id'])
59 | test_df['target'] = test_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
60 | test_df['target'] = test_df['target'].fillna(0)
61 | test_df[['description_id', 'paper_id', 'target']].to_csv("result_m2_{}_5cv.csv".format(model_id), index=False)
62 |
63 |
64 |
--------------------------------------------------------------------------------
/src/rank/m2/gen_w2v.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # set -e
3 |
4 | BUILDDIR=build
5 | CORPUS=corpus.txt
6 | VOCAB_FILE=vocab.txt
7 | SAVE_FILE=glove.w2v
8 |
9 | VERBOSE=2
10 | MEMORY=4.0
11 |
12 | VOCAB_MIN_COUNT=5
13 |
14 | WINDOW_SIZE=5
15 | COOCCURRENCE_FILE=cooccurrence.bin
16 | WEIGHT=1
17 |
18 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
19 |
20 | VECTOR_SIZE=256
21 | MAX_ITER=25
22 | WINDOW_SIZE=2
23 | BINARY=0
24 | NUM_THREADS=8
25 | X_MAX=10
26 | HEADLINE=1
27 |
28 | echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
29 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE
30 |
31 | echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE -distance-weighting $WEIGHT < $CORPUS > $COOCCURRENCE_FILE"
32 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE -distance-weighting $WEIGHT < $CORPUS > $COOCCURRENCE_FILE
33 |
34 | echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
35 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
36 |
37 | echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE -write-header $HEADLINE"
38 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE -write-header $HEADLINE
39 |
40 |
41 |
--------------------------------------------------------------------------------
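Since the script passes -write-header 1, glove.w2v comes out in word2vec text format; it can be sanity-checked the same way nn_preprocessing.py loads it:

from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format("glove.w2v", binary=False)
print(w2v.vector_size)   # 256, matching VECTOR_SIZE above
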
/src/rank/m2/mk_submission.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tqdm import tqdm
4 |
5 | test_recall = pd.read_feather('../../feat/te_s0_32-50.ftr')[['description_id', 'paper_id', 'corp_sim_score']]
6 |
7 | import argparse
8 |
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
11 | args = parser.parse_args()
12 | model_id = args.model_id
13 |
14 | if '_pointwise' in model_id:
15 | fold = 1
16 | test_df = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
17 | columns={'target':'target_1'})
18 |
19 | for fold in tqdm(range(2,6)):
20 | test_df_cv = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
21 | columns={'target':'target_{}'.format(fold)})
22 | test_df = test_df.merge(test_df_cv)
23 |
24 | score_cols = ['target_1', 'target_2', 'target_3', 'target_4', 'target_5']
25 | test_df['target'] = test_df[score_cols].mean(axis=1)
26 | print(test_df[score_cols+['target']].corr(method='spearman'))
27 | else:
28 | fold = 1
29 | test_df = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
30 | columns={'score':'score_1'})
31 |
32 | for fold in tqdm(range(2,6)):
33 | test_df_cv = pd.read_csv("result/{}/final_{}_fold_{}_test.csv".format(model_id, model_id, fold)).rename(
34 | columns={'score':'score_{}'.format(fold)})
35 | test_df = test_df.merge(test_df_cv)
36 |
37 | score_cols = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
38 | test_df['score'] = test_df[score_cols].mean(axis=1)
39 | print(test_df[score_cols+['score']].corr(method='spearman'))
40 |
41 |
42 | if 'target' not in test_df.columns:
43 | test_df['target'] = test_df['score'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
44 |
45 | test_df = test_recall.merge(
46 | test_df[['description_id', 'paper_id', 'target']], how='left', on=['description_id', 'paper_id'])
47 | test_df[['description_id', 'paper_id', 'target']].to_csv("final_result_m2_{}_5cv.csv".format(model_id), index=False)
48 |
49 | result = test_df.sort_values(by=['description_id', 'target', 'corp_sim_score'], na_position='first').groupby(
50 | 'description_id').tail(3)
51 |
52 | description_id_list = []
53 | paper_id_list_1 = []
54 | paper_id_list_2 = []
55 | paper_id_list_3 = []
56 | for description_id, df_tmp in tqdm(result.groupby('description_id')):
57 | description_id_list.append(description_id)
58 | paper_id_list_1.append(df_tmp.iloc[2,1])
59 | paper_id_list_2.append(df_tmp.iloc[1,1])
60 | paper_id_list_3.append(df_tmp.iloc[0,1])
61 |
62 | sub = pd.DataFrame(data={'description_id':description_id_list,
63 | 'paper_id_1': paper_id_list_1,
64 | 'paper_id_2': paper_id_list_2,
65 | 'paper_id_3': paper_id_list_3})
66 | sub.to_csv("final_{}_sub_5cv.csv".format(model_id), header=False, index=False)
67 | print("final_{}_sub_5cv.csv".format(model_id))
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
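The submission rows come from an ascending sort followed by tail(3), so iloc[2, 1] is the best paper_id and iloc[0, 1] the third; a toy check with the same column layout (hypothetical ids):

import pandas as pd

df = pd.DataFrame({'description_id': ['d1'] * 4,
                   'paper_id': ['p1', 'p2', 'p3', 'p4'],
                   'corp_sim_score': [0.1, 0.4, 0.3, 0.2],
                   'target': [0.2, 0.9, 0.7, 0.5]})
top3 = df.sort_values(by=['description_id', 'target', 'corp_sim_score'],
                      na_position='first').groupby('description_id').tail(3)
print(top3.paper_id.tolist())   # ['p4', 'p3', 'p2'] -> submission order p2, p3, p4
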
/src/rank/m2/model.py:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 | import torch
4 | import torch.nn as nn
5 | from torch.nn import functional as F
6 |
7 | import matchzoo as mz
8 | from matchzoo.engine.param_table import ParamTable
9 | from matchzoo.engine.param import Param
10 | from matchzoo.engine.base_model import BaseModel
11 | from matchzoo.modules import RNNDropout
12 | from matchzoo.modules import BidirectionalAttention
13 | from matchzoo.modules import StackedBRNN
14 |
15 |
16 | class ESIMplus(mz.models.ESIM):
17 | def set_feature_dim(self, feature_dim):
18 | self.feature_dim = feature_dim
19 |
20 | def build(self):
21 | """Instantiating layers."""
22 | rnn_mapping = {'lstm': nn.LSTM, 'gru': nn.GRU}
23 | self.embedding = self._make_default_embedding_layer()
24 | self.rnn_dropout = RNNDropout(p=self._params['dropout'])
25 | lstm_size = self._params['hidden_size']
26 | if self._params['concat_lstm']:
27 | lstm_size /= self._params['lstm_layer']
28 | self.input_encoding = StackedBRNN(
29 | self._params['embedding_output_dim'],
30 | int(lstm_size / 2),
31 | self._params['lstm_layer'],
32 | dropout_rate=self._params['dropout'],
33 | dropout_output=self._params['drop_lstm'],
34 | rnn_type=rnn_mapping[self._params['rnn_type'].lower()],
35 | concat_layers=self._params['concat_lstm'])
36 | self.attention = BidirectionalAttention()
37 | self.projection = nn.Sequential(
38 | nn.Linear(
39 | 4 * self._params['hidden_size'],
40 | self._params['hidden_size']),
41 | nn.ReLU())
42 | self.composition = StackedBRNN(
43 | self._params['hidden_size'],
44 | int(lstm_size / 2),
45 | self._params['lstm_layer'],
46 | dropout_rate=self._params['dropout'],
47 | dropout_output=self._params['drop_lstm'],
48 | rnn_type=rnn_mapping[self._params['rnn_type'].lower()],
49 | concat_layers=self._params['concat_lstm'])
50 | self.wide_net = nn.Sequential(
51 | nn.Linear(self.feature_dim, self._params['hidden_size']),
52 | nn.ReLU(),
53 | nn.Linear(self._params['hidden_size'], self._params['hidden_size']),
54 | nn.ReLU())
55 | self.classification = nn.Sequential(
56 | nn.Dropout(
57 | p=self._params['dropout']),
58 | nn.Linear(
59 | 4 * self._params['hidden_size']+self._params['hidden_size'],
60 | self._params['hidden_size']),
61 | nn.Tanh(),
62 | nn.Dropout(
63 | p=self._params['dropout']))
64 | self.out = self._make_output_layer(self._params['hidden_size'])
65 |
66 |
67 | def forward(self, inputs):
68 | """Forward."""
69 | # Scalar dimensions referenced here:
70 | # B = batch size (number of sequences)
71 | # D = embedding size
72 | # L = `input_left` sequence length
73 | # R = `input_right` sequence length
74 | # F = `feature` dim
75 | # H = hidden size
76 |
77 | # [B, L], [B, R]
78 |
79 | query, doc = inputs['text_left'].long(), inputs['text_right'].long()
80 |
81 | # [B, L]
82 | # [B, R]
83 | query_mask = (query == self._params['mask_value'])
84 | doc_mask = (doc == self._params['mask_value'])
85 |
86 | # [B, L, D]
87 | # [B, R, D]
88 | query = self.embedding(query)
89 | doc = self.embedding(doc)
90 |
91 | # [B, L, D]
92 | # [B, R, D]
93 | query = self.rnn_dropout(query)
94 | doc = self.rnn_dropout(doc)
95 |
96 | # [B, L, H]
97 | # [B, R, H]
98 | query = self.input_encoding(query, query_mask)
99 | doc = self.input_encoding(doc, doc_mask)
100 |
101 |         # [B, L, H], [B, R, H]
102 | attended_query, attended_doc = self.attention(
103 | query, query_mask, doc, doc_mask)
104 |
105 | # [B, L, 4 * H]
106 |         # [B, R, 4 * H]
107 | enhanced_query = torch.cat([query,
108 | attended_query,
109 | query - attended_query,
110 | query * attended_query],
111 | dim=-1)
112 | enhanced_doc = torch.cat([doc,
113 | attended_doc,
114 | doc - attended_doc,
115 | doc * attended_doc],
116 | dim=-1)
117 | # [B, L, H]
118 |         # [B, R, H]
119 | projected_query = self.projection(enhanced_query)
120 | projected_doc = self.projection(enhanced_doc)
121 |
122 | # [B, L, H]
123 |         # [B, R, H]
124 | query = self.composition(projected_query, query_mask)
125 | doc = self.composition(projected_doc, doc_mask)
126 |
127 | # [B, L]
128 | # [B, R]
129 | reverse_query_mask = 1. - query_mask.float()
130 | reverse_doc_mask = 1. - doc_mask.float()
131 |
132 | # [B, H]
133 | # [B, H]
134 | query_avg = torch.sum(query * reverse_query_mask.unsqueeze(2), dim=1)\
135 | / (torch.sum(reverse_query_mask, dim=1, keepdim=True) + 1e-8)
136 | doc_avg = torch.sum(doc * reverse_doc_mask.unsqueeze(2), dim=1)\
137 | / (torch.sum(reverse_doc_mask, dim=1, keepdim=True) + 1e-8)
138 |
139 | # [B, L, H]
140 |         # [B, R, H]
141 | query = query.masked_fill(query_mask.unsqueeze(2), -1e7)
142 | doc = doc.masked_fill(doc_mask.unsqueeze(2), -1e7)
143 |
144 | # [B, H]
145 | # [B, H]
146 | query_max, _ = query.max(dim=1)
147 | doc_max, _ = doc.max(dim=1)
148 |
149 | feature = inputs['feature'].float()
150 | feat_emb = self.wide_net(feature)
151 |
152 | # [B, 4 * H + H]
153 | v = torch.cat([query_avg, query_max, doc_avg, doc_max, feat_emb], dim=-1)
154 |
155 | # [B, H]
156 | hidden = self.classification(v)
157 |
158 | # [B, num_classes]
159 | out = self.out(hidden)
160 |
161 | return out
162 |
163 |
164 |
--------------------------------------------------------------------------------
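ESIMplus extends matchzoo's ESIM with a small "wide" MLP over the 30 hand-crafted features; a construction sketch mirroring how nn_5_fold_train.py wires it up:

import numpy as np
import matchzoo as mz
from model import ESIMplus

model = ESIMplus()
model.set_feature_dim(30)                        # width of inputs['feature']
model.params['task'] = mz.tasks.Ranking(losses=mz.losses.RankCrossEntropyLoss(num_neg=10))
model.params['embedding'] = np.load("data/embedding_matrix.npy")
for k, v in {'embedding_freeze': True, 'mask_value': 0,
             'lstm_layer': 2, 'hidden_size': 200, 'dropout': 0.2}.items():
    model.params[k] = v
model.build()
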
/src/rank/m2/nn_5_fold_predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
3 |
4 | import gc
5 | from tqdm import tqdm
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import torch
10 | import matchzoo as mz
11 | from model import ESIMplus
12 |
13 | from utils import MAP, build_matrix, topk_lines, predict, Logger
14 |
15 |
16 | import argparse
17 |
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
20 | args = parser.parse_args()
21 |
22 | model_id = args.model_id
23 |
24 | num_dup = 6
25 | num_neg = 10
26 | batch_size = 128
27 | add_lgb_feat = False
28 | debug = False
29 |
30 | if model_id == 'ESIMplus_001':
31 | bst_epochs = {1:0, 2:2, 3:4, 4:2, 5:1}
32 | Model = ESIMplus
33 | lr = 0.001
34 | add_lgb_feat = True
35 | params = {'embedding_freeze': True,
36 | 'mask_value': 0,
37 | 'lstm_layer': 2,
38 | 'hidden_size': 200,
39 | 'dropout': 0.2}
40 |
41 |
42 | if model_id == 'aNMM_001':
43 | bst_epochs = {1:4, 2:4, 3:3, 4:4, 5:9}
44 | Model = mz.models.aNMM
45 | lr = 0.001
46 | params = {'embedding_freeze': True,
47 | 'mask_value': 0,
48 | 'dropout_rate': 0.1}
49 |
50 | if model_id == 'ESIM_001':
51 | bst_epochs = {1:4, 2:4, 3:2, 4:2, 5:6}
52 | Model = mz.models.ESIM
53 | lr = 0.001
54 | params = {'embedding_freeze': True,
55 | 'mask_value': 0,
56 | 'lstm_layer': 2,
57 | 'hidden_size': 200,
58 | 'dropout': 0.2}
59 |
60 | if model_id == 'MatchLSTM_001':
61 | bst_epochs = {1:4, 2:2, 3:2, 4:4, 5:3}
62 | Model = mz.models.MatchLSTM
63 | lr = 0.001
64 | params = {'embedding_freeze': True,
65 | 'mask_value': 0}
66 |
67 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
68 | task = mz.tasks.Ranking(losses=losses)
69 | task.metrics = [
70 | mz.metrics.MeanAveragePrecision(),
71 | MAP()
72 | ]
73 |
74 | if model_id == 'ESIM_001_pointwise':
75 | bst_epochs = {1:4, 2:3, 3:7, 4:12, 5:5}
76 | Model = mz.models.ESIM
77 | lr = 0.001
78 | params = {'embedding_freeze': True,
79 | 'mask_value': 0,
80 | 'lstm_layer': 2,
81 | 'hidden_size': 200,
82 | 'dropout': 0.2}
83 |
84 | task = mz.tasks.Classification(num_classes=2)
85 | task.metrics = ['acc']
86 |
87 |
88 | padding_callback = Model.get_default_padding_callback()
89 | embedding_matrix = np.load("data/embedding_matrix.npy")
90 | # l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
91 | # embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
92 |
93 | test_processed = mz.data_pack.data_pack.load_data_pack("test_processed.dp")
94 | testset = mz.dataloader.Dataset(
95 | data_pack=test_processed,
96 | batch_size=batch_size,
97 | sort=False,
98 | shuffle=False
99 | )
100 |
101 | testloader = mz.dataloader.DataLoader(
102 | dataset=testset,
103 | stage='dev',
104 | callback=padding_callback
105 | )
106 |
107 |
108 |
109 | model = Model()
110 | if add_lgb_feat: model.set_feature_dim(30)
111 |
112 | model.params['task'] = task
113 | model.params['embedding'] = embedding_matrix
114 |
115 | for param in params:
116 | model.params[param] = params[param]
117 |
118 | model.build()
119 |
120 | optimizer = torch.optim.Adam(model.parameters(), lr=lr)
121 | trainer = mz.trainers.Trainer(
122 | model=model,
123 | optimizer=optimizer,
124 | trainloader=testloader,
125 | validloader=testloader,
126 | validate_interval=None,
127 | epochs=1
128 | )
129 |
130 |
131 | for fold in range(1,6):
132 | i = bst_epochs[fold]
133 | val_processed = mz.data_pack.data_pack.load_data_pack("5fold/val_processed_{}.dp".format(fold))
134 | valset = mz.dataloader.Dataset(
135 | data_pack=val_processed,
136 | batch_size=batch_size,
137 | sort=False,
138 | shuffle=False
139 | )
140 |
141 | valloader = mz.dataloader.DataLoader(
142 | dataset=valset,
143 | stage='dev',
144 | callback=padding_callback
145 | )
146 |
147 | trainer.restore_model("save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, i))
148 |
149 | score = predict(trainer, valloader)
150 | X, y = val_processed.unpack()
151 | result = pd.DataFrame(data={
152 | 'description_id': X['id_left'],
153 | 'paper_id': X['id_right'],
154 | 'score': score[:,0]})
155 | result.to_csv("result/{}/{}_fold_{}_cv.csv".format(model_id, model_id, fold), index=False)
156 |
157 | score = predict(trainer, testloader)
158 | X, y = test_processed.unpack()
159 | result = pd.DataFrame(data={
160 | 'description_id': X['id_left'],
161 | 'paper_id': X['id_right'],
162 | 'score': score[:,0]})
163 | result.to_csv("result/{}/{}_fold_{}_test.csv".format(model_id, model_id, fold), index=False)
164 |
165 |
166 |
--------------------------------------------------------------------------------
/src/rank/m2/nn_5_fold_train.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
3 |
4 | import gc
5 | from tqdm import tqdm
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import torch
10 | import matchzoo as mz
11 | from model import ESIMplus
12 |
13 | from utils import MAP, build_matrix, topk_lines, predict, Logger
14 |
15 | import argparse
16 |
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument('--model_id', type=str, default='ESIMplus_001')
19 | args = parser.parse_args()
20 | model_id = args.model_id
21 | num_dup = 6
22 | num_neg = 10
23 | batch_size = 128
24 | add_lgb_feat = False
25 | debug = False
26 |
27 | if model_id == 'ESIMplus_001':
28 | Model = ESIMplus
29 | lr = 0.001
30 | add_lgb_feat = True
31 | params = {'embedding_freeze': True,
32 | 'mask_value': 0,
33 | 'lstm_layer': 2,
34 | 'hidden_size': 200,
35 | 'dropout': 0.2}
36 |
37 |
38 | if model_id == 'aNMM_001':
39 | Model = mz.models.aNMM
40 | lr = 0.001
41 | params = {'embedding_freeze': True,
42 | 'mask_value': 0,
43 | 'dropout_rate': 0.1}
44 |
45 | if model_id == 'ESIM_001':
46 | Model = mz.models.ESIM
47 | lr = 0.001
48 | params = {'embedding_freeze': True,
49 | 'mask_value': 0,
50 | 'lstm_layer': 2,
51 | 'hidden_size': 200,
52 | 'dropout': 0.2}
53 |
54 | if model_id in ('MatchLSTM', 'MatchLSTM_001'):
55 |     model_id = 'MatchLSTM_001'
56 | Model = mz.models.MatchLSTM
57 | lr = 0.001
58 | params = {'embedding_freeze': True,
59 | 'mask_value': 0}
60 |
61 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
62 | padding_callback = Model.get_default_padding_callback()
63 | task = mz.tasks.Ranking(losses=losses)
64 | task.metrics = [
65 | mz.metrics.MeanAveragePrecision(),
66 | MAP()
67 | ]
68 |
69 | if model_id == 'ESIM_001_pointwise':
70 | Model = mz.models.ESIM
71 | lr = 0.001
72 | params = {'embedding_freeze': True,
73 | 'mask_value': 0,
74 | 'lstm_layer': 2,
75 | 'hidden_size': 200,
76 | 'dropout': 0.2}
77 |
78 | task = mz.tasks.Classification(num_classes=2)
79 | task.metrics = ['acc']
80 |
81 | embedding_matrix = np.load("data/embedding_matrix.npy")
82 |
83 |
84 | if not os.path.exists('result/{}'.format(model_id)):
85 | os.makedirs('result/{}'.format(model_id))
86 |
87 | with Logger(log_filename = '{}.log'.format(model_id)):
88 | for fold in range(1,5):
89 | print("="*10+" fold: "+str(fold)+" data_processed prepare "+"="*10)
90 | train_processed = mz.data_pack.data_pack.load_data_pack("5fold/train_processed_{}.dp".format(fold))
91 | val_processed = mz.data_pack.data_pack.load_data_pack("5fold/val_processed_{}.dp".format(fold))
92 |
93 | if model_id == 'ESIM_001_pointwise':
94 | train_processed.relation.label = train_processed.relation.label.astype(np.long)
95 | val_processed.relation.label = val_processed.relation.label.astype(np.long)
96 |
97 |
98 | print("="*10+" fold: "+str(fold)+" dataset prepare "+"="*10)
99 | trainset = mz.dataloader.Dataset(
100 | data_pack=train_processed,
101 | mode='pair',
102 | num_dup=num_dup,
103 | num_neg=num_neg,
104 | batch_size=batch_size,
105 | resample=True,
106 | sort=False,
107 | shuffle=True
108 | )
109 | valset = mz.dataloader.Dataset(
110 | data_pack=val_processed,
111 | batch_size=batch_size,
112 | sort=False,
113 | shuffle=False
114 | )
115 |
116 | print("="*10+" fold: "+str(fold)+" dataloader prepare "+"="*10)
117 | trainloader = mz.dataloader.DataLoader(
118 | dataset=trainset,
119 | stage='train',
120 | callback=padding_callback
121 | )
122 | valloader = mz.dataloader.DataLoader(
123 | dataset=valset,
124 | stage='dev',
125 | callback=padding_callback
126 | )
127 |
128 | print("="*10+" fold: "+str(fold)+" model build "+"="*10)
129 | model = Model()
130 | if add_lgb_feat: model.set_feature_dim(30)
131 |
132 | model.params['task'] = task
133 | model.params['embedding'] = embedding_matrix
134 |
135 | for param in params:
136 | model.params[param] = params[param]
137 |
138 | model.build()
139 | if debug: print(model)
140 |
141 | print("="*10+" fold: "+str(fold)+" trainers build "+"="*10)
142 | optimizer = torch.optim.Adam(model.parameters(), lr=lr)
143 |
144 | trainer = mz.trainers.Trainer(
145 | model=model,
146 | optimizer=optimizer,
147 | trainloader=trainloader,
148 | validloader=valloader,
149 | validate_interval=None,
150 | epochs=1
151 | )
152 |
153 | print("="*10+" fold: "+str(fold)+" training "+"="*10)
154 | trainer.restore_model("save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, 1))
155 | for i in range(2,6):
156 | trainer._model.embedding.requires_grad_(requires_grad=False)
157 | print("="*10+" fold: "+str(fold)+" epoch: "+str(i)+" "+"="*10)
158 | trainer.run()
159 | trainer.save_model()
160 | os.rename("save/model.pt", "save/{}_fold_{}_epoch_{}.pt".format(model_id, fold, i))
161 |
162 |
163 |
--------------------------------------------------------------------------------
/src/rank/m2/nn_preprocessing.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gc
3 | from tqdm import tqdm
4 | import numpy as np
5 | import pandas as pd
6 |
7 | import torch
8 | import matchzoo as mz
9 | from model import ESIMplus
10 |
11 | from gensim.models import KeyedVectors
12 | from utils import MAP, build_matrix, topk_lines, predict
13 |
14 | pd.set_option('display.max_columns', None)
15 | pd.set_option('display.max_rows', 200)
16 | pd.set_option('max_colwidth',400)
17 |
18 |
19 | num_neg = 10
20 | fit_preprocessor = True
21 | losses = mz.losses.RankCrossEntropyLoss(num_neg=num_neg)
22 | feature = [
23 | 'quer_key_tfidf_corp_cos_dis',
24 | 'quer_key_tfidf_corp_eucl_dis',
25 | 'quer_key_corp_bm25_score',
26 | 'corp_sim_score',
27 | 'quer_all_tfidf_corp_eucl_dis',
28 | 'quer_all_corp_bm25_score',
29 | 'quer_key_tfidf_titl_manh_dis',
30 | 'quer_all_titl_bm25_score',
31 | 'quer_all_tfidf_corp_cos_dis',
32 | 'jaccard_coef_of_unigram_between_corp_quer_key',
33 | 'ratio_of_unique_corp_unigram',
34 | 'jaccard_coef_of_unigram_between_corp_quer_all',
35 | 'jaccard_coef_of_unigram_between_titl_quer_key',
36 | 'quer_key_tfidf_titl_cos_dis',
37 | 'jaccard_coef_of_unigram_between_abst_quer_key',
38 | 'quer_key_abst_bm25_score',
39 | 'quer_all_tfidf_titl_cos_dis',
40 | 'quer_key_tfidf_titl_eucl_dis',
41 | 'count_of_quer_key_unigram',
42 | 'quer_all_tfidf_titl_eucl_dis',
43 | 'ratio_of_unique_quer_all_unigram',
44 | 'quer_key_tfidf_abst_cos_dis',
45 | 'count_of_unique_corp_unigram',
46 | 'ratio_of_unique_abst_unigram',
47 | 'normalized_pos_of_corp_unigram_in_quer_all_max',
48 | 'quer_all_abst_bm25_score',
49 | 'normalized_pos_of_titl_unigram_in_quer_all_std',
50 | 'quer_all_tfidf_titl_manh_dis',
51 | 'jaccard_coef_of_unigram_between_abst_quer_all',
52 | 'dice_dist_of_unigram_between_corp_quer_key']
53 |
54 | task = mz.tasks.Ranking(losses=losses)
55 | task.metrics = [
56 | mz.metrics.MeanAveragePrecision(),
57 | MAP()
58 | ]
59 | print("task is", task)
60 | print("`task` initialized with metrics", task.metrics)
61 |
62 | if fit_preprocessor:
63 |     # NOTE: all_data_raw is a matchzoo DataPack over the combined train/test recall text; it is assumed to exist here (it is not built in this file)
64 | preprocessor = mz.models.ESIM.get_default_preprocessor(
65 | truncated_mode='pre',
66 | truncated_length_left=64,
67 | truncated_length_right=256,
68 | filter_mode='df',
69 | filter_low_freq=2)
70 |
71 | preprocessor = preprocessor.fit(all_data_raw)
72 | preprocessor.save("preprocessor.prep")
73 | else:
74 | preprocessor = mz.load_preprocessor("preprocessor.prep")
75 |
76 |
77 | candidate_dic = pd.read_feather('data/candidate_dic.ftr')
78 |
79 | train_recall = pd.read_feather('data/train_recall.ftr')
80 | train_description = pd.read_feather('data/train_description.ftr')
81 | train_recall = pd.merge(train_recall, train_description, how='left', on='id_left')
82 | train_recall = pd.merge(train_recall, candidate_dic, how='left', on='id_right')
83 | train_recall = train_recall.drop_duplicates().reset_index(drop=True)
84 | del train_description
85 | gc.collect()
86 |
87 |
88 | test_recall = pd.read_feather('data/test_recall.ftr')
89 | test_description = pd.read_feather('data/test_description.ftr')
90 | test_recall = pd.merge(test_recall, test_description, how='left', on='id_left')
91 | test_recall = pd.merge(test_recall, candidate_dic, how='left', on='id_right')
92 | del test_description, candidate_dic
93 | gc.collect()
94 |
95 | all_data_df = train_recall.copy()
96 | all_data_df.id_left = all_data_df.id_left+'_tr'
97 | all_data_df = pd.concat([all_data_df, test_recall]).reset_index(drop=True)
98 | norm_df = all_data_df[feature].quantile(q=0.99)
99 |
100 | del all_data_df  # keep train_recall / test_recall; they are rescaled and used below
101 | gc.collect()
102 |
103 | train_recall[feature] = train_recall[feature]/norm_df
104 | train_recall['feature'] = list(train_recall[feature].values)
105 | train_recall = train_recall[['id_left', 'text_left', 'id_right', 'text_right', 'label', 'feature']]
106 | cv_ids = pd.read_csv("../../input/cv_ids_0109.csv")
107 | train_recall = train_recall.merge(
108 | cv_ids.rename(columns={'description_id': 'id_left'}),
109 | how='left',
110 | on='id_left').fillna(5.0)
111 |
112 |
113 | for i in range(1,6):
114 | print("="*20, i, "="*20)
115 | train_df = train_recall[train_recall.cv!=i][
116 | ['id_left', 'text_left', 'id_right', 'text_right', 'label', 'feature']].reset_index(drop=True)
117 | val_df = train_recall[train_recall.cv==i][
118 | ['id_left', 'text_left', 'id_right', 'text_right', 'label', 'feature']].reset_index(drop=True)
119 |
120 | train_raw = mz.pack(train_df, task)
121 | val_raw = mz.pack(val_df, task)
122 |
123 | train_processed = preprocessor.transform(train_raw)
124 | val_processed = preprocessor.transform(val_raw)
125 |
126 | train_processed.save("5fold/train_processed_{}.dp".format(i))
127 | val_processed.save("5fold/val_processed_{}.dp".format(i))
128 |
129 |
130 | test_recall[feature] = test_recall[feature]/norm_df
131 | test_recall['feature'] = list(test_recall[feature].values)
132 | test_recall = test_recall[['id_left', 'text_left', 'id_right', 'text_right', 'feature']]
133 |
134 | test_raw = mz.pack(test_recall, task)
135 | test_processed = preprocessor.transform(test_raw)
136 | # test_processed.save("test_processed.dp")
137 | test_processed.save("final_test_processed.dp")
138 |
139 |
140 | from gensim.models import KeyedVectors
141 | w2v_path = "data/glove.w2v"
142 | w2v_model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
143 | term_index = preprocessor.context['vocab_unit'].state['term_index']
144 | embedding_matrix = build_matrix(term_index, w2v_model)
145 | del w2v_model, term_index
146 | gc.collect()
147 | np.save("data/embedding_matrix.npy", embedding_matrix)
148 |
149 |
150 |
--------------------------------------------------------------------------------
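The hand-crafted features are only rescaled by their 99th percentile over the combined train/test recall before being packed into a single list-valued column; a toy version of that step (two of the feature columns above):

import pandas as pd

feats = pd.DataFrame({'corp_sim_score': [0.1, 0.5, 2.0, 10.0],
                      'quer_key_corp_bm25_score': [1.0, 3.0, 5.0, 50.0]})
norm = feats.quantile(q=0.99)            # per-column 99th percentile
scaled = feats / norm                    # most values land roughly in [0, 1]
scaled['feature'] = list(scaled.values)  # one vector per candidate row, fed to the wide_net
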
/src/rank/m2/preprocessing.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import numpy as np
3 | import pandas as pd
4 | import feather
5 |
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--query_type', type=str, default='query_key')
10 | args = parser.parse_args()
11 |
12 | query_type = args.query_type
13 |
14 | def topk_lines(df, k):
15 | print(df.shape)
16 | df.loc[:, 'rank'] = df.groupby(['description_id', 'type']).cumcount().values
17 | df = df[df['rank'] < k]
18 | df.drop(['rank'], axis=1, inplace=True)
19 | print(df.shape)
20 | return df
21 |
22 |
23 | ## preprocess
24 | candidate_dic = feather.read_dataframe('../../../input/paper_input_final.ftr')
25 |
26 | candidate_dic.loc[candidate_dic['keywords'].isna(),'keywords'] = ''
27 | candidate_dic.loc[candidate_dic['titl'].isna(),'titl'] = ''
28 | candidate_dic.loc[candidate_dic['abst'].isna(),'abst'] = ''
29 |
30 | candidate_dic['text_right'] = candidate_dic['abst'].str.cat(
31 | candidate_dic['keywords'], sep=' ').str.cat(
32 | candidate_dic['titl'], sep=' ')
33 |
34 | candidate_dic = candidate_dic.rename(columns={'paper_id': 'id_right'})[['id_right', 'text_right']]
35 | candidate_dic.to_feather('data/candidate_dic.ftr')
36 |
37 | train_description = feather.read_dataframe('../../../input/tr_input_final.ftr')
38 |
39 | train_description = train_description.rename(
40 | columns={'description_id': 'id_left', query_type: 'text_left'})
41 | train_description[['id_left', 'text_left']].to_feather('data/train_description_{}.ftr'.format(query_type))
42 |
43 |
44 | test_description = feather.read_dataframe('../../../input/te_input_final.ftr')
45 |
46 | test_description = test_description.rename(
47 | columns={'description_id': 'id_left', query_type: 'text_left'})
48 |
49 | test_description[['id_left', 'text_left']].to_feather('data/test_description_{}.ftr'.format(query_type))
50 |
51 | train_recall = feather.read_dataframe('../../../feat/tr_s0_32-50.ftr')
52 |
53 | ## recall
54 | train_recall = train_recall.rename(
55 | columns={'description_id': 'id_left', 'paper_id': 'id_right', 'target': 'label'})
56 |
57 | train_recall = train_recall[train_recall.id_left.isin(train_description.id_left.values)].reset_index(drop=True)
58 | train_recall = train_recall.drop_duplicates()
59 | train_recall = train_recall.fillna(0)
60 | train_recall.to_feather('data/train_recall.ftr')
61 |
62 | test_recall = feather.read_dataframe('../../../feat/te_s0_32-50.ftr')
63 | test_recall = test_recall.reset_index(drop=True)
64 |
65 | test_recall = test_recall.rename(
66 | columns={'description_id': 'id_left',
67 | 'paper_id': 'id_right',
68 | 'target': 'label'})
69 |
70 | # test_recall[['id_left', 'id_right', 'label']].to_feather('data/test_recall.ftr')
71 | test_recall[['id_left', 'id_right', 'label']].to_feather('data/final_test_recall.ftr')
72 |
73 |
74 | ## corpus
75 | if query_type== 'query_key':
76 | candidate_dic = feather.read_dataframe('data/candidate_dic.ftr')
77 | train_description = feather.read_dataframe('data/train_description.ftr')
78 | test_description = feather.read_dataframe('data/test_description.ftr')
79 |
80 | with open('data/corpus.txt','a') as fid:
81 | for sent in tqdm(candidate_dic['text_right']):
82 | if type(sent)==str:
83 | fid.write(sent+'\n')
84 | for sent in tqdm(train_description['text_left']):
85 | if type(sent)==str:
86 | fid.write(sent+'\n')
87 | for sent in tqdm(test_description['text_left']):
88 | if type(sent)==str:
89 | fid.write(sent+'\n')
90 |
91 |
--------------------------------------------------------------------------------
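text_right is simply abstract, keywords and title joined with single spaces; a quick check of the str.cat chain used above:

import pandas as pd

row = pd.DataFrame({'abst': ['neural text matching'], 'keywords': ['ranking'], 'titl': ['esim']})
text_right = row['abst'].str.cat(row['keywords'], sep=' ').str.cat(row['titl'], sep=' ')
print(text_right.iloc[0])   # "neural text matching ranking esim"
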
/src/rank/m2/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | python3 preprocessing.py --query_type quer_key
4 | python3 preprocessing.py --query_type quer_all
5 |
6 | git clone https://github.com/stanfordnlp/glove
7 | cp gen_w2v.sh glove/
8 | cp data/corpus.txt glove/
9 | cd glove && make
10 | . gen_w2v.sh
11 | cd ..
12 | cp glove/glove.w2v data/
13 |
14 | python3 nn_preprocessing.py
15 | python3 bert_preprocessing.py --preprocessing_type fine --left_truncated_length 64 --query_type query_key
16 | python3 bert_preprocessing.py --preprocessing_type fine --left_truncated_length 200 --query_type query_all
17 | python3 bert_preprocessing.py --preprocessing_type coarse --left_truncated_length 200 --query_type query_all
18 |
19 | python3 nn_5_fold_train.py --model_id ESIM_001
20 | python3 nn_5_fold_train.py --model_id ESIMplus_001
21 | python3 nn_5_fold_train.py --model_id aNMM_001
22 | python3 nn_5_fold_train.py --model_id MatchLSTM_001
23 | python3 nn_5_fold_train.py --model_id ESIM_001_pointwise
24 |
25 | python3 bert_5_fold_train.py --model_id bert_002
26 | python3 bert_5_fold_train.py --model_id bert_003
27 | python3 bert_5_fold_train.py --model_id bert_004
28 |
29 | python3 nn_5_fold_predict.py --model_id ESIM_001
30 | python3 nn_5_fold_predict.py --model_id ESIMplus_001
31 | python3 nn_5_fold_predict.py --model_id aNMM_001
32 | python3 nn_5_fold_predict.py --model_id MatchLSTM_001
33 | python3 nn_5_fold_predict.py --model_id ESIM_001_pointwise
34 |
35 | python3 bert_5_fold_predict.py --model_id bert_002
36 | python3 bert_5_fold_predict.py --model_id bert_003
37 | python3 bert_5_fold_predict.py --model_id bert_004
38 |
39 | python3 fold_result_integration.py --model_id ESIM_001
40 | python3 fold_result_integration.py --model_id ESIMplus_001
41 | python3 fold_result_integration.py --model_id aNMM_001
42 | python3 fold_result_integration.py --model_id MatchLSTM_001
43 | python3 fold_result_integration.py --model_id ESIM_001_pointwise
44 | python3 fold_result_integration.py --model_id bert_002
45 | python3 fold_result_integration.py --model_id bert_003
46 | python3 fold_result_integration.py --model_id bert_004
47 |
48 | python3 mk_submission.py --model_id ESIM_001
49 | python3 mk_submission.py --model_id ESIMplus_001
50 | python3 mk_submission.py --model_id aNMM_001
51 | python3 mk_submission.py --model_id MatchLSTM_001
52 | python3 mk_submission.py --model_id ESIM_001_pointwise
53 | python3 mk_submission.py --model_id bert_002
54 | python3 mk_submission.py --model_id bert_003
55 | python3 mk_submission.py --model_id bert_004
56 |
57 | python3 change_formatting4stk.py --model_id ESIM_001
58 | python3 change_formatting4stk.py --model_id ESIMplus_001
59 | python3 change_formatting4stk.py --model_id aNMM_001
60 | python3 change_formatting4stk.py --model_id MatchLSTM_001
61 | python3 change_formatting4stk.py --model_id ESIM_001_pointwise
62 | python3 change_formatting4stk.py --model_id bert_002
63 | python3 change_formatting4stk.py --model_id bert_003
64 | python3 change_formatting4stk.py --model_id bert_004
65 |
66 | ###### finally #####
67 | python3 final_blend.py
68 |
69 |
--------------------------------------------------------------------------------
/src/rank/m2/utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import numpy as np
4 | from tqdm import tqdm
5 | import torch
6 | from matchzoo.engine.base_metric import sort_and_couple, RankingMetric
7 |
8 |
9 | def build_matrix(term_index, gv_model, dim=256):
10 |
11 | input_dim = len(term_index)
12 | matrix = np.empty((input_dim, dim))
13 |
14 | valid_keys = gv_model.vocab.keys()
15 | for term, index in term_index.items():
16 | if term in valid_keys:
17 | matrix[index] = gv_model.word_vec(term)
18 | else:
19 | if '' in gv_model.vocab.keys():
20 | matrix[index] = gv_model.word_vec("")
21 | else:
22 | matrix[index] = np.random.randn(dim).astype(dtype=np.float32)
23 | return matrix
24 |
25 | def topk_lines(df, k):
26 | print(df.shape)
27 | df.loc[:, 'rank'] = df.groupby(['description_id', 'type']).cumcount().values
28 | df = df[df['rank'] < k]
29 | df.drop(['rank'], axis=1, inplace=True)
30 | print(df.shape)
31 | return df
32 |
33 |
34 | class MAP(RankingMetric):
35 |
36 | def __init__(self, k = 3):
37 | self._k = k
38 |
39 | def __repr__(self) -> str:
40 | return 'mean_average_precision@{}'.format(self._k)
41 |
42 | def __call__(self, y_true, y_pred):
43 | coupled_pair = sort_and_couple(y_true, y_pred)
44 | for idx, (label, pred) in enumerate(coupled_pair):
45 | if idx+1>self._k:
46 | return 0
47 | if label > 0:
48 | return 1. / (idx + 1)
49 | return 0.
50 |
51 |
52 | def predict(trainer, testloader):
53 | with torch.no_grad():
54 | trainer._model.eval()
55 | predictions = []
56 | for batch in tqdm(testloader):
57 | inputs = batch[0]
58 | outputs = trainer._model(inputs).detach().cpu()
59 | predictions.append(outputs)
60 | trainer._model.train()
61 |
62 | return torch.cat(predictions, dim=0).numpy()
63 |
64 |
65 | class Logger:
66 | def __init__(self, log_filename="log.txt"):
67 | self.terminal = sys.stdout
68 | self.log = open(log_filename, "a")
69 | self.log.write("="*10+" Start Time:"+time.ctime()+" "+"="*10+"\n")
70 |
71 | def __enter__(self):
72 | sys.stdout = self
73 |
74 | def __exit__(self, e_t, e_v, t_b):
75 | sys.stdout = self.close()
76 |
77 | def stop_log(self):
78 | sys.stdout = self.close()
79 |
80 | def write(self, message):
81 | self.terminal.write(message)
82 | if message=="\n":
83 | self.log.write(message)
84 | else:
85 | self.log.write("["+time.ctime()+"]: "+message)
86 |
87 | def flush(self):
88 | self.terminal.flush()
89 | self.log.flush()
90 |
91 | def close(self):
92 |         self.log.write("="*10+" End Time:"+time.ctime()+" "+"="*10+"\n")
93 | self.log.close()
94 | return self.terminal
95 |
96 |
--------------------------------------------------------------------------------
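With a single relevant candidate per query, the MAP metric above returns the reciprocal rank if the positive lands in the top k, else 0; a quick check (assuming matchzoo's sort_and_couple sorts by prediction, descending):

import numpy as np
from utils import MAP

y_true = np.array([0, 1, 0, 0])
y_pred = np.array([0.9, 0.8, 0.2, 0.1])
print(MAP(k=3)(y_true, y_pred))   # 0.5: the positive is ranked 2nd
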
/src/rank/m3/convert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 |
17 | # Custom utilities
18 | sys.path.append('../../../tools/')
19 | import loader
20 |
21 | # Set random seed
22 | SEED = 2020
23 | np.random.seed (SEED)
24 |
25 | def val_convert(df_path, pred_path, out_path):
26 | tr_data = loader.load_df(df_path)
27 | df_pred = loader.load_df(pred_path)
28 |
29 | sort_df_pred = df_pred.sort_values(['description_id', 'target'], ascending=False)
30 | df_pred = df_pred[['description_id']].drop_duplicates() \
31 | .merge(sort_df_pred, on=['description_id'], how='left')
32 | df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
33 | df_pred = df_pred[df_pred['rank'] < 3]
34 | df_pred = df_pred.groupby(['description_id'])['paper_id'] \
35 | .apply(lambda s : ','.join((s))).reset_index()
36 |
37 | tr_data = tr_data[['description_id', 'paper_id']].rename(columns={'paper_id': 'target_id'})
38 | df_pred = df_pred.merge(tr_data, on=['description_id'], how='left')
39 | loader.save_df(df_pred, out_path)
40 |
41 | def output(df, out_path):
42 | fo = open(out_path, 'w')
43 | for i in range(df.shape[0]):
44 | desc_id = df.iloc[i]['description_id']
45 | paper_ids = df.iloc[i]['paper_id']
46 | print (desc_id + ',' + paper_ids, file=fo)
47 | fo.close()
48 |
49 | def sub_convert(df_path, pred_path, out_path1, out_path2):
50 | te_data = loader.load_df(df_path)
51 | df_pred = loader.load_df(pred_path)
52 |
53 | sort_df_pred = df_pred.sort_values(['description_id', 'target'], ascending=False)
54 | df_pred = df_pred[['description_id']].drop_duplicates() \
55 | .merge(sort_df_pred, on=['description_id'], how='left')
56 | df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
57 | df_pred = df_pred[df_pred['rank'] < 3]
58 | df_pred = df_pred.groupby(['description_id'])['paper_id'] \
59 | .apply(lambda s : ','.join((s))).reset_index()
60 |
61 | df_pred = te_data[['description_id']].merge(df_pred, on=['description_id'], how='left')
62 | loader.save_df(df_pred, out_path1)
63 | #output(df_pred, out_path2)
64 |
65 | if __name__ == "__main__":
66 |
67 | print('start time: %s' % datetime.now())
68 | root_path = '../../../feat/'
69 | base_tr_path = '../../../input/train_release.csv'
70 | base_te_path = '../../../input/test.csv'
71 |
72 | sub_file_path = sys.argv[1]
73 | sub_name = sys.argv[2]
74 |
75 | val_path = '{}/{}_cv.ftr'.format(sub_file_path, sub_name)
76 | val_out_path = '{}/r_{}_cv.csv'.format(sub_file_path, sub_name)
77 | val_convert(base_tr_path, val_path, val_out_path)
78 |
79 | sub_path = '{}/{}.ftr'.format(sub_file_path, sub_name)
80 | sub_out_pathA = '{}/r_{}.csv'.format(sub_file_path, sub_name)
81 | sub_out_pathB = '{}/s_{}.csv'.format(sub_file_path, sub_name)
82 | sub_out_pathA2 = '{}/r2_{}.csv'.format(sub_file_path, sub_name)
83 | sub_out_pathB2 = '{}/s2_{}.csv'.format(sub_file_path, sub_name)
84 | sub_convert(base_te_path, sub_path, sub_out_pathA, sub_out_pathB)
85 |
86 | print('all completed: %s' % datetime.now())
87 |
88 |
89 |
--------------------------------------------------------------------------------
/src/rank/m3/eval.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 |
17 | # Custom utilities
18 | sys.path.append('../../../tools/')
19 | import loader
20 |
21 | # Open-source packages
22 | import ml_metrics as metrics
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | def calc_map(df, k):
29 | df.rename(columns={'paper_id': 'paper_ids'}, inplace=True)
30 | df['paper_ids'] = df['paper_ids'].apply(lambda s: s.split(','))
31 | df['target_id'] = df['target_id'].apply(lambda s: [s])
32 | return metrics.mapk(df['target_id'].tolist(), df['paper_ids'].tolist(), k)
33 |
34 | if __name__ == "__main__":
35 |
36 | print('start time: %s' % datetime.now())
37 | in_path = sys.argv[1]
38 | df = loader.load_df(in_path)
39 | mapk = calc_map(df, k=3)
40 | print ('{} {}'.format(df.shape, round(mapk, 5)))
41 | print('all completed: %s' % datetime.now())
42 |
43 |
--------------------------------------------------------------------------------
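eval.py passes one true paper per description to ml_metrics.mapk, so MAP@3 here is effectively the mean reciprocal rank within the top 3; a toy check:

import ml_metrics as metrics

actual = [['p2'], ['p9']]
predicted = [['p1', 'p2', 'p3'],   # true paper ranked 2nd -> 1/2
             ['p4', 'p5', 'p6']]   # true paper missing    -> 0
print(metrics.mapk(actual, predicted, k=3))   # (0.5 + 0.0) / 2 = 0.25
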
/src/rank/m3/flow.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import time
8 |
9 | ts = time.time()
10 |
11 | num = sys.argv[1]
12 |
13 | sub_file_path = '../../../output/m3/lgb_m3_{}'.format(num)
14 | sub_name = 'lgb_m3_{}'.format(num)
15 |
16 | # lgb train
17 | print ('lgb_train-%s.py %s' % (num, num))
18 | os.system('python3 -u lgb_train_%s.py %s' % (num, num))
19 |
20 | # merge cv & sub
21 | print('\nkfold_merge')
22 | os.system('python3 -u kfold_merge.py %s %s' % (sub_file_path, sub_name))
23 |
24 | # convert cv & sub to list format
25 | print ('\nconvert')
26 | os.system('python3 -u convert.py %s %s' % (sub_file_path, sub_name))
27 |
28 | # evaluate map@3 on the merged CV predictions
29 | print ('\neval')
30 | os.system('python3 -u eval.py %s' % ('{}/r_{}_cv.csv'.format(sub_file_path, sub_name)))
31 |
32 | print ('all completed, cost {}s'.format(time.time() - ts))
33 |
--------------------------------------------------------------------------------
/src/rank/m3/kfold_merge.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 |
23 | # Set random seed
24 | SEED = 2020
25 | np.random.seed (SEED)
26 |
27 | TARGET_NAME = 'target'
28 | FOLD_NUM = 5
29 |
30 | def merge_val(file_path, sub_name, fold_num):
31 | file_list = os.listdir(file_path)
32 |
33 | paths = ['{}_cv_{}.csv'.format(sub_name, i) for i in range(1, fold_num + 1)]
34 | print (paths)
35 |
36 | dfs = []
37 | for path in paths:
38 | assert path in file_list, '{} not exist'.format(path)
39 | path = '{}/{}'.format(file_path, path)
40 | dfs.append(loader.load_df(path))
41 |
42 | df = pd.concat(dfs)
43 | print (df.head())
44 | print (df.describe())
45 | out_path = '{}/{}_cv.ftr'.format(file_path, sub_name)
46 | loader.save_df(df, out_path)
47 |
48 | def merge_sub(file_path, sub_name, fold_num):
49 | file_list = os.listdir(file_path)
50 |
51 | paths = ['{}_{}.csv'.format(sub_name, i) for i in range(1, fold_num + 1)]
52 | print (paths)
53 |
54 | df = pd.DataFrame()
55 | for i, path in enumerate(paths):
56 | assert path in file_list, '{} not exist'.format(path)
57 | path = '{}/{}'.format(file_path, path)
58 | if i == 0:
59 | df = loader.load_df(path)
60 | else:
61 | df[TARGET_NAME] += loader.load_df(path)[TARGET_NAME]
62 |
63 | df[TARGET_NAME] /= fold_num
64 | print (df.head())
65 | print (df.describe())
66 | out_path = '{}/{}.ftr'.format(file_path, sub_name)
67 | loader.save_df(df, out_path)
68 |
69 |
70 | if __name__ == '__main__':
71 |
72 | sub_file_path = sys.argv[1]
73 | sub_name = sys.argv[2]
74 |
75 | merge_val(sub_file_path, sub_name, FOLD_NUM)
76 | merge_sub(sub_file_path, sub_name, FOLD_NUM)
77 |
78 |
79 |
--------------------------------------------------------------------------------
/src/rank/m3/lgb_train_32-50-0.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '32-50'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | #if sub_col in col and col != 'year':
57 | if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[tr['journal'] != 'no-content'][~pd.isnull(tr['year'])]['description_id'].tolist()
131 | #train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
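lgbLearner lives under tools/ and is not reproduced here; assuming each fold reduces to a plain LightGBM binary classifier driven by the params above, one fold would look roughly like this sketch (tr / va stand for a single CV split of train_data):

import lightgbm as lgb

dtrain = lgb.Dataset(tr[fea_col_names], label=tr[TARGET_NAME])
dvalid = lgb.Dataset(va[fea_col_names], label=va[TARGET_NAME], reference=dtrain)
booster = lgb.train(params, dtrain, num_boost_round=2000, valid_sets=[dvalid])
va_pred = booster.predict(va[fea_col_names], num_iteration=booster.best_iteration)
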
/src/rank/m3/lgb_train_37-0.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '37'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | #if sub_col in col and col != 'year':
57 | if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[tr['journal'] != 'no-content'][~pd.isnull(tr['year'])]['description_id'].tolist()
131 | #train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/src/rank/m3/lgb_train_38-0.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Standard library modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '38'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | #if sub_col in col and col != 'year':
57 | if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[tr['journal'] != 'no-content'][~pd.isnull(tr['year'])]['description_id'].tolist()
131 | #train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/src/rank/m3/lgb_train_38-1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Basic modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '38'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | if sub_col in col and col != 'year':
57 | #if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[(tr['journal'] != 'no-content') & (~pd.isnull(tr['year']))]['description_id'].tolist()
131 | train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/src/rank/m3/lgb_train_40-0.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Basic modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | sys.path.append('../../../tools/')
21 | import loader
22 | from lgb_learner import lgbLearner
23 |
24 | # Set random seed
25 | SEED = 2020
26 | np.random.seed (SEED)
27 |
28 | FEA_NUM = sys.argv[1]
29 | FEA_NUM = '40'
30 |
31 | fold_num = 5
32 | out_name = 'lgb_m3_{}-0'.format(FEA_NUM)
33 | root_path = '../../../output/m3/' + out_name + '/'
34 |
35 | ID_NAMES = ['description_id', 'paper_id']
36 | TARGET_NAME = 'target'
37 |
38 | TASK_TYPE = 'te'
39 | #TASK_TYPE = 'tr'
40 | #TASK_TYPE = 'pe'
41 |
42 | if not os.path.exists(root_path):
43 | os.mkdir(root_path)
44 | print ('create dir succ {}'.format(root_path))
45 |
46 | def sum_score(x, y):
47 | return max(x, 0) + max(y, 0)
48 |
49 | def get_feas(data):
50 |
51 | cols = data.columns.tolist()
52 | del_cols = ID_NAMES + ['target', 'cv']
53 | sub_cols = ['year']
54 | for col in data.columns:
55 | for sub_col in sub_cols:
56 | #if sub_col in col and col != 'year':
57 | if sub_col in col:
58 | del_cols.append(col)
59 |
60 | cols = [val for val in cols if val not in del_cols]
61 | print ('del_cols', del_cols)
62 | return cols
63 |
64 | def lgb_train(train_data, test_data, fea_col_names, seed=SEED, cv_index=0):
65 | params = {
66 | "objective": "binary",
67 | "boosting_type": "gbdt",
68 | #"metric": ['binary_logloss'],
69 | "metric": ['auc'],
70 | "boost_from_average": False,
71 | "learning_rate": 0.03,
72 | "num_leaves": 32,
73 | "max_depth": -1,
74 | "feature_fraction": 0.7,
75 | "bagging_fraction": 0.7,
76 | "bagging_freq": 2,
77 | "lambda_l1": 0,
78 | "lambda_l2": 0,
79 | "seed": seed,
80 | 'min_child_weight': 0.005,
81 | 'min_data_in_leaf': 50,
82 | 'max_bin': 255,
83 | "num_threads": 16,
84 | "verbose": -1,
85 | "early_stopping_round": 50
86 | }
87 | params['learning_rate'] = 0.03
88 | num_trees = 2000
89 | print ('training params:', num_trees, params)
90 |
91 | lgb_learner = lgbLearner(train_data, test_data, \
92 | fea_col_names, ID_NAMES, TARGET_NAME, \
93 | params, num_trees, fold_num, out_name, \
94 | metric_names=['auc', 'logloss'], \
95 | model_postfix='')
96 | predicted_folds = [1,2,3,4,5]
97 |
98 | if TASK_TYPE == 'te':
99 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
100 | predicted_folds=predicted_folds, need_predict_test=True)
101 | elif TASK_TYPE == 'tr':
102 | lgb_learner.multi_fold_train(lgb_learner.train_data, \
103 | predicted_folds=predicted_folds, need_predict_test=False)
104 | elif TASK_TYPE == 'pe':
105 | lgb_learner.multi_fold_predict(lgb_learner.train_data, \
106 | predicted_folds=predicted_folds, need_predict_test=False)
107 |
108 | if __name__ == '__main__':
109 |
110 | ################## params ####################
111 | print("Load the training, test and store data using pandas")
112 | ts = time.time()
113 | root_path = '../../../feat/'
114 | postfix = 's0_{}'.format(FEA_NUM)
115 | file_type = 'ftr'
116 |
117 | train_path = root_path + 'tr_{}.{}'.format(postfix, file_type)
118 | test_path = root_path + 'te_{}.{}'.format('s0_4', file_type)
119 | if TASK_TYPE in ['te', 'pe']:
120 | test_path = root_path + 'te_{}.{}'.format(postfix, file_type)
121 |
122 | print ('tr path', train_path)
123 | print ('te path', test_path)
124 | train_data = loader.load_df(train_path)
125 | test_data = loader.load_df(test_path)
126 |
127 | paper = loader.load_df('../../../input/candidate_paper_for_wsdm2020.ftr')
128 | tr = loader.load_df('../../../input/tr_input_final.ftr')
129 | tr = tr.merge(paper[['paper_id', 'journal', 'year']], on=['paper_id'], how='left')
130 | desc_list = tr[(tr['journal'] != 'no-content') & (~pd.isnull(tr['year']))]['description_id'].tolist()
131 | train_data = train_data[train_data['description_id'].isin(desc_list)]
132 |
133 | print (train_data.columns)
134 | print (train_data.shape, test_data.shape)
135 |
136 | fea_col_names = get_feas(train_data)
137 | print (len(fea_col_names), fea_col_names)
138 |
139 | required_cols = ID_NAMES + ['cv', 'target']
140 | drop_cols = [col for col in train_data.columns \
141 | if col not in fea_col_names and col not in required_cols]
142 |
143 | train_data = train_data.drop(drop_cols, axis=1)
144 | test_data = test_data.drop([col for col in drop_cols if col in test_data.columns], axis=1)
145 |
146 | lgb_train(train_data, test_data, fea_col_names)
147 | print('all completed: %s, cost %s' % (datetime.now(), time.time() - ts))
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/src/recall/tfidf_recall_30.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # bm25 recall
5 |
6 | # Basic modules
7 | import os
8 | import gc
9 | import sys
10 | import time
11 | import functools
12 | from tqdm import tqdm
13 | from six import iteritems
14 | from datetime import datetime
15 |
16 | # Data processing
17 | import re
18 | import math
19 | import pickle
20 | import numpy as np
21 | import pandas as pd
22 | from multiprocessing import Pool
23 |
24 | # Custom utilities
25 | sys.path.append('../../tools/')
26 | import loader
27 | import pandas_util
28 | import custom_bm25 as bm25
29 |
30 | # Open-source packages
31 | from gensim.models import Word2Vec
32 | from gensim.models.word2vec import LineSentence
33 | from gensim import corpora, models, similarities
34 | from gensim.similarities import SparseMatrixSimilarity
35 | from sklearn.metrics.pairwise import cosine_similarity as cos_sim
36 |
37 | # Set random seed
38 | SEED = 2020
39 | PROCESS_NUM, PARTITION_NUM = 18, 18
40 |
41 | input_root_path = '../../input/'
42 | output_root_path = '../../feat/'
43 |
44 | postfix = '30'
45 | file_type = 'ftr'
46 |
47 | train_out_path = output_root_path + 'tr_tfidf_{}.{}'.format(postfix, file_type)
48 | test_out_path = output_root_path + 'te_tfidf_{}.{}'.format(postfix, file_type)
49 |
50 | def topk_sim_samples(desc, desc_ids, paper_ids, bm25_model, k=10):
51 | desc_id2papers = {}
52 | for desc_i in tqdm(range(len(desc))):
53 | query_vec, query_desc_id = desc[desc_i], desc_ids[desc_i]
54 | sims = bm25_model.get_scores(query_vec)
55 | sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
56 | sim_papers = [paper_ids[val[0]] for val in sort_sims[:k]]
57 | sim_scores = [str(val[1]) for val in sort_sims[:k]]
58 | desc_id2papers[query_desc_id] = ['|'.join(sim_papers), '|'.join(sim_scores)]
59 | sim_df = pd.DataFrame.from_dict(desc_id2papers, orient='index', columns=['paper_id', 'sim_score'])
60 | sim_df = sim_df.reset_index().rename(columns={'index':'description_id'})
61 | return sim_df
62 |
63 | def partition(queries, num):
64 | queries_partitions, step = [], int(np.ceil(len(queries)/num))
65 | for i in range(0, len(queries), step):
66 | queries_partitions.append(queries[i:i+step])
67 | return queries_partitions
68 |
69 | def single_process_search(params=None):
70 | (query_vecs, desc_ids, paper_ids, bm25_model, k, i) = params
71 | print (i, 'start', datetime.now())
72 | gc.collect()
73 | sim_df = topk_sim_samples(query_vecs, desc_ids, paper_ids, bm25_model, k)
74 | print (i, 'completed', datetime.now())
75 | return sim_df
76 |
77 | def multi_process_search(query_vecs, desc_ids, paper_ids, bm25_model, k):
78 | pool = Pool(PROCESS_NUM)
79 | queries_parts = partition(query_vecs, PARTITION_NUM)
80 | desc_ids_parts = partition(desc_ids, PARTITION_NUM)
81 | print ('{} processes init and partition to {} parts' \
82 | .format(PROCESS_NUM, PARTITION_NUM))
83 |
84 | param_list = [(queries_parts[i], desc_ids_parts[i], \
85 | paper_ids, bm25_model, k, i) for i in range(PARTITION_NUM)]
86 | sim_dfs = pool.map(single_process_search, param_list)
87 | sim_df = pd.concat(sim_dfs, axis=0)
88 | return sim_df
89 |
90 | def gen_samples(df, desc, desc_ids, corpus_list, paper_ids_list, k):
91 | df_samples_list = []
92 | for i, corpus in enumerate(corpus_list):
93 | bm25_model = bm25.BM25(corpus[0])
94 | cur_df_sample = multi_process_search(desc, desc_ids, \
95 | paper_ids_list[i], bm25_model, k)
96 | cur_df_sample_out = pandas_util.explode(cur_df_sample, ['paper_id', 'sim_score'])
97 | cur_df_sample_out['type'] = corpus[1] # recall_name
98 | df_samples_list.append(cur_df_sample_out)
99 | df_samples = pd.concat(df_samples_list, axis=0)
100 | df_samples.drop_duplicates(subset=['description_id', 'paper_id'], inplace=True)
101 | df_samples['target'] = 0
102 | return df_samples
103 |
104 | if __name__ == "__main__":
105 |
106 | ts = time.time()
107 | tqdm.pandas()
108 | print('start time: %s' % datetime.now())
109 | # load data
110 | df = loader.load_df(input_root_path + 'paper_input_final.ftr')
111 | df = df[~pd.isnull(df['paper_id'])]
112 |
113 | # gen tfidf vecs
114 | dictionary = pickle.load(open('../../feat/corpus.dict', 'rb'))
115 | print ('dic len', len(dictionary))
116 |
117 | df['corp'] = df['abst'] + ' ' + df['titl'] + ' ' + df['keywords'].fillna('').str.replace(';', ' ')
118 | df_corp, corp_paper_ids = [dictionary.doc2bow(line.split(' ')) for line in df['corp'].tolist()], \
119 | df['paper_id'].tolist()
120 |
121 | # gen topk sim samples
122 | paper_ids_list = [corp_paper_ids]
123 | corpus_list = [(df_corp, 'corp_bm25')]
124 | out_cols = ['description_id', 'paper_id', 'sim_score', 'target', 'type']
125 |
126 | if sys.argv[1] in ['tr']:
127 | # for tr ins
128 | tr = loader.load_df(input_root_path + 'tr_input_final.ftr')
129 | tr = tr[~pd.isnull(tr['description_id'])]
130 |
131 | # tr = tr.head(1000)
132 | tr_desc, tr_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in tr['quer_all'].tolist()], \
133 | tr['description_id'].tolist()
134 | print ('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
135 |
136 | tr_samples = gen_samples(tr, tr_desc, tr_desc_ids, \
137 | corpus_list, paper_ids_list, k=50)
138 | tr_samples = tr.rename(columns={'paper_id': 'target_paper_id'}) \
139 | .merge(tr_samples, on='description_id', how='left')
140 | tr_samples.loc[tr_samples['target_paper_id'] == tr_samples['paper_id'], 'target'] = 1
141 | loader.save_df(tr_samples[out_cols], train_out_path)
142 | print ('recall succ {} from {}'.format(tr_samples['target'].sum(), tr.shape[0]))
143 | print (tr.shape, tr_samples.shape)
144 |
145 | if sys.argv[1] in ['te']:
146 | # for te ins
147 | te = loader.load_df(input_root_path + 'te_input_final.ftr')
148 | te = te[~pd.isnull(te['description_id'])]
149 |
150 | # te = te.head(1000)
151 | te_desc, te_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in te['quer_all'].tolist()], \
152 | te['description_id'].tolist()
153 | print ('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
154 |
155 | te_samples = gen_samples(te, te_desc, te_desc_ids, \
156 | corpus_list, paper_ids_list, k=50)
157 | te_samples = te.merge(te_samples, on='description_id', how='left')
158 | loader.save_df(te_samples[out_cols], test_out_path)
159 | print (te.shape, te_samples.shape)
160 |
161 | print('all completed: {}, cost {}s'.format(datetime.now(), np.round(time.time() - ts, 2)))
162 |
163 |
164 |
165 |
--------------------------------------------------------------------------------
/src/utils/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/src/utils/.gitkeep
--------------------------------------------------------------------------------
/stk_feat/README.md:
--------------------------------------------------------------------------------
1 | ## Dir of generated stacking features
2 |
--------------------------------------------------------------------------------
/tools/__pycache__/basic_learner.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/basic_learner.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/custom_bm25.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/custom_bm25.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/custom_metrics.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/custom_metrics.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/feat_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/feat_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/lgb_learner.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/lgb_learner.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/loader.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/loader.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/nlp_preprocess.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/nlp_preprocess.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/__pycache__/pandas_util.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/__pycache__/pandas_util.cpython-37.pyc
--------------------------------------------------------------------------------
/tools/basic_learner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Basic modules
5 | import os
6 | import sys
7 | import gc
8 | import json
9 | import time
10 | import functools
11 | from datetime import datetime
12 |
13 | # Data processing
14 | import numpy as np
15 | import pandas as pd
16 | from math import sqrt
17 | from collections import Counter
18 |
19 | # Custom utilities
20 | import loader
21 | import custom_metrics
22 |
23 | # Set random seed
24 | SEED = 2018
25 | np.random.seed (SEED)
26 |
27 | class BaseLearner(object):
28 |
29 | def __init__(self, train_data, test_data,
30 | fea_names, id_names, target_name, \
31 | params, fold_num, out_name, metric_names=['auc'], \
32 | model_postfix=''):
33 | # Deep-copy the raw data so modifications in the calling script cannot corrupt it
34 | self.train_data = train_data.copy(deep=True)
35 | self.test_data = test_data.copy(deep=True)
36 |
37 | # Basic data info
38 | self.fea_names = fea_names
39 | self.id_names = id_names
40 | self.target_name = target_name
41 |
42 | self.params = params
43 | self.fold_num = fold_num
44 | self.out_name = out_name
45 | self.root_path = '../../../output/m3/' + out_name + '/'
46 | self.metric_names = metric_names
47 | self.model_postfix = model_postfix
48 |
49 | # Get the model dump path
50 | def get_model_path(self, predicted_fold_index):
51 | model_path = self.root_path + 'model_' + str(predicted_fold_index)
52 | if self.model_postfix != '':
53 | model_path += '_' + self.model_postfix
54 | return model_path
55 |
56 | # Get the prediction output path
57 | def get_preds_outpath(self, predicted_fold_index):
58 | out_path = self.root_path + self.out_name
59 | if self.model_postfix != '':
60 | out_path += '_' + self.model_postfix
61 | if predicted_fold_index != 0:
62 | out_path += '_cv_' + str(predicted_fold_index)
63 | return out_path
64 |
65 | # Train/validation split interface; to be overridden by subclasses
66 | def extract_train_data(self, data, predicted_fold_index):
67 | pass
68 |
69 | # Single-fold training interface; to be overridden by subclasses
70 | def train(self, data, predicted_fold_index, model_dump_path=None):
71 | pass
72 |
73 | # Single-fold prediction interface; to be overridden by subclasses
74 | def predict(self, data, predicted_fold_index, model_load_path=None):
75 | pass
76 |
77 | # Multi-fold training
78 | def multi_fold_train(self, data, predicted_folds=[1,2,3,4,5], \
79 | need_predict_test=False):
80 | print ("multi_fold train start {}".format(datetime.now()))
81 | ts = time.time()
82 | for fold_index in predicted_folds:
83 | print ('training fold {}'.format(fold_index))
84 | self.train(data, fold_index)
85 | print ('fold {} completed, cost {}s'.format( \
86 | fold_index, time.time() - ts))
87 | self.multi_fold_predict(data, predicted_folds, need_predict_test)
88 |
89 | # Multi-fold prediction
90 | def multi_fold_predict(self, data, predicted_folds, \
91 | need_predict_test=False):
92 | print ("multi_fold predict start {}".format(datetime.now()))
93 |
94 | multi_fold_eval_lis = []
95 |
96 | for fold_index in predicted_folds:
97 | dtrain, dvalid, Xvalid = self.extract_train_data( \
98 | self.train_data, fold_index)
99 |
100 | ypreds = self.predict(Xvalid, fold_index)
101 | labels = Xvalid[self.target_name]
102 |
103 | eval_lis = custom_metrics.calc_metrics(labels, ypreds, \
104 | self.metric_names)
105 |
106 | multi_fold_eval_lis.append(eval_lis)
107 | print ('{} eval: {}'.format(fold_index, eval_lis))
108 | loader.out_preds(self.target_name, \
109 | Xvalid[self.id_names], ypreds, \
110 | '{}.csv'.format(self.get_preds_outpath(fold_index)), \
111 | labels.tolist())
112 |
113 | if need_predict_test:
114 | print ('predict test data')
115 | ypreds = self.predict(self.test_data, 0,
116 | model_load_path=self.get_model_path(fold_index))
117 | # output preds
118 | loader.out_preds(self.target_name, \
119 | self.test_data[self.id_names], ypreds, \
120 | '{}_{}.csv'.format(self.get_preds_outpath(0), fold_index))
121 |
122 | multi_fold_eval_avgs = []
123 | for i in range(len(self.metric_names)):
124 | eval_avg = np.array([val[i] for val in multi_fold_eval_lis]).mean()
125 | eval_avg = round(eval_avg, 5)
126 | multi_fold_eval_avgs.append(eval_avg)
127 | print ('multi fold eval mean: ', multi_fold_eval_avgs)
128 |
129 | return multi_fold_eval_avgs
130 |
131 |
132 |
--------------------------------------------------------------------------------
/tools/basic_learner.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/basic_learner.pyc
--------------------------------------------------------------------------------
/tools/custom_bm25.py:
--------------------------------------------------------------------------------
1 | import math
2 | from six import iteritems
3 | from six.moves import range
4 |
5 | PARAM_K1 = 1.5
6 | PARAM_B = 0.75
7 | EPSILON = 0.25
8 |
9 | class BM25(object):
10 | def __init__(self, corpus):
11 | """
12 | Parameters
13 | ----------
14 | corpus : list of list of (int, int)
15 | Corpus in bag-of-words form; each document is a list of (token_id, token_count) tuples, e.g. the output of gensim's Dictionary.doc2bow.
16 | """
17 | self.corpus_size = 0
18 | self.avgdl = 0
19 | self.doc_freqs = []
20 | self.idf = {}
21 | self.doc_len = []
22 | self._initialize(corpus)
23 |
24 | def _initialize(self, corpus):
25 | """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
26 | nd = {} # word -> number of documents with word
27 | num_doc = 0
28 |
29 | for document in corpus:
30 | self.corpus_size += 1
31 | cur_doc_len = 0
32 | frequencies = {}
33 |
34 | for word_tuple in document:
35 | word, word_cnt = word_tuple[0], word_tuple[1]
36 | if word not in frequencies:
37 | frequencies[word] = 0
38 | frequencies[word] += word_cnt
39 | cur_doc_len += word_cnt
40 | self.doc_freqs.append(frequencies)
41 | self.doc_len.append(cur_doc_len)
42 | num_doc += cur_doc_len
43 |
44 | for word, freq in iteritems(frequencies):
45 | if word not in nd:
46 | nd[word] = 0
47 | nd[word] += 1
48 |
49 | self.avgdl = float(num_doc) / self.corpus_size
50 | # collect idf sum to calculate an average idf for epsilon value
51 | idf_sum = 0
52 | # collect words with negative idf to set them a special epsilon value.
53 | # idf can be negative if word is contained in more than half of documents
54 | negative_idfs = []
55 | for word, freq in iteritems(nd):
56 | idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
57 | self.idf[word] = idf
58 | idf_sum += idf
59 | if idf < 0:
60 | negative_idfs.append(word)
61 | self.average_idf = float(idf_sum) / len(self.idf)
62 |
63 | eps = EPSILON * self.average_idf
64 | for word in negative_idfs:
65 | self.idf[word] = eps
66 |
67 | def get_score(self, document, index):
68 | """Computes BM25 score of given `document` in relation to item of corpus selected by `index`.
69 | Parameters
70 | ----------
71 | document : list of (int, int)
72 | Query document in bag-of-words form ((token_id, token_count) tuples) to be scored.
73 | index : int
74 | Index of document in corpus selected to score with `document`.
75 | Returns
76 | -------
77 | float
78 | BM25 score.
79 | """
80 | score = 0
81 | doc_freqs = self.doc_freqs[index]
82 | for word_tuple in document:
83 | word = word_tuple[0]
84 | if word not in doc_freqs:
85 | continue
86 | score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
87 | / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
88 | return score
89 |
90 | def get_scores(self, document):
91 | """Computes and returns BM25 scores of given `document` in relation to
92 | every item in corpus.
93 | Parameters
94 | ----------
95 | document : list of (int, int)
96 | Query document in bag-of-words form ((token_id, token_count) tuples) to be scored.
97 | Returns
98 | -------
99 | list of float
100 | BM25 scores.
101 | """
102 | scores = [self.get_score(document, index) for index in range(self.corpus_size)]
103 | return scores
104 |
105 | def get_scores_bow(self, document):
106 | """Computes and returns BM25 scores of given `document` in relation to
107 | every item in corpus.
108 | Parameters
109 | ----------
110 | document : list of (int, int)
111 | Query document in bag-of-words form ((token_id, token_count) tuples) to be scored.
112 | Returns
113 | -------
114 | list of float
115 | BM25 scores.
116 | """
117 | scores = []
118 | for index in range(self.corpus_size):
119 | score = self.get_score(document, index)
120 | if score > 0:
121 | scores.append((index, score))
122 | return scores
123 |
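124 | # Minimal usage sketch (illustrative only, not part of the original pipeline):
125 | # the corpus is expected in gensim-style bag-of-words form, i.e. each document
126 | # is a list of (token_id, token_count) tuples such as Dictionary.doc2bow returns.
127 | if __name__ == '__main__':
128 |     corpus = [
129 |         [(0, 2), (1, 1)],   # doc 0: token 0 appears twice, token 1 once
130 |         [(1, 1), (2, 3)],   # doc 1
131 |     ]
132 |     model = BM25(corpus)
133 |     query = [(1, 1), (2, 1)]        # query, also in bag-of-words form
134 |     print(model.get_scores(query))  # one BM25 score per corpus document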
--------------------------------------------------------------------------------
/tools/custom_bm25.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/custom_bm25.pyc
--------------------------------------------------------------------------------
/tools/custom_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import sys
6 | import gc
7 | import json
8 | import time
9 | import functools
10 | from datetime import datetime
11 |
12 | # Data processing
13 | import numpy as np
14 | import pandas as pd
15 | from math import sqrt
16 |
17 | # Evaluation metrics
18 | from sklearn.metrics import log_loss
19 | from sklearn.metrics import roc_auc_score
20 | from sklearn.metrics import accuracy_score
21 | from sklearn.metrics import mean_absolute_error
22 | from sklearn.metrics import mean_squared_error
23 |
24 | def _calc_auc(labels, ypreds):
25 | return roc_auc_score(labels, ypreds)
26 |
27 | def _calc_logloss(labels, ypreds):
28 | return log_loss(labels, ypreds)
29 |
30 | def _calc_mae(labels, ypreds):
31 | return mean_absolute_error(labels, ypreds)
32 |
33 | def _calc_rmse(labels, ypreds):
34 | return sqrt(mean_squared_error(labels, ypreds))
35 |
36 | # kappa
37 |
38 | # multi-logloss
39 |
40 | def _calc_metric(labels, ypreds, metric_name='auc'):
41 | if metric_name == 'auc':
42 | return _calc_auc(labels, ypreds)
43 | elif metric_name == 'logloss':
44 | return _calc_logloss(labels, ypreds)
45 | elif metric_name == 'mae':
46 | return _calc_mae(labels, ypreds)
47 | elif metric_name == 'rmse':
48 | return _calc_rmse(labels, ypreds)
49 |
50 | def calc_metrics(labels, ypreds, metric_names=['auc']):
51 | eval_lis = []
52 | for metric_name in metric_names:
53 | eval_val = _calc_metric(labels, ypreds, metric_name=metric_name)
54 | eval_val = round(eval_val, 5)
55 | eval_lis.append(eval_val)
56 | return eval_lis
57 |
58 |
59 |
60 |
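61 | # Minimal usage sketch (illustrative only): the learners call calc_metrics on
62 | # each validation fold with metric_names=['auc', 'logloss'].
63 | if __name__ == '__main__':
64 |     labels = [0, 1, 1, 0, 1]
65 |     ypreds = [0.1, 0.8, 0.6, 0.4, 0.9]
66 |     print(calc_metrics(labels, ypreds, metric_names=['auc', 'logloss']))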
--------------------------------------------------------------------------------
/tools/custom_metrics.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/custom_metrics.pyc
--------------------------------------------------------------------------------
/tools/feat_utils.py:
--------------------------------------------------------------------------------
1 | def try_divide(x, y, val=0.0):
2 | """
3 | Try to divide two numbers
4 | """
5 | if y != 0.0:
6 | val = float(x) / y
7 | return val
8 |
9 |
10 | def get_sample_indices_by_relevance(dfTrain, additional_key=None):
11 | """
12 | return a dict with
13 | key: (additional_key, median_relevance)
14 | val: list of sample indices
15 | """
16 | dfTrain["sample_index"] = range(dfTrain.shape[0])
17 | group_key = ["median_relevance"]
18 | if additional_key != None:
19 | group_key.insert(0, additional_key)
20 | agg = dfTrain.groupby(group_key, as_index=False).apply(lambda x: list(x["sample_index"]))
21 | d = dict(agg)
22 | dfTrain = dfTrain.drop("sample_index", axis=1)
23 | return d
24 |
25 |
26 | def dump_feat_name(feat_names, feat_name_file):
27 | """
28 | save feat_names to feat_name_file
29 | """
30 | with open(feat_name_file, "wb") as f:
31 | for i,feat_name in enumerate(feat_names):
32 | if feat_name.startswith("count") or feat_name.startswith("pos_of"):
33 | f.write("('%s', SimpleTransform(config.count_feat_transform)),\n" % feat_name)
34 | else:
35 | f.write("('%s', SimpleTransform()),\n" % feat_name)
36 |
--------------------------------------------------------------------------------
/tools/lgb_learner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import sys
6 | import gc
7 | import json
8 | import time
9 | import functools
10 | from datetime import datetime
11 |
12 | # Data processing
13 | import numpy as np
14 | import pandas as pd
15 |
16 | # Model-related
17 | import lightgbm as lgb
18 | from basic_learner import BaseLearner
19 |
20 | # Set random seed
21 | SEED = 2018
22 | np.random.seed (SEED)
23 |
24 | # Common model parameters
25 | EVAL_ROUND = 100
26 | PRINT_TRAIN_METRICS = False
27 |
28 |
29 | class lgbLearner(BaseLearner):
30 |
31 | def __init__(self, train_data, test_data, \
32 | fea_names, id_names, target_name, \
33 | params, num_trees, fold_num, out_name, \
34 | cv_name='cv', metric_names=['auc'], model_postfix=''):
35 | super(lgbLearner, self).__init__(train_data, test_data, fea_names, \
36 | id_names, target_name, params, fold_num, \
37 | out_name, metric_names, model_postfix)
38 | self.num_trees = num_trees
39 | self.cv_name = cv_name
40 |
41 | self.eval_round = EVAL_ROUND
42 | self.print_train_metrics = PRINT_TRAIN_METRICS
43 |
44 | def extract_train_data(self, data, predicted_fold_index):
45 |
46 | Xtrain = data[data[self.cv_name] != predicted_fold_index]
47 | Xvalid = data[data[self.cv_name] == predicted_fold_index]
48 |
49 | dtrain = lgb.Dataset(Xtrain[self.fea_names].values, \
50 | Xtrain[self.target_name])
51 | dvalid = lgb.Dataset(Xvalid[self.fea_names].values, \
52 | Xvalid[self.target_name])
53 |
54 | print ('train, valid', Xtrain.shape, Xvalid.shape)
55 | return dtrain, dvalid, Xvalid
56 |
57 | def train(self, data, predicted_fold_index, model_dump_path=None):
58 | if model_dump_path == None:
59 | model_dump_path = self.get_model_path(predicted_fold_index)
60 |
61 | dtrain, dvalid, Xvalid = self.extract_train_data(self.train_data,
62 | predicted_fold_index)
63 |
64 | if self.print_train_metrics:
65 | valid_sets = [dtrain, dvalid] \
66 | if predicted_fold_index != 0 else [dtrain]
67 | valid_names = ['train', 'valid'] \
68 | if predicted_fold_index != 0 else ['train']
69 | else:
70 | valid_sets = [dvalid] if predicted_fold_index != 0 else [dtrain]
71 | valid_names = ['valid'] if predicted_fold_index != 0 else ['train']
72 |
73 | params = self.params
74 |
75 | bst = lgb.train(params, dtrain, self.num_trees,
76 | valid_sets=valid_sets,
77 | valid_names=valid_names,
78 | verbose_eval=self.eval_round)
79 | bst.save_model(model_dump_path)
80 |
81 | def predict(self, data, predicted_fold_index, \
82 | model_load_path=None):
83 | if model_load_path is None:
84 | model_load_path = self.get_model_path(predicted_fold_index)
85 |
86 | bst = lgb.Booster(model_file=model_load_path)
87 | ypreds = bst.predict(data[self.fea_names], num_iteration=self.num_trees)
88 |
89 | if predicted_fold_index != 0:
90 | # output fea importance
91 | df = pd.DataFrame(self.fea_names, columns=['feature'])
92 | df['importance'] = list(bst.feature_importance('gain'))
93 | df['percent'] = np.round(df.importance * 100 / sum(df.importance), 2)
94 | df['percent'] = df.percent.apply(lambda x : str(x) + '%')
95 |
96 | df = df.sort_values(by='importance', ascending=False)
97 | imp_path = 'imp'
98 | if self.model_postfix != '':
99 | imp_path = 'imp-{}'.format(self.model_postfix)
100 | df.to_csv(self.root_path + imp_path, sep='\t')
101 | return ypreds
102 |
103 |
--------------------------------------------------------------------------------
/tools/lgb_learner.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/lgb_learner.pyc
--------------------------------------------------------------------------------
/tools/loader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Data processing
5 | import numpy as np
6 | import pandas as pd
7 | import feather
8 |
9 | # Basic file I/O
10 | def load_df(filename, nrows=None):
11 | if filename.endswith('csv'):
12 | return pd.read_csv(filename, nrows = nrows)
13 | elif filename.endswith('ftr'):
14 | return feather.read_dataframe(filename)
15 |
16 | def save_df(df, filename, index=False):
17 | if filename.endswith('csv'):
18 | df.to_csv(filename, index=index)
19 | elif filename.endswith('ftr'):
20 | df = df.reset_index(drop=True)
21 | df.columns = [str(col) for col in df.columns]
22 | df.to_feather(filename)
23 |
24 | # Merge feature files
25 | def merge_fea(df_list, primary_keys=[]):
26 | assert len(primary_keys) > 0, 'empty primary keys'
27 | print (df_list)
28 |
29 | df_base = load_df(df_list[0])
30 | for i in range(1, len(df_list)):
31 | print (df_list[i])
32 | cur_df = load_df(df_list[i])
33 | df_base = pd.concat([df_base, \
34 | cur_df.drop(primary_keys, axis=1)], axis=1)
35 | print ('merge completed, df shape', df_base.shape)
36 | return df_base
37 |
38 | # Output model predictions
39 | def out_preds(target_name, df_ids, ypreds, out_path, labels=[]):
40 | preds_df = pd.DataFrame(df_ids)
41 | preds_df[target_name] = ypreds
42 | if len(labels) == preds_df.shape[0]:
43 | preds_df['label'] = np.array(labels)
44 | elif len(labels) > 0:
45 | print ('labels length not match')
46 | preds_df.to_csv(out_path, float_format = '%.4f', index=False)
47 |
48 | #def out_preds(id_name, target_name, ids, ypreds, out_path, labels=[]):
49 | # preds_df = pd.DataFrame({id_name: np.array(ids)})
50 | # preds_df[target_name] = ypreds
51 | # if len(labels) == preds_df.shape[0]:
52 | # preds_df['label'] = np.array(labels)
53 | # elif len(labels) > 0:
54 | # print ('labels length not match')
55 | # preds_df.to_csv(out_path, float_format = '%.4f', index=False)
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/tools/loader.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/loader.pyc
--------------------------------------------------------------------------------
/tools/nlp_preprocess.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | import numpy as np
4 | import nltk
5 | # nltk.download('punkt')
6 | from nltk.corpus import stopwords
7 | from nltk import word_tokenize, pos_tag
8 | from nltk.stem import WordNetLemmatizer
9 |
10 | def tokenize(sentence):
11 | '''
12 | Strip extra whitespace, tokenize and POS-tag
13 | '''
14 | sentence = re.sub(r'\s+', ' ', sentence)
15 | token_words = word_tokenize(sentence) # result is a list of tokens
16 | token_words = pos_tag(token_words)
17 | return token_words
18 |
19 | def stem(token_words):
20 | '''
21 | Lemmatization (normalize word forms)
22 | '''
23 | wordnet_lematizer = WordNetLemmatizer() # convert words to their base form
24 | words_lematizer = []
25 | for word, tag in token_words:
26 | if tag.startswith('NN'):
27 | word_lematizer = wordnet_lematizer.lemmatize(word, pos='n') # 'n' stands for noun
28 | elif tag.startswith('VB'):
29 | word_lematizer = wordnet_lematizer.lemmatize(word, pos='v') # 'v' stands for verb
30 | elif tag.startswith('JJ'):
31 | word_lematizer = wordnet_lematizer.lemmatize(word, pos='a') # 'a' stands for adjective
32 | elif tag.startswith('R'):
33 | word_lematizer = wordnet_lematizer.lemmatize(word, pos='r') # 'r' stands for adverb
34 | else:
35 | word_lematizer = wordnet_lematizer.lemmatize(word)
36 | words_lematizer.append(word_lematizer)
37 | return words_lematizer
38 |
39 |
40 | sr = stopwords.words('english')
41 |
42 |
43 | def delete_stopwords(token_words):
44 | '''
45 | Remove stopwords
46 | '''
47 | cleaned_words = [word for word in token_words if word not in sr]
48 | return cleaned_words
49 |
50 |
51 | def is_number(s):
52 | '''
53 | Check whether a string is a number
54 | '''
55 | try:
56 | float(s)
57 | return True
58 | except ValueError:
59 | pass
60 |
61 | try:
62 | import unicodedata
63 | unicodedata.numeric(s)
64 | return True
65 | except (TypeError, ValueError):
66 | pass
67 |
68 | return False
69 |
70 |
71 | characters = [' ', ',', '.', 'DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '...',
72 | '^', '{', '}']
73 |
74 |
75 | def delete_characters(token_words):
76 | '''
77 | Remove special characters and numbers
78 | '''
79 | words_list = [word for word in token_words if word not in characters and not is_number(word)]
80 | return words_list
81 |
82 |
83 | def to_lower(token_words):
84 | '''
85 | Convert all tokens to lowercase
86 | '''
87 | words_lists = [x.lower() for x in token_words]
88 | return words_lists
89 |
90 | def replace_process(line):
91 | m = {
92 | 'α': 'alpha',
93 | 'β': 'beta',
94 | 'γ': 'gamma',
95 | 'δ': 'delta',
96 | 'ε': 'epsilon',
97 | 'ζ': 'zeta',
98 | 'η': 'eta',
99 | 'θ': 'theta',
100 | 'ι': 'iota',
101 | 'κ': 'kappa',
102 | 'λ': 'lambda',
103 | 'μ': 'mu',
104 | 'ν': 'nu',
105 | 'ξ': 'xi',
106 | 'ο': 'omicron',
107 | 'π': 'pi',
108 | 'ρ': 'rho',
109 | 'ς': 'sigma',
110 | 'σ': 'sigma',
111 | 'τ': 'tau',
112 | 'υ': 'upsilon',
113 | 'φ': 'phi',
114 | 'χ': 'chi',
115 | 'ψ': 'psi',
116 | 'ω': 'omega',
117 | 'ϑ': 'theta',
118 | 'ϒ': 'gamma',
119 | 'ϕ': 'phi',
120 | 'ϱ': 'rho',
121 | 'ϵ': 'epsilon',
122 | '𝛼': 'alpha',
123 | '𝛽': 'beta',
124 | '𝜀': 'epsilon',
125 | '𝜃': 'theta',
126 | '𝜏': 'tau',
127 | '𝜖': 'epsilon',
128 | '𝜷': 'beta',
129 | }
130 | empty_str = ['etc.','et al.','fig.','figure.','e.g.','(', ')','[', ']',';',':','!',',','.','?','"','\'', \
131 | '%','>','<','+','&']
132 | m.update({s: ' ' for s in empty_str})
133 |
134 | for k, v in m.items():
135 | line = line.replace(k, v)
136 | line = ' '.join([s.strip() for s in line.split(' ') if s != ''])
137 | return line
138 |
139 | def preprocess(line):
140 | '''
141 | Text preprocessing pipeline
142 | '''
143 | line = line.lower()
144 | line = replace_process(line)
145 | token_words = tokenize(line)
146 | token_words = stem(token_words)
147 | token_words = delete_stopwords(token_words)
148 | token_words = delete_characters(token_words)
149 | token_words = to_lower(token_words)
150 | return ' '.join(token_words)
151 |
152 | if __name__ == '__main__':
153 | text = 'This experiment was conducted to determine whether feeding meal and hulls derived from genetically modified soybeans to dairy cows affected production measures and sensory qualities of milk. The soybeans were genetically modified (Event DAS-444Ø6-6) to be resistant to multiple herbicides. Twenty-six Holstein cows (13/treatment) were fed a diet that contained meal and hulls derived from transgenic soybeans or a diet that contained meal and hulls from a nontransgenic near-isoline variety. Soybean products comprised approximately 21% of the diet dry matter, and diets were formulated to be nearly identical in crude protein, neutral detergent fiber, energy, and minerals and vitamins. The experimental design was a replicated 2×2 Latin square with a 28-d feeding period. Dry matter intake (21.3 vs. 21.4kg/d), milk yield (29.3 vs. 29.4kg/d), milk fat (3.70 vs. 3.68%), and milk protein (3.10 vs. 3.12%) did not differ between cows fed control or transgenic soybean products, respectively. Milk fatty acid profile was virtually identical between treatments. Somatic cell count was significantly lower for cows fed transgenic soybean products, but the difference was biologically trivial. Milk was collected from all cows in period 1 on d 0 (before treatment), 14, and 28 for sensory evaluation. On samples from all days (including d 0) judges could discriminate between treatments for perceived appearance of the milk. The presence of this difference at d 0 indicated that it was likely not a treatment effect but rather an initial bias in the cow population. No treatment differences were found for preference or acceptance of the milk. Overall, feeding soybean meal and hulls derived from this genetically modified soybean had essentially no effects on production or milk acceptance when fed to dairy cows. '
154 | text = 'Pyrvinium is a drug approved by the FDA and identified as a Wnt inhibitor by inhibiting Axin degradation and stabilizing 尾-catenin, which can increase Ki67+ cardiomyocytes in the peri-infarct area and alleviate cardiac remodeling in a mouse model of MI . UM206 is a peptide with a high homology to Wnt-3a/5a, and acts as an antagonist for Frizzled proteins to inhibit Wnt signaling pathway transduction. UM206 could reduce infarct size, increase the numbers of capillaries, decrease myofibroblasts in infarct area of post-MI heart, and ultimately suppress the development of heart failure . ICG-001, which specifically inhibits the interaction between 尾-catenin and CBP in the Wnt canonical signaling pathway, can promote the differentiation of epicardial progenitors, thereby contributing to myocardial regeneration and improving cardiac function in a rat model of MI . Small molecules invaliding Porcupine have been further studied, such as WNT-974, GNF-6231 and CGX-1321. WNT-974 decreases fibrosis in post-MI heart, with a mechanism of preventing collagen production in cardiomyocytes by blocking secretion of Wnt-3, a pro-fibrotic agonist, from cardiac fibroblasts and its signaling to cardiomyocytes . The phosphorylation of DVL protein is decreased in both the canonical and non-canonical Wnt signaling pathways by WNT-974 administration . GNF-6231 prevents adverse cardiac remodeling in a mouse model of MI by inhibiting the proliferation of interstitial cells, increasing the proliferation of Sca1+ cardiac progenitors and reducing the apoptosis of cardiomyocytes [[**##**]]. Similarly, we demonstrate that CGX-1321, which has also been applied in a phase I clinical trial to treat solid tumors ({"type":"clinical-trial","attrs":{"text":"NCT02675946","term_id":"NCT02675946"}}NCT02675946), inhibits both canonical and non-canonical Wnt signaling pathways in post-MI heart. CGX-1321 promotes cardiac function by reducing fibrosis and stimulating cardiomyocyte proliferation-mediated cardiac regeneration in a Hippo/YAP-independent manner . These reports implicate that Wnt pathway inhibitors are a class of potential drugs for treating MI through complex mechanisms, including reducing cardiomyocyte death, increasing angiogenesis, suppressing fibrosis and stimulating cardiac regeneration.'
155 | token_words = tokenize(text)
156 | print(token_words)
157 | token_words = stem(token_words) # lemmatize to base forms
158 | token_words = delete_stopwords(token_words) # remove stopwords
159 | token_words = delete_characters(token_words)
160 | token_words = to_lower(token_words)
161 | print(token_words)
162 |
--------------------------------------------------------------------------------
/tools/pandas_util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #coding=utf-8
3 |
4 | # Basic modules
5 | import math
6 | import os
7 | import sys
8 | import time
9 | from datetime import datetime
10 | from tqdm import tqdm
11 |
12 | # Data processing
13 | import numpy as np
14 | import pandas as pd
15 |
16 | def string_to_array(s):
17 | """Convert pipe separated string to array."""
18 |
19 | if isinstance(s, str):
20 | out = s.split("|")
21 | elif math.isnan(s):
22 | out = []
23 | else:
24 | raise ValueError("Value must be either string of nan")
25 | return out
26 |
27 |
28 | def explode(df_in, col_expls):
29 | """Explode column col_expl of array type into multiple rows."""
30 |
31 | df = df_in.copy()
32 | for col_expl in col_expls:
33 | df.loc[:, col_expl] = df[col_expl].apply(string_to_array)
34 |
35 | base_cols = list(set(df.columns) - set(col_expls))
36 | df_out = pd.DataFrame(
37 | {col: np.repeat(df[col].values,
38 | df[col_expls[0]].str.len())
39 | for col in base_cols}
40 | )
41 |
42 | for col_expl in col_expls:
43 | df_out.loc[:, col_expl] = np.concatenate(df[col_expl].values)
44 | df_out.loc[:, col_expl] = df_out[col_expl]
45 | return df_out
46 |
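47 | # Minimal usage sketch (illustrative only): the recall scripts store the top-k
48 | # candidates as pipe-separated 'paper_id' / 'sim_score' strings and then call
49 | # explode to get one row per (description_id, paper_id) pair.
50 | if __name__ == '__main__':
51 |     demo = pd.DataFrame({
52 |         'description_id': ['d1', 'd2'],
53 |         'paper_id': ['p1|p2', 'p3'],
54 |         'sim_score': ['0.9|0.5', '0.7'],
55 |     })
56 |     print(explode(demo, ['paper_id', 'sim_score']))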
--------------------------------------------------------------------------------
/tools/pandas_util.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wsdm-Teamfunny/wsdm2020-solution/dae072da26ccf629d1a96185acedeaf4199b6ec4/tools/pandas_util.pyc
--------------------------------------------------------------------------------