├── description.pdf
├── cfg.py
├── blend.py
├── README.md
├── preprocessing_dmitry.py
├── preprocessing_stanislav.py
├── fit_models_dmitry.py
├── preprocessing_mikhail.py
├── utility.py
├── fit_model2_mikhail.py
└── fit_model1_mikhail.py

/description.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geffy/kaggle-crowdflower/HEAD/description.pdf
--------------------------------------------------------------------------------
/cfg.py:
--------------------------------------------------------------------------------
1 | path_train = './raw/train.csv'
2 | path_test = './raw/test.csv'
3 | path_sampleSubmission = './raw/sampleSubmission.csv'
4 | 
5 | path_w2v_pretrained_model = '../tools-w2v/GoogleNews-vectors-negative300.bin'
6 | 
7 | path_processed = './processed/'
8 | path_features = './processed/'
9 | path_tmp = './tmp/'
10 | 
11 | path_submit = './submit/'
12 | 
--------------------------------------------------------------------------------
/blend.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import cfg
4 | import pandas as pd
5 | import numpy as np
6 | from utility import apply_border
7 | 
8 | 
9 | p1 = np.loadtxt(cfg.path_processed + 'mikhail_model1.txt')
10 | p2 = np.loadtxt(cfg.path_processed + 'mikhail_model2.txt')
11 | p3 = np.loadtxt(cfg.path_processed + 'dmitry_model1.txt')
12 | p4 = np.loadtxt(cfg.path_processed + 'dmitry_model2.txt')
13 | 
14 | ens = apply_border(p1+p2+p3+p4,[0.33, 0.6, 0.77])
15 | 
16 | idx = pd.read_csv(cfg.path_test).id.values.astype(int)
17 | #
18 | submission = pd.DataFrame({"id": idx, "prediction": ens.astype(np.int32)})
19 | submission.to_csv(cfg.path_submit + "final_solution.csv", index=False)
20 | 
21 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Kaggle ['Search Results Relevance'](https://www.kaggle.com/c/crowdflower-search-relevance) 2nd place solution
2 | =======
3 | ### Mikhail Trofimov, Stanislav Semenov, Dmitry Altukhov
4 | 
5 | Scores 0.71881 on the private leaderboard.
6 | 
7 | How to reproduce the submission
8 | =======
9 | Don't forget to check the paths in `./cfg.py`!
10 | By default, you should place the raw data in `./raw/`, and create `./processed/` for temporary files and `./submit/` for the submission file (a small directory-setup sketch is included at the end of this README).
11 | 
12 | After this, run
13 | ```
14 | python preprocessing_mikhail.py
15 | python preprocessing_dmitry.py
16 | python preprocessing_stanislav.py
17 | python fit_models_dmitry.py
18 | python fit_model1_mikhail.py
19 | python fit_model2_mikhail.py
20 | python blend.py
21 | ```
22 | 
23 | Dependencies
24 | =======
25 | * python 2.7.6
26 | * numpy 1.10
27 | * pandas 0.16.0
28 | * scikit-learn 0.16.1
29 | * scipy 0.15.1
30 | * nltk 3.0.3
31 | * BeautifulSoup 4.3.2
32 | * tsne 0.1.1 (https://github.com/danielfrg/tsne)
33 | * gensim 0.11.1-1
34 | * backports.lzma 0.0.3
35 | 
36 | Hardware
37 | =======
38 | This code was developed on a machine with 32 cores and 60 GB of RAM (an Amazon cc2.8xlarge instance); however, it is possible to build the solution even on an average laptop.
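For convenience, the working directories used by the scripts can be created with a helper like the minimal sketch below. This is not part of the original pipeline; it assumes it is run from the repository root, next to `cfg.py`.
```python
# Optional helper: create the directories referenced in cfg.py
# (./processed/, ./tmp/, ./submit/) before running the pipeline.
# The raw competition data still has to be placed in ./raw/ by hand.
import os

import cfg

for d in [cfg.path_processed, cfg.path_tmp, cfg.path_submit]:
    if not os.path.exists(d):
        os.makedirs(d)
```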
39 | -------------------------------------------------------------------------------- /preprocessing_dmitry.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | import cfg 5 | from utility import * 6 | 7 | train = pd.read_csv(cfg.path_train).fillna("") 8 | test = pd.read_csv(cfg.path_test ).fillna("") 9 | 10 | X1, weights, titles, queries = assemble_counts(train,m='train') 11 | X1_test, titles_test, queries_test = assemble_counts(test,m='test') 12 | np.savetxt(cfg.path_features + 'train_counts.txt',X1) 13 | np.savetxt(cfg.path_features +'test_counts.txt',X1_test) 14 | pd.DataFrame(weights,columns=['weights']).to_csv(cfg.path_features + 'weights.csv',index=False) 15 | pd.DataFrame(titles,columns=['titles_clean']).to_csv(cfg.path_features + 'titles_clean.csv',index=False) 16 | pd.DataFrame(queries,columns=['queries_clean']).to_csv(cfg.path_features + 'queries_clean.csv',index=False) 17 | pd.DataFrame(titles_test,columns=['titles_test_clean']).to_csv(cfg.path_features + 'titles_test_clean.csv',index=False) 18 | pd.DataFrame(queries_test,columns=['queries_test_clean']).to_csv(cfg.path_features + 'queries_test_clean.csv',index=False) 19 | 20 | #Extended queries top 10 words 21 | train_ext, test_ext = construct_extended_query(queries,queries_test,titles,titles_test,top_words=10) 22 | X5, queries_ext = assemble_counts2(train_ext.fillna("")) 23 | X5_test, queries_ext_test = assemble_counts2(test_ext.fillna("")) 24 | np.savetxt(cfg.path_features + 'train_ext_counts_top10.txt',X5) 25 | np.savetxt(cfg.path_features + 'test_ext_counts_top10.txt',X5_test) 26 | tmp = pd.DataFrame(train_ext,columns=['id','query','product_title','product_description','median_relevance','relevance_variance']) 27 | tmp.to_csv(cfg.path_features + 'train_ext_top10.csv',index=False) 28 | tmp = pd.DataFrame(test_ext,columns=['id','query','product_title','product_description']) 29 | tmp.to_csv(cfg.path_features + 'test_ext_top10.csv',index=False) 30 | 31 | #Extended queries top 15 words 32 | train_ext, test_ext = construct_extended_query(queries,queries_test,titles,titles_test,top_words=15) 33 | X5, queries_ext = assemble_counts2(train_ext.fillna("")) 34 | X5_test, queries_ext_test = assemble_counts2(test_ext.fillna("")) 35 | np.savetxt(cfg.path_features + 'train_ext_counts_top15.txt',X5) 36 | np.savetxt(cfg.path_features + 'test_ext_counts_top15.txt',X5_test) 37 | tmp = pd.DataFrame(train_ext,columns=['id','query','product_title','product_description','median_relevance','relevance_variance']) 38 | tmp.to_csv(cfg.path_features + 'train_ext_top15.csv',index=False) 39 | tmp = pd.DataFrame(test_ext,columns=['id','query','product_title','product_description']) 40 | tmp.to_csv(cfg.path_features + 'test_ext_top15.csv',index=False) -------------------------------------------------------------------------------- /preprocessing_stanislav.py: -------------------------------------------------------------------------------- 1 | import cfg 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import random 6 | import time 7 | import pickle 8 | 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | 11 | newdtrain = pd.read_pickle(cfg.path_processed + 'train_df') 12 | newdtest = pd.read_pickle( cfg.path_processed + 'test_df') 13 | 14 | 15 | def stasnormal(curstr): 16 | curstrstas = [' '] 17 | 18 | for j in range(len(curstr)): 19 | if curstr[j] in " abcdefghijklmnopqrstuvwxyz": 20 | curstrstas.append(curstr[j]) 21 | 22 | 
if j == len(curstr) - 1: 23 | curstrstas.append(' ') 24 | 25 | if len(curstrstas) > 1 and curstrstas[-1] == ' ' and curstrstas[-2] == ' ': 26 | curstrstas = curstrstas[:-1] 27 | 28 | return ''.join(curstrstas) 29 | 30 | querystemstas = [] 31 | titlestemstas = [] 32 | descstemstas = [] 33 | 34 | for i in range(len(newdtrain)): 35 | querystemstas.append(stasnormal(newdtrain['query_stem'][i])) 36 | titlestemstas.append(stasnormal(newdtrain['title_stem'][i])) 37 | descstemstas.append(stasnormal(newdtrain['desc_stem'][i])) 38 | 39 | if i % 1000 == 0: 40 | print i 41 | 42 | newdtrain['query_stemstas'] = querystemstas 43 | newdtrain['title_stemstas'] = titlestemstas 44 | newdtrain['desc_stemstas'] = descstemstas 45 | 46 | querystemstas = [] 47 | titlestemstas = [] 48 | descstemstas = [] 49 | 50 | for i in range(len(newdtest)): 51 | querystemstas.append(stasnormal(newdtest['query_stem'][i])) 52 | titlestemstas.append(stasnormal(newdtest['title_stem'][i])) 53 | descstemstas.append(stasnormal(newdtest['desc_stem'][i])) 54 | 55 | if i % 1000 == 0: 56 | print i 57 | 58 | newdtest['query_stemstas'] = querystemstas 59 | newdtest['title_stemstas'] = titlestemstas 60 | newdtest['desc_stemstas'] = descstemstas 61 | 62 | vect = TfidfVectorizer( 63 | strip_accents='unicode', analyzer='char', 64 | ngram_range=(1, 1), use_idf = 0).fit(newdtrain['title_stemstas']) 65 | 66 | Xstats = np.zeros((len(newdtrain), 1)) 67 | 68 | for i in range(len(newdtrain)): 69 | query = newdtrain['query_stemstas'][i] 70 | 71 | Xstats[i] = len(query) 72 | 73 | Xquery = vect.transform(newdtrain['query_stemstas']).todense() 74 | Xtitle = vect.transform(newdtrain['title_stemstas']).todense() 75 | Xdesc = vect.transform(newdtrain['desc_stemstas']).todense() 76 | 77 | Xtrain = np.array((Xtitle + 100.) / (Xquery + 0.1) / Xstats) 78 | 79 | Xstats = np.zeros((len(newdtest), 1)) 80 | 81 | for i in range(len(newdtest)): 82 | query = newdtest['query_stemstas'][i] 83 | 84 | Xstats[i] = len(query) 85 | 86 | Xquery = vect.transform(newdtest['query_stemstas']).todense() 87 | Xtitle = vect.transform(newdtest['title_stemstas']).todense() 88 | Xdesc = vect.transform(newdtest['desc_stemstas']).todense() 89 | 90 | Xtest = np.array((Xtitle + 100.) 
/ (Xquery + 0.1) / Xstats)
91 | 
92 | np.savetxt(cfg.path_features + 'ssfeas4train.txt', Xtrain)
93 | np.savetxt(cfg.path_features + 'ssfeas4test.txt', Xtest)
--------------------------------------------------------------------------------
/fit_models_dmitry.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | from utility import *
6 | from sklearn.feature_extraction import text
7 | from sklearn.decomposition import TruncatedSVD
8 | from sklearn.preprocessing import MinMaxScaler
9 | from sklearn.svm import SVR
10 | import cfg
11 | 
12 | 
13 | 
14 | def build_model(titles,X1,X3,X4,titles_test,X1_test,X3_test,X4_test,y,weights=None,params=[400,10,0.0],top_words=10):
15 |     '''
16 |     X1: query length, title length, description present flag, number of words from query that also occurred in title,
17 |     compression distance between query and title, 1 - edit distance between query and title,
18 |     1 - average(maximum edit distance between word from query and every word from title),
19 |     last word from query present in title flag, ratio of words from query that also occurred in title
20 |     X3: Stanislav's features
21 |     X4: Mikhail's features
22 |     params list: [Number of SVD components, C in SVR, gamma in SVR]
23 |     '''
24 |     #get features from extended queries
25 |     if top_words==10:
26 |         X5 = np.loadtxt(cfg.path_features + 'train_ext_counts_top10.txt')
27 |         X5_test = np.loadtxt(cfg.path_features + 'test_ext_counts_top10.txt')
28 |         queries_ext = np.array(pd.read_csv(cfg.path_features + 'train_ext_top10.csv')['query'])
29 |         queries_ext_test = np.array(pd.read_csv(cfg.path_features + 'test_ext_top10.csv')['query'])
30 |     elif top_words==15:
31 |         X5 = np.loadtxt(cfg.path_features + 'train_ext_counts_top15.txt')
32 |         X5_test = np.loadtxt(cfg.path_features + 'test_ext_counts_top15.txt')
33 |         queries_ext = np.array(pd.read_csv(cfg.path_features + 'train_ext_top15.csv')['query'])
34 |         queries_ext_test = np.array(pd.read_csv(cfg.path_features + 'test_ext_top15.csv')['query'])
35 |     else:
36 |         print('Generate features for extended queries. 
top10 or top 15.') 37 | print(1/0) 38 | 39 | df_train = pd.DataFrame(np.c_[queries_ext,titles],columns=['query','product_title']) 40 | df_test = pd.DataFrame(np.c_[queries_ext_test,titles_test],columns=['query','product_title']) 41 | train_qt = list(df_train.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1)) 42 | test_qt = list(df_test.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1)) 43 | 44 | 45 | tfv = text.TfidfVectorizer(min_df=10, max_features=None, 46 | strip_accents='unicode', analyzer='char',token_pattern=r'\w{1,}', 47 | ngram_range=(1, 5), use_idf=1,smooth_idf=1,sublinear_tf=1, 48 | stop_words = 'english') 49 | 50 | tfv.fit(train_qt) 51 | X2 = tfv.transform(train_qt) 52 | X2_test = tfv.transform(test_qt) 53 | svd = TruncatedSVD(n_components=params[0]) 54 | mms = MinMaxScaler() 55 | 56 | X = np.c_[svd.fit_transform(X2),X1,X4,X3,X5] 57 | X_test = np.c_[svd.transform(X2_test),X1_test,X4_test,X3_test,X5_test] 58 | 59 | X=mms.fit_transform(X) 60 | X_test = mms.transform(X_test) 61 | 62 | clf = SVR(C=params[1],gamma=params[2],cache_size=2048,kernel='rbf') 63 | clf.fit(X,y,sample_weight=weights) 64 | p = clf.predict(X_test) 65 | return p 66 | 67 | train = pd.read_csv(cfg.path_train).fillna("") 68 | test = pd.read_csv(cfg.path_test ).fillna("") 69 | idx = test.id.values.astype(int) 70 | y = train.median_relevance.values 71 | 72 | X1, weights, titles = (np.loadtxt(cfg.path_features + 'train_counts.txt'), 73 | np.array(pd.read_csv(cfg.path_features + 'weights.csv'))[:,0], 74 | np.array(pd.read_csv(cfg.path_features + 'titles_clean.csv'))[:,0]) 75 | X1_test, titles_test = (np.loadtxt(cfg.path_features + 'test_counts.txt'), 76 | np.array(pd.read_csv(cfg.path_features + 'titles_test_clean.csv'))[:,0]) 77 | 78 | 79 | X4 = np.loadtxt(cfg.path_features + 'X_additional_tr.txt') 80 | X4_test = np.loadtxt(cfg.path_features + 'X_additional_te.txt') 81 | 82 | X3 = np.loadtxt(cfg.path_features + 'ssfeas4train.txt') 83 | X3_test = np.loadtxt(cfg.path_features + 'ssfeas4test.txt') 84 | 85 | np.random.seed(seed=22) 86 | p1 = build_model(titles,X1,X3,X4,titles_test,X1_test,X3_test,X4_test,y,weights=weights,params=[300,8,0.15],top_words=10) 87 | 88 | p2 = build_model(titles,X1,X3,X4,titles_test,X1_test,X3_test,X4_test,y,weights=weights,params=[400,4,0.20],top_words=15) 89 | 90 | np.savetxt(cfg.path_features + 'dmitry_model1.txt',p1) 91 | np.savetxt(cfg.path_features + 'dmitry_model2.txt',p2) 92 | -------------------------------------------------------------------------------- /preprocessing_mikhail.py: -------------------------------------------------------------------------------- 1 | import cfg 2 | import pandas as pd 3 | import numpy as np 4 | import scipy.sparse as sp 5 | import re 6 | import cPickle as pickle 7 | 8 | from bs4 import BeautifulSoup 9 | from nltk.stem.porter import * 10 | from nltk.tokenize import TreebankWordTokenizer 11 | from nltk.stem import wordnet 12 | 13 | from sklearn.feature_extraction.text import TfidfVectorizer 14 | from sklearn.decomposition import TruncatedSVD 15 | from sklearn.preprocessing import StandardScaler 16 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 17 | from sklearn.metrics import pairwise_distances 18 | 19 | from tsne import bh_sne 20 | from gensim.models import Word2Vec 21 | 22 | import logging 23 | logging.basicConfig(format='[%(asctime)s] %(message)s', level=logging.INFO) 24 | logging.info("Feature extractor (Mikhail's part)") 25 | logging.info('** see cfg.py for path settings **') 26 | 27 | 
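# Summary of this script: it lemmatizes query/title/description, computes
# TF-IDF cosine similarities and query-title set-intersection ratios, derives
# word2vec similarity/distance features from the pretrained GoogleNews vectors,
# adds 2-D t-SNE embeddings of title, title+query and description, and dumps
# the results to cfg.path_processed (train_df/test_df pickles, X_w2v.pickled,
# X_additional_tr.txt, X_additional_te.txt).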
#load data 28 | logging.info('Reading data') 29 | train_df = pd.read_csv(cfg.path_train, encoding='utf-8').fillna('') 30 | test_df = pd.read_csv(cfg.path_test, encoding='utf-8').fillna('') 31 | 32 | 33 | ######################### 34 | ### Lemmatizing part ### 35 | ######################### 36 | 37 | logging.info('Lemmatizing') 38 | toker = TreebankWordTokenizer() 39 | lemmer = wordnet.WordNetLemmatizer() 40 | 41 | def text_preprocessor(x): 42 | ''' 43 | Get one string and clean\lemm it 44 | ''' 45 | tmp = unicode(x) 46 | tmp = tmp.lower().replace('blu-ray', 'bluray').replace('wi-fi', 'wifi') 47 | x_cleaned = tmp.replace('/', ' ').replace('-', ' ').replace('"', '') 48 | tokens = toker.tokenize(x_cleaned) 49 | return " ".join([lemmer.lemmatize(z) for z in tokens]) 50 | 51 | # lemm description 52 | train_df['desc_stem'] = train_df['product_description'].apply(text_preprocessor) 53 | test_df[ 'desc_stem'] = test_df['product_description'].apply(text_preprocessor) 54 | # lemm title 55 | train_df['title_stem'] = train_df['product_title'].apply(text_preprocessor) 56 | test_df[ 'title_stem'] = test_df['product_title'].apply(text_preprocessor) 57 | # lemm query 58 | train_df['query_stem'] = train_df['query'].apply(text_preprocessor) 59 | test_df[ 'query_stem'] = test_df['query'].apply(text_preprocessor) 60 | 61 | 62 | #################### 63 | ### Similarities ### 64 | #################### 65 | 66 | logging.info('Calc similarities') 67 | 68 | def calc_cosine_dist(text_a ,text_b, vect): 69 | return pairwise_distances(vect.transform([text_a]), vect.transform([text_b]), metric='cosine')[0][0] 70 | 71 | def calc_set_intersection(text_a, text_b): 72 | a = set(text_a.split()) 73 | b = set(text_b.split()) 74 | return len(a.intersection(b)) *1.0 / len(a) 75 | 76 | # vectorizers for similarities 77 | logging.info('\t fit vectorizers') 78 | tfv_orig = TfidfVectorizer(ngram_range=(1,2), min_df=2) 79 | tfv_stem = TfidfVectorizer(ngram_range=(1,2), min_df=2) 80 | tfv_desc = TfidfVectorizer(ngram_range=(1,2), min_df=2) 81 | tfv_orig.fit( 82 | list(train_df['query'].values) + 83 | list(test_df['query'].values) + 84 | list(train_df['product_title'].values) + 85 | list(test_df['product_title'].values) 86 | ) 87 | tfv_stem.fit( 88 | list(train_df['query_stem'].values) + 89 | list(test_df['query_stem'].values) + 90 | list(train_df['title_stem'].values) + 91 | list(test_df['title_stem'].values) 92 | ) 93 | tfv_desc.fit( 94 | list(train_df['query_stem'].values) + 95 | list(test_df['query_stem'].values) + 96 | list(train_df['desc_stem'].values) + 97 | list(test_df['desc_stem'].values) 98 | ) 99 | 100 | # for train 101 | logging.info('\t process train') 102 | cosine_orig = [] 103 | cosine_stem = [] 104 | cosine_desc = [] 105 | set_stem = [] 106 | for i, row in train_df.iterrows(): 107 | cosine_orig.append(calc_cosine_dist(row['query'], row['product_title'], tfv_orig)) 108 | cosine_stem.append(calc_cosine_dist(row['query_stem'], row['title_stem'], tfv_stem)) 109 | cosine_desc.append(calc_cosine_dist(row['query_stem'], row['desc_stem'], tfv_desc)) 110 | set_stem.append(calc_set_intersection(row['query_stem'], row['title_stem'])) 111 | train_df['cosine_qt_orig'] = cosine_orig 112 | train_df['cosine_qt_stem'] = cosine_stem 113 | train_df['cosine_qd_stem'] = cosine_desc 114 | train_df['set_qt_stem'] = set_stem 115 | 116 | # for test 117 | logging.info('\t process test') 118 | cosine_orig = [] 119 | cosine_stem = [] 120 | cosine_desc = [] 121 | set_stem = [] 122 | for i, row in test_df.iterrows(): 123 | 
cosine_orig.append(calc_cosine_dist(row['query'], row['product_title'], tfv_orig)) 124 | cosine_stem.append(calc_cosine_dist(row['query_stem'], row['title_stem'], tfv_stem)) 125 | cosine_desc.append(calc_cosine_dist(row['query_stem'], row['desc_stem'], tfv_desc)) 126 | set_stem.append(calc_set_intersection(row['query_stem'], row['title_stem'])) 127 | test_df['cosine_qt_orig'] = cosine_orig 128 | test_df['cosine_qt_stem'] = cosine_stem 129 | test_df['cosine_qd_stem'] = cosine_desc 130 | test_df['set_qt_stem'] = set_stem 131 | 132 | 133 | 134 | ################ 135 | ### w2v part ### 136 | ################ 137 | 138 | logging.info('w2v part') 139 | 140 | def calc_w2v_sim(row): 141 | ''' 142 | Calc w2v similarities and diff of centers of query\title 143 | ''' 144 | a2 = [x for x in row['query_stem'].lower().split() if x in embedder.vocab] 145 | b2 = [x for x in row['title_stem'].lower().split() if x in embedder.vocab] 146 | if len(a2)>0 and len(b2)>0: 147 | w2v_sim = embedder.n_similarity(a2, b2) 148 | else: 149 | return((-1, -1, np.zeros(300))) 150 | 151 | vectorA = np.zeros(300) 152 | for w in a2: 153 | vectorA += embedder[w] 154 | vectorA /= len(a2) 155 | 156 | vectorB = np.zeros(300) 157 | for w in b2: 158 | vectorB += embedder[w] 159 | vectorB /= len(b2) 160 | 161 | vector_diff = (vectorA - vectorB) 162 | 163 | w2v_vdiff_dist = np.sqrt(np.sum(vector_diff**2)) 164 | return (w2v_sim, w2v_vdiff_dist, vector_diff) 165 | 166 | logging.info('\t load pretrained model from {}'.format(cfg.path_w2v_pretrained_model)) 167 | embedder = Word2Vec.load_word2vec_format(cfg.path_w2v_pretrained_model, binary=True) 168 | 169 | # for train 170 | logging.info('\t process train') 171 | X_w2v = [] 172 | sim_list = [] 173 | dist_list = [] 174 | for i,row in train_df.iterrows(): 175 | sim, dist, vdiff = calc_w2v_sim(row) 176 | X_w2v.append(vdiff) 177 | sim_list.append(sim) 178 | dist_list.append(dist) 179 | X_w2v_tr = np.array(X_w2v) 180 | train_df['w2v_sim'] = np.array(sim_list) 181 | train_df['w2v_dist'] = np.array(dist_list) 182 | 183 | # for test 184 | logging.info('\t process test') 185 | X_w2v = [] 186 | sim_list = [] 187 | dist_list = [] 188 | for i,row in test_df.iterrows(): 189 | sim, dist, vdiff = calc_w2v_sim(row) 190 | X_w2v.append(vdiff) 191 | sim_list.append(sim) 192 | dist_list.append(dist) 193 | X_w2v_te = np.array(X_w2v) 194 | test_df['w2v_sim'] = np.array(sim_list) 195 | test_df['w2v_dist'] = np.array(dist_list) 196 | 197 | logging.info('\t dump w2v-features') 198 | pickle.dump((X_w2v_tr, X_w2v_te), open(cfg.path_processed + 'X_w2v.pickled', 'wb'), protocol=2) 199 | 200 | 201 | 202 | ##################### 203 | ### tSNE features ### 204 | ##################### 205 | 206 | logging.info('tSNE part') 207 | logging.info('\t [1\3] process title') 208 | vect = TfidfVectorizer(ngram_range=(1,2), min_df=3) 209 | X_tf = vect.fit_transform(list(train_df['title_stem'].values) + list(test_df['title_stem'].values)) 210 | svd = TruncatedSVD(n_components=200) 211 | X_svd = svd.fit_transform(X_tf) 212 | X_scaled = StandardScaler().fit_transform(X_svd) 213 | X_tsne = bh_sne(X_scaled) 214 | train_df['tsne_title_1'] = X_tsne[:len(train_df), 0] 215 | train_df['tsne_title_2'] = X_tsne[:len(train_df), 1] 216 | test_df[ 'tsne_title_1'] = X_tsne[len(train_df):, 0] 217 | test_df[ 'tsne_title_2'] = X_tsne[len(train_df):, 1] 218 | 219 | logging.info('\t [2\3] process title-query') 220 | vect = TfidfVectorizer(ngram_range=(1,2), min_df=3) 221 | X_title = vect.fit_transform(list(train_df['title_stem'].values) + 
list(test_df['title_stem'].values)) 222 | X_query = vect.fit_transform(list(train_df['query_stem'].values) + list(test_df['query_stem'].values)) 223 | X_tf = sp.hstack([X_title, X_query]).tocsr() 224 | svd = TruncatedSVD(n_components=200) 225 | X_svd = svd.fit_transform(X_tf) 226 | X_scaled = StandardScaler().fit_transform(X_svd) 227 | X_tsne = bh_sne(X_scaled) 228 | train_df['tsne_qt_1'] = X_tsne[:len(train_df), 0] 229 | train_df['tsne_qt_2'] = X_tsne[:len(train_df), 1] 230 | test_df[ 'tsne_qt_1'] = X_tsne[len(train_df):, 0] 231 | test_df[ 'tsne_qt_2'] = X_tsne[len(train_df):, 1] 232 | 233 | logging.info('\t [3\3] process description') 234 | vect = TfidfVectorizer(ngram_range=(1,2), min_df=3) 235 | X_desc = vect.fit_transform(list(train_df['desc_stem'].values) + list(test_df['desc_stem'].values)) 236 | X_tf = X_desc 237 | svd = TruncatedSVD(n_components=200) 238 | X_svd = svd.fit_transform(X_tf) 239 | X_scaled = StandardScaler().fit_transform(X_svd) 240 | X_tsne = bh_sne(X_scaled) 241 | train_df['tsne_desc_1'] = X_tsne[:len(train_df), 0] 242 | train_df['tsne_desc_2'] = X_tsne[:len(train_df), 1] 243 | test_df[ 'tsne_desc_1'] = X_tsne[len(train_df):, 0] 244 | test_df[ 'tsne_desc_2'] = X_tsne[len(train_df):, 1] 245 | 246 | logging.info('\t dump results') 247 | train_df.to_pickle(cfg.path_processed + 'train_df') 248 | test_df.to_pickle( cfg.path_processed + 'test_df') 249 | 250 | 251 | 252 | #################### 253 | ### X_additional ### 254 | #################### 255 | logging.info("Dump additional features") 256 | feat_list = [ 257 | u'w2v_sim', 258 | u'w2v_dist', 259 | u'tsne_title_1', 260 | u'tsne_title_2', 261 | u'tsne_qt_1', 262 | u'tsne_qt_2', 263 | u'cosine_qt_orig', 264 | u'cosine_qt_stem', 265 | u'cosine_qd_stem', 266 | u'set_qt_stem' 267 | ] 268 | X_additional_tr = train_df[feat_list].as_matrix() 269 | X_additional_te = test_df[feat_list].as_matrix() 270 | 271 | np.savetxt(cfg.path_processed + 'X_additional_tr.txt', X_additional_tr) 272 | np.savetxt(cfg.path_processed + 'X_additional_te.txt', X_additional_te) 273 | 274 | logging.info('Done!') -------------------------------------------------------------------------------- /utility.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | import numpy as np 4 | import pandas as pd 5 | import backports.lzma as lzma 6 | from bs4 import BeautifulSoup 7 | from nltk.stem.porter import PorterStemmer 8 | from sklearn.feature_extraction import text 9 | from difflib import SequenceMatcher as seq_matcher 10 | from itertools import combinations_with_replacement 11 | from sklearn.preprocessing import MinMaxScaler 12 | import re 13 | from collections import Counter 14 | 15 | def construct_extended_query(queries,queries_test,titles,titles_test,top_words=10): 16 | y = pd.read_csv('raw/train.csv').median_relevance.values 17 | 18 | stop_words = text.ENGLISH_STOP_WORDS 19 | pattern = re.compile(r'\b(' + r'|'.join(stop_words) + r')\b\s*') 20 | 21 | train = pd.read_csv('raw/train.csv') 22 | test = pd.read_csv('raw/test.csv') 23 | 24 | data = [] 25 | query_ext_train = np.zeros(len(train)).astype(np.object) 26 | query_ext_test = np.zeros(len(test)).astype(np.object) 27 | for q in np.unique(queries): 28 | q_mask = queries == q 29 | q_test = queries_test == q 30 | 31 | titles_q = titles[q_mask] 32 | y_q = y[q_mask] 33 | 34 | good_mask = y_q > 3 35 | titles_good = titles_q[good_mask] 36 | ext_q = str(q) 37 | for item in titles_good: 38 | ext_q += ' '+str(item) 39 
| ext_q = pattern.sub('', ext_q) 40 | c = [word for word, it in Counter(ext_q.split()).most_common(top_words)] 41 | c = ' '.join(c) 42 | data.append([q,ext_q,c]) 43 | query_ext_train[q_mask] = c 44 | query_ext_test[q_test] = c 45 | 46 | train['query'] = query_ext_train 47 | test['query'] = query_ext_test 48 | train['product_title'] = titles 49 | test['product_title'] = titles_test 50 | return train, test 51 | 52 | # The following 3 functions have been taken from Ben Hamner's github repository 53 | # https://github.com/benhamner/Metrics 54 | def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None): 55 | """ 56 | Returns the confusion matrix between rater's ratings 57 | """ 58 | assert(len(rater_a) == len(rater_b)) 59 | if min_rating is None: 60 | min_rating = min(rater_a + rater_b) 61 | if max_rating is None: 62 | max_rating = max(rater_a + rater_b) 63 | num_ratings = int(max_rating - min_rating + 1) 64 | conf_mat = [[0 for i in range(num_ratings)] 65 | for j in range(num_ratings)] 66 | for a, b in zip(rater_a, rater_b): 67 | conf_mat[a - min_rating][b - min_rating] += 1 68 | return conf_mat 69 | 70 | 71 | def histogram(ratings, min_rating=None, max_rating=None): 72 | """ 73 | Returns the counts of each type of rating that a rater made 74 | """ 75 | if min_rating is None: 76 | min_rating = min(ratings) 77 | if max_rating is None: 78 | max_rating = max(ratings) 79 | num_ratings = int(max_rating - min_rating + 1) 80 | hist_ratings = [0 for x in range(num_ratings)] 81 | for r in ratings: 82 | hist_ratings[r - min_rating] += 1 83 | return hist_ratings 84 | 85 | 86 | def quadratic_weighted_kappa(y, y_pred): 87 | """ 88 | Calculates the quadratic weighted kappa 89 | axquadratic_weighted_kappa calculates the quadratic weighted kappa 90 | value, which is a measure of inter-rater agreement between two raters 91 | that provide discrete numeric ratings. Potential values range from -1 92 | (representing complete disagreement) to 1 (representing complete 93 | agreement). A kappa value of 0 is expected if all agreement is due to 94 | chance. 95 | quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b 96 | each correspond to a list of integer ratings. These lists must have the 97 | same length. 98 | The ratings should be integers, and it is assumed that they contain 99 | the complete range of possible ratings. 
100 | quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating 101 | is the minimum possible rating, and max_rating is the maximum possible 102 | rating 103 | """ 104 | rater_a = y 105 | rater_b = y_pred 106 | min_rating=None 107 | max_rating=None 108 | rater_a = np.array(rater_a, dtype=int) 109 | rater_b = np.array(rater_b, dtype=int) 110 | assert(len(rater_a) == len(rater_b)) 111 | if min_rating is None: 112 | min_rating = min(min(rater_a), min(rater_b)) 113 | if max_rating is None: 114 | max_rating = max(max(rater_a), max(rater_b)) 115 | conf_mat = confusion_matrix(rater_a, rater_b, 116 | min_rating, max_rating) 117 | num_ratings = len(conf_mat) 118 | num_scored_items = float(len(rater_a)) 119 | 120 | hist_rater_a = histogram(rater_a, min_rating, max_rating) 121 | hist_rater_b = histogram(rater_b, min_rating, max_rating) 122 | 123 | numerator = 0.0 124 | denominator = 0.0 125 | 126 | for i in range(num_ratings): 127 | for j in range(num_ratings): 128 | expected_count = (hist_rater_a[i] * hist_rater_b[j] 129 | / num_scored_items) 130 | d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0) 131 | numerator += d * conf_mat[i][j] / num_scored_items 132 | denominator += d * expected_count / num_scored_items 133 | 134 | return (1.0 - numerator / denominator) 135 | 136 | def compression_distance(x,y,l_x=None,l_y=None): 137 | if x==y: 138 | return 0 139 | x_b = x.encode('utf-8') 140 | y_b = y.encode('utf-8') 141 | if l_x is None: 142 | l_x = len(lzma.compress(x_b)) 143 | l_y = len(lzma.compress(y_b)) 144 | l_xy = len(lzma.compress(x_b+y_b)) 145 | l_yx = len(lzma.compress(y_b+x_b)) 146 | dist = (min(l_xy,l_yx)-min(l_x,l_y))/max(l_x,l_y) 147 | return dist 148 | 149 | def get_scores(std_true, y_true): 150 | best_diff = np.inf 151 | combs = list(combinations_with_replacement([1,2,3,4],3)) + list(combinations_with_replacement([1,2,3,4],4)) + list(combinations_with_replacement([1,2,3,4],5)) 152 | for item in combs: 153 | if np.median(item) == y_true: 154 | diff = np.abs(np.std(item) - std_true) 155 | if diff < best_diff: 156 | best_diff = diff 157 | best_match = list(item) 158 | if best_diff < 1e-8: 159 | break 160 | return best_match 161 | 162 | def extend_set(X,y,weights): 163 | X_tr = [] 164 | y_tr = [] 165 | y_true = [] 166 | for i in range(len(y)): 167 | std = 1/weights[i] - 1 168 | best_match = get_scores(std,y[i]) 169 | y_true_vals = [] 170 | for item in best_match: 171 | X_tr.append(X[i]) 172 | y_tr.append(item) 173 | y_true_vals.append(False) 174 | for j in range(len(best_match)): 175 | if best_match[j]==y[i]: 176 | y_true_vals[j] = True 177 | break 178 | y_true += y_true_vals 179 | y_true = np.array(y_true) 180 | X_tr = np.array(X_tr) 181 | y_tr = np.array(y_tr) 182 | return X_tr, y_tr 183 | 184 | 185 | def correct_string(s): 186 | s = s.replace("hardisk", "hard drive") 187 | s = s.replace("extenal", "external") 188 | s = s.replace("soda stream", "sodastream") 189 | s = s.replace("fragance", "fragrance") 190 | s = s.replace("16 gb", "16gb") 191 | s = s.replace("32 gb", "32gb") 192 | s = s.replace("500 gb", "500gb") 193 | s = s.replace("2 tb", "2tb") 194 | s = s.replace("shoppe", "shop") 195 | s = s.replace("refrigirator", "refrigerator") 196 | s = s.replace("assassinss", "assassins") 197 | s = s.replace("harleydavidson", "harley davidson") 198 | s = s.replace("harley-davidson", "harley davidson") 199 | return s 200 | 201 | stemmer = PorterStemmer() 202 | 203 | ## Stemming functionality 204 | class stemmerUtility(object): 205 | #Stemming functionality 206 | @staticmethod 207 | def 
stemPorter(review_text): 208 | porter = PorterStemmer() 209 | preprocessed_docs = [] 210 | for doc in review_text: 211 | final_doc = [] 212 | for word in doc: 213 | final_doc.append(porter.stem(word)) 214 | #final_doc.append(wordnet.lemmatize(word)) #note that lemmatize() can also takes part of speech as an argument! 215 | preprocessed_docs.append(final_doc) 216 | return preprocessed_docs 217 | 218 | def assemble_counts(train,m='train'): 219 | X = [] 220 | titles = [] 221 | queries = [] 222 | weights = [] 223 | train['isdesc'] = 1 # Description present flag 224 | train.loc[train['product_description'].isnull(),'isdesc'] = 0 225 | 226 | for i in range(len(train.id)): 227 | query = correct_string(train['query'][i].lower()) 228 | title = correct_string(train.product_title[i].lower()) 229 | 230 | query = (" ").join([z for z in BeautifulSoup(query).get_text(" ").split(" ")]) 231 | title = (" ").join([z for z in BeautifulSoup(title).get_text(" ").split(" ")]) 232 | 233 | query=text.re.sub("[^a-zA-Z0-9]"," ", query) 234 | title=text.re.sub("[^a-zA-Z0-9]"," ", title) 235 | 236 | query= (" ").join([stemmer.stem(z) for z in query.split(" ")]) 237 | title= (" ").join([stemmer.stem(z) for z in title.split(" ")]) 238 | 239 | query=" ".join(query.split()) 240 | title=" ".join(title.split()) 241 | 242 | dist_qt = compression_distance(query,title) 243 | dist_qt2 = 1 - seq_matcher(None,query,title).ratio() 244 | 245 | query_len = len(query.split()) 246 | title_len = len(title.split()) 247 | isdesc = train.isdesc[i] 248 | 249 | tmp_title = title 250 | word_counter_qt = 0 251 | lev_dist_arr = [] 252 | for q in query.split(): 253 | lev_dist_q = [] 254 | for t in title.split(): 255 | lev_dist = seq_matcher(None,q,t).ratio() 256 | if lev_dist > 0.9: 257 | word_counter_qt += 1 258 | #tmp_title += ' '+q # add such words to title to increase their weights in tfidf 259 | lev_dist_q.append(lev_dist) 260 | lev_dist_arr.append(lev_dist_q) 261 | last_word_in = 0 262 | for t in title.split(): 263 | lev_dist = seq_matcher(None,query.split()[-1],t).ratio() 264 | if lev_dist > 0.9: 265 | last_word_in = 1 266 | lev_max = 0 267 | for item in lev_dist_arr: 268 | lev_max_q = max(item) 269 | lev_max += lev_max_q 270 | lev_max = 1- lev_max/len(lev_dist_arr) 271 | word_counter_qt_norm = word_counter_qt/query_len 272 | 273 | 274 | 275 | X.append([query_len,title_len,isdesc,word_counter_qt,dist_qt,dist_qt2,lev_max,last_word_in,word_counter_qt_norm]) 276 | titles.append(tmp_title) 277 | queries.append(query) 278 | if m =='train': 279 | weights.append(1/(float(train["relevance_variance"][i]) + 1.0)) 280 | X = np.array(X).astype(np.float) 281 | if m =='train': 282 | return X, np.array(weights).astype(np.float), np.array(titles), np.array(queries) 283 | else: 284 | return X, np.array(titles), np.array(queries) 285 | 286 | def assemble_counts2(train): 287 | X = [] 288 | queries = [] 289 | 290 | for i in range(len(train.id)): 291 | query = train['query'][i] 292 | title = train.product_title[i] 293 | 294 | dist_qt = compression_distance(query,title) 295 | dist_qt2 = 1 - seq_matcher(None,query,title).ratio() 296 | 297 | query_len = len(query.split()) 298 | 299 | lev_dist_arr = [] 300 | word_rank_list = [] 301 | word_q_ind = 0 302 | word_counter_qt = 0 303 | for q in query.split(): 304 | word_q_ind += 1 305 | lev_dist_q = [] 306 | for t in title.split(): 307 | lev_dist = seq_matcher(None,q,t).ratio() 308 | if lev_dist > 0.9: 309 | word_counter_qt += 1 310 | word_rank_list.append(word_q_ind) 311 | #tmp_title += ' '+q # add such words to 
title to increase their weights in tfidf 312 | lev_dist_q.append(lev_dist) 313 | lev_dist_arr.append(lev_dist_q) 314 | if word_counter_qt == 0: 315 | maxrank = 0 316 | else: 317 | maxrank = 26 - min(word_rank_list) 318 | 319 | 320 | lev_max = 0 321 | for item in lev_dist_arr: 322 | lev_max_q = max(item) 323 | lev_max += lev_max_q 324 | lev_max = 1- lev_max/len(lev_dist_arr) 325 | word_counter_qt_norm = word_counter_qt/query_len 326 | 327 | 328 | 329 | X.append([word_counter_qt,dist_qt,dist_qt2,lev_max,word_counter_qt_norm,maxrank]) 330 | queries.append(query) 331 | 332 | X = np.array(X).astype(np.float) 333 | 334 | return X, np.array(queries) 335 | 336 | 337 | def vary_border(pred_true,y,num_iter=101): 338 | mms = MinMaxScaler() 339 | pred=pred_true.copy() 340 | pred=mms.fit_transform(pred) 341 | best_score = 0 342 | for k1 in range(num_iter): 343 | c1 = k1/(num_iter-1) 344 | for k2 in range(num_iter): 345 | c2 = k2/(num_iter-1) 346 | for k3 in range(num_iter): 347 | c3 = k3/(num_iter-1) 348 | if c1 < c2 and c1 < c3 and c2 < c3 and c1 > 0.25 and c1 < 0.5 and c3 < 0.9: 349 | tmp_pred = pred.copy() 350 | mask1 = tmp_pred < c1 351 | mask2 = (tmp_pred >=c1) * (tmp_pred < c2) 352 | mask3 = (tmp_pred >=c2) * (tmp_pred < c3) 353 | mask4 = tmp_pred >=c3 354 | tmp_pred[mask1] = 1 355 | tmp_pred[mask2] = 2 356 | tmp_pred[mask3] = 3 357 | tmp_pred[mask4] = 4 358 | score = quadratic_weighted_kappa(y,tmp_pred) 359 | if score > best_score: 360 | best_score = score 361 | best_coef = [c1,c2,c3] 362 | best_pred = tmp_pred.copy() 363 | #print(best_score,best_coef) 364 | return best_pred, best_coef 365 | 366 | def apply_border(pred,coefs): 367 | c1, c2, c3 = coefs[0], coefs[1], coefs[2] 368 | mms2 = MinMaxScaler() 369 | tmp_pred=mms2.fit_transform(pred) 370 | mask1 = tmp_pred < c1 371 | mask2 = (tmp_pred >=c1) * (tmp_pred < c2) 372 | mask3 = (tmp_pred >=c2) * (tmp_pred < c3) 373 | mask4 = tmp_pred >=c3 374 | tmp_pred[mask1] = 1 375 | tmp_pred[mask2] = 2 376 | tmp_pred[mask3] = 3 377 | tmp_pred[mask4] = 4 378 | return tmp_pred.astype(np.int32) 379 | 380 | -------------------------------------------------------------------------------- /fit_model2_mikhail.py: -------------------------------------------------------------------------------- 1 | import cfg 2 | import kappa as pykappa 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import scipy.sparse as sp 7 | import cPickle as pickle 8 | 9 | import re 10 | from nltk.stem import PorterStemmer 11 | from bs4 import BeautifulSoup 12 | from difflib import SequenceMatcher as seq_matcher 13 | 14 | from sklearn.preprocessing import LabelEncoder 15 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 16 | from sklearn.preprocessing import StandardScaler 17 | from sklearn.decomposition import TruncatedSVD 18 | 19 | from sklearn.cross_validation import KFold, StratifiedKFold, StratifiedShuffleSplit 20 | from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier 21 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 22 | from sklearn.svm import LinearSVC, LinearSVR 23 | 24 | import logging 25 | logging.basicConfig(format='[%(asctime)s] %(message)s', level=logging.INFO) 26 | logging.info("Mikhail's model_2 learner script") 27 | 28 | # load data from dump 29 | logging.info("load data") 30 | train_df = pd.read_pickle(cfg.path_processed + 'train_df') 31 | test_df = pd.read_pickle(cfg.path_processed + 'test_df') 32 | sampleSubmission = pd.read_csv(cfg.path_sampleSubmission) 33 | # fetch Dmitry's extended 
querys 34 | ext_train_df = pd.read_csv(cfg.path_features + 'train_ext_top10.csv') 35 | ext_test_df = pd.read_csv(cfg.path_features + 'test_ext_top10.csv') 36 | train_df['title_ext'] = ext_train_df['product_title'] 37 | test_df['title_ext'] = ext_test_df['product_title'] 38 | 39 | logging.info("Collect BoWs") 40 | # collect BoWs -- for query 41 | vect = TfidfVectorizer(ngram_range=(1,2), min_df=2, encoding='utf-8') 42 | vect.fit(list(train_df['query_stem'].values) + list(test_df['query_stem'].values)) 43 | X_query_tr = vect.transform(train_df['query_stem'].values) 44 | X_query_te = vect.transform(test_df['query_stem'].values) 45 | # fot title 46 | vect.fit(list(train_df['title_stem'].values) + list(test_df['title_stem'].values)) 47 | X_tmp_tr = vect.transform(train_df['title_stem'].values).tocsc() 48 | X_tmp_te = vect.transform(test_df['title_stem'].values).tocsc() 49 | freq_tr = np.array(X_tmp_tr.sum(axis=0))[0] 50 | freq_te = np.array(X_tmp_te.sum(axis=0))[0] 51 | col_mask = np.where((freq_tr * freq_te)!=0)[0] 52 | X_title_tr = X_tmp_tr[:, col_mask].tocsr() 53 | X_title_te = X_tmp_te[:, col_mask].tocsr() 54 | # for description 55 | vect.fit(list(train_df['desc_stem'].values) + list(test_df['desc_stem'].values)) 56 | X_tmp_tr = vect.transform(train_df['desc_stem'].values).tocsc() 57 | X_tmp_te = vect.transform(test_df['desc_stem'].values).tocsc() 58 | freq_tr = np.array(X_tmp_tr.sum(axis=0))[0] 59 | freq_te = np.array(X_tmp_te.sum(axis=0))[0] 60 | col_mask = np.where((freq_tr * freq_te)!=0)[0] 61 | X_desc_tr = X_tmp_tr[:, col_mask].tocsr() 62 | X_desc_te = X_tmp_te[:, col_mask].tocsr() 63 | # assemble in one 64 | X_all_tr = sp.hstack([X_query_tr, X_title_tr, X_desc_tr]).tocsr() 65 | X_all_te = sp.hstack([X_query_te, X_title_te, X_desc_te]).tocsr() 66 | 67 | # coding query by id 68 | le = LabelEncoder() 69 | le.fit(train_df['query_stem'].values) 70 | qid_tr = le.transform(train_df['query_stem'].values) 71 | qid_te = le.transform(test_df['query_stem'].values) 72 | y_all_tr = train_df['median_relevance'].values 73 | 74 | 75 | stemmer = PorterStemmer() 76 | ## Stemming functionality 77 | class stemmerUtility(object): 78 | #Stemming functionality 79 | @staticmethod 80 | def stemPorter(review_text): 81 | porter = PorterStemmer() 82 | preprocessed_docs = [] 83 | for doc in review_text: 84 | final_doc = [] 85 | for word in doc: 86 | final_doc.append(porter.stem(word)) 87 | preprocessed_docs.append(final_doc) 88 | return preprocessed_docs 89 | 90 | def correct_string(s): 91 | s = s.replace("hardisk", "hard drive") 92 | s = s.replace("extenal", "external") 93 | s = s.replace("soda stream", "sodastream") 94 | s = s.replace("fragance", "fragrance") 95 | s = s.replace("16 gb", "16gb") 96 | s = s.replace("32 gb", "32gb") 97 | s = s.replace("500 gb", "500gb") 98 | s = s.replace("2 tb", "2tb") 99 | s = s.replace("shoppe", "shop") 100 | s = s.replace("refrigirator", "refrigerator") 101 | s = s.replace("assassinss", "assassins") 102 | s = s.replace("harleydavidson", "harley davidson") 103 | s = s.replace("harley-davidson", "harley davidson") 104 | return s 105 | 106 | 107 | def assemble_counts(train): 108 | X = [] 109 | titles = [] 110 | for i in range(len(train.id)): 111 | query = correct_string(train['query'][i].lower()) 112 | title = correct_string(train.product_title[i].lower()) 113 | 114 | query = (" ").join([z for z in BeautifulSoup(query).get_text(" ").split(" ")]) 115 | title = (" ").join([z for z in BeautifulSoup(title).get_text(" ").split(" ")]) 116 | 117 | query=re.sub("[^a-zA-Z0-9]"," ", query) 
118 | title=re.sub("[^a-zA-Z0-9]"," ", title) 119 | 120 | query= (" ").join([stemmer.stem(z) for z in query.split(" ")]) 121 | title= (" ").join([stemmer.stem(z) for z in title.split(" ")]) 122 | 123 | query=" ".join(query.split()) 124 | title=" ".join(title.split()) 125 | 126 | #dist_qt = compression_distance(query,title) 127 | dist_qt2 = 1 - seq_matcher(None,query,title).ratio() 128 | 129 | query_len = len(query.split()) 130 | title_len = len(title.split()) 131 | 132 | tmp_title = title 133 | word_counter_qt = 0 134 | lev_dist_arr = [] 135 | for q in query.split(): 136 | lev_dist_q = [] 137 | for t in title.split(): 138 | lev_dist = seq_matcher(None,q,t).ratio() 139 | if lev_dist > 0.9: 140 | word_counter_qt += 1 141 | tmp_title += ' '+q # add such words to title to increase their weights in tfidf 142 | lev_dist_q.append(lev_dist) 143 | lev_dist_arr.append(lev_dist_q) 144 | last_word_in = 0 145 | for t in title.split(): 146 | lev_dist = seq_matcher(None,query.split()[-1],t).ratio() 147 | if lev_dist > 0.9: 148 | last_word_in = 1 149 | lev_max = 0 150 | for item in lev_dist_arr: 151 | lev_max_q = max(item) 152 | lev_max += lev_max_q 153 | lev_max = 1- lev_max/len(lev_dist_arr) 154 | word_counter_qt_norm = word_counter_qt/query_len 155 | X.append([query_len,title_len,word_counter_qt,lev_max,last_word_in,word_counter_qt_norm, dist_qt2]) 156 | titles.append(tmp_title) 157 | 158 | X = np.array(X).astype(np.float) 159 | return X, np.array(titles) 160 | 161 | logging.info("Assemble counts") 162 | X_counts_tr, titles_tr = assemble_counts(train_df) 163 | X_counts_te, titles_te = assemble_counts(test_df) 164 | 165 | logging.info("Assemble additional features") 166 | feat_list = [ 167 | u'w2v_sim', 168 | u'w2v_dist', 169 | u'tsne_title_1', 170 | u'tsne_title_2', 171 | u'tsne_qt_1', 172 | u'tsne_qt_2', 173 | u'cosine_qt_orig', 174 | u'cosine_qt_stem', 175 | u'cosine_qd_stem', 176 | u'set_qt_stem' 177 | ] 178 | X_additional_tr = train_df[feat_list].as_matrix() 179 | X_additional_te = test_df[feat_list].as_matrix() 180 | 181 | logging.info("Load w2v-based features") 182 | X_w2v_tr, X_w2v_te = pickle.load(open(cfg.path_processed + 'X_w2v.pickled')) 183 | 184 | logging.info("Load counts") 185 | X_counts2_tr = np.loadtxt(cfg.path_features + 'train_ext_counts_top10.txt') 186 | X_counts2_te = np.loadtxt(cfg.path_features + 'test_ext_counts_top10.txt') 187 | 188 | 189 | # Learinig routines (similar to model1 -- see description here) 190 | feat_list = [u'w2v_sim', 191 | u'cosine_qt_stem', 192 | u'cosine_qd_stem', 193 | u'set_qt_stem', 194 | u'tsne_title_1', 195 | u'tsne_title_2', 196 | u'tsne_qt_1', 197 | u'tsne_qt_2', 198 | ] 199 | 200 | def make_mf_sliced_regression(subset_tr, subset_te, clf, n_round=3, target_col='median_relevance'): 201 | print '\n [make_mf_slice]' 202 | print clf 203 | mf_tr = np.zeros(len(subset_tr)) 204 | mf_te = np.zeros(len(subset_te)) 205 | #query-slice 206 | for cur_query in subset_tr.query_stem.value_counts().index: 207 | mask_tr = subset_tr.query_stem == cur_query 208 | mask_te = subset_te.query_stem == cur_query 209 | 210 | # build Bow 211 | vect = CountVectorizer(min_df=1, ngram_range=(1,2)) 212 | 213 | txts = (list((subset_tr[mask_tr]['title_ext']).values) + 214 | list((subset_te[mask_te]['title_ext']).values)) 215 | vect.fit(txts) 216 | 217 | X_loc_base = vect.transform(list((subset_tr[mask_tr]['title_ext']).values)).todense() 218 | X_loc_hold = vect.transform(list((subset_te[mask_te]['title_ext']).values)).todense() 219 | y_loc_train = subset_tr[mask_tr][target_col].values 
220 | # intersect terms 221 | feat_counts = np.array(np.sum(X_loc_base, axis=0))[0] * np.array(np.sum(X_loc_hold, axis=0))[0] 222 | feat_mask = np.where(feat_counts>0)[0] 223 | # build final feats matrix 224 | X_loc_base = np.hstack((X_loc_base[:, feat_mask], subset_tr[mask_tr][feat_list])) 225 | X_loc_hold = np.hstack((X_loc_hold[:, feat_mask], subset_te[mask_te][feat_list])) 226 | 227 | # metafeatures iterators 228 | tmp_tr = np.zeros(sum(mask_tr)) 229 | tmp_te = np.zeros(sum(mask_te)) 230 | 231 | #print y_loc_train.shape, X_loc_base.shape 232 | 233 | for i in range(n_round): 234 | kf = KFold(len(y_loc_train), n_folds=2, shuffle=True, random_state=42+i*1000) 235 | for ind_tr, ind_te in kf: 236 | X_tr = X_loc_base[ind_tr] 237 | X_te = X_loc_base[ind_te] 238 | y_tr = y_loc_train[ind_tr] 239 | y_te = y_loc_train[ind_te] 240 | 241 | clf.fit(X_tr, y_tr) 242 | tmp_tr[ind_te] += clf.predict(X_te) 243 | tmp_te += clf.predict(X_loc_hold)*0.5 244 | mf_tr[mask_tr.values] = tmp_tr / n_round 245 | mf_te[mask_te.values] = tmp_te / n_round 246 | 247 | y_valid = subset_tr[target_col].values 248 | kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr)) 249 | acc = np.mean(y_valid == np.round(mf_tr)) 250 | print '[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 251 | return (mf_tr, mf_te) 252 | 253 | 254 | def make_mf_sliced_classification(subset_tr, subset_te, clf, n_round=3, target_col='median_relevance'): 255 | print '\n [make_mf_slice]' 256 | print clf 257 | mf_tr = np.zeros(len(subset_tr)) 258 | mf_te = np.zeros(len(subset_te)) 259 | 260 | #query-slice 261 | for cur_query in subset_tr.query_stem.value_counts().index: 262 | mask_tr = subset_tr.query_stem == cur_query 263 | mask_te = subset_te.query_stem == cur_query 264 | 265 | # build Bow 266 | vect = CountVectorizer(min_df=1, ngram_range=(1,2)) 267 | 268 | txts = (list((subset_tr[mask_tr]['title_ext']).values) + 269 | list((subset_te[mask_te]['title_ext']).values)) 270 | vect.fit(txts) 271 | 272 | X_loc_base = vect.transform(list((subset_tr[mask_tr]['title_ext']).values)).todense() 273 | X_loc_hold = vect.transform(list((subset_te[mask_te]['title_ext']).values)).todense() 274 | y_loc_train = subset_tr[mask_tr][target_col].values 275 | # intersect terms 276 | feat_counts = np.array(np.sum(X_loc_base, axis=0))[0] * np.array(np.sum(X_loc_hold, axis=0))[0] 277 | feat_mask = np.where(feat_counts>0)[0] 278 | # build final feats matrix 279 | X_loc_base = np.hstack((X_loc_base[:, feat_mask], subset_tr[mask_tr][feat_list])) 280 | X_loc_hold = np.hstack((X_loc_hold[:, feat_mask], subset_te[mask_te][feat_list])) 281 | 282 | # metafeatures iterators 283 | tmp_tr = np.zeros(sum(mask_tr)) 284 | tmp_te = np.zeros(sum(mask_te)) 285 | 286 | #print y_loc_train.shape, X_loc_base.shape 287 | 288 | for i in range(n_round): 289 | kf = KFold(len(y_loc_train), n_folds=2, shuffle=True, random_state=42+i*1000) 290 | for ind_tr, ind_te in kf: 291 | X_tr = X_loc_base[ind_tr] 292 | X_te = X_loc_base[ind_te] 293 | y_tr = y_loc_train[ind_tr] 294 | y_te = y_loc_train[ind_te] 295 | 296 | clf.fit(X_tr, y_tr) 297 | tmp_tr[ind_te] += clf.predict(X_te) 298 | tmp_te += clf.predict(X_loc_hold)*0.5 299 | mf_tr[mask_tr.values] = tmp_tr / n_round 300 | mf_te[mask_te.values] = tmp_te / n_round 301 | 302 | y_valid = subset_tr[target_col].values 303 | kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr)) 304 | acc = np.mean(y_valid == np.round(mf_tr)) 305 | print '[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 306 | return (mf_tr, mf_te) 307 | 308 | 309 | def 
make_mf_regression(X ,y, clf, qid, X_test, n_round=3): 310 | print clf 311 | mf_tr = np.zeros(X.shape[0]) 312 | mf_te = np.zeros(X_test.shape[0]) 313 | for i in range(n_round): 314 | skf = StratifiedKFold(qid, n_folds=2, shuffle=True, random_state=42+i*1000) 315 | for ind_tr, ind_te in skf: 316 | X_tr = X[ind_tr] 317 | X_te = X[ind_te] 318 | 319 | y_tr = y[ind_tr] 320 | y_te = y[ind_te] 321 | 322 | clf.fit(X_tr, y_tr) 323 | mf_tr[ind_te] += clf.predict(X_te) 324 | mf_te += clf.predict(X_test)*0.5 325 | 326 | y_pred = np.round(clf.predict(X_te)) 327 | kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred) 328 | acc = np.mean(y_te == y_pred) 329 | print 'pred[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 330 | return (mf_tr / n_round, mf_te / n_round) 331 | 332 | 333 | def make_mf_classification4(X ,y, clf, qid, X_test, n_round=3): 334 | print clf 335 | mf_tr = np.zeros((X.shape[0], 5)) 336 | mf_te = np.zeros((X_test.shape[0], 5)) 337 | for i in range(n_round): 338 | skf = StratifiedKFold(qid, n_folds=2, shuffle=True, random_state=42+i*1000) 339 | for ind_tr, ind_te in skf: 340 | X_tr = X[ind_tr] 341 | X_te = X[ind_te] 342 | 343 | y_tr = y[ind_tr] 344 | y_te = y[ind_te] 345 | 346 | clf.fit(X_tr, y_tr) 347 | mf_tr[ind_te, 4] += clf.predict(X_te) 348 | mf_te[:, 4] += clf.predict(X_test)*0.5 349 | try: 350 | mf_tr[ind_te, :4] += clf.predict_proba(X_te) 351 | mf_te[:, :4] += clf.predict_proba(X_test)*0.5 352 | except: 353 | mf_tr[ind_te, :4] += clf.decision_function(X_te) 354 | mf_te[:,:4] += clf.decision_function(X_test)*0.5 355 | y_pred = np.round(clf.predict(X_te)) 356 | kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred) 357 | acc = np.mean(y_te == y_pred) 358 | print 'prob[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 359 | print 360 | return (mf_tr / n_round, mf_te / n_round) 361 | 362 | 363 | def make_mf_classification2(X ,y, clf, qid, X_test, n_round=3): 364 | print clf 365 | mf_tr = np.zeros((X.shape[0], 2)) 366 | mf_te = np.zeros((X_test.shape[0], 2)) 367 | for i in range(n_round): 368 | skf = StratifiedKFold(qid, n_folds=2, shuffle=True, random_state=42+i*1000) 369 | for ind_tr, ind_te in skf: 370 | X_tr = X[ind_tr] 371 | X_te = X[ind_te] 372 | 373 | y_tr = y[ind_tr] 374 | y_te = y[ind_te] 375 | 376 | clf.fit(X_tr, y_tr) 377 | try: 378 | mf_tr[ind_te] += clf.predict_proba(X_te) 379 | mf_te += clf.predict_proba(X_test)*0.5 380 | except: 381 | mf_tr[ind_te, 0] += clf.decision_function(X_te) 382 | mf_te[:, 0] += clf.decision_function(X_test)*0.5 383 | 384 | y_pred = np.round(clf.predict(X_te)) 385 | kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred) 386 | acc = np.mean(y_te == y_pred) 387 | print 'prob[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 388 | print 389 | return (mf_tr / n_round, mf_te / n_round) 390 | 391 | 392 | def learn_class_separators(clf, X_1, X_2, n_round): 393 | class_sep_4 = make_mf_classification2(X_1, (y_base<4), clf, q_base, X_2, n_round) 394 | class_sep_3 = make_mf_classification2(X_1, (y_base<3), clf, q_base, X_2, n_round) 395 | class_sep_2 = make_mf_classification2(X_1, (y_base<2), clf, q_base, X_2, n_round) 396 | class_sep_23 = make_mf_classification2(X_1, (y_base<4)*(y_base>1), clf, q_base, X_2, n_round) 397 | ret_tr = np.hstack((class_sep_4[0], class_sep_3[0], class_sep_2[0], class_sep_23[0])) 398 | ret_te = np.hstack((class_sep_4[1], class_sep_3[1], class_sep_2[1], class_sep_23[1])) 399 | return (ret_tr[:, 1::2], ret_te[:, 1::2]) 400 | 401 | 402 | logging.info("Assing names to featues") 403 | X_base_tf = X_all_tr 404 | X_hold_tf = X_all_te 405 | X_base_add 
= X_additional_tr 406 | X_hold_add = X_additional_te 407 | X_base_w2v = X_w2v_tr 408 | X_hold_w2v = X_w2v_te 409 | X_base_counts = X_counts_tr 410 | X_hold_counts = X_counts_te 411 | X_base_counts2 = X_counts2_tr 412 | X_hold_counts2 = X_counts2_te 413 | q_base = qid_tr 414 | q_hold = qid_te 415 | y_base = y_all_tr 416 | 417 | logging.info("Learn metafeatures") 418 | # make features 419 | rf_add_sep = learn_class_separators( 420 | RandomForestClassifier(n_estimators=500, n_jobs=-1,criterion='entropy', random_state=42), 421 | np.hstack((X_base_add, X_base_counts2)), 422 | np.hstack((X_hold_add, X_hold_counts2)), 423 | n_round=5) 424 | 425 | mfs_rf_reg = make_mf_sliced_regression( 426 | train_df, 427 | test_df, 428 | RandomForestRegressor(n_estimators=500, max_features=0.3, random_state=42), 429 | n_round=3) 430 | 431 | mfs_rf_clf = make_mf_sliced_regression( 432 | train_df, 433 | test_df, 434 | RandomForestClassifier(n_estimators=500, max_features=0.3, random_state=42), 435 | n_round=3) 436 | 437 | mf_lsvc_clf = make_mf_classification4( 438 | X_base_tf, 439 | y_base, 440 | LinearSVC(), 441 | q_base, 442 | X_hold_tf, 443 | n_round=10) 444 | 445 | mf_lsvr_reg = make_mf_regression( 446 | X_base_tf, 447 | y_base, 448 | LinearSVR(), 449 | q_base, 450 | X_hold_tf, 451 | n_round=10) 452 | 453 | 454 | logging.info("Assemble 2nd level features") 455 | X_train = np.hstack( 456 | (X_base_add, 457 | mf_lsvr_reg[0][:, np.newaxis], 458 | mf_lsvc_clf[0], 459 | rf_add_sep[0], 460 | mfs_rf_clf[0][:, np.newaxis], 461 | mfs_rf_reg[0][:, np.newaxis], 462 | X_base_counts, 463 | X_base_counts2)) 464 | 465 | X_test = np.hstack( 466 | (X_hold_add, 467 | mf_lsvr_reg[1][:, np.newaxis], 468 | mf_lsvc_clf[1], 469 | rf_add_sep[1], 470 | mfs_rf_clf[1][:, np.newaxis], 471 | mfs_rf_reg[1][:, np.newaxis], 472 | X_hold_counts, 473 | X_hold_counts2)) 474 | 475 | logging.info("Fit 2nd level model") 476 | rfR = RandomForestRegressor(n_estimators=25000, n_jobs=-1, min_samples_split=3, random_state=42) 477 | rfR.fit(X_train, y_base) 478 | y_pred_rfR = rfR.predict(X_test) 479 | 480 | logging.info("Dumping prediction") 481 | np.savetxt(cfg.path_processed + 'mikhail_model2.txt', y_pred_rfR) 482 | 483 | logging.info("Done!") -------------------------------------------------------------------------------- /fit_model1_mikhail.py: -------------------------------------------------------------------------------- 1 | import cfg 2 | import kappa as pykappa 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import scipy.sparse as sp 7 | import cPickle as pickle 8 | 9 | import re 10 | from nltk.stem import PorterStemmer 11 | from bs4 import BeautifulSoup 12 | from difflib import SequenceMatcher as seq_matcher 13 | 14 | from sklearn.preprocessing import LabelEncoder 15 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 16 | from sklearn.preprocessing import StandardScaler 17 | from sklearn.decomposition import TruncatedSVD 18 | 19 | from sklearn.cross_validation import KFold, StratifiedKFold, StratifiedShuffleSplit 20 | from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier 21 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 22 | from sklearn.svm import LinearSVC, LinearSVR 23 | 24 | 25 | import logging 26 | logging.basicConfig(format='[%(asctime)s] %(message)s', level=logging.INFO) 27 | logging.info("Mikhail's model_1 learner script") 28 | 29 | 30 | logging.info("Preparing features") 31 | # load data 32 | train_df = pd.read_pickle(cfg.path_processed + 'train_df') 33 
| test_df = pd.read_pickle(cfg.path_processed + 'test_df') 34 | sampleSubmission = pd.read_csv(cfg.path_sampleSubmission) 35 | 36 | logging.info("Collecting BoWs") 37 | # collect BoW + TfIdf for query 38 | vect = TfidfVectorizer(ngram_range=(1,2), min_df=2, encoding='utf-8') 39 | vect.fit(list(train_df['query_stem'].values) + list(test_df['query_stem'].values)) 40 | X_query_tr = vect.transform(train_df['query_stem'].values) 41 | X_query_te = vect.transform(test_df['query_stem'].values) 42 | 43 | # same for title (note -- train and test are intersected by words) 44 | vect.fit(list(train_df['title_stem'].values) + list(test_df['title_stem'].values)) 45 | X_tmp_tr = vect.transform(train_df['title_stem'].values).tocsc() 46 | X_tmp_te = vect.transform(test_df['title_stem'].values).tocsc() 47 | freq_tr = np.array(X_tmp_tr.sum(axis=0))[0] 48 | freq_te = np.array(X_tmp_te.sum(axis=0))[0] 49 | col_mask = np.where((freq_tr * freq_te)!=0)[0] 50 | X_title_tr = X_tmp_tr[:, col_mask].tocsr() 51 | X_title_te = X_tmp_te[:, col_mask].tocsr() 52 | 53 | # same for descriprion 54 | vect.fit(list(train_df['desc_stem'].values) + list(test_df['desc_stem'].values)) 55 | X_tmp_tr = vect.transform(train_df['desc_stem'].values).tocsc() 56 | X_tmp_te = vect.transform(test_df['desc_stem'].values).tocsc() 57 | freq_tr = np.array(X_tmp_tr.sum(axis=0))[0] 58 | freq_te = np.array(X_tmp_te.sum(axis=0))[0] 59 | col_mask = np.where((freq_tr * freq_te)!=0)[0] 60 | X_desc_tr = X_tmp_tr[:, col_mask].tocsr() 61 | X_desc_te = X_tmp_te[:, col_mask].tocsr() 62 | 63 | # concatenate all BoWs into one(two) matrix 64 | X_all_tr = sp.hstack([X_query_tr, X_title_tr, X_desc_tr]).tocsr() 65 | X_all_te = sp.hstack([X_query_te, X_title_te, X_desc_te]).tocsr() 66 | 67 | # coding query by id (qid) 68 | le = LabelEncoder() 69 | le.fit(train_df['query_stem'].values) 70 | qid_tr = le.transform(train_df['query_stem'].values) 71 | qid_te = le.transform(test_df['query_stem'].values) 72 | y_all_tr = train_df['median_relevance'].values 73 | 74 | 75 | ## Part of Dmotry's features 76 | logging.info("Calc counts") 77 | stemmer = PorterStemmer() 78 | 79 | class stemmerUtility(object): 80 | #Stemming functionality 81 | @staticmethod 82 | def stemPorter(review_text): 83 | porter = PorterStemmer() 84 | preprocessed_docs = [] 85 | for doc in review_text: 86 | final_doc = [] 87 | for word in doc: 88 | final_doc.append(porter.stem(word)) 89 | preprocessed_docs.append(final_doc) 90 | return preprocessed_docs 91 | 92 | def correct_string(s): 93 | s = s.replace("hardisk", "hard drive") 94 | s = s.replace("extenal", "external") 95 | s = s.replace("soda stream", "sodastream") 96 | s = s.replace("fragance", "fragrance") 97 | s = s.replace("16 gb", "16gb") 98 | s = s.replace("32 gb", "32gb") 99 | s = s.replace("500 gb", "500gb") 100 | s = s.replace("2 tb", "2tb") 101 | s = s.replace("shoppe", "shop") 102 | s = s.replace("refrigirator", "refrigerator") 103 | s = s.replace("assassinss", "assassins") 104 | s = s.replace("harleydavidson", "harley davidson") 105 | s = s.replace("harley-davidson", "harley davidson") 106 | return s 107 | 108 | 109 | def assemble_counts(train): 110 | X = [] 111 | titles = [] 112 | for i in range(len(train.id)): 113 | query = correct_string(train['query'][i].lower()) 114 | title = correct_string(train.product_title[i].lower()) 115 | 116 | query = (" ").join([z for z in BeautifulSoup(query).get_text(" ").split(" ")]) 117 | title = (" ").join([z for z in BeautifulSoup(title).get_text(" ").split(" ")]) 118 | 119 | query=re.sub("[^a-zA-Z0-9]"," ", 
query) 120 | title=re.sub("[^a-zA-Z0-9]"," ", title) 121 | 122 | query= (" ").join([stemmer.stem(z) for z in query.split(" ")]) 123 | title= (" ").join([stemmer.stem(z) for z in title.split(" ")]) 124 | 125 | query=" ".join(query.split()) 126 | title=" ".join(title.split()) 127 | 128 | #dist_qt = compression_distance(query,title) 129 | dist_qt2 = 1 - seq_matcher(None,query,title).ratio() 130 | 131 | query_len = len(query.split()) 132 | title_len = len(title.split()) 133 | 134 | tmp_title = title 135 | word_counter_qt = 0 136 | lev_dist_arr = [] 137 | for q in query.split(): 138 | lev_dist_q = [] 139 | for t in title.split(): 140 | lev_dist = seq_matcher(None,q,t).ratio() 141 | if lev_dist > 0.9: 142 | word_counter_qt += 1 143 | tmp_title += ' '+q # add such words to title to increase their weights in tfidf 144 | lev_dist_q.append(lev_dist) 145 | lev_dist_arr.append(lev_dist_q) 146 | last_word_in = 0 147 | for t in title.split(): 148 | lev_dist = seq_matcher(None,query.split()[-1],t).ratio() 149 | if lev_dist > 0.9: 150 | last_word_in = 1 151 | lev_max = 0 152 | for item in lev_dist_arr: 153 | lev_max_q = max(item) 154 | lev_max += lev_max_q 155 | lev_max = 1- lev_max/len(lev_dist_arr) 156 | word_counter_qt_norm = word_counter_qt/query_len 157 | X.append([query_len,title_len,word_counter_qt,lev_max,last_word_in,word_counter_qt_norm, dist_qt2]) 158 | titles.append(tmp_title) 159 | 160 | X = np.array(X).astype(np.float) 161 | return X, np.array(titles) 162 | 163 | # collect counts 164 | X_counts_tr, titles_tr = assemble_counts(train_df) 165 | X_counts_te, titles_te = assemble_counts(test_df) 166 | 167 | 168 | # collect additional features 169 | logging.info("Collect additional features") 170 | feat_list = [ 171 | u'w2v_sim', 172 | u'w2v_dist', 173 | u'tsne_title_1', 174 | u'tsne_title_2', 175 | u'tsne_qt_1', 176 | u'tsne_qt_2', 177 | u'cosine_qt_orig', 178 | u'cosine_qt_stem', 179 | u'cosine_qd_stem', 180 | u'set_qt_stem' 181 | ] 182 | X_additional_tr = train_df[feat_list].as_matrix() 183 | X_additional_te = test_df[feat_list].as_matrix() 184 | 185 | logging.info("Load w2v-based features") 186 | X_w2v_tr, X_w2v_te = pickle.load(open(cfg.path_processed + 'X_w2v.pickled')) 187 | 188 | logging.info("Load extended counts") 189 | X_counts2_tr = np.loadtxt(cfg.path_features + 'train_ext_counts_top10.txt') 190 | X_counts2_te = np.loadtxt(cfg.path_features + 'test_ext_counts_top10.txt') 191 | 192 | 193 | ##################### 194 | ### Learning part ### 195 | ##################### 196 | logging.info("Learning part") 197 | 198 | feat_list = [u'w2v_sim', 199 | u'cosine_qt_stem', 200 | u'cosine_qd_stem', 201 | u'set_qt_stem', 202 | u'tsne_title_1', 203 | u'tsne_title_2', 204 | u'tsne_qt_1', 205 | u'tsne_qt_2' 206 | ] 207 | 208 | def make_mf_sliced_regression(subset_tr, subset_te, clf, n_round=3, target_col='median_relevance'): 209 | ''' 210 | Perform per-query slicing, BoW on text, fit @clf and get prediction for test. 
Assumed that @clf -- regressor 211 | ''' 212 | print '\n [make_mf_slice]' 213 | print clf 214 | mf_tr = np.zeros(len(subset_tr)) 215 | mf_te = np.zeros(len(subset_te)) 216 | 217 | #query-slice 218 | for cur_query in subset_tr.query_stem.value_counts().index: 219 | mask_tr = subset_tr.query_stem == cur_query 220 | mask_te = subset_te.query_stem == cur_query 221 | 222 | # build Bow 223 | vect = CountVectorizer(min_df=1, ngram_range=(1,2)) 224 | 225 | txts = (list((subset_tr[mask_tr]['title_stem']).values) + 226 | list((subset_te[mask_te]['title_stem']).values)) 227 | vect.fit(txts) 228 | 229 | X_loc_base = vect.transform(list((subset_tr[mask_tr]['title_stem']).values)).todense() 230 | X_loc_hold = vect.transform(list((subset_te[mask_te]['title_stem']).values)).todense() 231 | y_loc_train = subset_tr[mask_tr][target_col].values 232 | # intersect terms 233 | feat_counts = np.array(np.sum(X_loc_base, axis=0))[0] * np.array(np.sum(X_loc_hold, axis=0))[0] 234 | feat_mask = np.where(feat_counts>0)[0] 235 | # build final feats matrix 236 | X_loc_base = np.hstack((X_loc_base[:, feat_mask], subset_tr[mask_tr][feat_list])) 237 | X_loc_hold = np.hstack((X_loc_hold[:, feat_mask], subset_te[mask_te][feat_list])) 238 | 239 | # metafeatures iterators 240 | tmp_tr = np.zeros(sum(mask_tr)) 241 | tmp_te = np.zeros(sum(mask_te)) 242 | 243 | #print y_loc_train.shape, X_loc_base.shape 244 | 245 | for i in range(n_round): 246 | kf = KFold(len(y_loc_train), n_folds=2, shuffle=True, random_state=42+i*1000) 247 | for ind_tr, ind_te in kf: 248 | X_tr = X_loc_base[ind_tr] 249 | X_te = X_loc_base[ind_te] 250 | y_tr = y_loc_train[ind_tr] 251 | y_te = y_loc_train[ind_te] 252 | 253 | clf.fit(X_tr, y_tr) 254 | tmp_tr[ind_te] += clf.predict(X_te) 255 | tmp_te += clf.predict(X_loc_hold)*0.5 256 | mf_tr[mask_tr.values] = tmp_tr / n_round 257 | mf_te[mask_te.values] = tmp_te / n_round 258 | 259 | y_valid = subset_tr[target_col].values 260 | kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr)) 261 | acc = np.mean(y_valid == np.round(mf_tr)) 262 | print '[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 263 | return (mf_tr, mf_te) 264 | 265 | 266 | def make_mf_sliced_classification(subset_tr, subset_te, clf, n_round=3, target_col='median_relevance'): 267 | ''' 268 | Perform per-query slicing, BoW on text, fit @clf and get prediction for test. 
Assumed that @clf -- classifier 269 | ''' 270 | print '\n [make_mf_slice]' 271 | print clf 272 | mf_tr = np.zeros(len(subset_tr)) 273 | mf_te = np.zeros(len(subset_te)) 274 | 275 | #query-slice 276 | for cur_query in subset_tr.query_stem.value_counts().index: 277 | mask_tr = subset_tr.query_stem == cur_query 278 | mask_te = subset_te.query_stem == cur_query 279 | 280 | # build Bow 281 | vect = CountVectorizer(min_df=1, ngram_range=(1,2)) 282 | 283 | txts = (list((subset_tr[mask_tr]['title_stem']).values) + 284 | list((subset_te[mask_te]['title_stem']).values)) 285 | vect.fit(txts) 286 | 287 | X_loc_base = vect.transform(list((subset_tr[mask_tr]['title_stem']).values)).todense() 288 | X_loc_hold = vect.transform(list((subset_te[mask_te]['title_stem']).values)).todense() 289 | y_loc_train = subset_tr[mask_tr][target_col].values 290 | # intersect terms 291 | feat_counts = np.array(np.sum(X_loc_base, axis=0))[0] * np.array(np.sum(X_loc_hold, axis=0))[0] 292 | feat_mask = np.where(feat_counts>0)[0] 293 | # build final feats matrix 294 | X_loc_base = np.hstack((X_loc_base[:, feat_mask], subset_tr[mask_tr][feat_list])) 295 | X_loc_hold = np.hstack((X_loc_hold[:, feat_mask], subset_te[mask_te][feat_list])) 296 | 297 | # metafeatures iterators 298 | tmp_tr = np.zeros(sum(mask_tr)) 299 | tmp_te = np.zeros(sum(mask_te)) 300 | 301 | #print y_loc_train.shape, X_loc_base.shape 302 | 303 | for i in range(n_round): 304 | kf = KFold(len(y_loc_train), n_folds=2, shuffle=True, random_state=42+i*1000) 305 | for ind_tr, ind_te in kf: 306 | X_tr = X_loc_base[ind_tr] 307 | X_te = X_loc_base[ind_te] 308 | y_tr = y_loc_train[ind_tr] 309 | y_te = y_loc_train[ind_te] 310 | 311 | clf.fit(X_tr, y_tr) 312 | tmp_tr[ind_te] += clf.predict(X_te) 313 | tmp_te += clf.predict(X_loc_hold)*0.5 314 | mf_tr[mask_tr.values] = tmp_tr / n_round 315 | mf_te[mask_te.values] = tmp_te / n_round 316 | 317 | y_valid = subset_tr[target_col].values 318 | kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr)) 319 | acc = np.mean(y_valid == np.round(mf_tr)) 320 | print '[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 321 | return (mf_tr, mf_te) 322 | 323 | 324 | def make_mf_regression(X ,y, clf, qid, X_test, n_round=3): 325 | ''' 326 | Fit metafeature by @clf and get prediction for test. Assumed that @clf -- regressor 327 | ''' 328 | print clf 329 | mf_tr = np.zeros(X.shape[0]) 330 | mf_te = np.zeros(X_test.shape[0]) 331 | for i in range(n_round): 332 | skf = StratifiedKFold(qid, n_folds=2, shuffle=True, random_state=42+i*1000) 333 | for ind_tr, ind_te in skf: 334 | X_tr = X[ind_tr] 335 | X_te = X[ind_te] 336 | 337 | y_tr = y[ind_tr] 338 | y_te = y[ind_te] 339 | 340 | clf.fit(X_tr, y_tr) 341 | mf_tr[ind_te] += clf.predict(X_te) 342 | mf_te += clf.predict(X_test)*0.5 343 | 344 | y_pred = np.round(clf.predict(X_te)) 345 | kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred) 346 | acc = np.mean(y_te == y_pred) 347 | print 'pred[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 348 | return (mf_tr / n_round, mf_te / n_round) 349 | 350 | 351 | def make_mf_classification4(X ,y, clf, qid, X_test, n_round=3): 352 | ''' 353 | Fit metafeature by @clf and get prediction for test. 
Assumed that @clf -- classifier 354 | ''' 355 | print clf 356 | mf_tr = np.zeros((X.shape[0], 5)) 357 | mf_te = np.zeros((X_test.shape[0], 5)) 358 | for i in range(n_round): 359 | skf = StratifiedKFold(qid, n_folds=2, shuffle=True, random_state=42+i*1000) 360 | for ind_tr, ind_te in skf: 361 | X_tr = X[ind_tr] 362 | X_te = X[ind_te] 363 | 364 | y_tr = y[ind_tr] 365 | y_te = y[ind_te] 366 | 367 | clf.fit(X_tr, y_tr) 368 | mf_tr[ind_te, 4] += clf.predict(X_te) 369 | mf_te[:, 4] += clf.predict(X_test)*0.5 370 | try: 371 | mf_tr[ind_te, :4] += clf.predict_proba(X_te) 372 | mf_te[:, :4] += clf.predict_proba(X_test)*0.5 373 | except: 374 | mf_tr[ind_te, :4] += clf.decision_function(X_te) 375 | mf_te[:,:4] += clf.decision_function(X_test)*0.5 376 | y_pred = np.round(clf.predict(X_te)) 377 | kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred) 378 | acc = np.mean(y_te == y_pred) 379 | print 'prob[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 380 | print 381 | return (mf_tr / n_round, mf_te / n_round) 382 | 383 | 384 | def make_mf_classification2(X ,y, clf, qid, X_test, n_round=3): 385 | ''' 386 | Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier and only 2 class presented 387 | ''' 388 | print clf 389 | mf_tr = np.zeros((X.shape[0], 2)) 390 | mf_te = np.zeros((X_test.shape[0], 2)) 391 | for i in range(n_round): 392 | skf = StratifiedKFold(qid, n_folds=2, shuffle=True, random_state=42+i*1000) 393 | for ind_tr, ind_te in skf: 394 | X_tr = X[ind_tr] 395 | X_te = X[ind_te] 396 | 397 | y_tr = y[ind_tr] 398 | y_te = y[ind_te] 399 | 400 | clf.fit(X_tr, y_tr) 401 | try: 402 | mf_tr[ind_te] += clf.predict_proba(X_te) 403 | mf_te += clf.predict_proba(X_test)*0.5 404 | except: 405 | mf_tr[ind_te, 0] += clf.decision_function(X_te) 406 | mf_te[:, 0] += clf.decision_function(X_test)*0.5 407 | 408 | y_pred = np.round(clf.predict(X_te)) 409 | kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred) 410 | acc = np.mean(y_te == y_pred) 411 | print 'prob[{}] kappa:{}, acc:{}'.format(i, kappa, acc) 412 | print 413 | return (mf_tr / n_round, mf_te / n_round) 414 | 415 | 416 | 417 | def learn_class_separators(clf, X_1, X_2, n_round): 418 | ''' 419 | Fit "bumpers" metafetures. 
420 | ''' 421 | class_sep_4 = make_mf_classification2(X_1, (y_base<4), clf, q_base, X_2, n_round) 422 | class_sep_3 = make_mf_classification2(X_1, (y_base<3), clf, q_base, X_2, n_round) 423 | class_sep_2 = make_mf_classification2(X_1, (y_base<2), clf, q_base, X_2, n_round) 424 | class_sep_23 = make_mf_classification2(X_1, (y_base<4)*(y_base>1), clf, q_base, X_2, n_round) 425 | ret_tr = np.hstack((class_sep_4[0], class_sep_3[0], class_sep_2[0], class_sep_23[0])) 426 | ret_te = np.hstack((class_sep_4[1], class_sep_3[1], class_sep_2[1], class_sep_23[1])) 427 | return (ret_tr[:, 1::2], ret_te[:, 1::2]) 428 | 429 | 430 | logging.info('Connect data') 431 | # assing names 432 | X_base_tf = X_all_tr 433 | X_hold_tf = X_all_te 434 | X_base_add = X_additional_tr 435 | X_hold_add = X_additional_te 436 | X_base_w2v = X_w2v_tr 437 | X_hold_w2v = X_w2v_te 438 | q_base = qid_tr 439 | q_hold = qid_te 440 | X_base_counts = X_counts_tr 441 | X_hold_counts = X_counts_te 442 | X_base_counts2 = X_counts2_tr 443 | X_hold_counts2 = X_counts2_te 444 | y_base = y_all_tr 445 | 446 | 447 | # make metafeatures 448 | logging.info('Learn metafeatures') 449 | mfs_rf_reg = make_mf_sliced_regression( 450 | train_df, 451 | test_df, 452 | RandomForestRegressor(n_estimators=500, max_features=0.3, random_state=42), 453 | n_round=3) 454 | 455 | mfs_rf_clf = make_mf_sliced_regression( 456 | train_df, 457 | test_df, 458 | RandomForestClassifier(n_estimators=500, max_features=0.3, random_state=42), 459 | n_round=3) 460 | 461 | mf_lsvc_clf = make_mf_classification4( 462 | X_base_tf, 463 | y_base, 464 | LinearSVC(), 465 | q_base, 466 | X_hold_tf, 467 | n_round=10) 468 | 469 | mf_lsvr_reg = make_mf_regression( 470 | X_base_tf, 471 | y_base, 472 | LinearSVR(), 473 | q_base, 474 | X_hold_tf, 475 | n_round=10) 476 | 477 | rf_add_sep = learn_class_separators( 478 | RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=42), 479 | X_base_add, 480 | X_hold_add, 481 | n_round=3) 482 | 483 | # bind features 484 | logging.info('Assembling features for 2nd level') 485 | X_train = np.hstack( 486 | (X_base_add, 487 | mf_lsvr_reg[0][:, np.newaxis], 488 | mf_lsvc_clf[0], 489 | rf_add_sep[0], 490 | mfs_rf_clf[0][:, np.newaxis], 491 | mfs_rf_reg[0][:, np.newaxis], 492 | X_base_counts, 493 | X_base_counts2)) 494 | 495 | X_test = np.hstack( 496 | (X_hold_add, 497 | mf_lsvr_reg[1][:, np.newaxis], 498 | mf_lsvc_clf[1], 499 | rf_add_sep[1], 500 | mfs_rf_clf[1][:, np.newaxis], 501 | mfs_rf_reg[1][:, np.newaxis], 502 | X_hold_counts, 503 | X_hold_counts2)) 504 | 505 | # fit main model 506 | logging.info('Learning 2nd level model') 507 | rfR = RandomForestRegressor(n_estimators=15000, n_jobs=-1, min_samples_split=3, random_state=42) 508 | rfR.fit(X_train, y_base) 509 | y_pred_rfR = rfR.predict(X_test) 510 | 511 | # dump prediction 512 | logging.info('Dumping prediction') 513 | np.savetxt(cfg.path_processed + 'mikhail_model1.txt', y_pred_rfR) 514 | 515 | logging.info('Done!') --------------------------------------------------------------------------------
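Appendix (editor's note, not part of the original repository): every `make_mf_*` helper in `fit_model1_mikhail.py` above follows the same out-of-fold stacking recipe: repeat a 2-fold split `n_round` times, fit the base model on one half, write its predictions for the held-out half into a train-set metafeature, and average its test-set predictions (each of the two folds contributes 0.5 per round). The snippet below is only a minimal, self-contained sketch of that recipe; the helper name `make_oof` is illustrative, and it uses a plain `KFold` for brevity where `make_mf_regression` actually stratifies the folds by query id (`qid`). It sticks to the same pre-0.18 `n_folds`-style KFold interface used throughout the code above.

# Illustrative sketch only -- not part of the competition pipeline.
import numpy as np
from sklearn.cross_validation import KFold  # old-style API, as used in the code above


def make_oof(clf, X, y, X_test, n_round=3):
    """Return (train_metafeature, test_metafeature) produced by one base model @clf."""
    mf_tr = np.zeros(X.shape[0])
    mf_te = np.zeros(X_test.shape[0])
    for i in range(n_round):
        kf = KFold(X.shape[0], n_folds=2, shuffle=True, random_state=42 + i * 1000)
        for ind_tr, ind_te in kf:
            clf.fit(X[ind_tr], y[ind_tr])
            mf_tr[ind_te] += clf.predict(X[ind_te])  # out-of-fold predictions fill the train metafeature
            mf_te += clf.predict(X_test) * 0.5       # the two folds of one round average to one test prediction
    return mf_tr / n_round, mf_te / n_round

# Usage in the spirit of the code above (names taken from fit_model1_mikhail.py):
#   mf_tr, mf_te = make_oof(LinearSVR(), X_base_tf, y_base, X_hold_tf, n_round=10)
#   X_train = np.hstack((X_base_add, mf_tr[:, np.newaxis], ...))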