├── Code ├── Chenglong │ ├── __init__.py │ ├── conf │ │ ├── README.md │ │ ├── feature_conf_linear_201604172250.py │ │ ├── feature_conf_linear_201605010104.py │ │ ├── feature_conf_nonlinear_201604170111.py │ │ ├── feature_conf_nonlinear_201604210409.py │ │ └── feature_conf_nonlinear_201605010058.py │ ├── config.py │ ├── convert_csv_tsne_to_pkl_tsne.py │ ├── convert_pkl_lsa_to_csv_lsa.py │ ├── data_preparer.py │ ├── data_processor.py │ ├── embedding_trainer.py │ ├── extreme_ensemble_selection.py │ ├── feature_base.py │ ├── feature_basic.py │ ├── feature_combiner.py │ ├── feature_distance.py │ ├── feature_doc2vec.py │ ├── feature_first_last_ngram.py │ ├── feature_group_distance.py │ ├── feature_group_distance_stat.py │ ├── feature_group_relevance.py │ ├── feature_intersect_count.py │ ├── feature_intersect_position.py │ ├── feature_match.py │ ├── feature_query_quality.py │ ├── feature_stat_cooc_tfidf.py │ ├── feature_transformer.py │ ├── feature_tsne.R │ ├── feature_vector_space.py │ ├── feature_word2vec.py │ ├── feature_wordnet_similarity.py │ ├── gen_best_ensemble_model.py │ ├── gen_best_single_model.py │ ├── get_feature_conf_linear.py │ ├── get_feature_conf_linear_stacking.py │ ├── get_feature_conf_nonlinear.py │ ├── get_stacking_feature_conf.py │ ├── google_spelling_checker_dict.py │ ├── model_param_space.py │ ├── plot_CV_LB.py │ ├── plot_feature_corr.py │ ├── run_data.py │ ├── run_stacking_ridge.py │ ├── run_test_ridge.py │ ├── run_test_xgb.py │ ├── spelling_checker.py │ ├── splitter.py │ ├── task.py │ ├── turing_test_converter.py │ └── utils │ │ ├── __init__.py │ │ ├── dist_utils.py │ │ ├── keras_utils.py │ │ ├── logging_utils.py │ │ ├── ngram_utils.py │ │ ├── nlp_utils.py │ │ ├── np_utils.py │ │ ├── os_utils.py │ │ ├── pkl_utils.py │ │ ├── rgf_utils.py │ │ ├── skl_utils.py │ │ ├── time_utils.py │ │ └── xgb_utils.py └── Igor&Kostia │ ├── config_IgorKostia.py │ ├── dld_features.py │ ├── ensemble_script_imitation_version.py │ ├── ensemble_script_random_version.py │ ├── feature_extraction1.py │ ├── feature_extraction1_wo_google.py │ ├── feature_sets │ ├── first_part_1000.csv │ ├── first_part_1001.csv │ ├── first_part_2000.csv │ ├── first_part_3000.csv │ ├── first_part_3010.csv │ ├── first_part_3020.csv │ ├── readme.txt │ ├── second_part_1000.csv │ ├── second_part_2000.csv │ └── second_part_3000.csv │ ├── generate_ensemble_output_from_models.py │ ├── generate_feature_importances.py │ ├── generate_model_wo_google.py │ ├── generate_models.py │ ├── google_dict.py │ ├── grams_and_terms_features.py │ ├── homedepot_functions.py │ ├── model_selecting.py │ ├── models_ensemble │ └── log_2016-04-21.txt │ ├── processing_text │ ├── automatically_generated_word_corrections.csv │ ├── brand_statistics.csv │ └── material_statistics.csv │ ├── text_processing.py │ ├── text_processing_wo_google.py │ ├── tfidf_by_st_features.py │ ├── word2vec.py │ └── word2vec_without_google_dict.py ├── Data ├── dict │ ├── color_data.py │ └── word_replacer.csv └── split │ ├── splits_level1.pkl │ ├── splits_level2.pkl │ └── splits_level3.pkl ├── Doc ├── Kaggle_HomeDepot_Turing_Test.pdf ├── Kaggle_HomeDepot_Turing_Test.tex ├── reference.bib └── reference2.bib ├── Fig ├── CV_LB_Chenglong.pdf ├── FlowChart.jpg ├── FlowChart.pptx ├── actual_product_uid.pdf ├── actual_search_term.pdf ├── feature_corr_Chenglong.pdf ├── feature_importances_Igor.pdf ├── naive_product_uid.pdf ├── naive_search_term.pdf ├── plot_ensembles_means.pdf ├── plot_ensembles_performance.pdf ├── plot_feature_importances_benchmark.pdf ├── 
plot_feature_importances_simplified_model.pdf ├── plot_full_query_in_title.pdf ├── plot_high_vs_low_relevance.pdf ├── plot_query_with.pdf ├── plot_replaced_with_Google.pdf ├── proposed_product_uid.pdf └── proposed_search_term.pdf ├── LICENSE ├── Log ├── README.md ├── [Feat@level2_meta_linear_201605030922]_[Learner@reg_ensemble]_hyperopt_2016-05-07-18-42.log ├── feature │ ├── data_processor_2016-05-08-00-36.log │ ├── generate_feature_basic_2016-05-08-01-43.log │ ├── generate_feature_char_dist_sim_2016-05-08-12-02.log │ ├── generate_feature_doc2vec_2016-05-08-12-56.log │ ├── generate_feature_edit_distance_2016-05-08-13-03.log │ ├── generate_feature_first_last_ngram_count_2016-05-08-13-03.log │ ├── generate_feature_group_relevance_2016-05-08-01-47.log │ ├── generate_feature_intersect_count_2016-05-08-13-03.log │ ├── generate_feature_intersect_position_2016-05-08-01-43.log │ ├── generate_feature_lsa_ngram_cosinesim_2016-05-08-13-38.log │ ├── generate_feature_ngram_jaccard_2016-05-08-01-43.log │ ├── generate_feature_query_quality_2016-05-08-13-05.log │ ├── generate_feature_stat_cooc_tfidf_bm25_2016-05-08-13-04.log │ ├── generate_feature_stat_cooc_tfidf_tf_2016-05-08-13-03.log │ ├── generate_feature_stat_cooc_tfidf_tfidf_2016-05-08-13-03.log │ ├── generate_feature_tfidf_ngram_cosinesim_2016-05-08-12-12.log │ ├── generate_feature_word2vec_google_2016-05-08-12-56.log │ ├── generate_feature_word2vec_homedepot_2016-05-08-12-56.log │ ├── generate_feature_word2vec_wikipedia_2016-05-08-12-56.log │ └── generate_feature_wordnet_similarity_2016-05-08-01-43.log ├── feature_combiner_level2_meta_linear_201605030922_2016-05-03-09-23.log └── level1_models │ ├── [Feat@basic20160313]_[Learner@reg_skl_adaboost]_hyperopt_2016-03-13-12-28.log │ ├── [Feat@basic20160313]_[Learner@reg_skl_gbm]_hyperopt_2016-03-13-12-27.log │ ├── [Feat@basic20160313]_[Learner@reg_skl_lasso]_hyperopt_2016-03-13-11-19.log │ ├── [Feat@basic20160313]_[Learner@reg_skl_lsvr]_hyperopt_2016-03-13-11-31.log │ ├── [Feat@basic20160313]_[Learner@reg_skl_ridge]_hyperopt_2016-03-13-11-18.log │ ├── [Feat@basic20160313]_[Learner@reg_xgb_tree]_hyperopt_2016-03-14-09-48.log │ ├── [Feat@basic_linear_201604172250]_[Learner@reg_keras_dnn]_hyperopt_2016-04-20-20-10.log │ ├── [Feat@basic_linear_201604172250]_[Learner@reg_skl_lasso]_hyperopt_2016-04-18-19-53.log │ ├── [Feat@basic_linear_201604172250]_[Learner@reg_skl_lsvr]_hyperopt_2016-04-19-06-28.log │ ├── [Feat@basic_linear_201604172250]_[Learner@reg_skl_ridge]_hyperopt_2016-04-17-23-09.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_keras_dnn]_hyperopt_2016-05-01-01-43.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_hyperopt_2016-05-01-23-31.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_lasso]_hyperopt_2016-05-01-22-31.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_lsvr]_hyperopt_2016-05-01-02-16.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_ridge]_hyperopt_2016-05-01-01-05.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_svr]_hyperopt_2016-05-01-22-45.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_xgb_linear]_hyperopt_2016-05-02-00-18.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_rgf]_hyperopt_2016-04-17-18-41.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_skl_adaboost]_hyperopt_2016-04-23-10-48.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_skl_etr]_hyperopt_2016-04-23-10-48.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_skl_gbm]_hyperopt_2016-04-17-22-18.log │ ├── 
[Feat@basic_nonlinear_201604170111]_[Learner@reg_skl_rf]_hyperopt_2016-04-23-10-48.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_xgb_tree]_hyperopt_2016-04-17-01-12.log │ ├── [Feat@basic_nonlinear_201604210409]_[Learner@reg_rgf]_hyperopt_2016-04-21-04-54.log │ ├── [Feat@basic_nonlinear_201604210409]_[Learner@reg_skl_adaboost]_hyperopt_2016-04-23-10-37.log │ ├── [Feat@basic_nonlinear_201604210409]_[Learner@reg_skl_gbm]_hyperopt_2016-04-21-04-34.log │ ├── [Feat@basic_nonlinear_201604210409]_[Learner@reg_xgb_tree]_hyperopt_2016-04-21-04-11.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_rgf]_hyperopt_2016-05-01-02-30.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_adaboost]_hyperopt_2016-05-01-02-27.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_etr]_hyperopt_2016-05-01-01-45.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_hyperopt_2016-05-01-02-16.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_rf]_hyperopt_2016-05-01-02-22.log │ └── [Feat@basic_nonlinear_201605010058]_[Learner@reg_xgb_tree]_hyperopt_2016-05-01-00-59.log ├── Output └── Subm │ ├── README.md │ ├── reproduced_blend_0.438_0.436CV.csv │ ├── submission_kostia + igor final_ensemble (1 to 3 weights).csv │ ├── test.pred.[Feat@basic_nonlinear_201604210409]_[Learner@reg_xgb_tree]_[Id@84].[Mean0.438318]_[Std0.000786].csv │ └── test.pred.[Feat@level2_meta_linear_201605030922]_[Learner@reg_ensemble]_[Id@1].[Mean0.436087]_[Std0.001027].csv └── README.md /Code/Chenglong/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Code/Chenglong/__init__.py -------------------------------------------------------------------------------- /Code/Chenglong/conf/README.md: -------------------------------------------------------------------------------- 1 | This folder contains feature confs used to generate the feature matrix (input) for Chenglong's models. 2 | 3 | They are used as follows (executed in the `./Code/Chenglong` directory): 4 | `python feature_combiner.py -l 1 -c feature_conf_xxx -n basic_xxx -t 0.05` 5 | 6 | Please see `feature_combiner.py` for usage details. -------------------------------------------------------------------------------- /Code/Chenglong/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: config for Homedepot project 5 | 6 | """ 7 | 8 | import os 9 | import platform 10 | 11 | import numpy as np 12 | from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS 13 | 14 | from utils import os_utils 15 | 16 | 17 | # ---------------------- Overall ----------------------- 18 | TASK = "all" 19 | # # for testing data processing and feature generation 20 | # TASK = "sample" 21 | SAMPLE_SIZE = 1000 22 | 23 | # ------------------------ PATH ------------------------ 24 | ROOT_DIR = "../.."
25 | 26 | DATA_DIR = "%s/Data"%ROOT_DIR 27 | CLEAN_DATA_DIR = "%s/Clean"%DATA_DIR 28 | 29 | FEAT_DIR = "%s/Feat"%ROOT_DIR 30 | FEAT_FILE_SUFFIX = ".pkl" 31 | FEAT_CONF_DIR = "./conf" 32 | 33 | OUTPUT_DIR = "%s/Output"%ROOT_DIR 34 | SUBM_DIR = "%s/Subm"%OUTPUT_DIR 35 | 36 | LOG_DIR = "%s/Log"%ROOT_DIR 37 | FIG_DIR = "%s/Fig"%ROOT_DIR 38 | TMP_DIR = "%s/Tmp"%ROOT_DIR 39 | THIRDPARTY_DIR = "%s/Thirdparty"%ROOT_DIR 40 | 41 | # word2vec/doc2vec/glove 42 | WORD2VEC_MODEL_DIR = "%s/word2vec"%DATA_DIR 43 | GLOVE_WORD2VEC_MODEL_DIR = "%s/glove/gensim"%DATA_DIR 44 | DOC2VEC_MODEL_DIR = "%s/doc2vec"%DATA_DIR 45 | 46 | # index split 47 | SPLIT_DIR = "%s/split"%DATA_DIR 48 | 49 | # dictionary 50 | WORD_REPLACER_DATA = "%s/dict/word_replacer.csv"%DATA_DIR 51 | 52 | # colors 53 | COLOR_DATA = "%s/dict/color_data.py"%DATA_DIR 54 | 55 | # ------------------------ DATA ------------------------ 56 | # provided data 57 | TRAIN_DATA = "%s/train.csv"%DATA_DIR 58 | TEST_DATA = "%s/test.csv"%DATA_DIR 59 | ATTR_DATA = "%s/attributes.csv"%DATA_DIR 60 | DESC_DATA = "%s/product_descriptions.csv"%DATA_DIR 61 | SAMPLE_DATA = "%s/sample_submission.csv"%DATA_DIR 62 | 63 | ALL_DATA_RAW = "%s/all.raw.csv.pkl"%CLEAN_DATA_DIR 64 | ALL_DATA_LEMMATIZED = "%s/all.lemmatized.csv.pkl"%CLEAN_DATA_DIR 65 | ALL_DATA_LEMMATIZED_STEMMED = "%s/all.lemmatized.stemmed.csv.pkl"%CLEAN_DATA_DIR 66 | INFO_DATA = "%s/info.csv.pkl"%CLEAN_DATA_DIR 67 | 68 | # size 69 | TRAIN_SIZE = 74067 70 | if TASK == "sample": 71 | TRAIN_SIZE = SAMPLE_SIZE 72 | TEST_SIZE = 166693 73 | VALID_SIZE_MAX = 60000 # 0.7 * TRAIN_SIZE 74 | 75 | TRAIN_MEAN = 2.381634 76 | TRAIN_VAR = 0.285135 77 | 78 | TEST_MEAN = TRAIN_MEAN 79 | TEST_VAR = TRAIN_VAR 80 | 81 | MEAN_STD_DICT = { 82 | 1.00: 0.000, # Common: [1, 1, 1] 83 | 1.25: 0.433, # Rare: [1,1,1,2] 84 | 1.33: 0.471, # Common: [1, 1, 2] 85 | 1.50: 0.866, # Rare: [1, 1, 1, 3] 86 | 1.67: 0.471, # Common: [1, 2, 2] 87 | 1.75: 0.829, # Rare: [1, 1, 2, 3] 88 | 2.00: 0.000, # Common: [2, 2, 2], [1, 2, 3] 89 | 2.25: 0.829, # Rare: [1,2,3,3] 90 | 2.33: 0.471, # Common: [2, 2, 3] 91 | 2.50: 0.500, # Rare: [2,2,3,3] 92 | 2.67: 0.471, # Common: [2, 3, 3] 93 | 2.75: 0.433, # Rare: [2,3,3,3] 94 | 3.00: 0.000, # Common: [3, 3, 3] 95 | } 96 | 97 | # ------------------------ PARAM ------------------------ 98 | # attribute name and value SEPARATOR 99 | ATTR_SEPARATOR = " | " 100 | 101 | # cv 102 | N_RUNS = 5 103 | N_FOLDS = 1 104 | 105 | # intersect count/match 106 | STR_MATCH_THRESHOLD = 0.85 107 | 108 | # correct query with google spelling check dict 109 | # turn this on/off to have two versions of features/models 110 | # which is useful for ensembling 111 | GOOGLE_CORRECTING_QUERY = True 112 | 113 | # auto correcting query (quite time consuming; not used in final submission) 114 | AUTO_CORRECTING_QUERY = False 115 | 116 | # query expansion (not used in final submission) 117 | QUERY_EXPANSION = False 118 | 119 | # bm25 120 | BM25_K1 = 1.6 121 | BM25_B = 0.75 122 | 123 | # svd 124 | SVD_DIM = 100 125 | SVD_N_ITER = 5 126 | 127 | # xgboost 128 | # mean of relevance in training set 129 | BASE_SCORE = TRAIN_MEAN 130 | 131 | # word2vec/doc2vec 132 | EMBEDDING_ALPHA = 0.025 133 | EMBEDDING_LEARNING_RATE_DECAY = 0.5 134 | EMBEDDING_N_EPOCH = 5 135 | EMBEDDING_MIN_COUNT = 3 136 | EMBEDDING_DIM = 100 137 | EMBEDDING_WINDOW = 5 138 | EMBEDDING_WORKERS = 6 139 | 140 | # count transformer 141 | COUNT_TRANSFORM = np.log1p 142 | 143 | # missing value 144 | MISSING_VALUE_STRING = "MISSINGVALUE" 145 | MISSING_VALUE_NUMERIC = -1. 
146 | 147 | # stop words 148 | STOP_WORDS = set(ENGLISH_STOP_WORDS) 149 | 150 | # ------------------------ OTHER ------------------------ 151 | RANDOM_SEED = 2016 152 | PLATFORM = platform.system() 153 | NUM_CORES = 4 if PLATFORM == "Windows" else 14 154 | 155 | DATA_PROCESSOR_N_JOBS = 4 if PLATFORM == "Windows" else 6 156 | AUTO_SPELLING_CHECKER_N_JOBS = 4 if PLATFORM == "Windows" else 8 157 | # multi processing is not faster 158 | AUTO_SPELLING_CHECKER_N_JOBS = 1 159 | 160 | ## rgf 161 | RGF_CALL_EXE = "%s/rgf1.2/test/call_exe.pl"%THIRDPARTY_DIR 162 | RGF_EXTENSION = ".exe" if PLATFORM == "Windows" else "" 163 | RGF_EXE = "%s/rgf1.2/bin/rgf%s"%(THIRDPARTY_DIR, RGF_EXTENSION) 164 | 165 | 166 | # ---------------------- CREATE PATH -------------------- 167 | DIRS = [] 168 | DIRS += [CLEAN_DATA_DIR] 169 | DIRS += [SPLIT_DIR] 170 | DIRS += [FEAT_DIR, FEAT_CONF_DIR] 171 | DIRS += ["%s/All"%FEAT_DIR] 172 | DIRS += ["%s/Run%d"%(FEAT_DIR,i+1) for i in range(N_RUNS)] 173 | DIRS += ["%s/Combine"%FEAT_DIR] 174 | DIRS += [OUTPUT_DIR, SUBM_DIR] 175 | DIRS += ["%s/All"%OUTPUT_DIR] 176 | DIRS += ["%s/Run%d"%(OUTPUT_DIR,i+1) for i in range(N_RUNS)] 177 | DIRS += [LOG_DIR, FIG_DIR, TMP_DIR] 178 | DIRS += [WORD2VEC_MODEL_DIR, DOC2VEC_MODEL_DIR, GLOVE_WORD2VEC_MODEL_DIR] 179 | 180 | os_utils._create_dirs(DIRS) 181 | -------------------------------------------------------------------------------- /Code/Chenglong/convert_csv_tsne_to_pkl_tsne.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: convert .csv format TSNE features to .pkl format 5 | 6 | """ 7 | 8 | import os 9 | 10 | import pandas as pd 11 | 12 | import config 13 | from utils import pkl_utils 14 | 15 | 16 | def main(): 17 | fnames = [ 18 | "TSNE_LSA100_Word_Unigram_Pair_search_term_x_product_title_100D", 19 | "TSNE_LSA100_Word_Bigram_Pair_search_term_x_product_title_100D", 20 | "TSNE_LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D", 21 | "TSNE_LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D", 22 | ] 23 | 24 | fnames = [os.path.join(config.FEAT_DIR, fname+".csv") for fname in fnames] 25 | 26 | for fname in fnames: 27 | df = pd.read_csv(fname, index=False) 28 | f = df.values 29 | pkl_utils._save(fname[:-4]+".pkl", f) 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /Code/Chenglong/convert_pkl_lsa_to_csv_lsa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: convert .pkl format LSA features to .csv format for using Rtsne package in R 5 | 6 | """ 7 | 8 | import os 9 | 10 | import pandas as pd 11 | 12 | import config 13 | from utils import pkl_utils 14 | 15 | 16 | def main(): 17 | fnames = [ 18 | "LSA100_Word_Unigram_Pair_search_term_x_product_title_100D", 19 | "LSA100_Word_Bigram_Pair_search_term_x_product_title_100D", 20 | "LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D", 21 | "LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D", 22 | ] 23 | 24 | fnames = [os.path.join(config.FEAT_DIR, fname+".pkl") for fname in fnames] 25 | 26 | for fname in fnames: 27 | f = pkl_utils._load(fname) 28 | columns = ["LSA%d"%(i+1) for i in range(f.shape[1])] 29 | pd.DataFrame(f, columns=columns).to_csv(fname[:-4]+".csv", index=False) 30 | 31 | 32 | if 
__name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /Code/Chenglong/data_preparer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate raw dataframe data 5 | 6 | """ 7 | 8 | import gc 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | import config 14 | from utils import pkl_utils 15 | 16 | 17 | def main(): 18 | # load provided data 19 | dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1") 20 | dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1") 21 | dfAttr = pd.read_csv(config.ATTR_DATA) 22 | dfDesc = pd.read_csv(config.DESC_DATA) 23 | 24 | # 25 | print("Train Mean: %.6f"%np.mean(dfTrain["relevance"])) 26 | print("Train Var: %.6f"%np.var(dfTrain["relevance"])) 27 | 28 | # 29 | dfTest["relevance"] = np.zeros((config.TEST_SIZE)) 30 | dfAttr.dropna(how="all", inplace=True) 31 | dfAttr["value"] = dfAttr["value"].astype(str) 32 | 33 | # concat train and test 34 | dfAll = pd.concat((dfTrain, dfTest), ignore_index=True) 35 | del dfTrain 36 | del dfTest 37 | gc.collect() 38 | 39 | # merge product description 40 | dfAll = pd.merge(dfAll, dfDesc, on="product_uid", how="left") 41 | dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True) 42 | del dfDesc 43 | gc.collect() 44 | 45 | # merge product brand 46 | dfBrand = dfAttr[dfAttr.name=="MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "product_brand"}) 47 | dfAll = pd.merge(dfAll, dfBrand, on="product_uid", how="left") 48 | dfBrand["product_brand"] = dfBrand["product_brand"].values.astype(str) 49 | dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True) 50 | del dfBrand 51 | gc.collect() 52 | 53 | # merge product color 54 | color_columns = ["product_color", "Color Family", "Color/Finish", "Color/Finish Family"] 55 | dfColor = dfAttr[dfAttr.name.isin(color_columns)][["product_uid", "value"]].rename(columns={"value": "product_color"}) 56 | dfColor.dropna(how="all", inplace=True) 57 | _agg_color = lambda df: " ".join(list(set(df["product_color"]))) 58 | dfColor = dfColor.groupby("product_uid").apply(_agg_color) 59 | dfColor = dfColor.reset_index(name="product_color") 60 | dfColor["product_color"] = dfColor["product_color"].values.astype(str) 61 | dfAll = pd.merge(dfAll, dfColor, on="product_uid", how="left") 62 | dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True) 63 | del dfColor 64 | gc.collect() 65 | 66 | # merge product attribute 67 | _agg_attr = lambda df: config.ATTR_SEPARATOR.join(df["name"] + config.ATTR_SEPARATOR + df["value"]) 68 | dfAttr = dfAttr.groupby("product_uid").apply(_agg_attr) 69 | dfAttr = dfAttr.reset_index(name="product_attribute_concat") 70 | dfAll = pd.merge(dfAll, dfAttr, on="product_uid", how="left") 71 | dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True) 72 | del dfAttr 73 | gc.collect() 74 | 75 | # save data 76 | if config.TASK == "sample": 77 | dfAll = dfAll.iloc[:config.SAMPLE_SIZE].copy() 78 | pkl_utils._save(config.ALL_DATA_RAW, dfAll) 79 | 80 | # info 81 | dfInfo = dfAll[["id","relevance"]].copy() 82 | pkl_utils._save(config.INFO_DATA, dfInfo) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /Code/Chenglong/embedding_trainer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: 
word2vec & doc2vec trainer 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | import pandas as pd 12 | from gensim.models import Word2Vec, Doc2Vec 13 | from gensim.models.doc2vec import LabeledSentence 14 | 15 | import config 16 | from utils import nlp_utils 17 | from utils import logging_utils, pkl_utils, time_utils 18 | 19 | 20 | # tune the token pattern to get a better correlation with y_train 21 | # token_pattern = r"(?u)\b\w\w+\b" 22 | # token_pattern = r"\w{1,}" 23 | # token_pattern = r"\w+" 24 | # token_pattern = r"[\w']+" 25 | token_pattern = " " # just split the text into tokens 26 | 27 | 28 | #---------------------- Word2Vec ---------------------- 29 | class DataFrameSentences(object): 30 | def __init__(self, df, columns): 31 | self.df = df 32 | self.columns = columns 33 | 34 | def __iter__(self): 35 | for column in self.columns: 36 | for sentence in self.df[column]: 37 | tokens = nlp_utils._tokenize(sentence, token_pattern) 38 | yield tokens 39 | 40 | 41 | class DataFrameWord2Vec: 42 | def __init__(self, df, columns, model_param): 43 | self.df = df 44 | self.columns = columns 45 | self.model_param = model_param 46 | self.model = Word2Vec(sg=self.model_param["sg"], 47 | hs=self.model_param["hs"], 48 | alpha=self.model_param["alpha"], 49 | min_alpha=self.model_param["alpha"], 50 | min_count=self.model_param["min_count"], 51 | size=self.model_param["size"], 52 | sample=self.model_param["sample"], 53 | window=self.model_param["window"], 54 | workers=self.model_param["workers"]) 55 | 56 | def train(self): 57 | # build vocabulary 58 | self.sentences = DataFrameSentences(self.df, self.columns) 59 | self.model.build_vocab(self.sentences) 60 | # train for n_epoch 61 | for i in range(self.model_param["n_epoch"]): 62 | self.sentences = DataFrameSentences(self.df, self.columns) 63 | self.model.train(self.sentences) 64 | self.model.alpha *= self.model_param["learning_rate_decay"] 65 | self.model.min_alpha = self.model.alpha 66 | return self 67 | 68 | def save(self, model_dir, model_name): 69 | fname = os.path.join(model_dir, model_name) 70 | self.model.save(fname) 71 | 72 | 73 | def train_word2vec_model(df, columns): 74 | model_param = { 75 | "alpha": config.EMBEDDING_ALPHA, 76 | "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY, 77 | "n_epoch": config.EMBEDDING_N_EPOCH, 78 | "sg": 1, 79 | "hs": 1, 80 | "min_count": config.EMBEDDING_MIN_COUNT, 81 | "size": config.EMBEDDING_DIM, 82 | "sample": 0.001, 83 | "window": config.EMBEDDING_WINDOW, 84 | "workers": config.EMBEDDING_WORKERS, 85 | } 86 | model_dir = config.WORD2VEC_MODEL_DIR 87 | model_name = "Homedepot-word2vec-D%d-min_count%d.model"%( 88 | model_param["size"], model_param["min_count"]) 89 | 90 | word2vec = DataFrameWord2Vec(df, columns, model_param) 91 | word2vec.train() 92 | word2vec.save(model_dir, model_name) 93 | 94 | 95 | #---------------------- Doc2Vec ---------------------- 96 | class DataFrameLabelSentences(object): 97 | def __init__(self, df, columns): 98 | self.df = df 99 | self.columns = columns 100 | self.cnt = -1 101 | self.sent_label = {} 102 | 103 | def __iter__(self): 104 | for column in self.columns: 105 | for sentence in self.df[column]: 106 | if not sentence in self.sent_label: 107 | self.cnt += 1 108 | self.sent_label[sentence] = "SENT_%d"%self.cnt 109 | tokens = nlp_utils._tokenize(sentence, token_pattern) 110 | yield LabeledSentence(words=tokens, tags=[self.sent_label[sentence]]) 111 | 112 | 113 | class DataFrameDoc2Vec(DataFrameWord2Vec): 114 | def __init__(self, df, columns, model_param): 115 | 
super().__init__(df, columns, model_param) 116 | self.model = Doc2Vec(dm=self.model_param["dm"], 117 | hs=self.model_param["hs"], 118 | alpha=self.model_param["alpha"], 119 | min_alpha=self.model_param["alpha"], 120 | min_count=self.model_param["min_count"], 121 | size=self.model_param["size"], 122 | sample=self.model_param["sample"], 123 | window=self.model_param["window"], 124 | workers=self.model_param["workers"]) 125 | def train(self): 126 | # build vocabulary 127 | self.sentences = DataFrameLabelSentences(self.df, self.columns) 128 | self.model.build_vocab(self.sentences) 129 | # train for n_epoch 130 | for i in range(self.model_param["n_epoch"]): 131 | self.sentences = DataFrameLabelSentences(self.df, self.columns) 132 | self.model.train(self.sentences) 133 | self.model.alpha *= self.model_param["learning_rate_decay"] 134 | self.model.min_alpha = self.model.alpha 135 | return self 136 | 137 | def save(self, model_dir, model_name): 138 | fname = os.path.join(model_dir, model_name) 139 | self.model.save(fname) 140 | pkl_utils._save("%s.sent_label"%fname, self.sentences.sent_label) 141 | 142 | 143 | def train_doc2vec_model(df, columns): 144 | model_param = { 145 | "alpha": config.EMBEDDING_ALPHA, 146 | "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY, 147 | "n_epoch": config.EMBEDDING_N_EPOCH, 148 | "sg": 1, # not use 149 | "dm": 1, 150 | "hs": 1, 151 | "min_count": config.EMBEDDING_MIN_COUNT, 152 | "size": config.EMBEDDING_DIM, 153 | "sample": 0.001, 154 | "window": config.EMBEDDING_WINDOW, 155 | "workers": config.EMBEDDING_WORKERS, 156 | } 157 | model_dir = config.DOC2VEC_MODEL_DIR 158 | model_name = "Homedepot-doc2vec-D%d-min_count%d.model"%( 159 | model_param["size"], model_param["min_count"]) 160 | 161 | doc2vec = DataFrameDoc2Vec(df, columns, model_param) 162 | doc2vec.train() 163 | doc2vec.save(model_dir, model_name) 164 | 165 | 166 | #---------------------- Main ---------------------- 167 | if __name__ == "__main__": 168 | df = pkl_utils._load(config.ALL_DATA_LEMMATIZED) 169 | columns = ["search_term", "search_term_alt", "product_title", "product_description", 170 | "product_attribute", "product_brand", "product_color"] 171 | columns = [col for col in columns if col in df.columns] 172 | 173 | if len(sys.argv) >= 2: 174 | for w in sys.argv[1].split(","): 175 | if w == "word2vec": 176 | train_word2vec_model(df, columns) 177 | elif w == "doc2vec": 178 | train_doc2vec_model(df, columns) 179 | else: 180 | print("Skip: %s"%w) 181 | continue 182 | else: 183 | train_doc2vec_model(df, columns) 184 | train_word2vec_model(df, columns) 185 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_doc2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: doc2vec based features 5 | 6 | """ 7 | 8 | import gensim 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.metrics.pairwise import cosine_similarity 12 | 13 | import config 14 | from utils import dist_utils, ngram_utils, nlp_utils 15 | from utils import logging_utils, time_utils, pkl_utils 16 | from feature_base import BaseEstimator, StandaloneFeatureWrapper, PairwiseFeatureWrapper 17 | 18 | 19 | class Doc2Vec_BaseEstimator(BaseEstimator): 20 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 21 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 22 | self.model = doc2vec_model 23 | 
self.sent_label = sent_label 24 | self.model_prefix = model_prefix 25 | self.vector_size = doc2vec_model.vector_size 26 | 27 | def _get_vector(self, sent): 28 | try: 29 | vect = self.model.docvecs[self.sent_label[sent]] 30 | except: 31 | vect = np.zeros(self.vector_size, dtype=float) 32 | return vect 33 | 34 | def _get_cosine_sim(self, sent1, sent2): 35 | vect1 = self._get_vector(sent1) 36 | vect2 = self._get_vector(sent2) 37 | return dist_utils._cosine_sim(vect1, vect2) 38 | 39 | def _get_vdiff(self, sent1, sent2): 40 | vect1 = self._get_vector(sent1) 41 | vect2 = self._get_vector(sent2) 42 | return dist_utils._vdiff(vect1, vect2) 43 | 44 | def _get_rmse(self, sent1, sent2): 45 | vect1 = self._get_vector(sent1) 46 | vect2 = self._get_vector(sent2) 47 | return dist_utils._rmse(vect1, vect2) 48 | 49 | 50 | class Doc2Vec_Vector(Doc2Vec_BaseEstimator): 51 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 52 | super().__init__(obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode) 53 | 54 | def __name__(self): 55 | return "Doc2Vec_%s_D%d_Vector"%(self.model_prefix, self.vector_size) 56 | 57 | def transform_one(self, obs, target, id): 58 | return self._get_vector(obs) 59 | 60 | 61 | class Doc2Vec_Vdiff(Doc2Vec_BaseEstimator): 62 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 63 | super().__init__(obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode) 64 | 65 | def __name__(self): 66 | return "Doc2Vec_%s_D%d_Vdiff"%(self.model_prefix, self.vector_size) 67 | 68 | def transform_one(self, obs, target, id): 69 | return self._get_vdiff(obs, target) 70 | 71 | 72 | class Doc2Vec_CosineSim(Doc2Vec_BaseEstimator): 73 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 74 | super().__init__(obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode) 75 | 76 | def __name__(self): 77 | return "Doc2Vec_%s_D%d_CosineSim"%(self.model_prefix, self.vector_size) 78 | 79 | def transform_one(self, obs, target, id): 80 | return self._get_cosine_sim(obs, target) 81 | 82 | 83 | class Doc2Vec_RMSE(Doc2Vec_BaseEstimator): 84 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 85 | super().__init__(obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode) 86 | 87 | def __name__(self): 88 | return "Doc2Vec_%s_D%d_RMSE"%(self.model_prefix, self.vector_size) 89 | 90 | def transform_one(self, obs, target, id): 91 | return self._get_rmse(obs, target) 92 | 93 | 94 | # -------------------------------- Main ---------------------------------- 95 | def main(): 96 | logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp() 97 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 98 | #### NOTE: use data BEFORE STEMMING 99 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) 100 | 101 | doc2vec_model_dirs = [] 102 | model_prefixes = [] 103 | ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description 104 | doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) ) 105 | model_prefixes.append( "Homedepot" ) 106 | for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes): 107 | ## load model 108 | try: 109 | if ".bin" in 
doc2vec_model_dir: 110 | doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True) 111 | if ".txt" in doc2vec_model_dir: 112 | doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False) 113 | else: 114 | doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir) 115 | doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label") 116 | except: 117 | continue 118 | 119 | # ## standalone (not used in model building) 120 | # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"] 121 | # generator = Doc2Vec_Vector 122 | # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] 123 | # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) 124 | # sf.go() 125 | 126 | ## pairwise 127 | generators = [ 128 | Doc2Vec_CosineSim, 129 | Doc2Vec_RMSE, 130 | # Doc2Vec_Vdiff, 131 | ] 132 | obs_fields_list = [] 133 | target_fields_list = [] 134 | obs_fields_list.append( ["search_term", "search_term_alt"][:1] ) 135 | target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] ) 136 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 137 | for generator in generators: 138 | param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] 139 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 140 | pf.go() 141 | 142 | 143 | if __name__ == "__main__": 144 | main() 145 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_group_distance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: group relevance based distance features 5 | @note: such features are not used in final submission 6 | 7 | """ 8 | 9 | import re 10 | import string 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | import config 16 | from config import TRAIN_SIZE 17 | from utils import dist_utils, ngram_utils, nlp_utils 18 | from utils import logging_utils, pkl_utils, time_utils 19 | from feature_base import BaseEstimator, StandaloneFeatureWrapper, PairwiseFeatureWrapper 20 | 21 | 22 | # tune the token pattern to get a better correlation with y_train 23 | # token_pattern = r"(?u)\b\w\w+\b" 24 | # token_pattern = r"\w{1,}" 25 | # token_pattern = r"\w+" 26 | # token_pattern = r"[\w']+" 27 | token_pattern = " " # just split the text into tokens 28 | 29 | 30 | # -------------------- Group by (obs, relevance) based distance features ----------------------------------- # 31 | # Something related to Query Expansion 32 | class GroupRelevance_Ngram_Jaccard(BaseEstimator): 33 | """Single aggregation features""" 34 | def __init__(self, obs_corpus, target_corpus, id_list, dfTrain, target_field, relevance, ngram, aggregation_mode=""): 35 | super().__init__(obs_corpus, target_corpus, aggregation_mode, id_list) 36 | self.dfTrain = dfTrain[dfTrain["relevance"] != 0].copy() 37 | self.target_field = target_field 38 | self.relevance = relevance 39 | self.relevance_str = self._relevance_to_str() 40 | self.ngram = ngram 41 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 42 | 43 | def __name__(self): 44 | if isinstance(self.aggregation_mode, str): 45 | feat_name = "Group_%sRelevance_%s_Jaccard_%s"%( 46 | self.relevance_str, self.ngram_str, 
string.capwords(self.aggregation_mode)) 47 | elif isinstance(self.aggregation_mode, list): 48 | feat_name = ["Group_%sRelevance_%s_Jaccard_%s"%( 49 | self.relevance_str, self.ngram_str, string.capwords(m)) for m in self.aggregation_mode] 50 | return feat_name 51 | 52 | def _relevance_to_str(self): 53 | if isinstance(self.relevance, float): 54 | return re.sub("\.", "d", str(self.relevance)) 55 | else: 56 | return str(self.relevance) 57 | 58 | def transform_one(self, obs, target, id): 59 | df = self.dfTrain[self.dfTrain["search_term"] == obs].copy() 60 | val_list = [config.MISSING_VALUE_NUMERIC] 61 | if df is not None: 62 | df = df[df["id"] != id].copy() 63 | df = df[df["relevance"] == self.relevance].copy() 64 | if df is not None and df.shape[0] > 0: 65 | target_tokens = nlp_utils._tokenize(target, token_pattern) 66 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 67 | val_list = [] 68 | for x in df[self.target_field]: 69 | x_tokens = nlp_utils._tokenize(x, token_pattern) 70 | x_ngrams = ngram_utils._ngrams(x_tokens, self.ngram) 71 | val_list.append(dist_utils._jaccard_coef(x_ngrams, target_ngrams)) 72 | return val_list 73 | 74 | 75 | # -------------------------------- Main ---------------------------------- 76 | def main(): 77 | logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp() 78 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 79 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 80 | dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() 81 | 82 | ## run python3 splitter.py first 83 | split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) 84 | n_iter = len(split) 85 | 86 | relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3] 87 | relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3] 88 | ngrams = [1] 89 | obs_fields = ["search_term"] 90 | target_fields = ["product_title", "product_description"] 91 | aggregation_mode = ["mean", "std", "max", "min", "median"] 92 | 93 | ## for cv 94 | for i in range(n_iter): 95 | trainInd, validInd = split[i][0], split[i][1] 96 | dfTrain2 = dfTrain.iloc[trainInd].copy() 97 | sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) 98 | 99 | for target_field in target_fields: 100 | for relevance in relevances: 101 | for ngram in ngrams: 102 | param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode] 103 | pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) 104 | pf.go() 105 | 106 | ## for all 107 | sub_feature_dir = "%s/All" % (config.FEAT_DIR) 108 | for target_field in target_fields: 109 | for relevance in relevances: 110 | for ngram in ngrams: 111 | param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode] 112 | pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) 113 | pf.go() 114 | 115 | 116 | if __name__ == "__main__": 117 | main() 118 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_group_relevance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: group based relevance features 5 | @note: such features are not used in final submission (except GroupRelevance_Size) 6 | 7 | """ 8 | 9 | import string 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import config 15 | from config 
import TRAIN_SIZE 16 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 17 | from utils import logging_utils, time_utils, pkl_utils 18 | from feature_base import BaseEstimator, StandaloneFeatureWrapper 19 | 20 | 21 | class GroupRelevance(BaseEstimator): 22 | """Single aggregation features""" 23 | def __init__(self, obs_corpus, target_corpus, id_list, dfTrain, aggregation_mode=""): 24 | super().__init__(obs_corpus, target_corpus, aggregation_mode, id_list) 25 | self.dfTrain = dfTrain[dfTrain["relevance"] != 0].copy() 26 | 27 | def __name__(self): 28 | if isinstance(self.aggregation_mode, str): 29 | feat_name = "GroupRelevance_%s"%string.capwords(self.aggregation_mode) 30 | elif isinstance(self.aggregation_mode, list): 31 | feat_name = ["GroupRelevance_%s"%string.capwords(m) for m in self.aggregation_mode] 32 | return feat_name 33 | 34 | def transform_one(self, obs, target, id): 35 | df = self.dfTrain[self.dfTrain["search_term"] == obs].copy() 36 | val_list = [config.MISSING_VALUE_NUMERIC] 37 | if df is not None: 38 | df = df[df["id"] != id].copy() 39 | if df is not None and df.shape[0] > 0: 40 | val_list = df["relevance"].values.tolist() 41 | return val_list 42 | 43 | 44 | # -------------------------------- Main ---------------------------------- 45 | def main(): 46 | logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp() 47 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 48 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 49 | dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() 50 | 51 | ## run python3 splitter.py first 52 | split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) 53 | n_iter = len(split) 54 | 55 | ## for cv 56 | for i in range(n_iter): 57 | trainInd, validInd = split[i][0], split[i][1] 58 | dfTrain2 = dfTrain.iloc[trainInd].copy() 59 | sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) 60 | 61 | obs_fields = ["search_term", "product_title"][1:] 62 | aggregation_mode = ["mean", "std", "max", "min", "median", "size"] 63 | param_list = [dfAll["id"], dfTrain2, aggregation_mode] 64 | sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) 65 | sf.go() 66 | 67 | ## for all 68 | sub_feature_dir = "%s/All" % (config.FEAT_DIR) 69 | obs_fields = ["search_term", "product_title"][1:] 70 | aggregation_mode = ["mean", "std", "max", "min", "median", "size"] 71 | param_list = [dfAll["id"], dfTrain, aggregation_mode] 72 | sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) 73 | sf.go() 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_intersect_count.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: intersect count features 5 | 6 | """ 7 | 8 | import re 9 | import string 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import config 15 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 16 | from utils import logging_utils, time_utils, pkl_utils 17 | from feature_base import BaseEstimator, PairwiseFeatureWrapper 18 | 19 | 20 | # tune the token pattern to get a better correlation with y_train 21 | # token_pattern = r"(?u)\b\w\w+\b" 22 | # token_pattern = r"\w{1,}" 23 | # token_pattern = r"\w+" 24 | # token_pattern = r"[\w']+" 25 | token_pattern = " " # just split the text into tokens 26 
| 27 | 28 | # ---------------------------------------------------------------------------- 29 | # How many ngrams of obs are in target? 30 | # Obs: [AB, AB, AB, AC, DE, CD] 31 | # Target: [AB, AC, AB, AD, ED] 32 | # -> 33 | # IntersectCount: 4 (i.e., AB, AB, AB, AC) 34 | # IntersectRatio: 4/6 35 | class IntersectCount_Ngram(BaseEstimator): 36 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode="", 37 | str_match_threshold=config.STR_MATCH_THRESHOLD): 38 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 39 | self.ngram = ngram 40 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 41 | self.str_match_threshold = str_match_threshold 42 | 43 | def __name__(self): 44 | return "IntersectCount_%s"%self.ngram_str 45 | 46 | def transform_one(self, obs, target, id): 47 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 48 | target_tokens = nlp_utils._tokenize(target, token_pattern) 49 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 50 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 51 | s = 0. 52 | for w1 in obs_ngrams: 53 | for w2 in target_ngrams: 54 | if dist_utils._is_str_match(w1, w2, self.str_match_threshold): 55 | s += 1. 56 | break 57 | return s 58 | 59 | 60 | class IntersectRatio_Ngram(BaseEstimator): 61 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode="", 62 | str_match_threshold=config.STR_MATCH_THRESHOLD): 63 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 64 | self.ngram = ngram 65 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 66 | self.str_match_threshold = str_match_threshold 67 | 68 | def __name__(self): 69 | return "IntersectRatio_%s"%self.ngram_str 70 | 71 | def transform_one(self, obs, target, id): 72 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 73 | target_tokens = nlp_utils._tokenize(target, token_pattern) 74 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 75 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 76 | s = 0. 77 | for w1 in obs_ngrams: 78 | for w2 in target_ngrams: 79 | if dist_utils._is_str_match(w1, w2, self.str_match_threshold): 80 | s += 1. 81 | break 82 | return np_utils._try_divide(s, len(obs_ngrams)) 83 | 84 | 85 | # ---------------------------------------------------------------------------- 86 | # How many cooccurrence ngrams between obs and target? 87 | # Obs: [AB, AB, AB, AC, DE, CD] 88 | # Target: [AB, AC, AB, AD, ED] 89 | # -> 90 | # CooccurrenceCount: 7 (i.e., AB x 2 + AB x 2 + AB x 2 + AC x 1) 91 | # CooccurrenceRatio: 7/(6 x 5) 92 | class CooccurrenceCount_Ngram(BaseEstimator): 93 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode="", str_match_threshold=config.STR_MATCH_THRESHOLD): 94 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 95 | self.ngram = ngram 96 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 97 | self.str_match_threshold = str_match_threshold 98 | 99 | def __name__(self): 100 | return "CooccurrenceCount_%s"%self.ngram_str 101 | 102 | def transform_one(self, obs, target, id): 103 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 104 | target_tokens = nlp_utils._tokenize(target, token_pattern) 105 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 106 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 107 | s = 0. 108 | for w1 in obs_ngrams: 109 | for w2 in target_ngrams: 110 | if dist_utils._is_str_match(w1, w2, self.str_match_threshold): 111 | s += 1. 
112 | return s 113 | 114 | 115 | class CooccurrenceRatio_Ngram(BaseEstimator): 116 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode="", str_match_threshold=config.STR_MATCH_THRESHOLD): 117 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 118 | self.ngram = ngram 119 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 120 | self.str_match_threshold = str_match_threshold 121 | 122 | def __name__(self): 123 | return "CooccurrenceRatio_%s"%self.ngram_str 124 | 125 | def transform_one(self, obs, target, id): 126 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 127 | target_tokens = nlp_utils._tokenize(target, token_pattern) 128 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 129 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 130 | s = 0. 131 | for w1 in obs_ngrams: 132 | for w2 in target_ngrams: 133 | if dist_utils._is_str_match(w1, w2, self.str_match_threshold): 134 | s += 1. 135 | return np_utils._try_divide(s, len(obs_ngrams)*len(target_ngrams)) 136 | 137 | 138 | # ---------------------------- Main -------------------------------------- 139 | def main(): 140 | logname = "generate_feature_intersect_count_%s.log"%time_utils._timestamp() 141 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 142 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 143 | 144 | generators = [ 145 | IntersectCount_Ngram, 146 | IntersectRatio_Ngram, 147 | CooccurrenceCount_Ngram, 148 | CooccurrenceRatio_Ngram, 149 | ] 150 | obs_fields_list = [] 151 | target_fields_list = [] 152 | ## query in document 153 | obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 154 | target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) 155 | ## document in query 156 | obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) 157 | target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 158 | ngrams = [1,2,3,12,123][:3] 159 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 160 | for generator in generators: 161 | for ngram in ngrams: 162 | param_list = [ngram] 163 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 164 | pf.go() 165 | 166 | 167 | if __name__ == "__main__": 168 | main() 169 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_intersect_position.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: intersect position features 5 | 6 | """ 7 | 8 | import re 9 | import string 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import config 15 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 16 | from utils import logging_utils, time_utils, pkl_utils 17 | from feature_base import BaseEstimator, PairwiseFeatureWrapper 18 | 19 | 20 | # tune the token pattern to get a better correlation with y_train 21 | # token_pattern = r"(?u)\b\w\w+\b" 22 | # token_pattern = r"\w{1,}" 23 | # token_pattern = r"\w+" 24 | # token_pattern = r"[\w']+" 25 | token_pattern = " " # just split the text into tokens 26 | 27 | 28 | def _inter_pos_list(obs, 
target): 29 | """ 30 | Get the list of positions of obs in target 31 | """ 32 | pos_list = [0] 33 | if len(obs) != 0: 34 | pos_list = [i for i,o in enumerate(obs, start=1) if o in target] 35 | if len(pos_list) == 0: 36 | pos_list = [0] 37 | return pos_list 38 | 39 | 40 | def _inter_norm_pos_list(obs, target): 41 | pos_list = _inter_pos_list(obs, target) 42 | N = len(obs) 43 | return [np_utils._try_divide(i, N) for i in pos_list] 44 | 45 | 46 | class IntersectPosition_Ngram(BaseEstimator): 47 | """Single aggregation features""" 48 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode=""): 49 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 50 | self.ngram = ngram 51 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 52 | 53 | def __name__(self): 54 | if isinstance(self.aggregation_mode, str): 55 | feat_name = "IntersectPosition_%s_%s"%( 56 | self.ngram_str, string.capwords(self.aggregation_mode)) 57 | elif isinstance(self.aggregation_mode, list): 58 | feat_name = ["IntersectPosition_%s_%s"%( 59 | self.ngram_str, string.capwords(m)) for m in self.aggregation_mode] 60 | return feat_name 61 | 62 | def transform_one(self, obs, target, id): 63 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 64 | target_tokens = nlp_utils._tokenize(target, token_pattern) 65 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 66 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 67 | pos_list = _inter_pos_list(obs_ngrams, target_ngrams) 68 | return pos_list 69 | 70 | 71 | class IntersectNormPosition_Ngram(BaseEstimator): 72 | """Single aggregation features""" 73 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode=""): 74 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 75 | self.ngram = ngram 76 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 77 | 78 | def __name__(self): 79 | if isinstance(self.aggregation_mode, str): 80 | feat_name = "IntersectNormPosition_%s_%s"%( 81 | self.ngram_str, string.capwords(self.aggregation_mode)) 82 | elif isinstance(self.aggregation_mode, list): 83 | feat_name = ["IntersectNormPosition_%s_%s"%( 84 | self.ngram_str, string.capwords(m)) for m in self.aggregation_mode] 85 | return feat_name 86 | 87 | def transform_one(self, obs, target, id): 88 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 89 | target_tokens = nlp_utils._tokenize(target, token_pattern) 90 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 91 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 92 | pos_list = _inter_norm_pos_list(obs_ngrams, target_ngrams) 93 | return pos_list 94 | 95 | 96 | # ---------------------------- Main -------------------------------------- 97 | def main(): 98 | logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp() 99 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 100 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 101 | 102 | generators = [ 103 | IntersectPosition_Ngram, 104 | IntersectNormPosition_Ngram, 105 | ] 106 | obs_fields_list = [] 107 | target_fields_list = [] 108 | ## query in document 109 | obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 110 | target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] ) 111 | ## document in query 112 | obs_fields_list.append( ["product_title", "product_title_product_name", 
"product_description", "product_attribute", "product_brand", "product_color"][1:2] ) 113 | target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 114 | ngrams = [1,2,3,12,123][:3] 115 | aggregation_mode = ["mean", "std", "max", "min", "median"] 116 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 117 | for generator in generators: 118 | for ngram in ngrams: 119 | param_list = [ngram, aggregation_mode] 120 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 121 | pf.go() 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_match.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: match based features 5 | 6 | """ 7 | 8 | import re 9 | import string 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import config 15 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 16 | from utils import logging_utils, time_utils, pkl_utils 17 | from feature_base import BaseEstimator, PairwiseFeatureWrapper 18 | 19 | 20 | class MatchQueryCount(BaseEstimator): 21 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 22 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 23 | 24 | def __name__(self): 25 | return "MatchQueryCount" 26 | 27 | def _str_whole_word(self, str1, str2, i_): 28 | cnt = 0 29 | if len(str1) > 0 and len(str2) > 0: 30 | try: 31 | while i_ < len(str2): 32 | i_ = str2.find(str1, i_) 33 | if i_ == -1: 34 | return cnt 35 | else: 36 | cnt += 1 37 | i_ += len(str1) 38 | except: 39 | pass 40 | return cnt 41 | 42 | def transform_one(self, obs, target, id): 43 | return self._str_whole_word(obs, target, 0) 44 | 45 | 46 | class MatchQueryRatio(MatchQueryCount): 47 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 48 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 49 | 50 | def __name__(self): 51 | return "MatchQueryRatio" 52 | 53 | def transform_one(self, obs, target, id): 54 | return np_utils._try_divide(super().transform_one(obs, target, id), len(target.split(" "))) 55 | 56 | 57 | #------------- Longest match features ------------------------------- 58 | class LongestMatchSize(BaseEstimator): 59 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 60 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 61 | 62 | def __name__(self): 63 | return "LongestMatchSize" 64 | 65 | def transform_one(self, obs, target, id): 66 | return dist_utils._longest_match_size(obs, target) 67 | 68 | 69 | class LongestMatchRatio(BaseEstimator): 70 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 71 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 72 | 73 | def __name__(self): 74 | return "LongestMatchRatio" 75 | 76 | def transform_one(self, obs, target, id): 77 | return dist_utils._longest_match_ratio(obs, target) 78 | 79 | 80 | # --------------------------- Attribute based features ------------------------- 81 | class MatchAttrCount(BaseEstimator): 82 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 83 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 84 | 85 | def __name__(self): 86 | return "MatchAttrCount" 87 | 88 | def _str_whole_word(self, str1, str2, 
i_): 89 | cnt = 0 90 | if len(str1) > 0 and len(str2) > 0: 91 | try: 92 | while i_ < len(str2): 93 | i_ = str2.find(str1, i_) 94 | if i_ == -1: 95 | return cnt 96 | else: 97 | cnt += 1 98 | i_ += len(str1) 99 | except: 100 | pass 101 | return cnt 102 | 103 | def transform_one(self, obs, target, id): 104 | cnt = 0 105 | for o in obs.split(" "): 106 | for t in target: 107 | if not t[0].startswith("bullet"): 108 | if self._str_whole_word(obs, t[0], 0): 109 | cnt += 1 110 | return cnt 111 | 112 | 113 | class MatchAttrRatio(MatchQueryCount): 114 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 115 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 116 | 117 | def __name__(self): 118 | return "MatchAttrRatio" 119 | 120 | def transform_one(self, obs, target, id): 121 | lo = len(obs.split(" ")) 122 | lt = len([t[0] for t in target if not t[0].startswith("bullet")]) 123 | return np_utils._try_divide(super().transform_one(obs, target, id), lo*lt) 124 | 125 | 126 | class IsIndoorOutdoorMatch(BaseEstimator): 127 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 128 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 129 | 130 | def __name__(self): 131 | return "IsIndoorOutdoorMatch" 132 | 133 | def transform_one(self, obs, target, id): 134 | os = [] 135 | if obs.find("indoor") != -1: 136 | os.append("indoor") 137 | if obs.find("outdoor") != -1: 138 | os.append("outdoor") 139 | 140 | cnt = 0 141 | for t in target: 142 | if t[0].find("indoor outdoor") != -1: 143 | cnt = 1 144 | ts = t[1].split(" ") 145 | for i in ts: 146 | if i in os: 147 | return 1 148 | if cnt == 0: 149 | return 0 150 | else: 151 | return -1 152 | 153 | 154 | # ---------------------------- Main -------------------------------------- 155 | def main(): 156 | logname = "generate_feature_match_%s.log"%time_utils._timestamp() 157 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 158 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 159 | 160 | generators = [ 161 | MatchQueryCount, 162 | MatchQueryRatio, 163 | LongestMatchSize, 164 | LongestMatchRatio, 165 | ] 166 | obs_fields_list = [] 167 | target_fields_list = [] 168 | obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 169 | target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) 170 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 171 | for generator in generators: 172 | param_list = [] 173 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 174 | pf.go() 175 | 176 | # product_attribute_list 177 | generators = [ 178 | MatchAttrCount, 179 | MatchAttrRatio, 180 | IsIndoorOutdoorMatch, 181 | ] 182 | obs_fields_list = [] 183 | target_fields_list = [] 184 | obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] ) 185 | target_fields_list.append( ["product_attribute_list"] ) 186 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 187 | for generator in generators: 188 | param_list = [] 189 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 190 | pf.go() 191 | 192 | 193 | if __name__ == "__main__": 194 | main() 195 | -------------------------------------------------------------------------------- 
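A quick worked example of the match features above: the snippet below is a minimal, stand-alone re-implementation of the whole-string counting used by `MatchQueryCount` (and the division by the target word count used by `MatchQueryRatio`). It is illustrative only, not part of the repository, and the sample query/title strings are made up; because the whole query string is searched, a query only scores when it occurs verbatim (after lemmatizing/stemming) inside the target field.

def str_whole_word(str1, str2, i_=0):
    # count non-overlapping occurrences of str1 inside str2, scanning from position i_
    cnt = 0
    if len(str1) > 0 and len(str2) > 0:
        while i_ < len(str2):
            i_ = str2.find(str1, i_)
            if i_ == -1:
                return cnt
            cnt += 1
            i_ += len(str1)
    return cnt

# made-up query/title pair
obs = "angle bracket"
target = "simpson strong tie angle bracket 2x4 angle bracket"
count = str_whole_word(obs, target)      # -> 2 whole-phrase matches
ratio = count / len(target.split(" "))   # MatchQueryRatio: 2 / 8 target words = 0.25
print(count, ratio)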
/Code/Chenglong/feature_query_quality.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: query quality based features 5 | 6 | """ 7 | 8 | import re 9 | import os 10 | import string 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | import config 16 | from config import TRAIN_SIZE 17 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 18 | from utils import logging_utils, time_utils, pkl_utils 19 | from feature_base import BaseEstimator, StandaloneFeatureWrapper 20 | import google_spelling_checker_dict 21 | 22 | 23 | class QueryQuality(BaseEstimator): 24 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 25 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 26 | 27 | def __name__(self): 28 | return "QueryQuality" 29 | 30 | def transform_one(self, obs, target, id): 31 | return dist_utils._edit_dist(obs, target) 32 | 33 | 34 | class IsInGoogleDict(BaseEstimator): 35 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 36 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 37 | 38 | def __name__(self): 39 | return "IsInGoogleDict" 40 | 41 | def transform_one(self, obs, target, id): 42 | if obs in google_spelling_checker_dict.spelling_checker_dict: 43 | return 1. 44 | else: 45 | return 0. 46 | 47 | 48 | # ---------------------------- Main -------------------------------------- 49 | def main(): 50 | logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp() 51 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 52 | 53 | obs_corpus = [] 54 | query_suffix = [] 55 | # raw 56 | dfAll = pkl_utils._load(config.ALL_DATA_RAW) 57 | obs_corpus.append(dfAll["search_term"].values) 58 | query_suffix.append("raw") 59 | # after processing 60 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) 61 | obs_corpus.append(dfAll["search_term"].values) 62 | query_suffix.append("lemmatized") 63 | # after extracting product_name in search_term 64 | obs_corpus.append(dfAll["search_term_product_name"].values) 65 | query_suffix.append("product_name") 66 | if "search_term_auto_corrected" in dfAll.columns: 67 | # after auto correction 68 | obs_corpus.append(dfAll["search_term_auto_corrected"].values) 69 | query_suffix.append("corrected") 70 | # after stemming 71 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 72 | obs_corpus.append(dfAll["search_term"].values) 73 | query_suffix.append("stemmed") 74 | 75 | y_train = dfAll["relevance"].values[:TRAIN_SIZE] 76 | for i in range(len(query_suffix)-1): 77 | for j in range(i+1, len(query_suffix)): 78 | ext = QueryQuality(obs_corpus[i], obs_corpus[j]) 79 | x = ext.transform() 80 | dim = np_utils._dim(x) 81 | fname = "%s_%s_x_%s_%dD"%(ext.__name__(), query_suffix[i], query_suffix[j], dim) 82 | pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x) 83 | corr = np_utils._corr(x[:TRAIN_SIZE], y_train) 84 | logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr)) 85 | 86 | # raw 87 | dfAll = pkl_utils._load(config.ALL_DATA_RAW) 88 | obs_fields = ["search_term"] 89 | param_list = [] 90 | sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) 91 | sf.go() 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_transformer.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: feature transformer 5 | 6 | """ 7 | 8 | from collections import Counter 9 | 10 | from sklearn.base import BaseEstimator 11 | 12 | 13 | #### adopted from @Ben Hamner's Python Benchmark code 14 | ## https://www.kaggle.com/benhamner/crowdflower-search-relevance/python-benchmark 15 | def identity(x): 16 | return x 17 | 18 | 19 | class SimpleTransform(BaseEstimator): 20 | def __init__(self, transformer=identity): 21 | self.transformer = transformer 22 | 23 | def fit(self, X, y=None): 24 | return self 25 | 26 | def fit_transform(self, X, y=None): 27 | return self.transform(X) 28 | 29 | def transform(self, X, y=None): 30 | return self.transformer(X) 31 | 32 | 33 | class ColumnSelector(BaseEstimator): 34 | def __init__(self, columns=-1): 35 | # assert (type(columns) == int) or (type(columns) == list) 36 | self.columns = columns 37 | 38 | def fit(self, X, y=None): 39 | return self 40 | 41 | def fit_transform(self, X, y=None): 42 | return self.transform(X) 43 | 44 | def transform(self, X, y=None): 45 | if len(X.shape) == 1: 46 | return X 47 | elif self.columns == -1: 48 | return X 49 | else: 50 | return X[:,self.columns] 51 | 52 | 53 | # feature mapper for mapping rare categorical values to a special case 54 | # example 55 | # mapper = FeatureMapper(10, 0) 56 | # dfTrain = mapper.fit_transform(dfTrain, "Medical_History_2") 57 | # dfTest = mapper.transform(dfTest, "Medical_History_2") 58 | class FeatureMapper: 59 | def __init__(self, threshold, rare_code): 60 | self.threshold = threshold 61 | self.rare_code = rare_code 62 | self.counter = Counter() 63 | self.mapper = {} 64 | 65 | def fit(self, X): 66 | self.counter = Counter(X) 67 | if self.rare_code is None: 68 | most_freq = sorted(self.counter.items(), 69 | key=lambda x: x[1], 70 | reverse=True)[0][0] 71 | self.rare_code = most_freq 72 | self.mapper = {} 73 | for k,v in self.counter.items(): 74 | if v < self.threshold: 75 | self.mapper[k] = self.rare_code 76 | return self 77 | 78 | def transform(self, X): 79 | Y = map(lambda x:self.mapper.get(x, x), X) 80 | return Y 81 | 82 | def fit_transform(self, X): 83 | self.fit(X) 84 | return self.transform(X) 85 | 86 | 87 | class CountFeaturizer: 88 | def __init__(self): 89 | self.mapper = Counter() 90 | 91 | def fit(self, X): 92 | self.mapper = Counter(X) 93 | s = sum(self.mapper.values()) 94 | for k,v in self.mapper.items(): 95 | self.mapper[k] = float(v) / s 96 | return self 97 | 98 | def transform(self, X): 99 | Y = map(lambda x:self.mapper.get(x, 0), X) 100 | return Y 101 | 102 | def fit_transform(self, X): 103 | self.fit(X) 104 | return self.transform(X) 105 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_tsne.R: -------------------------------------------------------------------------------- 1 | # 2 | # @author: Chenglong Chen 3 | # @brief: tsne based features 4 | # 5 | 6 | require(data.table) 7 | require(Rtsne) 8 | 9 | # random seed for reproducibility 10 | set.seed(2016) 11 | 12 | # path 13 | setwd(".") 14 | feat_dir <- "../../Feat/" 15 | 16 | # feature names 17 | fnames <- c( 18 | "LSA100_Word_Unigram_Pair_search_term_x_product_title_100D", 19 | "LSA100_Word_Bigram_Pair_search_term_x_product_title_100D", 20 | "LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D", 21 | "LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D" 22 | ) 23 | 24 | 
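# NOTE: the LSA matrices listed above are expected as plain .csv files under
# feat_dir. As run_data.py shows, they are produced by feature_vector_space.py
# and converted from .pkl to .csv by convert_pkl_lsa_to_csv_lsa.py before this
# script runs; the t-SNE output written below is converted back to .pkl
# features by convert_csv_tsne_to_pkl_tsne.py.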
# setting 25 | perplexity <- 30 26 | theta <- 0.5 27 | dims <- 2 28 | 29 | # run 30 | for(fname in fnames) { 31 | # load lsa features 32 | file_lsa <- paste(feat_dir, fname, ".csv", sep="") 33 | X <- fread(file_lsa, data.table=F) 34 | X <- as.matrix(X) 35 | gc() 36 | 37 | # run tsne 38 | tsne <- Rtsne(X , check_duplicates=FALSE, pca=FALSE, 39 | perplexity=perplexity, theta=theta, dims=dims) 40 | 41 | # save tsne features 42 | col.names <- paste("TSNE_", 1:ncol(tsne$Y), sep="") 43 | file_tsne <- paste(feat_dir, "/TSNE_", fname, ".csv", sep="") 44 | write.table(tsne$Y, file=file_tsne, sep=',', quote=FALSE, 45 | row.names=FALSE, col.names=col.names) 46 | } 47 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_wordnet_similarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: wordnet similarity based features (veeerrry time consuming) 5 | @note: in our final submission, we are only able to generate WordNet_Path_Similarity between 6 | search_term and product_title in reasonable time. 7 | """ 8 | 9 | """ 10 | http://stackoverflow.com/questions/16877517/compare-similarity-of-terms-expressions-using-nltk 11 | http://stackoverflow.com/questions/22031968/how-to-find-distance-between-two-synset-using-python-nltk-in-wordnet-hierarchy 12 | 13 | #---------------------------------------------------------------------------------------- 14 | Path similarity, wup_similarity and lch_similarity, all of these should work 15 | since they are based on the distance between two synsets in the Wordnet hierarchy. 16 | 17 | dog = wn.synset('dog.n.01') 18 | cat = wn.synset('cat.n.01') 19 | 20 | dog.path_similarity(cat) 21 | 22 | dog.lch_similarity(cat) 23 | 24 | dog.wup_similarity(cat) 25 | 26 | #---------------------------------------------------------------------------------------- 27 | synset1.path_similarity(synset2): 28 | 29 | Return a score denoting how similar two word senses are, based on the shortest 30 | path that connects the senses in the is-a (hypernym/hypnoym) taxonomy. The 31 | score is in the range 0 to 1, except in those cases where a path cannot be 32 | found (will only be true for verbs as there are many distinct verb taxonomies), 33 | in which case -1 is returned. A score of 1 represents identity i.e. comparing 34 | a sense with itself will return 1. 35 | 36 | #---------------------------------------------------------------------------------------- 37 | synset1.lch_similarity(synset2), Leacock-Chodorow Similarity: 38 | 39 | Return a score denoting how similar two word senses are, based on the shortest 40 | path that connects the senses (as above) and the maximum depth of the taxonomy 41 | in which the senses occur. The relationship is given as -log(p/2d) where p is 42 | the shortest path length and d the taxonomy depth. 43 | 44 | #---------------------------------------------------------------------------------------- 45 | synset1.wup_similarity(synset2), Wu-Palmer Similarity: 46 | 47 | Return a score denoting how similar two word senses are, based on the depth of the 48 | two senses in the taxonomy and that of their Least Common Subsumer (most specific 49 | ancestor node). Note that at this time the scores given do not always agree with 50 | those given by Pedersen's Perl implementation of Wordnet Similarity. 
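#----------------------------------------------------------------------------------------
For concreteness (values as in NLTK's WordNet howto example):
dog.path_similarity(cat) evaluates to 0.2, i.e. 1/(shortest hypernym path length + 1),
with a path of length 4 between dog.n.01 and cat.n.01.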
51 | """ 52 | 53 | import string 54 | 55 | import numpy as np 56 | import pandas as pd 57 | from nltk.corpus import wordnet as wn 58 | 59 | import config 60 | from utils import dist_utils, ngram_utils, nlp_utils, pkl_utils 61 | from utils import logging_utils, time_utils 62 | from feature_base import BaseEstimator, PairwiseFeatureWrapper 63 | 64 | 65 | # tune the token pattern to get a better correlation with y_train 66 | # token_pattern = r"(?u)\b\w\w+\b" 67 | # token_pattern = r"\w{1,}" 68 | # token_pattern = r"\w+" 69 | # token_pattern = r"[\w']+" 70 | token_pattern = " " # just split the text into tokens 71 | 72 | 73 | class WordNet_Similarity(BaseEstimator): 74 | """Double aggregation features""" 75 | def __init__(self, obs_corpus, target_corpus, metric="path", aggregation_mode_prev="", aggregation_mode=""): 76 | super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev) 77 | self.metric = metric 78 | if self.metric == "path": 79 | self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2) 80 | elif self.metric == "lch": 81 | self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2) 82 | elif self.metric == "wup": 83 | self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2) 84 | else: 85 | raise(ValueError("Wrong similarity metric: %s, should be one of path/lch/wup."%self.metric)) 86 | 87 | def __name__(self): 88 | feat_name = [] 89 | for m1 in self.aggregation_mode_prev: 90 | for m in self.aggregation_mode: 91 | n = "WordNet_%s_Similarity_%s_%s"%( 92 | string.capwords(self.metric), string.capwords(m1), string.capwords(m)) 93 | feat_name.append(n) 94 | return feat_name 95 | 96 | def _maximum_similarity_for_two_synset_list(self, syn_list1, syn_list2): 97 | s = 0. 98 | if syn_list1 and syn_list2: 99 | for syn1 in syn_list1: 100 | for syn2 in syn_list2: 101 | try: 102 | _s = self.metric_func(syn1, syn2) 103 | except: 104 | _s = config.MISSING_VALUE_NUMERIC 105 | if _s and _s > s: 106 | s = _s 107 | return s 108 | 109 | def transform_one(self, obs, target, id): 110 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 111 | target_tokens = nlp_utils._tokenize(target, token_pattern) 112 | obs_synset_list = [wn.synsets(obs_token) for obs_token in obs_tokens] 113 | target_synset_list = [wn.synsets(target_token) for target_token in target_tokens] 114 | val_list = [] 115 | for obs_synset in obs_synset_list: 116 | _val_list = [] 117 | for target_synset in target_synset_list: 118 | _s = self._maximum_similarity_for_two_synset_list(obs_synset, target_synset) 119 | _val_list.append(_s) 120 | if len(_val_list) == 0: 121 | _val_list = [config.MISSING_VALUE_NUMERIC] 122 | val_list.append( _val_list ) 123 | if len(val_list) == 0: 124 | val_list = [[config.MISSING_VALUE_NUMERIC]] 125 | return val_list 126 | 127 | 128 | class WordNet_Path_Similarity(WordNet_Similarity): 129 | def __init__(self, obs_corpus, target_corpus, aggregation_mode_prev="", aggregation_mode=""): 130 | super().__init__(obs_corpus, target_corpus, "path", aggregation_mode_prev, aggregation_mode) 131 | 132 | 133 | class WordNet_Lch_Similarity(WordNet_Similarity): 134 | def __init__(self, obs_corpus, target_corpus, aggregation_mode_prev="", aggregation_mode=""): 135 | super().__init__(obs_corpus, target_corpus, "lch", aggregation_mode_prev, aggregation_mode) 136 | 137 | 138 | class WordNet_Wup_Similarity(WordNet_Similarity): 139 | def __init__(self, obs_corpus, target_corpus, aggregation_mode_prev="", aggregation_mode=""): 140 | super().__init__(obs_corpus, target_corpus, 
"wup", aggregation_mode_prev, aggregation_mode) 141 | 142 | 143 | # ---------------------------- Main -------------------------------------- 144 | def main(): 145 | logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp() 146 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 147 | #### NOTE: use data BEFORE STEMMING 148 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) 149 | 150 | # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in final submission 151 | generators = [ 152 | WordNet_Path_Similarity, 153 | WordNet_Lch_Similarity, 154 | WordNet_Wup_Similarity, 155 | ][:1] 156 | obs_fields_list = [] 157 | target_fields_list = [] 158 | # only search_term and product_title are used in final submission 159 | obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] ) 160 | target_fields_list.append( ["product_title", "product_description", "product_attribute"][:1] ) 161 | # double aggregation 162 | aggregation_mode_prev = ["mean", "max", "min", "median"] 163 | aggregation_mode = ["mean", "std", "max", "min", "median"] 164 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 165 | for generator in generators: 166 | param_list = [aggregation_mode_prev, aggregation_mode] 167 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 168 | pf.go() 169 | 170 | 171 | if __name__ == "__main__": 172 | main() 173 | -------------------------------------------------------------------------------- /Code/Chenglong/gen_best_ensemble_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for generating the best ensemble model from Chenglong's side 5 | @note: 1. make sure you have run `python run_data.py` first 6 | 2. make sure you have built `some diverse` 1st level models first (see `./Log/level1_models` for example) 7 | 8 | """ 9 | 10 | import os 11 | 12 | 13 | cmd = "python run_stacking_ridge.py -l 2 -d 0 -t 10 -c 1 -L reg_ensemble -o" 14 | os.system(cmd) 15 | -------------------------------------------------------------------------------- /Code/Chenglong/gen_best_single_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for generating the best single model from Chenglong's side 5 | @note: 1. make sure you have run `python run_data.py` first 6 | 2. 
RMSE should be something around 0.438 ~ 0.439 7 | 8 | """ 9 | 10 | import os 11 | 12 | 13 | suffix = '201604210409' 14 | threshold = 0.05 15 | 16 | cmd = "python feature_combiner.py -l 1 -c feature_conf_nonlinear_%s -n basic_nonlinear_%s -t %.6f"%(suffix, suffix, threshold) 17 | os.system(cmd) 18 | 19 | cmd = "python task.py -m single -f basic_nonlinear_%s -l reg_xgb_tree_best_single_model -e 1"%suffix 20 | os.system(cmd) 21 | -------------------------------------------------------------------------------- /Code/Chenglong/get_feature_conf_linear.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate feature conf for the following models (most of which are linear models) 5 | - reg_skl_ridge 6 | - reg_skl_bayesian_ridge 7 | - reg_skl_lasso 8 | - reg_skl_lsvr 9 | - reg_xgb_linear 10 | - reg_keras_dnn (nonlinear models) 11 | @note: 12 | - such features DO NOT INCLUDE "DocId_(search_term|product_title|product_color|product_brand)" 13 | - one can tune the MANDATORY_FEATS and COMMENT_OUT_FEATS to generate different feature subset 14 | 15 | """ 16 | 17 | import re 18 | import os 19 | from optparse import OptionParser 20 | 21 | import config 22 | from utils import time_utils 23 | 24 | 25 | INCLUDE_FEATS = [ 26 | ".+" 27 | ] 28 | 29 | 30 | COUNT_FEATS = [ 31 | "Freq", 32 | "Len", 33 | "Count", 34 | "Size", 35 | "Position", 36 | ] 37 | # COUNT_FEATS = [] 38 | 39 | 40 | NOT_COUNT_FEATS = ["Norm", "Ratio"] 41 | 42 | 43 | MANDATORY_FEATS = [ 44 | 45 | # including product_uid according to 46 | # https://www.kaggle.com/c/home-depot-product-search-relevance/forums/t/20288/trends-in-relevances-by-row-ids/115886#post115886 47 | "DocIdEcho_product_uid", 48 | "ProductUidDummy1_product_uid", 49 | "ProductUidDummy2_product_uid", 50 | 51 | "IsInGoogleDict", 52 | "GroupRelevance_Size", 53 | "TSNE", 54 | ] 55 | 56 | 57 | COMMENT_OUT_FEATS = [ 58 | 59 | #-------------- General -------------- 60 | "search_term_alt", 61 | 62 | "Bigram", 63 | "Trigram", 64 | "UBgram", 65 | "UBTgram", 66 | 67 | "Median", 68 | "Std", 69 | 70 | ".+(Bigram|Trigram)_.+_product_(brand|color)", 71 | 72 | 73 | #-------------- Basic -------------- 74 | "DocLogFreq", 75 | "Digit", 76 | "Unique", 77 | "^DocIdOneHot", 78 | "^DocId", 79 | 80 | "DocLen_product_(brand|color)", 81 | "DocLen_product_attribute_1D", 82 | "DocFreq_product_description_1D", 83 | "DocFreq_product_attribute_1D", 84 | "Digit(Count|Ratio)_product_(brand|color)", 85 | "Doc(Entropy|Len)_product_(brand|color)", 86 | "Unique(Count|Ratio)_.+_product_(brand|color)", 87 | 88 | 89 | #-------------- Distance -------------- 90 | "DiceDistance", 91 | # "EditDistance", 92 | "Compression", 93 | 94 | 95 | #-------------- First and Last Ngram -------------- 96 | "FirstIntersectNormPosition", 97 | "FirstIntersectPosition", 98 | "LastIntersectNormPosition", 99 | "LastIntersectPosition", 100 | 101 | 102 | #-------------- Group -------------- 103 | "GroupRelevance_(Mean|Std|Max|Min|Median)", 104 | "Group_\d+", 105 | "GroupDistanceStat", 106 | 107 | 108 | #-------------- Intersect Count & Position -------------- 109 | "IntersectPosition_.+_(Std|Max|Min|Median)", 110 | "IntersectNormPosition_.+_(Std|Max|Min|Median)", 111 | 112 | 113 | #-------------- Match -------------- 114 | "LongestMatchSize", 115 | 116 | 117 | #-------------- StatCooc -------------- 118 | # since product_name is of length 2, it makes no difference for various aggregation as there is only one item 119 | 
"StatCooc(TF|NormTF|TFIDF|NormTFIDF|BM25)_Bigram_(Std|Max|Min|Median)_search_term_product_name_x_product_title_product_name_1D", 120 | "StatCooc(TF|NormTF|TFIDF|NormTFIDF|BM25)_Bigram_(Std|Max|Min|Median)_product_title_product_name_x_search_term_product_name_1D", 121 | 122 | "NormTF", 123 | "NormTFIDF", 124 | 125 | 126 | #-------------- Vector Space -------------- 127 | # as TFIDF_Word_Trigram has the largest corr 128 | "LSA\d+_Word_Unigram", 129 | "LSA\d+_Word_Bigram", 130 | "TFIDF_Word_Unigram", 131 | "TFIDF_Word_Bigram", 132 | 133 | # as TFIDF_Char_Fourgram has the largest corr 134 | "LSA\d+_Char_Bigram", 135 | "LSA\d+_Char_Trigram", 136 | "LSA\d+_Char_Fivegram", 137 | "TFIDF_Char_Bigram", 138 | "TFIDF_Char_Trigram", 139 | "TFIDF_Char_Fivegram", 140 | 141 | "CharDistribution_Ratio", 142 | 143 | 144 | #-------------- Word2Vec & Doc2Vec -------------- 145 | "_Vector_", 146 | "_Vdiff_", 147 | "Word2Vec_Wikipedia_D50", 148 | "Word2Vec_Wikipedia_D100", 149 | "Word2Vec_Wikipedia_D200", 150 | # "Word2Vec_GoogleNews", 151 | "Word2Vec_GoogleNews_D300_Vector", 152 | # as all the words are used to train the model 153 | "Word2Vec_Homedepot_D100_Importance", 154 | "Word2Vec_Homedepot_D100_N_Similarity_Imp", 155 | 156 | 157 | #-------------- Turing Test -------------- 158 | # d = { 159 | # "df_basic_features.csv": "Basic", 160 | # "df_brand_material_dummies.csv": "BrandMaterialDummy", 161 | # "df_dist_new.csv": "Dist", 162 | # "dld_features.csv": "DLD", 163 | # "df_st_tfidf.csv": "StTFIDF", 164 | # "df_tfidf_intersept_new.csv": "TFIDF", 165 | # "df_thekey_dummies.csv": "TheKeyDummy", 166 | # "df_word2vec_new.csv": "Word2Vec", 167 | # } 168 | # "TuringTest_Basic", 169 | # "TuringTest_BrandMaterialDummy", 170 | # "TuringTest_Dist", 171 | # "TuringTest_DLD", 172 | # "TuringTest_StTFIDF", 173 | # "TuringTest_TFIDF", 174 | # "TuringTest_TheKeyDummy", 175 | # "TuringTest_Word2Vec", 176 | 177 | 178 | ] 179 | 180 | 181 | def _check_include(fname): 182 | for v in INCLUDE_FEATS: 183 | pat = re.compile(v) 184 | if len(re.findall(pat, fname)) > 0: 185 | return True 186 | return False 187 | 188 | 189 | def _check_count_feat(fname): 190 | for v in NOT_COUNT_FEATS: 191 | pat = re.compile(v) 192 | if len(re.findall(pat, fname)) > 0: 193 | return False 194 | for v in COUNT_FEATS: 195 | pat = re.compile(v) 196 | if len(re.findall(pat, fname)) > 0: 197 | return True 198 | return False 199 | 200 | 201 | def _check_lsa_matrix(fname): 202 | pat = re.compile("^LSA") 203 | if len(re.findall(pat, fname)) > 0: 204 | return True 205 | return False 206 | 207 | 208 | def _check_mandatory(fname): 209 | for v in MANDATORY_FEATS: 210 | pat = re.compile(v) 211 | if len(re.findall(pat, fname)) > 0: 212 | return True 213 | return False 214 | 215 | 216 | def _check_comment_out(fname): 217 | for v in COMMENT_OUT_FEATS: 218 | pat = re.compile(v) 219 | if len(re.findall(pat, fname)) > 0: 220 | return True 221 | return False 222 | 223 | 224 | header_pattern = """ 225 | # -*- coding: utf-8 -*- 226 | \"\"\" 227 | @author: Chenglong Chen 228 | @brief: one feature conf 229 | 230 | Generated by 231 | python %s -d %d -o %s 232 | 233 | Format: 234 | FEATURE_NAME : (MANDATORY, TRANSFORM) 235 | 236 | \"\"\" 237 | 238 | import config 239 | from feature_transformer import SimpleTransform, ColumnSelector 240 | 241 | LSA_COLUMNS = range(%d) 242 | 243 | feature_dict = { 244 | 245 | """ 246 | 247 | 248 | def _create_feature_conf(lsa_columns, outfile): 249 | res = header_pattern%(__file__, int(lsa_columns), outfile, int(lsa_columns)) 250 | 251 | folders = 
[config.FEAT_DIR, config.FEAT_DIR+"/All"] 252 | for folder in folders: 253 | try: 254 | for file in sorted(os.listdir(folder)): 255 | if config.FEAT_FILE_SUFFIX in file: 256 | fname = file.split(".")[0] 257 | if _check_include(fname): 258 | line = "" 259 | mandatory = _check_mandatory(fname) 260 | if not mandatory and _check_comment_out(fname): 261 | continue 262 | line += "# " 263 | line += "'%s' : "%fname 264 | if mandatory: 265 | line += "(True, " 266 | else: 267 | line += "(False, " 268 | if _check_lsa_matrix(fname): 269 | if int(lsa_columns) > 0: 270 | line += "ColumnSelector(LSA_COLUMNS)),\n" 271 | else: 272 | continue 273 | elif _check_count_feat(fname): 274 | line += "SimpleTransform(config.COUNT_TRANSFORM)),\n" 275 | else: 276 | line += "SimpleTransform()),\n" 277 | res += line 278 | except: 279 | pass 280 | res += "}\n" 281 | 282 | with open(os.path.join(config.FEAT_CONF_DIR, outfile), "w") as f: 283 | f.write(res) 284 | 285 | 286 | def parse_args(parser): 287 | parser.add_option("-d", "--dim", default=1, type=int, dest="lsa_columns", 288 | help="lsa_columns") 289 | parser.add_option("-o", "--outfile", default="feature_conf_%s.py"%time_utils._timestamp(), 290 | type="string", dest="outfile", help="outfile") 291 | 292 | (options, args) = parser.parse_args() 293 | return options, args 294 | 295 | 296 | def main(options): 297 | _create_feature_conf(lsa_columns=options.lsa_columns, outfile=options.outfile) 298 | 299 | 300 | if __name__ == "__main__": 301 | parser = OptionParser() 302 | options, args = parse_args(parser) 303 | main(options) 304 | -------------------------------------------------------------------------------- /Code/Chenglong/get_feature_conf_nonlinear.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate feature conf for the following models 5 | - reg_skl_gbm 6 | - reg_skl_adaboost 7 | - reg_skl_etr 8 | - reg_skl_rf 9 | - reg_xgb_tree 10 | - reg_rgf 11 | @note: 12 | - such features INCLUDE "DocId_(search_term|product_title|product_color|product_brand)" 13 | - one can tune the MANDATORY_FEATS and COMMENT_OUT_FEATS to generate different feature subset 14 | 15 | """ 16 | 17 | import re 18 | import os 19 | from optparse import OptionParser 20 | 21 | import config 22 | from utils import time_utils 23 | 24 | 25 | INCLUDE_FEATS = [ 26 | ".+" 27 | ] 28 | 29 | 30 | COUNT_FEATS = [ 31 | "Freq", 32 | "Len", 33 | "Count", 34 | "Size", 35 | "Position", 36 | ] 37 | # COUNT_FEATS = [] 38 | 39 | 40 | NOT_COUNT_FEATS = ["Norm", "Ratio"] 41 | 42 | 43 | MANDATORY_FEATS = [ 44 | 45 | "DocId_(search_term|product_title|product_color|product_brand)", 46 | 47 | # including product_uid according to 48 | # https://www.kaggle.com/c/home-depot-product-search-relevance/forums/t/20288/trends-in-relevances-by-row-ids/115886#post115886 49 | "DocIdEcho_product_uid", 50 | "ProductUidDummy1_product_uid", 51 | "ProductUidDummy2_product_uid", 52 | 53 | "IsInGoogleDict", 54 | "GroupRelevance_Size", 55 | "TSNE", 56 | ] 57 | 58 | 59 | COMMENT_OUT_FEATS = [ 60 | 61 | #-------------- General -------------- 62 | "search_term_alt", 63 | 64 | "Bigram", 65 | "Trigram", 66 | "UBgram", 67 | "UBTgram", 68 | 69 | "Median", 70 | "Std", 71 | 72 | ".+(Bigram|Trigram)_.+_product_(brand|color)", 73 | 74 | 75 | #-------------- Basic -------------- 76 | "DocLogFreq", 77 | "Digit", 78 | "Unique", 79 | "^DocIdOneHot", 80 | "^DocId", 81 | 82 | "DocLen_product_(brand|color)", 83 | "DocLen_product_attribute_1D", 84 
| "DocFreq_product_description_1D", 85 | "DocFreq_product_attribute_1D", 86 | "Digit(Count|Ratio)_product_(brand|color)", 87 | "Doc(Entropy|Len)_product_(brand|color)", 88 | "Unique(Count|Ratio)_.+_product_(brand|color)", 89 | 90 | 91 | #-------------- Distance -------------- 92 | "DiceDistance", 93 | # "EditDistance", 94 | "Compression", 95 | 96 | 97 | #-------------- First and Last Ngram -------------- 98 | "FirstIntersectNormPosition", 99 | "FirstIntersectPosition", 100 | "LastIntersectNormPosition", 101 | "LastIntersectPosition", 102 | 103 | 104 | #-------------- Group -------------- 105 | "GroupRelevance_(Mean|Std|Max|Min|Median)", 106 | "Group_\d+", 107 | "GroupDistanceStat", 108 | 109 | 110 | #-------------- Intersect Count & Position -------------- 111 | "IntersectPosition_.+_(Std|Max|Min|Median)", 112 | "IntersectNormPosition_.+_(Std|Max|Min|Median)", 113 | 114 | 115 | #-------------- Match -------------- 116 | "LongestMatchSize", 117 | 118 | 119 | #-------------- StatCooc -------------- 120 | # since product_name is of length 2, it makes no difference for various aggregation as there is only one item 121 | "StatCooc(TF|NormTF|TFIDF|NormTFIDF|BM25)_Bigram_(Std|Max|Min|Median)_search_term_product_name_x_product_title_product_name_1D", 122 | "StatCooc(TF|NormTF|TFIDF|NormTFIDF|BM25)_Bigram_(Std|Max|Min|Median)_product_title_product_name_x_search_term_product_name_1D", 123 | 124 | "NormTF", 125 | "NormTFIDF", 126 | 127 | 128 | #-------------- Vector Space -------------- 129 | # as TFIDF_Word_Trigram has the largest corr 130 | "LSA\d+_Word_Unigram", 131 | "LSA\d+_Word_Bigram", 132 | "TFIDF_Word_Unigram", 133 | "TFIDF_Word_Bigram", 134 | 135 | # as TFIDF_Char_Fourgram has the largest corr 136 | "LSA\d+_Char_Bigram", 137 | "LSA\d+_Char_Trigram", 138 | "LSA\d+_Char_Fivegram", 139 | "TFIDF_Char_Bigram", 140 | "TFIDF_Char_Trigram", 141 | "TFIDF_Char_Fivegram", 142 | 143 | "CharDistribution_Ratio", 144 | 145 | 146 | #-------------- Word2Vec & Doc2Vec -------------- 147 | "_Vector_", 148 | "_Vdiff_", 149 | "Word2Vec_Wikipedia_D50", 150 | "Word2Vec_Wikipedia_D100", 151 | "Word2Vec_Wikipedia_D200", 152 | # "Word2Vec_GoogleNews", 153 | "Word2Vec_GoogleNews_D300_Vector", 154 | # as all the words are used to train the model 155 | "Word2Vec_Homedepot_D100_Importance", 156 | "Word2Vec_Homedepot_D100_N_Similarity_Imp", 157 | 158 | 159 | #-------------- Turing Test -------------- 160 | # d = { 161 | # "df_basic_features.csv": "Basic", 162 | # "df_brand_material_dummies.csv": "BrandMaterialDummy", 163 | # "df_dist_new.csv": "Dist", 164 | # "dld_features.csv": "DLD", 165 | # "df_st_tfidf.csv": "StTFIDF", 166 | # "df_tfidf_intersept_new.csv": "TFIDF", 167 | # "df_thekey_dummies.csv": "TheKeyDummy", 168 | # "df_word2vec_new.csv": "Word2Vec", 169 | # } 170 | # "TuringTest_Basic", 171 | # "TuringTest_BrandMaterialDummy", 172 | # "TuringTest_Dist", 173 | # "TuringTest_DLD", 174 | # "TuringTest_StTFIDF", 175 | # "TuringTest_TFIDF", 176 | # "TuringTest_TheKeyDummy", 177 | # "TuringTest_Word2Vec", 178 | 179 | 180 | ] 181 | 182 | 183 | def _check_include(fname): 184 | for v in INCLUDE_FEATS: 185 | pat = re.compile(v) 186 | if len(re.findall(pat, fname)) > 0: 187 | return True 188 | return False 189 | 190 | 191 | def _check_count_feat(fname): 192 | for v in NOT_COUNT_FEATS: 193 | pat = re.compile(v) 194 | if len(re.findall(pat, fname)) > 0: 195 | return False 196 | for v in COUNT_FEATS: 197 | pat = re.compile(v) 198 | if len(re.findall(pat, fname)) > 0: 199 | return True 200 | return False 201 | 202 | 203 | def 
_check_lsa_matrix(fname): 204 | pat = re.compile("^LSA") 205 | if len(re.findall(pat, fname)) > 0: 206 | return True 207 | return False 208 | 209 | 210 | def _check_mandatory(fname): 211 | for v in MANDATORY_FEATS: 212 | pat = re.compile(v) 213 | if len(re.findall(pat, fname)) > 0: 214 | return True 215 | return False 216 | 217 | 218 | def _check_comment_out(fname): 219 | for v in COMMENT_OUT_FEATS: 220 | pat = re.compile(v) 221 | if len(re.findall(pat, fname)) > 0: 222 | return True 223 | return False 224 | 225 | 226 | header_pattern = """ 227 | # -*- coding: utf-8 -*- 228 | \"\"\" 229 | @author: Chenglong Chen 230 | @brief: one feature conf 231 | 232 | Generated by 233 | python %s -d %d -o %s 234 | 235 | Format: 236 | FEATURE_NAME : (MANDATORY, TRANSFORM) 237 | 238 | \"\"\" 239 | 240 | import config 241 | from feature_transformer import SimpleTransform, ColumnSelector 242 | 243 | LSA_COLUMNS = range(%d) 244 | 245 | feature_dict = { 246 | 247 | """ 248 | 249 | 250 | def _create_feature_conf(lsa_columns, outfile): 251 | res = header_pattern%(__file__, int(lsa_columns), outfile, int(lsa_columns)) 252 | 253 | folders = [config.FEAT_DIR, config.FEAT_DIR+"/All"] 254 | for folder in folders: 255 | try: 256 | for file in sorted(os.listdir(folder)): 257 | if config.FEAT_FILE_SUFFIX in file: 258 | fname = file.split(".")[0] 259 | if _check_include(fname): 260 | line = "" 261 | mandatory = _check_mandatory(fname) 262 | if not mandatory and _check_comment_out(fname): 263 | continue 264 | line += "# " 265 | line += "'%s' : "%fname 266 | if mandatory: 267 | line += "(True, " 268 | else: 269 | line += "(False, " 270 | if _check_lsa_matrix(fname): 271 | if int(lsa_columns) > 0: 272 | line += "ColumnSelector(LSA_COLUMNS)),\n" 273 | else: 274 | continue 275 | elif _check_count_feat(fname): 276 | line += "SimpleTransform(config.COUNT_TRANSFORM)),\n" 277 | else: 278 | line += "SimpleTransform()),\n" 279 | res += line 280 | except: 281 | pass 282 | res += "}\n" 283 | 284 | with open(os.path.join(config.FEAT_CONF_DIR, outfile), "w") as f: 285 | f.write(res) 286 | 287 | 288 | def parse_args(parser): 289 | parser.add_option("-d", "--dim", default=1, type=int, dest="lsa_columns", 290 | help="lsa_columns") 291 | parser.add_option("-o", "--outfile", default="feature_conf_%s.py"%time_utils._timestamp(), 292 | type="string", dest="outfile", help="outfile") 293 | 294 | (options, args) = parser.parse_args() 295 | return options, args 296 | 297 | 298 | def main(options): 299 | _create_feature_conf(lsa_columns=options.lsa_columns, outfile=options.outfile) 300 | 301 | 302 | if __name__ == "__main__": 303 | parser = OptionParser() 304 | options, args = parse_args(parser) 305 | main(options) 306 | -------------------------------------------------------------------------------- /Code/Chenglong/get_stacking_feature_conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate stacking feature conf for 2nd and 3rd level models 5 | 6 | """ 7 | 8 | import os 9 | import re 10 | from optparse import OptionParser 11 | 12 | import pandas as pd 13 | 14 | import config 15 | from utils import time_utils 16 | 17 | 18 | def grab(pattern, text): 19 | pat = re.compile(pattern) 20 | group = re.findall(pat, text) 21 | return group 22 | 23 | 24 | def check_valid(model): 25 | file = "%s/All/test.pred.%s.csv" % (config.OUTPUT_DIR, model) 26 | try: 27 | df = pd.read_csv(file) 28 | if df.shape[0] == config.TEST_SIZE: 29 | return True 
30 | else: 31 | return False 32 | except: 33 | return False 34 | 35 | 36 | def get_model_list(log_folder, topN): 37 | tasks_ens = [] 38 | for file in sorted(os.listdir(log_folder)): 39 | if not os.path.isfile(os.path.join(log_folder, file)): 40 | continue 41 | text = open(os.path.join(log_folder, file), "r").read() 42 | 43 | # grab everything we need 44 | tasks = grab("(\[Feat@.*)", text) 45 | rmse_mean = grab("Mean: (.*)", text) 46 | rmse_std = grab("Std: (.*)", text) 47 | rmse_mean = [float(x) for x in rmse_mean] 48 | rmse_std = [float(x) for x in rmse_std] 49 | L = min(len(tasks), len(rmse_mean), len(rmse_std)) 50 | d = dict(zip(tasks[:L], rmse_mean[:L])) 51 | 52 | # keep the top-N 53 | ds = sorted(d.items(), key=lambda x: float(x[1])) 54 | cnt = 0 55 | for t,v in ds: 56 | if check_valid(t): 57 | tasks_ens.append(t) 58 | print("Read %s : %.6f"%(t, v)) 59 | cnt += 1 60 | if cnt >= topN: 61 | break 62 | if cnt > 0: 63 | print("Read %d models from %s"%(cnt, file)) 64 | 65 | return tasks_ens 66 | 67 | 68 | header_pattern = """ 69 | # -*- coding: utf-8 -*- 70 | \"\"\" 71 | @author: Chenglong Chen 72 | @brief: one stacking feature conf 73 | 74 | Generated by 75 | python %s -l %s -t %d -o %s 76 | 77 | \"\"\" 78 | 79 | feature_list = [ 80 | 81 | """ 82 | 83 | 84 | def _create_feature_conf(level, topN, outfile): 85 | log_folder = "%s/level%d_models"%(config.LOG_DIR, level) 86 | feature_list = get_model_list(log_folder, topN) 87 | res = header_pattern%(__file__, level, int(topN), outfile) 88 | for feature in feature_list: 89 | res += '"%s",\n'%feature 90 | res += "]\n" 91 | with open(os.path.join(config.FEAT_CONF_DIR, outfile), "w") as f: 92 | f.write(res) 93 | 94 | 95 | def main(options): 96 | _create_feature_conf(level=options.level, topN=options.topN, outfile=options.outfile) 97 | 98 | 99 | def parse_args(parser): 100 | parser.add_option("-l", "--level", default=2, 101 | type="int", dest="level", help="level") 102 | parser.add_option("-t", "--top", default=10, 103 | type="int", dest="topN", help="top-N") 104 | parser.add_option("-o", "--outfile", 105 | default="stacking_feature_conf_%s.py"%time_utils._timestamp(), 106 | type="string", dest="outfile", help="outfile") 107 | (options, args) = parser.parse_args() 108 | return options, args 109 | 110 | 111 | if __name__ == "__main__": 112 | parser = OptionParser() 113 | options, args = parse_args(parser) 114 | main(options) 115 | -------------------------------------------------------------------------------- /Code/Chenglong/plot_CV_LB.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: plot CV RMSE vs LB RMSE 5 | 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import config 12 | 13 | 14 | def main(): 15 | rmse_cv = [ 16 | # [0.527408,0.000768], 17 | # [0.482010,0.000752], 18 | [0.470570,0.000740], 19 | [0.470197,0.000558], 20 | [0.470167,0.000492], 21 | [0.468127,0.000749], 22 | [0.467613,0.000617], 23 | [0.467570,0.000509], 24 | [0.463124,0.000934], 25 | [0.462973,0.001178], 26 | [0.462632,0.001026], 27 | [0.461406,0.001050], 28 | [0.460582,0.001128], 29 | [0.458092,0.000782], 30 | [0.457421,0.000848], 31 | [0.455473,0.001008], 32 | [0.450111,0.000749], 33 | [0.447134,0.001033], 34 | [0.438318,0.000786], 35 | ] 36 | rmse_lb = [ 37 | # [0.52770,0.52690], 38 | # [0.48067,0.48071], 39 | [0.46982,0.47028], 40 | [0.46968,0.46931], 41 | [0.46986,0.46981], 42 | [0.46864,0.46837], 43 | [0.46569,0.46544], 44 | 
[0.46653,0.46623], 45 | [0.46263,0.46181], 46 | [0.46251,0.46180], 47 | [0.46185,0.46147], 48 | [0.45944,0.45900], 49 | [0.45993,0.45958], 50 | [0.45909,0.45860], 51 | [0.45816,0.45725], 52 | [0.45640,0.45533], 53 | [0.44967,0.44902], 54 | [0.44577,0.44457], 55 | [0.43996,0.43811], 56 | ] 57 | 58 | 59 | rmse_cv = np.asarray(rmse_cv, dtype=float) 60 | rmse_lb = np.asarray(rmse_lb, dtype=float) 61 | 62 | N = rmse_cv.shape[0] 63 | x = np.arange(1,N+1,1) 64 | label = "CV" 65 | plt.errorbar(x, rmse_cv[:,0], 66 | yerr=2*rmse_cv[:,1], 67 | fmt='-o', label=label) 68 | plt.plot(x, rmse_lb[:,0]) 69 | plt.plot(x, rmse_lb[:,1]) 70 | plt.xlim(1, N) 71 | plt.title("CV RMSE vs LB RMSE") 72 | plt.xlabel("#Sub") 73 | plt.ylabel("RMSE") 74 | plt.legend(["CV (+- 2std)", "Public LB", "Private LB"], loc="upper right") 75 | fig_file = "%s/CV_LB_Chenglong.pdf"%config.FIG_DIR 76 | plt.savefig(fig_file) 77 | plt.clf() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /Code/Chenglong/plot_feature_corr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: plot correlation with target relevance for each feature group 5 | 6 | """ 7 | 8 | import os 9 | import re 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | import config 15 | 16 | 17 | def is_feat_log(fname): 18 | pat = re.compile("generate_feature_(.+)_2016") 19 | groups = re.findall(pat, fname) 20 | if len(groups) > 0 and groups[0] != "group_relevance": 21 | return groups[0] 22 | return None 23 | 24 | 25 | def grap_feat_line_corr(line): 26 | pat = re.compile("corr = (.+)") 27 | groups = re.findall(pat, line) 28 | if len(groups) > 0: 29 | return float(groups[0]) 30 | return None 31 | 32 | 33 | def grap_feat_line_name(line): 34 | pat = re.compile("INFO: (.+) \(\d+D\):") 35 | groups = re.findall(pat, line) 36 | if len(groups) > 0: 37 | return groups[0] 38 | return None 39 | 40 | 41 | def grap_feat_corr_dict(fname): 42 | d = {} 43 | with open("%s/feature/%s"%(config.LOG_DIR, fname), "r") as f: 44 | for line in f: 45 | corr = grap_feat_line_corr(line) 46 | if corr is not None: 47 | name = grap_feat_line_name(line) 48 | d[name] = (corr) 49 | return d.values() 50 | 51 | def grap_all_feat_corr_dict(): 52 | d = {} 53 | for fname in sorted(os.listdir("%s/feature"%(config.LOG_DIR))): 54 | name = is_feat_log(fname) 55 | if name is not None: 56 | d[name] = grap_feat_corr_dict(fname) 57 | return d 58 | 59 | def main(): 60 | colors = "rgbcmyk" 61 | d = grap_all_feat_corr_dict() 62 | keys = sorted(d.keys()) 63 | N = len(keys) 64 | fig = plt.figure() 65 | ax = fig.add_subplot(111) 66 | for e,k in enumerate(keys, start=1): 67 | vals = sorted(d[k]) 68 | color = colors[(e-1) % len(colors)] 69 | plt.bar(np.linspace(e-0.48,e+0.48,len(vals)), vals, 70 | width=1./(len(vals)+10), color=color, edgecolor=color) 71 | plt.xlabel("Feature Group", fontsize=15) 72 | plt.ylabel("Correlation Coefficient", fontsize=15) 73 | plt.xticks(range(1,N+1), fontsize=15) 74 | plt.yticks([-0.4, -0.2, 0, 0.2, 0.4], fontsize=15) 75 | ax.set_xticklabels(keys, rotation=45, ha="right") 76 | ax.set_xlim([0, N+1]) 77 | ax.set_ylim([-0.4, 0.4]) 78 | pos1 = ax.get_position() 79 | pos2 = [pos1.x0 - 0.075, pos1.y0 + 0.175, pos1.width * 1.2, pos1.height * 0.85] 80 | ax.set_position(pos2) 81 | plt.show() 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | 
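# ---------------------------------------------------------------------------
# A minimal sketch of what a feature log line looks like and what the two
# regexes above extract from it. The sample line is fabricated, but it follows
# the "%s (%dD): corr = %.6f" message written by the feature scripts (see e.g.
# feature_query_quality.py) combined with the "[%(asctime)s] %(levelname)s:
# %(message)s" format set in logging_utils.py.

def _demo_parse_log_line():
    sample = ("[2016-05-08 13:05:00,123] INFO: "
              "QueryQuality_raw_x_lemmatized_1D (1D): corr = 0.123456")
    name = grap_feat_line_name(sample)   # -> "QueryQuality_raw_x_lemmatized_1D"
    corr = grap_feat_line_corr(sample)   # -> 0.123456
    return name, corr
# ---------------------------------------------------------------------------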
-------------------------------------------------------------------------------- /Code/Chenglong/run_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate all the data and features in one shot 5 | @note: if you don't have access to multi-core computers, drop the "&" in the cmd 6 | 7 | """ 8 | 9 | import os 10 | 11 | 12 | #----------------------------------------------------------------------- 13 | # generate split (or you can use mine in ./Data/split/) 14 | cmd = "python splitter.py" 15 | os.system(cmd) 16 | 17 | 18 | #----------------------------------------------------------------------- 19 | # prepare data 20 | cmd = "python data_preparer.py" 21 | os.system(cmd) 22 | 23 | 24 | #----------------------------------------------------------------------- 25 | # process/clean data 26 | cmd = "python data_processor.py" 27 | os.system(cmd) 28 | 29 | 30 | #----------------------------------------------------------------------- 31 | # generate basic features 32 | cmd = "python feature_basic.py &" 33 | os.system(cmd) 34 | 35 | 36 | #----------------------------------------------------------------------- 37 | # generate distance features 38 | cmd = "python feature_distance.py jaccard &" 39 | os.system(cmd) 40 | 41 | cmd = "python feature_distance.py edit &" 42 | os.system(cmd) 43 | 44 | # # not used in the final model 45 | # cmd = "python feature_distance.py compression &" 46 | # os.system(cmd) 47 | 48 | 49 | #----------------------------------------------------------------------- 50 | # generate first and last ngram features 51 | cmd = "python feature_first_last_ngram.py &" 52 | os.system(cmd) 53 | 54 | 55 | #----------------------------------------------------------------------- 56 | # generate group based features (not used in the final model) 57 | # cmd = "python feature_group_distance.py &" 58 | # os.system(cmd) 59 | 60 | # cmd = "python feature_group_distance_stat.py &" 61 | # os.system(cmd) 62 | 63 | cmd = "python feature_group_relevance.py &" 64 | os.system(cmd) 65 | 66 | 67 | #----------------------------------------------------------------------- 68 | # generate intersect features 69 | cmd = "python feature_intersect_count.py &" 70 | os.system(cmd) 71 | 72 | cmd = "python feature_intersect_position.py &" 73 | os.system(cmd) 74 | 75 | 76 | #----------------------------------------------------------------------- 77 | # generate match features 78 | cmd = "python feature_match.py &" 79 | os.system(cmd) 80 | 81 | 82 | #----------------------------------------------------------------------- 83 | # generate query quality features 84 | cmd = "python feature_query_quality.py &" 85 | os.system(cmd) 86 | 87 | 88 | #----------------------------------------------------------------------- 89 | # generate statistical cooccurrence (weighted) features 90 | cmd = "python feature_stat_cooc_tfidf.py tf &" 91 | os.system(cmd) 92 | # cmd = "python feature_stat_cooc_tfidf.py norm_tf &" 93 | # os.system(cmd) 94 | 95 | cmd = "python feature_stat_cooc_tfidf.py tfidf &" 96 | os.system(cmd) 97 | # cmd = "python feature_stat_cooc_tfidf.py norm_tfidf &" 98 | # os.system(cmd) 99 | 100 | cmd = "python feature_stat_cooc_tfidf.py bm25 &" 101 | os.system(cmd) 102 | 103 | 104 | #----------------------------------------------------------------------- 105 | # generate word2vec features using pre-trained word2vec model 106 | cmd = "python feature_word2vec.py google &" 107 | os.system(cmd) 108 | 109 | cmd = 
"python feature_word2vec.py wikipedia &" 110 | os.system(cmd) 111 | 112 | 113 | #----------------------------------------------------------------------- 114 | # generate wordnet similarity features 115 | # time consuming part ~20 hrs 116 | cmd = "python feature_wordnet_similarity.py &" 117 | os.system(cmd) 118 | 119 | 120 | #----------------------------------------------------------------------- 121 | # generate word2vec & doc2vec features using word2vec/doc2vec models 122 | # trained with Homedepot provided data 123 | cmd = "python embedding_trainer.py" 124 | os.system(cmd) 125 | 126 | # most time consuming part 1 ~ 2 days 127 | # after you have trained the WORD2VEC model above, you can MANUALLY distribute the 128 | # feature generation part in feature_word2vec.py to save time, e.g., 129 | # you can run the following part in parallel 130 | # - search_term vs product_title 131 | # - search_term vs product_description 132 | # - search_term vs product_attribute 133 | cmd = "python feature_word2vec.py homedepot &" 134 | os.system(cmd) 135 | 136 | cmd = "python feature_doc2vec.py &" 137 | os.system(cmd) 138 | 139 | 140 | #----------------------------------------------------------------------- 141 | # generate vector space features 142 | # most memory consuming part > 16GB 143 | cmd = "python feature_vector_space.py" 144 | os.system(cmd) 145 | 146 | cmd = "python convert_pkl_lsa_to_csv_lsa.py" 147 | os.system(cmd) 148 | 149 | cmd = "Rscript feature_tsne.R" 150 | os.system(cmd) 151 | 152 | cmd = "python convert_csv_tsne_to_pkl_tsne.py" 153 | os.system(cmd) 154 | -------------------------------------------------------------------------------- /Code/Chenglong/run_stacking_ridge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for testing 2nd & 3rd level model with reg_skl_ridge 5 | 6 | """ 7 | 8 | import os 9 | from optparse import OptionParser 10 | 11 | from utils import time_utils 12 | 13 | 14 | def parse_args(parser): 15 | parser.add_option("-l", "--level", default=2, 16 | type="int", dest="level", help="level") 17 | parser.add_option("-d", "--dim", default=0, 18 | type="int", dest="dim", help="LSA dim") 19 | parser.add_option("-t", "--top", default=10, 20 | type="int", dest="top", help="top N") 21 | parser.add_option("-c", "--corr", default=1.0, 22 | type="float", dest="corr", help="corr") 23 | parser.add_option("-L", "--learner", default="reg_skl_ridge", 24 | type="string", dest="learner", help="learner") 25 | parser.add_option("-o", default=False, action="store_true", dest="refit_once", 26 | help="stacking refit_once") 27 | (options, args) = parser.parse_args() 28 | return options, args 29 | 30 | def main(options): 31 | now = time_utils._timestamp_pretty() 32 | 33 | meta_conf = "level%d_feature_conf_meta_linear_%s"%(options.level, now) 34 | stacking_conf = "level%d_feature_conf_%s"%(options.level, now) 35 | feat_name = "level%d_meta_linear_%s"%(options.level, now) 36 | 37 | # get meta feature conf for `level` models 38 | cmd = "python get_feature_conf_linear_stacking.py -d %d -o %s.py"%( 39 | options.dim, meta_conf) 40 | os.system(cmd) 41 | 42 | # NOTE: using predictions from `level-1` models to generate features 43 | # for `level` models 44 | cmd = "python get_stacking_feature_conf.py -l %d -t %d -o %s.py"%( 45 | options.level-1, options.top, stacking_conf) 46 | os.system(cmd) 47 | 48 | # generate feature for `level` models 49 | cmd = "python feature_combiner.py -l %d -c %s 
-m %s -n %s -s .csv -t %f"%( 50 | options.level, stacking_conf, meta_conf, feat_name, options.corr) 51 | os.system(cmd) 52 | 53 | # train `level` models 54 | if options.refit_once: 55 | cmd = "python task.py -m stacking -f %s -l %s -e 100 -o"%(feat_name, options.learner) 56 | else: 57 | cmd = "python task.py -m stacking -f %s -l %s -e 100"%(feat_name, options.learner) 58 | os.system(cmd) 59 | 60 | if __name__ == "__main__": 61 | parser = OptionParser() 62 | options, args = parse_args(parser) 63 | main(options) 64 | -------------------------------------------------------------------------------- /Code/Chenglong/run_test_ridge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for testing 1st level model with reg_skl_ridge 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from utils import time_utils 12 | 13 | if len(sys.argv) >= 3: 14 | suffix = sys.argv[1] 15 | threshold = float(sys.argv[2]) 16 | else: 17 | suffix = time_utils._timestamp_pretty() 18 | threshold = 0.05 19 | 20 | cmd = "python get_feature_conf_linear.py -d 10 -o feature_conf_linear_%s.py"%suffix 21 | os.system(cmd) 22 | 23 | cmd = "python feature_combiner.py -l 1 -c feature_conf_linear_%s -n basic_linear_%s -t %.6f"%(suffix, suffix, threshold) 24 | os.system(cmd) 25 | 26 | cmd = "python task.py -m single -f basic_linear_%s -l reg_skl_ridge -e 100"%suffix 27 | os.system(cmd) 28 | -------------------------------------------------------------------------------- /Code/Chenglong/run_test_xgb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for testing 1st level model with reg_xgb_tree 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from utils import time_utils 12 | 13 | if len(sys.argv) >= 3: 14 | suffix = sys.argv[1] 15 | threshold = float(sys.argv[2]) 16 | else: 17 | suffix = time_utils._timestamp_pretty() 18 | threshold = 0.05 19 | 20 | cmd = "python get_feature_conf_nonlinear.py -d 10 -o feature_conf_nonlinear_%s.py"%suffix 21 | os.system(cmd) 22 | 23 | cmd = "python feature_combiner.py -l 1 -c feature_conf_nonlinear_%s -n basic_nonlinear_%s -t %.6f"%(suffix, suffix, threshold) 24 | os.system(cmd) 25 | 26 | cmd = "python task.py -m single -f basic_nonlinear_%s -l reg_xgb_tree -e 100"%suffix 27 | os.system(cmd) 28 | -------------------------------------------------------------------------------- /Code/Chenglong/turing_test_converter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: convert .csv format dataframe features (from Igor&Kostia) to .pkl format features 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | import imp 11 | from optparse import OptionParser 12 | 13 | import scipy 14 | import numpy as np 15 | import pandas as pd 16 | 17 | import config 18 | from utils import pkl_utils 19 | 20 | 21 | class TuringTestConverter: 22 | def __init__(self, fname, name): 23 | self.fname = fname 24 | self.name = name 25 | 26 | def convert(self): 27 | dfAll = pd.read_csv(self.fname) 28 | columns_to_drop = ["id", "product_uid", "relevance", "search_term", "product_title"] 29 | columns_to_drop = [col for col in columns_to_drop if col in dfAll.columns] 30 | dfAll.drop(columns_to_drop, axis=1, inplace=True) 31 | for col in dfAll.columns: 32 | 
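# save each remaining dataframe column as its own feature file,
# named TuringTest_<name>_<column>.pkl under config.FEAT_DIR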
pkl_utils._save("%s/TuringTest_%s_%s.pkl"%(config.FEAT_DIR, self.name, col), dfAll[col].values) 33 | 34 | 35 | def main(): 36 | d = { 37 | "df_basic_features.csv": "Basic", 38 | "df_brand_material_dummies.csv": "BrandMaterialDummy", 39 | "df_dist_new.csv": "Dist", 40 | "df_st_tfidf.csv": "StTFIDF", 41 | "df_tfidf_intersept_new.csv": "TFIDF", 42 | "df_thekey_dummies.csv": "TheKeyDummy", 43 | "df_word2vec_new.csv": "Word2Vec", 44 | "dld_features.csv": "DLD", 45 | } 46 | 47 | for k,v in d.items(): 48 | converter = TuringTestConverter( 49 | fname="%s/Turing_test/%s"%(config.FEAT_DIR, k), 50 | name=v) 51 | converter.convert() 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Code/Chenglong/utils/__init__.py -------------------------------------------------------------------------------- /Code/Chenglong/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for distance computation 5 | 6 | """ 7 | 8 | import sys 9 | import warnings 10 | warnings.filterwarnings("ignore") 11 | 12 | try: 13 | import lzma 14 | import Levenshtein 15 | except: 16 | pass 17 | import numpy as np 18 | from difflib import SequenceMatcher 19 | from sklearn.metrics.pairwise import cosine_similarity 20 | 21 | from utils import np_utils 22 | sys.path.append("..") 23 | import config 24 | 25 | 26 | def _edit_dist(str1, str2): 27 | try: 28 | # very fast 29 | # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed 30 | # d = Levenshtein.ratio(str1, str2) 31 | d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2))) 32 | except: 33 | # https://docs.python.org/2/library/difflib.html 34 | d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio() 35 | return d 36 | 37 | 38 | def _is_str_match(str1, str2, threshold=1.0): 39 | assert threshold >= 0.0 and threshold <= 1.0, "Wrong threshold." 40 | if float(threshold) == 1.0: 41 | return str1 == str2 42 | else: 43 | return (1. 
- _edit_dist(str1, str2)) >= threshold 44 | 45 | 46 | def _longest_match_size(str1, str2): 47 | sq = SequenceMatcher(lambda x: x==" ", str1, str2) 48 | match = sq.find_longest_match(0, len(str1), 0, len(str2)) 49 | return match.size 50 | 51 | 52 | def _longest_match_ratio(str1, str2): 53 | sq = SequenceMatcher(lambda x: x==" ", str1, str2) 54 | match = sq.find_longest_match(0, len(str1), 0, len(str2)) 55 | return np_utils._try_divide(match.size, min(len(str1), len(str2))) 56 | 57 | 58 | def _compression_dist(x, y, l_x=None, l_y=None): 59 | if x == y: 60 | return 0 61 | x_b = x.encode('utf-8') 62 | y_b = y.encode('utf-8') 63 | if l_x is None: 64 | l_x = len(lzma.compress(x_b)) 65 | l_y = len(lzma.compress(y_b)) 66 | l_xy = len(lzma.compress(x_b+y_b)) 67 | l_yx = len(lzma.compress(y_b+x_b)) 68 | dist = np_utils._try_divide(min(l_xy,l_yx)-min(l_x,l_y), max(l_x,l_y)) 69 | return dist 70 | 71 | 72 | def _cosine_sim(vec1, vec2): 73 | try: 74 | s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0] 75 | except: 76 | try: 77 | s = cosine_similarity(vec1, vec2)[0][0] 78 | except: 79 | s = config.MISSING_VALUE_NUMERIC 80 | return s 81 | 82 | 83 | def _vdiff(vec1, vec2): 84 | return vec1 - vec2 85 | 86 | 87 | def _rmse(vec1, vec2): 88 | vdiff = vec1 - vec2 89 | rmse = np.sqrt(np.mean(vdiff**2)) 90 | return rmse 91 | 92 | 93 | def _KL(dist1, dist2): 94 | "Kullback-Leibler Divergence" 95 | return np.sum(dist1 * np.log(dist1/dist2), axis=1) 96 | 97 | 98 | def _jaccard_coef(A, B): 99 | if not isinstance(A, set): 100 | A = set(A) 101 | if not isinstance(B, set): 102 | B = set(B) 103 | return np_utils._try_divide(float(len(A.intersection(B))), len(A.union(B))) 104 | 105 | 106 | def _dice_dist(A, B): 107 | if not isinstance(A, set): 108 | A = set(A) 109 | if not isinstance(B, set): 110 | B = set(B) 111 | return np_utils._try_divide(2.*float(len(A.intersection(B))), (len(A) + len(B))) 112 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/keras_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for Keras models 5 | 6 | """ 7 | 8 | from sklearn.preprocessing import StandardScaler 9 | from keras.models import Sequential 10 | from keras.layers.core import Dense, Layer, Dropout, Activation 11 | from keras.layers.normalization import BatchNormalization 12 | from keras.layers.advanced_activations import ELU, PReLU 13 | from keras.optimizers import SGD 14 | from keras.utils import np_utils, generic_utils 15 | 16 | 17 | class KerasDNNRegressor: 18 | def __init__(self, input_dropout=0.2, hidden_layers=2, hidden_units=64, 19 | hidden_activation="relu", hidden_dropout=0.5, batch_norm=None, 20 | optimizer="adadelta", nb_epoch=10, batch_size=64): 21 | self.input_dropout = input_dropout 22 | self.hidden_layers = hidden_layers 23 | self.hidden_units = hidden_units 24 | self.hidden_activation = hidden_activation 25 | self.hidden_dropout = hidden_dropout 26 | self.batch_norm = batch_norm 27 | self.optimizer = optimizer 28 | self.nb_epoch = nb_epoch 29 | self.batch_size = batch_size 30 | self.scaler = None 31 | self.model = None 32 | 33 | def __str__(self): 34 | return self.__repr__() 35 | 36 | def __repr__(self): 37 | return ("%s(input_dropout=%f, hidden_layers=%d, hidden_units=%d, \n" 38 | "hidden_activation=\'%s\', hidden_dropout=%f, batch_norm=\'%s\', \n" 39 | "optimizer=\'%s\', nb_epoch=%d, batch_size=%d)" % ( 40 | 
self.__class__.__name__, 41 | self.input_dropout, 42 | self.hidden_layers, 43 | self.hidden_units, 44 | self.hidden_activation, 45 | self.hidden_dropout, 46 | str(self.batch_norm), 47 | self.optimizer, 48 | self.nb_epoch, 49 | self.batch_size, 50 | )) 51 | 52 | 53 | def fit(self, X, y): 54 | ## scaler 55 | self.scaler = StandardScaler() 56 | X = self.scaler.fit_transform(X) 57 | 58 | #### build model 59 | self.model = Sequential() 60 | ## input layer 61 | self.model.add(Dropout(self.input_dropout, input_shape=(X.shape[1],))) 62 | ## hidden layers 63 | first = True 64 | hidden_layers = self.hidden_layers 65 | while hidden_layers > 0: 66 | self.model.add(Dense(self.hidden_units)) 67 | if self.batch_norm == "before_act": 68 | self.model.add(BatchNormalization()) 69 | if self.hidden_activation == "prelu": 70 | self.model.add(PReLU()) 71 | elif self.hidden_activation == "elu": 72 | self.model.add(ELU()) 73 | else: 74 | self.model.add(Activation(self.hidden_activation)) 75 | if self.batch_norm == "after_act": 76 | self.model.add(BatchNormalization()) 77 | self.model.add(Dropout(self.hidden_dropout)) 78 | hidden_layers -= 1 79 | 80 | ## output layer 81 | output_dim = 1 82 | output_act = "linear" 83 | self.model.add(Dense(output_dim)) 84 | self.model.add(Activation(output_act)) 85 | 86 | ## loss 87 | if self.optimizer == "sgd": 88 | sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) 89 | self.model.compile(loss="mse", optimizer=sgd) 90 | else: 91 | self.model.compile(loss="mse", optimizer=self.optimizer) 92 | 93 | ## fit 94 | self.model.fit(X, y, 95 | nb_epoch=self.nb_epoch, 96 | batch_size=self.batch_size, 97 | validation_split=0, verbose=0) 98 | return self 99 | 100 | def predict(self, X): 101 | X = self.scaler.transform(X) 102 | y_pred = self.model.predict(X) 103 | y_pred = y_pred.flatten() 104 | return y_pred 105 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for logging 5 | 6 | """ 7 | 8 | import os 9 | import logging 10 | import logging.handlers 11 | 12 | 13 | def _get_logger(logdir, logname, loglevel=logging.INFO): 14 | fmt = "[%(asctime)s] %(levelname)s: %(message)s" 15 | formatter = logging.Formatter(fmt) 16 | 17 | handler = logging.handlers.RotatingFileHandler( 18 | filename=os.path.join(logdir, logname), 19 | maxBytes=10*1024*1024, 20 | backupCount=10) 21 | handler.setFormatter(formatter) 22 | 23 | logger = logging.getLogger("") 24 | logger.addHandler(handler) 25 | logger.setLevel(loglevel) 26 | return logger 27 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/ngram_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for ngram 5 | 6 | """ 7 | 8 | 9 | def _unigrams(words): 10 | """ 11 | Input: a list of words, e.g., ["I", "am", "Denny"] 12 | Output: a list of unigram 13 | """ 14 | assert type(words) == list 15 | return words 16 | 17 | 18 | def _bigrams(words, join_string, skip=0): 19 | """ 20 | Input: a list of words, e.g., ["I", "am", "Denny"] 21 | Output: a list of bigram, e.g., ["I_am", "am_Denny"] 22 | I use _ as join_string for this example. 
23 | """ 24 | assert type(words) == list 25 | L = len(words) 26 | if L > 1: 27 | lst = [] 28 | for i in range(L-1): 29 | for k in range(1,skip+2): 30 | if i+k < L: 31 | lst.append( join_string.join([words[i], words[i+k]]) ) 32 | else: 33 | # set it as unigram 34 | lst = _unigrams(words) 35 | return lst 36 | 37 | 38 | def _trigrams(words, join_string, skip=0): 39 | """ 40 | Input: a list of words, e.g., ["I", "am", "Denny"] 41 | Output: a list of trigram, e.g., ["I_am_Denny"] 42 | I use _ as join_string for this example. 43 | """ 44 | assert type(words) == list 45 | L = len(words) 46 | if L > 2: 47 | lst = [] 48 | for i in range(L-2): 49 | for k1 in range(1,skip+2): 50 | for k2 in range(1,skip+2): 51 | if i+k1 < L and i+k1+k2 < L: 52 | lst.append( join_string.join([words[i], words[i+k1], words[i+k1+k2]]) ) 53 | else: 54 | # set it as bigram 55 | lst = _bigrams(words, join_string, skip) 56 | return lst 57 | 58 | 59 | def _fourgrams(words, join_string): 60 | """ 61 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 62 | Output: a list of trigram, e.g., ["I_am_Denny_boy"] 63 | I use _ as join_string for this example. 64 | """ 65 | assert type(words) == list 66 | L = len(words) 67 | if L > 3: 68 | lst = [] 69 | for i in xrange(L-3): 70 | lst.append( join_string.join([words[i], words[i+1], words[i+2], words[i+3]]) ) 71 | else: 72 | # set it as trigram 73 | lst = _trigrams(words, join_string) 74 | return lst 75 | 76 | 77 | def _uniterms(words): 78 | return _unigrams(words) 79 | 80 | 81 | def _biterms(words, join_string): 82 | """ 83 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 84 | Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"] 85 | I use _ as join_string for this example. 86 | """ 87 | assert type(words) == list 88 | L = len(words) 89 | if L > 1: 90 | lst = [] 91 | for i in range(L-1): 92 | for j in range(i+1,L): 93 | lst.append( join_string.join([words[i], words[j]]) ) 94 | else: 95 | # set it as uniterm 96 | lst = _uniterms(words) 97 | return lst 98 | 99 | 100 | def _triterms(words, join_string): 101 | """ 102 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 103 | Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"] 104 | I use _ as join_string for this example. 105 | """ 106 | assert type(words) == list 107 | L = len(words) 108 | if L > 2: 109 | lst = [] 110 | for i in xrange(L-2): 111 | for j in xrange(i+1,L-1): 112 | for k in xrange(j+1,L): 113 | lst.append( join_string.join([words[i], words[j], words[k]]) ) 114 | else: 115 | # set it as biterm 116 | lst = _biterms(words, join_string) 117 | return lst 118 | 119 | 120 | def _fourterms(words, join_string): 121 | """ 122 | Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"] 123 | Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"] 124 | I use _ as join_string for this example. 
125 | """ 126 | assert type(words) == list 127 | L = len(words) 128 | if L > 3: 129 | lst = [] 130 | for i in xrange(L-3): 131 | for j in xrange(i+1,L-2): 132 | for k in xrange(j+1,L-1): 133 | for l in xrange(k+1,L): 134 | lst.append( join_string.join([words[i], words[j], words[k], words[l]]) ) 135 | else: 136 | # set it as triterm 137 | lst = _triterms(words, join_string) 138 | return lst 139 | 140 | 141 | _ngram_str_map = { 142 | 1: "Unigram", 143 | 2: "Bigram", 144 | 3: "Trigram", 145 | 4: "Fourgram", 146 | 5: "Fivegram", 147 | 12: "UBgram", 148 | 123: "UBTgram", 149 | } 150 | 151 | 152 | def _ngrams(words, ngram, join_string=" "): 153 | """wrapper for ngram""" 154 | if ngram == 1: 155 | return _unigrams(words) 156 | elif ngram == 2: 157 | return _bigrams(words, join_string) 158 | elif ngram == 3: 159 | return _trigrams(words, join_string) 160 | elif ngram == 4: 161 | return _fourgrams(words, join_string) 162 | elif ngram == 12: 163 | unigram = _unigrams(words) 164 | bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2] 165 | return unigram + bigram 166 | elif ngram == 123: 167 | unigram = _unigrams(words) 168 | bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2] 169 | trigram = [x for x in _trigrams(words, join_string) if len(x.split(join_string)) == 3] 170 | return unigram + bigram + trigram 171 | 172 | 173 | _nterm_str_map = { 174 | 1: "Uniterm", 175 | 2: "Biterm", 176 | 3: "Triterm", 177 | 4: "Fourterm", 178 | 5: "Fiveterm", 179 | } 180 | 181 | 182 | def _nterms(words, nterm, join_string=" "): 183 | """wrapper for nterm""" 184 | if nterm == 1: 185 | return _uniterms(words) 186 | elif nterm == 2: 187 | return _biterms(words, join_string) 188 | elif nterm == 3: 189 | return _triterms(words, join_string) 190 | elif nterm == 4: 191 | return _fourterms(words, join_string) 192 | 193 | 194 | if __name__ == "__main__": 195 | 196 | text = "I am Denny boy ha" 197 | words = text.split(" ") 198 | 199 | assert _ngrams(words, 1) == ["I", "am", "Denny", "boy", "ha"] 200 | assert _ngrams(words, 2) == ["I am", "am Denny", "Denny boy", "boy ha"] 201 | assert _ngrams(words, 3) == ["I am Denny", "am Denny boy", "Denny boy ha"] 202 | assert _ngrams(words, 4) == ["I am Denny boy", "am Denny boy ha"] 203 | 204 | assert _nterms(words, 1) == ["I", "am", "Denny", "boy", "ha"] 205 | assert _nterms(words, 2) == ["I am", "I Denny", "I boy", "I ha", "am Denny", "am boy", "am ha", "Denny boy", "Denny ha", "boy ha"] 206 | assert _nterms(words, 3) == ["I am Denny", "I am boy", "I am ha", "I Denny boy", "I Denny ha", "I boy ha", "am Denny boy", "am Denny ha", "am boy ha", "Denny boy ha"] 207 | assert _nterms(words, 4) == ["I am Denny boy", "I am Denny ha", "I am boy ha", "I Denny boy ha", "am Denny boy ha"] 208 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/nlp_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for nlp 5 | 6 | """ 7 | 8 | import re 9 | 10 | 11 | def _tokenize(text, token_pattern=" "): 12 | # token_pattern = r"(?u)\b\w\w+\b" 13 | # token_pattern = r"\w{1,}" 14 | # token_pattern = r"\w+" 15 | # token_pattern = r"[\w']+" 16 | if token_pattern == " ": 17 | # just split the text into tokens 18 | return text.split(" ") 19 | else: 20 | token_pattern = re.compile(token_pattern, flags = re.UNICODE | re.LOCALE) 21 | group = token_pattern.findall(text) 22 | return group 
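# --- Illustrative usage of _tokenize (editorial sketch, not part of the original file) ---
# With the default token_pattern the text is simply split on single spaces; passing a
# regex pattern instead routes through re.findall. A hypothetical quick check:
if __name__ == "__main__":
    assert _tokenize("angle bracket 3 in.") == ["angle", "bracket", "3", "in."]
    # e.g. _tokenize("angle bracket 3 in.", r"\w+") would instead yield
    # ["angle", "bracket", "3", "in"] (regex-based tokenization drops punctuation)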
23 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/np_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for numpy 5 | 6 | """ 7 | 8 | import sys 9 | 10 | import numpy as np 11 | from scipy.stats import pearsonr 12 | from collections import Counter 13 | 14 | sys.path.append("..") 15 | import config 16 | 17 | 18 | def _sigmoid(score): 19 | p = 1. / (1. + np.exp(-score)) 20 | return p 21 | 22 | 23 | def _logit(p): 24 | return np.log(p/(1.-p)) 25 | 26 | 27 | def _softmax(score): 28 | score = np.asarray(score, dtype=float) 29 | score = np.exp(score - np.max(score)) 30 | score /= np.sum(score, axis=1)[:,np.newaxis] 31 | return score 32 | 33 | 34 | def _cast_proba_predict(proba): 35 | N = proba.shape[1] 36 | w = np.arange(1,N+1) 37 | pred = proba * w[np.newaxis,:] 38 | pred = np.sum(pred, axis=1) 39 | return pred 40 | 41 | 42 | def _one_hot_label(label, n_classes): 43 | num = label.shape[0] 44 | tmp = np.zeros((num, n_classes), dtype=int) 45 | tmp[np.arange(num),label.astype(int)] = 1 46 | return tmp 47 | 48 | 49 | def _majority_voting(x, weight=None): 50 | ## apply weight 51 | if weight is not None: 52 | assert len(weight) == len(x) 53 | x = np.repeat(x, weight) 54 | c = Counter(x) 55 | value, count = c.most_common()[0] 56 | return value 57 | 58 | 59 | def _voter(x, weight=None): 60 | idx = np.isfinite(x) 61 | if sum(idx) == 0: 62 | value = config.MISSING_VALUE_NUMERIC 63 | else: 64 | if weight is not None: 65 | value = _majority_voting(x[idx], weight[idx]) 66 | else: 67 | value = _majority_voting(x[idx]) 68 | return value 69 | 70 | 71 | def _array_majority_voting(X, weight=None): 72 | y = np.apply_along_axis(_voter, axis=1, arr=X, weight=weight) 73 | return y 74 | 75 | 76 | def _mean(x): 77 | idx = np.isfinite(x) 78 | if sum(idx) == 0: 79 | value = float(config.MISSING_VALUE_NUMERIC) # cast it to float to accommodate the np.mean 80 | else: 81 | value = np.mean(x[idx]) # this is float! 82 | return value 83 | 84 | 85 | def _array_mean(X): 86 | y = np.apply_along_axis(_mean, axis=1, arr=X) 87 | return y 88 | 89 | 90 | def _corr(x, y_train): 91 | if _dim(x) == 1: 92 | corr = pearsonr(x.flatten(), y_train)[0] 93 | if str(corr) == "nan": 94 | corr = 0. 95 | else: 96 | corr = 1. 
97 | return corr 98 | 99 | 100 | def _dim(x): 101 | d = 1 if len(x.shape) == 1 else x.shape[1] 102 | return d 103 | 104 | 105 | def _entropy(proba): 106 | entropy = -np.sum(proba*np.log(proba)) 107 | return entropy 108 | 109 | 110 | def _try_divide(x, y, val=0.0): 111 | """try to divide two numbers""" 112 | if y != 0.0: 113 | val = float(x) / y 114 | return val 115 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/os_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for os 5 | 6 | """ 7 | 8 | import os 9 | import time 10 | import shutil 11 | 12 | 13 | def _gen_signature(): 14 | # get pid and current time 15 | pid = int(os.getpid()) 16 | now = int(time.time()) 17 | # signature 18 | signature = "%d_%d" % (pid, now) 19 | return signature 20 | 21 | def _create_dirs(dirs): 22 | for dir in dirs: 23 | if not os.path.exists(dir): 24 | os.makedirs(dir) 25 | 26 | def _remove_files(files): 27 | for file in files: 28 | os.remove(file) 29 | 30 | def _remove_dirs(dirs): 31 | for dir in dirs: 32 | shutil.rmtree(dir) 33 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/pkl_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for pickle 5 | 6 | """ 7 | 8 | import pickle 9 | 10 | 11 | def _save(fname, data, protocol=3): 12 | with open(fname, "wb") as f: 13 | pickle.dump(data, f, protocol) 14 | 15 | def _load(fname): 16 | with open(fname, "rb") as f: 17 | return pickle.load(f) 18 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/rgf_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for RGF models 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | import numpy as np 12 | 13 | from . 
import os_utils 14 | sys.path.append("..") 15 | import config 16 | 17 | 18 | class RGFRegressor: 19 | def __init__(self, reg_L2=0.1, reg_sL2=0.0001, max_leaf_forest=10000, num_iteration_opt=10, 20 | num_tree_search=1, min_pop=10, opt_interval=100, opt_stepsize=0.5): 21 | 22 | self.param = { 23 | "reg_L2": reg_L2, 24 | "reg_sL2": reg_sL2, 25 | "max_leaf_forest": max_leaf_forest, 26 | "num_iteration_opt": num_iteration_opt, 27 | "num_tree_search": num_tree_search, 28 | "min_pop": min_pop, 29 | "opt_interval": opt_interval, 30 | "opt_stepsize": opt_stepsize, 31 | } 32 | 33 | # create tmp dir to hold data and model (especially the latter) 34 | self.tmp_dir = "%s/%s"%(config.TMP_DIR, os_utils._gen_signature()) 35 | os_utils._create_dirs([self.tmp_dir]) 36 | self.model_fn_prefix = "%s/rgf_model"%self.tmp_dir 37 | 38 | def __del__(self): 39 | ## delete tmp dir 40 | os_utils._remove_dirs([self.tmp_dir]) 41 | 42 | def __str__(self): 43 | return "RGFRegressor" 44 | 45 | def fit(self, X, y): 46 | 47 | # write train data to file 48 | train_x_fn = "%s/data.x"%self.tmp_dir 49 | train_y_fn = "%s/data.y"%self.tmp_dir 50 | np.savetxt(train_x_fn, X, fmt="%.6f", delimiter="\t") 51 | np.savetxt(train_y_fn, y, fmt="%.6f", delimiter="\t") 52 | 53 | ## write train param to file 54 | params = [ 55 | "train_x_fn=",train_x_fn,"\n", 56 | "train_y_fn=",train_y_fn,"\n", 57 | #"train_w_fn=",weight_train_path,"\n", 58 | "model_fn_prefix=",self.model_fn_prefix,"\n", 59 | "reg_L2=", self.param["reg_L2"], "\n", 60 | "reg_sL2=", self.param["reg_sL2"], "\n", 61 | #"reg_depth=", 1.01, "\n", 62 | "algorithm=","RGF","\n", 63 | "loss=","LS","\n", 64 | #"opt_interval=", 100, "\n", 65 | # save model at the end of training 66 | "test_interval=", self.param["max_leaf_forest"],"\n", 67 | "max_leaf_forest=", self.param["max_leaf_forest"],"\n", 68 | "num_iteration_opt=", self.param["num_iteration_opt"], "\n", 69 | "num_tree_search=", self.param["num_tree_search"], "\n", 70 | "min_pop=", self.param["min_pop"], "\n", 71 | "opt_interval=", self.param["opt_interval"], "\n", 72 | "opt_stepsize=", self.param["opt_stepsize"], "\n", 73 | "NormalizeTarget" 74 | ] 75 | params = "".join([str(p) for p in params]) 76 | 77 | rgf_setting = "%s/rgf_setting"%self.tmp_dir # DOES NOT contain ".inp" 78 | with open(rgf_setting+".inp", "w") as f: 79 | f.write(params) 80 | 81 | ## train rgf 82 | rgf_log = "%s/rgf_log"%self.tmp_dir 83 | cmd = "perl %s %s train %s >> %s"%( 84 | config.RGF_CALL_EXE, config.RGF_EXE, rgf_setting, rgf_log) 85 | os.system(cmd) 86 | 87 | return self 88 | 89 | def predict(self, X): 90 | 91 | ## write data to file 92 | valid_x_fn = "%s/data.x"%self.tmp_dir 93 | valid_y_fn = "%s/data.y"%self.tmp_dir 94 | np.savetxt(valid_x_fn, X, fmt="%.6f", delimiter="\t") 95 | 96 | ## write predict params to file 97 | model_fn = self.model_fn_prefix + "-01" 98 | params = [ 99 | "test_x_fn=", valid_x_fn,"\n", 100 | "model_fn=", model_fn,"\n", 101 | "prediction_fn=", valid_y_fn 102 | ] 103 | params = "".join([str(p) for p in params]) 104 | 105 | rgf_setting = "%s/rgf_setting"%self.tmp_dir 106 | with open(rgf_setting+".inp", "w") as f: 107 | f.write(params) 108 | 109 | ## predict 110 | rgf_log = "%s/rgf_log"%self.tmp_dir 111 | cmd = "perl %s %s predict %s >> %s"%( 112 | config.RGF_CALL_EXE, config.RGF_EXE, rgf_setting, rgf_log) 113 | os.system(cmd) 114 | 115 | y_pred = np.loadtxt(valid_y_fn, dtype=float) 116 | 117 | return y_pred 118 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/skl_utils.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for scikit-learn models 5 | 6 | """ 7 | 8 | import numpy as np 9 | import sklearn.svm 10 | import sklearn.neighbors 11 | import sklearn.ensemble 12 | from sklearn.linear_model import Ridge 13 | from sklearn.tree import DecisionTreeRegressor 14 | from sklearn.tree import ExtraTreeRegressor 15 | from sklearn.pipeline import Pipeline 16 | from sklearn.preprocessing import StandardScaler, PolynomialFeatures 17 | 18 | from . import dist_utils 19 | 20 | 21 | class SVR: 22 | def __init__(self, kernel='rbf', degree=3, gamma='auto', C=1.0, 23 | epsilon=0.1, normalize=True, cache_size=2048): 24 | svr = sklearn.svm.SVR(kernel=kernel, degree=degree, 25 | gamma=gamma, C=C, epsilon=epsilon) 26 | if normalize: 27 | self.model = Pipeline([('ss', StandardScaler()), ('svr', svr)]) 28 | else: 29 | self.model = svr 30 | 31 | def __str__(self): 32 | return "SVR" 33 | 34 | def fit(self, X, y): 35 | self.model.fit(X, y) 36 | return self 37 | 38 | def predict(self, X): 39 | y_pred = self.model.predict(X) 40 | return y_pred 41 | 42 | 43 | class LinearSVR: 44 | def __init__(self, epsilon=0.0, C=1.0, loss='epsilon_insensitive', 45 | random_state=None, normalize=True): 46 | lsvr = sklearn.svm.LinearSVR(epsilon=epsilon, C=C, 47 | loss=loss, random_state=random_state) 48 | if normalize: 49 | self.model = Pipeline([('ss', StandardScaler()), ('lsvr', lsvr)]) 50 | else: 51 | self.model = lsvr 52 | 53 | def __str__(self): 54 | return "LinearSVR" 55 | 56 | def fit(self, X, y): 57 | self.model.fit(X, y) 58 | return self 59 | 60 | def predict(self, X): 61 | y_pred = self.model.predict(X) 62 | return y_pred 63 | 64 | 65 | class KNNRegressor: 66 | def __init__(self, n_neighbors=5, weights='uniform', leaf_size=30, 67 | metric='minkowski', normalize=True): 68 | if metric == 'cosine': 69 | metric = lambda x,y: dist_utils._cosine_sim(x, y) 70 | knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, 71 | leaf_size=leaf_size, metric=metric) 72 | if normalize: 73 | self.model = Pipeline([('ss', StandardScaler()), ('knn', knn)]) 74 | else: 75 | self.model = knn 76 | 77 | def __str__(self): 78 | return "KNNRegressor" 79 | 80 | def fit(self, X, y): 81 | self.model.fit(X, y) 82 | return self 83 | 84 | def predict(self, X): 85 | y_pred = self.model.predict(X) 86 | return y_pred 87 | 88 | 89 | class AdaBoostRegressor: 90 | def __init__(self, base_estimator=None, n_estimators=50, max_features=1.0, 91 | max_depth=6, learning_rate=1.0, loss='linear', random_state=None): 92 | if base_estimator and base_estimator == 'etr': 93 | base_estimator = ExtraTreeRegressor(max_depth=max_depth, 94 | max_features=max_features) 95 | else: 96 | base_estimator = DecisionTreeRegressor(max_depth=max_depth, 97 | max_features=max_features) 98 | 99 | self.model = sklearn.ensemble.AdaBoostRegressor( 100 | base_estimator=base_estimator, 101 | n_estimators=n_estimators, 102 | learning_rate=learning_rate, 103 | random_state=random_state, 104 | loss=loss) 105 | 106 | def __str__(self): 107 | return "AdaBoostRegressor" 108 | 109 | def fit(self, X, y): 110 | self.model.fit(X, y) 111 | return self 112 | 113 | def predict(self, X): 114 | y_pred = self.model.predict(X) 115 | return y_pred 116 | 117 | 118 | class RandomRidge: 119 | def __init__(self, alpha=1.0, normalize=True, poly=False, 120 | n_estimators=10, max_features=1.0, 121 | bootstrap=True, subsample=1.0, 122 | 
random_state=2016): 123 | self.alpha = alpha 124 | self.normalize = normalize 125 | self.poly = poly 126 | self.n_estimators = n_estimators 127 | if isinstance(max_features, float): 128 | assert max_features > 0 and max_features <= 1 129 | self.max_features = max_features 130 | self.bootstrap = bootstrap 131 | assert subsample > 0 and subsample <= 1 132 | self.subsample = subsample 133 | self.random_state = random_state 134 | self.ridge_list = [0]*self.n_estimators 135 | self.feature_idx_list = [0]*self.n_estimators 136 | 137 | def __str__(self): 138 | return "RandomRidge" 139 | 140 | def _random_feature_idx(self, fdim, random_state): 141 | rng = np.random.RandomState(random_state) 142 | if isinstance(self.max_features, int): 143 | size = min(fdim, self.max_features) 144 | else: 145 | size = int(fdim * self.max_features) 146 | idx = rng.permutation(fdim)[:size] 147 | return idx 148 | 149 | def _random_sample_idx(self, sdim, random_state): 150 | rng = np.random.RandomState(random_state) 151 | size = int(sdim * self.subsample) 152 | if self.bootstrap: 153 | idx = rng.randint(sdim, size=size) 154 | else: 155 | idx = rng.permutation(sdim)[:size] 156 | return idx 157 | 158 | def fit(self, X, y): 159 | sdim, fdim = X.shape 160 | for i in range(self.n_estimators): 161 | ridge = Ridge(alpha=self.alpha, normalize=self.normalize, random_state=self.random_state) 162 | fidx = self._random_feature_idx(fdim, self.random_state+i*100) 163 | sidx = self._random_sample_idx(sdim, self.random_state+i*10) 164 | X_tmp = X[sidx][:,fidx] 165 | if self.poly: 166 | X_tmp = PolynomialFeatures(degree=2).fit_transform(X_tmp)[:,1:] 167 | ridge.fit(X_tmp, y[sidx]) 168 | self.ridge_list[i] = ridge 169 | self.feature_idx_list[i] = fidx 170 | return self 171 | 172 | def predict(self, X): 173 | y_pred = np.zeros((X.shape[0], self.n_estimators)) 174 | for i in range(self.n_estimators): 175 | fidx = self.feature_idx_list[i] 176 | ridge = self.ridge_list[i] 177 | X_tmp = X[:,fidx] 178 | if self.poly: 179 | X_tmp = PolynomialFeatures(degree=2).fit_transform(X_tmp)[:,1:] 180 | y_pred[:,i] = ridge.predict(X_tmp) 181 | y_pred = np.mean(y_pred, axis=1) 182 | return y_pred 183 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/time_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for time 5 | 6 | """ 7 | 8 | import datetime 9 | 10 | 11 | def _timestamp(): 12 | now = datetime.datetime.now() 13 | now_str = now.strftime("%Y-%m-%d-%H-%M") 14 | return now_str 15 | 16 | 17 | def _timestamp_pretty(): 18 | now = datetime.datetime.now() 19 | now_str = now.strftime("%Y%m%d%H%M") 20 | return now_str 21 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/config_IgorKostia.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This is config for HomeDepot Project: Igor&Kostia's part 4 | 5 | Competition: HomeDepot Search Relevance 6 | Author: Igor Buinyi 7 | Team: Turing test 8 | """ 9 | 10 | 11 | import os 12 | ROOT_DIR = os.getcwd() 13 | 14 | DATA_DIR= "%s/data"%ROOT_DIR 15 | PROCESSINGTEXT_DIR= "%s/processing_text"%ROOT_DIR 16 | FEATURES_DIR= "%s/features"%ROOT_DIR 17 | SAVEDMODELS_DIR= "%s/saved_models"%ROOT_DIR 18 | MODELS_DIR= "%s/models"%ROOT_DIR 19 | MODELSENSEMBLE_DIR= "%s/models_ensemble"%ROOT_DIR 20 | FEATURESETS_DIR="%s/feature_sets"%ROOT_DIR 
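# Note (editorial comment): the os.mkdir calls below create only the output folders.
# DATA_DIR and FEATURESETS_DIR are not created here and are assumed to exist already
# (input data and the shipped feature_sets folder, respectively).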
21 | 22 | 23 | if not os.path.exists(PROCESSINGTEXT_DIR): 24 | os.mkdir(PROCESSINGTEXT_DIR) 25 | if not os.path.exists(FEATURES_DIR): 26 | os.mkdir(FEATURES_DIR) 27 | if not os.path.exists(SAVEDMODELS_DIR): 28 | os.mkdir(SAVEDMODELS_DIR) 29 | if not os.path.exists(MODELS_DIR): 30 | os.mkdir(MODELS_DIR) 31 | if not os.path.exists(MODELSENSEMBLE_DIR): 32 | os.mkdir(MODELSENSEMBLE_DIR) 33 | 34 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/first_part_2000.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | wordFor_in_title_string_only_letratio,103 3 | two_words_in_description_string_only_sum,181 4 | pt_pd__unigram_dice_dist,488 5 | beforethekeys_pathsimilarity_max,310 6 | keyword_in_description_num,272 7 | beforethekey_before2thekey_lchsimilarity_max,372 8 | beforethekey_thekey_ressimilarity_max,338 9 | pt_pd__triterm_dice_dist,517 10 | beforethekeys_thekey_ressimilarity_max,350 11 | tfidf_vbg_in_description_let,416 12 | beforethekey_before2thekey_pathsimilarity_mean,371 13 | st_at__trigram_dice_dist,499 14 | nn_important_in_nn_unimportant_in_description_num,199 15 | size_of_brands_in_product_description,28 16 | pd_at_unigram_tfidf,587 17 | seqmatch_desc&bullets_ratioscaled,383 18 | st_tfidf_8.1,457 19 | st_pd__fourgram_dice_dist,504 20 | query_material_in_description_convoluted,99 21 | pd_at__bigram_jaccard_coef,530 22 | pt_at_unigram_tfidf,579 23 | tfidf_jj_rb_in_bullets_let,423 24 | vbg_in_vbg_in_title_sum,157 25 | no_bullets_dummy,2 26 | pt_ab_fourgram_tfidf,578 27 | st_pd__biterm_dice_dist,510 28 | st_pt_fourgram_tfidf,556 29 | thekey_before2thekey_ressimilarity_max,326 30 | before2thekey_thekey_pathsimilarity_max,340 31 | len_of_materials_in_query,21 32 | query_lchhsimilarity_max,354 33 | query_brand_in_brand_convoluted,75 34 | query_in_description,165 35 | tfidf_nn_unimportant_in_title_num,406 36 | word_in_description_string_only_let,173 37 | len_of_brands_in_product_title,23 38 | st_at__bigram_jaccard_coef,527 39 | beforethekey_thekey_pathsimilarity_max,334 40 | thekeys_lchsimilarity_mean,307 41 | common_digits_in_description_jaccard,186 42 | perc_digits_in_description,69 43 | title_pathsimilarity_mean,359 44 | word_in_description_string_only_num,171 45 | st_at__fourgram_dice_dist,505 46 | st_tfidf_7.1,456 47 | query_brand_in_all_nomatch,79 48 | jj_rb_in_jj_rb_in_title_sum,154 49 | pt_pd_fourgram_tfidf,574 50 | description_similarity_11-20,427 51 | query_brand_in_all_fullmatch,76 52 | vbg_in_vbg_in_description_sum,211 53 | tfidf_vbg_in_bullets_let,417 54 | st_pd__triterm_dice_dist,515 55 | st_at__trigram_jaccard_coef,533 56 | seqmatch_title_ratioscaled,377 57 | query_brand_in_all_convoluted,80 58 | len_of_attribute_bullets_woBM,18 59 | st_tfidf_7,444 60 | tfidf_nn_important_in_title_num,400 61 | nn_important_in_title_let,135 62 | two_words_in_bullets_num,231 63 | st_at__triterm_jaccard_coef,550 64 | description_similarity_11-20to10,435 65 | tfidf_nn_important_in_bullets_let,405 66 | tfidf_jj_rb_in_title_num,418 67 | two_words_in_description_let,179 68 | nn_important_in_nn_important_in_title_letratio,144 69 | len_of_product_title_woBM,16 70 | query_pathsimilarity_max,352 71 | vbg_in_vbg_in_bullets_sum,265 72 | st_tfidf_5.1,454 73 | two_words_in_description_string_only_num,180 74 | query_brand_in_title_fullmatch,81 75 | pt_pd__bigram_jaccard_coef,528 76 | wordWith_in_bullets_string_only_letratio,218 77 | pd_at__trigram_dice_dist,502 78 | 
two_words_in_bullets_let,233 79 | st_at__biterm_jaccard_coef,545 80 | st_tfidf_1,438 81 | len_of_query_woBM,9 82 | beforethekey_in_bullets_sum,293 83 | wordWith_in_title_string_only_num,104 84 | size_of_brands_in_product_title,24 85 | beforethekey_before2thekey_ressimilarity_mean,375 86 | nn_unimportant_in_title_num,138 87 | thekey_before2thekey_pathsimilarity_mean,323 88 | nn_important_in_nn_important_in_bullets_letratio,252 89 | beforethekeys_ressimilarity_mean,315 90 | seqmatch_desc&bullets_ratio,382 91 | len_of_product_title_keys,40 92 | thekey_in_bullets_sum,289 93 | word_in_description_numratio,169 94 | tfidf_title_querybeforethekey_num,397 95 | nn_important_in_nn_important_in_bullets_let,251 96 | query_brand_in_bullets_convoluted,87 97 | pd_at__fourgram_dice_dist,508 98 | nn_important_in_nn_important_in_title_sum,142 99 | beforethekeys_thekey_lchsimilarity_max,348 100 | word_in_bullets_let,222 101 | st_tfidf_6.1,455 102 | ratio_of_nn_unimportant_in_search_term,48 103 | keyword_in_titlekeys_jaclet,283 104 | query_brand_in_brand_partialmatch,72 105 | st_pt__fourgram_jaccard_coef,537 106 | ratio_of_nn_unimportant_in_attribute_bullets,63 107 | st_at__unigram_dice_dist,487 108 | word_in_bullets_string_only_num,225 109 | beforethekey_in_bullets_let,294 110 | beforethekey_thekey_pathsimilarity_mean,335 111 | nn_unimportant_in_nn_important_in_description_num,203 112 | word_in_description_string_only_letratio,175 113 | before2thekey_beforethekey_lchsimilarity_max,366 114 | thekeys_in_title,284 115 | wordFor_in_bullets_string_only_letratio,215 116 | len_of_product_title_thekey,42 117 | initial_len_of_query,8 118 | word2vec_13,474 119 | vbg_in_vbg_in_title_let,158 120 | pt_ab_bigram_tfidf,576 121 | query_brand_in_description_convoluted,86 122 | avg_wordlength_in_query,6 123 | tfidf_nn_unimportant_in_title_let,409 124 | seqmatch_bullets_ratio,380 125 | query_material_in_all_fullmatch,93 126 | jj_rb_in_jj_rb_in_bullets_let,263 127 | word_in_bullets_string_only_letratio,229 128 | st_tfidf_9,446 129 | len_of_query_keys,39 130 | beforethekeys_thekey_pathsimilarity_max,346 131 | nn_unimportant_in_description_letratio,194 132 | thekey_before2thekey_pathsimilarity_max,322 133 | word2vec_22,483 134 | pd_ab_unigram_tfidf,583 135 | word2vec_6,467 136 | pd_at_trigram_tfidf,589 137 | st_pd__bigram_jaccard_coef,526 138 | word2vec_17,478 139 | st_pd__biterm_jaccard_coef,544 140 | word_in_bullets_num,220 141 | query_material_in_all_assumedmatch,95 142 | keyword_in_bullets_num,277 143 | pt_pd__trigram_dice_dist,500 144 | word_in_description_let,168 145 | size_of_brands_in_attribute_bullets,32 146 | size_of_materials_in_query,22 147 | pt_pd__biterm_jaccard_coef,546 148 | nn_unimportant_in_nn_important_in_description_sum,204 149 | nn_unimportant_in_title_letratio,140 150 | pt_at__triterm_jaccard_coef,552 151 | jj_rb_in_jj_rb_in_description_let,209 152 | query_brand_in_title_partialmatch,82 153 | pt_at__triterm_dice_dist,518 154 | st_pt__fourgram_dice_dist,503 155 | len_of_query_beforethekey,43 156 | word_in_title_string_only_letratio,120 157 | jj_rb_in_jj_rb_in_title_let,155 158 | jj_rb_in_jj_rb_in_description_num,207 159 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/first_part_3000.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | pt_pd__bigram_jaccard_coef,346 3 | description_similarity_20,10 4 | beforethekeys_in_beforethekeys,357 5 | st_tfidf_0.1,74 6 | 
query_brand_in_bullets_convoluted,330 7 | thekey_beforethekey_pathsimilarity_mean,107 8 | common_digits_in_title_jaccard,53 9 | st_pd_unigram_tfidf,166 10 | 1word_string_dld_in_pts,358 11 | tfidf_nn_unimportant_in_title_let,278 12 | st_tfidf_3.1,58 13 | ratio_of_jj_rb_in_product_description,151 14 | len_of_product_title,344 15 | pt_at__trigram_dice_dist,320 16 | word2vec_11,8 17 | st_at__unigram_jaccard_coef,304 18 | nn_unimportant_in_description_let,342 19 | pt_pd__unigram_jaccard_coef,246 20 | word2vec_22,4 21 | nn_unimportant_in_title_let,367 22 | st_tfidf_5.1,98 23 | thekey_in_bullets_let,185 24 | word_in_bullets_numratio,321 25 | 1word_dld_in_pt,324 26 | word2vec_1,40 27 | tfidf_description_num,68 28 | tfidf_matchdescription_stringonly_num,249 29 | ratio_of_jj_rb_in_attribute_bullets,234 30 | nn_important_in_title_letratio,145 31 | seqmatch_description_ratioscaled,81 32 | st_tfidf_10,67 33 | st_pd__trigram_dice_dist,350 34 | 2word_string_dld_in_pds,385 35 | tfidf_description_let,80 36 | nn_important_in_nn_important_in_bullets_letratio,438 37 | keyword_in_bullets_letratio,189 38 | nn_unimportant_in_bullets_let,373 39 | tfidf_title_num,65 40 | wordFor_in_title_string_only_let,265 41 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/first_part_3010.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | pt_at__bigram_dice_dist,91 3 | pt_at__triterm_jaccard_coef,148 4 | st_at__bigram_dice_dist,89 5 | word2vec_16,73 6 | word2vec_5,62 7 | ab_at_fourgram_tfidf,190 8 | st_pt__trigram_jaccard_coef,127 9 | pd_ab_unigram_tfidf,179 10 | st_tfidf_11,44 11 | st_ab_bigram_tfidf,160 12 | st_pd__trigram_jaccard_coef,128 13 | st_pt_triterm_tfidf,154 14 | 1word_dld_in_pt,0 15 | 1word_string_dld_in_pt,2 16 | pd_ab_trigram_tfidf,181 17 | 1word_string_dld_in_pts,3 18 | above15_dummy_frequency_of_beforethekey_thekey,191 19 | st_ab_unigram_tfidf,159 20 | st_tfidf_0,33 21 | st_pt__triterm_dice_dist,110 22 | pd_at_trigram_tfidf,185 23 | word2vec_2,59 24 | st_tfidf_9.1,54 25 | word2vec_22,79 26 | word2vec_9,66 27 | 1word_dld_in_at,24 28 | pd_at_unigram_tfidf,183 29 | pt_at_trigram_tfidf,177 30 | st_pt_bigram_tfidf,150 31 | word2vec_1,58 32 | pt_pd_bigram_tfidf,168 33 | st_pt__triterm_jaccard_coef,144 34 | pd_at__unigram_jaccard_coef,120 35 | word2vec_23,80 36 | pt_pd__triterm_jaccard_coef,147 37 | st_pt__biterm_dice_dist,105 38 | st_tfidf_2,35 39 | st_tfidf_3.1,48 40 | word2vec_11,68 41 | 2word_dld_in_pt,4 42 | pt_at__triterm_dice_dist,114 43 | pd_at__unigram_dice_dist,86 44 | pd_at__bigram_jaccard_coef,126 45 | pt_ab_bigram_tfidf,172 46 | pd_at_fourgram_tfidf,186 47 | st_pt_fourgram_tfidf,152 48 | pt_pd__fourgram_dice_dist,102 49 | st_tfidf_1,34 50 | 2word_string_dld_in_ab,22 51 | pt_at__bigram_jaccard_coef,125 52 | st_tfidf_4,37 53 | st_pd__unigram_dice_dist,82 54 | 2word_dld_in_abs,21 55 | pt_at_bigram_tfidf,176 56 | st_at__fourgram_jaccard_coef,135 57 | pt_pd_fourgram_tfidf,170 58 | st_tfidf_10.1,55 59 | st_pd__bigram_jaccard_coef,122 60 | 1word_string_dld_in_pd,10 61 | st_at__triterm_jaccard_coef,146 62 | pt_pd__fourgram_jaccard_coef,136 63 | 2word_dld_in_pds,13 64 | pt_pd__trigram_dice_dist,96 65 | word2vec_3,60 66 | st_at__triterm_dice_dist,112 67 | st_pd__fourgram_jaccard_coef,134 68 | 2word_dld_in_at,28 69 | st_tfidf_11.1,56 70 | 1word_string_dld_in_abs,19 71 | pd_at__bigram_dice_dist,92 72 | 2word_string_dld_in_pd,14 73 | st_pt__biterm_jaccard_coef,139 74 | 
st_pt__bigram_jaccard_coef,121 75 | 1word_string_dld_in_at,26 76 | st_pt__bigram_dice_dist,87 77 | st_at__biterm_jaccard_coef,141 78 | 1word_dld_in_pts,1 79 | 2word_dld_in_ab,20 80 | st_pd_unigram_tfidf,155 81 | pt_at__fourgram_jaccard_coef,137 82 | st_at_trigram_tfidf,165 83 | pd_at__trigram_jaccard_coef,132 84 | 2word_string_dld_in_pds,15 85 | pt_pd__unigram_dice_dist,84 86 | st_pt__unigram_dice_dist,81 87 | word2vec_18,75 88 | pt_at__fourgram_dice_dist,103 89 | st_at__fourgram_dice_dist,101 90 | st_tfidf_2.1,47 91 | word2vec_17,74 92 | 1word_dld_in_ats,25 93 | ab_at_bigram_tfidf,188 94 | 2word_string_dld_in_pt,6 95 | ab_at_trigram_tfidf,189 96 | st_tfidf_7,40 97 | 2word_string_dld_in_ats,31 98 | 2word_string_dld_in_pts,7 99 | st_at_unigram_tfidf,163 100 | 2word_dld_in_ats,29 101 | st_pt_trigram_tfidf,151 102 | st_pt__fourgram_dice_dist,99 103 | st_pt__trigram_dice_dist,93 104 | st_at__unigram_jaccard_coef,117 105 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/first_part_3020.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | word_in_description_string_only_letratio,56 3 | query_brand_in_title_convoluted,57 4 | st_pt_trigram_tfidf,197 5 | nn_important_in_title_let,190 6 | above8_dummy_frequency_of_beforethekey_thekey,179 7 | beforethekeys_pathsimilarity_max,74 8 | word2vec_7,77 9 | word2vec_12,189 10 | 2word_dld_in_pts,167 11 | nn_important_in_nn_unimportant_in_description_num,51 12 | word2vec_21,75 13 | st_pt__biterm_jaccard_coef,63 14 | description_similarity_21-30rel,192 15 | nn_important_in_nn_important_in_title_num,133 16 | nn_unimportant_in_nn_important_in_description_letratio,27 17 | 1word_dld_in_pds,120 18 | query_brand_in_all_fullmatch,65 19 | query_brand_in_title_fullmatch,21 20 | st_tfidf_6.1,98 21 | two_words_in_description_num,156 22 | nn_unimportant_in_title_letratio,93 23 | tfidf_title_querythekey_num,101 24 | tfidf_matchtitle_num,124 25 | len_of_brands_in_query,12 26 | 2word_string_dld_in_pt,11 27 | nn_unimportant_in_nn_important_in_description_num,201 28 | keyword_in_titlekeys_let,135 29 | word_in_title_string_only_sum,114 30 | nn_important_in_nn_important_in_title_letratio,187 31 | description_similarity_10,66 32 | query_brand_in_all_convoluted,155 33 | two_words_in_description_sum,151 34 | 2word_dld_in_pt,142 35 | beforethekeys_lchsimilarity_max,157 36 | beforethekeys_lchsimilarity_mean,158 37 | two_words_in_description_string_only_sum,122 38 | nn_important_in_nn_important_in_description_letratio,173 39 | seqmatch_description_ratioscaled,53 40 | word_in_title_string_only_let,193 41 | thekey_in_thekey,175 42 | 2word_string_dld_in_ats,137 43 | st_pd_unigram_tfidf,61 44 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/readme.txt: -------------------------------------------------------------------------------- 1 | This folder contains feature sets that are necessaey to reproduce our calculations of `Ensemble_B` in Step IK5 (see readme.md in the root folder). 
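Each CSV in this folder has two columns, feature_name and number, listing the features selected for the corresponding feature set.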
-------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/second_part_2000.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | wordFor_in_title_string_only_letratio,103 3 | two_words_in_description_string_only_sum,181 4 | pt_pd__unigram_dice_dist,488 5 | beforethekeys_pathsimilarity_max,310 6 | keyword_in_description_num,272 7 | beforethekey_before2thekey_lchsimilarity_max,372 8 | beforethekey_thekey_ressimilarity_max,338 9 | pt_pd__triterm_dice_dist,517 10 | beforethekeys_thekey_ressimilarity_max,350 11 | tfidf_vbg_in_description_let,416 12 | beforethekey_before2thekey_pathsimilarity_mean,371 13 | st_at__trigram_dice_dist,499 14 | nn_important_in_nn_unimportant_in_description_num,199 15 | size_of_brands_in_product_description,28 16 | pd_at_unigram_tfidf,587 17 | seqmatch_desc&bullets_ratioscaled,383 18 | st_tfidf_8.1,457 19 | st_pd__fourgram_dice_dist,504 20 | query_material_in_description_convoluted,99 21 | pd_at__bigram_jaccard_coef,530 22 | pt_at_unigram_tfidf,579 23 | tfidf_jj_rb_in_bullets_let,423 24 | vbg_in_vbg_in_title_sum,157 25 | no_bullets_dummy,2 26 | pt_ab_fourgram_tfidf,578 27 | st_pd__biterm_dice_dist,510 28 | st_pt_fourgram_tfidf,556 29 | thekey_before2thekey_ressimilarity_max,326 30 | before2thekey_thekey_pathsimilarity_max,340 31 | len_of_materials_in_query,21 32 | query_lchhsimilarity_max,354 33 | query_brand_in_brand_convoluted,75 34 | query_in_description,165 35 | tfidf_nn_unimportant_in_title_num,406 36 | word_in_description_string_only_let,173 37 | len_of_brands_in_product_title,23 38 | st_at__bigram_jaccard_coef,527 39 | beforethekey_thekey_pathsimilarity_max,334 40 | thekeys_lchsimilarity_mean,307 41 | common_digits_in_description_jaccard,186 42 | perc_digits_in_description,69 43 | title_pathsimilarity_mean,359 44 | word_in_description_string_only_num,171 45 | st_at__fourgram_dice_dist,505 46 | st_tfidf_7.1,456 47 | query_brand_in_all_nomatch,79 48 | jj_rb_in_jj_rb_in_title_sum,154 49 | pt_pd_fourgram_tfidf,574 50 | description_similarity_11-20,427 51 | query_brand_in_all_fullmatch,76 52 | vbg_in_vbg_in_description_sum,211 53 | tfidf_vbg_in_bullets_let,417 54 | st_pd__triterm_dice_dist,515 55 | st_at__trigram_jaccard_coef,533 56 | seqmatch_title_ratioscaled,377 57 | query_brand_in_all_convoluted,80 58 | len_of_attribute_bullets_woBM,18 59 | st_tfidf_7,444 60 | tfidf_nn_important_in_title_num,400 61 | nn_important_in_title_let,135 62 | two_words_in_bullets_num,231 63 | st_at__triterm_jaccard_coef,550 64 | description_similarity_11-20to10,435 65 | tfidf_nn_important_in_bullets_let,405 66 | tfidf_jj_rb_in_title_num,418 67 | two_words_in_description_let,179 68 | nn_important_in_nn_important_in_title_letratio,144 69 | len_of_product_title_woBM,16 70 | query_pathsimilarity_max,352 71 | vbg_in_vbg_in_bullets_sum,265 72 | st_tfidf_5.1,454 73 | two_words_in_description_string_only_num,180 74 | query_brand_in_title_fullmatch,81 75 | pt_pd__bigram_jaccard_coef,528 76 | wordWith_in_bullets_string_only_letratio,218 77 | pd_at__trigram_dice_dist,502 78 | two_words_in_bullets_let,233 79 | st_at__biterm_jaccard_coef,545 80 | st_tfidf_1,438 81 | len_of_query_woBM,9 82 | beforethekey_in_bullets_sum,293 83 | wordWith_in_title_string_only_num,104 84 | size_of_brands_in_product_title,24 85 | beforethekey_before2thekey_ressimilarity_mean,375 86 | nn_unimportant_in_title_num,138 87 | thekey_before2thekey_pathsimilarity_mean,323 88 | 
nn_important_in_nn_important_in_bullets_letratio,252 89 | beforethekeys_ressimilarity_mean,315 90 | seqmatch_desc&bullets_ratio,382 91 | len_of_product_title_keys,40 92 | thekey_in_bullets_sum,289 93 | word_in_description_numratio,169 94 | tfidf_title_querybeforethekey_num,397 95 | nn_important_in_nn_important_in_bullets_let,251 96 | query_brand_in_bullets_convoluted,87 97 | pd_at__fourgram_dice_dist,508 98 | nn_important_in_nn_important_in_title_sum,142 99 | beforethekeys_thekey_lchsimilarity_max,348 100 | word_in_bullets_let,222 101 | st_tfidf_6.1,455 102 | ratio_of_nn_unimportant_in_search_term,48 103 | keyword_in_titlekeys_jaclet,283 104 | query_brand_in_brand_partialmatch,72 105 | st_pt__fourgram_jaccard_coef,537 106 | ratio_of_nn_unimportant_in_attribute_bullets,63 107 | st_at__unigram_dice_dist,487 108 | word_in_bullets_string_only_num,225 109 | beforethekey_in_bullets_let,294 110 | beforethekey_thekey_pathsimilarity_mean,335 111 | nn_unimportant_in_nn_important_in_description_num,203 112 | word_in_description_string_only_letratio,175 113 | before2thekey_beforethekey_lchsimilarity_max,366 114 | thekeys_in_title,284 115 | wordFor_in_bullets_string_only_letratio,215 116 | len_of_product_title_thekey,42 117 | initial_len_of_query,8 118 | word2vec_13,474 119 | vbg_in_vbg_in_title_let,158 120 | pt_ab_bigram_tfidf,576 121 | query_brand_in_description_convoluted,86 122 | avg_wordlength_in_query,6 123 | tfidf_nn_unimportant_in_title_let,409 124 | seqmatch_bullets_ratio,380 125 | query_material_in_all_fullmatch,93 126 | jj_rb_in_jj_rb_in_bullets_let,263 127 | word_in_bullets_string_only_letratio,229 128 | st_tfidf_9,446 129 | len_of_query_keys,39 130 | beforethekeys_thekey_pathsimilarity_max,346 131 | nn_unimportant_in_description_letratio,194 132 | thekey_before2thekey_pathsimilarity_max,322 133 | word2vec_22,483 134 | pd_ab_unigram_tfidf,583 135 | word2vec_6,467 136 | pd_at_trigram_tfidf,589 137 | st_pd__bigram_jaccard_coef,526 138 | word2vec_17,478 139 | st_pd__biterm_jaccard_coef,544 140 | word_in_bullets_num,220 141 | query_material_in_all_assumedmatch,95 142 | keyword_in_bullets_num,277 143 | pt_pd__trigram_dice_dist,500 144 | word_in_description_let,168 145 | size_of_brands_in_attribute_bullets,32 146 | size_of_materials_in_query,22 147 | pt_pd__biterm_jaccard_coef,546 148 | nn_unimportant_in_nn_important_in_description_sum,204 149 | nn_unimportant_in_title_letratio,140 150 | pt_at__triterm_jaccard_coef,552 151 | jj_rb_in_jj_rb_in_description_let,209 152 | query_brand_in_title_partialmatch,82 153 | pt_at__triterm_dice_dist,518 154 | st_pt__fourgram_dice_dist,503 155 | len_of_query_beforethekey,43 156 | word_in_title_string_only_letratio,120 157 | jj_rb_in_jj_rb_in_title_let,155 158 | jj_rb_in_jj_rb_in_description_num,207 159 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/second_part_3000.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | pt_pd__bigram_jaccard_coef,346 3 | description_similarity_20,10 4 | beforethekeys_in_beforethekeys,357 5 | st_tfidf_0.1,74 6 | query_brand_in_bullets_convoluted,330 7 | thekey_beforethekey_pathsimilarity_mean,107 8 | common_digits_in_title_jaccard,53 9 | st_pd_unigram_tfidf,166 10 | 1word_string_dld_in_pts,358 11 | tfidf_nn_unimportant_in_title_let,278 12 | st_tfidf_3.1,58 13 | ratio_of_jj_rb_in_product_description,151 14 | len_of_product_title,344 15 | pt_at__trigram_dice_dist,320 16 | word2vec_11,8 17 | 
st_at__unigram_jaccard_coef,304 18 | nn_unimportant_in_description_let,342 19 | pt_pd__unigram_jaccard_coef,246 20 | word2vec_22,4 21 | nn_unimportant_in_title_let,367 22 | st_tfidf_5.1,98 23 | thekey_in_bullets_let,185 24 | word_in_bullets_numratio,321 25 | 1word_dld_in_pt,324 26 | word2vec_1,40 27 | tfidf_description_num,68 28 | tfidf_matchdescription_stringonly_num,249 29 | ratio_of_jj_rb_in_attribute_bullets,234 30 | nn_important_in_title_letratio,145 31 | seqmatch_description_ratioscaled,81 32 | st_tfidf_10,67 33 | st_pd__trigram_dice_dist,350 34 | 2word_string_dld_in_pds,385 35 | tfidf_description_let,80 36 | nn_important_in_nn_important_in_bullets_letratio,438 37 | keyword_in_bullets_letratio,189 38 | nn_unimportant_in_bullets_let,373 39 | tfidf_title_num,65 40 | wordFor_in_title_string_only_let,265 41 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/generate_feature_importances.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | The file to generated feature importances from the benchmark Gradient Boost model: 4 | separately for dummies and all other features. 5 | 6 | Competition: HomeDepot Search Relevance 7 | Author: Igor Buinyi 8 | Team: Turing test 9 | """ 10 | 11 | from config_IgorKostia import * 12 | 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn.ensemble import GradientBoostingRegressor 16 | from time import time 17 | 18 | 19 | # get num_tain 20 | df_train = pd.read_csv(DATA_DIR+'/train.csv', encoding="ISO-8859-1") 21 | num_train = df_train.shape[0] #number of observations 22 | 23 | # load features 24 | df_all = pd.read_csv(FEATURES_DIR+'/df_basic_features.csv', encoding="utf-8") 25 | df_dist = pd.read_csv(FEATURES_DIR+'/df_dist_new.csv', encoding="utf-8") 26 | df_st_tfidf= pd.read_csv(FEATURES_DIR+'/df_st_tfidf.csv', encoding="utf-8") 27 | if 'Unnamed: 0' in df_st_tfidf.keys(): 28 | df_st_tfidf = df_st_tfidf.drop(['Unnamed: 0'],axis=1) 29 | df_tfidf_intersect = pd.read_csv(FEATURES_DIR+'/df_tfidf_intersept_new.csv', encoding="utf-8") 30 | df_word2vec = pd.read_csv(FEATURES_DIR+'/df_word2vec_new.csv', encoding="utf-8") 31 | df_dld = pd.read_csv(FEATURES_DIR+'/dld_features.csv', encoding="utf-8") 32 | 33 | """ 34 | the following features and files were added later 35 | so this is the adjustment in order to reproduce the same results 36 | """ 37 | df_above15 = pd.read_csv(FEATURES_DIR+'/df_feature_above15_ext.csv', encoding="utf-8") 38 | df_above15 = df_above15[['id','above15_dummy_frequency_of_beforethekey_thekey']] 39 | df_all = pd.merge(df_all, df_above15, how='left', on='id') 40 | 41 | # merge 42 | df_all = pd.merge(df_all, df_dist, how='left', on='id') 43 | df_all = pd.merge(df_all, df_st_tfidf, how='left', on='id') 44 | df_all = pd.merge(df_all, df_tfidf_intersect, how='left', on='id') 45 | df_all = pd.merge(df_all, df_word2vec, how='left', on='id') 46 | df_all = pd.merge(df_all, df_dld, how='left', on='id') 47 | 48 | 49 | # drop product_uid and some vars 50 | drop_list=['product_uid'] 51 | drop_list+=['description_similarity_10', 'description_similarity_11-20', 'description_similarity_30', 52 | 'description_similarity_21-30', 'description_similarity_10rel', 'description_similarity_11-20rel', 53 | 'description_similarity_30rel', 'description_similarity_21-30rel', 'description_similarity_21-30to10', 54 | 'word_in_title_string_only_num', 'word_in_title_string_only_sum', 'word_in_title_string_only_let'] 55 | 56 | 57 | print 
len(df_all.keys()) 58 | new_drop_list=[] 59 | for var in drop_list: 60 | if var in df_all.keys(): 61 | new_drop_list.append(var) 62 | df_all=df_all.drop(new_drop_list,axis=1) 63 | print len(df_all.keys()) 64 | 65 | 66 | # generate matrices to be used in clf 67 | df_train = df_all.iloc[:num_train] 68 | df_test = df_all.iloc[num_train:] 69 | id_test = df_test['id'] 70 | id_train = df_train['id'] 71 | 72 | y_train = df_train['relevance'].values 73 | X_train = df_train.drop(['id','relevance'],axis=1).values 74 | X_test = df_test.drop(['id','relevance'],axis=1).values 75 | 76 | 77 | ######################################################################### 78 | ##### use GradientBoostingRegressor to generate feature importances 79 | t0 = time() 80 | params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 1, 'min_samples_leaf':15, 'learning_rate': 0.035, 'loss': 'ls', 'verbose':1} 81 | clf = GradientBoostingRegressor(**params) 82 | 83 | clf.fit(X_train, y_train) 84 | 85 | y_pred = clf.predict(X_test) 86 | y_pred[y_pred<1.]=1. 87 | y_pred[y_pred>3.]=3. 88 | 89 | 90 | pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv(MODELS_DIR+'/submission_benchmark_without_dummies.csv',index=False) 91 | sorted_idx = np.argsort(clf.feature_importances_) 92 | pd.DataFrame({"name":df_all.keys().drop(['id','relevance'])[sorted_idx], "importance": clf.feature_importances_[sorted_idx]}).to_csv(MODELS_DIR+'/feature_importances_benchmark_without_dummies.csv',index=False) 93 | 94 | print "file saved" 95 | print 'modelling time:',round((time()-t0)/60,1) ,'minutes\n' 96 | t0 = time() 97 | 98 | 99 | #### load feature importances from file 100 | df_importance = pd.read_csv(MODELS_DIR+'/feature_importances_benchmark_without_dummies.csv', encoding="utf-8") 101 | df_importance=df_importance.sort_values(['importance'],ascending=[0]) 102 | df_importance['cumulative']=df_importance['importance'].map(lambda x: sum(df_importance['importance'][df_importance['importance']>=x])) 103 | var_list=list(df_importance['name'][df_importance['cumulative']<0.990]) 104 | 105 | 106 | # use only 40 vars in the next step 107 | df_all=df_all[['id','relevance']+var_list[0:40]] 108 | 109 | 110 | # load dummies 111 | df_bm_dummy = pd.read_csv(FEATURES_DIR+'/df_brand_material_dummies.csv', encoding="utf-8") 112 | df_thekey_dummy = pd.read_csv(FEATURES_DIR+'/df_thekey_dummies.csv', encoding="utf-8") 113 | df_all = pd.merge(df_all, df_bm_dummy, how='left', on='id') 114 | df_all = pd.merge(df_all, df_thekey_dummy, how='left', on='id') 115 | 116 | # generate matrices to be used in clf 117 | df_train = df_all.iloc[:num_train] 118 | df_test = df_all.iloc[num_train:] 119 | id_test = df_test['id'] 120 | id_train = df_train['id'] 121 | 122 | y_train = df_train['relevance'].values 123 | X_train = df_train.drop(['id','relevance'],axis=1).values 124 | X_test = df_test.drop(['id','relevance'],axis=1).values 125 | 126 | ################################################################################# 127 | ##### use GradientBoostingRegressor to generate feature importances for dummies 128 | t0 = time() 129 | params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 1, 'min_samples_leaf':15, 'learning_rate': 0.035, 'loss': 'ls', 'verbose':1} 130 | clf = GradientBoostingRegressor(**params) 131 | 132 | clf.fit(X_train, y_train) 133 | 134 | y_pred = clf.predict(X_test) 135 | y_pred[y_pred<1.]=1. 136 | y_pred[y_pred>3.]=3. 
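# (editorial comment) predictions above are clipped to the valid relevance range [1, 3];
# the lines below write the submission file and the feature importances sorted via np.argsort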
137 | 138 | 139 | pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv(MODELS_DIR+'/submission_benchmark_top40_and_dummies.csv',index=False) 140 | sorted_idx = np.argsort(clf.feature_importances_) 141 | pd.DataFrame({"name":df_all.keys().drop(['id','relevance'])[sorted_idx], "importance": clf.feature_importances_[sorted_idx]}).to_csv(MODELS_DIR+'/feature_importances_benchmark_top40_and_dummies.csv',index=False) 142 | 143 | print "file saved" 144 | print 'modelling time:',round((time()-t0)/60,1) ,'minutes\n' 145 | t0 = time() 146 | 147 | -------------------------------------------------------------------------------- /Data/dict/color_data.py: -------------------------------------------------------------------------------- 1 | # https://www.kaggle.com/c/home-depot-product-search-relevance/forums/t/18967/data-preparation 2 | 3 | COLOR_LIST = [ 4 | "white", 5 | "black", 6 | "brown", 7 | "gray", 8 | "chrome", 9 | "stainless steel", 10 | "whites", 11 | "red", 12 | "browns / tans", 13 | "bronze", 14 | "silver", 15 | "blacks", 16 | "beige", 17 | "stainless", 18 | "blue", 19 | "nickel", 20 | "metallics", 21 | "clear", 22 | "grays", 23 | "green", 24 | "multi", 25 | "beige / cream", 26 | "tan", 27 | "greens", 28 | "yellow", 29 | "wood", 30 | "blues", 31 | "reds / pinks", 32 | "brushed nickel", 33 | "orange", 34 | "metallic", 35 | "brass", 36 | "yellows / golds", 37 | "oil rubbed bronze", 38 | "polished chrome", 39 | "almond", 40 | "multi-colored", 41 | "dark brown wood", 42 | "primed white", 43 | "beige/bisque", 44 | "biscuit", 45 | "ivory", 46 | "oranges / peaches", 47 | "grey", 48 | "unfinished wood", 49 | "light brown wood", 50 | "wood grain", 51 | "silver metallic", 52 | "copper", 53 | "medium brown wood", 54 | "soft white", 55 | "gold", 56 | "satin nickel", 57 | "cherry", 58 | "bright white", 59 | "red/orange", 60 | "teal", 61 | "natural", 62 | "oak", 63 | "mahogany", 64 | "aluminum", 65 | "espresso", 66 | "unfinished", 67 | "purples / lavenders", 68 | "brown/tan", 69 | "steel", 70 | "venetian bronze", 71 | "slate", 72 | "warm white", 73 | "bone", 74 | "pink", 75 | "stainless look", 76 | "reddish brown wood", 77 | "solid colors", 78 | "off-white", 79 | "walnut", 80 | "chocolate", 81 | "light almond", 82 | "vibrant brushed nickel", 83 | "satin white", 84 | "polished brass", 85 | "linen", 86 | "white primer", 87 | "purple", 88 | "charcoal", 89 | "color", 90 | "oil-rubbed bronze", 91 | "melamine white", 92 | "turquoises / aquas", 93 | "blue/purple", 94 | "primed", 95 | "bisque", 96 | "browns/tans", 97 | "assorted colors", 98 | "java", 99 | "pewter", 100 | "chestnut", 101 | "yellow/gold", 102 | "taupe", 103 | "pacific white", 104 | "cedar", 105 | "monochromatic stainless steel", 106 | "other", 107 | "platinum", 108 | "mocha", 109 | "cream", 110 | "sand", 111 | "daylight", 112 | "brushed stainless steel", 113 | "powder-coat white", 114 | ] -------------------------------------------------------------------------------- /Data/split/splits_level1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Data/split/splits_level1.pkl -------------------------------------------------------------------------------- /Data/split/splits_level2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Data/split/splits_level2.pkl 
-------------------------------------------------------------------------------- /Data/split/splits_level3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Data/split/splits_level3.pkl -------------------------------------------------------------------------------- /Doc/Kaggle_HomeDepot_Turing_Test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Doc/Kaggle_HomeDepot_Turing_Test.pdf -------------------------------------------------------------------------------- /Doc/reference.bib: -------------------------------------------------------------------------------- 1 | % Created by Chenglong Chen 2 | % Date: Jul. 12 2015 3 | 4 | @inproceedings{ebc, 5 | AUTHOR = {Ling Li and Hsuan-Tien Lin}, 6 | title = {Ordinal regression by extended binary classification}, 7 | booktitle = {Advances in Neural Information Processing Systems: Proceedings of the 2006 Conference (NIPS '06)}, 8 | YEAR = {2006}, 9 | PAGES = {865-872}, 10 | } 11 | 12 | @article{cocr, 13 | author = {Yu-Xun Ruan and Hsuan-Tien Lin and Ming-Feng Tsai}, 14 | title = {Improving ranking performance with cost-sensitive ordinal classification via regression}, 15 | journal = {Information Retrieval}, 16 | volume = {17}, 17 | number = {1}, 18 | pages = {1--20}, 19 | year = {2014} 20 | } 21 | 22 | @inproceedings{hyperopt, 23 | AUTHOR = {James Bergstra and R$\acute{\text{e}}$mi Bardenet and Yoshua Bengio and Bal$\acute{\text{a}}$zs K$\acute{\text{e}}$gl}, 24 | title = {Algorithms for Hyper-Parameter Optimization}, 25 | booktitle = {Advances in Neural Information Processing Systems: Proceedings of the 2011 Conference (NIPS '11)}, 26 | year = {2011}, 27 | pages = {2546--2554} 28 | } 29 | 30 | @MISC{hyperopt_url, 31 | note = {\url{http://hyperopt.github.io/hyperopt/}} 32 | } 33 | 34 | @MISC{glove-gensim, 35 | note = {\url{https://github.com/manasRK/glove-gensim}} 36 | } 37 | 38 | @MISC{PeterNorvig, 39 | note = {\url{http://norvig.com/spell-correct.html}} 40 | } 41 | 42 | @MISC{BenS, 43 | note = {\url{https://www.kaggle.com/c/home-depot-product-search-relevance/forums/t/19463/what-s-the-lb-shakeup-potential/116689\#post116689}} 44 | } 45 | 46 | 47 | @MISC{CrowdFlower_1st, 48 | note = {\url{https://github.com/ChenglongChen/Kaggle_CrowdFlower}} 49 | } 50 | 51 | 52 | @article{ensemble_selection, 53 | title={Ensemble selection from libraries of models}, 54 | author={Niculescu-Mizil, Alexandru and Caruana, Rich and Crew, Geoff and Ksikes, Alex}, 55 | journal={Proceedings of International Conference on Machine Learning}, 56 | pages={137--144}, 57 | year={2004} 58 | } 59 | 60 | 61 | @book{NLTK_Cookbook, 62 | author = {Jacob Perkins}, 63 | title = {Python Text Processing with NLTK 2.0 Cookbook}, 64 | publisher = {}, 65 | month = {Nov.}, 66 | year = {2010}, 67 | } 68 | 69 | 70 | @inproceedings{wmd, 71 | AUTHOR = {Matt J. Kusner and Yu Sun and Nicholas I. Kolkin and Kilian Q. 
Weinberger}, 72 | title = {From Word Embeddings To Document Distances}, 73 | booktitle = {the $32^{\text{nd}}$ International Conference on Machine Learning}, 74 | year = {2015}, 75 | pages = {} 76 | } 77 | 78 | -------------------------------------------------------------------------------- /Doc/reference2.bib: -------------------------------------------------------------------------------- 1 | % Created by Igor Buinyi 2 | % Date: May. 5 2016 3 | 4 | 5 | @MISC{Google_dict, 6 | note = {\url{https://www.kaggle.com/steubk/home-depot-product-search-relevance/fixing-typos 7 | }}} 8 | 9 | @MISC{crowdflower_3place, 10 | note = {\url{http://blog.kaggle.com/2015/07/22/crowdflower-winners-interview-3rd-place-team-quartet/ 11 | }}} 12 | 13 | @MISC{dato:beatthebenchmark, 14 | note = {\url{https://www.kaggle.com/c/dato-native/forums/t/16626/beat-the-benchmark-0-90388-with-simple-model 15 | }}} 16 | 17 | @MISC{crowdflower_2place, 18 | note = {\url{https://github.com/geffy/kaggle-crowdflower 19 | }}} 20 | -------------------------------------------------------------------------------- /Fig/CV_LB_Chenglong.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/CV_LB_Chenglong.pdf -------------------------------------------------------------------------------- /Fig/FlowChart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/FlowChart.jpg -------------------------------------------------------------------------------- /Fig/FlowChart.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/FlowChart.pptx -------------------------------------------------------------------------------- /Fig/actual_product_uid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/actual_product_uid.pdf -------------------------------------------------------------------------------- /Fig/actual_search_term.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/actual_search_term.pdf -------------------------------------------------------------------------------- /Fig/feature_corr_Chenglong.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/feature_corr_Chenglong.pdf -------------------------------------------------------------------------------- /Fig/feature_importances_Igor.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/feature_importances_Igor.pdf -------------------------------------------------------------------------------- /Fig/naive_product_uid.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/naive_product_uid.pdf -------------------------------------------------------------------------------- /Fig/naive_search_term.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/naive_search_term.pdf -------------------------------------------------------------------------------- /Fig/plot_ensembles_means.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_ensembles_means.pdf -------------------------------------------------------------------------------- /Fig/plot_ensembles_performance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_ensembles_performance.pdf -------------------------------------------------------------------------------- /Fig/plot_feature_importances_benchmark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_feature_importances_benchmark.pdf -------------------------------------------------------------------------------- /Fig/plot_feature_importances_simplified_model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_feature_importances_simplified_model.pdf -------------------------------------------------------------------------------- /Fig/plot_full_query_in_title.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_full_query_in_title.pdf -------------------------------------------------------------------------------- /Fig/plot_high_vs_low_relevance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_high_vs_low_relevance.pdf -------------------------------------------------------------------------------- /Fig/plot_query_with.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_query_with.pdf -------------------------------------------------------------------------------- /Fig/plot_replaced_with_Google.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_replaced_with_Google.pdf -------------------------------------------------------------------------------- /Fig/proposed_product_uid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/proposed_product_uid.pdf 
-------------------------------------------------------------------------------- /Fig/proposed_search_term.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/proposed_search_term.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 Igor Buinyi, Kostia Omelianchuk, Chenglong Chen 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. -------------------------------------------------------------------------------- /Log/README.md: -------------------------------------------------------------------------------- 1 | This folder contains logs for Chenglong's features and models. 2 | * `feature`: logs of most of the features 3 | 4 | * `level1_models`: logs of all the 1st level models used for building 2nd level model (a.k.a. 
Chenglong's final ensemble) 5 | 6 | * `feature_combiner_level2_meta_linear_201605030922_2016-05-03-09-23.log`: log of the features (i.e., 1st level models) chosen for building 2nd level model 7 | 8 | * `[Feat@level2_meta_linear_201605030922]_[Learner@reg_ensemble]_hyperopt_2016-05-07-18-42.log`: log of the 2nd level model -------------------------------------------------------------------------------- /Log/feature/data_processor_2016-05-08-00-36.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 00:48:55,265] INFO: Run GoogleQuerySpellingChecker at search_term 2 | [2016-05-08 01:29:06,789] INFO: Save to ../../Data/Clean/all.lemmatized.csv.pkl 3 | [2016-05-08 01:43:43,971] INFO: Save to ../../Data/Clean/all.lemmatized.stemmed.csv.pkl 4 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_basic_2016-05-08-01-43.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 01:44:22,874] INFO: DocId_search_term_1D (1D): corr = -0.006971 2 | [2016-05-08 01:44:23,556] INFO: DocId_product_title_1D (1D): corr = -0.005569 3 | [2016-05-08 01:44:24,929] INFO: DocId_product_description_1D (1D): corr = -0.001244 4 | [2016-05-08 01:44:26,188] INFO: DocId_product_attribute_1D (1D): corr = 0.089704 5 | [2016-05-08 01:44:26,684] INFO: DocId_product_brand_1D (1D): corr = -0.022024 6 | [2016-05-08 01:44:27,210] INFO: DocId_product_color_1D (1D): corr = 0.036558 7 | [2016-05-08 01:44:29,230] INFO: DocLen_search_term_1D (1D): corr = -0.084015 8 | [2016-05-08 01:44:30,432] INFO: DocLen_product_title_1D (1D): corr = -0.013815 9 | [2016-05-08 01:44:35,849] INFO: DocLen_product_description_1D (1D): corr = 0.042311 10 | [2016-05-08 01:44:40,927] INFO: DocLen_product_attribute_1D (1D): corr = -0.039293 11 | [2016-05-08 01:44:41,716] INFO: DocLen_product_brand_1D (1D): corr = -0.074429 12 | [2016-05-08 01:44:42,484] INFO: DocLen_product_color_1D (1D): corr = -0.020528 13 | [2016-05-08 01:44:43,079] INFO: DocFreq_search_term_1D (1D): corr = 0.148810 14 | [2016-05-08 01:44:43,707] INFO: DocFreq_product_title_1D (1D): corr = -0.030461 15 | [2016-05-08 01:44:44,980] INFO: DocFreq_product_description_1D (1D): corr = -0.025100 16 | [2016-05-08 01:44:46,213] INFO: DocFreq_product_attribute_1D (1D): corr = 0.145885 17 | [2016-05-08 01:44:46,767] INFO: DocFreq_product_brand_1D (1D): corr = 0.146631 18 | [2016-05-08 01:44:47,310] INFO: DocFreq_product_color_1D (1D): corr = 0.047351 19 | [2016-05-08 01:45:11,103] INFO: DocEntropy_search_term_1D (1D): corr = -0.068423 20 | [2016-05-08 01:45:36,281] INFO: DocEntropy_product_title_1D (1D): corr = -0.004657 21 | [2016-05-08 01:46:19,960] INFO: DocEntropy_product_description_1D (1D): corr = 0.053725 22 | [2016-05-08 01:47:01,011] INFO: DocEntropy_product_attribute_1D (1D): corr = -0.130249 23 | [2016-05-08 01:47:22,331] INFO: DocEntropy_product_brand_1D (1D): corr = -0.078429 24 | [2016-05-08 01:47:45,043] INFO: DocEntropy_product_color_1D (1D): corr = -0.026344 25 | [2016-05-08 01:47:46,418] INFO: DigitCount_search_term_1D (1D): corr = -0.078397 26 | [2016-05-08 01:47:48,546] INFO: DigitCount_product_title_1D (1D): corr = -0.012951 27 | [2016-05-08 01:47:56,985] INFO: DigitCount_product_description_1D (1D): corr = 0.042207 28 | [2016-05-08 01:48:07,439] INFO: DigitCount_product_attribute_1D (1D): corr = -0.032872 29 | [2016-05-08 01:48:08,783] INFO: DigitCount_product_brand_1D (1D): corr = -0.007779 30 | [2016-05-08 01:48:10,034] 
INFO: DigitCount_product_color_1D (1D): corr = -0.008055 31 | [2016-05-08 01:48:12,696] INFO: DigitRatio_search_term_1D (1D): corr = -0.070659 32 | [2016-05-08 01:48:16,447] INFO: DigitRatio_product_title_1D (1D): corr = -0.002637 33 | [2016-05-08 01:48:30,963] INFO: DigitRatio_product_description_1D (1D): corr = 0.017609 34 | [2016-05-08 01:48:47,017] INFO: DigitRatio_product_attribute_1D (1D): corr = -0.120129 35 | [2016-05-08 01:48:49,532] INFO: DigitRatio_product_brand_1D (1D): corr = -0.010386 36 | [2016-05-08 01:48:51,975] INFO: DigitRatio_product_color_1D (1D): corr = -0.003868 37 | [2016-05-08 01:48:52,263] INFO: DocIdEcho_product_uid_1D (1D): corr = -0.130656 38 | [2016-05-08 01:48:53,049] INFO: DocFreq_product_uid_1D (1D): corr = -0.032851 39 | [2016-05-08 01:48:54,079] INFO: ProductUidDummy1_product_uid_1D (1D): corr = 0.171689 40 | [2016-05-08 01:48:54,952] INFO: ProductUidDummy2_product_uid_1D (1D): corr = 0.000000 41 | [2016-05-08 01:48:55,685] INFO: ProductUidDummy3_product_uid_1D (1D): corr = -0.172492 42 | [2016-05-08 01:48:57,041] INFO: UniqueCount_Unigram_search_term_1D (1D): corr = -0.083333 43 | [2016-05-08 01:48:59,167] INFO: UniqueCount_Unigram_product_title_1D (1D): corr = -0.011923 44 | [2016-05-08 01:49:09,973] INFO: UniqueCount_Unigram_product_description_1D (1D): corr = 0.047268 45 | [2016-05-08 01:49:21,315] INFO: UniqueCount_Unigram_product_attribute_1D (1D): corr = -0.058670 46 | [2016-05-08 01:49:22,825] INFO: UniqueCount_Unigram_product_brand_1D (1D): corr = -0.074985 47 | [2016-05-08 01:49:24,334] INFO: UniqueCount_Unigram_product_color_1D (1D): corr = -0.024776 48 | [2016-05-08 01:49:28,358] INFO: UniqueCount_Bigram_search_term_1D (1D): corr = -0.096647 49 | [2016-05-08 01:49:39,780] INFO: UniqueCount_Bigram_product_title_1D (1D): corr = -0.013097 50 | [2016-05-08 01:51:33,197] INFO: UniqueCount_Bigram_product_description_1D (1D): corr = 0.043463 51 | [2016-05-08 01:53:18,345] INFO: UniqueCount_Bigram_product_attribute_1D (1D): corr = -0.042916 52 | [2016-05-08 01:53:20,921] INFO: UniqueCount_Bigram_product_brand_1D (1D): corr = -0.040597 53 | [2016-05-08 01:53:23,404] INFO: UniqueCount_Bigram_product_color_1D (1D): corr = -0.016889 54 | [2016-05-08 01:53:28,045] INFO: UniqueCount_Trigram_search_term_1D (1D): corr = -0.080895 55 | [2016-05-08 01:53:43,518] INFO: UniqueCount_Trigram_product_title_1D (1D): corr = -0.013489 56 | [2016-05-08 01:56:32,561] INFO: UniqueCount_Trigram_product_description_1D (1D): corr = 0.042642 57 | [2016-05-08 01:58:51,653] INFO: UniqueCount_Trigram_product_attribute_1D (1D): corr = -0.039823 58 | [2016-05-08 01:58:54,527] INFO: UniqueCount_Trigram_product_brand_1D (1D): corr = -0.011067 59 | [2016-05-08 01:58:57,297] INFO: UniqueCount_Trigram_product_color_1D (1D): corr = -0.013767 60 | [2016-05-08 01:58:59,923] INFO: UniqueRatio_Unigram_search_term_1D (1D): corr = 0.027054 61 | [2016-05-08 01:59:03,582] INFO: UniqueRatio_Unigram_product_title_1D (1D): corr = 0.005347 62 | [2016-05-08 01:59:16,935] INFO: UniqueRatio_Unigram_product_description_1D (1D): corr = -0.007640 63 | [2016-05-08 01:59:29,356] INFO: UniqueRatio_Unigram_product_attribute_1D (1D): corr = 0.100459 64 | [2016-05-08 01:59:31,767] INFO: UniqueRatio_Unigram_product_brand_1D (1D): corr = -0.003936 65 | [2016-05-08 01:59:34,131] INFO: UniqueRatio_Unigram_product_color_1D (1D): corr = -0.001990 66 | [2016-05-08 01:59:39,711] INFO: UniqueRatio_Bigram_search_term_1D (1D): corr = 0.020047 67 | [2016-05-08 01:59:53,123] INFO: UniqueRatio_Bigram_product_title_1D (1D): 
corr = 0.007022 68 | [2016-05-08 02:01:40,205] INFO: UniqueRatio_Bigram_product_description_1D (1D): corr = 0.008495 69 | [2016-05-08 02:02:59,568] INFO: UniqueRatio_Bigram_product_attribute_1D (1D): corr = 0.044705 70 | [2016-05-08 02:03:01,494] INFO: UniqueRatio_Bigram_product_brand_1D (1D): corr = 0.000000 71 | [2016-05-08 02:03:03,467] INFO: UniqueRatio_Bigram_product_color_1D (1D): corr = -0.008007 72 | [2016-05-08 02:03:07,453] INFO: UniqueRatio_Trigram_search_term_1D (1D): corr = 0.015218 73 | [2016-05-08 02:03:23,431] INFO: UniqueRatio_Trigram_product_title_1D (1D): corr = 0.008634 74 | [2016-05-08 02:05:29,477] INFO: UniqueRatio_Trigram_product_description_1D (1D): corr = 0.007814 75 | [2016-05-08 02:07:26,469] INFO: UniqueRatio_Trigram_product_attribute_1D (1D): corr = 0.020869 76 | [2016-05-08 02:07:30,186] INFO: UniqueRatio_Trigram_product_brand_1D (1D): corr = 0.000000 77 | [2016-05-08 02:07:33,792] INFO: UniqueRatio_Trigram_product_color_1D (1D): corr = -0.004186 78 | [2016-05-08 02:07:34,141] INFO: AttrCount_product_attribute_list_1D (1D): corr = -0.048513 79 | [2016-05-08 02:07:39,868] INFO: AttrBulletCount_product_attribute_list_1D (1D): corr = -0.070608 80 | [2016-05-08 02:07:46,288] INFO: AttrBulletRatio_product_attribute_list_1D (1D): corr = -0.150464 81 | [2016-05-08 02:07:52,174] INFO: AttrNonBulletCount_product_attribute_list_1D (1D): corr = -0.035769 82 | [2016-05-08 02:07:58,799] INFO: AttrNonBulletRatio_product_attribute_list_1D (1D): corr = 0.150464 83 | [2016-05-08 02:08:04,288] INFO: AttrHasProductHeight_product_attribute_list_1D (1D): corr = -0.099505 84 | [2016-05-08 02:08:10,515] INFO: AttrHasProductWidth_product_attribute_list_1D (1D): corr = -0.110805 85 | [2016-05-08 02:08:16,320] INFO: AttrHasProductLength_product_attribute_list_1D (1D): corr = -0.051217 86 | [2016-05-08 02:08:22,124] INFO: AttrHasProductDepth_product_attribute_list_1D (1D): corr = -0.093089 87 | [2016-05-08 02:08:27,670] INFO: AttrHasIndoorOutdoor_product_attribute_list_1D (1D): corr = -0.017944 88 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_char_dist_sim_2016-05-08-12-02.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 12:05:53,743] INFO: CharDistribution_CosineSim_search_term_x_product_title_1D (1D): corr = 0.221607 2 | [2016-05-08 12:07:43,389] INFO: CharDistribution_CosineSim_search_term_x_product_description_1D (1D): corr = 0.072751 3 | [2016-05-08 12:09:21,822] INFO: CharDistribution_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.036551 4 | [2016-05-08 12:09:32,860] INFO: CharDistribution_KL_search_term_x_product_title_1D (1D): corr = -0.223736 5 | [2016-05-08 12:11:01,222] INFO: CharDistribution_KL_search_term_x_product_description_1D (1D): corr = -0.048662 6 | [2016-05-08 12:12:21,357] INFO: CharDistribution_KL_search_term_x_product_attribute_1D (1D): corr = -0.117159 7 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_doc2vec_2016-05-08-12-56.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 12:56:52,428] INFO: loading Doc2Vec object from ../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model 2 | [2016-05-08 12:57:02,221] INFO: loading docvecs recursively from ../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model.docvecs.* with mmap=None 3 | [2016-05-08 12:57:02,235] INFO: loading doctag_syn0 from 
../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model.docvecs.doctag_syn0.npy with mmap=None 4 | [2016-05-08 12:57:02,377] INFO: loading syn0 from ../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model.syn0.npy with mmap=None 5 | [2016-05-08 12:57:02,438] INFO: loading syn1 from ../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model.syn1.npy with mmap=None 6 | [2016-05-08 12:57:02,498] INFO: setting ignored attribute syn0norm to None 7 | [2016-05-08 12:57:02,499] INFO: setting ignored attribute cum_table to None 8 | [2016-05-08 12:59:10,467] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_title_1D (1D): corr = 0.315041 9 | [2016-05-08 13:00:31,971] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_description_1D (1D): corr = 0.238963 10 | [2016-05-08 13:01:48,488] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.065506 11 | [2016-05-08 13:03:03,277] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_brand_1D (1D): corr = 0.052180 12 | [2016-05-08 13:04:20,041] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_color_1D (1D): corr = 0.004814 13 | [2016-05-08 13:04:32,482] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_title_1D (1D): corr = -0.237534 14 | [2016-05-08 13:04:45,344] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_description_1D (1D): corr = -0.189458 15 | [2016-05-08 13:04:58,122] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_attribute_1D (1D): corr = 0.131127 16 | [2016-05-08 13:05:10,420] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_brand_1D (1D): corr = 0.131438 17 | [2016-05-08 13:05:22,656] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_color_1D (1D): corr = 0.043696 18 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_group_relevance_2016-05-08-01-47.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 02:12:32,450] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.005912 2 | [2016-05-08 02:12:32,459] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.003595 3 | [2016-05-08 02:12:32,468] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.005852 4 | [2016-05-08 02:12:32,478] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.006198 5 | [2016-05-08 02:12:32,487] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.005835 6 | [2016-05-08 02:12:32,497] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.000607 7 | [2016-05-08 02:32:38,630] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.006510 8 | [2016-05-08 02:32:38,639] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.006458 9 | [2016-05-08 02:32:38,649] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.006631 10 | [2016-05-08 02:32:38,658] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.006391 11 | [2016-05-08 02:32:38,668] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.006539 12 | [2016-05-08 02:32:38,677] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.005025 13 | [2016-05-08 02:53:26,446] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.004440 14 | [2016-05-08 02:53:26,457] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.006256 15 | [2016-05-08 02:53:26,469] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.004708 16 | [2016-05-08 02:53:26,480] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.004053 17 | [2016-05-08 02:53:26,492] INFO: 
GroupRelevance_Median_product_title_1D (1D): corr = 0.004498 18 | [2016-05-08 02:53:26,504] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.004047 19 | [2016-05-08 03:14:22,463] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.003041 20 | [2016-05-08 03:14:22,472] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.003957 21 | [2016-05-08 03:14:22,483] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.003095 22 | [2016-05-08 03:14:22,493] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.002943 23 | [2016-05-08 03:14:22,504] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.003132 24 | [2016-05-08 03:14:22,513] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.000749 25 | [2016-05-08 03:39:16,457] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.003750 26 | [2016-05-08 03:39:16,465] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.003960 27 | [2016-05-08 03:39:16,474] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.003835 28 | [2016-05-08 03:39:16,483] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.003798 29 | [2016-05-08 03:39:16,491] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.003724 30 | [2016-05-08 03:39:16,501] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.001400 31 | [2016-05-08 04:14:08,676] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.008724 32 | [2016-05-08 04:14:08,686] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.007908 33 | [2016-05-08 04:14:08,696] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.008765 34 | [2016-05-08 04:14:08,706] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.008751 35 | [2016-05-08 04:14:08,716] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.008758 36 | [2016-05-08 04:14:08,727] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.005363 37 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_lsa_ngram_cosinesim_2016-05-08-13-38.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 13:40:32,505] INFO: LSA100_Word_Trigram_CosineSim_search_term_x_product_title_1D (1D): corr = 0.233220 2 | [2016-05-08 13:49:21,913] INFO: LSA100_Word_Trigram_CosineSim_search_term_x_product_description_1D (1D): corr = 0.165219 3 | [2016-05-08 13:56:26,644] INFO: LSA100_Word_Trigram_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.019628 4 | [2016-05-08 13:59:17,295] INFO: LSA100_Char_Fourgram_CosineSim_search_term_x_product_title_1D (1D): corr = 0.281925 5 | [2016-05-08 14:20:33,890] INFO: LSA100_Char_Fourgram_CosineSim_search_term_x_product_description_1D (1D): corr = 0.152655 6 | [2016-05-08 14:38:10,383] INFO: LSA100_Char_Fourgram_CosineSim_search_term_x_product_attribute_1D (1D): corr = -0.041331 7 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_ngram_jaccard_2016-05-08-01-43.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 01:44:30,209] INFO: JaccardCoef_Unigram_search_term_x_product_title_1D (1D): corr = 0.285574 2 | [2016-05-08 01:44:35,568] INFO: JaccardCoef_Unigram_search_term_x_product_title_product_name_1D (1D): corr = 0.281925 3 | [2016-05-08 01:44:53,352] INFO: JaccardCoef_Unigram_search_term_x_product_description_1D (1D): corr = 0.131190 4 | [2016-05-08 01:45:09,992] INFO: JaccardCoef_Unigram_search_term_x_product_attribute_1D (1D): corr = 0.004878 5 | 
[2016-05-08 01:45:15,088] INFO: JaccardCoef_Unigram_search_term_x_product_brand_1D (1D): corr = 0.067692 6 | [2016-05-08 01:45:20,159] INFO: JaccardCoef_Unigram_search_term_x_product_color_1D (1D): corr = 0.003534 7 | [2016-05-08 01:45:26,660] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_title_1D (1D): corr = 0.268095 8 | [2016-05-08 01:45:31,707] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.262341 9 | [2016-05-08 01:45:49,303] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_description_1D (1D): corr = 0.178812 10 | [2016-05-08 01:46:05,678] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.040235 11 | [2016-05-08 01:46:08,726] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.050577 12 | [2016-05-08 01:46:11,104] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_color_1D (1D): corr = -0.015404 13 | [2016-05-08 01:46:21,959] INFO: JaccardCoef_Bigram_search_term_x_product_title_1D (1D): corr = 0.197671 14 | [2016-05-08 01:46:29,620] INFO: JaccardCoef_Bigram_search_term_x_product_title_product_name_1D (1D): corr = 0.192269 15 | [2016-05-08 01:48:12,935] INFO: JaccardCoef_Bigram_search_term_x_product_description_1D (1D): corr = 0.129658 16 | [2016-05-08 01:49:59,806] INFO: JaccardCoef_Bigram_search_term_x_product_attribute_1D (1D): corr = 0.051732 17 | [2016-05-08 01:50:06,832] INFO: JaccardCoef_Bigram_search_term_x_product_brand_1D (1D): corr = 0.039314 18 | [2016-05-08 01:50:13,429] INFO: JaccardCoef_Bigram_search_term_x_product_color_1D (1D): corr = 0.006161 19 | [2016-05-08 01:50:30,836] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_title_1D (1D): corr = 0.164851 20 | [2016-05-08 01:50:39,245] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.182283 21 | [2016-05-08 01:52:36,040] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_description_1D (1D): corr = 0.128455 22 | [2016-05-08 01:54:25,900] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.057716 23 | [2016-05-08 01:54:31,066] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.030678 24 | [2016-05-08 01:54:36,248] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_color_1D (1D): corr = -0.002454 25 | [2016-05-08 01:54:51,972] INFO: JaccardCoef_Trigram_search_term_x_product_title_1D (1D): corr = 0.112496 26 | [2016-05-08 01:54:58,411] INFO: JaccardCoef_Trigram_search_term_x_product_title_product_name_1D (1D): corr = 0.126837 27 | [2016-05-08 01:57:51,480] INFO: JaccardCoef_Trigram_search_term_x_product_description_1D (1D): corr = 0.072817 28 | [2016-05-08 02:00:36,316] INFO: JaccardCoef_Trigram_search_term_x_product_attribute_1D (1D): corr = 0.029948 29 | [2016-05-08 02:00:45,294] INFO: JaccardCoef_Trigram_search_term_x_product_brand_1D (1D): corr = 0.038705 30 | [2016-05-08 02:00:54,241] INFO: JaccardCoef_Trigram_search_term_x_product_color_1D (1D): corr = 0.005326 31 | [2016-05-08 02:01:11,855] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_title_1D (1D): corr = 0.004708 32 | [2016-05-08 02:01:18,177] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.182283 33 | [2016-05-08 02:04:00,328] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_description_1D (1D): corr = 0.000000 34 | [2016-05-08 02:06:10,004] INFO: 
JaccardCoef_Trigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.000000 35 | [2016-05-08 02:06:15,796] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.029351 36 | [2016-05-08 02:06:21,417] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_color_1D (1D): corr = 0.000113 37 | [2016-05-08 02:06:26,021] INFO: DiceDistance_Unigram_search_term_x_product_title_1D (1D): corr = 0.296123 38 | [2016-05-08 02:06:29,656] INFO: DiceDistance_Unigram_search_term_x_product_title_product_name_1D (1D): corr = 0.285837 39 | [2016-05-08 02:06:41,774] INFO: DiceDistance_Unigram_search_term_x_product_description_1D (1D): corr = 0.133760 40 | [2016-05-08 02:06:54,041] INFO: DiceDistance_Unigram_search_term_x_product_attribute_1D (1D): corr = 0.003806 41 | [2016-05-08 02:06:57,586] INFO: DiceDistance_Unigram_search_term_x_product_brand_1D (1D): corr = 0.064598 42 | [2016-05-08 02:07:01,145] INFO: DiceDistance_Unigram_search_term_x_product_color_1D (1D): corr = 0.002170 43 | [2016-05-08 02:07:07,233] INFO: DiceDistance_Unigram_search_term_product_name_x_product_title_1D (1D): corr = 0.278330 44 | [2016-05-08 02:07:12,292] INFO: DiceDistance_Unigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.272974 45 | [2016-05-08 02:07:28,392] INFO: DiceDistance_Unigram_search_term_product_name_x_product_description_1D (1D): corr = 0.180541 46 | [2016-05-08 02:07:43,595] INFO: DiceDistance_Unigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.040289 47 | [2016-05-08 02:07:47,888] INFO: DiceDistance_Unigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.048742 48 | [2016-05-08 02:07:52,006] INFO: DiceDistance_Unigram_search_term_product_name_x_product_color_1D (1D): corr = -0.017108 49 | [2016-05-08 02:08:09,476] INFO: DiceDistance_Bigram_search_term_x_product_title_1D (1D): corr = 0.204344 50 | [2016-05-08 02:08:18,741] INFO: DiceDistance_Bigram_search_term_x_product_title_product_name_1D (1D): corr = 0.192708 51 | [2016-05-08 02:10:17,082] INFO: DiceDistance_Bigram_search_term_x_product_description_1D (1D): corr = 0.130818 52 | [2016-05-08 02:12:07,924] INFO: DiceDistance_Bigram_search_term_x_product_attribute_1D (1D): corr = 0.052014 53 | [2016-05-08 02:12:16,453] INFO: DiceDistance_Bigram_search_term_x_product_brand_1D (1D): corr = 0.036609 54 | [2016-05-08 02:12:25,011] INFO: DiceDistance_Bigram_search_term_x_product_color_1D (1D): corr = 0.005968 55 | [2016-05-08 02:12:41,843] INFO: DiceDistance_Bigram_search_term_product_name_x_product_title_1D (1D): corr = 0.170214 56 | [2016-05-08 02:12:49,737] INFO: DiceDistance_Bigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.182283 57 | [2016-05-08 02:14:46,468] INFO: DiceDistance_Bigram_search_term_product_name_x_product_description_1D (1D): corr = 0.128834 58 | [2016-05-08 02:16:37,562] INFO: DiceDistance_Bigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.057913 59 | [2016-05-08 02:16:45,053] INFO: DiceDistance_Bigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.030788 60 | [2016-05-08 02:16:52,512] INFO: DiceDistance_Bigram_search_term_product_name_x_product_color_1D (1D): corr = -0.002522 61 | [2016-05-08 02:17:14,976] INFO: DiceDistance_Trigram_search_term_x_product_title_1D (1D): corr = 0.119359 62 | [2016-05-08 02:17:25,037] INFO: DiceDistance_Trigram_search_term_x_product_title_product_name_1D (1D): corr = 0.126837 63 | [2016-05-08 02:20:18,041] INFO: 
DiceDistance_Trigram_search_term_x_product_description_1D (1D): corr = 0.073943 64 | [2016-05-08 02:22:11,063] INFO: DiceDistance_Trigram_search_term_x_product_attribute_1D (1D): corr = 0.030170 65 | [2016-05-08 02:22:17,401] INFO: DiceDistance_Trigram_search_term_x_product_brand_1D (1D): corr = 0.038456 66 | [2016-05-08 02:22:23,943] INFO: DiceDistance_Trigram_search_term_x_product_color_1D (1D): corr = 0.006093 67 | [2016-05-08 02:22:38,782] INFO: DiceDistance_Trigram_search_term_product_name_x_product_title_1D (1D): corr = 0.004708 68 | [2016-05-08 02:22:44,366] INFO: DiceDistance_Trigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.182283 69 | [2016-05-08 02:24:48,640] INFO: DiceDistance_Trigram_search_term_product_name_x_product_description_1D (1D): corr = 0.000000 70 | [2016-05-08 02:26:47,009] INFO: DiceDistance_Trigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.000000 71 | [2016-05-08 02:26:52,516] INFO: DiceDistance_Trigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.029351 72 | [2016-05-08 02:26:57,999] INFO: DiceDistance_Trigram_search_term_product_name_x_product_color_1D (1D): corr = 0.000113 73 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_query_quality_2016-05-08-13-05.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 13:06:07,296] INFO: QueryQuality_raw_x_lemmatized_1D (1D): corr = -0.011984 2 | [2016-05-08 13:06:08,419] INFO: QueryQuality_raw_x_product_name_1D (1D): corr = -0.094242 3 | [2016-05-08 13:06:09,308] INFO: QueryQuality_raw_x_stemmed_1D (1D): corr = -0.017614 4 | [2016-05-08 13:06:10,263] INFO: QueryQuality_lemmatized_x_product_name_1D (1D): corr = -0.108217 5 | [2016-05-08 13:06:11,274] INFO: QueryQuality_lemmatized_x_stemmed_1D (1D): corr = -0.016097 6 | [2016-05-08 13:06:11,889] INFO: QueryQuality_product_name_x_stemmed_1D (1D): corr = -0.107457 7 | [2016-05-08 13:06:12,699] INFO: IsInGoogleDict_search_term_1D (1D): corr = -0.068113 8 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_tfidf_ngram_cosinesim_2016-05-08-12-12.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 12:17:14,100] INFO: TFIDF_Word_Trigram_CosineSim_search_term_x_product_title_1D (1D): corr = 0.303002 2 | [2016-05-08 12:48:05,438] INFO: TFIDF_Word_Trigram_CosineSim_search_term_x_product_description_1D (1D): corr = 0.257729 3 | [2016-05-08 13:03:57,970] INFO: TFIDF_Word_Trigram_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.112431 4 | [2016-05-08 13:07:26,447] INFO: TFIDF_Char_Fourgram_CosineSim_search_term_x_product_title_1D (1D): corr = 0.372841 5 | [2016-05-08 13:24:13,093] INFO: TFIDF_Char_Fourgram_CosineSim_search_term_x_product_description_1D (1D): corr = 0.274988 6 | [2016-05-08 13:38:43,647] INFO: TFIDF_Char_Fourgram_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.069589 7 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_wordnet_similarity_2016-05-08-01-43.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 21:28:42,498] INFO: WordNet_Path_Similarity_Mean_Mean_search_term_x_product_title_1D (1D): corr = 0.171081 2 | [2016-05-08 21:28:42,508] INFO: WordNet_Path_Similarity_Mean_Std_search_term_x_product_title_1D (1D): corr = -0.043572 3 | [2016-05-08 
21:28:42,517] INFO: WordNet_Path_Similarity_Mean_Max_search_term_x_product_title_1D (1D): corr = 0.107101 4 | [2016-05-08 21:28:42,526] INFO: WordNet_Path_Similarity_Mean_Min_search_term_x_product_title_1D (1D): corr = 0.163098 5 | [2016-05-08 21:28:42,536] INFO: WordNet_Path_Similarity_Mean_Median_search_term_x_product_title_1D (1D): corr = 0.161434 6 | [2016-05-08 21:28:42,545] INFO: WordNet_Path_Similarity_Max_Mean_search_term_x_product_title_1D (1D): corr = 0.297643 7 | [2016-05-08 21:28:42,554] INFO: WordNet_Path_Similarity_Max_Std_search_term_x_product_title_1D (1D): corr = -0.152113 8 | [2016-05-08 21:28:42,564] INFO: WordNet_Path_Similarity_Max_Max_search_term_x_product_title_1D (1D): corr = 0.153445 9 | [2016-05-08 21:28:42,573] INFO: WordNet_Path_Similarity_Max_Min_search_term_x_product_title_1D (1D): corr = 0.265098 10 | [2016-05-08 21:28:42,582] INFO: WordNet_Path_Similarity_Max_Median_search_term_x_product_title_1D (1D): corr = 0.253243 11 | [2016-05-08 21:28:42,592] INFO: WordNet_Path_Similarity_Min_Mean_search_term_x_product_title_1D (1D): corr = 0.010445 12 | [2016-05-08 21:28:42,601] INFO: WordNet_Path_Similarity_Min_Std_search_term_x_product_title_1D (1D): corr = -0.004510 13 | [2016-05-08 21:28:42,610] INFO: WordNet_Path_Similarity_Min_Max_search_term_x_product_title_1D (1D): corr = 0.004670 14 | [2016-05-08 21:28:42,619] INFO: WordNet_Path_Similarity_Min_Min_search_term_x_product_title_1D (1D): corr = 0.018414 15 | [2016-05-08 21:28:42,629] INFO: WordNet_Path_Similarity_Min_Median_search_term_x_product_title_1D (1D): corr = 0.010229 16 | [2016-05-08 21:28:42,638] INFO: WordNet_Path_Similarity_Median_Mean_search_term_x_product_title_1D (1D): corr = 0.049001 17 | [2016-05-08 21:28:42,647] INFO: WordNet_Path_Similarity_Median_Std_search_term_x_product_title_1D (1D): corr = -0.007067 18 | [2016-05-08 21:28:42,656] INFO: WordNet_Path_Similarity_Median_Max_search_term_x_product_title_1D (1D): corr = 0.022593 19 | [2016-05-08 21:28:42,666] INFO: WordNet_Path_Similarity_Median_Min_search_term_x_product_title_1D (1D): corr = 0.060468 20 | [2016-05-08 21:28:42,675] INFO: WordNet_Path_Similarity_Median_Median_search_term_x_product_title_1D (1D): corr = 0.045057 21 | -------------------------------------------------------------------------------- /Log/level1_models/[Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_hyperopt_2016-05-01-23-31.log: -------------------------------------------------------------------------------- 1 | [2016-05-01 23:31:29,673] INFO: tpe_transform took 0.007514 seconds 2 | [2016-05-01 23:31:29,674] INFO: TPE using 0 trials 3 | [2016-05-01 23:31:29,677] INFO: ================================================== 4 | [2016-05-01 23:31:29,677] INFO: Task 5 | [2016-05-01 23:31:29,677] INFO: [Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_[Id@1] 6 | [2016-05-01 23:31:29,678] INFO: Param 7 | [2016-05-01 23:31:29,678] INFO: leaf_size: 40 8 | [2016-05-01 23:31:29,678] INFO: metric: minkowski 9 | [2016-05-01 23:31:29,678] INFO: n_neighbors: 5 10 | [2016-05-01 23:31:29,679] INFO: normalize: False 11 | [2016-05-01 23:31:29,679] INFO: weights: uniform 12 | [2016-05-01 23:31:29,679] INFO: Result 13 | [2016-05-01 23:31:29,679] INFO: Run RMSE Shape 14 | [2016-05-01 23:32:24,692] INFO: 1 0.56615 23167 x 722 15 | [2016-05-01 23:33:18,848] INFO: 2 0.565393 21940 x 722 16 | [2016-05-01 23:34:12,722] INFO: 3 0.567662 22182 x 722 17 | [2016-05-01 23:35:06,997] INFO: 4 0.567039 21966 x 722 18 | [2016-05-01 23:36:05,726] INFO: 5 0.565128 21961 x 722 19 | [2016-05-01 
23:36:06,056] INFO: RMSE 20 | [2016-05-01 23:36:06,056] INFO: Mean: 0.566274 21 | [2016-05-01 23:36:06,056] INFO: Std: 0.000961 22 | [2016-05-01 23:36:06,056] INFO: Time 23 | [2016-05-01 23:36:06,056] INFO: 4 mins 24 | [2016-05-01 23:36:06,057] INFO: -------------------------------------------------- 25 | [2016-05-01 23:40:56,020] INFO: tpe_transform took 0.007016 seconds 26 | [2016-05-01 23:40:56,021] INFO: TPE using 1/1 trials with best loss 0.566274 27 | [2016-05-01 23:40:56,023] INFO: ================================================== 28 | [2016-05-01 23:40:56,024] INFO: Task 29 | [2016-05-01 23:40:56,024] INFO: [Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_[Id@2] 30 | [2016-05-01 23:40:56,024] INFO: Param 31 | [2016-05-01 23:40:56,024] INFO: leaf_size: 20 32 | [2016-05-01 23:40:56,024] INFO: metric: minkowski 33 | [2016-05-01 23:40:56,025] INFO: n_neighbors: 17 34 | [2016-05-01 23:40:56,025] INFO: normalize: False 35 | [2016-05-01 23:40:56,025] INFO: weights: uniform 36 | [2016-05-01 23:40:56,025] INFO: Result 37 | [2016-05-01 23:40:56,025] INFO: Run RMSE Shape 38 | [2016-05-01 23:42:12,758] INFO: 1 0.533613 23167 x 722 39 | [2016-05-01 23:43:30,701] INFO: 2 0.53441 21940 x 722 40 | [2016-05-01 23:44:49,979] INFO: 3 0.534479 22182 x 722 41 | [2016-05-01 23:46:09,834] INFO: 4 0.53528 21966 x 722 42 | [2016-05-01 23:47:31,515] INFO: 5 0.533529 21961 x 722 43 | [2016-05-01 23:47:31,973] INFO: RMSE 44 | [2016-05-01 23:47:31,973] INFO: Mean: 0.534262 45 | [2016-05-01 23:47:31,973] INFO: Std: 0.000642 46 | [2016-05-01 23:47:31,973] INFO: Time 47 | [2016-05-01 23:47:31,973] INFO: 6 mins 48 | [2016-05-01 23:47:31,974] INFO: -------------------------------------------------- 49 | [2016-05-01 23:53:35,158] INFO: tpe_transform took 0.007978 seconds 50 | [2016-05-01 23:53:35,158] INFO: TPE using 2/2 trials with best loss 0.534262 51 | [2016-05-01 23:53:35,162] INFO: ================================================== 52 | [2016-05-01 23:53:35,162] INFO: Task 53 | [2016-05-01 23:53:35,162] INFO: [Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_[Id@3] 54 | [2016-05-01 23:53:35,162] INFO: Param 55 | [2016-05-01 23:53:35,163] INFO: leaf_size: 70 56 | [2016-05-01 23:53:35,163] INFO: metric: minkowski 57 | [2016-05-01 23:53:35,163] INFO: n_neighbors: 11 58 | [2016-05-01 23:53:35,163] INFO: normalize: True 59 | [2016-05-01 23:53:35,163] INFO: weights: uniform 60 | [2016-05-01 23:53:35,164] INFO: Result 61 | [2016-05-01 23:53:35,164] INFO: Run RMSE Shape 62 | [2016-05-02 00:48:01,432] INFO: 1 0.470701 23167 x 722 63 | [2016-05-02 01:44:17,330] INFO: 2 0.473355 21940 x 722 64 | [2016-05-02 02:42:51,392] INFO: 3 0.472826 22182 x 722 65 | [2016-05-02 03:36:47,762] INFO: 4 0.473626 21966 x 722 66 | [2016-05-02 04:31:33,827] INFO: 5 0.473484 21961 x 722 67 | [2016-05-02 04:31:34,254] INFO: RMSE 68 | [2016-05-02 04:31:34,254] INFO: Mean: 0.472798 69 | [2016-05-02 04:31:34,254] INFO: Std: 0.001083 70 | [2016-05-02 04:31:34,254] INFO: Time 71 | [2016-05-02 04:31:34,255] INFO: 277 mins 72 | [2016-05-02 04:31:34,255] INFO: -------------------------------------------------- 73 | -------------------------------------------------------------------------------- /Log/level1_models/[Feat@basic_linear_201605010104]_[Learner@reg_skl_svr]_hyperopt_2016-05-01-22-45.log: -------------------------------------------------------------------------------- 1 | [2016-05-01 22:45:17,834] INFO: tpe_transform took 0.008110 seconds 2 | [2016-05-01 22:45:17,834] INFO: TPE using 0 trials 3 | [2016-05-01 22:45:17,838] INFO: 
================================================== 4 | [2016-05-01 22:45:17,838] INFO: Task 5 | [2016-05-01 22:45:17,838] INFO: [Feat@basic_linear_201605010104]_[Learner@reg_skl_svr]_[Id@1] 6 | [2016-05-01 22:45:17,839] INFO: Param 7 | [2016-05-01 22:45:17,839] INFO: C: 1.0 8 | [2016-05-01 22:45:17,839] INFO: degree: 3 9 | [2016-05-01 22:45:17,839] INFO: epsilon: 0.011478668759041495 10 | [2016-05-01 22:45:17,839] INFO: gamma: 0.023621022279227823 11 | [2016-05-01 22:45:17,839] INFO: kernel: poly 12 | [2016-05-01 22:45:17,840] INFO: normalize: True 13 | [2016-05-01 22:45:17,840] INFO: Result 14 | [2016-05-01 22:45:17,840] INFO: Run RMSE Shape 15 | [2016-05-02 04:56:39,928] INFO: 1 0.64399 23167 x 722 16 | -------------------------------------------------------------------------------- /Log/level1_models/[Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_hyperopt_2016-05-01-02-16.log: -------------------------------------------------------------------------------- 1 | [2016-05-01 02:16:33,900] INFO: tpe_transform took 0.009265 seconds 2 | [2016-05-01 02:16:33,901] INFO: TPE using 0 trials 3 | [2016-05-01 02:16:33,904] INFO: ================================================== 4 | [2016-05-01 02:16:33,905] INFO: Task 5 | [2016-05-01 02:16:33,905] INFO: [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_[Id@1] 6 | [2016-05-01 02:16:33,905] INFO: Param 7 | [2016-05-01 02:16:33,905] INFO: learning_rate: 0.002 8 | [2016-05-01 02:16:33,906] INFO: max_depth: 4 9 | [2016-05-01 02:16:33,906] INFO: max_features: 0.55 10 | [2016-05-01 02:16:33,906] INFO: min_samples_leaf: 1 11 | [2016-05-01 02:16:33,906] INFO: n_estimators: 990 12 | [2016-05-01 02:16:33,907] INFO: random_state: 2016 13 | [2016-05-01 02:16:33,907] INFO: verbose: 0 14 | [2016-05-01 02:16:33,907] INFO: Result 15 | [2016-05-01 02:16:33,907] INFO: Run RMSE Shape 16 | [2016-05-01 03:55:18,018] INFO: 1 0.452484 23167 x 726 17 | [2016-05-01 05:23:42,084] INFO: 2 0.45364 21940 x 726 18 | [2016-05-01 06:50:24,247] INFO: 3 0.452617 22182 x 726 19 | [2016-05-01 08:19:01,662] INFO: 4 0.453387 21966 x 726 20 | [2016-05-01 09:52:15,306] INFO: 5 0.453765 21961 x 726 21 | [2016-05-01 09:52:15,764] INFO: RMSE 22 | [2016-05-01 09:52:15,764] INFO: Mean: 0.453179 23 | [2016-05-01 09:52:15,764] INFO: Std: 0.000529 24 | [2016-05-01 09:52:15,764] INFO: Time 25 | [2016-05-01 09:52:15,764] INFO: 455 mins 26 | [2016-05-01 09:52:15,765] INFO: -------------------------------------------------- 27 | [2016-05-01 14:28:18,909] INFO: tpe_transform took 0.008584 seconds 28 | [2016-05-01 14:28:18,910] INFO: TPE using 1/1 trials with best loss 0.453179 29 | [2016-05-01 14:28:18,913] INFO: ================================================== 30 | [2016-05-01 14:28:18,913] INFO: Task 31 | [2016-05-01 14:28:18,913] INFO: [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_[Id@2] 32 | [2016-05-01 14:28:18,914] INFO: Param 33 | [2016-05-01 14:28:18,914] INFO: learning_rate: 0.034 34 | [2016-05-01 14:28:18,914] INFO: max_depth: 9 35 | [2016-05-01 14:28:18,914] INFO: max_features: 0.8500000000000001 36 | [2016-05-01 14:28:18,914] INFO: min_samples_leaf: 11 37 | [2016-05-01 14:28:18,915] INFO: n_estimators: 200 38 | [2016-05-01 14:28:18,915] INFO: random_state: 2016 39 | [2016-05-01 14:28:18,915] INFO: verbose: 0 40 | [2016-05-01 14:28:18,915] INFO: Result 41 | [2016-05-01 14:28:18,915] INFO: Run RMSE Shape 42 | [2016-05-01 15:41:36,106] INFO: 1 0.442676 23167 x 726 43 | [2016-05-01 16:51:39,302] INFO: 2 0.445524 21940 x 726 44 | [2016-05-01 17:57:33,868] INFO: 
3 0.442897 22182 x 726 45 | [2016-05-01 19:03:23,230] INFO: 4 0.443379 21966 x 726 46 | [2016-05-01 20:03:51,011] INFO: 5 0.443433 21961 x 726 47 | [2016-05-01 20:03:51,616] INFO: RMSE 48 | [2016-05-01 20:03:51,617] INFO: Mean: 0.443582 49 | [2016-05-01 20:03:51,617] INFO: Std: 0.001012 50 | [2016-05-01 20:03:51,617] INFO: Time 51 | [2016-05-01 20:03:51,617] INFO: 335 mins 52 | [2016-05-01 20:03:51,618] INFO: -------------------------------------------------- 53 | [2016-05-02 01:03:01,007] INFO: tpe_transform took 0.010242 seconds 54 | [2016-05-02 01:03:01,019] INFO: TPE using 2/2 trials with best loss 0.443582 55 | [2016-05-02 01:03:01,023] INFO: ================================================== 56 | [2016-05-02 01:03:01,023] INFO: Task 57 | [2016-05-02 01:03:01,023] INFO: [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_[Id@3] 58 | [2016-05-02 01:03:01,023] INFO: Param 59 | [2016-05-02 01:03:01,023] INFO: learning_rate: 0.004 60 | [2016-05-02 01:03:01,024] INFO: max_depth: 10 61 | [2016-05-02 01:03:01,024] INFO: max_features: 0.65 62 | [2016-05-02 01:03:01,024] INFO: min_samples_leaf: 3 63 | [2016-05-02 01:03:01,024] INFO: n_estimators: 940 64 | [2016-05-02 01:03:01,024] INFO: random_state: 2016 65 | [2016-05-02 01:03:01,024] INFO: verbose: 0 66 | [2016-05-02 01:03:01,025] INFO: Result 67 | [2016-05-02 01:03:01,025] INFO: Run RMSE Shape 68 | [2016-05-02 07:52:25,663] INFO: 1 0.440727 23167 x 726 69 | [2016-05-02 12:14:21,236] INFO: 2 0.443748 21940 x 726 70 | -------------------------------------------------------------------------------- /Output/Subm/README.md: -------------------------------------------------------------------------------- 1 | 0. sub0: `test.pred.[Feat@basic_nonlinear_201604210409]_[Learner@reg_xgb_tree]_[Id@84].[Mean0.438318]_[Std0.000786].csv` 2 | - best single model from Chenglong 3 | - Public LB: **0.43996** 4 | - Private LB: **0.43811** (9th place) 5 | 6 | 1. sub1: `submission_kostia + igor final_ensemble (1 to 3 weights).csv` 7 | - best ensembled model from Igor and Kostia 8 | - Public LB: **0.43819** 9 | - Private LB: **0.43704** (8th place) 10 | 11 | 2. sub2: `test.pred.[Feat@level2_meta_linear_201605030922]_[Learner@reg_ensemble]_[Id@1].[Mean0.436087]_[Std0.001027].csv` 12 | - *reproduced* best ensembled model from Chenglong 13 | - Public LB: **0.43582** 14 | - Private LB: **0.43325** (4th place) 15 | 16 | 3. sub3: `reproduced_blend_0.438_0.436CV.csv` 17 | - *reproduced* best blended model from 0.3 * sub1 + 0.7 * sub2 18 | - Public LB: **0.43465** 19 | - Private LB: **0.43248** (3rd place) --------------------------------------------------------------------------------
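For reference, sub3 in the README above is simply a weighted average of the other two submission files, with the 0.3/0.7 weights stated there. Below is a minimal blending sketch; the function name, the input file names, and the clipping of predictions to the competition's 1-3 relevance range are assumptions for illustration, while the `id`/`relevance` submission columns follow the format produced elsewhere in the repository.

```python
# Hypothetical blending sketch (the repository has its own scripts for this):
# write a submission whose relevance is w1*sub1 + w2*sub2, matched by id.
import pandas as pd

def blend(sub1_path, sub2_path, out_path, w1=0.3, w2=0.7):
    sub1 = pd.read_csv(sub1_path)
    sub2 = pd.read_csv(sub2_path)
    merged = sub1.merge(sub2, on="id", suffixes=("_1", "_2"))
    merged["relevance"] = w1 * merged["relevance_1"] + w2 * merged["relevance_2"]
    # optional: keep blended predictions inside the 1-3 relevance range
    merged["relevance"] = merged["relevance"].clip(1.0, 3.0)
    merged[["id", "relevance"]].to_csv(out_path, index=False)

# Example call with placeholder file names:
# blend("sub1.csv", "sub2.csv", "reproduced_blend.csv")
```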