├── Code ├── Chenglong │ ├── __init__.py │ ├── conf │ │ ├── README.md │ │ ├── feature_conf_linear_201604172250.py │ │ ├── feature_conf_linear_201605010104.py │ │ ├── feature_conf_nonlinear_201604170111.py │ │ ├── feature_conf_nonlinear_201604210409.py │ │ └── feature_conf_nonlinear_201605010058.py │ ├── config.py │ ├── convert_csv_tsne_to_pkl_tsne.py │ ├── convert_pkl_lsa_to_csv_lsa.py │ ├── data_preparer.py │ ├── data_processor.py │ ├── embedding_trainer.py │ ├── extreme_ensemble_selection.py │ ├── feature_base.py │ ├── feature_basic.py │ ├── feature_combiner.py │ ├── feature_distance.py │ ├── feature_doc2vec.py │ ├── feature_first_last_ngram.py │ ├── feature_group_distance.py │ ├── feature_group_distance_stat.py │ ├── feature_group_relevance.py │ ├── feature_intersect_count.py │ ├── feature_intersect_position.py │ ├── feature_match.py │ ├── feature_query_quality.py │ ├── feature_stat_cooc_tfidf.py │ ├── feature_transformer.py │ ├── feature_tsne.R │ ├── feature_vector_space.py │ ├── feature_word2vec.py │ ├── feature_wordnet_similarity.py │ ├── gen_best_ensemble_model.py │ ├── gen_best_single_model.py │ ├── get_feature_conf_linear.py │ ├── get_feature_conf_linear_stacking.py │ ├── get_feature_conf_nonlinear.py │ ├── get_stacking_feature_conf.py │ ├── google_spelling_checker_dict.py │ ├── model_param_space.py │ ├── plot_CV_LB.py │ ├── plot_feature_corr.py │ ├── run_data.py │ ├── run_stacking_ridge.py │ ├── run_test_ridge.py │ ├── run_test_xgb.py │ ├── spelling_checker.py │ ├── splitter.py │ ├── task.py │ ├── turing_test_converter.py │ └── utils │ │ ├── __init__.py │ │ ├── dist_utils.py │ │ ├── keras_utils.py │ │ ├── logging_utils.py │ │ ├── ngram_utils.py │ │ ├── nlp_utils.py │ │ ├── np_utils.py │ │ ├── os_utils.py │ │ ├── pkl_utils.py │ │ ├── rgf_utils.py │ │ ├── skl_utils.py │ │ ├── time_utils.py │ │ └── xgb_utils.py └── Igor&Kostia │ ├── config_IgorKostia.py │ ├── dld_features.py │ ├── ensemble_script_imitation_version.py │ ├── ensemble_script_random_version.py │ ├── feature_extraction1.py │ ├── feature_extraction1_wo_google.py │ ├── feature_sets │ ├── first_part_1000.csv │ ├── first_part_1001.csv │ ├── first_part_2000.csv │ ├── first_part_3000.csv │ ├── first_part_3010.csv │ ├── first_part_3020.csv │ ├── readme.txt │ ├── second_part_1000.csv │ ├── second_part_2000.csv │ └── second_part_3000.csv │ ├── generate_ensemble_output_from_models.py │ ├── generate_feature_importances.py │ ├── generate_model_wo_google.py │ ├── generate_models.py │ ├── google_dict.py │ ├── grams_and_terms_features.py │ ├── homedepot_functions.py │ ├── model_selecting.py │ ├── models_ensemble │ └── log_2016-04-21.txt │ ├── processing_text │ ├── automatically_generated_word_corrections.csv │ ├── brand_statistics.csv │ └── material_statistics.csv │ ├── text_processing.py │ ├── text_processing_wo_google.py │ ├── tfidf_by_st_features.py │ ├── word2vec.py │ └── word2vec_without_google_dict.py ├── Data ├── dict │ ├── color_data.py │ └── word_replacer.csv └── split │ ├── splits_level1.pkl │ ├── splits_level2.pkl │ └── splits_level3.pkl ├── Doc ├── Kaggle_HomeDepot_Turing_Test.pdf ├── Kaggle_HomeDepot_Turing_Test.tex ├── reference.bib └── reference2.bib ├── Fig ├── CV_LB_Chenglong.pdf ├── FlowChart.jpg ├── FlowChart.pptx ├── actual_product_uid.pdf ├── actual_search_term.pdf ├── feature_corr_Chenglong.pdf ├── feature_importances_Igor.pdf ├── naive_product_uid.pdf ├── naive_search_term.pdf ├── plot_ensembles_means.pdf ├── plot_ensembles_performance.pdf ├── plot_feature_importances_benchmark.pdf ├── 
plot_feature_importances_simplified_model.pdf ├── plot_full_query_in_title.pdf ├── plot_high_vs_low_relevance.pdf ├── plot_query_with.pdf ├── plot_replaced_with_Google.pdf ├── proposed_product_uid.pdf └── proposed_search_term.pdf ├── LICENSE ├── Log ├── README.md ├── [Feat@level2_meta_linear_201605030922]_[Learner@reg_ensemble]_hyperopt_2016-05-07-18-42.log ├── feature │ ├── data_processor_2016-05-08-00-36.log │ ├── generate_feature_basic_2016-05-08-01-43.log │ ├── generate_feature_char_dist_sim_2016-05-08-12-02.log │ ├── generate_feature_doc2vec_2016-05-08-12-56.log │ ├── generate_feature_edit_distance_2016-05-08-13-03.log │ ├── generate_feature_first_last_ngram_count_2016-05-08-13-03.log │ ├── generate_feature_group_relevance_2016-05-08-01-47.log │ ├── generate_feature_intersect_count_2016-05-08-13-03.log │ ├── generate_feature_intersect_position_2016-05-08-01-43.log │ ├── generate_feature_lsa_ngram_cosinesim_2016-05-08-13-38.log │ ├── generate_feature_ngram_jaccard_2016-05-08-01-43.log │ ├── generate_feature_query_quality_2016-05-08-13-05.log │ ├── generate_feature_stat_cooc_tfidf_bm25_2016-05-08-13-04.log │ ├── generate_feature_stat_cooc_tfidf_tf_2016-05-08-13-03.log │ ├── generate_feature_stat_cooc_tfidf_tfidf_2016-05-08-13-03.log │ ├── generate_feature_tfidf_ngram_cosinesim_2016-05-08-12-12.log │ ├── generate_feature_word2vec_google_2016-05-08-12-56.log │ ├── generate_feature_word2vec_homedepot_2016-05-08-12-56.log │ ├── generate_feature_word2vec_wikipedia_2016-05-08-12-56.log │ └── generate_feature_wordnet_similarity_2016-05-08-01-43.log ├── feature_combiner_level2_meta_linear_201605030922_2016-05-03-09-23.log └── level1_models │ ├── [Feat@basic20160313]_[Learner@reg_skl_adaboost]_hyperopt_2016-03-13-12-28.log │ ├── [Feat@basic20160313]_[Learner@reg_skl_gbm]_hyperopt_2016-03-13-12-27.log │ ├── [Feat@basic20160313]_[Learner@reg_skl_lasso]_hyperopt_2016-03-13-11-19.log │ ├── [Feat@basic20160313]_[Learner@reg_skl_lsvr]_hyperopt_2016-03-13-11-31.log │ ├── [Feat@basic20160313]_[Learner@reg_skl_ridge]_hyperopt_2016-03-13-11-18.log │ ├── [Feat@basic20160313]_[Learner@reg_xgb_tree]_hyperopt_2016-03-14-09-48.log │ ├── [Feat@basic_linear_201604172250]_[Learner@reg_keras_dnn]_hyperopt_2016-04-20-20-10.log │ ├── [Feat@basic_linear_201604172250]_[Learner@reg_skl_lasso]_hyperopt_2016-04-18-19-53.log │ ├── [Feat@basic_linear_201604172250]_[Learner@reg_skl_lsvr]_hyperopt_2016-04-19-06-28.log │ ├── [Feat@basic_linear_201604172250]_[Learner@reg_skl_ridge]_hyperopt_2016-04-17-23-09.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_keras_dnn]_hyperopt_2016-05-01-01-43.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_hyperopt_2016-05-01-23-31.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_lasso]_hyperopt_2016-05-01-22-31.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_lsvr]_hyperopt_2016-05-01-02-16.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_ridge]_hyperopt_2016-05-01-01-05.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_skl_svr]_hyperopt_2016-05-01-22-45.log │ ├── [Feat@basic_linear_201605010104]_[Learner@reg_xgb_linear]_hyperopt_2016-05-02-00-18.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_rgf]_hyperopt_2016-04-17-18-41.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_skl_adaboost]_hyperopt_2016-04-23-10-48.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_skl_etr]_hyperopt_2016-04-23-10-48.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_skl_gbm]_hyperopt_2016-04-17-22-18.log │ ├── 
[Feat@basic_nonlinear_201604170111]_[Learner@reg_skl_rf]_hyperopt_2016-04-23-10-48.log │ ├── [Feat@basic_nonlinear_201604170111]_[Learner@reg_xgb_tree]_hyperopt_2016-04-17-01-12.log │ ├── [Feat@basic_nonlinear_201604210409]_[Learner@reg_rgf]_hyperopt_2016-04-21-04-54.log │ ├── [Feat@basic_nonlinear_201604210409]_[Learner@reg_skl_adaboost]_hyperopt_2016-04-23-10-37.log │ ├── [Feat@basic_nonlinear_201604210409]_[Learner@reg_skl_gbm]_hyperopt_2016-04-21-04-34.log │ ├── [Feat@basic_nonlinear_201604210409]_[Learner@reg_xgb_tree]_hyperopt_2016-04-21-04-11.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_rgf]_hyperopt_2016-05-01-02-30.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_adaboost]_hyperopt_2016-05-01-02-27.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_etr]_hyperopt_2016-05-01-01-45.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_hyperopt_2016-05-01-02-16.log │ ├── [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_rf]_hyperopt_2016-05-01-02-22.log │ └── [Feat@basic_nonlinear_201605010058]_[Learner@reg_xgb_tree]_hyperopt_2016-05-01-00-59.log ├── Output └── Subm │ ├── README.md │ ├── reproduced_blend_0.438_0.436CV.csv │ ├── submission_kostia + igor final_ensemble (1 to 3 weights).csv │ ├── test.pred.[Feat@basic_nonlinear_201604210409]_[Learner@reg_xgb_tree]_[Id@84].[Mean0.438318]_[Std0.000786].csv │ └── test.pred.[Feat@level2_meta_linear_201605030922]_[Learner@reg_ensemble]_[Id@1].[Mean0.436087]_[Std0.001027].csv └── README.md /Code/Chenglong/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Code/Chenglong/__init__.py -------------------------------------------------------------------------------- /Code/Chenglong/conf/README.md: -------------------------------------------------------------------------------- 1 | This folder contains feature confs used to generate the feature matrix (input) for Chenglong's models. 2 | 3 | They are used as follows (executed in the `./Code/Chenglong` directory): 4 | `python feature_combiner.py -l 1 -c feature_conf_xxx -n basic_xxx -t 0.05` 5 | 6 | Please see `feature_combiner.py` for usage details. -------------------------------------------------------------------------------- /Code/Chenglong/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: config for Homedepot project 5 | 6 | """ 7 | 8 | import os 9 | import platform 10 | 11 | import numpy as np 12 | from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS 13 | 14 | from utils import os_utils 15 | 16 | 17 | # ---------------------- Overall ----------------------- 18 | TASK = "all" 19 | # # for testing data processing and feature generation 20 | # TASK = "sample" 21 | SAMPLE_SIZE = 1000 22 | 23 | # ------------------------ PATH ------------------------ 24 | ROOT_DIR = "../.."
25 | 26 | DATA_DIR = "%s/Data"%ROOT_DIR 27 | CLEAN_DATA_DIR = "%s/Clean"%DATA_DIR 28 | 29 | FEAT_DIR = "%s/Feat"%ROOT_DIR 30 | FEAT_FILE_SUFFIX = ".pkl" 31 | FEAT_CONF_DIR = "./conf" 32 | 33 | OUTPUT_DIR = "%s/Output"%ROOT_DIR 34 | SUBM_DIR = "%s/Subm"%OUTPUT_DIR 35 | 36 | LOG_DIR = "%s/Log"%ROOT_DIR 37 | FIG_DIR = "%s/Fig"%ROOT_DIR 38 | TMP_DIR = "%s/Tmp"%ROOT_DIR 39 | THIRDPARTY_DIR = "%s/Thirdparty"%ROOT_DIR 40 | 41 | # word2vec/doc2vec/glove 42 | WORD2VEC_MODEL_DIR = "%s/word2vec"%DATA_DIR 43 | GLOVE_WORD2VEC_MODEL_DIR = "%s/glove/gensim"%DATA_DIR 44 | DOC2VEC_MODEL_DIR = "%s/doc2vec"%DATA_DIR 45 | 46 | # index split 47 | SPLIT_DIR = "%s/split"%DATA_DIR 48 | 49 | # dictionary 50 | WORD_REPLACER_DATA = "%s/dict/word_replacer.csv"%DATA_DIR 51 | 52 | # colors 53 | COLOR_DATA = "%s/dict/color_data.py"%DATA_DIR 54 | 55 | # ------------------------ DATA ------------------------ 56 | # provided data 57 | TRAIN_DATA = "%s/train.csv"%DATA_DIR 58 | TEST_DATA = "%s/test.csv"%DATA_DIR 59 | ATTR_DATA = "%s/attributes.csv"%DATA_DIR 60 | DESC_DATA = "%s/product_descriptions.csv"%DATA_DIR 61 | SAMPLE_DATA = "%s/sample_submission.csv"%DATA_DIR 62 | 63 | ALL_DATA_RAW = "%s/all.raw.csv.pkl"%CLEAN_DATA_DIR 64 | ALL_DATA_LEMMATIZED = "%s/all.lemmatized.csv.pkl"%CLEAN_DATA_DIR 65 | ALL_DATA_LEMMATIZED_STEMMED = "%s/all.lemmatized.stemmed.csv.pkl"%CLEAN_DATA_DIR 66 | INFO_DATA = "%s/info.csv.pkl"%CLEAN_DATA_DIR 67 | 68 | # size 69 | TRAIN_SIZE = 74067 70 | if TASK == "sample": 71 | TRAIN_SIZE = SAMPLE_SIZE 72 | TEST_SIZE = 166693 73 | VALID_SIZE_MAX = 60000 # 0.7 * TRAIN_SIZE 74 | 75 | TRAIN_MEAN = 2.381634 76 | TRAIN_VAR = 0.285135 77 | 78 | TEST_MEAN = TRAIN_MEAN 79 | TEST_VAR = TRAIN_VAR 80 | 81 | MEAN_STD_DICT = { 82 | 1.00: 0.000, # Common: [1, 1, 1] 83 | 1.25: 0.433, # Rare: [1,1,1,2] 84 | 1.33: 0.471, # Common: [1, 1, 2] 85 | 1.50: 0.866, # Rare: [1, 1, 1, 3] 86 | 1.67: 0.471, # Common: [1, 2, 2] 87 | 1.75: 0.829, # Rare: [1, 1, 2, 3] 88 | 2.00: 0.000, # Common: [2, 2, 2], [1, 2, 3] 89 | 2.25: 0.829, # Rare: [1,2,3,3] 90 | 2.33: 0.471, # Common: [2, 2, 3] 91 | 2.50: 0.500, # Rare: [2,2,3,3] 92 | 2.67: 0.471, # Common: [2, 3, 3] 93 | 2.75: 0.433, # Rare: [2,3,3,3] 94 | 3.00: 0.000, # Common: [3, 3, 3] 95 | } 96 | 97 | # ------------------------ PARAM ------------------------ 98 | # attribute name and value SEPARATOR 99 | ATTR_SEPARATOR = " | " 100 | 101 | # cv 102 | N_RUNS = 5 103 | N_FOLDS = 1 104 | 105 | # intersect count/match 106 | STR_MATCH_THRESHOLD = 0.85 107 | 108 | # correct query with google spelling check dict 109 | # turn this on/off to have two versions of features/models 110 | # which is useful for ensembling 111 | GOOGLE_CORRECTING_QUERY = True 112 | 113 | # auto correcting query (quite time consuming; not used in final submission) 114 | AUTO_CORRECTING_QUERY = False 115 | 116 | # query expansion (not used in final submission) 117 | QUERY_EXPANSION = False 118 | 119 | # bm25 120 | BM25_K1 = 1.6 121 | BM25_B = 0.75 122 | 123 | # svd 124 | SVD_DIM = 100 125 | SVD_N_ITER = 5 126 | 127 | # xgboost 128 | # mean of relevance in training set 129 | BASE_SCORE = TRAIN_MEAN 130 | 131 | # word2vec/doc2vec 132 | EMBEDDING_ALPHA = 0.025 133 | EMBEDDING_LEARNING_RATE_DECAY = 0.5 134 | EMBEDDING_N_EPOCH = 5 135 | EMBEDDING_MIN_COUNT = 3 136 | EMBEDDING_DIM = 100 137 | EMBEDDING_WINDOW = 5 138 | EMBEDDING_WORKERS = 6 139 | 140 | # count transformer 141 | COUNT_TRANSFORM = np.log1p 142 | 143 | # missing value 144 | MISSING_VALUE_STRING = "MISSINGVALUE" 145 | MISSING_VALUE_NUMERIC = -1. 
146 | 147 | # stop words 148 | STOP_WORDS = set(ENGLISH_STOP_WORDS) 149 | 150 | # ------------------------ OTHER ------------------------ 151 | RANDOM_SEED = 2016 152 | PLATFORM = platform.system() 153 | NUM_CORES = 4 if PLATFORM == "Windows" else 14 154 | 155 | DATA_PROCESSOR_N_JOBS = 4 if PLATFORM == "Windows" else 6 156 | AUTO_SPELLING_CHECKER_N_JOBS = 4 if PLATFORM == "Windows" else 8 157 | # multi processing is not faster 158 | AUTO_SPELLING_CHECKER_N_JOBS = 1 159 | 160 | ## rgf 161 | RGF_CALL_EXE = "%s/rgf1.2/test/call_exe.pl"%THIRDPARTY_DIR 162 | RGF_EXTENSION = ".exe" if PLATFORM == "Windows" else "" 163 | RGF_EXE = "%s/rgf1.2/bin/rgf%s"%(THIRDPARTY_DIR, RGF_EXTENSION) 164 | 165 | 166 | # ---------------------- CREATE PATH -------------------- 167 | DIRS = [] 168 | DIRS += [CLEAN_DATA_DIR] 169 | DIRS += [SPLIT_DIR] 170 | DIRS += [FEAT_DIR, FEAT_CONF_DIR] 171 | DIRS += ["%s/All"%FEAT_DIR] 172 | DIRS += ["%s/Run%d"%(FEAT_DIR,i+1) for i in range(N_RUNS)] 173 | DIRS += ["%s/Combine"%FEAT_DIR] 174 | DIRS += [OUTPUT_DIR, SUBM_DIR] 175 | DIRS += ["%s/All"%OUTPUT_DIR] 176 | DIRS += ["%s/Run%d"%(OUTPUT_DIR,i+1) for i in range(N_RUNS)] 177 | DIRS += [LOG_DIR, FIG_DIR, TMP_DIR] 178 | DIRS += [WORD2VEC_MODEL_DIR, DOC2VEC_MODEL_DIR, GLOVE_WORD2VEC_MODEL_DIR] 179 | 180 | os_utils._create_dirs(DIRS) 181 | -------------------------------------------------------------------------------- /Code/Chenglong/convert_csv_tsne_to_pkl_tsne.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: convert .csv format TSNE features to .pkl format 5 | 6 | """ 7 | 8 | import os 9 | 10 | import pandas as pd 11 | 12 | import config 13 | from utils import pkl_utils 14 | 15 | 16 | def main(): 17 | fnames = [ 18 | "TSNE_LSA100_Word_Unigram_Pair_search_term_x_product_title_100D", 19 | "TSNE_LSA100_Word_Bigram_Pair_search_term_x_product_title_100D", 20 | "TSNE_LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D", 21 | "TSNE_LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D", 22 | ] 23 | 24 | fnames = [os.path.join(config.FEAT_DIR, fname+".csv") for fname in fnames] 25 | 26 | for fname in fnames: 27 | df = pd.read_csv(fname, index=False) 28 | f = df.values 29 | pkl_utils._save(fname[:-4]+".pkl", f) 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /Code/Chenglong/convert_pkl_lsa_to_csv_lsa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: convert .pkl format LSA features to .csv format for using Rtsne package in R 5 | 6 | """ 7 | 8 | import os 9 | 10 | import pandas as pd 11 | 12 | import config 13 | from utils import pkl_utils 14 | 15 | 16 | def main(): 17 | fnames = [ 18 | "LSA100_Word_Unigram_Pair_search_term_x_product_title_100D", 19 | "LSA100_Word_Bigram_Pair_search_term_x_product_title_100D", 20 | "LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D", 21 | "LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D", 22 | ] 23 | 24 | fnames = [os.path.join(config.FEAT_DIR, fname+".pkl") for fname in fnames] 25 | 26 | for fname in fnames: 27 | f = pkl_utils._load(fname) 28 | columns = ["LSA%d"%(i+1) for i in range(f.shape[1])] 29 | pd.DataFrame(f, columns=columns).to_csv(fname[:-4]+".csv", index=False) 30 | 31 | 32 | if 
__name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /Code/Chenglong/data_preparer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate raw dataframe data 5 | 6 | """ 7 | 8 | import gc 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | import config 14 | from utils import pkl_utils 15 | 16 | 17 | def main(): 18 | # load provided data 19 | dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1") 20 | dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1") 21 | dfAttr = pd.read_csv(config.ATTR_DATA) 22 | dfDesc = pd.read_csv(config.DESC_DATA) 23 | 24 | # 25 | print("Train Mean: %.6f"%np.mean(dfTrain["relevance"])) 26 | print("Train Var: %.6f"%np.var(dfTrain["relevance"])) 27 | 28 | # 29 | dfTest["relevance"] = np.zeros((config.TEST_SIZE)) 30 | dfAttr.dropna(how="all", inplace=True) 31 | dfAttr["value"] = dfAttr["value"].astype(str) 32 | 33 | # concat train and test 34 | dfAll = pd.concat((dfTrain, dfTest), ignore_index=True) 35 | del dfTrain 36 | del dfTest 37 | gc.collect() 38 | 39 | # merge product description 40 | dfAll = pd.merge(dfAll, dfDesc, on="product_uid", how="left") 41 | dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True) 42 | del dfDesc 43 | gc.collect() 44 | 45 | # merge product brand 46 | dfBrand = dfAttr[dfAttr.name=="MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "product_brand"}) 47 | dfAll = pd.merge(dfAll, dfBrand, on="product_uid", how="left") 48 | dfBrand["product_brand"] = dfBrand["product_brand"].values.astype(str) 49 | dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True) 50 | del dfBrand 51 | gc.collect() 52 | 53 | # merge product color 54 | color_columns = ["product_color", "Color Family", "Color/Finish", "Color/Finish Family"] 55 | dfColor = dfAttr[dfAttr.name.isin(color_columns)][["product_uid", "value"]].rename(columns={"value": "product_color"}) 56 | dfColor.dropna(how="all", inplace=True) 57 | _agg_color = lambda df: " ".join(list(set(df["product_color"]))) 58 | dfColor = dfColor.groupby("product_uid").apply(_agg_color) 59 | dfColor = dfColor.reset_index(name="product_color") 60 | dfColor["product_color"] = dfColor["product_color"].values.astype(str) 61 | dfAll = pd.merge(dfAll, dfColor, on="product_uid", how="left") 62 | dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True) 63 | del dfColor 64 | gc.collect() 65 | 66 | # merge product attribute 67 | _agg_attr = lambda df: config.ATTR_SEPARATOR.join(df["name"] + config.ATTR_SEPARATOR + df["value"]) 68 | dfAttr = dfAttr.groupby("product_uid").apply(_agg_attr) 69 | dfAttr = dfAttr.reset_index(name="product_attribute_concat") 70 | dfAll = pd.merge(dfAll, dfAttr, on="product_uid", how="left") 71 | dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True) 72 | del dfAttr 73 | gc.collect() 74 | 75 | # save data 76 | if config.TASK == "sample": 77 | dfAll = dfAll.iloc[:config.SAMPLE_SIZE].copy() 78 | pkl_utils._save(config.ALL_DATA_RAW, dfAll) 79 | 80 | # info 81 | dfInfo = dfAll[["id","relevance"]].copy() 82 | pkl_utils._save(config.INFO_DATA, dfInfo) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /Code/Chenglong/embedding_trainer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: 
word2vec & doc2vec trainer 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | import pandas as pd 12 | from gensim.models import Word2Vec, Doc2Vec 13 | from gensim.models.doc2vec import LabeledSentence 14 | 15 | import config 16 | from utils import nlp_utils 17 | from utils import logging_utils, pkl_utils, time_utils 18 | 19 | 20 | # tune the token pattern to get a better correlation with y_train 21 | # token_pattern = r"(?u)\b\w\w+\b" 22 | # token_pattern = r"\w{1,}" 23 | # token_pattern = r"\w+" 24 | # token_pattern = r"[\w']+" 25 | token_pattern = " " # just split the text into tokens 26 | 27 | 28 | #---------------------- Word2Vec ---------------------- 29 | class DataFrameSentences(object): 30 | def __init__(self, df, columns): 31 | self.df = df 32 | self.columns = columns 33 | 34 | def __iter__(self): 35 | for column in self.columns: 36 | for sentence in self.df[column]: 37 | tokens = nlp_utils._tokenize(sentence, token_pattern) 38 | yield tokens 39 | 40 | 41 | class DataFrameWord2Vec: 42 | def __init__(self, df, columns, model_param): 43 | self.df = df 44 | self.columns = columns 45 | self.model_param = model_param 46 | self.model = Word2Vec(sg=self.model_param["sg"], 47 | hs=self.model_param["hs"], 48 | alpha=self.model_param["alpha"], 49 | min_alpha=self.model_param["alpha"], 50 | min_count=self.model_param["min_count"], 51 | size=self.model_param["size"], 52 | sample=self.model_param["sample"], 53 | window=self.model_param["window"], 54 | workers=self.model_param["workers"]) 55 | 56 | def train(self): 57 | # build vocabulary 58 | self.sentences = DataFrameSentences(self.df, self.columns) 59 | self.model.build_vocab(self.sentences) 60 | # train for n_epoch 61 | for i in range(self.model_param["n_epoch"]): 62 | self.sentences = DataFrameSentences(self.df, self.columns) 63 | self.model.train(self.sentences) 64 | self.model.alpha *= self.model_param["learning_rate_decay"] 65 | self.model.min_alpha = self.model.alpha 66 | return self 67 | 68 | def save(self, model_dir, model_name): 69 | fname = os.path.join(model_dir, model_name) 70 | self.model.save(fname) 71 | 72 | 73 | def train_word2vec_model(df, columns): 74 | model_param = { 75 | "alpha": config.EMBEDDING_ALPHA, 76 | "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY, 77 | "n_epoch": config.EMBEDDING_N_EPOCH, 78 | "sg": 1, 79 | "hs": 1, 80 | "min_count": config.EMBEDDING_MIN_COUNT, 81 | "size": config.EMBEDDING_DIM, 82 | "sample": 0.001, 83 | "window": config.EMBEDDING_WINDOW, 84 | "workers": config.EMBEDDING_WORKERS, 85 | } 86 | model_dir = config.WORD2VEC_MODEL_DIR 87 | model_name = "Homedepot-word2vec-D%d-min_count%d.model"%( 88 | model_param["size"], model_param["min_count"]) 89 | 90 | word2vec = DataFrameWord2Vec(df, columns, model_param) 91 | word2vec.train() 92 | word2vec.save(model_dir, model_name) 93 | 94 | 95 | #---------------------- Doc2Vec ---------------------- 96 | class DataFrameLabelSentences(object): 97 | def __init__(self, df, columns): 98 | self.df = df 99 | self.columns = columns 100 | self.cnt = -1 101 | self.sent_label = {} 102 | 103 | def __iter__(self): 104 | for column in self.columns: 105 | for sentence in self.df[column]: 106 | if not sentence in self.sent_label: 107 | self.cnt += 1 108 | self.sent_label[sentence] = "SENT_%d"%self.cnt 109 | tokens = nlp_utils._tokenize(sentence, token_pattern) 110 | yield LabeledSentence(words=tokens, tags=[self.sent_label[sentence]]) 111 | 112 | 113 | class DataFrameDoc2Vec(DataFrameWord2Vec): 114 | def __init__(self, df, columns, model_param): 115 | 
super().__init__(df, columns, model_param) 116 | self.model = Doc2Vec(dm=self.model_param["dm"], 117 | hs=self.model_param["hs"], 118 | alpha=self.model_param["alpha"], 119 | min_alpha=self.model_param["alpha"], 120 | min_count=self.model_param["min_count"], 121 | size=self.model_param["size"], 122 | sample=self.model_param["sample"], 123 | window=self.model_param["window"], 124 | workers=self.model_param["workers"]) 125 | def train(self): 126 | # build vocabulary 127 | self.sentences = DataFrameLabelSentences(self.df, self.columns) 128 | self.model.build_vocab(self.sentences) 129 | # train for n_epoch 130 | for i in range(self.model_param["n_epoch"]): 131 | self.sentences = DataFrameLabelSentences(self.df, self.columns) 132 | self.model.train(self.sentences) 133 | self.model.alpha *= self.model_param["learning_rate_decay"] 134 | self.model.min_alpha = self.model.alpha 135 | return self 136 | 137 | def save(self, model_dir, model_name): 138 | fname = os.path.join(model_dir, model_name) 139 | self.model.save(fname) 140 | pkl_utils._save("%s.sent_label"%fname, self.sentences.sent_label) 141 | 142 | 143 | def train_doc2vec_model(df, columns): 144 | model_param = { 145 | "alpha": config.EMBEDDING_ALPHA, 146 | "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY, 147 | "n_epoch": config.EMBEDDING_N_EPOCH, 148 | "sg": 1, # not use 149 | "dm": 1, 150 | "hs": 1, 151 | "min_count": config.EMBEDDING_MIN_COUNT, 152 | "size": config.EMBEDDING_DIM, 153 | "sample": 0.001, 154 | "window": config.EMBEDDING_WINDOW, 155 | "workers": config.EMBEDDING_WORKERS, 156 | } 157 | model_dir = config.DOC2VEC_MODEL_DIR 158 | model_name = "Homedepot-doc2vec-D%d-min_count%d.model"%( 159 | model_param["size"], model_param["min_count"]) 160 | 161 | doc2vec = DataFrameDoc2Vec(df, columns, model_param) 162 | doc2vec.train() 163 | doc2vec.save(model_dir, model_name) 164 | 165 | 166 | #---------------------- Main ---------------------- 167 | if __name__ == "__main__": 168 | df = pkl_utils._load(config.ALL_DATA_LEMMATIZED) 169 | columns = ["search_term", "search_term_alt", "product_title", "product_description", 170 | "product_attribute", "product_brand", "product_color"] 171 | columns = [col for col in columns if col in df.columns] 172 | 173 | if len(sys.argv) >= 2: 174 | for w in sys.argv[1].split(","): 175 | if w == "word2vec": 176 | train_word2vec_model(df, columns) 177 | elif w == "doc2vec": 178 | train_doc2vec_model(df, columns) 179 | else: 180 | print("Skip: %s"%w) 181 | continue 182 | else: 183 | train_doc2vec_model(df, columns) 184 | train_word2vec_model(df, columns) 185 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_doc2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: doc2vec based features 5 | 6 | """ 7 | 8 | import gensim 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.metrics.pairwise import cosine_similarity 12 | 13 | import config 14 | from utils import dist_utils, ngram_utils, nlp_utils 15 | from utils import logging_utils, time_utils, pkl_utils 16 | from feature_base import BaseEstimator, StandaloneFeatureWrapper, PairwiseFeatureWrapper 17 | 18 | 19 | class Doc2Vec_BaseEstimator(BaseEstimator): 20 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 21 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 22 | self.model = doc2vec_model 23 | 
self.sent_label = sent_label 24 | self.model_prefix = model_prefix 25 | self.vector_size = doc2vec_model.vector_size 26 | 27 | def _get_vector(self, sent): 28 | try: 29 | vect = self.model.docvecs[self.sent_label[sent]] 30 | except: 31 | vect = np.zeros(self.vector_size, dtype=float) 32 | return vect 33 | 34 | def _get_cosine_sim(self, sent1, sent2): 35 | vect1 = self._get_vector(sent1) 36 | vect2 = self._get_vector(sent2) 37 | return dist_utils._cosine_sim(vect1, vect2) 38 | 39 | def _get_vdiff(self, sent1, sent2): 40 | vect1 = self._get_vector(sent1) 41 | vect2 = self._get_vector(sent2) 42 | return dist_utils._vdiff(vect1, vect2) 43 | 44 | def _get_rmse(self, sent1, sent2): 45 | vect1 = self._get_vector(sent1) 46 | vect2 = self._get_vector(sent2) 47 | return dist_utils._rmse(vect1, vect2) 48 | 49 | 50 | class Doc2Vec_Vector(Doc2Vec_BaseEstimator): 51 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 52 | super().__init__(obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode) 53 | 54 | def __name__(self): 55 | return "Doc2Vec_%s_D%d_Vector"%(self.model_prefix, self.vector_size) 56 | 57 | def transform_one(self, obs, target, id): 58 | return self._get_vector(obs) 59 | 60 | 61 | class Doc2Vec_Vdiff(Doc2Vec_BaseEstimator): 62 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 63 | super().__init__(obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode) 64 | 65 | def __name__(self): 66 | return "Doc2Vec_%s_D%d_Vdiff"%(self.model_prefix, self.vector_size) 67 | 68 | def transform_one(self, obs, target, id): 69 | return self._get_vdiff(obs, target) 70 | 71 | 72 | class Doc2Vec_CosineSim(Doc2Vec_BaseEstimator): 73 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 74 | super().__init__(obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode) 75 | 76 | def __name__(self): 77 | return "Doc2Vec_%s_D%d_CosineSim"%(self.model_prefix, self.vector_size) 78 | 79 | def transform_one(self, obs, target, id): 80 | return self._get_cosine_sim(obs, target) 81 | 82 | 83 | class Doc2Vec_RMSE(Doc2Vec_BaseEstimator): 84 | def __init__(self, obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode=""): 85 | super().__init__(obs_corpus, target_corpus, doc2vec_model, sent_label, model_prefix, aggregation_mode) 86 | 87 | def __name__(self): 88 | return "Doc2Vec_%s_D%d_RMSE"%(self.model_prefix, self.vector_size) 89 | 90 | def transform_one(self, obs, target, id): 91 | return self._get_rmse(obs, target) 92 | 93 | 94 | # -------------------------------- Main ---------------------------------- 95 | def main(): 96 | logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp() 97 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 98 | #### NOTE: use data BEFORE STEMMING 99 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) 100 | 101 | doc2vec_model_dirs = [] 102 | model_prefixes = [] 103 | ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description 104 | doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) ) 105 | model_prefixes.append( "Homedepot" ) 106 | for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes): 107 | ## load model 108 | try: 109 | if ".bin" in 
doc2vec_model_dir: 110 | doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True) 111 | if ".txt" in doc2vec_model_dir: 112 | doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False) 113 | else: 114 | doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir) 115 | doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label") 116 | except: 117 | continue 118 | 119 | # ## standalone (not used in model building) 120 | # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"] 121 | # generator = Doc2Vec_Vector 122 | # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] 123 | # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) 124 | # sf.go() 125 | 126 | ## pairwise 127 | generators = [ 128 | Doc2Vec_CosineSim, 129 | Doc2Vec_RMSE, 130 | # Doc2Vec_Vdiff, 131 | ] 132 | obs_fields_list = [] 133 | target_fields_list = [] 134 | obs_fields_list.append( ["search_term", "search_term_alt"][:1] ) 135 | target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] ) 136 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 137 | for generator in generators: 138 | param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] 139 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 140 | pf.go() 141 | 142 | 143 | if __name__ == "__main__": 144 | main() 145 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_group_distance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: group relevance based distance features 5 | @note: such features are not used in final submission 6 | 7 | """ 8 | 9 | import re 10 | import string 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | import config 16 | from config import TRAIN_SIZE 17 | from utils import dist_utils, ngram_utils, nlp_utils 18 | from utils import logging_utils, pkl_utils, time_utils 19 | from feature_base import BaseEstimator, StandaloneFeatureWrapper, PairwiseFeatureWrapper 20 | 21 | 22 | # tune the token pattern to get a better correlation with y_train 23 | # token_pattern = r"(?u)\b\w\w+\b" 24 | # token_pattern = r"\w{1,}" 25 | # token_pattern = r"\w+" 26 | # token_pattern = r"[\w']+" 27 | token_pattern = " " # just split the text into tokens 28 | 29 | 30 | # -------------------- Group by (obs, relevance) based distance features ----------------------------------- # 31 | # Something related to Query Expansion 32 | class GroupRelevance_Ngram_Jaccard(BaseEstimator): 33 | """Single aggregation features""" 34 | def __init__(self, obs_corpus, target_corpus, id_list, dfTrain, target_field, relevance, ngram, aggregation_mode=""): 35 | super().__init__(obs_corpus, target_corpus, aggregation_mode, id_list) 36 | self.dfTrain = dfTrain[dfTrain["relevance"] != 0].copy() 37 | self.target_field = target_field 38 | self.relevance = relevance 39 | self.relevance_str = self._relevance_to_str() 40 | self.ngram = ngram 41 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 42 | 43 | def __name__(self): 44 | if isinstance(self.aggregation_mode, str): 45 | feat_name = "Group_%sRelevance_%s_Jaccard_%s"%( 46 | self.relevance_str, self.ngram_str, 
string.capwords(self.aggregation_mode)) 47 | elif isinstance(self.aggregation_mode, list): 48 | feat_name = ["Group_%sRelevance_%s_Jaccard_%s"%( 49 | self.relevance_str, self.ngram_str, string.capwords(m)) for m in self.aggregation_mode] 50 | return feat_name 51 | 52 | def _relevance_to_str(self): 53 | if isinstance(self.relevance, float): 54 | return re.sub("\.", "d", str(self.relevance)) 55 | else: 56 | return str(self.relevance) 57 | 58 | def transform_one(self, obs, target, id): 59 | df = self.dfTrain[self.dfTrain["search_term"] == obs].copy() 60 | val_list = [config.MISSING_VALUE_NUMERIC] 61 | if df is not None: 62 | df = df[df["id"] != id].copy() 63 | df = df[df["relevance"] == self.relevance].copy() 64 | if df is not None and df.shape[0] > 0: 65 | target_tokens = nlp_utils._tokenize(target, token_pattern) 66 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 67 | val_list = [] 68 | for x in df[self.target_field]: 69 | x_tokens = nlp_utils._tokenize(x, token_pattern) 70 | x_ngrams = ngram_utils._ngrams(x_tokens, self.ngram) 71 | val_list.append(dist_utils._jaccard_coef(x_ngrams, target_ngrams)) 72 | return val_list 73 | 74 | 75 | # -------------------------------- Main ---------------------------------- 76 | def main(): 77 | logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp() 78 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 79 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 80 | dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() 81 | 82 | ## run python3 splitter.py first 83 | split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) 84 | n_iter = len(split) 85 | 86 | relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3] 87 | relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3] 88 | ngrams = [1] 89 | obs_fields = ["search_term"] 90 | target_fields = ["product_title", "product_description"] 91 | aggregation_mode = ["mean", "std", "max", "min", "median"] 92 | 93 | ## for cv 94 | for i in range(n_iter): 95 | trainInd, validInd = split[i][0], split[i][1] 96 | dfTrain2 = dfTrain.iloc[trainInd].copy() 97 | sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) 98 | 99 | for target_field in target_fields: 100 | for relevance in relevances: 101 | for ngram in ngrams: 102 | param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode] 103 | pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) 104 | pf.go() 105 | 106 | ## for all 107 | sub_feature_dir = "%s/All" % (config.FEAT_DIR) 108 | for target_field in target_fields: 109 | for relevance in relevances: 110 | for ngram in ngrams: 111 | param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode] 112 | pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) 113 | pf.go() 114 | 115 | 116 | if __name__ == "__main__": 117 | main() 118 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_group_relevance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: group based relevance features 5 | @note: such features are not used in final submission (except GroupRelevance_Size) 6 | 7 | """ 8 | 9 | import string 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import config 15 | from config 
import TRAIN_SIZE 16 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 17 | from utils import logging_utils, time_utils, pkl_utils 18 | from feature_base import BaseEstimator, StandaloneFeatureWrapper 19 | 20 | 21 | class GroupRelevance(BaseEstimator): 22 | """Single aggregation features""" 23 | def __init__(self, obs_corpus, target_corpus, id_list, dfTrain, aggregation_mode=""): 24 | super().__init__(obs_corpus, target_corpus, aggregation_mode, id_list) 25 | self.dfTrain = dfTrain[dfTrain["relevance"] != 0].copy() 26 | 27 | def __name__(self): 28 | if isinstance(self.aggregation_mode, str): 29 | feat_name = "GroupRelevance_%s"%string.capwords(self.aggregation_mode) 30 | elif isinstance(self.aggregation_mode, list): 31 | feat_name = ["GroupRelevance_%s"%string.capwords(m) for m in self.aggregation_mode] 32 | return feat_name 33 | 34 | def transform_one(self, obs, target, id): 35 | df = self.dfTrain[self.dfTrain["search_term"] == obs].copy() 36 | val_list = [config.MISSING_VALUE_NUMERIC] 37 | if df is not None: 38 | df = df[df["id"] != id].copy() 39 | if df is not None and df.shape[0] > 0: 40 | val_list = df["relevance"].values.tolist() 41 | return val_list 42 | 43 | 44 | # -------------------------------- Main ---------------------------------- 45 | def main(): 46 | logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp() 47 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 48 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 49 | dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() 50 | 51 | ## run python3 splitter.py first 52 | split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) 53 | n_iter = len(split) 54 | 55 | ## for cv 56 | for i in range(n_iter): 57 | trainInd, validInd = split[i][0], split[i][1] 58 | dfTrain2 = dfTrain.iloc[trainInd].copy() 59 | sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) 60 | 61 | obs_fields = ["search_term", "product_title"][1:] 62 | aggregation_mode = ["mean", "std", "max", "min", "median", "size"] 63 | param_list = [dfAll["id"], dfTrain2, aggregation_mode] 64 | sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) 65 | sf.go() 66 | 67 | ## for all 68 | sub_feature_dir = "%s/All" % (config.FEAT_DIR) 69 | obs_fields = ["search_term", "product_title"][1:] 70 | aggregation_mode = ["mean", "std", "max", "min", "median", "size"] 71 | param_list = [dfAll["id"], dfTrain, aggregation_mode] 72 | sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) 73 | sf.go() 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_intersect_count.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: intersect count features 5 | 6 | """ 7 | 8 | import re 9 | import string 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import config 15 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 16 | from utils import logging_utils, time_utils, pkl_utils 17 | from feature_base import BaseEstimator, PairwiseFeatureWrapper 18 | 19 | 20 | # tune the token pattern to get a better correlation with y_train 21 | # token_pattern = r"(?u)\b\w\w+\b" 22 | # token_pattern = r"\w{1,}" 23 | # token_pattern = r"\w+" 24 | # token_pattern = r"[\w']+" 25 | token_pattern = " " # just split the text into tokens 26 
| 27 | 28 | # ---------------------------------------------------------------------------- 29 | # How many ngrams of obs are in target? 30 | # Obs: [AB, AB, AB, AC, DE, CD] 31 | # Target: [AB, AC, AB, AD, ED] 32 | # -> 33 | # IntersectCount: 4 (i.e., AB, AB, AB, AC) 34 | # IntersectRatio: 4/6 35 | class IntersectCount_Ngram(BaseEstimator): 36 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode="", 37 | str_match_threshold=config.STR_MATCH_THRESHOLD): 38 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 39 | self.ngram = ngram 40 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 41 | self.str_match_threshold = str_match_threshold 42 | 43 | def __name__(self): 44 | return "IntersectCount_%s"%self.ngram_str 45 | 46 | def transform_one(self, obs, target, id): 47 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 48 | target_tokens = nlp_utils._tokenize(target, token_pattern) 49 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 50 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 51 | s = 0. 52 | for w1 in obs_ngrams: 53 | for w2 in target_ngrams: 54 | if dist_utils._is_str_match(w1, w2, self.str_match_threshold): 55 | s += 1. 56 | break 57 | return s 58 | 59 | 60 | class IntersectRatio_Ngram(BaseEstimator): 61 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode="", 62 | str_match_threshold=config.STR_MATCH_THRESHOLD): 63 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 64 | self.ngram = ngram 65 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 66 | self.str_match_threshold = str_match_threshold 67 | 68 | def __name__(self): 69 | return "IntersectRatio_%s"%self.ngram_str 70 | 71 | def transform_one(self, obs, target, id): 72 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 73 | target_tokens = nlp_utils._tokenize(target, token_pattern) 74 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 75 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 76 | s = 0. 77 | for w1 in obs_ngrams: 78 | for w2 in target_ngrams: 79 | if dist_utils._is_str_match(w1, w2, self.str_match_threshold): 80 | s += 1. 81 | break 82 | return np_utils._try_divide(s, len(obs_ngrams)) 83 | 84 | 85 | # ---------------------------------------------------------------------------- 86 | # How many cooccurrence ngrams between obs and target? 87 | # Obs: [AB, AB, AB, AC, DE, CD] 88 | # Target: [AB, AC, AB, AD, ED] 89 | # -> 90 | # CooccurrenceCount: 7 (i.e., AB x 2 + AB x 2 + AB x 2 + AC x 1) 91 | # CooccurrenceRatio: 7/(6 x 5) 92 | class CooccurrenceCount_Ngram(BaseEstimator): 93 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode="", str_match_threshold=config.STR_MATCH_THRESHOLD): 94 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 95 | self.ngram = ngram 96 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 97 | self.str_match_threshold = str_match_threshold 98 | 99 | def __name__(self): 100 | return "CooccurrenceCount_%s"%self.ngram_str 101 | 102 | def transform_one(self, obs, target, id): 103 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 104 | target_tokens = nlp_utils._tokenize(target, token_pattern) 105 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 106 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 107 | s = 0. 108 | for w1 in obs_ngrams: 109 | for w2 in target_ngrams: 110 | if dist_utils._is_str_match(w1, w2, self.str_match_threshold): 111 | s += 1. 
112 | return s 113 | 114 | 115 | class CooccurrenceRatio_Ngram(BaseEstimator): 116 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode="", str_match_threshold=config.STR_MATCH_THRESHOLD): 117 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 118 | self.ngram = ngram 119 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 120 | self.str_match_threshold = str_match_threshold 121 | 122 | def __name__(self): 123 | return "CooccurrenceRatio_%s"%self.ngram_str 124 | 125 | def transform_one(self, obs, target, id): 126 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 127 | target_tokens = nlp_utils._tokenize(target, token_pattern) 128 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 129 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 130 | s = 0. 131 | for w1 in obs_ngrams: 132 | for w2 in target_ngrams: 133 | if dist_utils._is_str_match(w1, w2, self.str_match_threshold): 134 | s += 1. 135 | return np_utils._try_divide(s, len(obs_ngrams)*len(target_ngrams)) 136 | 137 | 138 | # ---------------------------- Main -------------------------------------- 139 | def main(): 140 | logname = "generate_feature_intersect_count_%s.log"%time_utils._timestamp() 141 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 142 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 143 | 144 | generators = [ 145 | IntersectCount_Ngram, 146 | IntersectRatio_Ngram, 147 | CooccurrenceCount_Ngram, 148 | CooccurrenceRatio_Ngram, 149 | ] 150 | obs_fields_list = [] 151 | target_fields_list = [] 152 | ## query in document 153 | obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 154 | target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) 155 | ## document in query 156 | obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) 157 | target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 158 | ngrams = [1,2,3,12,123][:3] 159 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 160 | for generator in generators: 161 | for ngram in ngrams: 162 | param_list = [ngram] 163 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 164 | pf.go() 165 | 166 | 167 | if __name__ == "__main__": 168 | main() 169 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_intersect_position.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: intersect position features 5 | 6 | """ 7 | 8 | import re 9 | import string 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import config 15 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 16 | from utils import logging_utils, time_utils, pkl_utils 17 | from feature_base import BaseEstimator, PairwiseFeatureWrapper 18 | 19 | 20 | # tune the token pattern to get a better correlation with y_train 21 | # token_pattern = r"(?u)\b\w\w+\b" 22 | # token_pattern = r"\w{1,}" 23 | # token_pattern = r"\w+" 24 | # token_pattern = r"[\w']+" 25 | token_pattern = " " # just split the text into tokens 26 | 27 | 28 | def _inter_pos_list(obs, 
target): 29 | """ 30 | Get the list of positions of obs in target 31 | """ 32 | pos_list = [0] 33 | if len(obs) != 0: 34 | pos_list = [i for i,o in enumerate(obs, start=1) if o in target] 35 | if len(pos_list) == 0: 36 | pos_list = [0] 37 | return pos_list 38 | 39 | 40 | def _inter_norm_pos_list(obs, target): 41 | pos_list = _inter_pos_list(obs, target) 42 | N = len(obs) 43 | return [np_utils._try_divide(i, N) for i in pos_list] 44 | 45 | 46 | class IntersectPosition_Ngram(BaseEstimator): 47 | """Single aggregation features""" 48 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode=""): 49 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 50 | self.ngram = ngram 51 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 52 | 53 | def __name__(self): 54 | if isinstance(self.aggregation_mode, str): 55 | feat_name = "IntersectPosition_%s_%s"%( 56 | self.ngram_str, string.capwords(self.aggregation_mode)) 57 | elif isinstance(self.aggregation_mode, list): 58 | feat_name = ["IntersectPosition_%s_%s"%( 59 | self.ngram_str, string.capwords(m)) for m in self.aggregation_mode] 60 | return feat_name 61 | 62 | def transform_one(self, obs, target, id): 63 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 64 | target_tokens = nlp_utils._tokenize(target, token_pattern) 65 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 66 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 67 | pos_list = _inter_pos_list(obs_ngrams, target_ngrams) 68 | return pos_list 69 | 70 | 71 | class IntersectNormPosition_Ngram(BaseEstimator): 72 | """Single aggregation features""" 73 | def __init__(self, obs_corpus, target_corpus, ngram, aggregation_mode=""): 74 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 75 | self.ngram = ngram 76 | self.ngram_str = ngram_utils._ngram_str_map[self.ngram] 77 | 78 | def __name__(self): 79 | if isinstance(self.aggregation_mode, str): 80 | feat_name = "IntersectNormPosition_%s_%s"%( 81 | self.ngram_str, string.capwords(self.aggregation_mode)) 82 | elif isinstance(self.aggregation_mode, list): 83 | feat_name = ["IntersectNormPosition_%s_%s"%( 84 | self.ngram_str, string.capwords(m)) for m in self.aggregation_mode] 85 | return feat_name 86 | 87 | def transform_one(self, obs, target, id): 88 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 89 | target_tokens = nlp_utils._tokenize(target, token_pattern) 90 | obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram) 91 | target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram) 92 | pos_list = _inter_norm_pos_list(obs_ngrams, target_ngrams) 93 | return pos_list 94 | 95 | 96 | # ---------------------------- Main -------------------------------------- 97 | def main(): 98 | logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp() 99 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 100 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 101 | 102 | generators = [ 103 | IntersectPosition_Ngram, 104 | IntersectNormPosition_Ngram, 105 | ] 106 | obs_fields_list = [] 107 | target_fields_list = [] 108 | ## query in document 109 | obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 110 | target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] ) 111 | ## document in query 112 | obs_fields_list.append( ["product_title", "product_title_product_name", 
"product_description", "product_attribute", "product_brand", "product_color"][1:2] ) 113 | target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 114 | ngrams = [1,2,3,12,123][:3] 115 | aggregation_mode = ["mean", "std", "max", "min", "median"] 116 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 117 | for generator in generators: 118 | for ngram in ngrams: 119 | param_list = [ngram, aggregation_mode] 120 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 121 | pf.go() 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_match.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: match based features 5 | 6 | """ 7 | 8 | import re 9 | import string 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import config 15 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 16 | from utils import logging_utils, time_utils, pkl_utils 17 | from feature_base import BaseEstimator, PairwiseFeatureWrapper 18 | 19 | 20 | class MatchQueryCount(BaseEstimator): 21 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 22 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 23 | 24 | def __name__(self): 25 | return "MatchQueryCount" 26 | 27 | def _str_whole_word(self, str1, str2, i_): 28 | cnt = 0 29 | if len(str1) > 0 and len(str2) > 0: 30 | try: 31 | while i_ < len(str2): 32 | i_ = str2.find(str1, i_) 33 | if i_ == -1: 34 | return cnt 35 | else: 36 | cnt += 1 37 | i_ += len(str1) 38 | except: 39 | pass 40 | return cnt 41 | 42 | def transform_one(self, obs, target, id): 43 | return self._str_whole_word(obs, target, 0) 44 | 45 | 46 | class MatchQueryRatio(MatchQueryCount): 47 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 48 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 49 | 50 | def __name__(self): 51 | return "MatchQueryRatio" 52 | 53 | def transform_one(self, obs, target, id): 54 | return np_utils._try_divide(super().transform_one(obs, target, id), len(target.split(" "))) 55 | 56 | 57 | #------------- Longest match features ------------------------------- 58 | class LongestMatchSize(BaseEstimator): 59 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 60 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 61 | 62 | def __name__(self): 63 | return "LongestMatchSize" 64 | 65 | def transform_one(self, obs, target, id): 66 | return dist_utils._longest_match_size(obs, target) 67 | 68 | 69 | class LongestMatchRatio(BaseEstimator): 70 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 71 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 72 | 73 | def __name__(self): 74 | return "LongestMatchRatio" 75 | 76 | def transform_one(self, obs, target, id): 77 | return dist_utils._longest_match_ratio(obs, target) 78 | 79 | 80 | # --------------------------- Attribute based features ------------------------- 81 | class MatchAttrCount(BaseEstimator): 82 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 83 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 84 | 85 | def __name__(self): 86 | return "MatchAttrCount" 87 | 88 | def _str_whole_word(self, str1, str2, 
i_): 89 | cnt = 0 90 | if len(str1) > 0 and len(str2) > 0: 91 | try: 92 | while i_ < len(str2): 93 | i_ = str2.find(str1, i_) 94 | if i_ == -1: 95 | return cnt 96 | else: 97 | cnt += 1 98 | i_ += len(str1) 99 | except: 100 | pass 101 | return cnt 102 | 103 | def transform_one(self, obs, target, id): 104 | cnt = 0 105 | for o in obs.split(" "): 106 | for t in target: 107 | if not t[0].startswith("bullet"): 108 | if self._str_whole_word(obs, t[0], 0): 109 | cnt += 1 110 | return cnt 111 | 112 | 113 | class MatchAttrRatio(MatchQueryCount): 114 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 115 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 116 | 117 | def __name__(self): 118 | return "MatchAttrRatio" 119 | 120 | def transform_one(self, obs, target, id): 121 | lo = len(obs.split(" ")) 122 | lt = len([t[0] for t in target if not t[0].startswith("bullet")]) 123 | return np_utils._try_divide(super().transform_one(obs, target, id), lo*lt) 124 | 125 | 126 | class IsIndoorOutdoorMatch(BaseEstimator): 127 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 128 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 129 | 130 | def __name__(self): 131 | return "IsIndoorOutdoorMatch" 132 | 133 | def transform_one(self, obs, target, id): 134 | os = [] 135 | if obs.find("indoor") != -1: 136 | os.append("indoor") 137 | if obs.find("outdoor") != -1: 138 | os.append("outdoor") 139 | 140 | cnt = 0 141 | for t in target: 142 | if t[0].find("indoor outdoor") != -1: 143 | cnt = 1 144 | ts = t[1].split(" ") 145 | for i in ts: 146 | if i in os: 147 | return 1 148 | if cnt == 0: 149 | return 0 150 | else: 151 | return -1 152 | 153 | 154 | # ---------------------------- Main -------------------------------------- 155 | def main(): 156 | logname = "generate_feature_match_%s.log"%time_utils._timestamp() 157 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 158 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 159 | 160 | generators = [ 161 | MatchQueryCount, 162 | MatchQueryRatio, 163 | LongestMatchSize, 164 | LongestMatchRatio, 165 | ] 166 | obs_fields_list = [] 167 | target_fields_list = [] 168 | obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) 169 | target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) 170 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 171 | for generator in generators: 172 | param_list = [] 173 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 174 | pf.go() 175 | 176 | # product_attribute_list 177 | generators = [ 178 | MatchAttrCount, 179 | MatchAttrRatio, 180 | IsIndoorOutdoorMatch, 181 | ] 182 | obs_fields_list = [] 183 | target_fields_list = [] 184 | obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] ) 185 | target_fields_list.append( ["product_attribute_list"] ) 186 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 187 | for generator in generators: 188 | param_list = [] 189 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 190 | pf.go() 191 | 192 | 193 | if __name__ == "__main__": 194 | main() 195 | -------------------------------------------------------------------------------- 
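A quick worked example of the match features above: the snippet below is a minimal, stand-alone re-implementation of the whole-string counting used by `MatchQueryCount` (and the division by the target word count used by `MatchQueryRatio`). It is illustrative only, not part of the repository, and the sample query/title strings are made up; because the whole query string is searched, a query only scores when it occurs verbatim (after lemmatizing/stemming) inside the target field.

def str_whole_word(str1, str2, i_=0):
    # count non-overlapping occurrences of str1 inside str2, scanning from position i_
    cnt = 0
    if len(str1) > 0 and len(str2) > 0:
        while i_ < len(str2):
            i_ = str2.find(str1, i_)
            if i_ == -1:
                return cnt
            cnt += 1
            i_ += len(str1)
    return cnt

# made-up query/title pair
obs = "angle bracket"
target = "simpson strong tie angle bracket 2x4 angle bracket"
count = str_whole_word(obs, target)      # -> 2 whole-phrase matches
ratio = count / len(target.split(" "))   # MatchQueryRatio: 2 / 8 target words = 0.25
print(count, ratio)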
/Code/Chenglong/feature_query_quality.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: query quality based features 5 | 6 | """ 7 | 8 | import re 9 | import os 10 | import string 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | import config 16 | from config import TRAIN_SIZE 17 | from utils import dist_utils, ngram_utils, nlp_utils, np_utils 18 | from utils import logging_utils, time_utils, pkl_utils 19 | from feature_base import BaseEstimator, StandaloneFeatureWrapper 20 | import google_spelling_checker_dict 21 | 22 | 23 | class QueryQuality(BaseEstimator): 24 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 25 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 26 | 27 | def __name__(self): 28 | return "QueryQuality" 29 | 30 | def transform_one(self, obs, target, id): 31 | return dist_utils._edit_dist(obs, target) 32 | 33 | 34 | class IsInGoogleDict(BaseEstimator): 35 | def __init__(self, obs_corpus, target_corpus, aggregation_mode=""): 36 | super().__init__(obs_corpus, target_corpus, aggregation_mode) 37 | 38 | def __name__(self): 39 | return "IsInGoogleDict" 40 | 41 | def transform_one(self, obs, target, id): 42 | if obs in google_spelling_checker_dict.spelling_checker_dict: 43 | return 1. 44 | else: 45 | return 0. 46 | 47 | 48 | # ---------------------------- Main -------------------------------------- 49 | def main(): 50 | logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp() 51 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 52 | 53 | obs_corpus = [] 54 | query_suffix = [] 55 | # raw 56 | dfAll = pkl_utils._load(config.ALL_DATA_RAW) 57 | obs_corpus.append(dfAll["search_term"].values) 58 | query_suffix.append("raw") 59 | # after processing 60 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) 61 | obs_corpus.append(dfAll["search_term"].values) 62 | query_suffix.append("lemmatized") 63 | # after extracting product_name in search_term 64 | obs_corpus.append(dfAll["search_term_product_name"].values) 65 | query_suffix.append("product_name") 66 | if "search_term_auto_corrected" in dfAll.columns: 67 | # after auto correction 68 | obs_corpus.append(dfAll["search_term_auto_corrected"].values) 69 | query_suffix.append("corrected") 70 | # after stemming 71 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) 72 | obs_corpus.append(dfAll["search_term"].values) 73 | query_suffix.append("stemmed") 74 | 75 | y_train = dfAll["relevance"].values[:TRAIN_SIZE] 76 | for i in range(len(query_suffix)-1): 77 | for j in range(i+1, len(query_suffix)): 78 | ext = QueryQuality(obs_corpus[i], obs_corpus[j]) 79 | x = ext.transform() 80 | dim = np_utils._dim(x) 81 | fname = "%s_%s_x_%s_%dD"%(ext.__name__(), query_suffix[i], query_suffix[j], dim) 82 | pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x) 83 | corr = np_utils._corr(x[:TRAIN_SIZE], y_train) 84 | logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr)) 85 | 86 | # raw 87 | dfAll = pkl_utils._load(config.ALL_DATA_RAW) 88 | obs_fields = ["search_term"] 89 | param_list = [] 90 | sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) 91 | sf.go() 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_transformer.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: feature transformer 5 | 6 | """ 7 | 8 | from collections import Counter 9 | 10 | from sklearn.base import BaseEstimator 11 | 12 | 13 | #### adopted from @Ben Hamner's Python Benchmark code 14 | ## https://www.kaggle.com/benhamner/crowdflower-search-relevance/python-benchmark 15 | def identity(x): 16 | return x 17 | 18 | 19 | class SimpleTransform(BaseEstimator): 20 | def __init__(self, transformer=identity): 21 | self.transformer = transformer 22 | 23 | def fit(self, X, y=None): 24 | return self 25 | 26 | def fit_transform(self, X, y=None): 27 | return self.transform(X) 28 | 29 | def transform(self, X, y=None): 30 | return self.transformer(X) 31 | 32 | 33 | class ColumnSelector(BaseEstimator): 34 | def __init__(self, columns=-1): 35 | # assert (type(columns) == int) or (type(columns) == list) 36 | self.columns = columns 37 | 38 | def fit(self, X, y=None): 39 | return self 40 | 41 | def fit_transform(self, X, y=None): 42 | return self.transform(X) 43 | 44 | def transform(self, X, y=None): 45 | if len(X.shape) == 1: 46 | return X 47 | elif self.columns == -1: 48 | return X 49 | else: 50 | return X[:,self.columns] 51 | 52 | 53 | # feature mapper for mapping rare categorical values to a special case 54 | # example 55 | # mapper = FeatureMapper(10, 0) 56 | # dfTrain = mapper.fit_transform(dfTrain, "Medical_History_2") 57 | # dfTest = mapper.transform(dfTest, "Medical_History_2") 58 | class FeatureMapper: 59 | def __init__(self, threshold, rare_code): 60 | self.threshold = threshold 61 | self.rare_code = rare_code 62 | self.counter = Counter() 63 | self.mapper = {} 64 | 65 | def fit(self, X): 66 | self.counter = Counter(X) 67 | if self.rare_code is None: 68 | most_freq = sorted(self.counter.items(), 69 | key=lambda x: x[1], 70 | reverse=True)[0][0] 71 | self.rare_code = most_freq 72 | self.mapper = {} 73 | for k,v in self.counter.items(): 74 | if v < self.threshold: 75 | self.mapper[k] = self.rare_code 76 | return self 77 | 78 | def transform(self, X): 79 | Y = map(lambda x:self.mapper.get(x, x), X) 80 | return Y 81 | 82 | def fit_transform(self, X): 83 | self.fit(X) 84 | return self.transform(X) 85 | 86 | 87 | class CountFeaturizer: 88 | def __init__(self): 89 | self.mapper = Counter() 90 | 91 | def fit(self, X): 92 | self.mapper = Counter(X) 93 | s = sum(self.mapper.values()) 94 | for k,v in self.mapper.items(): 95 | self.mapper[k] = float(v) / s 96 | return self 97 | 98 | def transform(self, X): 99 | Y = map(lambda x:self.mapper.get(x, 0), X) 100 | return Y 101 | 102 | def fit_transform(self, X): 103 | self.fit(X) 104 | return self.transform(X) 105 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_tsne.R: -------------------------------------------------------------------------------- 1 | # 2 | # @author: Chenglong Chen 3 | # @brief: tsne based features 4 | # 5 | 6 | require(data.table) 7 | require(Rtsne) 8 | 9 | # random seed for reproducibility 10 | set.seed(2016) 11 | 12 | # path 13 | setwd(".") 14 | feat_dir <- "../../Feat/" 15 | 16 | # feature names 17 | fnames <- c( 18 | "LSA100_Word_Unigram_Pair_search_term_x_product_title_100D", 19 | "LSA100_Word_Bigram_Pair_search_term_x_product_title_100D", 20 | "LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D", 21 | "LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D" 22 | ) 23 | 24 | 
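# NOTE: the LSA matrices listed above are expected as plain .csv files under
# feat_dir. As run_data.py shows, they are produced by feature_vector_space.py
# and converted from .pkl to .csv by convert_pkl_lsa_to_csv_lsa.py before this
# script runs; the t-SNE output written below is converted back to .pkl
# features by convert_csv_tsne_to_pkl_tsne.py.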
# setting 25 | perplexity <- 30 26 | theta <- 0.5 27 | dims <- 2 28 | 29 | # run 30 | for(fname in fnames) { 31 | # load lsa features 32 | file_lsa <- paste(feat_dir, fname, ".csv", sep="") 33 | X <- fread(file_lsa, data.table=F) 34 | X <- as.matrix(X) 35 | gc() 36 | 37 | # run tsne 38 | tsne <- Rtsne(X , check_duplicates=FALSE, pca=FALSE, 39 | perplexity=perplexity, theta=theta, dims=dims) 40 | 41 | # save tsne features 42 | col.names <- paste("TSNE_", 1:ncol(tsne$Y), sep="") 43 | file_tsne <- paste(feat_dir, "/TSNE_", fname, ".csv", sep="") 44 | write.table(tsne$Y, file=file_tsne, sep=',', quote=FALSE, 45 | row.names=FALSE, col.names=col.names) 46 | } 47 | -------------------------------------------------------------------------------- /Code/Chenglong/feature_wordnet_similarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: wordnet similarity based features (veeerrry time consuming) 5 | @note: in our final submission, we are only able to generate WordNet_Path_Similarity between 6 | search_term and product_title in reasonable time. 7 | """ 8 | 9 | """ 10 | http://stackoverflow.com/questions/16877517/compare-similarity-of-terms-expressions-using-nltk 11 | http://stackoverflow.com/questions/22031968/how-to-find-distance-between-two-synset-using-python-nltk-in-wordnet-hierarchy 12 | 13 | #---------------------------------------------------------------------------------------- 14 | Path similarity, wup_similarity and lch_similarity, all of these should work 15 | since they are based on the distance between two synsets in the Wordnet hierarchy. 16 | 17 | dog = wn.synset('dog.n.01') 18 | cat = wn.synset('cat.n.01') 19 | 20 | dog.path_similarity(cat) 21 | 22 | dog.lch_similarity(cat) 23 | 24 | dog.wup_similarity(cat) 25 | 26 | #---------------------------------------------------------------------------------------- 27 | synset1.path_similarity(synset2): 28 | 29 | Return a score denoting how similar two word senses are, based on the shortest 30 | path that connects the senses in the is-a (hypernym/hypnoym) taxonomy. The 31 | score is in the range 0 to 1, except in those cases where a path cannot be 32 | found (will only be true for verbs as there are many distinct verb taxonomies), 33 | in which case -1 is returned. A score of 1 represents identity i.e. comparing 34 | a sense with itself will return 1. 35 | 36 | #---------------------------------------------------------------------------------------- 37 | synset1.lch_similarity(synset2), Leacock-Chodorow Similarity: 38 | 39 | Return a score denoting how similar two word senses are, based on the shortest 40 | path that connects the senses (as above) and the maximum depth of the taxonomy 41 | in which the senses occur. The relationship is given as -log(p/2d) where p is 42 | the shortest path length and d the taxonomy depth. 43 | 44 | #---------------------------------------------------------------------------------------- 45 | synset1.wup_similarity(synset2), Wu-Palmer Similarity: 46 | 47 | Return a score denoting how similar two word senses are, based on the depth of the 48 | two senses in the taxonomy and that of their Least Common Subsumer (most specific 49 | ancestor node). Note that at this time the scores given do not always agree with 50 | those given by Pedersen's Perl implementation of Wordnet Similarity. 
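#----------------------------------------------------------------------------------------
For concreteness (values as in NLTK's WordNet howto example):
dog.path_similarity(cat) evaluates to 0.2, i.e. 1/(shortest hypernym path length + 1),
with a path of length 4 between dog.n.01 and cat.n.01.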
51 | """ 52 | 53 | import string 54 | 55 | import numpy as np 56 | import pandas as pd 57 | from nltk.corpus import wordnet as wn 58 | 59 | import config 60 | from utils import dist_utils, ngram_utils, nlp_utils, pkl_utils 61 | from utils import logging_utils, time_utils 62 | from feature_base import BaseEstimator, PairwiseFeatureWrapper 63 | 64 | 65 | # tune the token pattern to get a better correlation with y_train 66 | # token_pattern = r"(?u)\b\w\w+\b" 67 | # token_pattern = r"\w{1,}" 68 | # token_pattern = r"\w+" 69 | # token_pattern = r"[\w']+" 70 | token_pattern = " " # just split the text into tokens 71 | 72 | 73 | class WordNet_Similarity(BaseEstimator): 74 | """Double aggregation features""" 75 | def __init__(self, obs_corpus, target_corpus, metric="path", aggregation_mode_prev="", aggregation_mode=""): 76 | super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev) 77 | self.metric = metric 78 | if self.metric == "path": 79 | self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2) 80 | elif self.metric == "lch": 81 | self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2) 82 | elif self.metric == "wup": 83 | self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2) 84 | else: 85 | raise(ValueError("Wrong similarity metric: %s, should be one of path/lch/wup."%self.metric)) 86 | 87 | def __name__(self): 88 | feat_name = [] 89 | for m1 in self.aggregation_mode_prev: 90 | for m in self.aggregation_mode: 91 | n = "WordNet_%s_Similarity_%s_%s"%( 92 | string.capwords(self.metric), string.capwords(m1), string.capwords(m)) 93 | feat_name.append(n) 94 | return feat_name 95 | 96 | def _maximum_similarity_for_two_synset_list(self, syn_list1, syn_list2): 97 | s = 0. 98 | if syn_list1 and syn_list2: 99 | for syn1 in syn_list1: 100 | for syn2 in syn_list2: 101 | try: 102 | _s = self.metric_func(syn1, syn2) 103 | except: 104 | _s = config.MISSING_VALUE_NUMERIC 105 | if _s and _s > s: 106 | s = _s 107 | return s 108 | 109 | def transform_one(self, obs, target, id): 110 | obs_tokens = nlp_utils._tokenize(obs, token_pattern) 111 | target_tokens = nlp_utils._tokenize(target, token_pattern) 112 | obs_synset_list = [wn.synsets(obs_token) for obs_token in obs_tokens] 113 | target_synset_list = [wn.synsets(target_token) for target_token in target_tokens] 114 | val_list = [] 115 | for obs_synset in obs_synset_list: 116 | _val_list = [] 117 | for target_synset in target_synset_list: 118 | _s = self._maximum_similarity_for_two_synset_list(obs_synset, target_synset) 119 | _val_list.append(_s) 120 | if len(_val_list) == 0: 121 | _val_list = [config.MISSING_VALUE_NUMERIC] 122 | val_list.append( _val_list ) 123 | if len(val_list) == 0: 124 | val_list = [[config.MISSING_VALUE_NUMERIC]] 125 | return val_list 126 | 127 | 128 | class WordNet_Path_Similarity(WordNet_Similarity): 129 | def __init__(self, obs_corpus, target_corpus, aggregation_mode_prev="", aggregation_mode=""): 130 | super().__init__(obs_corpus, target_corpus, "path", aggregation_mode_prev, aggregation_mode) 131 | 132 | 133 | class WordNet_Lch_Similarity(WordNet_Similarity): 134 | def __init__(self, obs_corpus, target_corpus, aggregation_mode_prev="", aggregation_mode=""): 135 | super().__init__(obs_corpus, target_corpus, "lch", aggregation_mode_prev, aggregation_mode) 136 | 137 | 138 | class WordNet_Wup_Similarity(WordNet_Similarity): 139 | def __init__(self, obs_corpus, target_corpus, aggregation_mode_prev="", aggregation_mode=""): 140 | super().__init__(obs_corpus, target_corpus, 
"wup", aggregation_mode_prev, aggregation_mode) 141 | 142 | 143 | # ---------------------------- Main -------------------------------------- 144 | def main(): 145 | logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp() 146 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 147 | #### NOTE: use data BEFORE STEMMING 148 | dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) 149 | 150 | # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in final submission 151 | generators = [ 152 | WordNet_Path_Similarity, 153 | WordNet_Lch_Similarity, 154 | WordNet_Wup_Similarity, 155 | ][:1] 156 | obs_fields_list = [] 157 | target_fields_list = [] 158 | # only search_term and product_title are used in final submission 159 | obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] ) 160 | target_fields_list.append( ["product_title", "product_description", "product_attribute"][:1] ) 161 | # double aggregation 162 | aggregation_mode_prev = ["mean", "max", "min", "median"] 163 | aggregation_mode = ["mean", "std", "max", "min", "median"] 164 | for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): 165 | for generator in generators: 166 | param_list = [aggregation_mode_prev, aggregation_mode] 167 | pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) 168 | pf.go() 169 | 170 | 171 | if __name__ == "__main__": 172 | main() 173 | -------------------------------------------------------------------------------- /Code/Chenglong/gen_best_ensemble_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for generating the best ensemble model from Chenglong's side 5 | @note: 1. make sure you have run `python run_data.py` first 6 | 2. make sure you have built `some diverse` 1st level models first (see `./Log/level1_models` for example) 7 | 8 | """ 9 | 10 | import os 11 | 12 | 13 | cmd = "python run_stacking_ridge.py -l 2 -d 0 -t 10 -c 1 -L reg_ensemble -o" 14 | os.system(cmd) 15 | -------------------------------------------------------------------------------- /Code/Chenglong/gen_best_single_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for generating the best single model from Chenglong's side 5 | @note: 1. make sure you have run `python run_data.py` first 6 | 2. 
RMSE should be something around 0.438 ~ 0.439 7 | 8 | """ 9 | 10 | import os 11 | 12 | 13 | suffix = '201604210409' 14 | threshold = 0.05 15 | 16 | cmd = "python feature_combiner.py -l 1 -c feature_conf_nonlinear_%s -n basic_nonlinear_%s -t %.6f"%(suffix, suffix, threshold) 17 | os.system(cmd) 18 | 19 | cmd = "python task.py -m single -f basic_nonlinear_%s -l reg_xgb_tree_best_single_model -e 1"%suffix 20 | os.system(cmd) 21 | -------------------------------------------------------------------------------- /Code/Chenglong/get_feature_conf_linear.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate feature conf for the following models (most of which are linear models) 5 | - reg_skl_ridge 6 | - reg_skl_bayesian_ridge 7 | - reg_skl_lasso 8 | - reg_skl_lsvr 9 | - reg_xgb_linear 10 | - reg_keras_dnn (nonlinear models) 11 | @note: 12 | - such features DO NOT INCLUDE "DocId_(search_term|product_title|product_color|product_brand)" 13 | - one can tune the MANDATORY_FEATS and COMMENT_OUT_FEATS to generate different feature subset 14 | 15 | """ 16 | 17 | import re 18 | import os 19 | from optparse import OptionParser 20 | 21 | import config 22 | from utils import time_utils 23 | 24 | 25 | INCLUDE_FEATS = [ 26 | ".+" 27 | ] 28 | 29 | 30 | COUNT_FEATS = [ 31 | "Freq", 32 | "Len", 33 | "Count", 34 | "Size", 35 | "Position", 36 | ] 37 | # COUNT_FEATS = [] 38 | 39 | 40 | NOT_COUNT_FEATS = ["Norm", "Ratio"] 41 | 42 | 43 | MANDATORY_FEATS = [ 44 | 45 | # including product_uid according to 46 | # https://www.kaggle.com/c/home-depot-product-search-relevance/forums/t/20288/trends-in-relevances-by-row-ids/115886#post115886 47 | "DocIdEcho_product_uid", 48 | "ProductUidDummy1_product_uid", 49 | "ProductUidDummy2_product_uid", 50 | 51 | "IsInGoogleDict", 52 | "GroupRelevance_Size", 53 | "TSNE", 54 | ] 55 | 56 | 57 | COMMENT_OUT_FEATS = [ 58 | 59 | #-------------- General -------------- 60 | "search_term_alt", 61 | 62 | "Bigram", 63 | "Trigram", 64 | "UBgram", 65 | "UBTgram", 66 | 67 | "Median", 68 | "Std", 69 | 70 | ".+(Bigram|Trigram)_.+_product_(brand|color)", 71 | 72 | 73 | #-------------- Basic -------------- 74 | "DocLogFreq", 75 | "Digit", 76 | "Unique", 77 | "^DocIdOneHot", 78 | "^DocId", 79 | 80 | "DocLen_product_(brand|color)", 81 | "DocLen_product_attribute_1D", 82 | "DocFreq_product_description_1D", 83 | "DocFreq_product_attribute_1D", 84 | "Digit(Count|Ratio)_product_(brand|color)", 85 | "Doc(Entropy|Len)_product_(brand|color)", 86 | "Unique(Count|Ratio)_.+_product_(brand|color)", 87 | 88 | 89 | #-------------- Distance -------------- 90 | "DiceDistance", 91 | # "EditDistance", 92 | "Compression", 93 | 94 | 95 | #-------------- First and Last Ngram -------------- 96 | "FirstIntersectNormPosition", 97 | "FirstIntersectPosition", 98 | "LastIntersectNormPosition", 99 | "LastIntersectPosition", 100 | 101 | 102 | #-------------- Group -------------- 103 | "GroupRelevance_(Mean|Std|Max|Min|Median)", 104 | "Group_\d+", 105 | "GroupDistanceStat", 106 | 107 | 108 | #-------------- Intersect Count & Position -------------- 109 | "IntersectPosition_.+_(Std|Max|Min|Median)", 110 | "IntersectNormPosition_.+_(Std|Max|Min|Median)", 111 | 112 | 113 | #-------------- Match -------------- 114 | "LongestMatchSize", 115 | 116 | 117 | #-------------- StatCooc -------------- 118 | # since product_name is of length 2, it makes no difference for various aggregation as there is only one item 119 | 
"StatCooc(TF|NormTF|TFIDF|NormTFIDF|BM25)_Bigram_(Std|Max|Min|Median)_search_term_product_name_x_product_title_product_name_1D", 120 | "StatCooc(TF|NormTF|TFIDF|NormTFIDF|BM25)_Bigram_(Std|Max|Min|Median)_product_title_product_name_x_search_term_product_name_1D", 121 | 122 | "NormTF", 123 | "NormTFIDF", 124 | 125 | 126 | #-------------- Vector Space -------------- 127 | # as TFIDF_Word_Trigram has the largest corr 128 | "LSA\d+_Word_Unigram", 129 | "LSA\d+_Word_Bigram", 130 | "TFIDF_Word_Unigram", 131 | "TFIDF_Word_Bigram", 132 | 133 | # as TFIDF_Char_Fourgram has the largest corr 134 | "LSA\d+_Char_Bigram", 135 | "LSA\d+_Char_Trigram", 136 | "LSA\d+_Char_Fivegram", 137 | "TFIDF_Char_Bigram", 138 | "TFIDF_Char_Trigram", 139 | "TFIDF_Char_Fivegram", 140 | 141 | "CharDistribution_Ratio", 142 | 143 | 144 | #-------------- Word2Vec & Doc2Vec -------------- 145 | "_Vector_", 146 | "_Vdiff_", 147 | "Word2Vec_Wikipedia_D50", 148 | "Word2Vec_Wikipedia_D100", 149 | "Word2Vec_Wikipedia_D200", 150 | # "Word2Vec_GoogleNews", 151 | "Word2Vec_GoogleNews_D300_Vector", 152 | # as all the words are used to train the model 153 | "Word2Vec_Homedepot_D100_Importance", 154 | "Word2Vec_Homedepot_D100_N_Similarity_Imp", 155 | 156 | 157 | #-------------- Turing Test -------------- 158 | # d = { 159 | # "df_basic_features.csv": "Basic", 160 | # "df_brand_material_dummies.csv": "BrandMaterialDummy", 161 | # "df_dist_new.csv": "Dist", 162 | # "dld_features.csv": "DLD", 163 | # "df_st_tfidf.csv": "StTFIDF", 164 | # "df_tfidf_intersept_new.csv": "TFIDF", 165 | # "df_thekey_dummies.csv": "TheKeyDummy", 166 | # "df_word2vec_new.csv": "Word2Vec", 167 | # } 168 | # "TuringTest_Basic", 169 | # "TuringTest_BrandMaterialDummy", 170 | # "TuringTest_Dist", 171 | # "TuringTest_DLD", 172 | # "TuringTest_StTFIDF", 173 | # "TuringTest_TFIDF", 174 | # "TuringTest_TheKeyDummy", 175 | # "TuringTest_Word2Vec", 176 | 177 | 178 | ] 179 | 180 | 181 | def _check_include(fname): 182 | for v in INCLUDE_FEATS: 183 | pat = re.compile(v) 184 | if len(re.findall(pat, fname)) > 0: 185 | return True 186 | return False 187 | 188 | 189 | def _check_count_feat(fname): 190 | for v in NOT_COUNT_FEATS: 191 | pat = re.compile(v) 192 | if len(re.findall(pat, fname)) > 0: 193 | return False 194 | for v in COUNT_FEATS: 195 | pat = re.compile(v) 196 | if len(re.findall(pat, fname)) > 0: 197 | return True 198 | return False 199 | 200 | 201 | def _check_lsa_matrix(fname): 202 | pat = re.compile("^LSA") 203 | if len(re.findall(pat, fname)) > 0: 204 | return True 205 | return False 206 | 207 | 208 | def _check_mandatory(fname): 209 | for v in MANDATORY_FEATS: 210 | pat = re.compile(v) 211 | if len(re.findall(pat, fname)) > 0: 212 | return True 213 | return False 214 | 215 | 216 | def _check_comment_out(fname): 217 | for v in COMMENT_OUT_FEATS: 218 | pat = re.compile(v) 219 | if len(re.findall(pat, fname)) > 0: 220 | return True 221 | return False 222 | 223 | 224 | header_pattern = """ 225 | # -*- coding: utf-8 -*- 226 | \"\"\" 227 | @author: Chenglong Chen 228 | @brief: one feature conf 229 | 230 | Generated by 231 | python %s -d %d -o %s 232 | 233 | Format: 234 | FEATURE_NAME : (MANDATORY, TRANSFORM) 235 | 236 | \"\"\" 237 | 238 | import config 239 | from feature_transformer import SimpleTransform, ColumnSelector 240 | 241 | LSA_COLUMNS = range(%d) 242 | 243 | feature_dict = { 244 | 245 | """ 246 | 247 | 248 | def _create_feature_conf(lsa_columns, outfile): 249 | res = header_pattern%(__file__, int(lsa_columns), outfile, int(lsa_columns)) 250 | 251 | folders = 
[config.FEAT_DIR, config.FEAT_DIR+"/All"] 252 | for folder in folders: 253 | try: 254 | for file in sorted(os.listdir(folder)): 255 | if config.FEAT_FILE_SUFFIX in file: 256 | fname = file.split(".")[0] 257 | if _check_include(fname): 258 | line = "" 259 | mandatory = _check_mandatory(fname) 260 | if not mandatory and _check_comment_out(fname): 261 | continue 262 | line += "# " 263 | line += "'%s' : "%fname 264 | if mandatory: 265 | line += "(True, " 266 | else: 267 | line += "(False, " 268 | if _check_lsa_matrix(fname): 269 | if int(lsa_columns) > 0: 270 | line += "ColumnSelector(LSA_COLUMNS)),\n" 271 | else: 272 | continue 273 | elif _check_count_feat(fname): 274 | line += "SimpleTransform(config.COUNT_TRANSFORM)),\n" 275 | else: 276 | line += "SimpleTransform()),\n" 277 | res += line 278 | except: 279 | pass 280 | res += "}\n" 281 | 282 | with open(os.path.join(config.FEAT_CONF_DIR, outfile), "w") as f: 283 | f.write(res) 284 | 285 | 286 | def parse_args(parser): 287 | parser.add_option("-d", "--dim", default=1, type=int, dest="lsa_columns", 288 | help="lsa_columns") 289 | parser.add_option("-o", "--outfile", default="feature_conf_%s.py"%time_utils._timestamp(), 290 | type="string", dest="outfile", help="outfile") 291 | 292 | (options, args) = parser.parse_args() 293 | return options, args 294 | 295 | 296 | def main(options): 297 | _create_feature_conf(lsa_columns=options.lsa_columns, outfile=options.outfile) 298 | 299 | 300 | if __name__ == "__main__": 301 | parser = OptionParser() 302 | options, args = parse_args(parser) 303 | main(options) 304 | -------------------------------------------------------------------------------- /Code/Chenglong/get_feature_conf_nonlinear.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate feature conf for the following models 5 | - reg_skl_gbm 6 | - reg_skl_adaboost 7 | - reg_skl_etr 8 | - reg_skl_rf 9 | - reg_xgb_tree 10 | - reg_rgf 11 | @note: 12 | - such features INCLUDE "DocId_(search_term|product_title|product_color|product_brand)" 13 | - one can tune the MANDATORY_FEATS and COMMENT_OUT_FEATS to generate different feature subset 14 | 15 | """ 16 | 17 | import re 18 | import os 19 | from optparse import OptionParser 20 | 21 | import config 22 | from utils import time_utils 23 | 24 | 25 | INCLUDE_FEATS = [ 26 | ".+" 27 | ] 28 | 29 | 30 | COUNT_FEATS = [ 31 | "Freq", 32 | "Len", 33 | "Count", 34 | "Size", 35 | "Position", 36 | ] 37 | # COUNT_FEATS = [] 38 | 39 | 40 | NOT_COUNT_FEATS = ["Norm", "Ratio"] 41 | 42 | 43 | MANDATORY_FEATS = [ 44 | 45 | "DocId_(search_term|product_title|product_color|product_brand)", 46 | 47 | # including product_uid according to 48 | # https://www.kaggle.com/c/home-depot-product-search-relevance/forums/t/20288/trends-in-relevances-by-row-ids/115886#post115886 49 | "DocIdEcho_product_uid", 50 | "ProductUidDummy1_product_uid", 51 | "ProductUidDummy2_product_uid", 52 | 53 | "IsInGoogleDict", 54 | "GroupRelevance_Size", 55 | "TSNE", 56 | ] 57 | 58 | 59 | COMMENT_OUT_FEATS = [ 60 | 61 | #-------------- General -------------- 62 | "search_term_alt", 63 | 64 | "Bigram", 65 | "Trigram", 66 | "UBgram", 67 | "UBTgram", 68 | 69 | "Median", 70 | "Std", 71 | 72 | ".+(Bigram|Trigram)_.+_product_(brand|color)", 73 | 74 | 75 | #-------------- Basic -------------- 76 | "DocLogFreq", 77 | "Digit", 78 | "Unique", 79 | "^DocIdOneHot", 80 | "^DocId", 81 | 82 | "DocLen_product_(brand|color)", 83 | "DocLen_product_attribute_1D", 84 
| "DocFreq_product_description_1D", 85 | "DocFreq_product_attribute_1D", 86 | "Digit(Count|Ratio)_product_(brand|color)", 87 | "Doc(Entropy|Len)_product_(brand|color)", 88 | "Unique(Count|Ratio)_.+_product_(brand|color)", 89 | 90 | 91 | #-------------- Distance -------------- 92 | "DiceDistance", 93 | # "EditDistance", 94 | "Compression", 95 | 96 | 97 | #-------------- First and Last Ngram -------------- 98 | "FirstIntersectNormPosition", 99 | "FirstIntersectPosition", 100 | "LastIntersectNormPosition", 101 | "LastIntersectPosition", 102 | 103 | 104 | #-------------- Group -------------- 105 | "GroupRelevance_(Mean|Std|Max|Min|Median)", 106 | "Group_\d+", 107 | "GroupDistanceStat", 108 | 109 | 110 | #-------------- Intersect Count & Position -------------- 111 | "IntersectPosition_.+_(Std|Max|Min|Median)", 112 | "IntersectNormPosition_.+_(Std|Max|Min|Median)", 113 | 114 | 115 | #-------------- Match -------------- 116 | "LongestMatchSize", 117 | 118 | 119 | #-------------- StatCooc -------------- 120 | # since product_name is of length 2, it makes no difference for various aggregation as there is only one item 121 | "StatCooc(TF|NormTF|TFIDF|NormTFIDF|BM25)_Bigram_(Std|Max|Min|Median)_search_term_product_name_x_product_title_product_name_1D", 122 | "StatCooc(TF|NormTF|TFIDF|NormTFIDF|BM25)_Bigram_(Std|Max|Min|Median)_product_title_product_name_x_search_term_product_name_1D", 123 | 124 | "NormTF", 125 | "NormTFIDF", 126 | 127 | 128 | #-------------- Vector Space -------------- 129 | # as TFIDF_Word_Trigram has the largest corr 130 | "LSA\d+_Word_Unigram", 131 | "LSA\d+_Word_Bigram", 132 | "TFIDF_Word_Unigram", 133 | "TFIDF_Word_Bigram", 134 | 135 | # as TFIDF_Char_Fourgram has the largest corr 136 | "LSA\d+_Char_Bigram", 137 | "LSA\d+_Char_Trigram", 138 | "LSA\d+_Char_Fivegram", 139 | "TFIDF_Char_Bigram", 140 | "TFIDF_Char_Trigram", 141 | "TFIDF_Char_Fivegram", 142 | 143 | "CharDistribution_Ratio", 144 | 145 | 146 | #-------------- Word2Vec & Doc2Vec -------------- 147 | "_Vector_", 148 | "_Vdiff_", 149 | "Word2Vec_Wikipedia_D50", 150 | "Word2Vec_Wikipedia_D100", 151 | "Word2Vec_Wikipedia_D200", 152 | # "Word2Vec_GoogleNews", 153 | "Word2Vec_GoogleNews_D300_Vector", 154 | # as all the words are used to train the model 155 | "Word2Vec_Homedepot_D100_Importance", 156 | "Word2Vec_Homedepot_D100_N_Similarity_Imp", 157 | 158 | 159 | #-------------- Turing Test -------------- 160 | # d = { 161 | # "df_basic_features.csv": "Basic", 162 | # "df_brand_material_dummies.csv": "BrandMaterialDummy", 163 | # "df_dist_new.csv": "Dist", 164 | # "dld_features.csv": "DLD", 165 | # "df_st_tfidf.csv": "StTFIDF", 166 | # "df_tfidf_intersept_new.csv": "TFIDF", 167 | # "df_thekey_dummies.csv": "TheKeyDummy", 168 | # "df_word2vec_new.csv": "Word2Vec", 169 | # } 170 | # "TuringTest_Basic", 171 | # "TuringTest_BrandMaterialDummy", 172 | # "TuringTest_Dist", 173 | # "TuringTest_DLD", 174 | # "TuringTest_StTFIDF", 175 | # "TuringTest_TFIDF", 176 | # "TuringTest_TheKeyDummy", 177 | # "TuringTest_Word2Vec", 178 | 179 | 180 | ] 181 | 182 | 183 | def _check_include(fname): 184 | for v in INCLUDE_FEATS: 185 | pat = re.compile(v) 186 | if len(re.findall(pat, fname)) > 0: 187 | return True 188 | return False 189 | 190 | 191 | def _check_count_feat(fname): 192 | for v in NOT_COUNT_FEATS: 193 | pat = re.compile(v) 194 | if len(re.findall(pat, fname)) > 0: 195 | return False 196 | for v in COUNT_FEATS: 197 | pat = re.compile(v) 198 | if len(re.findall(pat, fname)) > 0: 199 | return True 200 | return False 201 | 202 | 203 | def 
_check_lsa_matrix(fname): 204 | pat = re.compile("^LSA") 205 | if len(re.findall(pat, fname)) > 0: 206 | return True 207 | return False 208 | 209 | 210 | def _check_mandatory(fname): 211 | for v in MANDATORY_FEATS: 212 | pat = re.compile(v) 213 | if len(re.findall(pat, fname)) > 0: 214 | return True 215 | return False 216 | 217 | 218 | def _check_comment_out(fname): 219 | for v in COMMENT_OUT_FEATS: 220 | pat = re.compile(v) 221 | if len(re.findall(pat, fname)) > 0: 222 | return True 223 | return False 224 | 225 | 226 | header_pattern = """ 227 | # -*- coding: utf-8 -*- 228 | \"\"\" 229 | @author: Chenglong Chen 230 | @brief: one feature conf 231 | 232 | Generated by 233 | python %s -d %d -o %s 234 | 235 | Format: 236 | FEATURE_NAME : (MANDATORY, TRANSFORM) 237 | 238 | \"\"\" 239 | 240 | import config 241 | from feature_transformer import SimpleTransform, ColumnSelector 242 | 243 | LSA_COLUMNS = range(%d) 244 | 245 | feature_dict = { 246 | 247 | """ 248 | 249 | 250 | def _create_feature_conf(lsa_columns, outfile): 251 | res = header_pattern%(__file__, int(lsa_columns), outfile, int(lsa_columns)) 252 | 253 | folders = [config.FEAT_DIR, config.FEAT_DIR+"/All"] 254 | for folder in folders: 255 | try: 256 | for file in sorted(os.listdir(folder)): 257 | if config.FEAT_FILE_SUFFIX in file: 258 | fname = file.split(".")[0] 259 | if _check_include(fname): 260 | line = "" 261 | mandatory = _check_mandatory(fname) 262 | if not mandatory and _check_comment_out(fname): 263 | continue 264 | line += "# " 265 | line += "'%s' : "%fname 266 | if mandatory: 267 | line += "(True, " 268 | else: 269 | line += "(False, " 270 | if _check_lsa_matrix(fname): 271 | if int(lsa_columns) > 0: 272 | line += "ColumnSelector(LSA_COLUMNS)),\n" 273 | else: 274 | continue 275 | elif _check_count_feat(fname): 276 | line += "SimpleTransform(config.COUNT_TRANSFORM)),\n" 277 | else: 278 | line += "SimpleTransform()),\n" 279 | res += line 280 | except: 281 | pass 282 | res += "}\n" 283 | 284 | with open(os.path.join(config.FEAT_CONF_DIR, outfile), "w") as f: 285 | f.write(res) 286 | 287 | 288 | def parse_args(parser): 289 | parser.add_option("-d", "--dim", default=1, type=int, dest="lsa_columns", 290 | help="lsa_columns") 291 | parser.add_option("-o", "--outfile", default="feature_conf_%s.py"%time_utils._timestamp(), 292 | type="string", dest="outfile", help="outfile") 293 | 294 | (options, args) = parser.parse_args() 295 | return options, args 296 | 297 | 298 | def main(options): 299 | _create_feature_conf(lsa_columns=options.lsa_columns, outfile=options.outfile) 300 | 301 | 302 | if __name__ == "__main__": 303 | parser = OptionParser() 304 | options, args = parse_args(parser) 305 | main(options) 306 | -------------------------------------------------------------------------------- /Code/Chenglong/get_stacking_feature_conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate stacking feature conf for 2nd and 3rd level models 5 | 6 | """ 7 | 8 | import os 9 | import re 10 | from optparse import OptionParser 11 | 12 | import pandas as pd 13 | 14 | import config 15 | from utils import time_utils 16 | 17 | 18 | def grab(pattern, text): 19 | pat = re.compile(pattern) 20 | group = re.findall(pat, text) 21 | return group 22 | 23 | 24 | def check_valid(model): 25 | file = "%s/All/test.pred.%s.csv" % (config.OUTPUT_DIR, model) 26 | try: 27 | df = pd.read_csv(file) 28 | if df.shape[0] == config.TEST_SIZE: 29 | return True 
30 | else: 31 | return False 32 | except: 33 | return False 34 | 35 | 36 | def get_model_list(log_folder, topN): 37 | tasks_ens = [] 38 | for file in sorted(os.listdir(log_folder)): 39 | if not os.path.isfile(os.path.join(log_folder, file)): 40 | continue 41 | text = open(os.path.join(log_folder, file), "r").read() 42 | 43 | # grab everything we need 44 | tasks = grab("(\[Feat@.*)", text) 45 | rmse_mean = grab("Mean: (.*)", text) 46 | rmse_std = grab("Std: (.*)", text) 47 | rmse_mean = [float(x) for x in rmse_mean] 48 | rmse_std = [float(x) for x in rmse_std] 49 | L = min(len(tasks), len(rmse_mean), len(rmse_std)) 50 | d = dict(zip(tasks[:L], rmse_mean[:L])) 51 | 52 | # keep the top-N 53 | ds = sorted(d.items(), key=lambda x: float(x[1])) 54 | cnt = 0 55 | for t,v in ds: 56 | if check_valid(t): 57 | tasks_ens.append(t) 58 | print("Read %s : %.6f"%(t, v)) 59 | cnt += 1 60 | if cnt >= topN: 61 | break 62 | if cnt > 0: 63 | print("Read %d models from %s"%(cnt, file)) 64 | 65 | return tasks_ens 66 | 67 | 68 | header_pattern = """ 69 | # -*- coding: utf-8 -*- 70 | \"\"\" 71 | @author: Chenglong Chen 72 | @brief: one stacking feature conf 73 | 74 | Generated by 75 | python %s -l %s -t %d -o %s 76 | 77 | \"\"\" 78 | 79 | feature_list = [ 80 | 81 | """ 82 | 83 | 84 | def _create_feature_conf(level, topN, outfile): 85 | log_folder = "%s/level%d_models"%(config.LOG_DIR, level) 86 | feature_list = get_model_list(log_folder, topN) 87 | res = header_pattern%(__file__, level, int(topN), outfile) 88 | for feature in feature_list: 89 | res += '"%s",\n'%feature 90 | res += "]\n" 91 | with open(os.path.join(config.FEAT_CONF_DIR, outfile), "w") as f: 92 | f.write(res) 93 | 94 | 95 | def main(options): 96 | _create_feature_conf(level=options.level, topN=options.topN, outfile=options.outfile) 97 | 98 | 99 | def parse_args(parser): 100 | parser.add_option("-l", "--level", default=2, 101 | type="int", dest="level", help="level") 102 | parser.add_option("-t", "--top", default=10, 103 | type="int", dest="topN", help="top-N") 104 | parser.add_option("-o", "--outfile", 105 | default="stacking_feature_conf_%s.py"%time_utils._timestamp(), 106 | type="string", dest="outfile", help="outfile") 107 | (options, args) = parser.parse_args() 108 | return options, args 109 | 110 | 111 | if __name__ == "__main__": 112 | parser = OptionParser() 113 | options, args = parse_args(parser) 114 | main(options) 115 | -------------------------------------------------------------------------------- /Code/Chenglong/plot_CV_LB.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: plot CV RMSE vs LB RMSE 5 | 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import config 12 | 13 | 14 | def main(): 15 | rmse_cv = [ 16 | # [0.527408,0.000768], 17 | # [0.482010,0.000752], 18 | [0.470570,0.000740], 19 | [0.470197,0.000558], 20 | [0.470167,0.000492], 21 | [0.468127,0.000749], 22 | [0.467613,0.000617], 23 | [0.467570,0.000509], 24 | [0.463124,0.000934], 25 | [0.462973,0.001178], 26 | [0.462632,0.001026], 27 | [0.461406,0.001050], 28 | [0.460582,0.001128], 29 | [0.458092,0.000782], 30 | [0.457421,0.000848], 31 | [0.455473,0.001008], 32 | [0.450111,0.000749], 33 | [0.447134,0.001033], 34 | [0.438318,0.000786], 35 | ] 36 | rmse_lb = [ 37 | # [0.52770,0.52690], 38 | # [0.48067,0.48071], 39 | [0.46982,0.47028], 40 | [0.46968,0.46931], 41 | [0.46986,0.46981], 42 | [0.46864,0.46837], 43 | [0.46569,0.46544], 44 | 
[0.46653,0.46623], 45 | [0.46263,0.46181], 46 | [0.46251,0.46180], 47 | [0.46185,0.46147], 48 | [0.45944,0.45900], 49 | [0.45993,0.45958], 50 | [0.45909,0.45860], 51 | [0.45816,0.45725], 52 | [0.45640,0.45533], 53 | [0.44967,0.44902], 54 | [0.44577,0.44457], 55 | [0.43996,0.43811], 56 | ] 57 | 58 | 59 | rmse_cv = np.asarray(rmse_cv, dtype=float) 60 | rmse_lb = np.asarray(rmse_lb, dtype=float) 61 | 62 | N = rmse_cv.shape[0] 63 | x = np.arange(1,N+1,1) 64 | label = "CV" 65 | plt.errorbar(x, rmse_cv[:,0], 66 | yerr=2*rmse_cv[:,1], 67 | fmt='-o', label=label) 68 | plt.plot(x, rmse_lb[:,0]) 69 | plt.plot(x, rmse_lb[:,1]) 70 | plt.xlim(1, N) 71 | plt.title("CV RMSE vs LB RMSE") 72 | plt.xlabel("#Sub") 73 | plt.ylabel("RMSE") 74 | plt.legend(["CV (+- 2std)", "Public LB", "Private LB"], loc="upper right") 75 | fig_file = "%s/CV_LB_Chenglong.pdf"%config.FIG_DIR 76 | plt.savefig(fig_file) 77 | plt.clf() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /Code/Chenglong/plot_feature_corr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: plot correlation with target relevance for each feature group 5 | 6 | """ 7 | 8 | import os 9 | import re 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | import config 15 | 16 | 17 | def is_feat_log(fname): 18 | pat = re.compile("generate_feature_(.+)_2016") 19 | groups = re.findall(pat, fname) 20 | if len(groups) > 0 and groups[0] != "group_relevance": 21 | return groups[0] 22 | return None 23 | 24 | 25 | def grap_feat_line_corr(line): 26 | pat = re.compile("corr = (.+)") 27 | groups = re.findall(pat, line) 28 | if len(groups) > 0: 29 | return float(groups[0]) 30 | return None 31 | 32 | 33 | def grap_feat_line_name(line): 34 | pat = re.compile("INFO: (.+) \(\d+D\):") 35 | groups = re.findall(pat, line) 36 | if len(groups) > 0: 37 | return groups[0] 38 | return None 39 | 40 | 41 | def grap_feat_corr_dict(fname): 42 | d = {} 43 | with open("%s/feature/%s"%(config.LOG_DIR, fname), "r") as f: 44 | for line in f: 45 | corr = grap_feat_line_corr(line) 46 | if corr is not None: 47 | name = grap_feat_line_name(line) 48 | d[name] = (corr) 49 | return d.values() 50 | 51 | def grap_all_feat_corr_dict(): 52 | d = {} 53 | for fname in sorted(os.listdir("%s/feature"%(config.LOG_DIR))): 54 | name = is_feat_log(fname) 55 | if name is not None: 56 | d[name] = grap_feat_corr_dict(fname) 57 | return d 58 | 59 | def main(): 60 | colors = "rgbcmyk" 61 | d = grap_all_feat_corr_dict() 62 | keys = sorted(d.keys()) 63 | N = len(keys) 64 | fig = plt.figure() 65 | ax = fig.add_subplot(111) 66 | for e,k in enumerate(keys, start=1): 67 | vals = sorted(d[k]) 68 | color = colors[(e-1) % len(colors)] 69 | plt.bar(np.linspace(e-0.48,e+0.48,len(vals)), vals, 70 | width=1./(len(vals)+10), color=color, edgecolor=color) 71 | plt.xlabel("Feature Group", fontsize=15) 72 | plt.ylabel("Correlation Coefficient", fontsize=15) 73 | plt.xticks(range(1,N+1), fontsize=15) 74 | plt.yticks([-0.4, -0.2, 0, 0.2, 0.4], fontsize=15) 75 | ax.set_xticklabels(keys, rotation=45, ha="right") 76 | ax.set_xlim([0, N+1]) 77 | ax.set_ylim([-0.4, 0.4]) 78 | pos1 = ax.get_position() 79 | pos2 = [pos1.x0 - 0.075, pos1.y0 + 0.175, pos1.width * 1.2, pos1.height * 0.85] 80 | ax.set_position(pos2) 81 | plt.show() 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | 
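# ---------------------------------------------------------------------------
# A minimal sketch of what a feature log line looks like and what the two
# regexes above extract from it. The sample line is fabricated, but it follows
# the "%s (%dD): corr = %.6f" message written by the feature scripts (see e.g.
# feature_query_quality.py) combined with the "[%(asctime)s] %(levelname)s:
# %(message)s" format set in logging_utils.py.

def _demo_parse_log_line():
    sample = ("[2016-05-08 13:05:00,123] INFO: "
              "QueryQuality_raw_x_lemmatized_1D (1D): corr = 0.123456")
    name = grap_feat_line_name(sample)   # -> "QueryQuality_raw_x_lemmatized_1D"
    corr = grap_feat_line_corr(sample)   # -> 0.123456
    return name, corr
# ---------------------------------------------------------------------------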
-------------------------------------------------------------------------------- /Code/Chenglong/run_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: generate all the data and features in one shot 5 | @note: if you don't have access to multi-core computers, drop the "&" in the cmd 6 | 7 | """ 8 | 9 | import os 10 | 11 | 12 | #----------------------------------------------------------------------- 13 | # generate split (or you can use mine in ./Data/split/) 14 | cmd = "python splitter.py" 15 | os.system(cmd) 16 | 17 | 18 | #----------------------------------------------------------------------- 19 | # prepare data 20 | cmd = "python data_preparer.py" 21 | os.system(cmd) 22 | 23 | 24 | #----------------------------------------------------------------------- 25 | # process/clean data 26 | cmd = "python data_processor.py" 27 | os.system(cmd) 28 | 29 | 30 | #----------------------------------------------------------------------- 31 | # generate basic features 32 | cmd = "python feature_basic.py &" 33 | os.system(cmd) 34 | 35 | 36 | #----------------------------------------------------------------------- 37 | # generate distance features 38 | cmd = "python feature_distance.py jaccard &" 39 | os.system(cmd) 40 | 41 | cmd = "python feature_distance.py edit &" 42 | os.system(cmd) 43 | 44 | # # not used in the final model 45 | # cmd = "python feature_distance.py compression &" 46 | # os.system(cmd) 47 | 48 | 49 | #----------------------------------------------------------------------- 50 | # generate first and last ngram features 51 | cmd = "python feature_first_last_ngram.py &" 52 | os.system(cmd) 53 | 54 | 55 | #----------------------------------------------------------------------- 56 | # generate group based features (not used in the final model) 57 | # cmd = "python feature_group_distance.py &" 58 | # os.system(cmd) 59 | 60 | # cmd = "python feature_group_distance_stat.py &" 61 | # os.system(cmd) 62 | 63 | cmd = "python feature_group_relevance.py &" 64 | os.system(cmd) 65 | 66 | 67 | #----------------------------------------------------------------------- 68 | # generate intersect features 69 | cmd = "python feature_intersect_count.py &" 70 | os.system(cmd) 71 | 72 | cmd = "python feature_intersect_position.py &" 73 | os.system(cmd) 74 | 75 | 76 | #----------------------------------------------------------------------- 77 | # generate match features 78 | cmd = "python feature_match.py &" 79 | os.system(cmd) 80 | 81 | 82 | #----------------------------------------------------------------------- 83 | # generate query quality features 84 | cmd = "python feature_query_quality.py &" 85 | os.system(cmd) 86 | 87 | 88 | #----------------------------------------------------------------------- 89 | # generate statistical cooccurrence (weighted) features 90 | cmd = "python feature_stat_cooc_tfidf.py tf &" 91 | os.system(cmd) 92 | # cmd = "python feature_stat_cooc_tfidf.py norm_tf &" 93 | # os.system(cmd) 94 | 95 | cmd = "python feature_stat_cooc_tfidf.py tfidf &" 96 | os.system(cmd) 97 | # cmd = "python feature_stat_cooc_tfidf.py norm_tfidf &" 98 | # os.system(cmd) 99 | 100 | cmd = "python feature_stat_cooc_tfidf.py bm25 &" 101 | os.system(cmd) 102 | 103 | 104 | #----------------------------------------------------------------------- 105 | # generate word2vec features using pre-trained word2vec model 106 | cmd = "python feature_word2vec.py google &" 107 | os.system(cmd) 108 | 109 | cmd = 
"python feature_word2vec.py wikipedia &" 110 | os.system(cmd) 111 | 112 | 113 | #----------------------------------------------------------------------- 114 | # generate wordnet similarity features 115 | # time consuming part ~20 hrs 116 | cmd = "python feature_wordnet_similarity.py &" 117 | os.system(cmd) 118 | 119 | 120 | #----------------------------------------------------------------------- 121 | # generate word2vec & doc2vec features using word2vec/doc2vec models 122 | # trained with Homedepot provided data 123 | cmd = "python embedding_trainer.py" 124 | os.system(cmd) 125 | 126 | # most time consuming part 1 ~ 2 days 127 | # after you have trained the WORD2VEC model above, you can MANUALLY distribute the 128 | # feature generation part in feature_word2vec.py to save time, e.g., 129 | # you can run the following part in parallel 130 | # - search_term vs product_title 131 | # - search_term vs product_description 132 | # - search_term vs product_attribute 133 | cmd = "python feature_word2vec.py homedepot &" 134 | os.system(cmd) 135 | 136 | cmd = "python feature_doc2vec.py &" 137 | os.system(cmd) 138 | 139 | 140 | #----------------------------------------------------------------------- 141 | # generate vector space features 142 | # most memory consuming part > 16GB 143 | cmd = "python feature_vector_space.py" 144 | os.system(cmd) 145 | 146 | cmd = "python convert_pkl_lsa_to_csv_lsa.py" 147 | os.system(cmd) 148 | 149 | cmd = "Rscript feature_tsne.R" 150 | os.system(cmd) 151 | 152 | cmd = "python convert_csv_tsne_to_pkl_tsne.py" 153 | os.system(cmd) 154 | -------------------------------------------------------------------------------- /Code/Chenglong/run_stacking_ridge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for testing 2nd & 3rd level model with reg_skl_ridge 5 | 6 | """ 7 | 8 | import os 9 | from optparse import OptionParser 10 | 11 | from utils import time_utils 12 | 13 | 14 | def parse_args(parser): 15 | parser.add_option("-l", "--level", default=2, 16 | type="int", dest="level", help="level") 17 | parser.add_option("-d", "--dim", default=0, 18 | type="int", dest="dim", help="LSA dim") 19 | parser.add_option("-t", "--top", default=10, 20 | type="int", dest="top", help="top N") 21 | parser.add_option("-c", "--corr", default=1.0, 22 | type="float", dest="corr", help="corr") 23 | parser.add_option("-L", "--learner", default="reg_skl_ridge", 24 | type="string", dest="learner", help="learner") 25 | parser.add_option("-o", default=False, action="store_true", dest="refit_once", 26 | help="stacking refit_once") 27 | (options, args) = parser.parse_args() 28 | return options, args 29 | 30 | def main(options): 31 | now = time_utils._timestamp_pretty() 32 | 33 | meta_conf = "level%d_feature_conf_meta_linear_%s"%(options.level, now) 34 | stacking_conf = "level%d_feature_conf_%s"%(options.level, now) 35 | feat_name = "level%d_meta_linear_%s"%(options.level, now) 36 | 37 | # get meta feature conf for `level` models 38 | cmd = "python get_feature_conf_linear_stacking.py -d %d -o %s.py"%( 39 | options.dim, meta_conf) 40 | os.system(cmd) 41 | 42 | # NOTE: using predictions from `level-1` models to generate features 43 | # for `level` models 44 | cmd = "python get_stacking_feature_conf.py -l %d -t %d -o %s.py"%( 45 | options.level-1, options.top, stacking_conf) 46 | os.system(cmd) 47 | 48 | # generate feature for `level` models 49 | cmd = "python feature_combiner.py -l %d -c %s 
-m %s -n %s -s .csv -t %f"%( 50 | options.level, stacking_conf, meta_conf, feat_name, options.corr) 51 | os.system(cmd) 52 | 53 | # train `level` models 54 | if options.refit_once: 55 | cmd = "python task.py -m stacking -f %s -l %s -e 100 -o"%(feat_name, options.learner) 56 | else: 57 | cmd = "python task.py -m stacking -f %s -l %s -e 100"%(feat_name, options.learner) 58 | os.system(cmd) 59 | 60 | if __name__ == "__main__": 61 | parser = OptionParser() 62 | options, args = parse_args(parser) 63 | main(options) 64 | -------------------------------------------------------------------------------- /Code/Chenglong/run_test_ridge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for testing 1st level model with reg_skl_ridge 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from utils import time_utils 12 | 13 | if len(sys.argv) >= 3: 14 | suffix = sys.argv[1] 15 | threshold = float(sys.argv[2]) 16 | else: 17 | suffix = time_utils._timestamp_pretty() 18 | threshold = 0.05 19 | 20 | cmd = "python get_feature_conf_linear.py -d 10 -o feature_conf_linear_%s.py"%suffix 21 | os.system(cmd) 22 | 23 | cmd = "python feature_combiner.py -l 1 -c feature_conf_linear_%s -n basic_linear_%s -t %.6f"%(suffix, suffix, threshold) 24 | os.system(cmd) 25 | 26 | cmd = "python task.py -m single -f basic_linear_%s -l reg_skl_ridge -e 100"%suffix 27 | os.system(cmd) 28 | -------------------------------------------------------------------------------- /Code/Chenglong/run_test_xgb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: script for testing 1st level model with reg_xgb_tree 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from utils import time_utils 12 | 13 | if len(sys.argv) >= 3: 14 | suffix = sys.argv[1] 15 | threshold = float(sys.argv[2]) 16 | else: 17 | suffix = time_utils._timestamp_pretty() 18 | threshold = 0.05 19 | 20 | cmd = "python get_feature_conf_nonlinear.py -d 10 -o feature_conf_nonlinear_%s.py"%suffix 21 | os.system(cmd) 22 | 23 | cmd = "python feature_combiner.py -l 1 -c feature_conf_nonlinear_%s -n basic_nonlinear_%s -t %.6f"%(suffix, suffix, threshold) 24 | os.system(cmd) 25 | 26 | cmd = "python task.py -m single -f basic_nonlinear_%s -l reg_xgb_tree -e 100"%suffix 27 | os.system(cmd) 28 | -------------------------------------------------------------------------------- /Code/Chenglong/turing_test_converter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: convert .csv format dataframe features (from Igor&Kostia) to .pkl format features 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | import imp 11 | from optparse import OptionParser 12 | 13 | import scipy 14 | import numpy as np 15 | import pandas as pd 16 | 17 | import config 18 | from utils import pkl_utils 19 | 20 | 21 | class TuringTestConverter: 22 | def __init__(self, fname, name): 23 | self.fname = fname 24 | self.name = name 25 | 26 | def convert(self): 27 | dfAll = pd.read_csv(self.fname) 28 | columns_to_drop = ["id", "product_uid", "relevance", "search_term", "product_title"] 29 | columns_to_drop = [col for col in columns_to_drop if col in dfAll.columns] 30 | dfAll.drop(columns_to_drop, axis=1, inplace=True) 31 | for col in dfAll.columns: 32 | 
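# save each remaining dataframe column as its own feature file,
# named TuringTest_<name>_<column>.pkl under config.FEAT_DIR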
pkl_utils._save("%s/TuringTest_%s_%s.pkl"%(config.FEAT_DIR, self.name, col), dfAll[col].values) 33 | 34 | 35 | def main(): 36 | d = { 37 | "df_basic_features.csv": "Basic", 38 | "df_brand_material_dummies.csv": "BrandMaterialDummy", 39 | "df_dist_new.csv": "Dist", 40 | "df_st_tfidf.csv": "StTFIDF", 41 | "df_tfidf_intersept_new.csv": "TFIDF", 42 | "df_thekey_dummies.csv": "TheKeyDummy", 43 | "df_word2vec_new.csv": "Word2Vec", 44 | "dld_features.csv": "DLD", 45 | } 46 | 47 | for k,v in d.items(): 48 | converter = TuringTestConverter( 49 | fname="%s/Turing_test/%s"%(config.FEAT_DIR, k), 50 | name=v) 51 | converter.convert() 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Code/Chenglong/utils/__init__.py -------------------------------------------------------------------------------- /Code/Chenglong/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for distance computation 5 | 6 | """ 7 | 8 | import sys 9 | import warnings 10 | warnings.filterwarnings("ignore") 11 | 12 | try: 13 | import lzma 14 | import Levenshtein 15 | except: 16 | pass 17 | import numpy as np 18 | from difflib import SequenceMatcher 19 | from sklearn.metrics.pairwise import cosine_similarity 20 | 21 | from utils import np_utils 22 | sys.path.append("..") 23 | import config 24 | 25 | 26 | def _edit_dist(str1, str2): 27 | try: 28 | # very fast 29 | # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed 30 | # d = Levenshtein.ratio(str1, str2) 31 | d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2))) 32 | except: 33 | # https://docs.python.org/2/library/difflib.html 34 | d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio() 35 | return d 36 | 37 | 38 | def _is_str_match(str1, str2, threshold=1.0): 39 | assert threshold >= 0.0 and threshold <= 1.0, "Wrong threshold." 40 | if float(threshold) == 1.0: 41 | return str1 == str2 42 | else: 43 | return (1. 
- _edit_dist(str1, str2)) >= threshold 44 | 45 | 46 | def _longest_match_size(str1, str2): 47 | sq = SequenceMatcher(lambda x: x==" ", str1, str2) 48 | match = sq.find_longest_match(0, len(str1), 0, len(str2)) 49 | return match.size 50 | 51 | 52 | def _longest_match_ratio(str1, str2): 53 | sq = SequenceMatcher(lambda x: x==" ", str1, str2) 54 | match = sq.find_longest_match(0, len(str1), 0, len(str2)) 55 | return np_utils._try_divide(match.size, min(len(str1), len(str2))) 56 | 57 | 58 | def _compression_dist(x, y, l_x=None, l_y=None): 59 | if x == y: 60 | return 0 61 | x_b = x.encode('utf-8') 62 | y_b = y.encode('utf-8') 63 | if l_x is None: 64 | l_x = len(lzma.compress(x_b)) 65 | l_y = len(lzma.compress(y_b)) 66 | l_xy = len(lzma.compress(x_b+y_b)) 67 | l_yx = len(lzma.compress(y_b+x_b)) 68 | dist = np_utils._try_divide(min(l_xy,l_yx)-min(l_x,l_y), max(l_x,l_y)) 69 | return dist 70 | 71 | 72 | def _cosine_sim(vec1, vec2): 73 | try: 74 | s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0] 75 | except: 76 | try: 77 | s = cosine_similarity(vec1, vec2)[0][0] 78 | except: 79 | s = config.MISSING_VALUE_NUMERIC 80 | return s 81 | 82 | 83 | def _vdiff(vec1, vec2): 84 | return vec1 - vec2 85 | 86 | 87 | def _rmse(vec1, vec2): 88 | vdiff = vec1 - vec2 89 | rmse = np.sqrt(np.mean(vdiff**2)) 90 | return rmse 91 | 92 | 93 | def _KL(dist1, dist2): 94 | "Kullback-Leibler Divergence" 95 | return np.sum(dist1 * np.log(dist1/dist2), axis=1) 96 | 97 | 98 | def _jaccard_coef(A, B): 99 | if not isinstance(A, set): 100 | A = set(A) 101 | if not isinstance(B, set): 102 | B = set(B) 103 | return np_utils._try_divide(float(len(A.intersection(B))), len(A.union(B))) 104 | 105 | 106 | def _dice_dist(A, B): 107 | if not isinstance(A, set): 108 | A = set(A) 109 | if not isinstance(B, set): 110 | B = set(B) 111 | return np_utils._try_divide(2.*float(len(A.intersection(B))), (len(A) + len(B))) 112 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/keras_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for Keras models 5 | 6 | """ 7 | 8 | from sklearn.preprocessing import StandardScaler 9 | from keras.models import Sequential 10 | from keras.layers.core import Dense, Layer, Dropout, Activation 11 | from keras.layers.normalization import BatchNormalization 12 | from keras.layers.advanced_activations import ELU, PReLU 13 | from keras.optimizers import SGD 14 | from keras.utils import np_utils, generic_utils 15 | 16 | 17 | class KerasDNNRegressor: 18 | def __init__(self, input_dropout=0.2, hidden_layers=2, hidden_units=64, 19 | hidden_activation="relu", hidden_dropout=0.5, batch_norm=None, 20 | optimizer="adadelta", nb_epoch=10, batch_size=64): 21 | self.input_dropout = input_dropout 22 | self.hidden_layers = hidden_layers 23 | self.hidden_units = hidden_units 24 | self.hidden_activation = hidden_activation 25 | self.hidden_dropout = hidden_dropout 26 | self.batch_norm = batch_norm 27 | self.optimizer = optimizer 28 | self.nb_epoch = nb_epoch 29 | self.batch_size = batch_size 30 | self.scaler = None 31 | self.model = None 32 | 33 | def __str__(self): 34 | return self.__repr__() 35 | 36 | def __repr__(self): 37 | return ("%s(input_dropout=%f, hidden_layers=%d, hidden_units=%d, \n" 38 | "hidden_activation=\'%s\', hidden_dropout=%f, batch_norm=\'%s\', \n" 39 | "optimizer=\'%s\', nb_epoch=%d, batch_size=%d)" % ( 40 | 
self.__class__.__name__, 41 | self.input_dropout, 42 | self.hidden_layers, 43 | self.hidden_units, 44 | self.hidden_activation, 45 | self.hidden_dropout, 46 | str(self.batch_norm), 47 | self.optimizer, 48 | self.nb_epoch, 49 | self.batch_size, 50 | )) 51 | 52 | 53 | def fit(self, X, y): 54 | ## scaler 55 | self.scaler = StandardScaler() 56 | X = self.scaler.fit_transform(X) 57 | 58 | #### build model 59 | self.model = Sequential() 60 | ## input layer 61 | self.model.add(Dropout(self.input_dropout, input_shape=(X.shape[1],))) 62 | ## hidden layers 63 | first = True 64 | hidden_layers = self.hidden_layers 65 | while hidden_layers > 0: 66 | self.model.add(Dense(self.hidden_units)) 67 | if self.batch_norm == "before_act": 68 | self.model.add(BatchNormalization()) 69 | if self.hidden_activation == "prelu": 70 | self.model.add(PReLU()) 71 | elif self.hidden_activation == "elu": 72 | self.model.add(ELU()) 73 | else: 74 | self.model.add(Activation(self.hidden_activation)) 75 | if self.batch_norm == "after_act": 76 | self.model.add(BatchNormalization()) 77 | self.model.add(Dropout(self.hidden_dropout)) 78 | hidden_layers -= 1 79 | 80 | ## output layer 81 | output_dim = 1 82 | output_act = "linear" 83 | self.model.add(Dense(output_dim)) 84 | self.model.add(Activation(output_act)) 85 | 86 | ## loss 87 | if self.optimizer == "sgd": 88 | sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) 89 | self.model.compile(loss="mse", optimizer=sgd) 90 | else: 91 | self.model.compile(loss="mse", optimizer=self.optimizer) 92 | 93 | ## fit 94 | self.model.fit(X, y, 95 | nb_epoch=self.nb_epoch, 96 | batch_size=self.batch_size, 97 | validation_split=0, verbose=0) 98 | return self 99 | 100 | def predict(self, X): 101 | X = self.scaler.transform(X) 102 | y_pred = self.model.predict(X) 103 | y_pred = y_pred.flatten() 104 | return y_pred 105 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for logging 5 | 6 | """ 7 | 8 | import os 9 | import logging 10 | import logging.handlers 11 | 12 | 13 | def _get_logger(logdir, logname, loglevel=logging.INFO): 14 | fmt = "[%(asctime)s] %(levelname)s: %(message)s" 15 | formatter = logging.Formatter(fmt) 16 | 17 | handler = logging.handlers.RotatingFileHandler( 18 | filename=os.path.join(logdir, logname), 19 | maxBytes=10*1024*1024, 20 | backupCount=10) 21 | handler.setFormatter(formatter) 22 | 23 | logger = logging.getLogger("") 24 | logger.addHandler(handler) 25 | logger.setLevel(loglevel) 26 | return logger 27 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/ngram_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for ngram 5 | 6 | """ 7 | 8 | 9 | def _unigrams(words): 10 | """ 11 | Input: a list of words, e.g., ["I", "am", "Denny"] 12 | Output: a list of unigram 13 | """ 14 | assert type(words) == list 15 | return words 16 | 17 | 18 | def _bigrams(words, join_string, skip=0): 19 | """ 20 | Input: a list of words, e.g., ["I", "am", "Denny"] 21 | Output: a list of bigram, e.g., ["I_am", "am_Denny"] 22 | I use _ as join_string for this example. 
23 | """ 24 | assert type(words) == list 25 | L = len(words) 26 | if L > 1: 27 | lst = [] 28 | for i in range(L-1): 29 | for k in range(1,skip+2): 30 | if i+k < L: 31 | lst.append( join_string.join([words[i], words[i+k]]) ) 32 | else: 33 | # set it as unigram 34 | lst = _unigrams(words) 35 | return lst 36 | 37 | 38 | def _trigrams(words, join_string, skip=0): 39 | """ 40 | Input: a list of words, e.g., ["I", "am", "Denny"] 41 | Output: a list of trigram, e.g., ["I_am_Denny"] 42 | I use _ as join_string for this example. 43 | """ 44 | assert type(words) == list 45 | L = len(words) 46 | if L > 2: 47 | lst = [] 48 | for i in range(L-2): 49 | for k1 in range(1,skip+2): 50 | for k2 in range(1,skip+2): 51 | if i+k1 < L and i+k1+k2 < L: 52 | lst.append( join_string.join([words[i], words[i+k1], words[i+k1+k2]]) ) 53 | else: 54 | # set it as bigram 55 | lst = _bigrams(words, join_string, skip) 56 | return lst 57 | 58 | 59 | def _fourgrams(words, join_string): 60 | """ 61 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 62 | Output: a list of trigram, e.g., ["I_am_Denny_boy"] 63 | I use _ as join_string for this example. 64 | """ 65 | assert type(words) == list 66 | L = len(words) 67 | if L > 3: 68 | lst = [] 69 | for i in xrange(L-3): 70 | lst.append( join_string.join([words[i], words[i+1], words[i+2], words[i+3]]) ) 71 | else: 72 | # set it as trigram 73 | lst = _trigrams(words, join_string) 74 | return lst 75 | 76 | 77 | def _uniterms(words): 78 | return _unigrams(words) 79 | 80 | 81 | def _biterms(words, join_string): 82 | """ 83 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 84 | Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"] 85 | I use _ as join_string for this example. 86 | """ 87 | assert type(words) == list 88 | L = len(words) 89 | if L > 1: 90 | lst = [] 91 | for i in range(L-1): 92 | for j in range(i+1,L): 93 | lst.append( join_string.join([words[i], words[j]]) ) 94 | else: 95 | # set it as uniterm 96 | lst = _uniterms(words) 97 | return lst 98 | 99 | 100 | def _triterms(words, join_string): 101 | """ 102 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 103 | Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"] 104 | I use _ as join_string for this example. 105 | """ 106 | assert type(words) == list 107 | L = len(words) 108 | if L > 2: 109 | lst = [] 110 | for i in xrange(L-2): 111 | for j in xrange(i+1,L-1): 112 | for k in xrange(j+1,L): 113 | lst.append( join_string.join([words[i], words[j], words[k]]) ) 114 | else: 115 | # set it as biterm 116 | lst = _biterms(words, join_string) 117 | return lst 118 | 119 | 120 | def _fourterms(words, join_string): 121 | """ 122 | Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"] 123 | Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"] 124 | I use _ as join_string for this example. 
125 | """ 126 | assert type(words) == list 127 | L = len(words) 128 | if L > 3: 129 | lst = [] 130 | for i in xrange(L-3): 131 | for j in xrange(i+1,L-2): 132 | for k in xrange(j+1,L-1): 133 | for l in xrange(k+1,L): 134 | lst.append( join_string.join([words[i], words[j], words[k], words[l]]) ) 135 | else: 136 | # set it as triterm 137 | lst = _triterms(words, join_string) 138 | return lst 139 | 140 | 141 | _ngram_str_map = { 142 | 1: "Unigram", 143 | 2: "Bigram", 144 | 3: "Trigram", 145 | 4: "Fourgram", 146 | 5: "Fivegram", 147 | 12: "UBgram", 148 | 123: "UBTgram", 149 | } 150 | 151 | 152 | def _ngrams(words, ngram, join_string=" "): 153 | """wrapper for ngram""" 154 | if ngram == 1: 155 | return _unigrams(words) 156 | elif ngram == 2: 157 | return _bigrams(words, join_string) 158 | elif ngram == 3: 159 | return _trigrams(words, join_string) 160 | elif ngram == 4: 161 | return _fourgrams(words, join_string) 162 | elif ngram == 12: 163 | unigram = _unigrams(words) 164 | bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2] 165 | return unigram + bigram 166 | elif ngram == 123: 167 | unigram = _unigrams(words) 168 | bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2] 169 | trigram = [x for x in _trigrams(words, join_string) if len(x.split(join_string)) == 3] 170 | return unigram + bigram + trigram 171 | 172 | 173 | _nterm_str_map = { 174 | 1: "Uniterm", 175 | 2: "Biterm", 176 | 3: "Triterm", 177 | 4: "Fourterm", 178 | 5: "Fiveterm", 179 | } 180 | 181 | 182 | def _nterms(words, nterm, join_string=" "): 183 | """wrapper for nterm""" 184 | if nterm == 1: 185 | return _uniterms(words) 186 | elif nterm == 2: 187 | return _biterms(words, join_string) 188 | elif nterm == 3: 189 | return _triterms(words, join_string) 190 | elif nterm == 4: 191 | return _fourterms(words, join_string) 192 | 193 | 194 | if __name__ == "__main__": 195 | 196 | text = "I am Denny boy ha" 197 | words = text.split(" ") 198 | 199 | assert _ngrams(words, 1) == ["I", "am", "Denny", "boy", "ha"] 200 | assert _ngrams(words, 2) == ["I am", "am Denny", "Denny boy", "boy ha"] 201 | assert _ngrams(words, 3) == ["I am Denny", "am Denny boy", "Denny boy ha"] 202 | assert _ngrams(words, 4) == ["I am Denny boy", "am Denny boy ha"] 203 | 204 | assert _nterms(words, 1) == ["I", "am", "Denny", "boy", "ha"] 205 | assert _nterms(words, 2) == ["I am", "I Denny", "I boy", "I ha", "am Denny", "am boy", "am ha", "Denny boy", "Denny ha", "boy ha"] 206 | assert _nterms(words, 3) == ["I am Denny", "I am boy", "I am ha", "I Denny boy", "I Denny ha", "I boy ha", "am Denny boy", "am Denny ha", "am boy ha", "Denny boy ha"] 207 | assert _nterms(words, 4) == ["I am Denny boy", "I am Denny ha", "I am boy ha", "I Denny boy ha", "am Denny boy ha"] 208 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/nlp_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for nlp 5 | 6 | """ 7 | 8 | import re 9 | 10 | 11 | def _tokenize(text, token_pattern=" "): 12 | # token_pattern = r"(?u)\b\w\w+\b" 13 | # token_pattern = r"\w{1,}" 14 | # token_pattern = r"\w+" 15 | # token_pattern = r"[\w']+" 16 | if token_pattern == " ": 17 | # just split the text into tokens 18 | return text.split(" ") 19 | else: 20 | token_pattern = re.compile(token_pattern, flags = re.UNICODE | re.LOCALE) 21 | group = token_pattern.findall(text) 22 | return group 
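# --- Illustrative usage of _tokenize (editorial sketch, not part of the original file) ---
# With the default token_pattern the text is simply split on single spaces; passing a
# regex pattern instead routes through re.findall. A hypothetical quick check:
if __name__ == "__main__":
    assert _tokenize("angle bracket 3 in.") == ["angle", "bracket", "3", "in."]
    # e.g. _tokenize("angle bracket 3 in.", r"\w+") would instead yield
    # ["angle", "bracket", "3", "in"] (regex-based tokenization drops punctuation)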
23 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/np_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for numpy 5 | 6 | """ 7 | 8 | import sys 9 | 10 | import numpy as np 11 | from scipy.stats import pearsonr 12 | from collections import Counter 13 | 14 | sys.path.append("..") 15 | import config 16 | 17 | 18 | def _sigmoid(score): 19 | p = 1. / (1. + np.exp(-score)) 20 | return p 21 | 22 | 23 | def _logit(p): 24 | return np.log(p/(1.-p)) 25 | 26 | 27 | def _softmax(score): 28 | score = np.asarray(score, dtype=float) 29 | score = np.exp(score - np.max(score)) 30 | score /= np.sum(score, axis=1)[:,np.newaxis] 31 | return score 32 | 33 | 34 | def _cast_proba_predict(proba): 35 | N = proba.shape[1] 36 | w = np.arange(1,N+1) 37 | pred = proba * w[np.newaxis,:] 38 | pred = np.sum(pred, axis=1) 39 | return pred 40 | 41 | 42 | def _one_hot_label(label, n_classes): 43 | num = label.shape[0] 44 | tmp = np.zeros((num, n_classes), dtype=int) 45 | tmp[np.arange(num),label.astype(int)] = 1 46 | return tmp 47 | 48 | 49 | def _majority_voting(x, weight=None): 50 | ## apply weight 51 | if weight is not None: 52 | assert len(weight) == len(x) 53 | x = np.repeat(x, weight) 54 | c = Counter(x) 55 | value, count = c.most_common()[0] 56 | return value 57 | 58 | 59 | def _voter(x, weight=None): 60 | idx = np.isfinite(x) 61 | if sum(idx) == 0: 62 | value = config.MISSING_VALUE_NUMERIC 63 | else: 64 | if weight is not None: 65 | value = _majority_voting(x[idx], weight[idx]) 66 | else: 67 | value = _majority_voting(x[idx]) 68 | return value 69 | 70 | 71 | def _array_majority_voting(X, weight=None): 72 | y = np.apply_along_axis(_voter, axis=1, arr=X, weight=weight) 73 | return y 74 | 75 | 76 | def _mean(x): 77 | idx = np.isfinite(x) 78 | if sum(idx) == 0: 79 | value = float(config.MISSING_VALUE_NUMERIC) # cast it to float to accommodate the np.mean 80 | else: 81 | value = np.mean(x[idx]) # this is float! 82 | return value 83 | 84 | 85 | def _array_mean(X): 86 | y = np.apply_along_axis(_mean, axis=1, arr=X) 87 | return y 88 | 89 | 90 | def _corr(x, y_train): 91 | if _dim(x) == 1: 92 | corr = pearsonr(x.flatten(), y_train)[0] 93 | if str(corr) == "nan": 94 | corr = 0. 95 | else: 96 | corr = 1. 
97 | return corr 98 | 99 | 100 | def _dim(x): 101 | d = 1 if len(x.shape) == 1 else x.shape[1] 102 | return d 103 | 104 | 105 | def _entropy(proba): 106 | entropy = -np.sum(proba*np.log(proba)) 107 | return entropy 108 | 109 | 110 | def _try_divide(x, y, val=0.0): 111 | """try to divide two numbers""" 112 | if y != 0.0: 113 | val = float(x) / y 114 | return val 115 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/os_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for os 5 | 6 | """ 7 | 8 | import os 9 | import time 10 | import shutil 11 | 12 | 13 | def _gen_signature(): 14 | # get pid and current time 15 | pid = int(os.getpid()) 16 | now = int(time.time()) 17 | # signature 18 | signature = "%d_%d" % (pid, now) 19 | return signature 20 | 21 | def _create_dirs(dirs): 22 | for dir in dirs: 23 | if not os.path.exists(dir): 24 | os.makedirs(dir) 25 | 26 | def _remove_files(files): 27 | for file in files: 28 | os.remove(file) 29 | 30 | def _remove_dirs(dirs): 31 | for dir in dirs: 32 | shutil.rmtree(dir) 33 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/pkl_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for pickle 5 | 6 | """ 7 | 8 | import pickle 9 | 10 | 11 | def _save(fname, data, protocol=3): 12 | with open(fname, "wb") as f: 13 | pickle.dump(data, f, protocol) 14 | 15 | def _load(fname): 16 | with open(fname, "rb") as f: 17 | return pickle.load(f) 18 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/rgf_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for RGF models 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | import numpy as np 12 | 13 | from . 
import os_utils 14 | sys.path.append("..") 15 | import config 16 | 17 | 18 | class RGFRegressor: 19 | def __init__(self, reg_L2=0.1, reg_sL2=0.0001, max_leaf_forest=10000, num_iteration_opt=10, 20 | num_tree_search=1, min_pop=10, opt_interval=100, opt_stepsize=0.5): 21 | 22 | self.param = { 23 | "reg_L2": reg_L2, 24 | "reg_sL2": reg_sL2, 25 | "max_leaf_forest": max_leaf_forest, 26 | "num_iteration_opt": num_iteration_opt, 27 | "num_tree_search": num_tree_search, 28 | "min_pop": min_pop, 29 | "opt_interval": opt_interval, 30 | "opt_stepsize": opt_stepsize, 31 | } 32 | 33 | # create tmp dir to hold data and model (especially the latter) 34 | self.tmp_dir = "%s/%s"%(config.TMP_DIR, os_utils._gen_signature()) 35 | os_utils._create_dirs([self.tmp_dir]) 36 | self.model_fn_prefix = "%s/rgf_model"%self.tmp_dir 37 | 38 | def __del__(self): 39 | ## delete tmp dir 40 | os_utils._remove_dirs([self.tmp_dir]) 41 | 42 | def __str__(self): 43 | return "RGFRegressor" 44 | 45 | def fit(self, X, y): 46 | 47 | # write train data to file 48 | train_x_fn = "%s/data.x"%self.tmp_dir 49 | train_y_fn = "%s/data.y"%self.tmp_dir 50 | np.savetxt(train_x_fn, X, fmt="%.6f", delimiter="\t") 51 | np.savetxt(train_y_fn, y, fmt="%.6f", delimiter="\t") 52 | 53 | ## write train param to file 54 | params = [ 55 | "train_x_fn=",train_x_fn,"\n", 56 | "train_y_fn=",train_y_fn,"\n", 57 | #"train_w_fn=",weight_train_path,"\n", 58 | "model_fn_prefix=",self.model_fn_prefix,"\n", 59 | "reg_L2=", self.param["reg_L2"], "\n", 60 | "reg_sL2=", self.param["reg_sL2"], "\n", 61 | #"reg_depth=", 1.01, "\n", 62 | "algorithm=","RGF","\n", 63 | "loss=","LS","\n", 64 | #"opt_interval=", 100, "\n", 65 | # save model at the end of training 66 | "test_interval=", self.param["max_leaf_forest"],"\n", 67 | "max_leaf_forest=", self.param["max_leaf_forest"],"\n", 68 | "num_iteration_opt=", self.param["num_iteration_opt"], "\n", 69 | "num_tree_search=", self.param["num_tree_search"], "\n", 70 | "min_pop=", self.param["min_pop"], "\n", 71 | "opt_interval=", self.param["opt_interval"], "\n", 72 | "opt_stepsize=", self.param["opt_stepsize"], "\n", 73 | "NormalizeTarget" 74 | ] 75 | params = "".join([str(p) for p in params]) 76 | 77 | rgf_setting = "%s/rgf_setting"%self.tmp_dir # DOES NOT contain ".inp" 78 | with open(rgf_setting+".inp", "w") as f: 79 | f.write(params) 80 | 81 | ## train rgf 82 | rgf_log = "%s/rgf_log"%self.tmp_dir 83 | cmd = "perl %s %s train %s >> %s"%( 84 | config.RGF_CALL_EXE, config.RGF_EXE, rgf_setting, rgf_log) 85 | os.system(cmd) 86 | 87 | return self 88 | 89 | def predict(self, X): 90 | 91 | ## write data to file 92 | valid_x_fn = "%s/data.x"%self.tmp_dir 93 | valid_y_fn = "%s/data.y"%self.tmp_dir 94 | np.savetxt(valid_x_fn, X, fmt="%.6f", delimiter="\t") 95 | 96 | ## write predict params to file 97 | model_fn = self.model_fn_prefix + "-01" 98 | params = [ 99 | "test_x_fn=", valid_x_fn,"\n", 100 | "model_fn=", model_fn,"\n", 101 | "prediction_fn=", valid_y_fn 102 | ] 103 | params = "".join([str(p) for p in params]) 104 | 105 | rgf_setting = "%s/rgf_setting"%self.tmp_dir 106 | with open(rgf_setting+".inp", "w") as f: 107 | f.write(params) 108 | 109 | ## predict 110 | rgf_log = "%s/rgf_log"%self.tmp_dir 111 | cmd = "perl %s %s predict %s >> %s"%( 112 | config.RGF_CALL_EXE, config.RGF_EXE, rgf_setting, rgf_log) 113 | os.system(cmd) 114 | 115 | y_pred = np.loadtxt(valid_y_fn, dtype=float) 116 | 117 | return y_pred 118 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/skl_utils.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for scikit-learn models 5 | 6 | """ 7 | 8 | import numpy as np 9 | import sklearn.svm 10 | import sklearn.neighbors 11 | import sklearn.ensemble 12 | from sklearn.linear_model import Ridge 13 | from sklearn.tree import DecisionTreeRegressor 14 | from sklearn.tree import ExtraTreeRegressor 15 | from sklearn.pipeline import Pipeline 16 | from sklearn.preprocessing import StandardScaler, PolynomialFeatures 17 | 18 | from . import dist_utils 19 | 20 | 21 | class SVR: 22 | def __init__(self, kernel='rbf', degree=3, gamma='auto', C=1.0, 23 | epsilon=0.1, normalize=True, cache_size=2048): 24 | svr = sklearn.svm.SVR(kernel=kernel, degree=degree, 25 | gamma=gamma, C=C, epsilon=epsilon) 26 | if normalize: 27 | self.model = Pipeline([('ss', StandardScaler()), ('svr', svr)]) 28 | else: 29 | self.model = svr 30 | 31 | def __str__(self): 32 | return "SVR" 33 | 34 | def fit(self, X, y): 35 | self.model.fit(X, y) 36 | return self 37 | 38 | def predict(self, X): 39 | y_pred = self.model.predict(X) 40 | return y_pred 41 | 42 | 43 | class LinearSVR: 44 | def __init__(self, epsilon=0.0, C=1.0, loss='epsilon_insensitive', 45 | random_state=None, normalize=True): 46 | lsvr = sklearn.svm.LinearSVR(epsilon=epsilon, C=C, 47 | loss=loss, random_state=random_state) 48 | if normalize: 49 | self.model = Pipeline([('ss', StandardScaler()), ('lsvr', lsvr)]) 50 | else: 51 | self.model = lsvr 52 | 53 | def __str__(self): 54 | return "LinearSVR" 55 | 56 | def fit(self, X, y): 57 | self.model.fit(X, y) 58 | return self 59 | 60 | def predict(self, X): 61 | y_pred = self.model.predict(X) 62 | return y_pred 63 | 64 | 65 | class KNNRegressor: 66 | def __init__(self, n_neighbors=5, weights='uniform', leaf_size=30, 67 | metric='minkowski', normalize=True): 68 | if metric == 'cosine': 69 | metric = lambda x,y: dist_utils._cosine_sim(x, y) 70 | knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, 71 | leaf_size=leaf_size, metric=metric) 72 | if normalize: 73 | self.model = Pipeline([('ss', StandardScaler()), ('knn', knn)]) 74 | else: 75 | self.model = knn 76 | 77 | def __str__(self): 78 | return "KNNRegressor" 79 | 80 | def fit(self, X, y): 81 | self.model.fit(X, y) 82 | return self 83 | 84 | def predict(self, X): 85 | y_pred = self.model.predict(X) 86 | return y_pred 87 | 88 | 89 | class AdaBoostRegressor: 90 | def __init__(self, base_estimator=None, n_estimators=50, max_features=1.0, 91 | max_depth=6, learning_rate=1.0, loss='linear', random_state=None): 92 | if base_estimator and base_estimator == 'etr': 93 | base_estimator = ExtraTreeRegressor(max_depth=max_depth, 94 | max_features=max_features) 95 | else: 96 | base_estimator = DecisionTreeRegressor(max_depth=max_depth, 97 | max_features=max_features) 98 | 99 | self.model = sklearn.ensemble.AdaBoostRegressor( 100 | base_estimator=base_estimator, 101 | n_estimators=n_estimators, 102 | learning_rate=learning_rate, 103 | random_state=random_state, 104 | loss=loss) 105 | 106 | def __str__(self): 107 | return "AdaBoostRegressor" 108 | 109 | def fit(self, X, y): 110 | self.model.fit(X, y) 111 | return self 112 | 113 | def predict(self, X): 114 | y_pred = self.model.predict(X) 115 | return y_pred 116 | 117 | 118 | class RandomRidge: 119 | def __init__(self, alpha=1.0, normalize=True, poly=False, 120 | n_estimators=10, max_features=1.0, 121 | bootstrap=True, subsample=1.0, 122 | 
random_state=2016): 123 | self.alpha = alpha 124 | self.normalize = normalize 125 | self.poly = poly 126 | self.n_estimators = n_estimators 127 | if isinstance(max_features, float): 128 | assert max_features > 0 and max_features <= 1 129 | self.max_features = max_features 130 | self.bootstrap = bootstrap 131 | assert subsample > 0 and subsample <= 1 132 | self.subsample = subsample 133 | self.random_state = random_state 134 | self.ridge_list = [0]*self.n_estimators 135 | self.feature_idx_list = [0]*self.n_estimators 136 | 137 | def __str__(self): 138 | return "RandomRidge" 139 | 140 | def _random_feature_idx(self, fdim, random_state): 141 | rng = np.random.RandomState(random_state) 142 | if isinstance(self.max_features, int): 143 | size = min(fdim, self.max_features) 144 | else: 145 | size = int(fdim * self.max_features) 146 | idx = rng.permutation(fdim)[:size] 147 | return idx 148 | 149 | def _random_sample_idx(self, sdim, random_state): 150 | rng = np.random.RandomState(random_state) 151 | size = int(sdim * self.subsample) 152 | if self.bootstrap: 153 | idx = rng.randint(sdim, size=size) 154 | else: 155 | idx = rng.permutation(sdim)[:size] 156 | return idx 157 | 158 | def fit(self, X, y): 159 | sdim, fdim = X.shape 160 | for i in range(self.n_estimators): 161 | ridge = Ridge(alpha=self.alpha, normalize=self.normalize, random_state=self.random_state) 162 | fidx = self._random_feature_idx(fdim, self.random_state+i*100) 163 | sidx = self._random_sample_idx(sdim, self.random_state+i*10) 164 | X_tmp = X[sidx][:,fidx] 165 | if self.poly: 166 | X_tmp = PolynomialFeatures(degree=2).fit_transform(X_tmp)[:,1:] 167 | ridge.fit(X_tmp, y[sidx]) 168 | self.ridge_list[i] = ridge 169 | self.feature_idx_list[i] = fidx 170 | return self 171 | 172 | def predict(self, X): 173 | y_pred = np.zeros((X.shape[0], self.n_estimators)) 174 | for i in range(self.n_estimators): 175 | fidx = self.feature_idx_list[i] 176 | ridge = self.ridge_list[i] 177 | X_tmp = X[:,fidx] 178 | if self.poly: 179 | X_tmp = PolynomialFeatures(degree=2).fit_transform(X_tmp)[:,1:] 180 | y_pred[:,i] = ridge.predict(X_tmp) 181 | y_pred = np.mean(y_pred, axis=1) 182 | return y_pred 183 | -------------------------------------------------------------------------------- /Code/Chenglong/utils/time_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for time 5 | 6 | """ 7 | 8 | import datetime 9 | 10 | 11 | def _timestamp(): 12 | now = datetime.datetime.now() 13 | now_str = now.strftime("%Y-%m-%d-%H-%M") 14 | return now_str 15 | 16 | 17 | def _timestamp_pretty(): 18 | now = datetime.datetime.now() 19 | now_str = now.strftime("%Y%m%d%H%M") 20 | return now_str 21 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/config_IgorKostia.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This is config for HomeDepot Project: Igor&Kostia's part 4 | 5 | Competition: HomeDepot Search Relevance 6 | Author: Igor Buinyi 7 | Team: Turing test 8 | """ 9 | 10 | 11 | import os 12 | ROOT_DIR = os.getcwd() 13 | 14 | DATA_DIR= "%s/data"%ROOT_DIR 15 | PROCESSINGTEXT_DIR= "%s/processing_text"%ROOT_DIR 16 | FEATURES_DIR= "%s/features"%ROOT_DIR 17 | SAVEDMODELS_DIR= "%s/saved_models"%ROOT_DIR 18 | MODELS_DIR= "%s/models"%ROOT_DIR 19 | MODELSENSEMBLE_DIR= "%s/models_ensemble"%ROOT_DIR 20 | FEATURESETS_DIR="%s/feature_sets"%ROOT_DIR 
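# Note (editorial comment): the os.mkdir calls below create only the output folders.
# DATA_DIR and FEATURESETS_DIR are not created here and are assumed to exist already
# (input data and the shipped feature_sets folder, respectively).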
21 | 22 | 23 | if not os.path.exists(PROCESSINGTEXT_DIR): 24 | os.mkdir(PROCESSINGTEXT_DIR) 25 | if not os.path.exists(FEATURES_DIR): 26 | os.mkdir(FEATURES_DIR) 27 | if not os.path.exists(SAVEDMODELS_DIR): 28 | os.mkdir(SAVEDMODELS_DIR) 29 | if not os.path.exists(MODELS_DIR): 30 | os.mkdir(MODELS_DIR) 31 | if not os.path.exists(MODELSENSEMBLE_DIR): 32 | os.mkdir(MODELSENSEMBLE_DIR) 33 | 34 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/first_part_2000.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | wordFor_in_title_string_only_letratio,103 3 | two_words_in_description_string_only_sum,181 4 | pt_pd__unigram_dice_dist,488 5 | beforethekeys_pathsimilarity_max,310 6 | keyword_in_description_num,272 7 | beforethekey_before2thekey_lchsimilarity_max,372 8 | beforethekey_thekey_ressimilarity_max,338 9 | pt_pd__triterm_dice_dist,517 10 | beforethekeys_thekey_ressimilarity_max,350 11 | tfidf_vbg_in_description_let,416 12 | beforethekey_before2thekey_pathsimilarity_mean,371 13 | st_at__trigram_dice_dist,499 14 | nn_important_in_nn_unimportant_in_description_num,199 15 | size_of_brands_in_product_description,28 16 | pd_at_unigram_tfidf,587 17 | seqmatch_desc&bullets_ratioscaled,383 18 | st_tfidf_8.1,457 19 | st_pd__fourgram_dice_dist,504 20 | query_material_in_description_convoluted,99 21 | pd_at__bigram_jaccard_coef,530 22 | pt_at_unigram_tfidf,579 23 | tfidf_jj_rb_in_bullets_let,423 24 | vbg_in_vbg_in_title_sum,157 25 | no_bullets_dummy,2 26 | pt_ab_fourgram_tfidf,578 27 | st_pd__biterm_dice_dist,510 28 | st_pt_fourgram_tfidf,556 29 | thekey_before2thekey_ressimilarity_max,326 30 | before2thekey_thekey_pathsimilarity_max,340 31 | len_of_materials_in_query,21 32 | query_lchhsimilarity_max,354 33 | query_brand_in_brand_convoluted,75 34 | query_in_description,165 35 | tfidf_nn_unimportant_in_title_num,406 36 | word_in_description_string_only_let,173 37 | len_of_brands_in_product_title,23 38 | st_at__bigram_jaccard_coef,527 39 | beforethekey_thekey_pathsimilarity_max,334 40 | thekeys_lchsimilarity_mean,307 41 | common_digits_in_description_jaccard,186 42 | perc_digits_in_description,69 43 | title_pathsimilarity_mean,359 44 | word_in_description_string_only_num,171 45 | st_at__fourgram_dice_dist,505 46 | st_tfidf_7.1,456 47 | query_brand_in_all_nomatch,79 48 | jj_rb_in_jj_rb_in_title_sum,154 49 | pt_pd_fourgram_tfidf,574 50 | description_similarity_11-20,427 51 | query_brand_in_all_fullmatch,76 52 | vbg_in_vbg_in_description_sum,211 53 | tfidf_vbg_in_bullets_let,417 54 | st_pd__triterm_dice_dist,515 55 | st_at__trigram_jaccard_coef,533 56 | seqmatch_title_ratioscaled,377 57 | query_brand_in_all_convoluted,80 58 | len_of_attribute_bullets_woBM,18 59 | st_tfidf_7,444 60 | tfidf_nn_important_in_title_num,400 61 | nn_important_in_title_let,135 62 | two_words_in_bullets_num,231 63 | st_at__triterm_jaccard_coef,550 64 | description_similarity_11-20to10,435 65 | tfidf_nn_important_in_bullets_let,405 66 | tfidf_jj_rb_in_title_num,418 67 | two_words_in_description_let,179 68 | nn_important_in_nn_important_in_title_letratio,144 69 | len_of_product_title_woBM,16 70 | query_pathsimilarity_max,352 71 | vbg_in_vbg_in_bullets_sum,265 72 | st_tfidf_5.1,454 73 | two_words_in_description_string_only_num,180 74 | query_brand_in_title_fullmatch,81 75 | pt_pd__bigram_jaccard_coef,528 76 | wordWith_in_bullets_string_only_letratio,218 77 | pd_at__trigram_dice_dist,502 78 | 
two_words_in_bullets_let,233 79 | st_at__biterm_jaccard_coef,545 80 | st_tfidf_1,438 81 | len_of_query_woBM,9 82 | beforethekey_in_bullets_sum,293 83 | wordWith_in_title_string_only_num,104 84 | size_of_brands_in_product_title,24 85 | beforethekey_before2thekey_ressimilarity_mean,375 86 | nn_unimportant_in_title_num,138 87 | thekey_before2thekey_pathsimilarity_mean,323 88 | nn_important_in_nn_important_in_bullets_letratio,252 89 | beforethekeys_ressimilarity_mean,315 90 | seqmatch_desc&bullets_ratio,382 91 | len_of_product_title_keys,40 92 | thekey_in_bullets_sum,289 93 | word_in_description_numratio,169 94 | tfidf_title_querybeforethekey_num,397 95 | nn_important_in_nn_important_in_bullets_let,251 96 | query_brand_in_bullets_convoluted,87 97 | pd_at__fourgram_dice_dist,508 98 | nn_important_in_nn_important_in_title_sum,142 99 | beforethekeys_thekey_lchsimilarity_max,348 100 | word_in_bullets_let,222 101 | st_tfidf_6.1,455 102 | ratio_of_nn_unimportant_in_search_term,48 103 | keyword_in_titlekeys_jaclet,283 104 | query_brand_in_brand_partialmatch,72 105 | st_pt__fourgram_jaccard_coef,537 106 | ratio_of_nn_unimportant_in_attribute_bullets,63 107 | st_at__unigram_dice_dist,487 108 | word_in_bullets_string_only_num,225 109 | beforethekey_in_bullets_let,294 110 | beforethekey_thekey_pathsimilarity_mean,335 111 | nn_unimportant_in_nn_important_in_description_num,203 112 | word_in_description_string_only_letratio,175 113 | before2thekey_beforethekey_lchsimilarity_max,366 114 | thekeys_in_title,284 115 | wordFor_in_bullets_string_only_letratio,215 116 | len_of_product_title_thekey,42 117 | initial_len_of_query,8 118 | word2vec_13,474 119 | vbg_in_vbg_in_title_let,158 120 | pt_ab_bigram_tfidf,576 121 | query_brand_in_description_convoluted,86 122 | avg_wordlength_in_query,6 123 | tfidf_nn_unimportant_in_title_let,409 124 | seqmatch_bullets_ratio,380 125 | query_material_in_all_fullmatch,93 126 | jj_rb_in_jj_rb_in_bullets_let,263 127 | word_in_bullets_string_only_letratio,229 128 | st_tfidf_9,446 129 | len_of_query_keys,39 130 | beforethekeys_thekey_pathsimilarity_max,346 131 | nn_unimportant_in_description_letratio,194 132 | thekey_before2thekey_pathsimilarity_max,322 133 | word2vec_22,483 134 | pd_ab_unigram_tfidf,583 135 | word2vec_6,467 136 | pd_at_trigram_tfidf,589 137 | st_pd__bigram_jaccard_coef,526 138 | word2vec_17,478 139 | st_pd__biterm_jaccard_coef,544 140 | word_in_bullets_num,220 141 | query_material_in_all_assumedmatch,95 142 | keyword_in_bullets_num,277 143 | pt_pd__trigram_dice_dist,500 144 | word_in_description_let,168 145 | size_of_brands_in_attribute_bullets,32 146 | size_of_materials_in_query,22 147 | pt_pd__biterm_jaccard_coef,546 148 | nn_unimportant_in_nn_important_in_description_sum,204 149 | nn_unimportant_in_title_letratio,140 150 | pt_at__triterm_jaccard_coef,552 151 | jj_rb_in_jj_rb_in_description_let,209 152 | query_brand_in_title_partialmatch,82 153 | pt_at__triterm_dice_dist,518 154 | st_pt__fourgram_dice_dist,503 155 | len_of_query_beforethekey,43 156 | word_in_title_string_only_letratio,120 157 | jj_rb_in_jj_rb_in_title_let,155 158 | jj_rb_in_jj_rb_in_description_num,207 159 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/first_part_3000.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | pt_pd__bigram_jaccard_coef,346 3 | description_similarity_20,10 4 | beforethekeys_in_beforethekeys,357 5 | st_tfidf_0.1,74 6 | 
query_brand_in_bullets_convoluted,330 7 | thekey_beforethekey_pathsimilarity_mean,107 8 | common_digits_in_title_jaccard,53 9 | st_pd_unigram_tfidf,166 10 | 1word_string_dld_in_pts,358 11 | tfidf_nn_unimportant_in_title_let,278 12 | st_tfidf_3.1,58 13 | ratio_of_jj_rb_in_product_description,151 14 | len_of_product_title,344 15 | pt_at__trigram_dice_dist,320 16 | word2vec_11,8 17 | st_at__unigram_jaccard_coef,304 18 | nn_unimportant_in_description_let,342 19 | pt_pd__unigram_jaccard_coef,246 20 | word2vec_22,4 21 | nn_unimportant_in_title_let,367 22 | st_tfidf_5.1,98 23 | thekey_in_bullets_let,185 24 | word_in_bullets_numratio,321 25 | 1word_dld_in_pt,324 26 | word2vec_1,40 27 | tfidf_description_num,68 28 | tfidf_matchdescription_stringonly_num,249 29 | ratio_of_jj_rb_in_attribute_bullets,234 30 | nn_important_in_title_letratio,145 31 | seqmatch_description_ratioscaled,81 32 | st_tfidf_10,67 33 | st_pd__trigram_dice_dist,350 34 | 2word_string_dld_in_pds,385 35 | tfidf_description_let,80 36 | nn_important_in_nn_important_in_bullets_letratio,438 37 | keyword_in_bullets_letratio,189 38 | nn_unimportant_in_bullets_let,373 39 | tfidf_title_num,65 40 | wordFor_in_title_string_only_let,265 41 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/first_part_3010.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | pt_at__bigram_dice_dist,91 3 | pt_at__triterm_jaccard_coef,148 4 | st_at__bigram_dice_dist,89 5 | word2vec_16,73 6 | word2vec_5,62 7 | ab_at_fourgram_tfidf,190 8 | st_pt__trigram_jaccard_coef,127 9 | pd_ab_unigram_tfidf,179 10 | st_tfidf_11,44 11 | st_ab_bigram_tfidf,160 12 | st_pd__trigram_jaccard_coef,128 13 | st_pt_triterm_tfidf,154 14 | 1word_dld_in_pt,0 15 | 1word_string_dld_in_pt,2 16 | pd_ab_trigram_tfidf,181 17 | 1word_string_dld_in_pts,3 18 | above15_dummy_frequency_of_beforethekey_thekey,191 19 | st_ab_unigram_tfidf,159 20 | st_tfidf_0,33 21 | st_pt__triterm_dice_dist,110 22 | pd_at_trigram_tfidf,185 23 | word2vec_2,59 24 | st_tfidf_9.1,54 25 | word2vec_22,79 26 | word2vec_9,66 27 | 1word_dld_in_at,24 28 | pd_at_unigram_tfidf,183 29 | pt_at_trigram_tfidf,177 30 | st_pt_bigram_tfidf,150 31 | word2vec_1,58 32 | pt_pd_bigram_tfidf,168 33 | st_pt__triterm_jaccard_coef,144 34 | pd_at__unigram_jaccard_coef,120 35 | word2vec_23,80 36 | pt_pd__triterm_jaccard_coef,147 37 | st_pt__biterm_dice_dist,105 38 | st_tfidf_2,35 39 | st_tfidf_3.1,48 40 | word2vec_11,68 41 | 2word_dld_in_pt,4 42 | pt_at__triterm_dice_dist,114 43 | pd_at__unigram_dice_dist,86 44 | pd_at__bigram_jaccard_coef,126 45 | pt_ab_bigram_tfidf,172 46 | pd_at_fourgram_tfidf,186 47 | st_pt_fourgram_tfidf,152 48 | pt_pd__fourgram_dice_dist,102 49 | st_tfidf_1,34 50 | 2word_string_dld_in_ab,22 51 | pt_at__bigram_jaccard_coef,125 52 | st_tfidf_4,37 53 | st_pd__unigram_dice_dist,82 54 | 2word_dld_in_abs,21 55 | pt_at_bigram_tfidf,176 56 | st_at__fourgram_jaccard_coef,135 57 | pt_pd_fourgram_tfidf,170 58 | st_tfidf_10.1,55 59 | st_pd__bigram_jaccard_coef,122 60 | 1word_string_dld_in_pd,10 61 | st_at__triterm_jaccard_coef,146 62 | pt_pd__fourgram_jaccard_coef,136 63 | 2word_dld_in_pds,13 64 | pt_pd__trigram_dice_dist,96 65 | word2vec_3,60 66 | st_at__triterm_dice_dist,112 67 | st_pd__fourgram_jaccard_coef,134 68 | 2word_dld_in_at,28 69 | st_tfidf_11.1,56 70 | 1word_string_dld_in_abs,19 71 | pd_at__bigram_dice_dist,92 72 | 2word_string_dld_in_pd,14 73 | st_pt__biterm_jaccard_coef,139 74 | 
st_pt__bigram_jaccard_coef,121 75 | 1word_string_dld_in_at,26 76 | st_pt__bigram_dice_dist,87 77 | st_at__biterm_jaccard_coef,141 78 | 1word_dld_in_pts,1 79 | 2word_dld_in_ab,20 80 | st_pd_unigram_tfidf,155 81 | pt_at__fourgram_jaccard_coef,137 82 | st_at_trigram_tfidf,165 83 | pd_at__trigram_jaccard_coef,132 84 | 2word_string_dld_in_pds,15 85 | pt_pd__unigram_dice_dist,84 86 | st_pt__unigram_dice_dist,81 87 | word2vec_18,75 88 | pt_at__fourgram_dice_dist,103 89 | st_at__fourgram_dice_dist,101 90 | st_tfidf_2.1,47 91 | word2vec_17,74 92 | 1word_dld_in_ats,25 93 | ab_at_bigram_tfidf,188 94 | 2word_string_dld_in_pt,6 95 | ab_at_trigram_tfidf,189 96 | st_tfidf_7,40 97 | 2word_string_dld_in_ats,31 98 | 2word_string_dld_in_pts,7 99 | st_at_unigram_tfidf,163 100 | 2word_dld_in_ats,29 101 | st_pt_trigram_tfidf,151 102 | st_pt__fourgram_dice_dist,99 103 | st_pt__trigram_dice_dist,93 104 | st_at__unigram_jaccard_coef,117 105 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/first_part_3020.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | word_in_description_string_only_letratio,56 3 | query_brand_in_title_convoluted,57 4 | st_pt_trigram_tfidf,197 5 | nn_important_in_title_let,190 6 | above8_dummy_frequency_of_beforethekey_thekey,179 7 | beforethekeys_pathsimilarity_max,74 8 | word2vec_7,77 9 | word2vec_12,189 10 | 2word_dld_in_pts,167 11 | nn_important_in_nn_unimportant_in_description_num,51 12 | word2vec_21,75 13 | st_pt__biterm_jaccard_coef,63 14 | description_similarity_21-30rel,192 15 | nn_important_in_nn_important_in_title_num,133 16 | nn_unimportant_in_nn_important_in_description_letratio,27 17 | 1word_dld_in_pds,120 18 | query_brand_in_all_fullmatch,65 19 | query_brand_in_title_fullmatch,21 20 | st_tfidf_6.1,98 21 | two_words_in_description_num,156 22 | nn_unimportant_in_title_letratio,93 23 | tfidf_title_querythekey_num,101 24 | tfidf_matchtitle_num,124 25 | len_of_brands_in_query,12 26 | 2word_string_dld_in_pt,11 27 | nn_unimportant_in_nn_important_in_description_num,201 28 | keyword_in_titlekeys_let,135 29 | word_in_title_string_only_sum,114 30 | nn_important_in_nn_important_in_title_letratio,187 31 | description_similarity_10,66 32 | query_brand_in_all_convoluted,155 33 | two_words_in_description_sum,151 34 | 2word_dld_in_pt,142 35 | beforethekeys_lchsimilarity_max,157 36 | beforethekeys_lchsimilarity_mean,158 37 | two_words_in_description_string_only_sum,122 38 | nn_important_in_nn_important_in_description_letratio,173 39 | seqmatch_description_ratioscaled,53 40 | word_in_title_string_only_let,193 41 | thekey_in_thekey,175 42 | 2word_string_dld_in_ats,137 43 | st_pd_unigram_tfidf,61 44 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/readme.txt: -------------------------------------------------------------------------------- 1 | This folder contains feature sets that are necessaey to reproduce our calculations of `Ensemble_B` in Step IK5 (see readme.md in the root folder). 
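Each CSV in this folder has two columns, feature_name and number, listing the features selected for the corresponding feature set.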
-------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/second_part_2000.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | wordFor_in_title_string_only_letratio,103 3 | two_words_in_description_string_only_sum,181 4 | pt_pd__unigram_dice_dist,488 5 | beforethekeys_pathsimilarity_max,310 6 | keyword_in_description_num,272 7 | beforethekey_before2thekey_lchsimilarity_max,372 8 | beforethekey_thekey_ressimilarity_max,338 9 | pt_pd__triterm_dice_dist,517 10 | beforethekeys_thekey_ressimilarity_max,350 11 | tfidf_vbg_in_description_let,416 12 | beforethekey_before2thekey_pathsimilarity_mean,371 13 | st_at__trigram_dice_dist,499 14 | nn_important_in_nn_unimportant_in_description_num,199 15 | size_of_brands_in_product_description,28 16 | pd_at_unigram_tfidf,587 17 | seqmatch_desc&bullets_ratioscaled,383 18 | st_tfidf_8.1,457 19 | st_pd__fourgram_dice_dist,504 20 | query_material_in_description_convoluted,99 21 | pd_at__bigram_jaccard_coef,530 22 | pt_at_unigram_tfidf,579 23 | tfidf_jj_rb_in_bullets_let,423 24 | vbg_in_vbg_in_title_sum,157 25 | no_bullets_dummy,2 26 | pt_ab_fourgram_tfidf,578 27 | st_pd__biterm_dice_dist,510 28 | st_pt_fourgram_tfidf,556 29 | thekey_before2thekey_ressimilarity_max,326 30 | before2thekey_thekey_pathsimilarity_max,340 31 | len_of_materials_in_query,21 32 | query_lchhsimilarity_max,354 33 | query_brand_in_brand_convoluted,75 34 | query_in_description,165 35 | tfidf_nn_unimportant_in_title_num,406 36 | word_in_description_string_only_let,173 37 | len_of_brands_in_product_title,23 38 | st_at__bigram_jaccard_coef,527 39 | beforethekey_thekey_pathsimilarity_max,334 40 | thekeys_lchsimilarity_mean,307 41 | common_digits_in_description_jaccard,186 42 | perc_digits_in_description,69 43 | title_pathsimilarity_mean,359 44 | word_in_description_string_only_num,171 45 | st_at__fourgram_dice_dist,505 46 | st_tfidf_7.1,456 47 | query_brand_in_all_nomatch,79 48 | jj_rb_in_jj_rb_in_title_sum,154 49 | pt_pd_fourgram_tfidf,574 50 | description_similarity_11-20,427 51 | query_brand_in_all_fullmatch,76 52 | vbg_in_vbg_in_description_sum,211 53 | tfidf_vbg_in_bullets_let,417 54 | st_pd__triterm_dice_dist,515 55 | st_at__trigram_jaccard_coef,533 56 | seqmatch_title_ratioscaled,377 57 | query_brand_in_all_convoluted,80 58 | len_of_attribute_bullets_woBM,18 59 | st_tfidf_7,444 60 | tfidf_nn_important_in_title_num,400 61 | nn_important_in_title_let,135 62 | two_words_in_bullets_num,231 63 | st_at__triterm_jaccard_coef,550 64 | description_similarity_11-20to10,435 65 | tfidf_nn_important_in_bullets_let,405 66 | tfidf_jj_rb_in_title_num,418 67 | two_words_in_description_let,179 68 | nn_important_in_nn_important_in_title_letratio,144 69 | len_of_product_title_woBM,16 70 | query_pathsimilarity_max,352 71 | vbg_in_vbg_in_bullets_sum,265 72 | st_tfidf_5.1,454 73 | two_words_in_description_string_only_num,180 74 | query_brand_in_title_fullmatch,81 75 | pt_pd__bigram_jaccard_coef,528 76 | wordWith_in_bullets_string_only_letratio,218 77 | pd_at__trigram_dice_dist,502 78 | two_words_in_bullets_let,233 79 | st_at__biterm_jaccard_coef,545 80 | st_tfidf_1,438 81 | len_of_query_woBM,9 82 | beforethekey_in_bullets_sum,293 83 | wordWith_in_title_string_only_num,104 84 | size_of_brands_in_product_title,24 85 | beforethekey_before2thekey_ressimilarity_mean,375 86 | nn_unimportant_in_title_num,138 87 | thekey_before2thekey_pathsimilarity_mean,323 88 | 
nn_important_in_nn_important_in_bullets_letratio,252 89 | beforethekeys_ressimilarity_mean,315 90 | seqmatch_desc&bullets_ratio,382 91 | len_of_product_title_keys,40 92 | thekey_in_bullets_sum,289 93 | word_in_description_numratio,169 94 | tfidf_title_querybeforethekey_num,397 95 | nn_important_in_nn_important_in_bullets_let,251 96 | query_brand_in_bullets_convoluted,87 97 | pd_at__fourgram_dice_dist,508 98 | nn_important_in_nn_important_in_title_sum,142 99 | beforethekeys_thekey_lchsimilarity_max,348 100 | word_in_bullets_let,222 101 | st_tfidf_6.1,455 102 | ratio_of_nn_unimportant_in_search_term,48 103 | keyword_in_titlekeys_jaclet,283 104 | query_brand_in_brand_partialmatch,72 105 | st_pt__fourgram_jaccard_coef,537 106 | ratio_of_nn_unimportant_in_attribute_bullets,63 107 | st_at__unigram_dice_dist,487 108 | word_in_bullets_string_only_num,225 109 | beforethekey_in_bullets_let,294 110 | beforethekey_thekey_pathsimilarity_mean,335 111 | nn_unimportant_in_nn_important_in_description_num,203 112 | word_in_description_string_only_letratio,175 113 | before2thekey_beforethekey_lchsimilarity_max,366 114 | thekeys_in_title,284 115 | wordFor_in_bullets_string_only_letratio,215 116 | len_of_product_title_thekey,42 117 | initial_len_of_query,8 118 | word2vec_13,474 119 | vbg_in_vbg_in_title_let,158 120 | pt_ab_bigram_tfidf,576 121 | query_brand_in_description_convoluted,86 122 | avg_wordlength_in_query,6 123 | tfidf_nn_unimportant_in_title_let,409 124 | seqmatch_bullets_ratio,380 125 | query_material_in_all_fullmatch,93 126 | jj_rb_in_jj_rb_in_bullets_let,263 127 | word_in_bullets_string_only_letratio,229 128 | st_tfidf_9,446 129 | len_of_query_keys,39 130 | beforethekeys_thekey_pathsimilarity_max,346 131 | nn_unimportant_in_description_letratio,194 132 | thekey_before2thekey_pathsimilarity_max,322 133 | word2vec_22,483 134 | pd_ab_unigram_tfidf,583 135 | word2vec_6,467 136 | pd_at_trigram_tfidf,589 137 | st_pd__bigram_jaccard_coef,526 138 | word2vec_17,478 139 | st_pd__biterm_jaccard_coef,544 140 | word_in_bullets_num,220 141 | query_material_in_all_assumedmatch,95 142 | keyword_in_bullets_num,277 143 | pt_pd__trigram_dice_dist,500 144 | word_in_description_let,168 145 | size_of_brands_in_attribute_bullets,32 146 | size_of_materials_in_query,22 147 | pt_pd__biterm_jaccard_coef,546 148 | nn_unimportant_in_nn_important_in_description_sum,204 149 | nn_unimportant_in_title_letratio,140 150 | pt_at__triterm_jaccard_coef,552 151 | jj_rb_in_jj_rb_in_description_let,209 152 | query_brand_in_title_partialmatch,82 153 | pt_at__triterm_dice_dist,518 154 | st_pt__fourgram_dice_dist,503 155 | len_of_query_beforethekey,43 156 | word_in_title_string_only_letratio,120 157 | jj_rb_in_jj_rb_in_title_let,155 158 | jj_rb_in_jj_rb_in_description_num,207 159 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/feature_sets/second_part_3000.csv: -------------------------------------------------------------------------------- 1 | feature_name,number 2 | pt_pd__bigram_jaccard_coef,346 3 | description_similarity_20,10 4 | beforethekeys_in_beforethekeys,357 5 | st_tfidf_0.1,74 6 | query_brand_in_bullets_convoluted,330 7 | thekey_beforethekey_pathsimilarity_mean,107 8 | common_digits_in_title_jaccard,53 9 | st_pd_unigram_tfidf,166 10 | 1word_string_dld_in_pts,358 11 | tfidf_nn_unimportant_in_title_let,278 12 | st_tfidf_3.1,58 13 | ratio_of_jj_rb_in_product_description,151 14 | len_of_product_title,344 15 | pt_at__trigram_dice_dist,320 16 | word2vec_11,8 17 | 
st_at__unigram_jaccard_coef,304 18 | nn_unimportant_in_description_let,342 19 | pt_pd__unigram_jaccard_coef,246 20 | word2vec_22,4 21 | nn_unimportant_in_title_let,367 22 | st_tfidf_5.1,98 23 | thekey_in_bullets_let,185 24 | word_in_bullets_numratio,321 25 | 1word_dld_in_pt,324 26 | word2vec_1,40 27 | tfidf_description_num,68 28 | tfidf_matchdescription_stringonly_num,249 29 | ratio_of_jj_rb_in_attribute_bullets,234 30 | nn_important_in_title_letratio,145 31 | seqmatch_description_ratioscaled,81 32 | st_tfidf_10,67 33 | st_pd__trigram_dice_dist,350 34 | 2word_string_dld_in_pds,385 35 | tfidf_description_let,80 36 | nn_important_in_nn_important_in_bullets_letratio,438 37 | keyword_in_bullets_letratio,189 38 | nn_unimportant_in_bullets_let,373 39 | tfidf_title_num,65 40 | wordFor_in_title_string_only_let,265 41 | -------------------------------------------------------------------------------- /Code/Igor&Kostia/generate_feature_importances.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | The file to generated feature importances from the benchmark Gradient Boost model: 4 | separately for dummies and all other features. 5 | 6 | Competition: HomeDepot Search Relevance 7 | Author: Igor Buinyi 8 | Team: Turing test 9 | """ 10 | 11 | from config_IgorKostia import * 12 | 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn.ensemble import GradientBoostingRegressor 16 | from time import time 17 | 18 | 19 | # get num_tain 20 | df_train = pd.read_csv(DATA_DIR+'/train.csv', encoding="ISO-8859-1") 21 | num_train = df_train.shape[0] #number of observations 22 | 23 | # load features 24 | df_all = pd.read_csv(FEATURES_DIR+'/df_basic_features.csv', encoding="utf-8") 25 | df_dist = pd.read_csv(FEATURES_DIR+'/df_dist_new.csv', encoding="utf-8") 26 | df_st_tfidf= pd.read_csv(FEATURES_DIR+'/df_st_tfidf.csv', encoding="utf-8") 27 | if 'Unnamed: 0' in df_st_tfidf.keys(): 28 | df_st_tfidf = df_st_tfidf.drop(['Unnamed: 0'],axis=1) 29 | df_tfidf_intersect = pd.read_csv(FEATURES_DIR+'/df_tfidf_intersept_new.csv', encoding="utf-8") 30 | df_word2vec = pd.read_csv(FEATURES_DIR+'/df_word2vec_new.csv', encoding="utf-8") 31 | df_dld = pd.read_csv(FEATURES_DIR+'/dld_features.csv', encoding="utf-8") 32 | 33 | """ 34 | the following features and files were added later 35 | so this is the adjustment in order to reproduce the same results 36 | """ 37 | df_above15 = pd.read_csv(FEATURES_DIR+'/df_feature_above15_ext.csv', encoding="utf-8") 38 | df_above15 = df_above15[['id','above15_dummy_frequency_of_beforethekey_thekey']] 39 | df_all = pd.merge(df_all, df_above15, how='left', on='id') 40 | 41 | # merge 42 | df_all = pd.merge(df_all, df_dist, how='left', on='id') 43 | df_all = pd.merge(df_all, df_st_tfidf, how='left', on='id') 44 | df_all = pd.merge(df_all, df_tfidf_intersect, how='left', on='id') 45 | df_all = pd.merge(df_all, df_word2vec, how='left', on='id') 46 | df_all = pd.merge(df_all, df_dld, how='left', on='id') 47 | 48 | 49 | # drop product_uid and some vars 50 | drop_list=['product_uid'] 51 | drop_list+=['description_similarity_10', 'description_similarity_11-20', 'description_similarity_30', 52 | 'description_similarity_21-30', 'description_similarity_10rel', 'description_similarity_11-20rel', 53 | 'description_similarity_30rel', 'description_similarity_21-30rel', 'description_similarity_21-30to10', 54 | 'word_in_title_string_only_num', 'word_in_title_string_only_sum', 'word_in_title_string_only_let'] 55 | 56 | 57 | print 
len(df_all.keys()) 58 | new_drop_list=[] 59 | for var in drop_list: 60 | if var in df_all.keys(): 61 | new_drop_list.append(var) 62 | df_all=df_all.drop(new_drop_list,axis=1) 63 | print len(df_all.keys()) 64 | 65 | 66 | # generate matrices to be used in clf 67 | df_train = df_all.iloc[:num_train] 68 | df_test = df_all.iloc[num_train:] 69 | id_test = df_test['id'] 70 | id_train = df_train['id'] 71 | 72 | y_train = df_train['relevance'].values 73 | X_train = df_train.drop(['id','relevance'],axis=1).values 74 | X_test = df_test.drop(['id','relevance'],axis=1).values 75 | 76 | 77 | ######################################################################### 78 | ##### use GradientBoostingRegressor to generate feature importances 79 | t0 = time() 80 | params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 1, 'min_samples_leaf':15, 'learning_rate': 0.035, 'loss': 'ls', 'verbose':1} 81 | clf = GradientBoostingRegressor(**params) 82 | 83 | clf.fit(X_train, y_train) 84 | 85 | y_pred = clf.predict(X_test) 86 | y_pred[y_pred<1.]=1. 87 | y_pred[y_pred>3.]=3. 88 | 89 | 90 | pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv(MODELS_DIR+'/submission_benchmark_without_dummies.csv',index=False) 91 | sorted_idx = np.argsort(clf.feature_importances_) 92 | pd.DataFrame({"name":df_all.keys().drop(['id','relevance'])[sorted_idx], "importance": clf.feature_importances_[sorted_idx]}).to_csv(MODELS_DIR+'/feature_importances_benchmark_without_dummies.csv',index=False) 93 | 94 | print "file saved" 95 | print 'modelling time:',round((time()-t0)/60,1) ,'minutes\n' 96 | t0 = time() 97 | 98 | 99 | #### load feature importances from file 100 | df_importance = pd.read_csv(MODELS_DIR+'/feature_importances_benchmark_without_dummies.csv', encoding="utf-8") 101 | df_importance=df_importance.sort_values(['importance'],ascending=[0]) 102 | df_importance['cumulative']=df_importance['importance'].map(lambda x: sum(df_importance['importance'][df_importance['importance']>=x])) 103 | var_list=list(df_importance['name'][df_importance['cumulative']<0.990]) 104 | 105 | 106 | # use only 40 vars in the next step 107 | df_all=df_all[['id','relevance']+var_list[0:40]] 108 | 109 | 110 | # load dummies 111 | df_bm_dummy = pd.read_csv(FEATURES_DIR+'/df_brand_material_dummies.csv', encoding="utf-8") 112 | df_thekey_dummy = pd.read_csv(FEATURES_DIR+'/df_thekey_dummies.csv', encoding="utf-8") 113 | df_all = pd.merge(df_all, df_bm_dummy, how='left', on='id') 114 | df_all = pd.merge(df_all, df_thekey_dummy, how='left', on='id') 115 | 116 | # generate matrices to be used in clf 117 | df_train = df_all.iloc[:num_train] 118 | df_test = df_all.iloc[num_train:] 119 | id_test = df_test['id'] 120 | id_train = df_train['id'] 121 | 122 | y_train = df_train['relevance'].values 123 | X_train = df_train.drop(['id','relevance'],axis=1).values 124 | X_test = df_test.drop(['id','relevance'],axis=1).values 125 | 126 | ################################################################################# 127 | ##### use GradientBoostingRegressor to generate feature importances for dummies 128 | t0 = time() 129 | params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 1, 'min_samples_leaf':15, 'learning_rate': 0.035, 'loss': 'ls', 'verbose':1} 130 | clf = GradientBoostingRegressor(**params) 131 | 132 | clf.fit(X_train, y_train) 133 | 134 | y_pred = clf.predict(X_test) 135 | y_pred[y_pred<1.]=1. 136 | y_pred[y_pred>3.]=3. 
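# (editorial comment) predictions above are clipped to the valid relevance range [1, 3];
# the lines below write the submission file and the feature importances sorted via np.argsort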
137 | 138 | 139 | pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv(MODELS_DIR+'/submission_benchmark_top40_and_dummies.csv',index=False) 140 | sorted_idx = np.argsort(clf.feature_importances_) 141 | pd.DataFrame({"name":df_all.keys().drop(['id','relevance'])[sorted_idx], "importance": clf.feature_importances_[sorted_idx]}).to_csv(MODELS_DIR+'/feature_importances_benchmark_top40_and_dummies.csv',index=False) 142 | 143 | print "file saved" 144 | print 'modelling time:',round((time()-t0)/60,1) ,'minutes\n' 145 | t0 = time() 146 | 147 | -------------------------------------------------------------------------------- /Data/dict/color_data.py: -------------------------------------------------------------------------------- 1 | # https://www.kaggle.com/c/home-depot-product-search-relevance/forums/t/18967/data-preparation 2 | 3 | COLOR_LIST = [ 4 | "white", 5 | "black", 6 | "brown", 7 | "gray", 8 | "chrome", 9 | "stainless steel", 10 | "whites", 11 | "red", 12 | "browns / tans", 13 | "bronze", 14 | "silver", 15 | "blacks", 16 | "beige", 17 | "stainless", 18 | "blue", 19 | "nickel", 20 | "metallics", 21 | "clear", 22 | "grays", 23 | "green", 24 | "multi", 25 | "beige / cream", 26 | "tan", 27 | "greens", 28 | "yellow", 29 | "wood", 30 | "blues", 31 | "reds / pinks", 32 | "brushed nickel", 33 | "orange", 34 | "metallic", 35 | "brass", 36 | "yellows / golds", 37 | "oil rubbed bronze", 38 | "polished chrome", 39 | "almond", 40 | "multi-colored", 41 | "dark brown wood", 42 | "primed white", 43 | "beige/bisque", 44 | "biscuit", 45 | "ivory", 46 | "oranges / peaches", 47 | "grey", 48 | "unfinished wood", 49 | "light brown wood", 50 | "wood grain", 51 | "silver metallic", 52 | "copper", 53 | "medium brown wood", 54 | "soft white", 55 | "gold", 56 | "satin nickel", 57 | "cherry", 58 | "bright white", 59 | "red/orange", 60 | "teal", 61 | "natural", 62 | "oak", 63 | "mahogany", 64 | "aluminum", 65 | "espresso", 66 | "unfinished", 67 | "purples / lavenders", 68 | "brown/tan", 69 | "steel", 70 | "venetian bronze", 71 | "slate", 72 | "warm white", 73 | "bone", 74 | "pink", 75 | "stainless look", 76 | "reddish brown wood", 77 | "solid colors", 78 | "off-white", 79 | "walnut", 80 | "chocolate", 81 | "light almond", 82 | "vibrant brushed nickel", 83 | "satin white", 84 | "polished brass", 85 | "linen", 86 | "white primer", 87 | "purple", 88 | "charcoal", 89 | "color", 90 | "oil-rubbed bronze", 91 | "melamine white", 92 | "turquoises / aquas", 93 | "blue/purple", 94 | "primed", 95 | "bisque", 96 | "browns/tans", 97 | "assorted colors", 98 | "java", 99 | "pewter", 100 | "chestnut", 101 | "yellow/gold", 102 | "taupe", 103 | "pacific white", 104 | "cedar", 105 | "monochromatic stainless steel", 106 | "other", 107 | "platinum", 108 | "mocha", 109 | "cream", 110 | "sand", 111 | "daylight", 112 | "brushed stainless steel", 113 | "powder-coat white", 114 | ] -------------------------------------------------------------------------------- /Data/split/splits_level1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Data/split/splits_level1.pkl -------------------------------------------------------------------------------- /Data/split/splits_level2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Data/split/splits_level2.pkl 
-------------------------------------------------------------------------------- /Data/split/splits_level3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Data/split/splits_level3.pkl -------------------------------------------------------------------------------- /Doc/Kaggle_HomeDepot_Turing_Test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Doc/Kaggle_HomeDepot_Turing_Test.pdf -------------------------------------------------------------------------------- /Doc/reference.bib: -------------------------------------------------------------------------------- 1 | % Created by Chenglong Chen 2 | % Date: Jul. 12 2015 3 | 4 | @inproceedings{ebc, 5 | AUTHOR = {Ling Li and Hsuan-Tien Lin}, 6 | title = {Ordinal regression by extended binary classification}, 7 | booktitle = {Advances in Neural Information Processing Systems: Proceedings of the 2006 Conference (NIPS '06)}, 8 | YEAR = {2006}, 9 | PAGES = {865-872}, 10 | } 11 | 12 | @article{cocr, 13 | author = {Yu-Xun Ruan and Hsuan-Tien Lin and Ming-Feng Tsai}, 14 | title = {Improving ranking performance with cost-sensitive ordinal classification via regression}, 15 | journal = {Information Retrieval}, 16 | volume = {17}, 17 | number = {1}, 18 | pages = {1--20}, 19 | year = {2014} 20 | } 21 | 22 | @inproceedings{hyperopt, 23 | AUTHOR = {James Bergstra and R$\acute{\text{e}}$mi Bardenet and Yoshua Bengio and Bal$\acute{\text{a}}$zs K$\acute{\text{e}}$gl}, 24 | title = {Algorithms for Hyper-Parameter Optimization}, 25 | booktitle = {Advances in Neural Information Processing Systems: Proceedings of the 2011 Conference (NIPS '11)}, 26 | year = {2011}, 27 | pages = {2546--2554} 28 | } 29 | 30 | @MISC{hyperopt_url, 31 | note = {\url{http://hyperopt.github.io/hyperopt/}} 32 | } 33 | 34 | @MISC{glove-gensim, 35 | note = {\url{https://github.com/manasRK/glove-gensim}} 36 | } 37 | 38 | @MISC{PeterNorvig, 39 | note = {\url{http://norvig.com/spell-correct.html}} 40 | } 41 | 42 | @MISC{BenS, 43 | note = {\url{https://www.kaggle.com/c/home-depot-product-search-relevance/forums/t/19463/what-s-the-lb-shakeup-potential/116689\#post116689}} 44 | } 45 | 46 | 47 | @MISC{CrowdFlower_1st, 48 | note = {\url{https://github.com/ChenglongChen/Kaggle_CrowdFlower}} 49 | } 50 | 51 | 52 | @article{ensemble_selection, 53 | title={Ensemble selection from libraries of models}, 54 | author={Niculescu-Mizil, Alexandru and Caruana, Rich and Crew, Geoff and Ksikes, Alex}, 55 | journal={Proceedings of International Conference on Machine Learning}, 56 | pages={137--144}, 57 | year={2004} 58 | } 59 | 60 | 61 | @book{NLTK_Cookbook, 62 | author = {Jacob Perkins}, 63 | title = {Python Text Processing with NLTK 2.0 Cookbook}, 64 | publisher = {}, 65 | month = {Nov.}, 66 | year = {2010}, 67 | } 68 | 69 | 70 | @inproceedings{wmd, 71 | AUTHOR = {Matt J. Kusner and Yu Sun and Nicholas I. Kolkin and Kilian Q. 
Weinberger}, 72 | title = {From Word Embeddings To Document Distances}, 73 | booktitle = {the $32^{\text{nd}}$ International Conference on Machine Learning}, 74 | year = {2015}, 75 | pages = {} 76 | } 77 | 78 | -------------------------------------------------------------------------------- /Doc/reference2.bib: -------------------------------------------------------------------------------- 1 | % Created by Igor Buinyi 2 | % Date: May. 5 2016 3 | 4 | 5 | @MISC{Google_dict, 6 | note = {\url{https://www.kaggle.com/steubk/home-depot-product-search-relevance/fixing-typos 7 | }}} 8 | 9 | @MISC{crowdflower_3place, 10 | note = {\url{http://blog.kaggle.com/2015/07/22/crowdflower-winners-interview-3rd-place-team-quartet/ 11 | }}} 12 | 13 | @MISC{dato:beatthebenchmark, 14 | note = {\url{https://www.kaggle.com/c/dato-native/forums/t/16626/beat-the-benchmark-0-90388-with-simple-model 15 | }}} 16 | 17 | @MISC{crowdflower_2place, 18 | note = {\url{https://github.com/geffy/kaggle-crowdflower 19 | }}} 20 | -------------------------------------------------------------------------------- /Fig/CV_LB_Chenglong.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/CV_LB_Chenglong.pdf -------------------------------------------------------------------------------- /Fig/FlowChart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/FlowChart.jpg -------------------------------------------------------------------------------- /Fig/FlowChart.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/FlowChart.pptx -------------------------------------------------------------------------------- /Fig/actual_product_uid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/actual_product_uid.pdf -------------------------------------------------------------------------------- /Fig/actual_search_term.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/actual_search_term.pdf -------------------------------------------------------------------------------- /Fig/feature_corr_Chenglong.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/feature_corr_Chenglong.pdf -------------------------------------------------------------------------------- /Fig/feature_importances_Igor.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/feature_importances_Igor.pdf -------------------------------------------------------------------------------- /Fig/naive_product_uid.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/naive_product_uid.pdf -------------------------------------------------------------------------------- /Fig/naive_search_term.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/naive_search_term.pdf -------------------------------------------------------------------------------- /Fig/plot_ensembles_means.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_ensembles_means.pdf -------------------------------------------------------------------------------- /Fig/plot_ensembles_performance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_ensembles_performance.pdf -------------------------------------------------------------------------------- /Fig/plot_feature_importances_benchmark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_feature_importances_benchmark.pdf -------------------------------------------------------------------------------- /Fig/plot_feature_importances_simplified_model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_feature_importances_simplified_model.pdf -------------------------------------------------------------------------------- /Fig/plot_full_query_in_title.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_full_query_in_title.pdf -------------------------------------------------------------------------------- /Fig/plot_high_vs_low_relevance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_high_vs_low_relevance.pdf -------------------------------------------------------------------------------- /Fig/plot_query_with.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_query_with.pdf -------------------------------------------------------------------------------- /Fig/plot_replaced_with_Google.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/plot_replaced_with_Google.pdf -------------------------------------------------------------------------------- /Fig/proposed_product_uid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/proposed_product_uid.pdf 
-------------------------------------------------------------------------------- /Fig/proposed_search_term.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/kaggle-HomeDepot/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Fig/proposed_search_term.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 Igor Buinyi, Kostia Omelianchuk, Chenglong Chen 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. -------------------------------------------------------------------------------- /Log/README.md: -------------------------------------------------------------------------------- 1 | This folder contains logs for Chenglong's features and models. 2 | * `feature`: logs of most of the features 3 | 4 | * `level1_models`: logs of all the 1st level models used for building 2nd level model (a.k.a. 
Chenglong's final ensemble) 5 | 6 | * `feature_combiner_level2_meta_linear_201605030922_2016-05-03-09-23.log`: log of the features (i.e., 1st level models) chosen for building 2nd level model 7 | 8 | * `[Feat@level2_meta_linear_201605030922]_[Learner@reg_ensemble]_hyperopt_2016-05-07-18-42.log`: log of the 2nd level model -------------------------------------------------------------------------------- /Log/feature/data_processor_2016-05-08-00-36.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 00:48:55,265] INFO: Run GoogleQuerySpellingChecker at search_term 2 | [2016-05-08 01:29:06,789] INFO: Save to ../../Data/Clean/all.lemmatized.csv.pkl 3 | [2016-05-08 01:43:43,971] INFO: Save to ../../Data/Clean/all.lemmatized.stemmed.csv.pkl 4 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_basic_2016-05-08-01-43.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 01:44:22,874] INFO: DocId_search_term_1D (1D): corr = -0.006971 2 | [2016-05-08 01:44:23,556] INFO: DocId_product_title_1D (1D): corr = -0.005569 3 | [2016-05-08 01:44:24,929] INFO: DocId_product_description_1D (1D): corr = -0.001244 4 | [2016-05-08 01:44:26,188] INFO: DocId_product_attribute_1D (1D): corr = 0.089704 5 | [2016-05-08 01:44:26,684] INFO: DocId_product_brand_1D (1D): corr = -0.022024 6 | [2016-05-08 01:44:27,210] INFO: DocId_product_color_1D (1D): corr = 0.036558 7 | [2016-05-08 01:44:29,230] INFO: DocLen_search_term_1D (1D): corr = -0.084015 8 | [2016-05-08 01:44:30,432] INFO: DocLen_product_title_1D (1D): corr = -0.013815 9 | [2016-05-08 01:44:35,849] INFO: DocLen_product_description_1D (1D): corr = 0.042311 10 | [2016-05-08 01:44:40,927] INFO: DocLen_product_attribute_1D (1D): corr = -0.039293 11 | [2016-05-08 01:44:41,716] INFO: DocLen_product_brand_1D (1D): corr = -0.074429 12 | [2016-05-08 01:44:42,484] INFO: DocLen_product_color_1D (1D): corr = -0.020528 13 | [2016-05-08 01:44:43,079] INFO: DocFreq_search_term_1D (1D): corr = 0.148810 14 | [2016-05-08 01:44:43,707] INFO: DocFreq_product_title_1D (1D): corr = -0.030461 15 | [2016-05-08 01:44:44,980] INFO: DocFreq_product_description_1D (1D): corr = -0.025100 16 | [2016-05-08 01:44:46,213] INFO: DocFreq_product_attribute_1D (1D): corr = 0.145885 17 | [2016-05-08 01:44:46,767] INFO: DocFreq_product_brand_1D (1D): corr = 0.146631 18 | [2016-05-08 01:44:47,310] INFO: DocFreq_product_color_1D (1D): corr = 0.047351 19 | [2016-05-08 01:45:11,103] INFO: DocEntropy_search_term_1D (1D): corr = -0.068423 20 | [2016-05-08 01:45:36,281] INFO: DocEntropy_product_title_1D (1D): corr = -0.004657 21 | [2016-05-08 01:46:19,960] INFO: DocEntropy_product_description_1D (1D): corr = 0.053725 22 | [2016-05-08 01:47:01,011] INFO: DocEntropy_product_attribute_1D (1D): corr = -0.130249 23 | [2016-05-08 01:47:22,331] INFO: DocEntropy_product_brand_1D (1D): corr = -0.078429 24 | [2016-05-08 01:47:45,043] INFO: DocEntropy_product_color_1D (1D): corr = -0.026344 25 | [2016-05-08 01:47:46,418] INFO: DigitCount_search_term_1D (1D): corr = -0.078397 26 | [2016-05-08 01:47:48,546] INFO: DigitCount_product_title_1D (1D): corr = -0.012951 27 | [2016-05-08 01:47:56,985] INFO: DigitCount_product_description_1D (1D): corr = 0.042207 28 | [2016-05-08 01:48:07,439] INFO: DigitCount_product_attribute_1D (1D): corr = -0.032872 29 | [2016-05-08 01:48:08,783] INFO: DigitCount_product_brand_1D (1D): corr = -0.007779 30 | [2016-05-08 01:48:10,034] 
INFO: DigitCount_product_color_1D (1D): corr = -0.008055 31 | [2016-05-08 01:48:12,696] INFO: DigitRatio_search_term_1D (1D): corr = -0.070659 32 | [2016-05-08 01:48:16,447] INFO: DigitRatio_product_title_1D (1D): corr = -0.002637 33 | [2016-05-08 01:48:30,963] INFO: DigitRatio_product_description_1D (1D): corr = 0.017609 34 | [2016-05-08 01:48:47,017] INFO: DigitRatio_product_attribute_1D (1D): corr = -0.120129 35 | [2016-05-08 01:48:49,532] INFO: DigitRatio_product_brand_1D (1D): corr = -0.010386 36 | [2016-05-08 01:48:51,975] INFO: DigitRatio_product_color_1D (1D): corr = -0.003868 37 | [2016-05-08 01:48:52,263] INFO: DocIdEcho_product_uid_1D (1D): corr = -0.130656 38 | [2016-05-08 01:48:53,049] INFO: DocFreq_product_uid_1D (1D): corr = -0.032851 39 | [2016-05-08 01:48:54,079] INFO: ProductUidDummy1_product_uid_1D (1D): corr = 0.171689 40 | [2016-05-08 01:48:54,952] INFO: ProductUidDummy2_product_uid_1D (1D): corr = 0.000000 41 | [2016-05-08 01:48:55,685] INFO: ProductUidDummy3_product_uid_1D (1D): corr = -0.172492 42 | [2016-05-08 01:48:57,041] INFO: UniqueCount_Unigram_search_term_1D (1D): corr = -0.083333 43 | [2016-05-08 01:48:59,167] INFO: UniqueCount_Unigram_product_title_1D (1D): corr = -0.011923 44 | [2016-05-08 01:49:09,973] INFO: UniqueCount_Unigram_product_description_1D (1D): corr = 0.047268 45 | [2016-05-08 01:49:21,315] INFO: UniqueCount_Unigram_product_attribute_1D (1D): corr = -0.058670 46 | [2016-05-08 01:49:22,825] INFO: UniqueCount_Unigram_product_brand_1D (1D): corr = -0.074985 47 | [2016-05-08 01:49:24,334] INFO: UniqueCount_Unigram_product_color_1D (1D): corr = -0.024776 48 | [2016-05-08 01:49:28,358] INFO: UniqueCount_Bigram_search_term_1D (1D): corr = -0.096647 49 | [2016-05-08 01:49:39,780] INFO: UniqueCount_Bigram_product_title_1D (1D): corr = -0.013097 50 | [2016-05-08 01:51:33,197] INFO: UniqueCount_Bigram_product_description_1D (1D): corr = 0.043463 51 | [2016-05-08 01:53:18,345] INFO: UniqueCount_Bigram_product_attribute_1D (1D): corr = -0.042916 52 | [2016-05-08 01:53:20,921] INFO: UniqueCount_Bigram_product_brand_1D (1D): corr = -0.040597 53 | [2016-05-08 01:53:23,404] INFO: UniqueCount_Bigram_product_color_1D (1D): corr = -0.016889 54 | [2016-05-08 01:53:28,045] INFO: UniqueCount_Trigram_search_term_1D (1D): corr = -0.080895 55 | [2016-05-08 01:53:43,518] INFO: UniqueCount_Trigram_product_title_1D (1D): corr = -0.013489 56 | [2016-05-08 01:56:32,561] INFO: UniqueCount_Trigram_product_description_1D (1D): corr = 0.042642 57 | [2016-05-08 01:58:51,653] INFO: UniqueCount_Trigram_product_attribute_1D (1D): corr = -0.039823 58 | [2016-05-08 01:58:54,527] INFO: UniqueCount_Trigram_product_brand_1D (1D): corr = -0.011067 59 | [2016-05-08 01:58:57,297] INFO: UniqueCount_Trigram_product_color_1D (1D): corr = -0.013767 60 | [2016-05-08 01:58:59,923] INFO: UniqueRatio_Unigram_search_term_1D (1D): corr = 0.027054 61 | [2016-05-08 01:59:03,582] INFO: UniqueRatio_Unigram_product_title_1D (1D): corr = 0.005347 62 | [2016-05-08 01:59:16,935] INFO: UniqueRatio_Unigram_product_description_1D (1D): corr = -0.007640 63 | [2016-05-08 01:59:29,356] INFO: UniqueRatio_Unigram_product_attribute_1D (1D): corr = 0.100459 64 | [2016-05-08 01:59:31,767] INFO: UniqueRatio_Unigram_product_brand_1D (1D): corr = -0.003936 65 | [2016-05-08 01:59:34,131] INFO: UniqueRatio_Unigram_product_color_1D (1D): corr = -0.001990 66 | [2016-05-08 01:59:39,711] INFO: UniqueRatio_Bigram_search_term_1D (1D): corr = 0.020047 67 | [2016-05-08 01:59:53,123] INFO: UniqueRatio_Bigram_product_title_1D (1D): 
corr = 0.007022 68 | [2016-05-08 02:01:40,205] INFO: UniqueRatio_Bigram_product_description_1D (1D): corr = 0.008495 69 | [2016-05-08 02:02:59,568] INFO: UniqueRatio_Bigram_product_attribute_1D (1D): corr = 0.044705 70 | [2016-05-08 02:03:01,494] INFO: UniqueRatio_Bigram_product_brand_1D (1D): corr = 0.000000 71 | [2016-05-08 02:03:03,467] INFO: UniqueRatio_Bigram_product_color_1D (1D): corr = -0.008007 72 | [2016-05-08 02:03:07,453] INFO: UniqueRatio_Trigram_search_term_1D (1D): corr = 0.015218 73 | [2016-05-08 02:03:23,431] INFO: UniqueRatio_Trigram_product_title_1D (1D): corr = 0.008634 74 | [2016-05-08 02:05:29,477] INFO: UniqueRatio_Trigram_product_description_1D (1D): corr = 0.007814 75 | [2016-05-08 02:07:26,469] INFO: UniqueRatio_Trigram_product_attribute_1D (1D): corr = 0.020869 76 | [2016-05-08 02:07:30,186] INFO: UniqueRatio_Trigram_product_brand_1D (1D): corr = 0.000000 77 | [2016-05-08 02:07:33,792] INFO: UniqueRatio_Trigram_product_color_1D (1D): corr = -0.004186 78 | [2016-05-08 02:07:34,141] INFO: AttrCount_product_attribute_list_1D (1D): corr = -0.048513 79 | [2016-05-08 02:07:39,868] INFO: AttrBulletCount_product_attribute_list_1D (1D): corr = -0.070608 80 | [2016-05-08 02:07:46,288] INFO: AttrBulletRatio_product_attribute_list_1D (1D): corr = -0.150464 81 | [2016-05-08 02:07:52,174] INFO: AttrNonBulletCount_product_attribute_list_1D (1D): corr = -0.035769 82 | [2016-05-08 02:07:58,799] INFO: AttrNonBulletRatio_product_attribute_list_1D (1D): corr = 0.150464 83 | [2016-05-08 02:08:04,288] INFO: AttrHasProductHeight_product_attribute_list_1D (1D): corr = -0.099505 84 | [2016-05-08 02:08:10,515] INFO: AttrHasProductWidth_product_attribute_list_1D (1D): corr = -0.110805 85 | [2016-05-08 02:08:16,320] INFO: AttrHasProductLength_product_attribute_list_1D (1D): corr = -0.051217 86 | [2016-05-08 02:08:22,124] INFO: AttrHasProductDepth_product_attribute_list_1D (1D): corr = -0.093089 87 | [2016-05-08 02:08:27,670] INFO: AttrHasIndoorOutdoor_product_attribute_list_1D (1D): corr = -0.017944 88 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_char_dist_sim_2016-05-08-12-02.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 12:05:53,743] INFO: CharDistribution_CosineSim_search_term_x_product_title_1D (1D): corr = 0.221607 2 | [2016-05-08 12:07:43,389] INFO: CharDistribution_CosineSim_search_term_x_product_description_1D (1D): corr = 0.072751 3 | [2016-05-08 12:09:21,822] INFO: CharDistribution_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.036551 4 | [2016-05-08 12:09:32,860] INFO: CharDistribution_KL_search_term_x_product_title_1D (1D): corr = -0.223736 5 | [2016-05-08 12:11:01,222] INFO: CharDistribution_KL_search_term_x_product_description_1D (1D): corr = -0.048662 6 | [2016-05-08 12:12:21,357] INFO: CharDistribution_KL_search_term_x_product_attribute_1D (1D): corr = -0.117159 7 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_doc2vec_2016-05-08-12-56.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 12:56:52,428] INFO: loading Doc2Vec object from ../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model 2 | [2016-05-08 12:57:02,221] INFO: loading docvecs recursively from ../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model.docvecs.* with mmap=None 3 | [2016-05-08 12:57:02,235] INFO: loading doctag_syn0 from 
../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model.docvecs.doctag_syn0.npy with mmap=None 4 | [2016-05-08 12:57:02,377] INFO: loading syn0 from ../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model.syn0.npy with mmap=None 5 | [2016-05-08 12:57:02,438] INFO: loading syn1 from ../../Data/doc2vec/Homedepot-doc2vec-D100-min_count3.model.syn1.npy with mmap=None 6 | [2016-05-08 12:57:02,498] INFO: setting ignored attribute syn0norm to None 7 | [2016-05-08 12:57:02,499] INFO: setting ignored attribute cum_table to None 8 | [2016-05-08 12:59:10,467] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_title_1D (1D): corr = 0.315041 9 | [2016-05-08 13:00:31,971] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_description_1D (1D): corr = 0.238963 10 | [2016-05-08 13:01:48,488] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.065506 11 | [2016-05-08 13:03:03,277] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_brand_1D (1D): corr = 0.052180 12 | [2016-05-08 13:04:20,041] INFO: Doc2Vec_Homedepot_D100_CosineSim_search_term_x_product_color_1D (1D): corr = 0.004814 13 | [2016-05-08 13:04:32,482] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_title_1D (1D): corr = -0.237534 14 | [2016-05-08 13:04:45,344] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_description_1D (1D): corr = -0.189458 15 | [2016-05-08 13:04:58,122] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_attribute_1D (1D): corr = 0.131127 16 | [2016-05-08 13:05:10,420] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_brand_1D (1D): corr = 0.131438 17 | [2016-05-08 13:05:22,656] INFO: Doc2Vec_Homedepot_D100_RMSE_search_term_x_product_color_1D (1D): corr = 0.043696 18 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_group_relevance_2016-05-08-01-47.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 02:12:32,450] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.005912 2 | [2016-05-08 02:12:32,459] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.003595 3 | [2016-05-08 02:12:32,468] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.005852 4 | [2016-05-08 02:12:32,478] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.006198 5 | [2016-05-08 02:12:32,487] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.005835 6 | [2016-05-08 02:12:32,497] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.000607 7 | [2016-05-08 02:32:38,630] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.006510 8 | [2016-05-08 02:32:38,639] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.006458 9 | [2016-05-08 02:32:38,649] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.006631 10 | [2016-05-08 02:32:38,658] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.006391 11 | [2016-05-08 02:32:38,668] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.006539 12 | [2016-05-08 02:32:38,677] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.005025 13 | [2016-05-08 02:53:26,446] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.004440 14 | [2016-05-08 02:53:26,457] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.006256 15 | [2016-05-08 02:53:26,469] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.004708 16 | [2016-05-08 02:53:26,480] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.004053 17 | [2016-05-08 02:53:26,492] INFO: 
GroupRelevance_Median_product_title_1D (1D): corr = 0.004498 18 | [2016-05-08 02:53:26,504] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.004047 19 | [2016-05-08 03:14:22,463] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.003041 20 | [2016-05-08 03:14:22,472] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.003957 21 | [2016-05-08 03:14:22,483] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.003095 22 | [2016-05-08 03:14:22,493] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.002943 23 | [2016-05-08 03:14:22,504] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.003132 24 | [2016-05-08 03:14:22,513] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.000749 25 | [2016-05-08 03:39:16,457] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.003750 26 | [2016-05-08 03:39:16,465] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.003960 27 | [2016-05-08 03:39:16,474] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.003835 28 | [2016-05-08 03:39:16,483] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.003798 29 | [2016-05-08 03:39:16,491] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.003724 30 | [2016-05-08 03:39:16,501] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.001400 31 | [2016-05-08 04:14:08,676] INFO: GroupRelevance_Mean_product_title_1D (1D): corr = 0.008724 32 | [2016-05-08 04:14:08,686] INFO: GroupRelevance_Std_product_title_1D (1D): corr = 0.007908 33 | [2016-05-08 04:14:08,696] INFO: GroupRelevance_Max_product_title_1D (1D): corr = 0.008765 34 | [2016-05-08 04:14:08,706] INFO: GroupRelevance_Min_product_title_1D (1D): corr = 0.008751 35 | [2016-05-08 04:14:08,716] INFO: GroupRelevance_Median_product_title_1D (1D): corr = 0.008758 36 | [2016-05-08 04:14:08,727] INFO: GroupRelevance_Size_product_title_1D (1D): corr = 0.005363 37 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_lsa_ngram_cosinesim_2016-05-08-13-38.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 13:40:32,505] INFO: LSA100_Word_Trigram_CosineSim_search_term_x_product_title_1D (1D): corr = 0.233220 2 | [2016-05-08 13:49:21,913] INFO: LSA100_Word_Trigram_CosineSim_search_term_x_product_description_1D (1D): corr = 0.165219 3 | [2016-05-08 13:56:26,644] INFO: LSA100_Word_Trigram_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.019628 4 | [2016-05-08 13:59:17,295] INFO: LSA100_Char_Fourgram_CosineSim_search_term_x_product_title_1D (1D): corr = 0.281925 5 | [2016-05-08 14:20:33,890] INFO: LSA100_Char_Fourgram_CosineSim_search_term_x_product_description_1D (1D): corr = 0.152655 6 | [2016-05-08 14:38:10,383] INFO: LSA100_Char_Fourgram_CosineSim_search_term_x_product_attribute_1D (1D): corr = -0.041331 7 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_ngram_jaccard_2016-05-08-01-43.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 01:44:30,209] INFO: JaccardCoef_Unigram_search_term_x_product_title_1D (1D): corr = 0.285574 2 | [2016-05-08 01:44:35,568] INFO: JaccardCoef_Unigram_search_term_x_product_title_product_name_1D (1D): corr = 0.281925 3 | [2016-05-08 01:44:53,352] INFO: JaccardCoef_Unigram_search_term_x_product_description_1D (1D): corr = 0.131190 4 | [2016-05-08 01:45:09,992] INFO: JaccardCoef_Unigram_search_term_x_product_attribute_1D (1D): corr = 0.004878 5 | 
[2016-05-08 01:45:15,088] INFO: JaccardCoef_Unigram_search_term_x_product_brand_1D (1D): corr = 0.067692 6 | [2016-05-08 01:45:20,159] INFO: JaccardCoef_Unigram_search_term_x_product_color_1D (1D): corr = 0.003534 7 | [2016-05-08 01:45:26,660] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_title_1D (1D): corr = 0.268095 8 | [2016-05-08 01:45:31,707] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.262341 9 | [2016-05-08 01:45:49,303] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_description_1D (1D): corr = 0.178812 10 | [2016-05-08 01:46:05,678] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.040235 11 | [2016-05-08 01:46:08,726] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.050577 12 | [2016-05-08 01:46:11,104] INFO: JaccardCoef_Unigram_search_term_product_name_x_product_color_1D (1D): corr = -0.015404 13 | [2016-05-08 01:46:21,959] INFO: JaccardCoef_Bigram_search_term_x_product_title_1D (1D): corr = 0.197671 14 | [2016-05-08 01:46:29,620] INFO: JaccardCoef_Bigram_search_term_x_product_title_product_name_1D (1D): corr = 0.192269 15 | [2016-05-08 01:48:12,935] INFO: JaccardCoef_Bigram_search_term_x_product_description_1D (1D): corr = 0.129658 16 | [2016-05-08 01:49:59,806] INFO: JaccardCoef_Bigram_search_term_x_product_attribute_1D (1D): corr = 0.051732 17 | [2016-05-08 01:50:06,832] INFO: JaccardCoef_Bigram_search_term_x_product_brand_1D (1D): corr = 0.039314 18 | [2016-05-08 01:50:13,429] INFO: JaccardCoef_Bigram_search_term_x_product_color_1D (1D): corr = 0.006161 19 | [2016-05-08 01:50:30,836] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_title_1D (1D): corr = 0.164851 20 | [2016-05-08 01:50:39,245] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.182283 21 | [2016-05-08 01:52:36,040] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_description_1D (1D): corr = 0.128455 22 | [2016-05-08 01:54:25,900] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.057716 23 | [2016-05-08 01:54:31,066] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.030678 24 | [2016-05-08 01:54:36,248] INFO: JaccardCoef_Bigram_search_term_product_name_x_product_color_1D (1D): corr = -0.002454 25 | [2016-05-08 01:54:51,972] INFO: JaccardCoef_Trigram_search_term_x_product_title_1D (1D): corr = 0.112496 26 | [2016-05-08 01:54:58,411] INFO: JaccardCoef_Trigram_search_term_x_product_title_product_name_1D (1D): corr = 0.126837 27 | [2016-05-08 01:57:51,480] INFO: JaccardCoef_Trigram_search_term_x_product_description_1D (1D): corr = 0.072817 28 | [2016-05-08 02:00:36,316] INFO: JaccardCoef_Trigram_search_term_x_product_attribute_1D (1D): corr = 0.029948 29 | [2016-05-08 02:00:45,294] INFO: JaccardCoef_Trigram_search_term_x_product_brand_1D (1D): corr = 0.038705 30 | [2016-05-08 02:00:54,241] INFO: JaccardCoef_Trigram_search_term_x_product_color_1D (1D): corr = 0.005326 31 | [2016-05-08 02:01:11,855] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_title_1D (1D): corr = 0.004708 32 | [2016-05-08 02:01:18,177] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.182283 33 | [2016-05-08 02:04:00,328] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_description_1D (1D): corr = 0.000000 34 | [2016-05-08 02:06:10,004] INFO: 
JaccardCoef_Trigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.000000 35 | [2016-05-08 02:06:15,796] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.029351 36 | [2016-05-08 02:06:21,417] INFO: JaccardCoef_Trigram_search_term_product_name_x_product_color_1D (1D): corr = 0.000113 37 | [2016-05-08 02:06:26,021] INFO: DiceDistance_Unigram_search_term_x_product_title_1D (1D): corr = 0.296123 38 | [2016-05-08 02:06:29,656] INFO: DiceDistance_Unigram_search_term_x_product_title_product_name_1D (1D): corr = 0.285837 39 | [2016-05-08 02:06:41,774] INFO: DiceDistance_Unigram_search_term_x_product_description_1D (1D): corr = 0.133760 40 | [2016-05-08 02:06:54,041] INFO: DiceDistance_Unigram_search_term_x_product_attribute_1D (1D): corr = 0.003806 41 | [2016-05-08 02:06:57,586] INFO: DiceDistance_Unigram_search_term_x_product_brand_1D (1D): corr = 0.064598 42 | [2016-05-08 02:07:01,145] INFO: DiceDistance_Unigram_search_term_x_product_color_1D (1D): corr = 0.002170 43 | [2016-05-08 02:07:07,233] INFO: DiceDistance_Unigram_search_term_product_name_x_product_title_1D (1D): corr = 0.278330 44 | [2016-05-08 02:07:12,292] INFO: DiceDistance_Unigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.272974 45 | [2016-05-08 02:07:28,392] INFO: DiceDistance_Unigram_search_term_product_name_x_product_description_1D (1D): corr = 0.180541 46 | [2016-05-08 02:07:43,595] INFO: DiceDistance_Unigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.040289 47 | [2016-05-08 02:07:47,888] INFO: DiceDistance_Unigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.048742 48 | [2016-05-08 02:07:52,006] INFO: DiceDistance_Unigram_search_term_product_name_x_product_color_1D (1D): corr = -0.017108 49 | [2016-05-08 02:08:09,476] INFO: DiceDistance_Bigram_search_term_x_product_title_1D (1D): corr = 0.204344 50 | [2016-05-08 02:08:18,741] INFO: DiceDistance_Bigram_search_term_x_product_title_product_name_1D (1D): corr = 0.192708 51 | [2016-05-08 02:10:17,082] INFO: DiceDistance_Bigram_search_term_x_product_description_1D (1D): corr = 0.130818 52 | [2016-05-08 02:12:07,924] INFO: DiceDistance_Bigram_search_term_x_product_attribute_1D (1D): corr = 0.052014 53 | [2016-05-08 02:12:16,453] INFO: DiceDistance_Bigram_search_term_x_product_brand_1D (1D): corr = 0.036609 54 | [2016-05-08 02:12:25,011] INFO: DiceDistance_Bigram_search_term_x_product_color_1D (1D): corr = 0.005968 55 | [2016-05-08 02:12:41,843] INFO: DiceDistance_Bigram_search_term_product_name_x_product_title_1D (1D): corr = 0.170214 56 | [2016-05-08 02:12:49,737] INFO: DiceDistance_Bigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.182283 57 | [2016-05-08 02:14:46,468] INFO: DiceDistance_Bigram_search_term_product_name_x_product_description_1D (1D): corr = 0.128834 58 | [2016-05-08 02:16:37,562] INFO: DiceDistance_Bigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.057913 59 | [2016-05-08 02:16:45,053] INFO: DiceDistance_Bigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.030788 60 | [2016-05-08 02:16:52,512] INFO: DiceDistance_Bigram_search_term_product_name_x_product_color_1D (1D): corr = -0.002522 61 | [2016-05-08 02:17:14,976] INFO: DiceDistance_Trigram_search_term_x_product_title_1D (1D): corr = 0.119359 62 | [2016-05-08 02:17:25,037] INFO: DiceDistance_Trigram_search_term_x_product_title_product_name_1D (1D): corr = 0.126837 63 | [2016-05-08 02:20:18,041] INFO: 
DiceDistance_Trigram_search_term_x_product_description_1D (1D): corr = 0.073943 64 | [2016-05-08 02:22:11,063] INFO: DiceDistance_Trigram_search_term_x_product_attribute_1D (1D): corr = 0.030170 65 | [2016-05-08 02:22:17,401] INFO: DiceDistance_Trigram_search_term_x_product_brand_1D (1D): corr = 0.038456 66 | [2016-05-08 02:22:23,943] INFO: DiceDistance_Trigram_search_term_x_product_color_1D (1D): corr = 0.006093 67 | [2016-05-08 02:22:38,782] INFO: DiceDistance_Trigram_search_term_product_name_x_product_title_1D (1D): corr = 0.004708 68 | [2016-05-08 02:22:44,366] INFO: DiceDistance_Trigram_search_term_product_name_x_product_title_product_name_1D (1D): corr = 0.182283 69 | [2016-05-08 02:24:48,640] INFO: DiceDistance_Trigram_search_term_product_name_x_product_description_1D (1D): corr = 0.000000 70 | [2016-05-08 02:26:47,009] INFO: DiceDistance_Trigram_search_term_product_name_x_product_attribute_1D (1D): corr = 0.000000 71 | [2016-05-08 02:26:52,516] INFO: DiceDistance_Trigram_search_term_product_name_x_product_brand_1D (1D): corr = 0.029351 72 | [2016-05-08 02:26:57,999] INFO: DiceDistance_Trigram_search_term_product_name_x_product_color_1D (1D): corr = 0.000113 73 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_query_quality_2016-05-08-13-05.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 13:06:07,296] INFO: QueryQuality_raw_x_lemmatized_1D (1D): corr = -0.011984 2 | [2016-05-08 13:06:08,419] INFO: QueryQuality_raw_x_product_name_1D (1D): corr = -0.094242 3 | [2016-05-08 13:06:09,308] INFO: QueryQuality_raw_x_stemmed_1D (1D): corr = -0.017614 4 | [2016-05-08 13:06:10,263] INFO: QueryQuality_lemmatized_x_product_name_1D (1D): corr = -0.108217 5 | [2016-05-08 13:06:11,274] INFO: QueryQuality_lemmatized_x_stemmed_1D (1D): corr = -0.016097 6 | [2016-05-08 13:06:11,889] INFO: QueryQuality_product_name_x_stemmed_1D (1D): corr = -0.107457 7 | [2016-05-08 13:06:12,699] INFO: IsInGoogleDict_search_term_1D (1D): corr = -0.068113 8 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_tfidf_ngram_cosinesim_2016-05-08-12-12.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 12:17:14,100] INFO: TFIDF_Word_Trigram_CosineSim_search_term_x_product_title_1D (1D): corr = 0.303002 2 | [2016-05-08 12:48:05,438] INFO: TFIDF_Word_Trigram_CosineSim_search_term_x_product_description_1D (1D): corr = 0.257729 3 | [2016-05-08 13:03:57,970] INFO: TFIDF_Word_Trigram_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.112431 4 | [2016-05-08 13:07:26,447] INFO: TFIDF_Char_Fourgram_CosineSim_search_term_x_product_title_1D (1D): corr = 0.372841 5 | [2016-05-08 13:24:13,093] INFO: TFIDF_Char_Fourgram_CosineSim_search_term_x_product_description_1D (1D): corr = 0.274988 6 | [2016-05-08 13:38:43,647] INFO: TFIDF_Char_Fourgram_CosineSim_search_term_x_product_attribute_1D (1D): corr = 0.069589 7 | -------------------------------------------------------------------------------- /Log/feature/generate_feature_wordnet_similarity_2016-05-08-01-43.log: -------------------------------------------------------------------------------- 1 | [2016-05-08 21:28:42,498] INFO: WordNet_Path_Similarity_Mean_Mean_search_term_x_product_title_1D (1D): corr = 0.171081 2 | [2016-05-08 21:28:42,508] INFO: WordNet_Path_Similarity_Mean_Std_search_term_x_product_title_1D (1D): corr = -0.043572 3 | [2016-05-08 
21:28:42,517] INFO: WordNet_Path_Similarity_Mean_Max_search_term_x_product_title_1D (1D): corr = 0.107101 4 | [2016-05-08 21:28:42,526] INFO: WordNet_Path_Similarity_Mean_Min_search_term_x_product_title_1D (1D): corr = 0.163098 5 | [2016-05-08 21:28:42,536] INFO: WordNet_Path_Similarity_Mean_Median_search_term_x_product_title_1D (1D): corr = 0.161434 6 | [2016-05-08 21:28:42,545] INFO: WordNet_Path_Similarity_Max_Mean_search_term_x_product_title_1D (1D): corr = 0.297643 7 | [2016-05-08 21:28:42,554] INFO: WordNet_Path_Similarity_Max_Std_search_term_x_product_title_1D (1D): corr = -0.152113 8 | [2016-05-08 21:28:42,564] INFO: WordNet_Path_Similarity_Max_Max_search_term_x_product_title_1D (1D): corr = 0.153445 9 | [2016-05-08 21:28:42,573] INFO: WordNet_Path_Similarity_Max_Min_search_term_x_product_title_1D (1D): corr = 0.265098 10 | [2016-05-08 21:28:42,582] INFO: WordNet_Path_Similarity_Max_Median_search_term_x_product_title_1D (1D): corr = 0.253243 11 | [2016-05-08 21:28:42,592] INFO: WordNet_Path_Similarity_Min_Mean_search_term_x_product_title_1D (1D): corr = 0.010445 12 | [2016-05-08 21:28:42,601] INFO: WordNet_Path_Similarity_Min_Std_search_term_x_product_title_1D (1D): corr = -0.004510 13 | [2016-05-08 21:28:42,610] INFO: WordNet_Path_Similarity_Min_Max_search_term_x_product_title_1D (1D): corr = 0.004670 14 | [2016-05-08 21:28:42,619] INFO: WordNet_Path_Similarity_Min_Min_search_term_x_product_title_1D (1D): corr = 0.018414 15 | [2016-05-08 21:28:42,629] INFO: WordNet_Path_Similarity_Min_Median_search_term_x_product_title_1D (1D): corr = 0.010229 16 | [2016-05-08 21:28:42,638] INFO: WordNet_Path_Similarity_Median_Mean_search_term_x_product_title_1D (1D): corr = 0.049001 17 | [2016-05-08 21:28:42,647] INFO: WordNet_Path_Similarity_Median_Std_search_term_x_product_title_1D (1D): corr = -0.007067 18 | [2016-05-08 21:28:42,656] INFO: WordNet_Path_Similarity_Median_Max_search_term_x_product_title_1D (1D): corr = 0.022593 19 | [2016-05-08 21:28:42,666] INFO: WordNet_Path_Similarity_Median_Min_search_term_x_product_title_1D (1D): corr = 0.060468 20 | [2016-05-08 21:28:42,675] INFO: WordNet_Path_Similarity_Median_Median_search_term_x_product_title_1D (1D): corr = 0.045057 21 | -------------------------------------------------------------------------------- /Log/level1_models/[Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_hyperopt_2016-05-01-23-31.log: -------------------------------------------------------------------------------- 1 | [2016-05-01 23:31:29,673] INFO: tpe_transform took 0.007514 seconds 2 | [2016-05-01 23:31:29,674] INFO: TPE using 0 trials 3 | [2016-05-01 23:31:29,677] INFO: ================================================== 4 | [2016-05-01 23:31:29,677] INFO: Task 5 | [2016-05-01 23:31:29,677] INFO: [Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_[Id@1] 6 | [2016-05-01 23:31:29,678] INFO: Param 7 | [2016-05-01 23:31:29,678] INFO: leaf_size: 40 8 | [2016-05-01 23:31:29,678] INFO: metric: minkowski 9 | [2016-05-01 23:31:29,678] INFO: n_neighbors: 5 10 | [2016-05-01 23:31:29,679] INFO: normalize: False 11 | [2016-05-01 23:31:29,679] INFO: weights: uniform 12 | [2016-05-01 23:31:29,679] INFO: Result 13 | [2016-05-01 23:31:29,679] INFO: Run RMSE Shape 14 | [2016-05-01 23:32:24,692] INFO: 1 0.56615 23167 x 722 15 | [2016-05-01 23:33:18,848] INFO: 2 0.565393 21940 x 722 16 | [2016-05-01 23:34:12,722] INFO: 3 0.567662 22182 x 722 17 | [2016-05-01 23:35:06,997] INFO: 4 0.567039 21966 x 722 18 | [2016-05-01 23:36:05,726] INFO: 5 0.565128 21961 x 722 19 | [2016-05-01 
23:36:06,056] INFO: RMSE 20 | [2016-05-01 23:36:06,056] INFO: Mean: 0.566274 21 | [2016-05-01 23:36:06,056] INFO: Std: 0.000961 22 | [2016-05-01 23:36:06,056] INFO: Time 23 | [2016-05-01 23:36:06,056] INFO: 4 mins 24 | [2016-05-01 23:36:06,057] INFO: -------------------------------------------------- 25 | [2016-05-01 23:40:56,020] INFO: tpe_transform took 0.007016 seconds 26 | [2016-05-01 23:40:56,021] INFO: TPE using 1/1 trials with best loss 0.566274 27 | [2016-05-01 23:40:56,023] INFO: ================================================== 28 | [2016-05-01 23:40:56,024] INFO: Task 29 | [2016-05-01 23:40:56,024] INFO: [Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_[Id@2] 30 | [2016-05-01 23:40:56,024] INFO: Param 31 | [2016-05-01 23:40:56,024] INFO: leaf_size: 20 32 | [2016-05-01 23:40:56,024] INFO: metric: minkowski 33 | [2016-05-01 23:40:56,025] INFO: n_neighbors: 17 34 | [2016-05-01 23:40:56,025] INFO: normalize: False 35 | [2016-05-01 23:40:56,025] INFO: weights: uniform 36 | [2016-05-01 23:40:56,025] INFO: Result 37 | [2016-05-01 23:40:56,025] INFO: Run RMSE Shape 38 | [2016-05-01 23:42:12,758] INFO: 1 0.533613 23167 x 722 39 | [2016-05-01 23:43:30,701] INFO: 2 0.53441 21940 x 722 40 | [2016-05-01 23:44:49,979] INFO: 3 0.534479 22182 x 722 41 | [2016-05-01 23:46:09,834] INFO: 4 0.53528 21966 x 722 42 | [2016-05-01 23:47:31,515] INFO: 5 0.533529 21961 x 722 43 | [2016-05-01 23:47:31,973] INFO: RMSE 44 | [2016-05-01 23:47:31,973] INFO: Mean: 0.534262 45 | [2016-05-01 23:47:31,973] INFO: Std: 0.000642 46 | [2016-05-01 23:47:31,973] INFO: Time 47 | [2016-05-01 23:47:31,973] INFO: 6 mins 48 | [2016-05-01 23:47:31,974] INFO: -------------------------------------------------- 49 | [2016-05-01 23:53:35,158] INFO: tpe_transform took 0.007978 seconds 50 | [2016-05-01 23:53:35,158] INFO: TPE using 2/2 trials with best loss 0.534262 51 | [2016-05-01 23:53:35,162] INFO: ================================================== 52 | [2016-05-01 23:53:35,162] INFO: Task 53 | [2016-05-01 23:53:35,162] INFO: [Feat@basic_linear_201605010104]_[Learner@reg_skl_knn]_[Id@3] 54 | [2016-05-01 23:53:35,162] INFO: Param 55 | [2016-05-01 23:53:35,163] INFO: leaf_size: 70 56 | [2016-05-01 23:53:35,163] INFO: metric: minkowski 57 | [2016-05-01 23:53:35,163] INFO: n_neighbors: 11 58 | [2016-05-01 23:53:35,163] INFO: normalize: True 59 | [2016-05-01 23:53:35,163] INFO: weights: uniform 60 | [2016-05-01 23:53:35,164] INFO: Result 61 | [2016-05-01 23:53:35,164] INFO: Run RMSE Shape 62 | [2016-05-02 00:48:01,432] INFO: 1 0.470701 23167 x 722 63 | [2016-05-02 01:44:17,330] INFO: 2 0.473355 21940 x 722 64 | [2016-05-02 02:42:51,392] INFO: 3 0.472826 22182 x 722 65 | [2016-05-02 03:36:47,762] INFO: 4 0.473626 21966 x 722 66 | [2016-05-02 04:31:33,827] INFO: 5 0.473484 21961 x 722 67 | [2016-05-02 04:31:34,254] INFO: RMSE 68 | [2016-05-02 04:31:34,254] INFO: Mean: 0.472798 69 | [2016-05-02 04:31:34,254] INFO: Std: 0.001083 70 | [2016-05-02 04:31:34,254] INFO: Time 71 | [2016-05-02 04:31:34,255] INFO: 277 mins 72 | [2016-05-02 04:31:34,255] INFO: -------------------------------------------------- 73 | -------------------------------------------------------------------------------- /Log/level1_models/[Feat@basic_linear_201605010104]_[Learner@reg_skl_svr]_hyperopt_2016-05-01-22-45.log: -------------------------------------------------------------------------------- 1 | [2016-05-01 22:45:17,834] INFO: tpe_transform took 0.008110 seconds 2 | [2016-05-01 22:45:17,834] INFO: TPE using 0 trials 3 | [2016-05-01 22:45:17,838] INFO: 
================================================== 4 | [2016-05-01 22:45:17,838] INFO: Task 5 | [2016-05-01 22:45:17,838] INFO: [Feat@basic_linear_201605010104]_[Learner@reg_skl_svr]_[Id@1] 6 | [2016-05-01 22:45:17,839] INFO: Param 7 | [2016-05-01 22:45:17,839] INFO: C: 1.0 8 | [2016-05-01 22:45:17,839] INFO: degree: 3 9 | [2016-05-01 22:45:17,839] INFO: epsilon: 0.011478668759041495 10 | [2016-05-01 22:45:17,839] INFO: gamma: 0.023621022279227823 11 | [2016-05-01 22:45:17,839] INFO: kernel: poly 12 | [2016-05-01 22:45:17,840] INFO: normalize: True 13 | [2016-05-01 22:45:17,840] INFO: Result 14 | [2016-05-01 22:45:17,840] INFO: Run RMSE Shape 15 | [2016-05-02 04:56:39,928] INFO: 1 0.64399 23167 x 722 16 | -------------------------------------------------------------------------------- /Log/level1_models/[Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_hyperopt_2016-05-01-02-16.log: -------------------------------------------------------------------------------- 1 | [2016-05-01 02:16:33,900] INFO: tpe_transform took 0.009265 seconds 2 | [2016-05-01 02:16:33,901] INFO: TPE using 0 trials 3 | [2016-05-01 02:16:33,904] INFO: ================================================== 4 | [2016-05-01 02:16:33,905] INFO: Task 5 | [2016-05-01 02:16:33,905] INFO: [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_[Id@1] 6 | [2016-05-01 02:16:33,905] INFO: Param 7 | [2016-05-01 02:16:33,905] INFO: learning_rate: 0.002 8 | [2016-05-01 02:16:33,906] INFO: max_depth: 4 9 | [2016-05-01 02:16:33,906] INFO: max_features: 0.55 10 | [2016-05-01 02:16:33,906] INFO: min_samples_leaf: 1 11 | [2016-05-01 02:16:33,906] INFO: n_estimators: 990 12 | [2016-05-01 02:16:33,907] INFO: random_state: 2016 13 | [2016-05-01 02:16:33,907] INFO: verbose: 0 14 | [2016-05-01 02:16:33,907] INFO: Result 15 | [2016-05-01 02:16:33,907] INFO: Run RMSE Shape 16 | [2016-05-01 03:55:18,018] INFO: 1 0.452484 23167 x 726 17 | [2016-05-01 05:23:42,084] INFO: 2 0.45364 21940 x 726 18 | [2016-05-01 06:50:24,247] INFO: 3 0.452617 22182 x 726 19 | [2016-05-01 08:19:01,662] INFO: 4 0.453387 21966 x 726 20 | [2016-05-01 09:52:15,306] INFO: 5 0.453765 21961 x 726 21 | [2016-05-01 09:52:15,764] INFO: RMSE 22 | [2016-05-01 09:52:15,764] INFO: Mean: 0.453179 23 | [2016-05-01 09:52:15,764] INFO: Std: 0.000529 24 | [2016-05-01 09:52:15,764] INFO: Time 25 | [2016-05-01 09:52:15,764] INFO: 455 mins 26 | [2016-05-01 09:52:15,765] INFO: -------------------------------------------------- 27 | [2016-05-01 14:28:18,909] INFO: tpe_transform took 0.008584 seconds 28 | [2016-05-01 14:28:18,910] INFO: TPE using 1/1 trials with best loss 0.453179 29 | [2016-05-01 14:28:18,913] INFO: ================================================== 30 | [2016-05-01 14:28:18,913] INFO: Task 31 | [2016-05-01 14:28:18,913] INFO: [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_[Id@2] 32 | [2016-05-01 14:28:18,914] INFO: Param 33 | [2016-05-01 14:28:18,914] INFO: learning_rate: 0.034 34 | [2016-05-01 14:28:18,914] INFO: max_depth: 9 35 | [2016-05-01 14:28:18,914] INFO: max_features: 0.8500000000000001 36 | [2016-05-01 14:28:18,914] INFO: min_samples_leaf: 11 37 | [2016-05-01 14:28:18,915] INFO: n_estimators: 200 38 | [2016-05-01 14:28:18,915] INFO: random_state: 2016 39 | [2016-05-01 14:28:18,915] INFO: verbose: 0 40 | [2016-05-01 14:28:18,915] INFO: Result 41 | [2016-05-01 14:28:18,915] INFO: Run RMSE Shape 42 | [2016-05-01 15:41:36,106] INFO: 1 0.442676 23167 x 726 43 | [2016-05-01 16:51:39,302] INFO: 2 0.445524 21940 x 726 44 | [2016-05-01 17:57:33,868] INFO: 
3 0.442897 22182 x 726 45 | [2016-05-01 19:03:23,230] INFO: 4 0.443379 21966 x 726 46 | [2016-05-01 20:03:51,011] INFO: 5 0.443433 21961 x 726 47 | [2016-05-01 20:03:51,616] INFO: RMSE 48 | [2016-05-01 20:03:51,617] INFO: Mean: 0.443582 49 | [2016-05-01 20:03:51,617] INFO: Std: 0.001012 50 | [2016-05-01 20:03:51,617] INFO: Time 51 | [2016-05-01 20:03:51,617] INFO: 335 mins 52 | [2016-05-01 20:03:51,618] INFO: -------------------------------------------------- 53 | [2016-05-02 01:03:01,007] INFO: tpe_transform took 0.010242 seconds 54 | [2016-05-02 01:03:01,019] INFO: TPE using 2/2 trials with best loss 0.443582 55 | [2016-05-02 01:03:01,023] INFO: ================================================== 56 | [2016-05-02 01:03:01,023] INFO: Task 57 | [2016-05-02 01:03:01,023] INFO: [Feat@basic_nonlinear_201605010058]_[Learner@reg_skl_gbm]_[Id@3] 58 | [2016-05-02 01:03:01,023] INFO: Param 59 | [2016-05-02 01:03:01,023] INFO: learning_rate: 0.004 60 | [2016-05-02 01:03:01,024] INFO: max_depth: 10 61 | [2016-05-02 01:03:01,024] INFO: max_features: 0.65 62 | [2016-05-02 01:03:01,024] INFO: min_samples_leaf: 3 63 | [2016-05-02 01:03:01,024] INFO: n_estimators: 940 64 | [2016-05-02 01:03:01,024] INFO: random_state: 2016 65 | [2016-05-02 01:03:01,024] INFO: verbose: 0 66 | [2016-05-02 01:03:01,025] INFO: Result 67 | [2016-05-02 01:03:01,025] INFO: Run RMSE Shape 68 | [2016-05-02 07:52:25,663] INFO: 1 0.440727 23167 x 726 69 | [2016-05-02 12:14:21,236] INFO: 2 0.443748 21940 x 726 70 | -------------------------------------------------------------------------------- /Output/Subm/README.md: -------------------------------------------------------------------------------- 1 | 0. sub0: `test.pred.[Feat@basic_nonlinear_201604210409]_[Learner@reg_xgb_tree]_[Id@84].[Mean0.438318]_[Std0.000786].csv` 2 | - best single model from Chenglong 3 | - Public LB: **0.43996** 4 | - Private LB: **0.43811** (9th place) 5 | 6 | 1. sub1: `submission_kostia + igor final_ensemble (1 to 3 weights).csv` 7 | - best ensembled model from Igor and Kostia 8 | - Public LB: **0.43819** 9 | - Private LB: **0.43704** (8th place) 10 | 11 | 2. sub2: `test.pred.[Feat@level2_meta_linear_201605030922]_[Learner@reg_ensemble]_[Id@1].[Mean0.436087]_[Std0.001027].csv` 12 | - *reproduced* best ensembled model from Chenglong 13 | - Public LB: **0.43582** 14 | - Private LB: **0.43325** (4th place) 15 | 16 | 3. sub3: `reproduced_blend_0.438_0.436CV.csv` 17 | - *reproduced* best blended model from 0.3 * sub1 + 0.7 * sub2 18 | - Public LB: **0.43465** 19 | - Private LB: **0.43248** (3rd place) --------------------------------------------------------------------------------
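For reference, sub3 in the README above is simply a weighted average of the other two submission files, with the 0.3/0.7 weights stated there. Below is a minimal blending sketch; the function name, the input file names, and the clipping of predictions to the competition's 1-3 relevance range are assumptions for illustration, while the `id`/`relevance` submission columns follow the format produced elsewhere in the repository.

```python
# Hypothetical blending sketch (the repository has its own scripts for this):
# write a submission whose relevance is w1*sub1 + w2*sub2, matched by id.
import pandas as pd

def blend(sub1_path, sub2_path, out_path, w1=0.3, w2=0.7):
    sub1 = pd.read_csv(sub1_path)
    sub2 = pd.read_csv(sub2_path)
    merged = sub1.merge(sub2, on="id", suffixes=("_1", "_2"))
    merged["relevance"] = w1 * merged["relevance_1"] + w2 * merged["relevance_2"]
    # optional: keep blended predictions inside the 1-3 relevance range
    merged["relevance"] = merged["relevance"].clip(1.0, 3.0)
    merged[["id", "relevance"]].to_csv(out_path, index=False)

# Example call with placeholder file names:
# blend("sub1.csv", "sub2.csv", "reproduced_blend.csv")
```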