├── README.md ├── doc.pdf ├── fm ├── Makefile └── src │ ├── common.cpp │ ├── common.h │ ├── timer.cpp │ ├── timer.h │ └── train.cpp ├── gbdt ├── Makefile └── src │ ├── common.cpp │ ├── common.h │ ├── gbdt.cpp │ ├── gbdt.h │ ├── timer.cpp │ ├── timer.h │ └── train.cpp ├── license.txt └── script ├── addc.py ├── append.py ├── append_gbdt.py ├── append_gbdt_1.py ├── calibrate.py ├── calibrate.pyc ├── ensemble.py ├── fcount.py ├── ftrl_1.py ├── ftrl_2.py ├── gbdt_dense.py ├── genDict.py ├── genM.py ├── id_day.py ├── id_stat.py ├── index1.py ├── index2.py ├── lsa.py ├── prep.py ├── prep_1.py ├── rare.py ├── run.sh └── split.py /README.md: -------------------------------------------------------------------------------- 1 | Random Walker's solution for Avazu Click-Through rate prediction 2 | 3 | The introduction of our approach can be found in doc.pdf. 4 | 5 | System Requirements 6 | ------------------ 7 | - 64-bit Unix-like OS 8 | - Python 2.7 9 | - g++ 10 | - pypy 11 | - sklearn 12 | - at least 20GB memory and 50GB disk space 13 | 14 | To reproduce our submission: 15 | ------------------- 16 | - Download the data ("train" and "test") to this folder. 
17 | - Change directory to script: 18 | cd script 19 | - Run the code: 20 | ./run.sh 21 | -------------------------------------------------------------------------------- /doc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infinitezxc/kaggle-avazu/93d719617567dd5d65fdaeb2cded48732c29d907/doc.pdf -------------------------------------------------------------------------------- /fm/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -Wall -Wconversion -O3 -fPIC -std=c++0x -fopenmp 3 | MAIN = fm 4 | FILES = common.cpp timer.cpp 5 | SRCS = $(FILES:%.cpp=src/%.cpp) 6 | HEADERS = $(FILES:%.cpp=src/%.h) 7 | 8 | all: $(MAIN) 9 | 10 | fm: src/train.cpp $(SRCS) $(HEADERS) 11 | $(CXX) $(CXXFLAGS) -o $@ $< $(SRCS) 12 | 13 | clean: 14 | rm -f $(MAIN) 15 | -------------------------------------------------------------------------------- /fm/src/common.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "common.h" 6 | 7 | namespace { 8 | 9 | int const kMaxLineSize = 1000000; 10 | 11 | uint32_t get_nr_line(std::string const &path) 12 | { 13 | FILE *f = open_c_file(path.c_str(), "r"); 14 | char line[kMaxLineSize]; 15 | 16 | uint32_t nr_line = 0; 17 | while(fgets(line, kMaxLineSize, f) != nullptr) 18 | ++nr_line; 19 | 20 | fclose(f); 21 | 22 | return nr_line; 23 | } 24 | 25 | uint32_t get_nr_field(std::string const &path) 26 | { 27 | FILE *f = open_c_file(path.c_str(), "r"); 28 | char line[kMaxLineSize]; 29 | 30 | fgets(line, kMaxLineSize, f); 31 | strtok(line, " \t"); 32 | 33 | uint32_t nr_field = 0; 34 | while(1) 35 | { 36 | char *idx_char = strtok(nullptr," \t"); 37 | if(idx_char == nullptr || *idx_char == '\n') 38 | break; 39 | ++nr_field; 40 | } 41 | 42 | fclose(f); 43 | 44 | return nr_field; 45 | } 46 | 47 | } //unamed namespace 48 | 49 | Problem 
read_problem(std::string const path) 50 | { 51 | if(path.empty()) 52 | return Problem(0, 0); 53 | Problem prob(get_nr_line(path), get_nr_field(path)); 54 | 55 | FILE *f = open_c_file(path.c_str(), "r"); 56 | char line[kMaxLineSize]; 57 | 58 | uint64_t p = 0; 59 | for(uint32_t i = 0; fgets(line, kMaxLineSize, f) != nullptr; ++i) 60 | { 61 | char *y_char = strtok(line, " \t"); 62 | float const y = (atoi(y_char)>0)? 1.0f : -1.0f; 63 | prob.Y[i] = y; 64 | for(; ; ++p) 65 | { 66 | char *idx_char = strtok(nullptr," \t"); 67 | if(idx_char == nullptr || *idx_char == '\n') 68 | break; 69 | uint32_t idx = static_cast(atoi(idx_char)); 70 | prob.nr_feature = std::max(prob.nr_feature, idx); 71 | prob.J[p] = idx-1; 72 | } 73 | } 74 | 75 | fclose(f); 76 | 77 | return prob; 78 | } 79 | 80 | FILE *open_c_file(std::string const &path, std::string const &mode) 81 | { 82 | FILE *f = fopen(path.c_str(), mode.c_str()); 83 | if(!f) 84 | throw std::runtime_error(std::string("cannot open ")+path); 85 | return f; 86 | } 87 | 88 | std::vector 89 | argv_to_args(int const argc, char const * const * const argv) 90 | { 91 | std::vector args; 92 | for(int i = 1; i < argc; ++i) 93 | args.emplace_back(argv[i]); 94 | return args; 95 | } 96 | 97 | float predict(Problem const &prob, Model &model, 98 | std::string const &output_path) 99 | { 100 | FILE *f = nullptr; 101 | if(!output_path.empty()) 102 | f = open_c_file(output_path, "w"); 103 | 104 | double loss = 0; 105 | #pragma omp parallel for schedule(static) reduction(+:loss) 106 | for(uint32_t i = 0; i < prob.Y.size(); ++i) 107 | { 108 | float const y = prob.Y[i]; 109 | 110 | float const t = wTx(prob, model, i); 111 | 112 | float const prob = 1/(1+static_cast(exp(-t))); 113 | 114 | float const expnyt = static_cast(exp(-y*t)); 115 | 116 | loss += log(1+expnyt); 117 | 118 | if(f) 119 | fprintf(f, "%lf\n", prob); 120 | } 121 | 122 | if(f) 123 | fclose(f); 124 | 125 | return static_cast(loss/static_cast(prob.Y.size())); 126 | } 127 | 
-------------------------------------------------------------------------------- /fm/src/common.h: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic ignored "-Wunused-result" 2 | 3 | #ifndef _COMMON_H_ 4 | #define _COMMON_H_ 5 | 6 | #define flag { printf("\nLINE: %d\n", __LINE__); fflush(stdout); } 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | struct Problem 13 | { 14 | Problem(uint32_t const nr_instance, uint32_t const nr_field) 15 | : nr_feature(0), nr_instance(nr_instance), nr_field(nr_field), 16 | v(2.0f/static_cast(nr_field)), 17 | J(static_cast(nr_instance)*nr_field), 18 | Y(nr_instance) {} 19 | uint32_t nr_feature, nr_instance, nr_field; 20 | float v; 21 | std::vector J; 22 | std::vector Y; 23 | }; 24 | 25 | Problem read_problem(std::string const path); 26 | 27 | uint32_t const kW_NODE_SIZE = 2; 28 | 29 | struct Model 30 | { 31 | Model(uint32_t const nr_feature, uint32_t const nr_factor, uint32_t const nr_field) 32 | : W(static_cast(nr_feature)*nr_field*nr_factor*kW_NODE_SIZE, 0), 33 | nr_feature(nr_feature), nr_factor(nr_factor), nr_field(nr_field) {} 34 | std::vector W; 35 | const uint32_t nr_feature, nr_factor, nr_field; 36 | }; 37 | 38 | FILE *open_c_file(std::string const &path, std::string const &mode); 39 | 40 | std::vector 41 | argv_to_args(int const argc, char const * const * const argv); 42 | 43 | inline float wTx(Problem const &prob, Model &model, uint32_t const i, 44 | float const kappa=0, float const eta=0, float const lambda=0, 45 | bool const do_update=false) 46 | { 47 | uint32_t const nr_factor = model.nr_factor; 48 | uint32_t const nr_field = model.nr_field; 49 | uint32_t const nr_feature = model.nr_feature; 50 | uint64_t const align0 = nr_factor*kW_NODE_SIZE; 51 | uint64_t const align1 = nr_field*align0; 52 | 53 | uint32_t const * const J = &prob.J[i*nr_field]; 54 | float * const W = model.W.data(); 55 | 56 | __m128 const XMMv = _mm_set1_ps(prob.v); 57 | __m128 const 
XMMkappav = _mm_set1_ps(kappa*prob.v); 58 | __m128 const XMMeta = _mm_set1_ps(eta); 59 | __m128 const XMMlambda = _mm_set1_ps(lambda); 60 | __m128 const XMMones = _mm_set1_ps(1.0f); 61 | __m128 XMMt = _mm_setzero_ps(); 62 | for(uint32_t f1 = 0; f1 < nr_field; ++f1) 63 | { 64 | uint32_t const j1 = J[f1]; 65 | if(j1 >= nr_feature) 66 | continue; 67 | 68 | for(uint32_t f2 = f1; f2 < nr_field; ++f2) 69 | { 70 | uint32_t const j2 = J[f2]; 71 | if(j2 >= nr_feature) 72 | continue; 73 | 74 | float * const w1 = W + j1*align1 + f2*align0; 75 | float * const w2 = W + j2*align1 + f1*align0; 76 | 77 | if(do_update) 78 | { 79 | float * const wg1 = w1 + nr_factor; 80 | float * const wg2 = w2 + nr_factor; 81 | for(uint32_t d = 0; d < nr_factor; d += 4) 82 | { 83 | if( f1 != f2 ){ 84 | __m128 XMMw1 = _mm_load_ps(w1+d); 85 | __m128 XMMw2 = _mm_load_ps(w2+d); 86 | 87 | __m128 XMMwg1 = _mm_load_ps(wg1+d); 88 | __m128 XMMwg2 = _mm_load_ps(wg2+d); 89 | 90 | __m128 XMMg1 = _mm_add_ps( 91 | _mm_mul_ps(XMMlambda, XMMw1), 92 | _mm_mul_ps(XMMkappav, XMMw2)); 93 | __m128 XMMg2 = _mm_add_ps( 94 | _mm_mul_ps(XMMlambda, XMMw2), 95 | _mm_mul_ps(XMMkappav, XMMw1)); 96 | 97 | XMMwg1 = _mm_add_ps(XMMwg1, _mm_mul_ps(XMMg1, XMMg1)); 98 | XMMwg2 = _mm_add_ps(XMMwg2, _mm_mul_ps(XMMg2, XMMg2)); 99 | 100 | XMMw1 = _mm_sub_ps(XMMw1, _mm_mul_ps(XMMeta, 101 | _mm_mul_ps(_mm_rsqrt_ps(XMMwg1), XMMg1))); 102 | XMMw2 = _mm_sub_ps(XMMw2, _mm_mul_ps(XMMeta, 103 | _mm_mul_ps(_mm_rsqrt_ps(XMMwg2), XMMg2))); 104 | 105 | _mm_store_ps(w1+d, XMMw1); 106 | _mm_store_ps(w2+d, XMMw2); 107 | 108 | _mm_store_ps(wg1+d, XMMwg1); 109 | _mm_store_ps(wg2+d, XMMwg2); 110 | }else{ 111 | __m128 XMMw1 = _mm_load_ps(w1+d); 112 | __m128 XMMwg1 = _mm_load_ps(wg1+d); 113 | __m128 XMMg1 = _mm_add_ps(_mm_mul_ps(XMMlambda, XMMw1),_mm_mul_ps(XMMkappav, XMMones)); 114 | XMMwg1 = _mm_add_ps(XMMwg1, _mm_mul_ps(XMMg1, XMMg1)); 115 | XMMw1 = _mm_sub_ps(XMMw1, _mm_mul_ps(XMMeta,_mm_mul_ps(_mm_rsqrt_ps(XMMwg1), XMMg1))); 116 | _mm_store_ps(w1+d, 
XMMw1); 117 | _mm_store_ps(wg1+d, XMMwg1); 118 | } 119 | } 120 | } 121 | else 122 | { 123 | for(uint32_t d = 0; d < nr_factor; d += 4) 124 | { 125 | __m128 const XMMw1 = _mm_load_ps(w1+d); 126 | __m128 const XMMw2 = _mm_load_ps(w2+d); 127 | if( f1 != f2 ){ 128 | XMMt = _mm_add_ps(XMMt, 129 | _mm_mul_ps(_mm_mul_ps(XMMw1, XMMw2), XMMv)); 130 | }else{ 131 | XMMt = _mm_add_ps(XMMt,_mm_mul_ps(_mm_mul_ps(XMMw1, XMMones), XMMv)); 132 | } 133 | } 134 | } 135 | } 136 | } 137 | 138 | if(do_update) 139 | return 0; 140 | 141 | XMMt = _mm_hadd_ps(XMMt, XMMt); 142 | XMMt = _mm_hadd_ps(XMMt, XMMt); 143 | float t; 144 | _mm_store_ss(&t, XMMt); 145 | 146 | return t; 147 | } 148 | 149 | float predict(Problem const &prob, Model &model, 150 | std::string const &output_path = std::string("")); 151 | #endif // _COMMON_H_ 152 | -------------------------------------------------------------------------------- /fm/src/timer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "timer.h" 3 | 4 | Timer::Timer() 5 | { 6 | reset(); 7 | } 8 | 9 | void Timer::reset() 10 | { 11 | begin = std::chrono::high_resolution_clock::now(); 12 | duration = 13 | std::chrono::duration_cast(begin-begin); 14 | } 15 | 16 | void Timer::tic() 17 | { 18 | begin = std::chrono::high_resolution_clock::now(); 19 | } 20 | 21 | float Timer::toc() 22 | { 23 | duration += std::chrono::duration_cast 24 | (std::chrono::high_resolution_clock::now()-begin); 25 | return (float)duration.count()/1000; 26 | } 27 | 28 | float Timer::get() 29 | { 30 | float time = toc(); 31 | tic(); 32 | return time; 33 | } 34 | -------------------------------------------------------------------------------- /fm/src/timer.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | class Timer 4 | { 5 | public: 6 | Timer(); 7 | void reset(); 8 | void tic(); 9 | float toc(); 10 | float get(); 11 | private: 12 | 
std::chrono::high_resolution_clock::time_point begin; 13 | std::chrono::milliseconds duration; 14 | }; 15 | -------------------------------------------------------------------------------- /fm/src/train.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "common.h" 7 | #include "timer.h" 8 | 9 | namespace { 10 | 11 | struct Option 12 | { 13 | Option() 14 | : eta(0.1f), lambda(0.00002f), iter(5), nr_factor(4), 15 | nr_threads(1), do_prediction(true) {} 16 | std::string Tr_path, Va_path; 17 | float eta, lambda; 18 | uint32_t iter, nr_factor, nr_threads; 19 | bool do_prediction; 20 | }; 21 | 22 | std::string train_help() 23 | { 24 | return std::string( 25 | "usage: fm [] \n" 26 | "\n" 27 | ".out will be automatically generated at the end of training\n" 28 | "\n" 29 | "options:\n" 30 | "-l : set the regularization penalty\n" 31 | "-k : set the number of latent factors, which must be a multiple of 4\n" 32 | "-t : set the number of iterations\n" 33 | "-r : set the learning rate\n" 34 | "-s : set the number of threads\n" 35 | "-q: if it is set, then there is no output file\n"); 36 | } 37 | 38 | Option parse_option(std::vector const &args) 39 | { 40 | uint32_t const argc = static_cast(args.size()); 41 | 42 | if(argc == 0) 43 | throw std::invalid_argument(train_help()); 44 | 45 | Option opt; 46 | 47 | uint32_t i = 0; 48 | for(; i < argc; ++i) 49 | { 50 | if(args[i].compare("-t") == 0) 51 | { 52 | if(i == argc-1) 53 | throw std::invalid_argument("invalid command\n"); 54 | opt.iter = std::stoi(args[++i]); 55 | } 56 | else if(args[i].compare("-k") == 0) 57 | { 58 | if(i == argc-1) 59 | throw std::invalid_argument("invalid command\n"); 60 | opt.nr_factor = std::stoi(args[++i]); 61 | if(opt.nr_factor%4 != 0) 62 | throw std::invalid_argument("k should be a multiple of 4\n"); 63 | } 64 | else if(args[i].compare("-r") == 0) 65 | { 66 | if(i == argc-1) 67 | throw 
std::invalid_argument("invalid command\n"); 68 | opt.eta = std::stof(args[++i]); 69 | } 70 | else if(args[i].compare("-l") == 0) 71 | { 72 | if(i == argc-1) 73 | throw std::invalid_argument("invalid command\n"); 74 | opt.lambda = std::stof(args[++i]); 75 | } 76 | else if(args[i].compare("-s") == 0) 77 | { 78 | if(i == argc-1) 79 | throw std::invalid_argument("invalid command\n"); 80 | opt.nr_threads = std::stoi(args[++i]); 81 | } 82 | else if(args[i].compare("-q") == 0) 83 | { 84 | opt.do_prediction = false; 85 | } 86 | else 87 | { 88 | break; 89 | } 90 | } 91 | 92 | if(i >= argc-1) 93 | throw std::invalid_argument("training or test set not specified\n"); 94 | 95 | opt.Va_path = args[i++]; 96 | opt.Tr_path = args[i++]; 97 | 98 | return opt; 99 | } 100 | 101 | void init_model(Model &model) 102 | { 103 | uint32_t const nr_factor = model.nr_factor; 104 | float const coef = 105 | static_cast(0.33/sqrt(static_cast(nr_factor))); 106 | 107 | float * w = model.W.data(); 108 | for(uint32_t j = 0; j < model.nr_feature; ++j) 109 | { 110 | for(uint32_t f = 0; f < model.nr_field; ++f) 111 | { 112 | for(uint32_t d = 0; d < nr_factor; ++d, ++w) 113 | *w = coef*(static_cast(drand48())); 114 | for(uint32_t d = nr_factor; d < 2*nr_factor; ++d, ++w) 115 | *w = 0.5f; 116 | } 117 | } 118 | } 119 | 120 | void train(Problem const &Tr, Problem const &Va, Model &model, Option const &opt) 121 | { 122 | std::vector order(Tr.Y.size()); 123 | for(uint32_t i = 0; i < Tr.Y.size(); ++i) 124 | order[i] = i; 125 | 126 | Timer timer; 127 | printf("iter time tr_loss va_loss\n"); 128 | for(uint32_t iter = 0; iter < opt.iter; ++iter) 129 | { 130 | timer.tic(); 131 | 132 | double Tr_loss = 0; 133 | #pragma omp parallel for schedule(static) 134 | for(uint32_t i_ = 0; i_ < order.size(); ++i_) 135 | { 136 | uint32_t const i = order[i_]; 137 | 138 | float const y = Tr.Y[i]; 139 | 140 | float const t = wTx(Tr, model, i); 141 | 142 | float const expnyt = static_cast(exp(-y*t)); 143 | 144 | Tr_loss += 
log(1+expnyt); 145 | 146 | float const kappa = -y*expnyt/(1+expnyt); 147 | 148 | wTx(Tr, model, i, kappa, opt.eta, opt.lambda, true); 149 | } 150 | Tr_loss /= static_cast(Tr.Y.size()); 151 | 152 | double const Va_loss = predict(Va, model); 153 | 154 | printf("%4d %8.1f %10.5f %10.5f\n", 155 | iter, timer.toc(), Tr_loss, Va_loss); 156 | fflush(stdout); 157 | } 158 | } 159 | 160 | } //unnamed namespace 161 | 162 | int main(int const argc, char const * const * const argv) 163 | { 164 | Option opt; 165 | try 166 | { 167 | opt = parse_option(argv_to_args(argc, argv)); 168 | } 169 | catch(std::invalid_argument const &e) 170 | { 171 | std::cout << e.what(); 172 | return EXIT_FAILURE; 173 | } 174 | 175 | std::cout << "reading data..." << std::flush; 176 | Problem const Va = read_problem(opt.Va_path); 177 | Problem const Tr = read_problem(opt.Tr_path); 178 | std::cout << "done\n" << std::flush; 179 | 180 | std::cout << "initializing model..." << std::flush; 181 | Model model(Tr.nr_feature, opt.nr_factor, Tr.nr_field); 182 | init_model(model); 183 | std::cout << "done\n" << std::flush; 184 | 185 | omp_set_num_threads(static_cast(opt.nr_threads)); 186 | 187 | train(Tr, Va, model, opt); 188 | 189 | omp_set_num_threads(1); 190 | 191 | if(opt.do_prediction) 192 | predict(Va, model, opt.Va_path+".out"); 193 | 194 | return EXIT_SUCCESS; 195 | } 196 | -------------------------------------------------------------------------------- /gbdt/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -Wall -Wconversion -O2 -fPIC -std=c++0x -fopenmp 3 | 4 | MAIN = gbdt 5 | FILES = common.cpp timer.cpp gbdt.cpp 6 | SRCS = $(FILES:%.cpp=src/%.cpp) 7 | HEADERS = $(FILES:%.cpp=src/%.h) 8 | 9 | all: $(MAIN) 10 | 11 | gbdt: src/train.cpp $(SRCS) $(HEADERS) 12 | $(CXX) $(CXXFLAGS) -o $@ $< $(SRCS) 13 | 14 | clean: 15 | rm -f $(MAIN) 16 | -------------------------------------------------------------------------------- 
/gbdt/src/common.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "common.h" 8 | 9 | namespace { 10 | 11 | uint32_t const kMaxLineSize = 1000000; 12 | 13 | uint32_t get_nr_line(std::string const &path) 14 | { 15 | FILE *f = open_c_file(path.c_str(), "r"); 16 | char line[kMaxLineSize]; 17 | 18 | uint32_t nr_line = 0; 19 | while(fgets(line, kMaxLineSize, f) != nullptr) 20 | ++nr_line; 21 | 22 | fclose(f); 23 | 24 | printf("%s,%d",path.c_str(),nr_line); 25 | return nr_line; 26 | } 27 | 28 | uint32_t get_nr_field(std::string const &path) 29 | { 30 | FILE *f = open_c_file(path.c_str(), "r"); 31 | char line[kMaxLineSize]; 32 | 33 | fgets(line, kMaxLineSize, f); 34 | strtok(line, " \t"); 35 | 36 | uint32_t nr_field = 0; 37 | while(1) 38 | { 39 | char *val_char = strtok(nullptr," \t"); 40 | if(val_char == nullptr || *val_char == '\n') 41 | break; 42 | ++nr_field; 43 | } 44 | 45 | fclose(f); 46 | 47 | return nr_field; 48 | } 49 | 50 | 51 | void read_dense(Problem &prob, std::string const &path) 52 | { 53 | char line[kMaxLineSize]; 54 | 55 | FILE *f = open_c_file(path.c_str(), "r"); 56 | for(uint32_t i = 0; fgets(line, kMaxLineSize, f) != nullptr; ++i) 57 | { 58 | char *p = strtok(line, " \t"); 59 | prob.Y[i] = (atoi(p)>0)? 
1.0f : -1.0f; 60 | for(uint32_t j = 0; j < prob.nr_field; ++j) 61 | { 62 | char *val_char = strtok(nullptr," \t"); 63 | 64 | float const val = static_cast(atof(val_char)); 65 | 66 | prob.X[j][i] = Node(i, val); 67 | } 68 | } 69 | 70 | fclose(f); 71 | //sort_problem(prob); 72 | } 73 | 74 | void sort_problem(Problem &prob) 75 | { 76 | struct sort_by_v 77 | { 78 | bool operator() (Node const lhs, Node const rhs) 79 | { 80 | return lhs.v < rhs.v; 81 | } 82 | }; 83 | 84 | #pragma omp parallel for schedule(static) 85 | for(uint32_t j = 0; j < prob.nr_field; ++j) 86 | { 87 | std::vector &X1 = prob.X[j]; 88 | std::vector &Z1 = prob.Z[j]; 89 | std::sort(X1.begin(), X1.end(), sort_by_v()); 90 | for(uint32_t i = 0; i < prob.nr_instance; ++i) 91 | Z1[X1[i].i] = Node(i, X1[i].v); 92 | } 93 | } 94 | /* 95 | void read_sparse(Problem &prob, std::string const &path) 96 | { 97 | char line[kMaxLineSize]; 98 | 99 | FILE *f = open_c_file(path.c_str(), "r"); 100 | 101 | std::vector> buffer; 102 | 103 | uint64_t nnz = 0; 104 | uint32_t nr_instance = 0; 105 | prob.SJP.push_back(0); 106 | for(; fgets(line, kMaxLineSize, f) != nullptr; ++nr_instance) 107 | { 108 | strtok(line, " \t"); 109 | for( ; ; ++nnz) 110 | { 111 | char *idx_char = strtok(nullptr," \t"); 112 | if(idx_char == nullptr || *idx_char == '\n') 113 | break; 114 | 115 | uint32_t const idx = atoi(idx_char); 116 | if(idx > buffer.size()) 117 | buffer.resize(idx); 118 | buffer[idx-1].push_back(nr_instance); 119 | prob.SJ.push_back(idx-1); 120 | } 121 | prob.SJP.push_back(prob.SJ.size()); 122 | } 123 | prob.SJ.shrink_to_fit(); 124 | prob.SJP.shrink_to_fit(); 125 | 126 | prob.nr_sparse_field = static_cast(buffer.size()); 127 | prob.SI.resize(nnz); 128 | prob.SIP.resize(prob.nr_sparse_field+1); 129 | prob.SIP[0] = 0; 130 | 131 | uint64_t p = 0; 132 | for(uint32_t j = 0; j < prob.nr_sparse_field; ++j) 133 | { 134 | for(auto i : buffer[j]) 135 | prob.SI[p++] = i; 136 | prob.SIP[j+1] = p; 137 | } 138 | 139 | fclose(f); 140 | 141 | 
sort_problem(prob); 142 | } 143 | */ 144 | } //unamed namespace 145 | 146 | //Problem read_data(std::string const &dense_path, std::string const &sparse_path) 147 | Problem read_data(std::string const &dense_path) 148 | { 149 | Problem prob(get_nr_line(dense_path), get_nr_field(dense_path)); 150 | 151 | read_dense(prob, dense_path); 152 | 153 | sort_problem(prob) ; 154 | 155 | return prob; 156 | } 157 | 158 | FILE *open_c_file(std::string const &path, std::string const &mode) 159 | { 160 | FILE *f = fopen(path.c_str(), mode.c_str()); 161 | if(!f) 162 | throw std::runtime_error(std::string("cannot open ")+path); 163 | return f; 164 | } 165 | 166 | std::vector 167 | argv_to_args(int const argc, char const * const * const argv) 168 | { 169 | std::vector args; 170 | for(int i = 1; i < argc; ++i) 171 | args.emplace_back(argv[i]); 172 | return args; 173 | } 174 | -------------------------------------------------------------------------------- /gbdt/src/common.h: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic ignored "-Wunused-result" 2 | 3 | #ifndef _COMMON_H_ 4 | #define _COMMON_H_ 5 | 6 | #define flag { printf("\nLINE: %d\n", __LINE__); fflush(stdout); } 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | struct Node 16 | { 17 | Node() : i(0), v(0) {} 18 | Node(uint32_t const i, float const v) : i(i), v(v) {} 19 | uint32_t i; 20 | float v; 21 | }; 22 | 23 | struct Problem 24 | { 25 | Problem() : nr_instance(0), nr_field(0), nr_sparse_field(0) {} 26 | Problem(uint32_t const nr_instance, uint32_t const nr_field) 27 | : nr_instance(nr_instance), nr_field(nr_field), nr_sparse_field(0), 28 | X(nr_field, std::vector(nr_instance)), 29 | Z(nr_field, std::vector(nr_instance)), 30 | Y(nr_instance) {} 31 | uint32_t const nr_instance, nr_field; 32 | uint32_t nr_sparse_field; 33 | std::vector> X, Z; 34 | std::vector SI, SJ; 35 | std::vector SIP, SJP; 36 | std::vector Y; 37 | }; 38 | 39 | 
inline std::vector 40 | construct_instance(Problem const &prob, uint32_t const i) 41 | { 42 | uint32_t const nr_field = prob.nr_field; 43 | uint32_t const nr_sparse_field = prob.nr_sparse_field; 44 | //std::vector const &SJ = prob.SJ; 45 | //std::vector const &SJP = prob.SJP; 46 | 47 | std::vector x(nr_field+nr_sparse_field, 0); 48 | for(uint32_t j = 0; j < prob.nr_field; ++j) 49 | x[j] = prob.Z[j][i].v; 50 | //for(uint64_t p = SJP[i]; p < SJP[i+1]; ++p) 51 | //x[SJ[p]+nr_field] = 1; 52 | 53 | return x; 54 | } 55 | 56 | Problem read_data(std::string const &dense_path, 57 | std::string const &sparse_path); 58 | 59 | Problem read_data(std::string const &dense_path); 60 | 61 | FILE *open_c_file(std::string const &path, std::string const &mode); 62 | 63 | std::vector 64 | argv_to_args(int const argc, char const * const * const argv); 65 | 66 | #endif // _COMMON_H_ 67 | -------------------------------------------------------------------------------- /gbdt/src/gbdt.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "gbdt.h" 8 | #include "timer.h" 9 | 10 | namespace { 11 | 12 | float calc_bias(std::vector const &Y) 13 | { 14 | double y_bar = std::accumulate(Y.begin(), Y.end(), 0.0); 15 | y_bar /= static_cast(Y.size()); 16 | y_bar = y_bar/2 + 0.5 ; 17 | return static_cast(log((1.0+y_bar)/(1.0-y_bar))); 18 | } 19 | 20 | struct Location 21 | { 22 | Location() : tnode_idx(1), r(0), shrinked(false) {} 23 | uint32_t tnode_idx; 24 | float r; 25 | bool shrinked; 26 | }; 27 | 28 | struct Meta 29 | { 30 | Meta() : sl(0), s(0), nl(0), n(0), v(0.0f/0.0f) {} 31 | double sl, s; 32 | uint32_t nl, n; 33 | float v; 34 | }; 35 | 36 | struct Defender 37 | { 38 | Defender() : ese(0), threshold(0) {} 39 | double ese; 40 | float threshold; 41 | }; 42 | 43 | void scan( 44 | Problem const &prob, 45 | std::vector const &locations, 46 | std::vector const &metas0, 47 | std::vector 
&defenders, 48 | uint32_t const offset, 49 | bool const forward) 50 | { 51 | uint32_t const nr_field = prob.nr_field; 52 | uint32_t const nr_instance = prob.nr_instance; 53 | 54 | #pragma omp parallel for schedule(dynamic) 55 | for(uint32_t j = 0; j < nr_field; ++j) 56 | { 57 | std::vector metas = metas0; 58 | 59 | for(uint32_t i_bar = 0; i_bar < nr_instance; ++i_bar) 60 | { 61 | uint32_t const i = forward? i_bar : nr_instance-i_bar-1; 62 | 63 | Node const &dnode = prob.X[j][i]; 64 | Location const &location = locations[dnode.i]; 65 | if(location.shrinked) 66 | continue; 67 | 68 | uint32_t const f = location.tnode_idx-offset; 69 | Meta &meta = metas[f]; 70 | 71 | if(dnode.v != meta.v) 72 | { 73 | double const sr = meta.s - meta.sl; 74 | uint32_t const nr = meta.n - meta.nl; 75 | double const current_ese = 76 | (meta.sl*meta.sl)/static_cast(meta.nl) + 77 | (sr*sr)/static_cast(nr); 78 | 79 | Defender &defender = defenders[f*nr_field+j]; 80 | double &best_ese = defender.ese; 81 | if(current_ese > best_ese) 82 | { 83 | best_ese = current_ese; 84 | defender.threshold = forward? 
dnode.v : meta.v; 85 | } 86 | if(i_bar > nr_instance/2) 87 | break; 88 | } 89 | 90 | meta.sl += location.r; 91 | ++meta.nl; 92 | meta.v = dnode.v; 93 | } 94 | } 95 | } 96 | 97 | void scan_sparse( 98 | Problem const &prob, 99 | std::vector const &locations, 100 | std::vector const &metas0, 101 | std::vector &defenders, 102 | uint32_t const offset, 103 | bool const forward) 104 | { 105 | uint32_t const nr_sparse_field = prob.nr_sparse_field; 106 | uint32_t const nr_leaf = offset; 107 | 108 | #pragma omp parallel for schedule(dynamic) 109 | for(uint32_t j = 0; j < nr_sparse_field; ++j) 110 | { 111 | std::vector metas = metas0; 112 | for(uint64_t p = prob.SIP[j]; p < prob.SIP[j+1]; ++p) 113 | { 114 | Location const &location = locations[prob.SI[p]]; 115 | if(location.shrinked) 116 | continue; 117 | Meta &meta = metas[location.tnode_idx-offset]; 118 | meta.sl += location.r; 119 | ++meta.nl; 120 | } 121 | 122 | for(uint32_t f = 0; f < nr_leaf; ++f) 123 | { 124 | Meta const &meta = metas[f]; 125 | if(meta.nl == 0) 126 | continue; 127 | 128 | double const sr = meta.s - meta.sl; 129 | uint32_t const nr = meta.n - meta.nl; 130 | double const current_ese = 131 | (meta.sl*meta.sl)/static_cast(meta.nl) + 132 | (sr*sr)/static_cast(nr); 133 | 134 | Defender &defender = defenders[f*nr_sparse_field+j]; 135 | double &best_ese = defender.ese; 136 | if(current_ese > best_ese) 137 | { 138 | best_ese = current_ese; 139 | defender.threshold = 1; 140 | } 141 | } 142 | } 143 | } 144 | 145 | } //unnamed namespace 146 | 147 | uint32_t CART::max_depth = 7; 148 | uint32_t CART::max_tnodes = static_cast(pow(2, CART::max_depth+1)); 149 | std::mutex CART::mtx; 150 | bool CART::verbose = false; 151 | 152 | void CART::fit(Problem const &prob, std::vector const &R, 153 | std::vector &F1) 154 | { 155 | uint32_t const nr_field = prob.nr_field; 156 | uint32_t const nr_sparse_field = prob.nr_sparse_field; 157 | uint32_t const nr_instance = prob.nr_instance; 158 | 159 | std::vector 
locations(nr_instance); 160 | #pragma omp parallel for schedule(static) 161 | for(uint32_t i = 0; i < nr_instance; ++i) 162 | locations[i].r = R[i]; 163 | for(uint32_t d = 0, offset = 1; d < max_depth; ++d, offset *= 2) 164 | { 165 | uint32_t const nr_leaf = static_cast(pow(2, d)); 166 | std::vector metas0(nr_leaf); 167 | 168 | for(uint32_t i = 0; i < nr_instance; ++i) 169 | { 170 | Location &location = locations[i]; 171 | if(location.shrinked) 172 | continue; 173 | 174 | Meta &meta = metas0[location.tnode_idx-offset]; 175 | meta.s += location.r; 176 | ++meta.n; 177 | } 178 | 179 | std::vector defenders(nr_leaf*nr_field); 180 | std::vector defenders_sparse(nr_leaf*nr_sparse_field); 181 | for(uint32_t f = 0; f < nr_leaf; ++f) 182 | { 183 | Meta const &meta = metas0[f]; 184 | double const ese = meta.s*meta.s/static_cast(meta.n); 185 | for(uint32_t j = 0; j < nr_field; ++j) 186 | defenders[f*nr_field+j].ese = ese; 187 | for(uint32_t j = 0; j < nr_sparse_field; ++j) 188 | defenders_sparse[f*nr_sparse_field+j].ese = ese; 189 | } 190 | std::vector defenders_inv = defenders; 191 | 192 | std::thread thread_f(scan, std::ref(prob), std::ref(locations), 193 | std::ref(metas0), std::ref(defenders), offset, true); 194 | std::thread thread_b(scan, std::ref(prob), std::ref(locations), 195 | std::ref(metas0), std::ref(defenders_inv), offset, false); 196 | scan_sparse(prob, locations, metas0, defenders_sparse, offset, true); 197 | thread_f.join(); 198 | thread_b.join(); 199 | 200 | for(uint32_t f = 0; f < nr_leaf; ++f) 201 | { 202 | Meta const &meta = metas0[f]; 203 | double best_ese = meta.s*meta.s/static_cast(meta.n); 204 | TreeNode &tnode = tnodes[f+offset]; 205 | for(uint32_t j = 0; j < nr_field; ++j) 206 | { 207 | Defender defender = defenders[f*nr_field+j]; 208 | if(defender.ese > best_ese) 209 | { 210 | best_ese = defender.ese; 211 | tnode.feature = j; 212 | tnode.threshold = defender.threshold; 213 | } 214 | 215 | defender = defenders_inv[f*nr_field+j]; 216 | 
if(defender.ese > best_ese) 217 | { 218 | best_ese = defender.ese; 219 | tnode.feature = j; 220 | tnode.threshold = defender.threshold; 221 | } 222 | } 223 | for(uint32_t j = 0; j < nr_sparse_field; ++j) 224 | { 225 | Defender defender = defenders_sparse[f*nr_sparse_field+j]; 226 | if(defender.ese > best_ese) 227 | { 228 | best_ese = defender.ese; 229 | tnode.feature = nr_field + j; 230 | tnode.threshold = defender.threshold; 231 | } 232 | } 233 | 234 | } 235 | 236 | #pragma omp parallel for schedule(static) 237 | for(uint32_t i = 0; i < nr_instance; ++i) 238 | { 239 | Location &location = locations[i]; 240 | if(location.shrinked) 241 | continue; 242 | 243 | uint32_t &tnode_idx = location.tnode_idx; 244 | TreeNode &tnode = tnodes[tnode_idx]; 245 | if(tnode.feature == -1) 246 | { 247 | location.shrinked = true; 248 | } 249 | else if(static_cast(tnode.feature) < nr_field) 250 | { 251 | if(prob.Z[tnode.feature][i].v < tnode.threshold) 252 | tnode_idx = 2*tnode_idx; 253 | else 254 | tnode_idx = 2*tnode_idx+1; 255 | } 256 | else 257 | { 258 | uint32_t const target_feature 259 | = static_cast(tnode.feature-nr_field); 260 | bool is_one = false; 261 | for(uint64_t p = prob.SJP[i]; p < prob.SJP[i+1]; ++p) 262 | { 263 | if(prob.SJ[p] == target_feature) 264 | { 265 | is_one = true; 266 | break; 267 | } 268 | } 269 | if(!is_one) 270 | tnode_idx = 2*tnode_idx; 271 | else 272 | tnode_idx = 2*tnode_idx+1; 273 | } 274 | } 275 | } 276 | 277 | std::vector> 278 | tmp(max_tnodes, std::make_pair(0, 0)); 279 | for(uint32_t i = 0; i < nr_instance; ++i) 280 | { 281 | float const r = locations[i].r; 282 | uint32_t const tnode_idx = locations[i].tnode_idx; 283 | tmp[tnode_idx].first += r; 284 | tmp[tnode_idx].second += fabs(r)*(1-fabs(r)); 285 | } 286 | 287 | for(uint32_t tnode_idx = 1; tnode_idx <= max_tnodes; ++tnode_idx) 288 | { 289 | double a, b; 290 | std::tie(a, b) = tmp[tnode_idx]; 291 | tnodes[tnode_idx].gamma = (b <= 1e-12)? 
0 : static_cast(a/b); 292 | } 293 | 294 | #pragma omp parallel for schedule(static) 295 | for(uint32_t i = 0; i < nr_instance; ++i) 296 | F1[i] = tnodes[locations[i].tnode_idx].gamma; 297 | } 298 | 299 | std::pair CART::predict(float const * const x) const 300 | { 301 | uint32_t tnode_idx = 1; 302 | for(uint32_t d = 0; d <= max_depth; ++d) 303 | { 304 | TreeNode const &tnode = tnodes[tnode_idx]; 305 | if(tnode.feature == -1) 306 | return std::make_pair(tnode.idx, tnode.gamma); 307 | 308 | if(x[tnode.feature] < tnode.threshold) 309 | tnode_idx = tnode_idx*2; 310 | else 311 | tnode_idx = tnode_idx*2+1; 312 | } 313 | 314 | return std::make_pair(-1, -1); 315 | } 316 | 317 | void GBDT::fit(Problem const &Tr, Problem const &Va) 318 | { 319 | bias = calc_bias(Tr.Y); 320 | 321 | std::vector F_Tr(Tr.nr_instance, bias), F_Va(Va.nr_instance, bias); 322 | 323 | Timer timer; 324 | printf("iter time tr_loss va_loss\n"); 325 | for(uint32_t t = 0; t < trees.size(); ++t) 326 | { 327 | timer.tic(); 328 | 329 | std::vector const &Y = Tr.Y; 330 | std::vector R(Tr.nr_instance), F1(Tr.nr_instance); 331 | 332 | #pragma omp parallel for schedule(static) 333 | for(uint32_t i = 0; i < Tr.nr_instance; ++i){ 334 | double tmp = Y[i]*F_Tr[i]; 335 | //tmp = (tmp < 1e-12)? 1e-12 : tmp ; 336 | //tmp = (tmp > 1e12)? 
1e12 : tmp ; 337 | //R[i] = static_cast(Y[i]/(1+exp(Y[i]*F_Tr[i]))); 338 | R[i] = static_cast(Y[i]/(1+exp(tmp))); 339 | } 340 | trees[t].fit(Tr, R, F1); 341 | double Tr_loss = 0; 342 | #pragma omp parallel for schedule(static) reduction(+: Tr_loss) 343 | for(uint32_t i = 0; i < Tr.nr_instance; ++i) 344 | { 345 | F_Tr[i] += F1[i]; 346 | Tr_loss += log(1+exp(-Y[i]*F_Tr[i])); 347 | } 348 | Tr_loss /= static_cast(Tr.nr_instance); 349 | 350 | #pragma omp parallel for schedule(static) 351 | for(uint32_t i = 0; i < Va.nr_instance; ++i) 352 | { 353 | std::vector x = construct_instance(Va, i); 354 | F_Va[i] += trees[t].predict(x.data()).second; 355 | } 356 | 357 | double Va_loss = 0; 358 | #pragma omp parallel for schedule(static) reduction(+: Va_loss) 359 | for(uint32_t i = 0; i < Va.nr_instance; ++i) 360 | Va_loss += log(1+exp(-Va.Y[i]*F_Va[i])); 361 | Va_loss /= static_cast(Va.nr_instance); 362 | 363 | printf("%4d %8.1f %10.5f %10.5f\n", t, timer.toc(), Tr_loss, Va_loss); 364 | fflush(stdout); 365 | } 366 | } 367 | 368 | float GBDT::predict(float const * const x) const 369 | { 370 | float s = bias; 371 | for(auto &tree : trees) 372 | s += tree.predict(x).second; 373 | return s; 374 | } 375 | 376 | std::vector GBDT::get_indices(float const * const x) const 377 | { 378 | uint32_t const nr_tree = static_cast(trees.size()); 379 | 380 | std::vector indices(nr_tree); 381 | for(uint32_t t = 0; t < nr_tree; ++t) 382 | indices[t] = trees[t].predict(x).first; 383 | return indices; 384 | } 385 | -------------------------------------------------------------------------------- /gbdt/src/gbdt.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "common.h" 6 | 7 | struct TreeNode 8 | { 9 | TreeNode() : idx(0), feature(-1), threshold(0), gamma(0) {} 10 | uint32_t idx; 11 | int32_t feature; 12 | float threshold, gamma; 13 | }; 14 | 15 | class CART 16 | { 17 | public: 18 | CART() : tnodes(max_tnodes) 19 
| { 20 | for(uint32_t i = 1; i <= max_tnodes; ++i) 21 | tnodes[i].idx = i; 22 | } 23 | void fit(Problem const &prob, std::vector const &R, 24 | std::vector &F1); 25 | std::pair predict(float const * const x) const; 26 | 27 | static uint32_t max_depth, max_tnodes; 28 | 29 | private: 30 | static std::mutex mtx; 31 | static bool verbose; 32 | std::vector tnodes; 33 | }; 34 | 35 | class GBDT 36 | { 37 | public: 38 | GBDT(uint32_t const nr_tree) : trees(nr_tree), bias(0) {} 39 | void fit(Problem const &Tr, Problem const &Va); 40 | float predict(float const * const x) const; 41 | std::vector get_indices(float const * const x) const; 42 | 43 | private: 44 | std::vector trees; 45 | float bias; 46 | }; 47 | -------------------------------------------------------------------------------- /gbdt/src/timer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "timer.h" 3 | 4 | Timer::Timer() 5 | { 6 | reset(); 7 | } 8 | 9 | void Timer::reset() 10 | { 11 | begin = std::chrono::high_resolution_clock::now(); 12 | duration = 13 | std::chrono::duration_cast(begin-begin); 14 | } 15 | 16 | void Timer::tic() 17 | { 18 | begin = std::chrono::high_resolution_clock::now(); 19 | } 20 | 21 | float Timer::toc() 22 | { 23 | duration += std::chrono::duration_cast 24 | (std::chrono::high_resolution_clock::now()-begin); 25 | return (float)duration.count()/1000; 26 | } 27 | 28 | float Timer::get() 29 | { 30 | float time = toc(); 31 | tic(); 32 | return time; 33 | } 34 | -------------------------------------------------------------------------------- /gbdt/src/timer.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | class Timer 4 | { 5 | public: 6 | Timer(); 7 | void reset(); 8 | void tic(); 9 | float toc(); 10 | float get(); 11 | private: 12 | std::chrono::high_resolution_clock::time_point begin; 13 | std::chrono::milliseconds duration; 14 | }; 15 | 
-------------------------------------------------------------------------------- /gbdt/src/train.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "common.h" 5 | #include "timer.h" 6 | #include "gbdt.h" 7 | 8 | namespace { 9 | 10 | struct Option 11 | { 12 | Option() : nr_tree(30), nr_thread(1) {} 13 | std::string Tr_path, TrS_path, Va_path, VaS_path, Va_out_path, Tr_out_path; 14 | uint32_t nr_tree, nr_thread; 15 | }; 16 | 17 | std::string train_help() 18 | { 19 | return std::string( 20 | "usage: gbdt [] \n" 21 | "\n" 22 | "options:\n" 23 | "-d : set the maximum depth of a tree\n" 24 | "-s : set the maximum number of threads\n" 25 | "-t : set the number of trees\n"); 26 | } 27 | 28 | Option parse_option(std::vector const &args) 29 | { 30 | uint32_t const argc = static_cast(args.size()); 31 | 32 | if(argc == 0) 33 | throw std::invalid_argument(train_help()); 34 | 35 | Option opt; 36 | 37 | uint32_t i = 0; 38 | for(; i < argc; ++i) 39 | { 40 | if(args[i].compare("-d") == 0) 41 | { 42 | if(i == argc-1) 43 | throw std::invalid_argument("invalid command"); 44 | CART::max_depth = std::stoi(args[++i]); 45 | } 46 | else if(args[i].compare("-t") == 0) 47 | { 48 | if(i == argc-1) 49 | throw std::invalid_argument("invalid command"); 50 | opt.nr_tree = std::stoi(args[++i]); 51 | } 52 | else if(args[i].compare("-s") == 0) 53 | { 54 | if(i == argc-1) 55 | throw std::invalid_argument("invalid command"); 56 | opt.nr_thread = std::stoi(args[++i]); 57 | } 58 | else 59 | { 60 | break; 61 | } 62 | } 63 | 64 | if(i != argc-4) 65 | throw std::invalid_argument("invalid command"); 66 | 67 | opt.Va_path = args[i++]; 68 | //opt.VaS_path = args[i++]; 69 | opt.Tr_path = args[i++]; 70 | //opt.TrS_path = args[i++]; 71 | opt.Va_out_path = args[i++]; 72 | opt.Tr_out_path = args[i++]; 73 | 74 | return opt; 75 | 76 | } 77 | 78 | void write(Problem const &prob, GBDT const &gbdt, std::string const &path) 79 | { 80 | FILE *f 
= open_c_file(path, "w"); 81 | 82 | for(uint32_t i = 0; i < prob.nr_instance; ++i) 83 | { 84 | std::vector x = construct_instance(prob, i); 85 | std::vector indices = gbdt.get_indices(x.data()); 86 | 87 | fprintf(f, "%d", static_cast(prob.Y[i])); 88 | for(uint32_t t = 0; t < indices.size(); ++t) 89 | fprintf(f, " %d", indices[t]); 90 | fprintf(f, "\n"); 91 | } 92 | 93 | fclose(f); 94 | } 95 | 96 | } //unnamed namespace 97 | 98 | int main(int const argc, char const * const * const argv) 99 | { 100 | Option opt; 101 | try 102 | { 103 | opt = parse_option(argv_to_args(argc, argv)); 104 | } 105 | catch(std::invalid_argument const &e) 106 | { 107 | std::cout << e.what(); 108 | return EXIT_FAILURE; 109 | } 110 | std::cout << "reading data..." << std::flush; 111 | //Problem const Tr = read_data(opt.Tr_path, opt.TrS_path); 112 | //Problem const Va = read_data(opt.Va_path, opt.VaS_path); 113 | Problem const Tr = read_data(opt.Tr_path); 114 | Problem const Va = read_data(opt.Va_path); 115 | std::cout << "done\n" << std::flush; 116 | 117 | omp_set_num_threads(static_cast(opt.nr_thread)); 118 | 119 | GBDT gbdt(opt.nr_tree); 120 | gbdt.fit(Tr, Va); 121 | 122 | write(Tr, gbdt, opt.Tr_out_path); 123 | write(Va, gbdt, opt.Va_out_path); 124 | 125 | return EXIT_SUCCESS; 126 | } 127 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | Copyright 2014 Yu-Chin Juan, Wei-Sheng Chin, and Yong Zhuang. 2 | Copyright 2015 Xiaocong Zhou, Peng yan 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use 5 | this file except in compliance with the License. 
You may obtain a copy of the 6 | License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 7 | applicable law or agreed to in writing, software distributed under the License 8 | is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 9 | KIND, either express or implied. See the License for the specific language 10 | governing permissions and limitations under the License. -------------------------------------------------------------------------------- /script/addc.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import marshal 3 | 4 | f_train = "../train" 5 | f_test = "../test" 6 | o_train = "../train_c" 7 | o_test = "../test_c" 8 | 9 | 10 | def get_id(id,ip,device): 11 | if id != "i_a99f214a": 12 | return id 13 | else: 14 | return ip + "_" + device 15 | 16 | def run(input,output,isTest): 17 | f = open(input) 18 | o = open(output,"w") 19 | d = defaultdict(int) 20 | d2 = {} 21 | dh = defaultdict(int) 22 | line = f.readline() 23 | print >> o,line[:-2] + ",C22,C23,C24,C25,C26,C27,C28" 24 | count = 0 25 | day = "??" 26 | hour = "??" 27 | date_idx = 2 28 | if isTest: 29 | date_idx = 1 30 | while True: 31 | line = f.readline() 32 | if not line: 33 | break 34 | count += 1 35 | if count % 100000 == 0: 36 | print count 37 | lis = line.split(",") 38 | if lis[date_idx][4:6] != day: 39 | del d 40 | d = defaultdict(int) 41 | d2 = {} 42 | day = lis[date_idx][4:6] 43 | if lis[date_idx][6:] != hour: 44 | del dh 45 | dh = defaultdict(int) 46 | hour = lis[date_idx][6:] 47 | time = int(lis[date_idx][6:]) * 60 + int(int(lis[0][:5]) / 100000. 
* 60) 48 | id = get_id("i_"+lis[date_idx+9],"j_"+lis[date_idx+10],"k_"+lis[date_idx+11]) 49 | d[id + "_n_" + lis[date_idx+14]] += 1 50 | d[id + "_q_" + lis[date_idx+17]] += 1 51 | dh[id + "_n_" + lis[date_idx+14]] += 1 52 | dh[id + "_q_" + lis[date_idx+17]] += 1 53 | dh[id] += 1 54 | 55 | media_id = "f_"+lis[date_idx+6] 56 | if lis[date_idx+6] == "ecad2386": 57 | media_id = "c_"+lis[date_idx+3] 58 | d[id + "_" + media_id] += 1 59 | t = "-1" 60 | 61 | if id not in d2: 62 | d2[id] = time 63 | else: 64 | t = str(time-d2[id]) 65 | d2[id] = time 66 | 67 | m = d[id + "_" + media_id] 68 | c = d[id + "_n_" + lis[date_idx+14]] 69 | c2 = d[id + "_q_" + lis[date_idx+17]] 70 | ch = dh[id + "_n_" + lis[date_idx+14]] 71 | ch1 = dh[id + "_q_" + lis[date_idx+17]] 72 | ch2 = dh[id] 73 | print >> o,line[:-2] + "," + id + "," + str(m) + "," + str(ch1) + "," + str(ch2) + "," + str(c) + "," + str(c2) + "," + t 74 | f.close() 75 | o.close() 76 | 77 | run(f_train,o_train,False) 78 | run(f_test,o_test,True) 79 | -------------------------------------------------------------------------------- /script/append.py: -------------------------------------------------------------------------------- 1 | f1 = open("../train_pre_1") 2 | f2 = open("../test_pre_1") 3 | out1 = open("../train_pre_1b","w") 4 | out2 = open("../test_pre_1b","w") 5 | t = open("../train_gbdt_out") 6 | v = open("../test_gbdt_out") 7 | add = [] 8 | for i in xrange(30,49): 9 | add.append("C" + str(i)) 10 | 11 | line = f1.readline() 12 | print >> out1, line[:-1] + "," + ",".join(add) 13 | line = f2.readline() 14 | print >> out2, line[:-1] + "," + ",".join(add) 15 | for i in xrange(40428967): 16 | line = f1.readline()[:-1] 17 | a = t.readline()[:-1] 18 | ll = a.split(" ")[1:] 19 | for j in xrange(19): 20 | line += "," + add[j] + "_" + ll[j] 21 | print >> out1,line 22 | for i in xrange(4577464): 23 | line = f2.readline()[:-1] 24 | a = v.readline()[:-1] 25 | ll = a.split(" ")[1:] 26 | for j in xrange(19): 27 | line += "," + add[j] + 
"_" + ll[j] 28 | print >> out2,line 29 | 30 | f1.close() 31 | f2.close() 32 | out1.close() 33 | out2.close() 34 | t.close() 35 | v.close() 36 | -------------------------------------------------------------------------------- /script/append_gbdt.py: -------------------------------------------------------------------------------- 1 | 2 | def get_feature_num(train,max_num): 3 | f = open(train) 4 | num = max_num 5 | while True: 6 | line = f.readline() 7 | if not line: 8 | break 9 | ss = line.split(" ") 10 | for i in xrange(1,len(ss)): 11 | if num < int(ss[i]): 12 | num = int(ss[i]) 13 | f.close() 14 | return num 15 | 16 | def append(input,gbdt,output,num): 17 | f1 = open(input) 18 | f2 = open(gbdt) 19 | fo = open(output,"w") 20 | while True: 21 | line1 = f1.readline().strip() 22 | line2 = f2.readline().strip() 23 | if not line1: 24 | break 25 | gbdt_fea = [] 26 | ss = line2.split(" ") 27 | for i in xrange(1,len(ss)): 28 | fea = str(i) + "_" + ss[i] 29 | idx = d.get(fea) 30 | if idx == None: 31 | d[fea] = num + 1 + len(d) 32 | gbdt_fea.append(str(d[fea])) 33 | print >> fo, line1+" "+" ".join(gbdt_fea) 34 | f1.close() 35 | f2.close() 36 | fo.close() 37 | 38 | num = -1 39 | 40 | input_train = "../fm_train_1" 41 | input_test = "../fm_test_1" 42 | 43 | gbdt_train = "../train_gbdt_out" 44 | gbdt_test = "../test_gbdt_out" 45 | 46 | output_train = "../fm_train_2" 47 | output_test = "../fm_test_2" 48 | num = get_feature_num(input_train,num) 49 | num = get_feature_num(input_test,num) 50 | 51 | d = {} 52 | 53 | append(input_train,gbdt_train,output_train,num) 54 | append(input_test,gbdt_test,output_test,num) 55 | -------------------------------------------------------------------------------- /script/append_gbdt_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from datetime import datetime 3 | # load name data 4 | def load_name_sample(input,gbdt,isTest): 5 | f = open(input) 6 | f_g = open(gbdt) 7 | y1 = [] 8 | x1 
= [] 9 | y2 = [] 10 | x2 = [] 11 | lable1 = [] 12 | label2 = [] 13 | max_feature = 0 14 | field_cnt = 0 15 | line = f.readline() 16 | index = 3 17 | if isTest == True: 18 | index = 2 19 | cnt = 0 20 | 21 | label = -1 22 | id = "??" 23 | 24 | d = {} 25 | while True: 26 | isApp = False 27 | line = f.readline() 28 | l_g = f_g.readline() 29 | if line.strip()=="" : 30 | break 31 | fields = line.split(',') 32 | id = fields[0] 33 | if isTest == False: 34 | label = int(fields[1]) 35 | if label == 0: 36 | label = -1 37 | 38 | cur_x = [] 39 | 40 | if fields[index+2] == "c_85f751fd": 41 | isApp = True 42 | d = d_app 43 | else: 44 | d = d_site 45 | 46 | for i in xrange(index,len(fields)): 47 | if i == len(fields)-2: 48 | ss = fields[i].split('_') 49 | if int(ss[1])>=50: 50 | fields[i]=ss[0]+"_50" 51 | elif i == len(fields)-5: 52 | ss = fields[i].split('_') 53 | if int(ss[1])>=20: 54 | fields[i]=ss[0]+"_20" 55 | elif i > len(fields)-8 and i < len(fields)-1: 56 | ss = fields[i].split('_') 57 | if int(ss[1])>=10: 58 | fields[i]=ss[0]+"_10" 59 | 60 | if isApp == True: 61 | if fields[i][0] == "d" or fields[i][0] == "e" or fields[i][0] == "c": 62 | continue 63 | else: 64 | if fields[i][0] == "g" or fields[i][0] == "h" or fields[i][0] == "f": 65 | continue 66 | 67 | idx = d.get(fields[i]) 68 | if idx == None: 69 | cur_x.append(len(d)) 70 | d[fields[i]] = len(d) 71 | else: 72 | cur_x.append(idx) 73 | 74 | g_feas = l_g.split(" ") 75 | for k in xrange(1,len(g_feas)): 76 | g_fea = "gbdt_" + str(k) + "_" + g_feas[k] 77 | idx = d.get(g_fea) 78 | if idx == None: 79 | cur_x.append(len(d)) 80 | d[g_fea] = len(d) 81 | else: 82 | cur_x.append(idx) 83 | cur_str_x = [str(x) for x in cur_x] 84 | if isApp == False: 85 | if isTest == True: 86 | print >> fm_test_1,str(label)+" "+" ".join(cur_str_x) 87 | else: 88 | print >> fm_train_1,str(label)+" "+" ".join(cur_str_x) 89 | else: 90 | if isTest == True: 91 | print >> fm_test_2,str(label)+" "+" ".join(cur_str_x) 92 | else: 93 | print >> 
fm_train_2,str(label)+" "+" ".join(cur_str_x) 94 | cnt = cnt + 1 95 | if cnt % 100000 == 0: 96 | print cnt 97 | f.close() 98 | f_g.close() 99 | 100 | starttime = datetime.now() 101 | 102 | d_app = {} 103 | d_site = {} 104 | 105 | fm_train_1 = open("../fm_train_2_1","w") 106 | fm_test_1 = open("../fm_test_2_1","w") 107 | fm_train_2 = open("../fm_train_2_2","w") 108 | fm_test_2 = open("../fm_test_2_2","w") 109 | 110 | load_name_sample('../train_pre','../train_gbdt_out',False) 111 | load_name_sample('../test_pre','../test_gbdt_out',True) 112 | 113 | fm_train_1.close() 114 | fm_test_1.close() 115 | fm_train_2.close() 116 | fm_test_2.close() 117 | 118 | endtime = datetime.now() 119 | 120 | print (endtime-starttime).seconds -------------------------------------------------------------------------------- /script/calibrate.py: -------------------------------------------------------------------------------- 1 | import math 2 | from pprint import pprint 3 | 4 | train_ctr = 0.162 5 | 6 | def get_pred_ctr(input_file): 7 | f = open(input_file) 8 | line = f.readline() 9 | cur_obj_idx = 0 10 | obj_ctr = 0 11 | obj_cnt = 0 12 | while True: 13 | line = f.readline() 14 | if line.strip()=='': 15 | break 16 | obj_cnt = obj_cnt + 1 17 | obj_ctr = obj_ctr + float(line.split(',')[1]) 18 | print obj_cnt 19 | print obj_ctr/obj_cnt 20 | pred_ctr = (obj_ctr/obj_cnt) 21 | f.close() 22 | return pred_ctr 23 | 24 | def inverse_logit(y): 25 | return math.log(y/(1-y)) 26 | 27 | def logit(x): 28 | return 1/(1+math.exp(-x)) 29 | 30 | def calibrate(input_file,output_file): 31 | # delta intercept 32 | pred_ctr = get_pred_ctr(input_file) 33 | intercept = (pred_ctr*(1-train_ctr)/train_ctr/(1-pred_ctr)) 34 | 35 | f1 = open(input_file) 36 | f2 = open(output_file,'w') 37 | line = f1.readline() 38 | f2.write(line) 39 | cur_obj_idx = 0 40 | cnt = 0 41 | new_ctr = 0.0 42 | while True: 43 | cnt += 1 44 | line = f1.readline() 45 | if line.strip()=='': 46 | break 47 | fields = line.split(',') 48 | cur_ctr = 
float(fields[1]) 49 | cal_ctr = logit(inverse_logit(cur_ctr)-math.log(intercept)) 50 | new_ctr += cal_ctr 51 | f2.write(fields[0]+","+str(cal_ctr)+"\n") 52 | f1.close() 53 | f2.close() 54 | print new_ctr/cnt 55 | 56 | -------------------------------------------------------------------------------- /script/calibrate.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infinitezxc/kaggle-avazu/93d719617567dd5d65fdaeb2cded48732c29d907/script/calibrate.pyc -------------------------------------------------------------------------------- /script/ensemble.py: -------------------------------------------------------------------------------- 1 | import math 2 | from pprint import pprint 3 | from calibrate import calibrate 4 | 5 | inputs = ["lr_c1_cal.txt","ftrl_c5_cal.txt"] 6 | label = "" 7 | output_file = "./ensemble1.txt" 8 | 9 | def cal_weights(weights): 10 | r = [] 11 | sum = 0.0 12 | for w in weights: 13 | sum += w 14 | for w in weights: 15 | r.append(w/sum) 16 | return r 17 | 18 | def get_pred_ctr(input): 19 | f = open(input) 20 | ctr = [] 21 | while True: 22 | line = f.readline().strip() 23 | if not line: 24 | break 25 | ctr.append(float(line)) 26 | f.close() 27 | return ctr 28 | 29 | def ensemble(weights,files,output): 30 | ctrs = [] 31 | weights = cal_weights(weights) 32 | f = open(output,"w") 33 | for file in files: 34 | ctr = get_pred_ctr(file) 35 | print "loading " + file 36 | ctrs.append(ctr) 37 | sample_num = len(ctrs[0]) 38 | 39 | for j in xrange(sample_num): 40 | cur_ctr = 0.0 41 | for k in xrange(len(ctrs)): 42 | cur_ctr += weights[k] * math.log(ctrs[k][j]/(1-ctrs[k][j])) 43 | cur_ctr = 1/(1+math.exp(-cur_ctr)) 44 | print >> f, str(cur_ctr) 45 | f.close() 46 | 47 | def sub(result,testfile,output): 48 | f = open(testfile) 49 | r = open(result) 50 | o = open(output,"w") 51 | l1 = f.readline() 52 | print >> o, "id,click" 53 | while True: 54 | l1 = f.readline() 55 | l2 = r.readline().strip() 56 | 
if not l1: 57 | break 58 | print >> o, l1.split(',')[0]+","+l2 59 | f.close() 60 | r.close() 61 | o.close() 62 | 63 | files = ["../ftrl_1","../ftrl_2","../fm_test_2.out","../fm_test_2_split"] 64 | weights = [1,1,1,1] 65 | ensemble(weights,files,"../ensemble") 66 | sub("../ensemble","../test","../ensemble_sub") 67 | calibrate("../ensemble_sub","../ensemble_cal") 68 | 69 | -------------------------------------------------------------------------------- /script/fcount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import marshal 3 | 4 | f = open("../train") 5 | f2 = open("../test") 6 | fc = open("../fc","w") 7 | 8 | d = {} 9 | count = 0 10 | line = f.readline() 11 | while True: 12 | line = f.readline() 13 | if not line: 14 | break 15 | count += 1 16 | if count % 100000 == 0: 17 | print count 18 | lis = line[:-2].split(",") 19 | for i in xrange(3,len(lis)): 20 | name = chr(ord('a') + i - 3) 21 | feat = name + "_" + lis[i] 22 | if feat in d: 23 | d[feat] += 1 24 | else: 25 | d[feat] = 1 26 | 27 | count = 0 28 | line = f2.readline() 29 | while True: 30 | line = f2.readline() 31 | if not line: 32 | break 33 | count += 1 34 | if count % 100000 == 0: 35 | print count 36 | lis = line[:-2].split(",") 37 | for i in xrange(2,len(lis)): 38 | name = chr(ord('a') + i - 2) 39 | feat = name + "_" + lis[i] 40 | if feat in d: 41 | d[feat] += 1 42 | else: 43 | d[feat] = 1 44 | 45 | s = [] 46 | dd = {} 47 | for x in d: 48 | if d[x] >= 10: 49 | s.append(x) 50 | marshal.dump(set(s),fc) 51 | -------------------------------------------------------------------------------- /script/ftrl_1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is 
allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from csv import DictReader 20 | from math import exp, log, sqrt 21 | from collections import defaultdict 22 | import marshal 23 | import random 24 | 25 | 26 | # TL; DR, the main training process starts on line: 250, 27 | # you may want to start reading the code from there 28 | 29 | 30 | ############################################################################## 31 | # parameters ################################################################# 32 | ############################################################################## 33 | 34 | # A, paths 35 | train = '../train_pre_1b' # path to training file 36 | test = '../test_pre_1b' # path to testing file 37 | submission = '../ftrl_1' # path of to be outputted submission file 38 | 39 | # B, model 40 | alpha = .05 # learning rate 41 | beta = 1. 
# smoothing parameter for adaptive learning rate 42 | L1 = .4 # L1 regularization, larger value means more regularized 43 | L2 = .1 # L2 regularization, larger value means more regularized 44 | # C, feature/hash trick 45 | D = 2 ** 26 # number of weights to use 46 | interaction = True # whether to enable poly2 feature interactions 47 | SUB = True 48 | 49 | # D, training/validation 50 | epoch = 3 # learn training data for N passes 51 | holdafter = 30 if SUB else 28 # data after date N (exclusive) are used as validation 52 | holdout = None # use every N training instance for holdout validation 53 | 54 | #interactions 55 | inter_s = "ac,af,bc,bf,bi,bk,ck,cm,cn,fk,fm,fn,wc,wf,tc,tf,ic,if,in,jc,jf,jq,xc,xf,yc,yf,zc,zf,1c,1f,2c,2f,3c,3f,4c,4f,xy,xz,x1,x2,x3,x4" 56 | 57 | inter = [] 58 | featdict = {"a":"C1","b":"banner_pos","c":"site_id","d":"site_domain","e":"site_category","f":"app_id","g":"app_domain","h":"app_category","i":"device_id","j":"ips","k":"device_model","l":"device_type","m":"device_conn_type","n":"C14","o":"C15","p":"C16","q":"C17","r":"C18","s":"C19","t":"C20","u":"C21","w":"ipcate","x":"C22","y":"C23","z":"C24","1":"C25","2":"C26","3":"C27","4":"C28"} 59 | for i in inter_s.split(","): 60 | inter.append((featdict[i[0]],featdict[i[1]])) 61 | for i in xrange(29,49): 62 | co = "C" + str(i) 63 | inter.append((co,"site_id")) 64 | inter.append((co,"app_id")) 65 | 66 | ipcate = marshal.load(open("../testcate")) 67 | 68 | def convt(s,t): 69 | s = s.split("_")[1] 70 | if int(s) <= 70: 71 | return t + "_" + s 72 | else: 73 | return t + "_" + "l" 74 | 75 | ############################################################################## 76 | # class, function, generator definitions ##################################### 77 | ############################################################################## 78 | 79 | class ftrl_proximal(object): 80 | ''' Our main algorithm: Follow the regularized leader - proximal 81 | 82 | In short, 83 | this is an adaptive-learning-rate 
sparse logistic-regression with 84 | efficient L1-L2-regularization 85 | 86 | Reference: 87 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 88 | ''' 89 | 90 | def __init__(self, alpha, beta, L1, L2, D, interaction): 91 | # parameters 92 | self.alpha = alpha 93 | self.beta = beta 94 | self.L1 = L1 95 | self.L2 = L2 96 | 97 | # feature related parameters 98 | self.D = D 99 | self.interaction = interaction 100 | 101 | # model 102 | # n: squared sum of past gradients 103 | # z: weights 104 | # w: lazy weights 105 | self.n = [0.] * D 106 | self.z = [0.] * D 107 | self.w = [0.] * D 108 | 109 | def _indices(self, x): 110 | ''' A helper generator that yields the indices in x 111 | 112 | The purpose of this generator is to make the following 113 | code a bit cleaner when doing feature interaction. 114 | ''' 115 | 116 | # first yield index of the bias term 117 | yield 0 118 | 119 | D = self.D 120 | # then yield the normal indices 121 | for feat in x: 122 | index = abs(hash(feat)) % D 123 | yield index 124 | 125 | def predict(self, x): 126 | ''' Get probability estimation on x 127 | 128 | INPUT: 129 | x: features 130 | 131 | OUTPUT: 132 | probability of p(y = 1 | x; w) 133 | ''' 134 | 135 | # model 136 | w = self.w 137 | 138 | # wTx is the inner product of w and x 139 | wTx = 0. 140 | for i in self._indices(x): 141 | wTx += w[i] 142 | 143 | # bounded sigmoid function, this is the probability estimation 144 | return 1. / (1. 
+ exp(-max(min(wTx, 35.), -35.))) 145 | 146 | def update(self, x, p, y): 147 | ''' Update model using x, p, y 148 | 149 | INPUT: 150 | x: feature, a list of indices 151 | p: click probability prediction of our model 152 | y: answer 153 | 154 | MODIFIES: 155 | self.n: increase by squared gradient 156 | self.z: weights 157 | ''' 158 | 159 | # parameter 160 | alpha = self.alpha 161 | 162 | # model 163 | n = self.n 164 | z = self.z 165 | w = self.w 166 | 167 | # gradient under logloss 168 | g = p - y 169 | 170 | # update z and n 171 | tmp = 0 172 | for i in self._indices(x): 173 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 174 | z[i] += g - sigma * w[i] 175 | n[i] += g * g 176 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 177 | # build w using z and n 178 | if sign * z[i] <= L1: 179 | # w[i] vanishes due to L1 regularization 180 | w[i] = 0. 181 | else: 182 | # apply prediction time L1, L2 regularization to z and get w 183 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 184 | 185 | def logloss(p, y): 186 | ''' FUNCTION: Bounded logloss 187 | 188 | INPUT: 189 | p: our prediction 190 | y: real answer 191 | 192 | OUTPUT: 193 | logarithmic loss of p given y 194 | ''' 195 | 196 | p = max(min(p, 1. - 10e-15), 10e-15) 197 | return -log(p) if y == 1. else -log(1. - p) 198 | 199 | 200 | def data(path, D): 201 | ''' GENERATOR: Apply hash-trick to the original csv row 202 | and for simplicity, we one-hot-encode everything 203 | 204 | INPUT: 205 | path: path to training or testing file 206 | D: the max index that we can hash to 207 | 208 | YIELDS: 209 | ID: id of the instance, mainly useless 210 | x: a list of hashed and one-hot-encoded 'indices' 211 | we only need the index since all values are either 0 or 1 212 | y: y = 1 if we have a click, else we have y = 0 213 | ''' 214 | 215 | for t, row in enumerate(DictReader(open(path))): 216 | # process id 217 | ID = row['id'] 218 | del row['id'] 219 | 220 | # process clicks 221 | y = 0. 
222 | if 'click' in row: 223 | if row['click'] == '1': 224 | y = 1. 225 | del row['click'] 226 | 227 | # turn hour really into hour, it was originally YYMMDDHH 228 | 229 | date = int(row["hour"][4:6]) 230 | row["C28"] = convt(row["C28"],"C28") 231 | del row["hour"] 232 | row["ips"] = row["device_ip"] 233 | if row["device_ip"][-3:] == "ips": 234 | row["device_ip"] = "ips" 235 | row["ips"] = row["ips"][:-3] 236 | row["ipcate"] = "ipcate_null" 237 | if row["ips"][2:] in ipcate: 238 | row["ipcate"] = "ipcate_" + str(ipcate[row["ips"][2:]]) 239 | # build x 240 | x = [] 241 | if interaction: 242 | for pair in inter: 243 | x.append(row[pair[0]] + "_" + row[pair[1]]) 244 | del row["ips"] 245 | for key in row: 246 | value = row[key] 247 | # one-hot encode everything with hash trick 248 | x.append(value) 249 | yield t, date, ID, x, y 250 | 251 | 252 | ############################################################################## 253 | # start training ############################################################# 254 | ############################################################################## 255 | 256 | start = datetime.now() 257 | 258 | # initialize ourselves a learner 259 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 260 | # start training 261 | for e in xrange(epoch): 262 | loss = 0. 
263 | count = 1 264 | localcount = 0 265 | learner.alpha = 0.05 - 0.01 * e 266 | for t, date, ID, x, y in data(train, D): # data is a generator 267 | # t: just a instance counter 268 | # date: you know what this is 269 | # ID: id provided in original data 270 | # x: features 271 | # y: label (click) 272 | 273 | # step 1, get prediction from learner 274 | p = learner.predict(x) 275 | #print progress 276 | localcount += 1 277 | if localcount % 1000000 == 0: 278 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 279 | print "valid: " + str(localcount) 280 | else: 281 | print "train: " + str(localcount) 282 | 283 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 284 | # step 2-1, calculate validation loss 285 | # we do not train with the validation data so that our 286 | # validation loss is an accurate estimation 287 | # 288 | # holdafter: train instances from day 1 to day N 289 | # validate with instances from day N + 1 and after 290 | # 291 | # holdout: validate with every N instance, train with others 292 | loss += logloss(p, y) 293 | count += 1 294 | else: 295 | # step 2-2, update learner with label (click) information 296 | learner.update(x, p, y) 297 | 298 | print('Epoch %d finished, validation logloss: %f, elapsed time: %s' % ( 299 | e, loss/count, str(datetime.now() - start))) 300 | print learner.alpha 301 | 302 | ############################################################################## 303 | # start testing, and build Kaggle's submission file ########################## 304 | ############################################################################## 305 | def sub(): 306 | with open(submission, 'w') as outfile: 307 | for t, date, ID, x, y in data(test, D): 308 | p = learner.predict(x) 309 | outfile.write('%f\n' % p) 310 | 311 | sub() -------------------------------------------------------------------------------- /script/ftrl_2.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from csv import DictReader 20 | from math import exp, log, sqrt 21 | import marshal 22 | 23 | 24 | # TL; DR, the main training process starts on line: 250, 25 | # you may want to start reading the code from there 26 | 27 | 28 | ############################################################################## 29 | # parameters ################################################################# 30 | ############################################################################## 31 | 32 | # A, paths 33 | train = '../train_pre_1b' # path to training file 34 | test = '../test_pre_1b' # path to testing file 35 | submission = '../ftrl_2' # path of to be outputted submission file 36 | 37 | # B, model 38 | alpha = .05 # learning rate 39 | beta = 1. 
# smoothing parameter for adaptive learning rate 40 | L1 = .4 # L1 regularization, larger value means more regularized 41 | L2 = .1 # L2 regularization, larger value means more regularized 42 | 43 | # C, feature/hash trick 44 | D = 2 ** 26 # number of weights to use 45 | interaction = True # whether to enable poly2 feature interactions 46 | SUB = True 47 | 48 | # D, training/validation 49 | epoch = 3 # learn training data for N passes 50 | holdafter = 30 if SUB else 28 # data after date N (exclusive) are used as validation 51 | holdout = None # use every N training instance for holdout validation 52 | 53 | #interactions 54 | inter_s = "ac,bc,bi,bk,ci,ck,cm,cn,in,jc,cw,tc,xc,yc,zc,1c,2c,3c,4c,x1,x2,x3,x4,xy,xz" #site 55 | inter = [] 56 | inter_s2 = "af,bf,bi,bk,fi,fk,fm,fn,in,jf,fw,tf,xf,yf,zf,1f,2f,3f,4f,x1,x2,x3,x4,xy,xz" #app 57 | inter2 = [] 58 | featdict = {"a":"C1","b":"banner_pos","c":"site_id","d":"site_domain","e":"site_category","f":"app_id","g":"app_domain","h":"app_category","i":"device_id","j":"ips","k":"device_model","l":"device_type","m":"device_conn_type","n":"C14","o":"C15","p":"C16","q":"C17","r":"C18","s":"C19","t":"C20","u":"C21","w":"ipcate","x":"C22","y":"C23","z":"C24","1":"C25","2":"C26","3":"C27","4":"C28"} 59 | for i in inter_s.split(","): 60 | inter.append((featdict[i[0]],featdict[i[1]])) 61 | for i in inter_s2.split(","): 62 | inter2.append((featdict[i[0]],featdict[i[1]])) 63 | for i in xrange(29,49): 64 | co = "C" + str(i) 65 | inter.append((co,"site_id")) 66 | inter2.append((co,"app_id")) 67 | ipcate = marshal.load(open("../testcate")) 68 | 69 | def convt(s,t): 70 | s = s.split("_")[1] 71 | if int(s) <= 70: 72 | return t + "_" + s 73 | else: 74 | return t + "_" + "l" 75 | 76 | ############################################################################## 77 | # class, function, generator definitions ##################################### 78 | ############################################################################## 79 | 80 | class 
ftrl_proximal(object): 81 | ''' Our main algorithm: Follow the regularized leader - proximal 82 | 83 | In short, 84 | this is an adaptive-learning-rate sparse logistic-regression with 85 | efficient L1-L2-regularization 86 | 87 | Reference: 88 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 89 | ''' 90 | 91 | def __init__(self, alpha, beta, L1, L2, D, interaction): 92 | # parameters 93 | self.alpha = alpha 94 | self.beta = beta 95 | self.L1 = L1 96 | self.L2 = L2 97 | 98 | # feature related parameters 99 | self.D = D 100 | self.interaction = interaction 101 | 102 | # model 103 | # n: squared sum of past gradients 104 | # z: weights 105 | # w: lazy weights 106 | self.n = [0.] * D 107 | self.z = [0.] * D 108 | self.w = [0.] * D 109 | 110 | def _indices(self, x): 111 | ''' A helper generator that yields the indices in x 112 | 113 | The purpose of this generator is to make the following 114 | code a bit cleaner when doing feature interaction. 115 | ''' 116 | 117 | # first yield index of the bias term 118 | yield 0 119 | 120 | D = self.D 121 | # then yield the normal indices 122 | for feat in x: 123 | index = abs(hash(feat)) % D 124 | yield index 125 | 126 | def predict(self, x): 127 | ''' Get probability estimation on x 128 | 129 | INPUT: 130 | x: features 131 | 132 | OUTPUT: 133 | probability of p(y = 1 | x; w) 134 | ''' 135 | 136 | # model 137 | w = self.w 138 | 139 | # wTx is the inner product of w and x 140 | wTx = 0. 141 | for i in self._indices(x): 142 | wTx += w[i] 143 | 144 | # bounded sigmoid function, this is the probability estimation 145 | return 1. / (1. 
+ exp(-max(min(wTx, 35.), -35.))) 146 | 147 | def update(self, x, p, y): 148 | ''' Update model using x, p, y 149 | 150 | INPUT: 151 | x: feature, a list of indices 152 | p: click probability prediction of our model 153 | y: answer 154 | 155 | MODIFIES: 156 | self.n: increase by squared gradient 157 | self.z: weights 158 | ''' 159 | 160 | # parameter 161 | alpha = self.alpha 162 | 163 | # model 164 | n = self.n 165 | z = self.z 166 | w = self.w 167 | 168 | # gradient under logloss 169 | g = p - y 170 | 171 | # update z and n 172 | tmp = 0 173 | for i in self._indices(x): 174 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 175 | z[i] += g - sigma * w[i] 176 | n[i] += g * g 177 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 178 | # build w using z and n 179 | if sign * z[i] <= L1: 180 | # w[i] vanishes due to L1 regularization 181 | w[i] = 0. 182 | else: 183 | # apply prediction time L1, L2 regularization to z and get w 184 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 185 | 186 | def logloss(p, y): 187 | ''' FUNCTION: Bounded logloss 188 | 189 | INPUT: 190 | p: our prediction 191 | y: real answer 192 | 193 | OUTPUT: 194 | logarithmic loss of p given y 195 | ''' 196 | 197 | p = max(min(p, 1. - 10e-15), 10e-15) 198 | return -log(p) if y == 1. else -log(1. - p) 199 | 200 | 201 | def data(path, D): 202 | ''' GENERATOR: Robotly hash-trick to the original csv row 203 | and for simplicity, we one-hot-encode everything 204 | 205 | INPUT: 206 | path: path to training or testing file 207 | D: the max index that we can hash to 208 | 209 | YIELDS: 210 | ID: id of the instance, mainly useless 211 | x: a list of hashed and one-hot-encoded 'indices' 212 | we only need the index since all values are either 0 or 1 213 | y: y = 1 if we have a click, else we have y = 0 214 | ''' 215 | 216 | for t, row in enumerate(DictReader(open(path))): 217 | # process id 218 | ID = row['id'] 219 | del row['id'] 220 | 221 | # process clicks 222 | y = 0. 
223 | if 'click' in row: 224 | if row['click'] == '1': 225 | y = 1. 226 | del row['click'] 227 | 228 | # extract date 229 | date = int(row['hour'][4:6]) 230 | row["C28"] = convt(row["C28"],"C28") 231 | del row["hour"] 232 | row["ips"] = row["device_ip"] 233 | if row["device_ip"][-3:] == "ips": 234 | row["device_ip"] = "ips" 235 | row["ips"] = row["ips"][:-3] 236 | row["ipcate"] = "ipcate_null" 237 | if row["ips"][2:] in ipcate: 238 | row["ipcate"] = "ipcate_" + str(ipcate[row["ips"][2:]]) 239 | # turn hour really into hour, it was originally YYMMDDHH 240 | isApp = row["site_id"] == "c_85f751fd" 241 | if isApp: 242 | del row["site_id"] 243 | del row["site_domain"] 244 | del row["site_category"] 245 | else: 246 | del row["app_id"] 247 | del row["app_domain"] 248 | del row["app_category"] 249 | # build x 250 | x = [] 251 | if interaction: 252 | if not isApp: 253 | for pair in inter: 254 | x.append(row[pair[0]] + "_" + row[pair[1]]) 255 | else: 256 | for pair in inter2: 257 | x.append(row[pair[0]] + "_" + row[pair[1]]) 258 | del row["ips"] 259 | for key in row: 260 | value = row[key] 261 | # one-hot encode everything with hash trick 262 | x.append(value) 263 | 264 | yield t, date, ID, x, y, isApp 265 | 266 | 267 | ############################################################################## 268 | # start training ############################################################# 269 | ############################################################################## 270 | 271 | start = datetime.now() 272 | 273 | # initialize ourselves a learner 274 | learner1 = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 275 | learner2 = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 276 | 277 | # start training 278 | for e in xrange(epoch): 279 | loss1 = 0. 280 | loss2 = 0. 281 | count1 = 1 282 | count2 = 1 283 | loss = 0. 
284 | count = 1 285 | localcount = 0 286 | learner1.alpha = 0.05 - 0.01 * e 287 | learner2.alpha = 0.05 - 0.01 * e 288 | for t, date, ID, x, y, isApp in data(train, D): # data is a generator 289 | # t: just a instance counter 290 | # date: you know what this is 291 | # ID: id provided in original data 292 | # x: features 293 | # y: label (click) 294 | 295 | # step 1, get prediction from learner 296 | if not isApp: 297 | p = learner1.predict(x) 298 | else: 299 | p = learner2.predict(x) 300 | 301 | #print progress 302 | localcount += 1 303 | if localcount % 1000000 == 0: 304 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 305 | print "valid: " + str(localcount) 306 | else: 307 | print "train: " + str(localcount) 308 | 309 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 310 | # step 2-1, calculate validation loss 311 | # we do not train with the validation data so that our 312 | # validation loss is an accurate estimation 313 | # 314 | # holdafter: train instances from day 1 to day N 315 | # validate with instances from day N + 1 and after 316 | # 317 | # holdout: validate with every N instance, train with others 318 | loss += logloss(p,y) 319 | count += 1 320 | if isApp: 321 | loss1 += logloss(p, y) 322 | count1 += 1 323 | else: 324 | loss2 += logloss(p, y) 325 | count2 += 1 326 | else: 327 | # step 2-2, update learner with label (click) information 328 | if not isApp: 329 | learner1.update(x, p, y) 330 | else: 331 | learner2.update(x, p, y) 332 | if not SUB: 333 | print('Epoch %d finished, validation logloss: %f, logloss1: %f, logloss2: %f, elapsed time: %s' % (e, loss/count, loss1/count1, loss2/count2, str(datetime.now() - start))) 334 | 335 | ############################################################################## 336 | # start testing, and build Kaggle's submission file ########################## 337 | ############################################################################## 338 | def sub(): 339 | with 
open(submission, 'w') as outfile: 340 | for t, date, ID, x, y, isApp in data(test, D): 341 | if not isApp: 342 | p = learner1.predict(x) 343 | else: 344 | p = learner2.predict(x) 345 | outfile.write('%f\n' % p) 346 | 347 | sub() -------------------------------------------------------------------------------- /script/gbdt_dense.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # generate dense feature for gbdt 4 | from datetime import datetime 5 | import marshal 6 | 7 | id_stat = marshal.load(open("../id_stat")) 8 | 9 | # load name data 10 | def load_name_sample(input,isTest): 11 | f = open(input) 12 | y = [] 13 | x = [] 14 | line = f.readline() 15 | index = 3 16 | if isTest == True: 17 | index = 2 18 | cnt = 0 19 | isValid = False 20 | 21 | while True: 22 | line = f.readline().strip() 23 | if not line : 24 | break 25 | fields = line.split(',') 26 | if isTest == False: 27 | label = int(fields[1]) 28 | if label == 0: 29 | label = -1 30 | y.append(label) 31 | if isValid==False: 32 | if int(fields[2][4:6]) > 28: 33 | isValid = True 34 | else: 35 | y.append(-1) 36 | 37 | cur_x = [] 38 | for i in xrange(index,len(fields)): 39 | if i == len(fields)-19: 40 | cur_x.append(id_stat["j_"+fields[i]]) 41 | #continue 42 | elif i == len(fields)-20: 43 | #cur_x.append(gbdt_id["i_"+fields[i]]) 44 | continue 45 | elif i == len(fields)-7: 46 | cur_x.append(id_stat["v_"+fields[i]]) 47 | elif i > len(fields)-7: 48 | cur_x.append(int(fields[i])) 49 | 50 | cur_str_x = [str(x) for x in cur_x] 51 | if isTest == True: 52 | print >> gbdt_test,str(y[cnt])+" "+" ".join(cur_str_x) 53 | else: 54 | print >> gbdt_train,str(y[cnt])+" "+" ".join(cur_str_x) 55 | cnt = cnt + 1 56 | if cnt % 1000000 == 0: 57 | print cnt 58 | 59 | starttime = datetime.now() 60 | 61 | d = {} 62 | 63 | gbdt_train = open("../train_dense","w") 64 | gbdt_test = open("../test_dense","w") 65 | 66 | load_name_sample('../train_c',False) 67 | 
load_name_sample('../test_c',True) 68 | 69 | gbdt_train.close() 70 | gbdt_test.close() 71 | 72 | #learner = field_fm(k,l,t,alpha,beta,max_feature,field_cnt) 73 | endtime = datetime.now() 74 | print (endtime-starttime).seconds -------------------------------------------------------------------------------- /script/genDict.py: -------------------------------------------------------------------------------- 1 | import marshal 2 | 3 | d = {} 4 | dc = {} 5 | l = [] 6 | fset = marshal.load(open("../fc")) 7 | site_null = "85f751fd" 8 | app_null = "ecad2386" 9 | 10 | f1 = open("../train_c") 11 | f2 = open("../test_c") 12 | line = f1.readline() 13 | line = f2.readline() 14 | count = 0 15 | while True: 16 | line = f1.readline() 17 | if not line: 18 | break 19 | count += 1 20 | if count % 100000 == 0: 21 | print count 22 | lis = line[:-1].split(",") 23 | date = int(lis[2][4:6]) 24 | ip = lis[12] 25 | if "j_" + ip not in fset: 26 | continue 27 | w = lis[7] 28 | if lis[5] == site_null: 29 | w = lis[10] 30 | if w not in dc: 31 | dc[w] = set() 32 | dc[w].add(ip) 33 | else: 34 | dc[w].add(ip) 35 | if w not in d: 36 | l.append(w) 37 | d[w] = len(l) - 1 38 | 39 | count = 0 40 | while True: 41 | line = f2.readline() 42 | if not line: 43 | break 44 | count += 1 45 | if count % 100000 == 0: 46 | print count 47 | lis = line[:-1].split(",") 48 | ip = lis[11] 49 | if "j_" + ip not in fset: 50 | continue 51 | w = lis[6] 52 | if lis[4] == site_null: 53 | w = lis[9] 54 | if w not in dc: 55 | dc[w] = set() 56 | dc[w].add(ip) 57 | else: 58 | dc[w].add(ip) 59 | if w not in d: 60 | l.append(w) 61 | d[w] = len(l) - 1 62 | print len(d) 63 | for k in dc: 64 | dc[k] = len(dc[k]) 65 | marshal.dump([d,dc],open("../ip_dict","w")) -------------------------------------------------------------------------------- /script/genM.py: -------------------------------------------------------------------------------- 1 | import marshal 2 | from math import log 3 | 4 | site_null = "85f751fd" 5 | app_null = 
"ecad2386" 6 | 7 | [dic,dc] = marshal.load(open("../ip_dict")) 8 | l = [""] * len(dic) 9 | for x in dic: 10 | l[dic[x]] = x 11 | for i in xrange(len(dc)): 12 | print dc[l[i]] 13 | d = {} 14 | fset = marshal.load(open("../fc")) 15 | 16 | f1 = open("../train_c") 17 | f2 = open("../test_c") 18 | line = f1.readline() 19 | line = f2.readline() 20 | count = 0 21 | while True: 22 | line = f1.readline() 23 | if not line: 24 | break 25 | count += 1 26 | if count % 100000 == 0: 27 | print count 28 | lis = line[:-1].split(",") 29 | date = int(lis[2][4:6]) 30 | ip = lis[12] 31 | if "j_" + ip not in fset: 32 | continue 33 | w = lis[7] 34 | if lis[5] == site_null: 35 | w = lis[10] 36 | if ip not in d: 37 | d[ip] = [0.] * len(dic) 38 | d[ip][dic[w]] += 1 39 | 40 | count = 0 41 | while True: 42 | line = f2.readline() 43 | if not line: 44 | break 45 | count += 1 46 | if count % 100000 == 0: 47 | print count 48 | lis = line[:-1].split(",") 49 | ip = lis[11] 50 | if "j_" + ip not in fset: 51 | continue 52 | w = lis[6] 53 | if lis[4] == site_null: 54 | w = lis[9] 55 | if ip not in d: 56 | d[ip] = [0.] 
* len(dic) 57 | d[ip][dic[w]] += 1 58 | ll = float(len(d)) 59 | for k in d: 60 | s = reduce(lambda x,y:x + y,d[k]) 61 | for i in xrange(len(d[k])): 62 | d[k][i] = d[k][i] / s * log(ll / dc[l[i]]) 63 | 64 | marshal.dump(d,open("../ip_mat","w")) -------------------------------------------------------------------------------- /script/id_day.py: -------------------------------------------------------------------------------- 1 | import marshal 2 | 3 | def stat(input,isTest): 4 | f = open(input) 5 | line = f.readline() 6 | count = 0 7 | while True: 8 | line = f.readline() 9 | if not line: 10 | break 11 | count += 1 12 | if count % 100000 == 0: 13 | print count 14 | lis = line.split(",") 15 | # ip 16 | index = 12 17 | if isTest: 18 | index = 11 19 | ip = "j_" + lis[index] 20 | if ip in d_id: 21 | d_id[ip].add(lis[2][4:6]) 22 | else: 23 | s = set() 24 | s.add(lis[2][4:6]) 25 | d_id[ip] = s 26 | # identical id 27 | index = len(lis)-7 28 | iid = "v_"+lis[index] 29 | if iid in d_id: 30 | d_id[iid].add(lis[2][4:6]) 31 | else: 32 | s = set() 33 | s.add(lis[2][4:6]) 34 | d_id[iid] = s 35 | f.close() 36 | 37 | d_id = {} 38 | d_set = {} 39 | 40 | stat("../train_c",False) 41 | stat("../test_c",True) 42 | 43 | for k in d_id: 44 | d_set[k] = len(d_id[k]) 45 | 46 | marshal.dump(d_set,open("../id_day","w")) 47 | -------------------------------------------------------------------------------- /script/id_stat.py: -------------------------------------------------------------------------------- 1 | import marshal 2 | 3 | def stat(input): 4 | f = open(input) 5 | line = f.readline() 6 | count = 0 7 | while True: 8 | line = f.readline() 9 | if not line: 10 | break 11 | count += 1 12 | if count % 100000 == 0: 13 | print count 14 | lis = line.split(",") 15 | # identical id 16 | index = len(lis)-7 17 | iid = "v_"+lis[index] 18 | if iid in d_id: 19 | d_id[iid] += 1 20 | else: 21 | d_id[iid] = 1 22 | # ip 23 | index = len(lis)-19 24 | ip = "j_"+lis[index] 25 | if ip in d_id: 26 | d_id[ip] += 1 27 
| else: 28 | d_id[ip] = 1 29 | # id 30 | index = len(lis)-20 31 | id = "i_"+lis[index] 32 | if id in d_id: 33 | d_id[id] += 1 34 | else: 35 | d_id[id] = 1 36 | f.close() 37 | 38 | d_id = {} 39 | 40 | stat("../train_c") 41 | stat("../test_c") 42 | 43 | 44 | marshal.dump(d_id,open("../id_stat","w")) -------------------------------------------------------------------------------- /script/index1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from datetime import datetime 4 | # load name data 5 | def load_name_sample(input,isTest): 6 | f = open(input) 7 | y = [] 8 | x = [] 9 | max_feature = 0 10 | field_cnt = 0 11 | line = f.readline() 12 | index = 3 13 | if isTest == True: 14 | index = 2 15 | cnt = 0 16 | 17 | while True: 18 | line = f.readline() 19 | if line.strip()=="" : 20 | break 21 | fields = line.split(',') 22 | if isTest == False: 23 | label = int(fields[1]) 24 | if label == 0: 25 | label = -1 26 | y.append(label) 27 | else: 28 | y.append(-1) 29 | 30 | cur_x = [] 31 | for i in xrange(index,len(fields)): 32 | if i == len(fields)-2: 33 | ss = fields[i].split('_') 34 | if int(ss[1])>=50: 35 | fields[i]=ss[0]+"_50" 36 | elif i == len(fields)-5: 37 | ss = fields[i].split('_') 38 | if int(ss[1])>=20: 39 | fields[i]=ss[0]+"_20" 40 | elif i > len(fields)-8 and i < len(fields)-1: 41 | ss = fields[i].split('_') 42 | if int(ss[1])>=10: 43 | fields[i]=ss[0]+"_10" 44 | idx = d.get(fields[i]) 45 | if idx == None: 46 | cur_x.append(len(d)) 47 | d[fields[i]] = len(d) 48 | else: 49 | cur_x.append(idx) 50 | cur_str_x = [str(x) for x in cur_x] 51 | if isTest == True: 52 | print >> fm_test,str(y[cnt])+" "+" ".join(cur_str_x) 53 | else: 54 | print >> fm_train,str(y[cnt])+" "+" ".join(cur_str_x) 55 | cnt = cnt + 1 56 | if cnt % 1000000 == 0: 57 | print cnt 58 | 59 | starttime = datetime.now() 60 | 61 | d = {} 62 | 63 | fm_train = open("../fm_train_1","w") 64 | fm_test = open("../fm_test_1","w") 65 | 66 | 
load_name_sample('../train_pre',False) 67 | load_name_sample('../test_pre',True) 68 | 69 | fm_train.close() 70 | fm_test.close() 71 | 72 | #learner = field_fm(k,l,t,alpha,beta,max_feature,field_cnt) 73 | endtime = datetime.now() 74 | print (endtime-starttime).seconds -------------------------------------------------------------------------------- /script/index2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from datetime import datetime 3 | 4 | # load name data 5 | def load_name_sample(input,isTest): 6 | f = open(input) 7 | y1 = [] 8 | x1 = [] 9 | y2 = [] 10 | x2 = [] 11 | lable1 = [] 12 | label2 = [] 13 | max_feature = 0 14 | field_cnt = 0 15 | line = f.readline() 16 | index = 3 17 | if isTest == True: 18 | index = 2 19 | cnt = 0 20 | 21 | label = -1 22 | id = "??" 23 | 24 | d = {} 25 | while True: 26 | isApp = False 27 | line = f.readline() 28 | if line.strip()=="" : 29 | break 30 | fields = line.split(',') 31 | id = fields[0] 32 | if isTest == False: 33 | label = int(fields[1]) 34 | if label == 0: 35 | label = -1 36 | cur_x = [] 37 | 38 | if fields[index+2] == "c_85f751fd": 39 | isApp = True 40 | d = d_app 41 | else: 42 | d = d_site 43 | 44 | for i in xrange(index,len(fields)): 45 | if i == len(fields)-2: 46 | ss = fields[i].split('_') 47 | if int(ss[1])>=50: 48 | fields[i]=ss[0]+"_50" 49 | elif i == len(fields)-5: 50 | ss = fields[i].split('_') 51 | if int(ss[1])>=20: 52 | fields[i]=ss[0]+"_20" 53 | elif i > len(fields)-8 and i < len(fields)-1: 54 | ss = fields[i].split('_') 55 | if int(ss[1])>=10: 56 | fields[i]=ss[0]+"_10" 57 | 58 | if isApp == True: 59 | if fields[i][0] == "d" or fields[i][0] == "e" or fields[i][0] == "c": 60 | continue 61 | else: 62 | if fields[i][0] == "g" or fields[i][0] == "h" or fields[i][0] == "f": 63 | continue 64 | 65 | idx = d.get(fields[i]) 66 | if idx == None: 67 | cur_x.append(len(d)) 68 | d[fields[i]] = len(d) 69 | else: 70 | cur_x.append(idx) 71 | cur_str_x = 
[str(x) for x in cur_x] 72 | if isApp == False: 73 | if isTest == True: 74 | print >> fm_test_1,str(label)+" "+" ".join(cur_str_x) 75 | else: 76 | print >> fm_train_1,str(label)+" "+" ".join(cur_str_x) 77 | else: 78 | if isTest == True: 79 | print >> fm_test_2,str(label)+" "+" ".join(cur_str_x) 80 | else: 81 | print >> fm_train_2,str(label)+" "+" ".join(cur_str_x) 82 | cnt = cnt + 1 83 | if cnt % 100000 == 0: 84 | print cnt 85 | 86 | 87 | starttime = datetime.now() 88 | 89 | d_app = {} 90 | d_site = {} 91 | 92 | fm_train_1 = open("../fm_train_1_1","w") 93 | fm_test_1 = open("../fm_test_1_1","w") 94 | fm_train_2 = open("../fm_train_1_2","w") 95 | fm_test_2 = open("../fm_test_1_2","w") 96 | 97 | load_name_sample('../train_pre',False) 98 | load_name_sample('../test_pre',True) 99 | 100 | fm_train_1.close() 101 | fm_test_1.close() 102 | fm_train_2.close() 103 | fm_test_2.close() 104 | 105 | endtime = datetime.now() 106 | 107 | print (endtime-starttime).seconds -------------------------------------------------------------------------------- /script/lsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.decomposition import TruncatedSVD 3 | import marshal 4 | 5 | d = marshal.load(open("../ip_mat")) 6 | 7 | X = [] 8 | l = [] 9 | for x in d: 10 | l.append(x) 11 | X.append(d[x]) 12 | X = np.array(X) 13 | 14 | svd = TruncatedSVD(n_components = 16, random_state=42) 15 | X = svd.fit_transform(X) 16 | 17 | print X.shape 18 | 19 | for i in xrange(len(l)): 20 | d[l[i]] = int(np.argmax(X[i])) 21 | marshal.dump(d,open("../testcate","w")) -------------------------------------------------------------------------------- /script/prep.py: -------------------------------------------------------------------------------- 1 | import marshal 2 | 3 | ftr = "../train_c" 4 | fte = "../test_c" 5 | fset = marshal.load(open("../fc")) 6 | rare_d = marshal.load(open("../rare_d")) 7 | ftrain = "../train_pre" 8 | ftest = 
"../test_pre" 9 | 10 | id_day = marshal.load(open("../id_day")) 11 | 12 | def prep(input,output,isTest): 13 | f = open(input) 14 | out = open(output,"w") 15 | line = f.readline() 16 | print >> out,line[:-1] 17 | count = 0 18 | bias = 3 19 | if isTest: 20 | bias = 2 21 | while True: 22 | line = f.readline() 23 | if not line: 24 | break 25 | count += 1 26 | if count % 100000 == 0: 27 | print count 28 | lis = line[:-1].split(",") 29 | uid = "??" 30 | for i in xrange(bias,len(lis)): 31 | name = chr(ord('a') + i - bias) 32 | if name == "j": 33 | ip = name + "_" + lis[i] 34 | rare = rare_d.get(ip) 35 | if rare != None: 36 | lis[i] = "j_rare_" + str(rare) 37 | #print lis[i] 38 | continue 39 | if name == "i": 40 | id = name + "_" + lis[i] 41 | rare = rare_d.get(id) 42 | if rare != None: 43 | lis[i] = "i_rare_" + str(rare) 44 | #print lis[i] 45 | continue 46 | if name == "v": 47 | id = name + "_" + lis[i] 48 | uid = id 49 | rare = rare_d.get(id) 50 | if rare != None: 51 | lis[i] = "v_rare_" + str(rare) 52 | continue 53 | elif id_day.get(id) == 1: 54 | lis[i] = "v_id_s" 55 | continue 56 | if name + "_" + lis[i] not in fset and i < len(lis) - 6: 57 | lis[i] = name + "_rare" 58 | else: 59 | lis[i] = name + "_" + lis[i] 60 | lis.append("id_day_"+str(id_day[uid])) 61 | print >> out,",".join(lis) 62 | f.close() 63 | out.close() 64 | 65 | prep(ftr,ftrain,False) 66 | prep(fte,ftest,True) -------------------------------------------------------------------------------- /script/prep_1.py: -------------------------------------------------------------------------------- 1 | import marshal 2 | 3 | ftr = "../train_c" 4 | fte = "../test_c" 5 | fset = marshal.load(open("../fc")) 6 | rare_d = marshal.load(open("../rare_d")) 7 | ftrain = "../train_pre_1" 8 | ftest = "../test_pre_1" 9 | 10 | id_day = marshal.load(open("../id_day")) 11 | 12 | def prep(input,output,isTest): 13 | f = open(input) 14 | out = open(output,"w") 15 | line = f.readline() 16 | print >> out,line[:-1] + ",C29" 17 | count = 
0 18 | bias = 3 19 | if isTest: 20 | bias = 2 21 | while True: 22 | line = f.readline() 23 | if not line: 24 | break 25 | count += 1 26 | if count % 100000 == 0: 27 | print count 28 | lis = line[:-1].split(",") 29 | uid = "??" 30 | ip = "??" 31 | for i in xrange(bias,len(lis)): 32 | name = chr(ord('a') + i - bias) 33 | if name == "j": 34 | ip = name + "_" + lis[i] 35 | rare = rare_d.get(ip) 36 | if rare != None: 37 | lis[i] = "j_rare_" + str(rare) 38 | #print lis[i] 39 | continue 40 | if name == "i": 41 | id = name + "_" + lis[i] 42 | rare = rare_d.get(id) 43 | if rare != None: 44 | lis[i] = "i_rare_" + str(rare) 45 | #print lis[i] 46 | continue 47 | if name == "v": 48 | iid = name + "_" + lis[i] 49 | uid = iid 50 | rare = rare_d.get(iid) 51 | if rare != None: 52 | lis[i] = "v_rare_" + str(rare) 53 | continue 54 | if name + "_" + lis[i] not in fset and i < len(lis) - 6: 55 | lis[i] = name + "_rare" 56 | else: 57 | lis[i] = name + "_" + lis[i] 58 | if name == "j" and id_day[ip] == 1 : 59 | lis[i] += "#ips" 60 | lis.append("id_day_"+str(id_day[uid])) 61 | print >> out,",".join(lis) 62 | f.close() 63 | out.close() 64 | 65 | prep(ftr,ftrain,False) 66 | prep(fte,ftest,True) -------------------------------------------------------------------------------- /script/rare.py: -------------------------------------------------------------------------------- 1 | import marshal 2 | 3 | def stat(input,isTest): 4 | f = open(input) 5 | line = f.readline() 6 | count = 0 7 | while True: 8 | line = f.readline() 9 | if not line: 10 | break 11 | count += 1 12 | if count % 100000 == 0: 13 | print count 14 | lis = line.split(",") 15 | index = 11 16 | if isTest: 17 | index = 10 18 | id = "i_"+lis[index] 19 | ip = "j_" + lis[index+1] 20 | iid = "v_" + lis[len(lis)-7] 21 | if id in d: 22 | d[id] += 1 23 | else: 24 | d[id] = 1 25 | if ip in d: 26 | d[ip] += 1 27 | else: 28 | d[ip] = 1 29 | if iid in d: 30 | d[iid] += 1 31 | else: 32 | d[iid] = 1 33 | f.close() 34 | 35 | d = {} 36 | 37 | 
stat("../train_c",False) 38 | stat("../test_c",True) 39 | 40 | rare_d = {} 41 | 42 | for k in d: 43 | if d[k] <=10: 44 | rare_d[k] = d[k] 45 | 46 | marshal.dump(rare_d,open("../rare_d","w")) -------------------------------------------------------------------------------- /script/run.sh: -------------------------------------------------------------------------------- 1 | cd ../fm 2 | make 3 | cd ../gbdt 4 | make 5 | cd ../script 6 | 7 | pypy addc.py 8 | pypy fcount.py 9 | pypy rare.py 10 | pypy id_day.py 11 | pypy prep.py 12 | pypy id_stat.py 13 | pypy gbdt_dense.py 14 | pypy index1.py 15 | pypy index2.py 16 | ../gbdt/gbdt -d 5 -t 19 ../test_dense ../train_dense ../test_gbdt_out ../train_gbdt_out 17 | 18 | # fm model 1 19 | pypy append_gbdt.py 20 | ../fm/fm -k 8 -t 5 -l 0.00003 ../fm_test_2 ../fm_train_2 21 | 22 | # fm model 2 23 | pypy append_gbdt_1.py 24 | ../fm/fm -k 8 -t 4 -l 0.00004 ../fm_test_2_1 ../fm_train_2_1 25 | ../fm/fm -k 8 -t 10 -l 0.00005 ../fm_test_2_2 ../fm_train_2_2 26 | pypy split.py ../fm_test_2_split ../fm_test_2_1.out ../fm_test_2_2.out 27 | 28 | # ftrl model prepare 29 | pypy prep_1.py 30 | pypy append.py 31 | pypy genDict.py 32 | pypy genM.py 33 | python lsa.py 34 | 35 | # ftrl model 1 36 | pypy ftrl_1.py 37 | 38 | # ftrl model 2 39 | pypy ftrl_2.py 40 | 41 | # ensemble 42 | pypy ensemble.py -------------------------------------------------------------------------------- /script/split.py: -------------------------------------------------------------------------------- 1 | import math 2 | import sys 3 | 4 | def load_result(result): 5 | fr = open(result) 6 | r = [] 7 | y = [] 8 | while True: 9 | line1 = fr.readline() 10 | if not line1 : 11 | break 12 | r.append(float(line1)) 13 | fr.close() 14 | return r 15 | 16 | 17 | def merge(output,input1,input2): 18 | r1 = load_result(input1) 19 | r2 = load_result(input2) 20 | ori_file = "../test_pre" 21 | index = 1 22 | 23 | f = open(ori_file) 24 | sub = open(output,"w") 25 | line = f.readline() 26 | 27 | 
cnt1 = -1 28 | cnt2 = -1 29 | while True: 30 | line = f.readline() 31 | if not line: 32 | break 33 | fields = line.split(",") 34 | if fields[index+3]=="c_85f751fd": 35 | cnt2 += 1 36 | print >> sub, str(r2[cnt2]) 37 | else: 38 | cnt1 += 1 39 | print >> sub, str(r1[cnt1]) 40 | f.close() 41 | 42 | 43 | merge(sys.argv[1],sys.argv[2],sys.argv[3]) 44 | --------------------------------------------------------------------------------