├── CMakeLists.txt ├── Eigen └── down.txt ├── README.md ├── index.html ├── install.pdf ├── python ├── KOL.exe ├── a9a_test ├── a9a_train ├── cross.py ├── german ├── learn.py └── rand.py ├── sample_dataset ├── a9a_test └── a9a_train └── src ├── Params.cpp ├── Params.h ├── common ├── ezOptionParser.hpp ├── init_param.h ├── md5.h └── util.h ├── data ├── Cacher.cpp ├── DataPoint.h ├── DataReader.h ├── DataSet.h ├── DataSetHelper.h ├── MNISTConvert.cpp ├── MNISTReader.h ├── basic_io.cpp ├── basic_io.h ├── comp.h ├── data_analysis.cpp ├── gzip_io.cpp ├── gzip_io.h ├── io_handler.h ├── io_interface.h ├── libsvm_binary.h ├── libsvmread.h ├── makefile ├── parser.h ├── s_array.h ├── thread_primitive.h ├── zlib_io.cpp └── zlib_io.h ├── kernel ├── kernel_RBP.h ├── kernel_bogd.h ├── kernel_bpas.h ├── kernel_fogd.h ├── kernel_forgetron.h ├── kernel_nogd.h ├── kernel_optim.h ├── kernel_pa.h ├── kernel_perceptron.h ├── kernel_projectron.h ├── kernel_projectronpp.h └── kernel_sgd.h ├── loss ├── HingeLoss.h ├── LogisticLoss.h ├── LossFunction.h ├── SquareLoss.h └── SquaredHingeLoss.h ├── main.cpp └── makefile /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(KOL) 4 | 5 | set (EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) 6 | #IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) 7 | set (CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR}) 8 | #ENDIF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) 9 | 10 | #check if Eigen exists 11 | find_path(EIGEN_PATH NAMES Eigen PATHS ${EIGEN_DIR} REQUIRED) 12 | if (NOT EIGEN_PATH) 13 | message(FATAL_ERROR "Eigen is not found, please specify by: -DEIGEN_DIR=") 14 | endif (NOT EIGEN_PATH) 15 | 16 | 17 | FILE(GLOB data_files src/data/*.h) 18 | FILE(GLOB loss_files src/loss/*.h) 19 | FILE(GLOB opti_files src/kernel/*.h) 20 | FILE(GLOB comm_files src/common/*.h) 21 | 22 | set (data_files ${data_files} 23 | src/data/basic_io.cpp 24 | ) 25 | 26 | include_directories( 
27 | ${EIGEN_PATH} 28 | ) 29 | 30 | #set (data_files ${datafiles} 31 | # src/data/basic_io.cpp 32 | # src/data/zlib_io.cpp 33 | # src/data/gzip_io.cpp 34 | # ) 35 | source_group("data" FILES ${data_files}) 36 | source_group("loss" FILES ${loss_files}) 37 | source_group("kernel" FILES ${opti_files}) 38 | source_group("common" FILES ${comm_files}) 39 | 40 | set (SRC_LIST 41 | ${data_files} ${loss_files} ${opti_files} ${comm_files} 42 | ) 43 | 44 | IF(CMAKE_COMPILER_IS_GNUCXX) 45 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-write-strings -O2 -s") 46 | ENDIF(CMAKE_COMPILER_IS_GNUCXX) 47 | 48 | add_executable(KOL src/Params.cpp src/Params.h 49 | src/main.cpp ${SRC_LIST}) 50 | IF(UNIX) 51 | target_link_libraries(KOL pthread) 52 | ENDIF(UNIX) 53 | 54 | add_executable(Cacher src/data/Cacher.cpp ${data_files}) 55 | IF(UNIX) 56 | target_link_libraries(Cacher pthread) 57 | ENDIF(UNIX) 58 | 59 | install(TARGETS KOL Cacher 60 | DESTINATION .) 61 | -------------------------------------------------------------------------------- /Eigen/down.txt: -------------------------------------------------------------------------------- 1 | eigen.tuxfamily.org -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | LIBOKL-- A Library for Online Kernel Learning Algorithms 2 | 3 | Authors: 4 | Lu Jing, Wu Yue, Steven Hoi 5 | 6 | Contact: chhoi@ntu.edu.sg, jing.lu.2014@phdis.smu.edu.sg 7 | 8 | LIBOKL is a package for solving large scale online kernel learning tasks. The current version is in C++ and has a total of 10 different online single kernel learning algorithm for binary classification, which are all widely used in online kernel learning research. We also provide other packages for multi-class classification and regression and multiple kernel learning. See the link at the end. 9 | 10 | The algorithms in this package includes: 11 | 12 | 1. 
Perceptron: The kernelized Perceptron without budget maintenance. http://cseweb.ucsd.edu/~yfreund/papers/LargeMarginsUsingPerceptron.pdf 13 | 14 | 2. Online Gradient Descent (OGD): The kernelized online gradient descent algorithm without budget maintenance. 15 | http://eprints.pascal-network.org/archive/00002055/01/KivSmoWil04.pdf 16 | 17 | 3. Random Budget Perceptron (RBP): Budgeted perceptron algorithm with random support vector removal strategy. 18 | http://air.unimi.it/bitstream/2434/26350/1/J29.pdf 19 | 20 | 4. Forgetron: Forgetron algorithm that maintains the budget size by discarding the oldest support vectors. 21 | http://papers.nips.cc/paper/2806-the-forgetron-a-kernel-based-perceptron-on-a-fixed-budget.pdf 22 | 23 | 5. Projectron: The Projectron algorithm using budget projection strategy. 24 | http://eprints.pascal-network.org/archive/00004472/01/355.pdf 25 | 26 | 6. Projectron++: The aggressive version of Projectron algorithm that updates with both margin error and mistake case. 27 | http://eprints.pascal-network.org/archive/00004472/01/355.pdf 28 | 29 | 7. BPAs: The budget passive-aggressive algorithm with simple support removal strategy. 30 | http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_WangV10.pdf 31 | 32 | 8. BOGD: The budget online gradient descent algorithm by SV removal strategy 33 | http://arxiv.org/ftp/arxiv/papers/1206/1206.4633.pdf 34 | 35 | 9. FOGD: The Fourier Online Gradient Descent algorithm using functional approximation method. 36 | http://jingonline.weebly.com/uploads/5/3/7/3/53733905/lu15a.pdf 37 | 38 | 10. NOGD: The Nystrom Online Gradient Descent algorithm using functional approximation method.[pdf] 39 | http://jingonline.weebly.com/uploads/5/3/7/3/53733905/lu15a.pdf 40 | 41 | The last two were proposed by our group and published on Journal of Machine Learning Research. 
If you need to use this code package, please cite our paper as: 42 | ________________________________________ 43 | 44 | Lu J, Hoi S C H, Wang J, et al. Large scale online kernel learning[J]. Journal of Machine Learning Research, 2016, 17(47): 1. 45 | 46 | or bib: 47 | ________________________________________ 48 | @article{lu2016large, 49 | title={Large scale online kernel learning}, 50 | author={Lu, Jing and Hoi, Steven CH and Wang, Jialei and Zhao, Peilin and Liu, Zhi-Yong}, 51 | journal={Journal of Machine Learning Research}, 52 | volume={17}, 53 | number={47}, 54 | pages={1}, 55 | year={2016}, 56 | publisher={Journal of Machine Learning Research/Microtome Publishing} 57 | } 58 | _________________________________________ 59 | 60 | To get started, please refer to the file install.pdf, which provide a detailed step-by-step guide on the installation of this package. Before it, an Eigen package is needed (http://eigen.tuxfamily.org/index.php?title=Main_Page). After building, we get an executable file KOL and use it in command line. 61 | _______________________________________ 62 | 63 | Prepare for the input data 64 | 65 | We use the LIBSVM dataset formate, which is an effcient sparse data representation as input. Each instance in the dataset is represented by a row of numbers ended by "\n". For example: 66 | 67 | +1 5:1 16:1 20:1 37:1 40:1 63:1 68:1 73:1 74:1 76:1 82:1 93:1 68 | 69 | -1 2:1 6:1 18:1 19:1 39:1 40:1 52:1 61:1 71:1 72:1 74:1 76:1 80:1 95:1 70 | 71 | In the above dataset, there are 2 instances stored in two rows. Each row begins with the class label of this instance. In binary classification the label appears in two forms: {+1, -1}. Note that some dataset files might be labeled with {0, 1}, which is not allowed by our toolbox. They have to be preprocessed and transformed to the {-1,+1} formate. Following the label, the feature values appears in form feature_index:feature_value. This is a sparse feature representation. 
If one certain feature index does not appear, it indicates that its value is zero. 72 | 73 | Our toolbox is well designed to follow the standard online learning setting and load the dataset sequentially. So there is no memory limitation at all for large scale datasets. Users are not required to input the feature dimension of the dataset before training, since the algorithm will automatically adjust to the increase of feature dimension. 74 | 75 | _________________________________ 76 | 77 | Command Line 78 | 79 | After compiling the code of the toolbox and getting the executable file "KOL", we can use command line mode to run the algorithms: 80 | 81 | >>KOL -i training_dataset [-t testing_dataset] -opt algorithm_name [parameter setting] 82 | 83 | KOL is the name of the executable file we got from compiling the code. -i training_dataset is a necessary input indicating the training dataset name. -opt algorithm_name is another necessary input indicating the selected algorithm for learning. -t testing_dataset is an optional input indicating the testing dataset name. If not indicated, the algorithm will only conduct the training process and output the online training accuracy and time cost. Parameter setting is also optional and varies among different algorithms. If not indicated, the algorithm will use default setting. 84 | 85 | ______________________________________ 86 | 87 | A quick example: 88 | 89 | We may download the a9a datasets and perform the online kernel learning using the perceptron algorithm. 
We try the following command line: 90 | 91 | >>KOL -i a9a_train -t a9a_test -opt kernel-perceptron 92 | 93 | The output is as follows: 94 | 95 | Algorithm: kernel_perceptron 96 | 97 | 0 10000 20000 30000 98 | 99 | #Training Instances:32561 100 | 101 | Learn acuracy: 78.851997% 102 | 103 | #SV:6887 104 | 105 | Learning time: 10.218000 s 106 | 107 | Test acuracy: 70.738899 % 108 | 109 | Test time: 9.766000 s 110 | 111 | The second line indicates the number of processed training samples until now, which can give an intuitive impression of the processing speed. This is a necessary output in the case when the training time is extremely long. The output includes the training accuracy, training time cost (including loading time), the number of support vectors, test accuracy and test time (including loading time). 112 | 113 | __________________________________________________________ 114 | 115 | Parameter Setting: 116 | 117 | Each algorithm has its own set of parameters. We will give detailed explanations about the usage of each algorithm. 118 | 119 | parameter command line default value 120 | 121 | the gaussian width parameter for gaussian kernel exp(-\gamma||x-y||_2^2) -gamma gamma=0.01 122 | 123 | budget size for all budget algorithms, the max number of support vectors -B B=100 124 | 125 | the learning rate for gradient descent based algorithms -eta eta= 0.5 126 | 127 | the regularizer parameter for bogd -lambda lambda=0.01 128 | 129 | 130 | For parameters specific to some algorithms, we will introduce with the following examples: 131 | 132 | 1. Perceptron: 133 | 134 | >>KOL -i a9a_train -t a9a_test -opt kernel-perceptron -gamma 0.1 135 | 136 | 2. OGD: 137 | 138 | >>KOL -i a9a_train -t a9a_test -opt kernel-ogd -eta 0.1 -gamma 0.01 139 | 140 | 3. RBP 141 | 142 | >>KOL -i a9a_train -t a9a_test -opt kernel-rbp -B 300 143 | 144 | 4. Kernel-forgetron 145 | 146 | >>KOL -i a9a_train -t a9a_test -opt kernel-forgetron -B 300 -gamma 0.01 147 | 148 | 5. 
Kernel-projectron 149 | 150 | >>KOL -i a9a_train -t a9a_test -opt kernel-projectron -B 300 151 | 152 | 6. Kernel-projectronpp 153 | 154 | >>KOL -i a9a_train -t a9a_test -opt kernel-projectronpp -B 300 -gamma 0.01 155 | 156 | 7. Kernel-bpas 157 | 158 | >>KOL -i a9a_train -t a9a_test -opt kernel-bpas -B 300 -cbpas 1 -gamma 0.01 159 | 160 | Note that the parameter cbpas is the weight paramter C, which controls the step size. default value is 1. 161 | 162 | 8: BOGD 163 | 164 | >>KOL -i a9a_train -opt kernel-bogd -B 300 -lambda 0.1 -eta 0.1 -gamma 0.01 165 | 166 | 9: FOGD 167 | 168 | >>KOL -i a9a_train -opt kernel-fogd -D 400 -eta 0.001 -gamma 0.001 169 | 170 | Note that the parameter D is the number of fourier components for the FOGD algorithm. default value is 400 171 | 172 | 10: NOGD 173 | 174 | >>KOL -i a9a_train -opt kernel-nogd -knogd 30 -eta 0.1 -eta1 0.3 -gamma 0.01 -B 300 175 | 176 | Note that the parameter -knogd is the matrix rank for SVD. default value 20. The eta is the kernel step size and eta1 is the linear step size, both with 0.5 default value. 
177 | ____________________________________________________ 178 | 179 | Related links: 180 | 181 | Steven Hoi's home page: http://stevenhoi.org/ 182 | 183 | LU Jing's home page: http://jingonline.weebly.com/ 184 | 185 | LIBOL: http://libol.stevenhoi.org/ 186 | 187 | LIBSOL: http://libsol.stevenhoi.org/ 188 | 189 | Eigen: http://eigen.tuxfamily.org/index.php?title=Main_Page 190 | 191 | LIBSVM: https://www.csie.ntu.edu.tw/~cjlin/libsvm/ 192 | 193 | Journal of Machine Learning Reseaerch: http://jmlr.org/papers/v17/14-148.html 194 | 195 | 196 | Our Matlab codes for all experiments in the research paper:https://github.com/jingcoco/Online-Kernel-Learning 197 | 198 | Our follow-up research in online multiple kernel learning: https://github.com/jingcoco/Online-Multiple-Kernel-Learning 199 | 200 | 201 | A follow-up work to our proposed algorithm in NIPS: https://papers.nips.cc/paper/6560-dual-space-gradient-descent-for-online-learning.pdf 202 | 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /install.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LIBOL/KOL/e8627b81635dde6007af4715ec9dc84b76e98152/install.pdf -------------------------------------------------------------------------------- /python/KOL.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LIBOL/KOL/e8627b81635dde6007af4715ec9dc84b76e98152/python/KOL.exe -------------------------------------------------------------------------------- /python/cross.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import platform 4 | 5 | 6 | train_file ='german'####training file name 7 | eta=['1','0.1','0.01','0.001','0.0001']##### 8 | gamma=['1','0.1','0.01','0.001','0.0001']##### 9 | 10 | 11 | 12 | fold_num=5 13 | count_cmd = 'wc -l %s' %train_file 14 | count_handler = 
os.popen(count_cmd) 15 | line_num = int(count_handler.read().split()[0]) 16 | count_handler.close() 17 | 18 | split_line_num = int(line_num / fold_num) 19 | 20 | split_list = [] 21 | for k in range(0,fold_num): 22 | file_name = train_file + '_cva' + chr(ord('a') + k) 23 | os.system('rm -f %s' %file_name) 24 | split_list.append(file_name) 25 | 26 | split_cmd = 'split -l {0} {1} {2}_cv'\ 27 | .format(split_line_num,train_file, train_file) 28 | 29 | os.system(split_cmd) 30 | 31 | f0 = open(split_list[0]) 32 | raw0=f0.read() 33 | f0.close() 34 | f1 = open(split_list[1]) 35 | raw1=f1.read() 36 | f1.close() 37 | f2 = open(split_list[2]) 38 | raw2=f2.read() 39 | f2.close() 40 | f3 = open(split_list[3]) 41 | raw3=f3.read() 42 | f3.close() 43 | f4 = open(split_list[4]) 44 | raw4=f4.read() 45 | f4.close() 46 | train_list=['train0','train1','train2','train3','train4'] 47 | output_file = open(train_list[0], 'w') 48 | output_file.write(raw1+raw2+raw3+raw4) 49 | output_file.close() 50 | 51 | output_file = open(train_list[1], 'w') 52 | output_file.write(raw0+raw2+raw3+raw4) 53 | output_file.close() 54 | 55 | output_file = open(train_list[2], 'w') 56 | output_file.write(raw0+raw1+raw3+raw4) 57 | output_file.close() 58 | 59 | output_file = open(train_list[3], 'w') 60 | output_file.write(raw0+raw1+raw2+raw4) 61 | output_file.close() 62 | 63 | output_file = open(train_list[4], 'w') 64 | output_file.write(raw0+raw1+raw2+raw3) 65 | output_file.close() 66 | 67 | del raw0 68 | del raw1 69 | del raw2 70 | del raw3 71 | del raw4 72 | 73 | 74 | for k in range(len(gamma)): 75 | for j in range(len(eta)): 76 | for i in range(0,5): 77 | command="KOL -i "+train_list[i]+" -opt kernel-ogd -t "+split_list[i]+" -eta "+eta[j]+' -gamma '+gamma[k]+' >>result.txt'#### 78 | os.system(command) 79 | 80 | f0 = open("result.txt") 81 | raw=f0.read() 82 | 83 | sum=0 84 | for i in range(0,5): 85 | indexleft=raw.find('Test acuracy:') 86 | indexright=indexleft+20 87 | sss=raw[indexleft+13:indexright] 88 | 
raw=raw[indexright+10:] 89 | accu=float(sss) 90 | sum=sum+accu 91 | 92 | accurate=sum/5 93 | print(gamma[k]+'\t'+eta[j]+"\t"+str(accurate)) 94 | f0.close() 95 | os.remove("result.txt") 96 | 97 | for i in range(0,5): 98 | os.remove(train_list[i]) 99 | os.remove(split_list[i]) 100 | -------------------------------------------------------------------------------- /python/learn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | data='a9a' 5 | train=data+'_train' 6 | test=data+'_test' 7 | train_set=[train,'train0','train1','train2','train3','train4','train5','train6','train7','train8','train9','train10'] 8 | for i in range(0,10): 9 | cmd='rand.py '+ train_set[i]+' '+ train_set[i+1] 10 | os.system(cmd) 11 | cmd='KOL.exe -i '+ train_set[i]+" -t "+test+' -opt kernel-ogd >>"reslut_project.txt' 12 | os.system(cmd) 13 | 14 | 15 | f0 = open("reslut_project.txt") 16 | raw=f0.read() 17 | sum=0 18 | for i in range(0,10): 19 | indexleft=raw.find('Test acuracy:') 20 | indexright=indexleft+20 21 | sss=raw[indexleft+13:indexright] 22 | raw=raw[indexright+10:] 23 | accu=float(sss) 24 | sum=sum+accu 25 | print(accu) 26 | 27 | accurate=sum/10 28 | print("\n") 29 | print("test accuracy") 30 | print(accurate) 31 | print("\n") 32 | 33 | 34 | f0 = open("reslut_project.txt") 35 | raw=f0.read() 36 | sum=0 37 | for i in range(0,10): 38 | indexleft=raw.find('Learning time: ') 39 | indexright=indexleft+22 40 | sss=raw[indexleft+15:indexright] 41 | raw=raw[indexright+10:] 42 | accu=float(sss) 43 | sum=sum+accu 44 | 45 | accurate=sum/10 46 | print("\n") 47 | print("learning time:") 48 | print(accurate) 49 | print("\n") 50 | 51 | f0 = open("reslut_project.txt") 52 | raw=f0.read() 53 | sum=0 54 | for i in range(0,10): 55 | indexleft=raw.find('Test time: ') 56 | indexright=indexleft+18 57 | sss=raw[indexleft+11:indexright] 58 | raw=raw[indexright+10:] 59 | accu=float(sss) 60 | sum=sum+accu 61 | 62 | accurate=sum/10 63 | 
print("\n") 64 | print("Test time") 65 | print(accurate) 66 | print("\n") 67 | f0.close() 68 | #os.remove('reslut1.txt') 69 | 70 | 71 | -------------------------------------------------------------------------------- /python/rand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """rand a dataset""" 3 | 4 | import sys 5 | import random 6 | def Usage(): 7 | print 'rand.py inputfile outputfile' 8 | 9 | if len(sys.argv) != 3: 10 | Usage() 11 | sys.exit() 12 | input_file = sys.argv[1] 13 | output_file = sys.argv[2] 14 | file_handler = open(input_file,'r') 15 | content = file_handler.readlines() 16 | #print content[-1] 17 | if content[-1][-1] != '\n': 18 | #print content[-1][-1] 19 | content[-1]+='\n' 20 | file_handler.close() 21 | 22 | random.shuffle(content) 23 | 24 | file_handler = open(output_file, 'w') 25 | file_handler.writelines(content) 26 | file_handler.close() 27 | -------------------------------------------------------------------------------- /src/Params.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: Params.cpp 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Thu 26 Sep 2013 05:49:18 PM SGT 5 | > Functions: Class for Parsing parameters 6 | ************************************************************************/ 7 | #include "Params.h" 8 | #include "common/util.h" 9 | #include "common/init_param.h" 10 | 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | using namespace ez; 16 | 17 | namespace SOL { 18 | Params::Params() { 19 | this->vfloat = new ezOptionValidator("f"); 20 | this->vint = new ezOptionValidator("u4"); 21 | this->vbool = new ezOptionValidator("t","in","true,false",false); 22 | 23 | this->Init(); 24 | } 25 | 26 | Params::~Params(){ 27 | } 28 | void Params::Init(){ 29 | //initialize params 30 | opt.overview = "Sparse Online Learning 
Library"; 31 | opt.syntax = "SOL [options] -i train_file" ; 32 | opt.example = "SOL -i train_file -opt SGD"; 33 | 34 | opt.add("",0,0,',',"help message","-h","--help"); 35 | 36 | this->add_option("",0,1,"train file","-i", &this->fileName); 37 | this->add_option("",0,1,"test file name","-t",&this->test_fileName); 38 | this->add_option("",0,1,"cached train file name","-c",&this->cache_fileName); 39 | this->add_option("",0,1,"cached test file name","-tc",&this->test_cache_fileName); 40 | 41 | this->add_option(init_data_type,0,1,"data type format","-dt",&this->str_data_type); 42 | this->add_option(init_buf_size,0,1,"number of chunks for buffering","-bs",&this->buf_size); 43 | 44 | this->add_option(init_loss_type,0,1,"loss function type:\nHinge, Logit, Square, SquareHinge","-loss",&this->str_loss); 45 | 46 | this->add_option(init_opti_method,0,1, 47 | "optimization method:\nSGD, STG, RDA, RDA_E, FOBOS, Ada-RDA, Ada-FOBOS, AROW, SAROW, CW-RDA, SCW-RDA","-opt", &this->str_opt); 48 | this->add_option(init_is_learn_best_param,0,0,"learn best parameter", 49 | "-lbp", &this->is_learn_best_param); 50 | this->add_option(init_eta,0,1,"learning rate", "-eta",&this->eta); 51 | this->add_option(gamma_int,0,1,"sigma_kernel", "-gamma",&this->gamma); 52 | this->add_option(Budget_ini,0,1,"Budget", "-B",&this->Budget_set); 53 | this->add_option(D_fogd,0,1,"D_fogd", "-D",&this->D_set); 54 | this->add_option(init_power_t,0,1,"power t of decaying learning rate","-power_t",&this->power_t); 55 | this->add_option(init_initial_t,0,1,"initial iteration number","-t0",&this->initial_t); 56 | this->add_option(init_lambda,0,1,"l1 regularization","-lambda", &this->lambda); 57 | this->add_option(1,0,1,"number of passes","-passes", &this->passNum); 58 | this->add_option(k_nogd_ini,0,1,"k_nogd","-knogd", &this->k_nogd); 59 | this->add_option(init_eta,0,1,"eta1: linear step size for NOGD","-eta1", &this->eta1); 60 | this->add_option(10000,0,1,"c for pa","-cpa",&this->C); 61 | this->add_option(C_bpas_ini,0,1,"c for 
bpas","-cbpas",&this->C_bpas); 62 | 63 | } 64 | 65 | void Params::add_option(float default_val, bool is_required, int expectArgs, 66 | const char* descr, const char* flag, float *storage){ 67 | *storage = default_val; 68 | this->opt.add("",is_required,expectArgs,0,descr,flag,this->vfloat); 69 | this->flag2storage_float[flag] = storage; 70 | } 71 | 72 | 73 | void Params::add_option(int default_val, bool is_required, int expectArgs, 74 | const char* descr, const char* flag, int *storage){ 75 | *storage = default_val; 76 | this->opt.add("",is_required,expectArgs,0,descr,flag,this->vint); 77 | this->flag2storage_int[flag] = storage; 78 | } 79 | void Params::add_option(bool default_val, bool is_required, int expectArgs, 80 | const char* descr, const char* flag, bool *storage){ 81 | *storage = default_val; 82 | this->opt.add("",is_required,expectArgs,0,descr,flag, this->vbool); 83 | this->flag2storage_bool[flag] = storage; 84 | } 85 | 86 | void Params::add_option(const char* default_val, bool is_required, int expectArgs, 87 | const char* descr, const char* flag, string *storage){ 88 | *storage = default_val; 89 | this->opt.add("",is_required,expectArgs,0,descr,flag); 90 | this->flag2storage_str[flag] = storage; 91 | } 92 | 93 | bool Params::Parse(int argc, const char** args) { 94 | if (opt.isSet("-h")){ 95 | this->Help(); 96 | return false; 97 | } 98 | opt.parse(argc, args); 99 | vector badOptions; 100 | if (!opt.gotRequired(badOptions)){ 101 | for (size_t i = 0; i < badOptions.size(); i++) 102 | cerr<<"ERROR: Missing required option "<Help(); 104 | return false; 105 | } 106 | if (!opt.gotExpected(badOptions)){ 107 | for (size_t i = 0; i < badOptions.size(); i++) 108 | cerr<<"ERROR: Got unexpected number of arguments for option "<Help(); 110 | return false; 111 | } 112 | for (map_float_iter iter = this->flag2storage_float.begin(); 113 | iter != this->flag2storage_float.end(); iter++){ 114 | if (opt.isSet(iter->first.c_str())) 115 | 
opt.get(iter->first.c_str())->getFloat(*(iter->second)); 116 | } 117 | 118 | for (map_int_iter iter = this->flag2storage_int.begin(); 119 | iter != this->flag2storage_int.end(); iter++){ 120 | if (opt.isSet(iter->first.c_str())) 121 | opt.get(iter->first.c_str())->getInt(*(iter->second)); 122 | } 123 | for (map_bool_iter iter = this->flag2storage_bool.begin(); 124 | iter != this->flag2storage_bool.end(); iter++){ 125 | if (opt.isSet(iter->first.c_str())) 126 | if (opt.get(iter->first.c_str())->expectArgs == 0) 127 | *(iter->second) = true; 128 | else{ 129 | string out; 130 | opt.get(iter->first.c_str())->getString(out); 131 | ToLowerCase(out); 132 | if (out == "true") 133 | *(iter->second) = true; 134 | else 135 | *(iter->second) = false; 136 | } 137 | } 138 | for (map_str_iter iter = this->flag2storage_str.begin(); 139 | iter != this->flag2storage_str.end(); iter++){ 140 | if (opt.isSet(iter->first.c_str())) 141 | opt.get(iter->first.c_str())->getString(*(iter->second)); 142 | } 143 | 144 | if (this->cache_fileName.size() == 0 && this->fileName.length() == 0){ 145 | cerr<<"you must specify the training data"< File Name: Params.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Thu 26 Sep 2013 05:51:05 PM SGT 5 | > Functions: Class for Parsing parameters 6 | ************************************************************************/ 7 | 8 | #ifndef HEADER_PARSER_PARAM 9 | #define HEADER_PARSER_PARAM 10 | 11 | #include "common/ezOptionParser.hpp" 12 | 13 | #include "data/parser.h" 14 | 15 | #include 16 | #include 17 | 18 | 19 | using std::string; 20 | using std::map; 21 | 22 | //using namespace ez; 23 | 24 | namespace SOL 25 | { 26 | class Params 27 | { 28 | private: 29 | ez::ezOptionParser opt; 30 | ez::ezOptionValidator* vfloat; 31 | ez::ezOptionValidator* vint; 32 | ez::ezOptionValidator* vbool; 33 | 34 | map flag2storage_float; 35 | map flag2storage_int; 36 | map flag2storage_bool; 37 | map flag2storage_str; 38 | 39 | typedef map::iterator map_float_iter; 40 | 
typedef map::iterator map_int_iter; 41 | typedef map::iterator map_bool_iter; 42 | typedef map::iterator map_str_iter; 43 | 44 | public: 45 | //input data 46 | string fileName; //source file name 47 | string cache_fileName; //cached file name 48 | string test_fileName; //test file name 49 | string test_cache_fileName; //cached test file name 50 | 51 | //dataset type 52 | string str_data_type; 53 | //loss function type 54 | string str_loss; 55 | //optimization method 56 | string str_opt; 57 | 58 | int passNum; 59 | int D_set; 60 | bool ave; 61 | 62 | //optimzation parameters 63 | float eta; //learning rate 64 | float eta1; 65 | float gamma; 66 | float lambda; //for l1 regularization 67 | int K; //for STG method 68 | int Budget_set; 69 | float gamma_rou; //for RDA 70 | int k_nogd; 71 | float delta; //for Ada- 72 | float r; //for AROW 73 | float phi; //for SCW 74 | float C; 75 | int buf_size; //number of chunks in dataset 76 | int start_ave; 77 | float C_bpas; 78 | 79 | int initial_t; 80 | float power_t; 81 | bool is_learn_best_param; //whether learn best parameter 82 | 83 | bool is_normalize; 84 | 85 | float beta_spa; 86 | float alpha_spa; 87 | int weight_sum; 88 | float delt_max; 89 | public: 90 | Params(); 91 | ~Params(); 92 | 93 | bool Parse(int argc, const char** args); 94 | void Help(); 95 | 96 | private: 97 | void Init(); 98 | 99 | void add_option(float default_val, bool is_required, int expectArgs, 100 | const char* descr, const char* flag, float *storage); 101 | void add_option(int default_val, bool is_required, int expectArgs, 102 | const char* descr, const char* flag, int *storage); 103 | void add_option(bool default_val, bool is_required, int expectArgs, 104 | const char* descr, const char* flag, bool *storage); 105 | void add_option(const char* default_val, bool is_required, int expectArgs, 106 | const char* descr, const char* flag, string *storage); 107 | }; 108 | } 109 | #endif 110 | 
-------------------------------------------------------------------------------- /src/common/init_param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: init_param.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/9/28 15:12:27 5 | > Functions: init parameters 6 | ************************************************************************/ 7 | 8 | #ifndef HEADER_INIT_PARAM 9 | #define HEADER_INIT_PARAM 10 | 11 | #include 12 | #include 13 | namespace SOL { 14 | #define IndexType uint32_t 15 | 16 | //compress cache 17 | #define BASIC_IO 0 18 | #define GZIP_IO 1 19 | #define ZLIB_IO 2 20 | // 21 | /////////////////////Optimizer Initalization parameters////////////////// 22 | // 23 | //learning rate for sgd 24 | static const float init_eta = 0.5; 25 | //budget size 26 | static const int Budget_ini=100; 27 | static const float lambda_ini=0.01f;/// 28 | static const float C_bpas_ini=1; 29 | static const int D_fogd=4*Budget_ini; 30 | static const float gamma_int=0.01; 31 | static const float ini_eta_fogd=5e-4f; 32 | static const int k_nogd_ini= (int)(0.2*Budget_ini); 33 | // static const int x_ini_dimension=24;///////////////for fogd 34 | 35 | /////////////////////Optimizer Initalization parameters////////////////// 36 | // 37 | //whether to learn the best parameter 38 | static const bool init_is_learn_best_param = false; 39 | //learning rate 40 | 41 | static const float init_eta_max = 128.f; 42 | static const float init_eta_min = 1.f; 43 | static const float init_eta_step = 2.f; 44 | //pow decaying learing rate 45 | static const float init_power_t = 0.5; 46 | //initial t 47 | static const int init_initial_t = 1; 48 | //l1 regularization 49 | static const float init_lambda = 0.001; 50 | //sparse soft threshold when counting zero-weights 51 | static const float init_sparse_soft_thresh = (float)(1e-5); 52 | //truncate gradients every K 
steps 53 | static const int init_k = 10; 54 | //gammarou in enchanced RDA 55 | static const float init_gammarou = 25; 56 | //delta in adaptive algorithms 57 | static const float init_delta = 10; 58 | static const float init_delta_max = 16.f; 59 | static const float init_delta_min = 0.125f; 60 | static const float init_delta_step = 2.f; 61 | //r in AROW 62 | static const float init_r = 1; 63 | static const float init_r_max = 16.f; 64 | static const float init_r_min = 0.125f; 65 | static const float init_r_step = 2.f; 66 | 67 | //skip value in SVM2SGD 68 | static const int init_skip = 16; 69 | //intial value of norminv in Confidence weighted algorithms 70 | static const float init_phi = 1.f; 71 | //is normalize the data 72 | static const bool init_normalize = false; 73 | 74 | static const char* init_loss_type = "Hinge"; 75 | static const char* init_data_type = "LibSVM"; 76 | static const char* init_opti_method = "SGD"; 77 | 78 | //trying the optimal parameters 79 | 80 | 81 | 82 | ////////////////////Data Set Reader Parameters/////////////////////////// 83 | static const size_t init_chunk_size = 256; 84 | static const size_t init_buf_size = 2; 85 | 86 | //////////////////////Zlib Parameters///////////////////////////// 87 | static const int zlib_deflate_level = -1; // use default deflate level 88 | static const size_t zlib_buf_size = 16348; //default buffer size of zlib 89 | } 90 | #endif 91 | -------------------------------------------------------------------------------- /src/common/md5.h: -------------------------------------------------------------------------------- 1 | /* MD5 2 | converted to C++ class by Frank Thilo (thilo@unix-ag.org) 3 | for bzflag (http://www.bzflag.org) 4 | 5 | based on: 6 | 7 | md5.h and md5.c 8 | reference implementation of RFC 1321 9 | 10 | Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All 11 | rights reserved. 
/* MD5
 converted to C++ class by Frank Thilo (thilo@unix-ag.org)
 for bzflag (http://www.bzflag.org)

 based on:

 md5.h and md5.c
 reference implementation of RFC 1321

 Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
 rights reserved.

 License to copy and use this software is granted provided that it
 is identified as the "RSA Data Security, Inc. MD5 Message-Digest
 Algorithm" in all material mentioning or referencing this software
 or this function.

 License is also granted to make and use derivative works provided
 that such works are identified as "derived from the RSA Data
 Security, Inc. MD5 Message-Digest Algorithm" in all material
 mentioning or referencing the derived work.

 RSA Data Security, Inc. makes no representations concerning either
 the merchantability of this software or the suitability of this
 software for any particular purpose. It is provided "as is"
 without express or implied warranty of any kind.

 These notices must be retained in any copies of any part of this
 documentation and/or software.

*/

#ifndef BZF_MD5_H
#define BZF_MD5_H

// NOTE(review): the original #include targets were stripped during extraction;
// these four cover every name used below (memcpy/memset, snprintf, string, ostream).
#include <cstring>
#include <cstdio>
#include <string>
#include <iostream>

// Constants for MD5Transform routine: per-round left-rotation amounts (RFC 1321).
#define S11 7
#define S12 12
#define S13 17
#define S14 22
#define S21 5
#define S22 9
#define S23 14
#define S24 20
#define S31 4
#define S32 11
#define S33 16
#define S34 23
#define S41 6
#define S42 10
#define S43 15
#define S44 21

// a small class for calculating MD5 hashes of strings or byte arrays
// it is not meant to be fast or secure
//
// usage: 1) feed it blocks of uchars with update()
//        2) finalize()
//        3) get hexdigest() string
//        or
//        MD5(std::string).hexdigest()
//
// assumes that char is 8 bit and int is 32 bit
class MD5
{
public:
    typedef unsigned int size_type; // must be 32bit

    MD5() { init(); }

    // compute the digest of a whole string in one shot
    MD5(const std::string& text)
    {
        init();
        update(text.c_str(), text.length());
        finalize();
    }

    // absorb `length` bytes of input; may be called repeatedly before finalize()
    void update(const unsigned char input[], size_type length)
    {
        // compute number of bytes mod 64
        size_type index = count[0] / 8 % blocksize;

        // Update number of bits (64-bit counter split across count[0]/count[1])
        if ((count[0] += (length << 3)) < (length << 3))
            count[1]++;
        count[1] += (length >> 29);

        // number of bytes we need to fill in buffer
        size_type firstpart = 64 - index;

        size_type i;

        // transform as many times as possible.
        if (length >= firstpart)
        {
            // fill buffer first, transform
            memcpy(&buffer[index], input, firstpart);
            transform(buffer);

            // transform chunks of blocksize (64 bytes)
            for (i = firstpart; i + blocksize <= length; i += blocksize)
                transform(&input[i]);

            index = 0;
        }
        else
            i = 0;

        // buffer remaining input
        memcpy(&buffer[index], &input[i], length - i);
    }

    // convenience overload for signed char buffers
    void update(const char input[], size_type length)
    {
        update((const unsigned char*)input, length);
    }

    // apply RFC 1321 padding + length, producing the final digest; idempotent
    MD5& finalize()
    {
        static const unsigned char padding[64] = {
            0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        };

        if (!finalized) {
            // Save number of bits
            unsigned char bits[8];
            encode(bits, count, 8);

            // pad out to 56 mod 64.
            size_type index = count[0] / 8 % 64;
            size_type padLen = (index < 56) ? (56 - index) : (120 - index);
            update(padding, padLen);

            // Append length (before padding)
            update(bits, 8);

            // Store state in digest
            encode(digest, state, 16);

            // Zeroize sensitive information.
            memset(buffer, 0, sizeof buffer);
            memset(count, 0, sizeof count);

            finalized = true;
        }

        return *this;
    }

    // 32-char lowercase hex digest; empty string if finalize() was not called
    std::string hexdigest() const
    {
        if (!finalized)
            return "";

        char buf[33];
        for (int i = 0; i < 16; i++)
            // snprintf instead of sprintf: identical output, bounded write
            snprintf(buf + i * 2, 3, "%02x", digest[i]);
        buf[32] = 0;

        return std::string(buf);
    }

    friend std::ostream& operator<<(std::ostream&, const MD5& md5);

private:
    void init()
    {
        finalized = false;

        count[0] = 0;
        count[1] = 0;

        // load magic initialization constants (RFC 1321).
        state[0] = 0x67452301;
        state[1] = 0xefcdab89;
        state[2] = 0x98badcfe;
        state[3] = 0x10325476;
    }

    typedef unsigned char uint1; //  8bit
    typedef unsigned int uint4;  // 32bit
    enum { blocksize = 64 };     // VC6 won't eat a const static int here

    // apply MD5 algo on one 64-byte block
    void transform(const uint1 block[blocksize])
    {
        uint4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
        decode(x, block, blocksize);

        /* Round 1 */
        FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
        FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
        FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
        FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
        FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
        FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
        FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
        FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
        FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
        FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
        FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
        FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
        FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
        FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
        FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
        FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */

        /* Round 2 */
        GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
        GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
        GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
        GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
        GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
        GG (d, a, b, c, x[10], S22, 0x2441453);  /* 22 */
        GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
        GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
        GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
        GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
        GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
        GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
        GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
        GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
        GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
        GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */

        /* Round 3 */
        HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
        HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
        HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
        HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
        HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
        HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
        HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
        HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
        HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
        HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
        HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
        HH (b, c, d, a, x[ 6], S34, 0x4881d05);  /* 44 */
        HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
        HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
        HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
        HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */

        /* Round 4 */
        II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
        II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
        II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
        II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
        II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
        II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
        II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
        II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
        II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
        II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
        II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
        II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
        II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
        II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
        II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
        II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */

        state[0] += a;
        state[1] += b;
        state[2] += c;
        state[3] += d;

        // Zeroize sensitive information.
        memset(x, 0, sizeof x);
    }

    // decodes input (unsigned char) into output (uint4). Assumes len is a multiple of 4.
    static void decode(uint4 output[], const uint1 input[], size_type len)
    {
        for (unsigned int i = 0, j = 0; j < len; i++, j += 4)
            output[i] = ((uint4)input[j]) | (((uint4)input[j+1]) << 8) |
                        (((uint4)input[j+2]) << 16) | (((uint4)input[j+3]) << 24);
    }

    // encodes input (uint4) into output (unsigned char). Assumes len is a multiple of 4.
    static void encode(uint1 output[], const uint4 input[], size_type len)
    {
        for (size_type i = 0, j = 0; j < len; i++, j += 4) {
            output[j]   = input[i] & 0xff;
            output[j+1] = (input[i] >> 8) & 0xff;
            output[j+2] = (input[i] >> 16) & 0xff;
            output[j+3] = (input[i] >> 24) & 0xff;
        }
    }

    bool finalized;
    uint1 buffer[blocksize]; // bytes that didn't fit in last 64 byte chunk
    uint4 count[2];          // 64bit counter for number of bits (lo, hi)
    uint4 state[4];          // digest so far
    uint1 digest[16];        // the result

    // low level logic operations
    static inline uint4 F(uint4 x, uint4 y, uint4 z) { return (x&y) | (~x&z); }
    static inline uint4 G(uint4 x, uint4 y, uint4 z) { return (x&z) | (y&~z); }
    static inline uint4 H(uint4 x, uint4 y, uint4 z) { return x^y^z; }
    static inline uint4 I(uint4 x, uint4 y, uint4 z) { return y ^ (x | ~z); }

    static inline uint4 rotate_left(uint4 x, int n)
    {
        return (x << n) | (x >> (32-n));
    }

    static inline void FF(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac)
    {
        a = rotate_left(a + F(b,c,d) + x + ac, s) + b;
    }
    static inline void GG(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac)
    {
        a = rotate_left(a + G(b,c,d) + x + ac, s) + b;
    }
    static inline void HH(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac)
    {
        a = rotate_left(a + H(b,c,d) + x + ac, s) + b;
    }
    static inline void II(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac)
    {
        a = rotate_left(a + I(b,c,d) + x + ac, s) + b;
    }
};

//////////////////////////////

// inline (header-defined): multiple inclusion previously violated the ODR.
// Takes const& -- the original pass-by-value copy was unnecessary (hexdigest() is const).
inline std::ostream& operator<<(std::ostream& out, const MD5& md5)
{
    return out << md5.hexdigest();
}

//////////////////////////////

// one-shot convenience wrapper; const& avoids a needless string copy
inline std::string md5(const std::string& str)
{
    MD5 hasher(str);
    return hasher.hexdigest();
}
#endif
x : -x; 42 | } 43 | 44 | template 45 | inline float Average(const T* data, int dim) { 46 | return std::accumulate(data,data + dim, 0.f) / (float)dim; 47 | } 48 | 49 | template 50 | float Variance(const T* data, int dim) { 51 | if (dim <= 1) 52 | return 0; 53 | float ave = std::accumulate(data, data + dim,0.f) / (float)dim; 54 | double var(0); 55 | for (int i = 0; i < dim; i++) 56 | var += (data[i] - ave) * (data[i] - ave); 57 | return (float)(sqrt(var / (dim - 1))); 58 | } 59 | 60 | inline float trunc_weight(float w, float gravity){ 61 | if (w > 0) 62 | return (gravity < w) ? w - gravity : 0.f; 63 | else 64 | return (gravity < -w) ? w + gravity : 0.f; 65 | } 66 | inline float trunc_weight2(float w, float gravity){ 67 | if (w > 0) 68 | return (gravity < w) ? -gravity : -w; 69 | else 70 | return (gravity < -w) ? gravity : -w; 71 | } 72 | 73 | inline void ToUpperCase(string &str) { 74 | string dst_str; 75 | int len = str.length(); 76 | for (int i = 0; i < len; i++) 77 | dst_str.push_back(toupper(str[i])); 78 | std::swap(str,dst_str); 79 | } 80 | 81 | inline void ToLowerCase(string &str) { 82 | string dst_str; 83 | int len = str.length(); 84 | for (int i = 0; i < len; i++) 85 | dst_str.push_back(tolower(str[i])); 86 | std::swap(str,dst_str); 87 | } 88 | 89 | 90 | 91 | inline double get_current_time(){ 92 | #if _WIN32 93 | return GetTickCount() / 1000.0; 94 | #else 95 | struct timeval tim; 96 | gettimeofday(&tim, NULL); 97 | return tim.tv_sec + tim.tv_usec / 1000000.0; 98 | #endif 99 | } 100 | #endif 101 | -------------------------------------------------------------------------------- /src/data/Cacher.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: test.cpp 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Mon 04 Nov 2013 09:50:06 PM 5 | > Descriptions: 6 | ************************************************************************/ 7 | 
#if defined(_MSC_VER) && defined(_DEBUG) 8 | #define _CRTDBG_MAP_ALLOC 9 | #include 10 | #include 11 | #endif 12 | 13 | #include "libsvmread.h" 14 | #include "libsvm_binary.h" 15 | #include "DataSet.h" 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | using namespace SOL; 22 | void Usage(){ 23 | cout<<"Usage: Cache input_file output_file [-d]"< dt; 80 | dt.Load(input_file,output_file); 81 | size_t dataNum = 0; 82 | 83 | if(dt.Rewind()){ 84 | while(1){ 85 | const DataChunk chunk = dt.GetChunk(); 86 | dataNum += chunk.dataNum; 87 | if (chunk.dataNum == 0){ 88 | dt.FinishRead(); 89 | break; 90 | } 91 | dt.FinishRead(); 92 | } 93 | } 94 | } 95 | 96 | void Cache(const string &input_file, const string &output_file){ 97 | cout<<"Caching file..."< data; 108 | size_t dataNum = 0; 109 | size_t featNum = 0; 110 | while(reader.GetNextData(data) == true){ 111 | dataNum++; 112 | featNum += data.indexes.size(); 113 | if(!(data.label == 1 || data.label == -1)){ 114 | cout<<"data index "< 0 ? true : false; 142 | std::ofstream writer; 143 | if (is_write){ 144 | writer.open(output_file.c_str(), ios::out); 145 | if (writer.good() == false){ 146 | cerr<<"open output file" < data; 151 | size_t dataNum = 0; 152 | size_t featNum = 0; 153 | while(reader.GetNextData(data) == true){ 154 | dataNum++; 155 | featNum += data.indexes.size(); 156 | if (is_write){ 157 | writer< File Name: DataPoint.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/8/18 星期日 20:13:31 5 | > Functions: Data Point Definition 6 | ************************************************************************/ 7 | 8 | #pragma once 9 | 10 | #include "s_array.h" 11 | #include "../common/init_param.h" 12 | 13 | #include 14 | 15 | namespace SOL { 16 | /** 17 | * Definitions of DataPoint: one lable, and DataPoints 18 | * 19 | * @tparam DataType 20 | */ 21 | template 22 | class DataPoint { 23 | public: 24 | //////////////Member Variables 25 | s_array indexes; 26 | s_array features; 27 | LabelType label; 28 | FeatType 
sum_sq; //sum of square 29 | 30 | //for copy and release control 31 | int *count; 32 | 33 | IndexType max_index; //max index, also the dimension 34 | public: 35 | DataPoint() { 36 | this->count = new int; 37 | *count = 1; 38 | this->max_index = 0; 39 | this->label = 0; 40 | this->sum_sq = 0; 41 | } 42 | 43 | //copy constructor 44 | DataPoint(const DataPoint &point) { 45 | this->indexes = point.indexes; 46 | this->features = point.features; 47 | this->label = point.label; 48 | this->count = point.count; 49 | this->max_index = point.max_index; 50 | this->sum_sq = 0; 51 | ++(*count); 52 | } 53 | 54 | ~DataPoint(){ 55 | this->release(); 56 | } 57 | 58 | //assignment 59 | DataPoint& operator= 60 | (const DataPoint &data) { 61 | if (data.count == this->count) 62 | return *this; 63 | this->release(); 64 | 65 | this->indexes = data.indexes; 66 | this->features = data.features; 67 | this->label = data.label; 68 | this->max_index = data.max_index; 69 | this->sum_sq = data.sum_sq; 70 | this->count = data.count; 71 | ++(*count); 72 | return *this; 73 | } 74 | //set new index-value pair 75 | void AddNewFeat(const IndexType &index, 76 | const FeatType &feat) { 77 | this->indexes.push_back(index); 78 | this->features.push_back(feat); 79 | if(this->max_index < index){ 80 | this->max_index = index; 81 | } 82 | this->sum_sq += feat * feat; 83 | } 84 | 85 | void erase() { 86 | this->indexes.erase(); 87 | this->features.erase(); 88 | this->max_index = 0; 89 | this->sum_sq = 0; 90 | } 91 | 92 | 93 | DataPoint clone() const{ 94 | DataPoint newPt; 95 | newPt.label = this->label; 96 | newPt.max_index = this->max_index; 97 | newPt.sum_sq = this->sum_sq; 98 | newPt.indexes.resize(this->indexes.size()); 99 | memcpy(newPt.indexes.begin,this->indexes.begin, this->indexes.size() * sizeof(IndexType) ); 100 | newPt.features.resize(this->features.size()); 101 | memcpy(newPt.features.begin, this->features.begin, this->features.size() * sizeof(FeatType)); 102 | return newPt; 103 | } 104 | 105 | 
IndexType dim() const {return this->max_index;} 106 | 107 | private: 108 | void release() { 109 | --(*count); 110 | if (*count == 0) 111 | delete count; 112 | this->count = NULL; 113 | } 114 | 115 | }; 116 | template 117 | struct DataChunk{ 118 | DataPoint data[init_chunk_size]; 119 | size_t dataNum; 120 | bool is_inuse; 121 | bool is_parsed; 122 | DataChunk *next; 123 | 124 | DataChunk():dataNum(0),next(NULL), is_inuse(false), is_parsed(false){ 125 | } 126 | void erase() { 127 | for (size_t i = 0; i < dataNum; i++) 128 | data[i].erase(); 129 | dataNum = 0; 130 | } 131 | }; 132 | 133 | } 134 | -------------------------------------------------------------------------------- /src/data/DataReader.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: DataReader.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 8/21/2013 Wednesday 4:48:28 PM 5 | > Functions: Interface for data reader 6 | ************************************************************************/ 7 | 8 | #pragma once 9 | 10 | 11 | #include "DataPoint.h" 12 | #include 13 | 14 | namespace SOL { 15 | template 16 | class DataReader { 17 | public: 18 | virtual ~DataReader(){} 19 | public: 20 | /** 21 | * OpenReading: Open a dataset file and get it prepared to be read 22 | * 23 | * @Return: true if everything is ok 24 | */ 25 | virtual bool OpenReading() = 0; 26 | /** 27 | * GetNextData: for loading data sequentially 28 | * 29 | * @Param data: the variable to place the loaded data 30 | * 31 | * @Return: true if everything is ok 32 | */ 33 | virtual bool GetNextData(DataPoint &data) = 0; 34 | /** 35 | * Rewind: Rewind the dataset to the beginning of the file 36 | */ 37 | virtual void Rewind() = 0; 38 | 39 | /** 40 | * Close: Close the dataset when finished loading data 41 | */ 42 | virtual void Close() = 0; 43 | 44 | /** 45 | * Good : test the status of the data reader 46 | * 47 | * 
@Return: true if everything is ok 48 | */ 49 | virtual bool Good() = 0; 50 | }; 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/data/DataSet.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: DataSet.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/8/18 星期日 15:38:09 5 | > Functions: Class to interact with datasets 6 | ************************************************************************/ 7 | 8 | #pragma once 9 | 10 | 11 | #if WIN32 12 | #include 13 | #endif 14 | 15 | #include "DataSetHelper.h" 16 | #include "DataPoint.h" 17 | #include "DataReader.h" 18 | #include "libsvm_binary.h" 19 | #include "libsvmread.h" 20 | #include "../common/util.h" 21 | 22 | #include "thread_primitive.h" 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | using namespace std; 29 | 30 | /** 31 | * namespace: Sparse Online Learning 32 | */ 33 | namespace SOL { 34 | //data set, can work in both read-and-write mode and read-once mode 35 | template class DataSet { 36 | private: 37 | string fileName; 38 | string cache_fileName; 39 | bool is_cache; 40 | 41 | size_t bufSize; //buffer to load data 42 | size_t passNum; //number of passes 43 | size_t dataNum; //total data number 44 | 45 | size_t curChunkNum; //data number in buffer 46 | 47 | //pointer to the first element, circlar linked list will be used 48 | DataChunk *head; 49 | DataChunk *wt_ptr; //pointer to the write location 50 | DataChunk *rd_ptr; //pointer to the read location 51 | 52 | bool load_finished; //this is used for GetChunk to test if current loading has finished 53 | bool is_on_loading; //this is used for Rewind to test if rewind can be performed 54 | 55 | DataReader *reader; 56 | 57 | //thread-safety 58 | MUTEX data_lock; 59 | CV data_available; 60 | CV buffer_full; 61 | 62 | public: 63 | DataSet(size_t passes = 1, int buf_size = 
-1) { 64 | this->head = NULL; 65 | this->wt_ptr = NULL; 66 | this->rd_ptr = NULL; 67 | 68 | this->passNum = passes > 0 ? passes : 1; 69 | this->dataNum = 0; 70 | this->curChunkNum = 0; 71 | 72 | this->load_finished = false; 73 | this->is_on_loading = false; 74 | this->reader = NULL; 75 | this->is_cache = false; 76 | 77 | this->CreateBuffer(buf_size); 78 | 79 | //init thread-safety 80 | initialize_mutex(&this->data_lock); 81 | initialize_condition_variable(&data_available); 82 | initialize_condition_variable(&buffer_full); 83 | } 84 | ~DataSet() { 85 | delete_mutex(&data_lock); 86 | if (this->reader != NULL) 87 | delete this->reader; 88 | this->reader = NULL; 89 | this->ReleaseBuffer(); 90 | } 91 | 92 | private: 93 | bool CreateBuffer(int buf_size = 0) { 94 | this->ReleaseBuffer(); 95 | this->bufSize = buf_size > 0 ? buf_size : init_buf_size; 96 | if (this->bufSize <= 0) 97 | return true; 98 | 99 | this->head = new DataChunk; 100 | DataChunk *p = this->head; 101 | for (size_t i = 1; i < this->bufSize; i++) { 102 | p->next = new DataChunk; 103 | p = p->next; 104 | } 105 | p->next = this->head; 106 | this->wt_ptr = this->head; 107 | this->rd_ptr = this->head; 108 | 109 | return true; 110 | } 111 | 112 | private: 113 | void ClearBuffer() { 114 | DataChunk *p = this->head; 115 | if (p == NULL) 116 | return; 117 | p = p->next; 118 | while (p != this->head) { 119 | p->erase(); 120 | p = p->next; 121 | } 122 | p->erase(); 123 | this->dataNum = 0; 124 | this->curChunkNum = 0; 125 | this->wt_ptr = this->head; 126 | this->rd_ptr = this->head; 127 | } 128 | 129 | void ReleaseBuffer() { 130 | DataChunk *p = this->head; 131 | if (p == NULL) 132 | return; 133 | DataChunk *q = p->next; 134 | while (q != this->head) { 135 | p = q->next; 136 | delete q; 137 | q = p; 138 | } 139 | delete this->head; 140 | this->head = NULL; 141 | this->wt_ptr = NULL; 142 | this->rd_ptr = NULL; 143 | this->dataNum = 0; 144 | } 145 | 146 | public: 147 | template friend bool CacheLoad(DataSet *dataset); 
148 | #if WIN32 149 | template friend DWORD WINAPI thread_LoadData(LPVOID param); 150 | #else 151 | template friend void* thread_LoadData(void* param); 152 | #endif 153 | 154 | //bind a data reader to the dataset 155 | bool Load(const string& filename, const string& cache_filename) { 156 | this->fileName = filename; 157 | this->cache_fileName = cache_filename; 158 | 159 | if (this->reader != NULL) 160 | delete this->reader; 161 | this->reader = NULL; 162 | 163 | if (SOL_ACCESS(this->cache_fileName.c_str()) == 0){ //already cached 164 | this->is_cache = false; 165 | this->reader = new libsvm_binary_(this->cache_fileName); 166 | } 167 | else if(SOL_ACCESS(this->fileName.c_str()) == 0){ 168 | this->reader = new LibSVMReader_(this->fileName); 169 | if (this->cache_fileName.length() == 0 && this->passNum > 1){ 170 | this->cache_fileName = "cache_file"; 171 | #if WIN32 172 | string cmd = "del " + this->cache_fileName; 173 | #else 174 | string cmd = "rm " + this->cache_fileName; 175 | #endif 176 | system(cmd.c_str()); 177 | this->is_cache = true; 178 | } 179 | else if (this->cache_fileName.length() > 0) 180 | this->is_cache = true; 181 | } 182 | else 183 | return false; 184 | 185 | if (this->reader != NULL){ 186 | if (this->reader->OpenReading() == false){ 187 | delete this->reader; 188 | this->reader = NULL; 189 | return false; 190 | } 191 | } 192 | 193 | return true; 194 | } 195 | 196 | /////////////Data Access///////////////////// 197 | public: 198 | 199 | //get the next write chunk 200 | inline DataChunk &GetWriteChunk(){ 201 | mutex_lock(&this->data_lock); 202 | if (this->wt_ptr->is_inuse == false){ 203 | this->wt_ptr->is_inuse = true; 204 | DataChunk* p = this->wt_ptr; 205 | mutex_unlock(&this->data_lock); 206 | return *p; 207 | } 208 | else{ 209 | condition_variable_wait(&this->buffer_full,&this->data_lock); 210 | mutex_unlock(&this->data_lock); 211 | return this->GetWriteChunk(); 212 | } 213 | } 214 | 215 | inline void EndWriteChunk(){ 216 | 
mutex_lock(&this->data_lock); 217 | this->wt_ptr->is_parsed = true; 218 | this->dataNum += this->wt_ptr->dataNum; 219 | //if (this->wt_ptr->dataNum == 0){ 220 | // cout<<"chunk size is zero!"<wt_ptr = this->wt_ptr->next; 223 | condition_variable_signal_all(&this->data_available); 224 | mutex_unlock(&this->data_lock); 225 | } 226 | 227 | inline void FinishParse(){ 228 | //notice that the all the data has been loaded 229 | mutex_lock(&this->data_lock); 230 | this->load_finished = true; 231 | this->is_on_loading = false; 232 | condition_variable_signal_all(&this->data_available); 233 | mutex_unlock(&this->data_lock); 234 | } 235 | 236 | //get the data to read 237 | inline const DataChunk& GetChunk() { 238 | mutex_lock(&this->data_lock); 239 | //check if there is available data 240 | if (this->rd_ptr->is_parsed == true){ 241 | this->rd_ptr->is_parsed = false; 242 | mutex_unlock(&this->data_lock); 243 | return *(this->rd_ptr); 244 | } 245 | else{ //no available data 246 | if (this->load_finished == true){ 247 | this->rd_ptr->is_parsed = false; 248 | this->rd_ptr->erase(); 249 | mutex_unlock(&this->data_lock); 250 | return *(this->rd_ptr); //return an invalid data 251 | } 252 | else{ //suspend the current thread 253 | condition_variable_wait(&this->data_available,&this->data_lock); 254 | mutex_unlock(&this->data_lock); 255 | return this->GetChunk(); 256 | } 257 | } 258 | } 259 | 260 | void FinishRead() { 261 | mutex_lock(&this->data_lock); 262 | this->rd_ptr->is_inuse = false; 263 | //notice that the last data have been processed 264 | this->rd_ptr = this->rd_ptr->next; 265 | condition_variable_signal_all(&this->buffer_full); 266 | mutex_unlock(&this->data_lock); 267 | } 268 | 269 | //the number of features 270 | inline size_t size() const {return this->dataNum; } 271 | bool Rewind() { 272 | mutex_lock(&this->data_lock); 273 | if (this->is_on_loading == true) { 274 | cout<<"data is on loading"<data_lock); 276 | return false; 277 | } 278 | reader->Rewind(); 279 | 
this->ClearBuffer(); 280 | this->load_finished = false; 281 | this->is_on_loading = true; 282 | mutex_unlock(&this->data_lock); 283 | 284 | #if WIN32 285 | HANDLE thread = ::CreateThread(NULL, 0, static_cast(thread_LoadData), this, NULL, NULL); 286 | #else 287 | pthread_t thread; 288 | pthread_create(&thread,NULL,thread_LoadData,this); 289 | #endif 290 | return true; 291 | } 292 | }; 293 | } 294 | -------------------------------------------------------------------------------- /src/data/DataSetHelper.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: DataSetHelper.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Thu 24 Oct 2013 03:33:10 PM 5 | > Descriptions: thread function definitions 6 | ************************************************************************/ 7 | #pragma once 8 | 9 | 10 | #include "libsvm_binary.h" 11 | #include "thread_primitive.h" 12 | 13 | namespace SOL{ 14 | template class DataSet; 15 | 16 | //load a chunk of data, return if file ended 17 | template 18 | bool load_chunk(DataReader* reader, DataChunk&chunk){ 19 | bool not_file_end = true; 20 | chunk.erase(); 21 | while(chunk.dataNum < init_chunk_size && not_file_end == true){ 22 | DataPoint &data = chunk.data[chunk.dataNum]; 23 | not_file_end = reader->GetNextData(data); 24 | if (not_file_end == true) 25 | chunk.dataNum++; 26 | else 27 | break; 28 | } 29 | return not_file_end; 30 | } 31 | 32 | //save chunk to disk 33 | template 34 | bool save_chunk(libsvm_binary_ *writer, DataChunk&chunk){ 35 | size_t w_num = 0; 36 | while(w_num < chunk.dataNum){ 37 | if (writer->WriteData(chunk.data[w_num]) == true) 38 | w_num++; 39 | else 40 | return false; 41 | } 42 | return false; 43 | } 44 | 45 | template 46 | libsvm_binary_* get_cacher(const std::string &cache_filename){ 47 | string tmpFileName = cache_filename + ".writing"; 48 | libsvm_binary_* cacher = new 
libsvm_binary_(tmpFileName); 49 | if (cacher->OpenWriting() == false){ 50 | cerr<<"Open cache file failed!"< 58 | bool end_cache(libsvm_binary_**cacher, const std::string& cache_fileName){ 59 | string tmpFileName = (*cacher)->get_filename(); 60 | (*cacher)->Close(); 61 | delete *cacher; 62 | *cacher = NULL; 63 | 64 | //rename 65 | #if WIN32 66 | string cmd = "ren \""; 67 | cmd = cmd + tmpFileName + "\" \""; 68 | //in windows, the second parameter of ren should not include path 69 | cmd = cmd + cache_fileName.substr(cache_fileName.find_last_of("/\\") + 1) + "\""; 70 | #else 71 | string cmd = "mv \""; 72 | cmd = cmd + tmpFileName + "\" \""; 73 | cmd = cmd + cache_fileName + "\""; 74 | #endif 75 | 76 | if(system(cmd.c_str()) != 0){ 77 | cerr<<"rename cahe file name failed!"< 84 | bool CacheLoad(DataSet *dataset){ 85 | DataReader* reader = dataset->reader; 86 | reader->Rewind(); 87 | if (reader->Good() == false) { 88 | cerr<<"reader is incorrect!"<* writer = get_cacher(dataset->cache_fileName); 93 | if (writer == NULL) 94 | return false; 95 | 96 | //load data 97 | bool not_file_end = false; 98 | do { 99 | DataChunk &chunk = dataset->GetWriteChunk(); 100 | not_file_end = load_chunk(reader, chunk); 101 | save_chunk(writer, chunk); 102 | dataset->EndWriteChunk(); 103 | }while(not_file_end == true); 104 | 105 | return end_cache(&writer, dataset->cache_fileName); 106 | } 107 | 108 | template 109 | #if WIN32 110 | DWORD WINAPI thread_LoadData(LPVOID param) 111 | #else 112 | void* thread_LoadData(void* param) 113 | #endif 114 | { 115 | DataSet* dataset = static_cast*>(param); 116 | DataReader* reader = dataset->reader; 117 | 118 | size_t pass = 0; 119 | if (dataset->is_cache == true){ 120 | if(CacheLoad(dataset) == false){ 121 | cerr<<"caching data failed!"<FinishParse(); 123 | return NULL; 124 | } 125 | dataset->reader->Close(); 126 | delete dataset->reader; 127 | //load cache file 128 | dataset->reader = new libsvm_binary_(dataset->cache_fileName); 129 | if 
(dataset->reader->OpenReading() == false){ 130 | cerr<<"load cache data failed!"<FinishParse(); 132 | return NULL; 133 | } 134 | reader = dataset->reader; 135 | dataset->is_cache = false; 136 | pass++; 137 | } 138 | //load cache 139 | for (;pass < dataset->passNum; pass++) { 140 | reader->Rewind(); 141 | if (reader->Good()) { 142 | bool not_file_end = false; 143 | do { 144 | DataChunk &chunk = dataset->GetWriteChunk(); 145 | not_file_end = load_chunk(reader, chunk); 146 | dataset->EndWriteChunk(); 147 | }while(not_file_end == true); 148 | } 149 | else { 150 | cerr<<"reader is incorrect!"<FinishParse(); 155 | return NULL; 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/data/MNISTConvert.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: Convert.cpp 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Thu 14 Nov 2013 06:49:38 PM 5 | > Descriptions: Convert other file formats to LIBSVM 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | using namespace std; 11 | 12 | #include "MNISTReader.h" 13 | using namespace SOL; 14 | #define FeatType float 15 | #define LabelType char 16 | //define your own data reader here 17 | #define ReaderType MNISTReader 18 | 19 | int main(int argc, char** args){ 20 | if (argc != 6){ 21 | cout<<"Usage: MNISTConvert train_file label_file digit1 digit2 output_file"< data; 32 | if (reader.OpenReading() == false) { 33 | return -1; 34 | } 35 | ofstream outFile(outfilename.c_str(), ios::out); 36 | if (!outFile){ 37 | cerr<<"open file "< File Name: MNISTReader.h 4 | 5 | > Copyright (C) 2013 Yue Wu 6 | 7 | > Created Time: 2013/8/18 Sunday 20:25:28 8 | 9 | > Functions: MNIST reader 10 | 11 | ************************************************************************/ 12 | 13 | #ifndef HEADER_MINST_READER 14 
| #define HEADER_MINST_READER 15 | #include "DataReader.h" 16 | 17 | #include 18 | using std::ios_base; 19 | using std::ios; 20 | using std::ifstream; 21 | using std::string; 22 | 23 | namespace SOL { 24 | 25 | template 26 | int MSB2LSB(T x) { 27 | int y = x; 28 | int byteNum = sizeof(x); 29 | char *buf = new char[byteNum]; 30 | char *buf1 = new char[byteNum]; 31 | memcpy(buf, &x, byteNum); 32 | for (int i = 0; i < byteNum; i++) 33 | buf1[i] = buf[byteNum - 1 - i]; 34 | memcpy(&y,buf1, byteNum); 35 | 36 | delete []buf; 37 | delete []buf1; 38 | return y; 39 | } 40 | 41 | 42 | template 43 | class MNISTReader: public DataReader { 44 | private: 45 | ifstream inTrainFile, inLabelFile; 46 | string trainFileName, labelFileName; 47 | int num1, num2; 48 | 49 | int featDim; 50 | int imgNum; 51 | 52 | std::streamoff trainFileStartPos; 53 | std::streamoff labelFileStartPos; 54 | 55 | unsigned char* rd_buf; 56 | 57 | public: 58 | MNISTReader(const string &trainFile, const string &labelFile, 59 | int digit1 = -1, int digit2 = -1): 60 | trainFileName(trainFile),labelFileName(labelFile), 61 | num1(digit1),num2(digit2) { 62 | featDim = 0; 63 | imgNum = 0; 64 | rd_buf = NULL; 65 | } 66 | 67 | ~MNISTReader() { 68 | this->Close(); 69 | if (rd_buf != NULL) 70 | delete []this->rd_buf; 71 | } 72 | 73 | public: 74 | virtual bool OpenReading() { 75 | this->Close(); 76 | if (this->rd_buf != NULL) 77 | delete []this->rd_buf; 78 | this->rd_buf = NULL; 79 | 80 | inTrainFile.open(trainFileName.c_str(), ios::in | ios::binary); 81 | if(!inTrainFile) { 82 | printf("can't open input file %s\n",trainFileName.c_str()); 83 | return false; 84 | } 85 | 86 | inLabelFile.open(labelFileName.c_str(), ios::in | ios::binary); 87 | if(!inLabelFile) { 88 | printf("can't open input file %s\n",labelFileName.c_str()); 89 | return false; 90 | 91 | } 92 | 93 | return this->GetFeatInfo(); 94 | } 95 | 96 | virtual void Rewind() { 97 | if(this->inTrainFile.is_open() == true) 98 | 
this->inTrainFile.seekg(trainFileStartPos,ios_base::beg); 99 | 100 | if(this->inLabelFile.is_open() == true) 101 | this->inLabelFile.seekg(labelFileStartPos,ios_base::beg); 102 | 103 | } 104 | virtual void Close() { 105 | this->inTrainFile.close(); 106 | this->inLabelFile.close(); 107 | } 108 | virtual bool Good() { 109 | if (this->inTrainFile.good() || this->inLabelFile.good() || 110 | this->inTrainFile.eof() || this->inLabelFile.eof()) 111 | return true; 112 | return false; 113 | } 114 | 115 | virtual bool GetNextData(DataPoint &data) { 116 | if (num1 == -1 || num2 == -1) 117 | return ReadData(data); 118 | else { 119 | while(ReadData(data)) { 120 | if (data.label == num1) { 121 | data.label = 1; 122 | return true; 123 | } 124 | else if (data.label == num2) { 125 | data.label = -1; 126 | return true; 127 | } 128 | } 129 | return false; 130 | } 131 | } 132 | 133 | private: 134 | bool ReadData(DataPoint &data) { 135 | if (!inTrainFile.good() || !inLabelFile.good()) 136 | return false; 137 | 138 | data.erase(); 139 | //get next label 140 | char label; 141 | inLabelFile.read((char*)&label,sizeof(char)); 142 | //get feature 143 | inTrainFile.read((char*)this->rd_buf,sizeof(unsigned char) *featDim); 144 | 145 | for (int i = 0; i < featDim; i++) { 146 | if (this->rd_buf[i] != 0) 147 | data.AddNewFeat(i + 1,this->rd_buf[i]); 148 | } 149 | data.label = label; 150 | return true; 151 | } 152 | 153 | 154 | bool GetFeatInfo() { 155 | inTrainFile.seekg(0,ios_base::beg); 156 | inLabelFile.seekg(0,ios_base::beg); 157 | 158 | if(!inTrainFile || !inLabelFile) 159 | return false; 160 | 161 | //label file 162 | //Load header 163 | int magicNum(0); 164 | inLabelFile.seekg(0,ios_base::beg); 165 | inLabelFile.read((char*)&magicNum, sizeof(int)); 166 | magicNum = MSB2LSB(magicNum); 167 | if (magicNum != 2049) { 168 | std::cerr<<"Incorrect file!"< 0) 207 | this->rd_buf = new unsigned char[featDim]; 208 | 209 | return true; 210 | } 211 | }; 212 | } 213 | #endif 214 | 
-------------------------------------------------------------------------------- /src/data/basic_io.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: basic_io.cpp 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Wed 06 Nov 2013 03:49:24 PM 5 | > Descriptions: most basic io handler, work with FILE 6 | ************************************************************************/ 7 | 8 | #include "basic_io.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace SOL{ 16 | bool basic_io::open_file(const char* filename, const char* mode){ 17 | this->close_file(); 18 | #if _WIN32 19 | errno_t ret = fopen_s(&file,filename, mode); 20 | if ( ret != 0){ 21 | printf("error %d: can't open input file %s\n",ret,filename); 22 | return false; 23 | } 24 | 25 | #else 26 | file = fopen(filename, mode); 27 | if (file == NULL){ 28 | fprintf(stderr,"open file failed!"); 29 | return false; 30 | } 31 | #endif 32 | if (this->good() != 0){ 33 | this->close_file(); 34 | return false; 35 | } 36 | return true; 37 | } 38 | 39 | // bind_stdin: bind the input to stdin 40 | bool basic_io::open_stdin(){ 41 | file = stdin; 42 | return true; 43 | } 44 | 45 | // bind_stdin: bind the output to stdout 46 | bool basic_io::open_stdout(){ 47 | file = stdout; 48 | return true; 49 | } 50 | 51 | void basic_io::close_file(){ 52 | if (file != NULL && file != stdin && file != stdout){ 53 | fclose(file); 54 | } 55 | file = NULL; 56 | } 57 | 58 | void basic_io::rewind(){ 59 | if (file != NULL) 60 | std::rewind(file); 61 | } 62 | /** 63 | * good : test if the io is good 64 | * 65 | * @Return: zero if correct, else zero code 66 | */ 67 | int basic_io::good(){ 68 | return ferror(file); 69 | } 70 | 71 | 72 | /** 73 | * read_data : read the data from file 74 | * 75 | * @Param dst: container to place the read data 76 | * @Param length: length of data of read in 
bytes 77 | * 78 | * @Return: true if succeed 79 | */ 80 | bool basic_io::read_data(char* dst, size_t length){ 81 | return fread(dst, 1, length, file) == length; 82 | } 83 | 84 | /** 85 | * read_line : read a line from disk 86 | * 87 | * @Param dst: container to place the read data 88 | * @Param dst_len: length of dst 89 | * 90 | * @Return: size of data read in bytes 91 | */ 92 | char* basic_io::read_line(char* &dst, size_t &dst_len){ 93 | size_t len; 94 | if(fgets(dst,dst_len,file) == NULL) 95 | return NULL; 96 | while(strrchr(dst,'\n') == NULL) { 97 | dst_len *= 2; 98 | dst = (char *) realloc(dst, dst_len); 99 | len = strlen(dst); 100 | if(fgets(dst+len,dst_len-len,file) == NULL) 101 | break; 102 | } 103 | return dst; 104 | } 105 | 106 | /** 107 | * write_data : write content to disk 108 | * 109 | * @Param src: source of the data 110 | * @Param length: length to write the data 111 | * 112 | * @Return: true of succeed 113 | */ 114 | bool basic_io::write_data(char* src, size_t length){ 115 | return fwrite(src, 1, length, file) == length; 116 | } 117 | } 118 | 119 | 120 | -------------------------------------------------------------------------------- /src/data/basic_io.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: basic_io.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Wed 06 Nov 2013 03:44:46 PM 5 | > Descriptions: most basic io handler, work with FILE 6 | ************************************************************************/ 7 | 8 | #ifndef HEADER_BASIC_IO 9 | #define HEADER_BASIC_IO 10 | 11 | #include "io_interface.h" 12 | #include 13 | 14 | namespace SOL{ 15 | class basic_io: public io_interface { 16 | private: 17 | FILE* file; 18 | 19 | public: 20 | basic_io():file(NULL){} 21 | virtual ~basic_io(){ 22 | this->close_file(); 23 | } 24 | 25 | public: 26 | virtual bool open_file(const char* filename, const char* mode); 27 | 
// bind_stdin: bind the input to stdin 28 | virtual bool open_stdin(); 29 | // bind_stdin: bind the output to stdout 30 | virtual bool open_stdout(); 31 | 32 | virtual void close_file(); 33 | virtual void rewind(); 34 | 35 | /** 36 | * good : test if the io is good 37 | * 38 | * @Return: zero if correct, else zero code 39 | */ 40 | virtual int good(); 41 | 42 | public: 43 | /** 44 | * read_data : read the data from file 45 | * 46 | * @Param dst: container to place the read data 47 | * @Param length: length of data of read in bytes 48 | * 49 | * @Return: true if succeed 50 | */ 51 | virtual bool read_data(char* dst, size_t length); 52 | 53 | /** 54 | * read_line : read a line from disk 55 | * 56 | * @Param dst: container to place the read data 57 | * @Param dst_len: length of dst 58 | * 59 | * @Return: pointer to the read line, null if failed 60 | */ 61 | virtual char* read_line(char* &dst, size_t &dst_len); 62 | 63 | /** 64 | * write_data : write content to disk 65 | * 66 | * @Param src: source of the data 67 | * @Param length: length to write the data 68 | * 69 | * @Return: true if succeed 70 | */ 71 | virtual bool write_data(char* src, size_t length); 72 | }; 73 | } 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /src/data/comp.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: comp.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Thu 07 Nov 2013 11:01:37 PM 5 | > Descriptions: compression algorithms 6 | ************************************************************************/ 7 | 8 | #ifndef HEADER_COMP_ALGO 9 | #define HEADER_COMP_ALGO 10 | 11 | #include "DataPoint.h" 12 | #include 13 | #include 14 | 15 | namespace SOL{ 16 | 17 | inline uint32_t ZigZagEncode(int32_t n) { 18 | uint32_t ret = (n << 1) ^ (n >> 31); 19 | return ret; 20 | } 21 | inline int32_t 
ZigZagDecode(uint32_t n) { 22 | return (n >> 1) ^ -static_cast(n & 1); 23 | } 24 | 25 | //encode an unsigned int with run length encoding 26 | //if encode signed int, first map it to unsigned with ZigZag Encoding 27 | inline void run_len_encode(s_array &codes, uint32_t i){ 28 | // store an int 7 bits at a time. 29 | while (i >= 128) { 30 | codes.push_back((i & 127) | 128); 31 | i = i >> 7; 32 | } 33 | codes.push_back((i & 127)); 34 | } 35 | 36 | inline char* run_len_decode(char* p, uint32_t& i) { // read an int 7 bits at a time. 37 | size_t count = 0; 38 | while(*p & 128)\ 39 | i = i | ((*(p++) & 127) << 7*count++); 40 | i = i | (*(p++) << 7*count); 41 | return p; 42 | } 43 | 44 | 45 | /** 46 | * comp : compress the index list, note that the indexes must be sorted from small to big 47 | * Note: the function will not erase codes by iteself 48 | * 49 | * @Param indexes: indexes to be encoded 50 | * @Param codes: ouput codes 51 | */ 52 | inline void comp_index(const s_array& indexes, s_array &codes){ 53 | uint32_t last = 0; 54 | size_t featNum = indexes.size(); 55 | for (size_t i = 0; i< featNum; i++) { 56 | run_len_encode(codes,indexes[i] - last); 57 | last = indexes[i]; 58 | } 59 | } 60 | 61 | /** 62 | * decomp_index : de-compress the codes to indexes 63 | * 64 | * @Param codes: input codes 65 | * @Param indexes: output indexes 66 | */ 67 | inline void decomp_index(s_array &codes, s_array &indexes){ 68 | indexes.erase(); 69 | uint32_t last = 0; 70 | uint32_t index = 0; 71 | 72 | char* p = codes.begin; 73 | while(p < codes.end){ 74 | index = 0; 75 | p = run_len_decode(p,index); 76 | index += last; 77 | last = index; 78 | indexes.push_back(index); 79 | } 80 | assert(p == codes.end ); 81 | } 82 | } 83 | #endif 84 | -------------------------------------------------------------------------------- /src/data/data_analysis.cpp: -------------------------------------------------------------------------------- 1 | 
/************************************************************************* 2 | > File Name: data_analysis.cpp 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Thu 24 Oct 2013 08:09:38 PM 5 | > Descriptions: analyse the sparsity of data 6 | ************************************************************************/ 7 | #include "DataPoint.h" 8 | #include "DataReader.h" 9 | #include "libsvmread.h" 10 | #include "MNISTReader.h" 11 | 12 | #include 13 | using namespace std; 14 | using namespace SOL; 15 | 16 | template 17 | bool Analyze(DataReader *reader) { 18 | if (reader == NULL){ 19 | cerr<<"data reader is emptyp!"< index_set; 31 | DataPoint data; 32 | if (reader->OpenReading() == true) { 33 | reader->Rewind(); 34 | while(true) { 35 | if (reader->GetNextData(data) == true) { 36 | if (data.indexes.size() == 0) 37 | continue; 38 | if (max_index < data.dim()){ 39 | max_index = data.dim(); 40 | } 41 | size_t prev_size = index_set.size(); 42 | if (max_index > prev_size){ 43 | index_set.reserve(max_index); 44 | index_set.resize(max_index); 45 | //set the new value to zero 46 | index_set.zeros(index_set.begin + prev_size, 47 | index_set.end); 48 | } 49 | for (size_t i = 0; i < data.indexes.size(); i++){ 50 | index_set[data.indexes[i] - 1] = 1; 51 | } 52 | 53 | dataNum++; 54 | if (data.label == 1) 55 | pos_num++; 56 | else if (data.label == -1) 57 | neg_num++; 58 | else{ 59 | cerr<<"\nunrecognized label!"< max_show_count ? 
70 | max_show_count : show_count; 71 | } 72 | } 73 | else 74 | break; 75 | } 76 | } 77 | else { 78 | cerr<<"Can not open file to read!"<Close(); 83 | size_t valid_dim = 0; 84 | for (size_t i = 0; i < index_set.size(); i++) { 85 | if (index_set[i] == 1) 86 | valid_dim++; 87 | } 88 | cout<<"data number : "< 0){ 95 | printf("data sparsity: %.2lf%%\n",100 - valid_dim * 100.0 / max_index); 96 | } 97 | 98 | return true; 99 | } 100 | 101 | int main(int argc, char** args){ 102 | if (argc != 2){ 103 | cout<<"Usage: data_analysis data_file"< File Name: gzip_io.cpp 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Wed 06 Nov 2013 04:01:46 PM 5 | > Descriptions: read and write file in gzip format 6 | ************************************************************************/ 7 | #include "gzip_io.h" 8 | 9 | #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) 10 | # include 11 | # include 12 | # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) 13 | #else 14 | # define SET_BINARY_MODE(file) 15 | #endif 16 | 17 | #include 18 | using namespace std; 19 | 20 | namespace SOL{ 21 | bool gzip_io::open_file(const char* filename, const char* mode){ 22 | this->close_file(); 23 | 24 | file = gzopen(filename, mode); 25 | if (file == NULL){ 26 | cerr<<"open file failed!"<good() != 0){ 30 | this->close_file(); 31 | return false; 32 | } 33 | return true; 34 | } 35 | // bind_stdin: bind the input to stdin 36 | bool gzip_io::open_stdin(){ 37 | file = gzdopen(fileno(stdin),"rb"); 38 | return true; 39 | } 40 | 41 | // bind_stdin: bind the output to stdout 42 | bool gzip_io::open_stdout(){ 43 | file = gzdopen(fileno(stdout),"wb"); 44 | return true; 45 | } 46 | 47 | void gzip_io::close_file(){ 48 | if (file != NULL && file != stdin && file != stdout){ 49 | gzclose(file); 50 | } 51 | file = NULL; 52 | } 53 | 54 | void gzip_io::rewind(){ 55 | if (file != NULL) 56 | gzrewind(file); 57 | } 58 | 59 | /** 60 | * good : test if the io is good 61 | * 62 | * @Return: zero 
if correct, else zero code 63 | */ 64 | int gzip_io::good(){ 65 | int errCode; 66 | const char* errmsg = gzerror(file ,&errCode);; 67 | if (errCode != Z_OK){ 68 | if (gzeof(file) == 1) //eof is not an error 69 | return 0; 70 | printf("%s\n",errmsg); 71 | } 72 | return errCode; 73 | } 74 | 75 | /** 76 | * read_data : read the data from file 77 | * 78 | * @Param dst: container to place the read data 79 | * @Param length: length of data of read in bytes 80 | * 81 | * @Return: true if succeed 82 | */ 83 | bool gzip_io::read_data(char* dst, size_t length){ 84 | return size_t(gzread(file, dst, length)) == length; 85 | } 86 | 87 | /** 88 | * read_line : read a line from disk 89 | * 90 | * @Param dst: container to place the read data 91 | * @Param dst_len: length of dst 92 | * 93 | * @Return: size of data read in bytes 94 | */ 95 | char* gzip_io::read_line(char* &dst, size_t &dst_len){ 96 | printf("error: no read line is supported in gzip io\n"); 97 | return NULL; 98 | } 99 | 100 | /** 101 | * write_data : write content to disk 102 | * 103 | * @Param src: source of the data 104 | * @Param length: length to write the data 105 | * 106 | * @Return: true if succeed 107 | */ 108 | bool gzip_io::write_data(char* src, size_t length){ 109 | return size_t(gzwrite(file, src, length)) == length; 110 | } 111 | } 112 | 113 | 114 | -------------------------------------------------------------------------------- /src/data/gzip_io.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: /home/matthew/work/SOL/src/data/gzip_io.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Wed 06 Nov 2013 04:02:12 PM 5 | > Descriptions: 6 | ************************************************************************/ 7 | #ifndef HEADER_GZIP_IO 8 | #define HEADER_GZIP_IO 9 | 10 | #include "io_interface.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include "zlib.h" 16 | 17 | 
namespace SOL{ 18 | class gzip_io: public io_interface{ 19 | private: 20 | gzFile file; 21 | 22 | public: 23 | gzip_io():file(NULL){} 24 | ~gzip_io(){ 25 | this->close_file(); 26 | } 27 | 28 | 29 | public: 30 | virtual bool open_file(const char* filename, const char* mode); 31 | virtual void close_file(); 32 | virtual void rewind(); 33 | // bind_stdin: bind the input to stdin 34 | virtual bool open_stdin(); 35 | // bind_stdin: bind the output to stdout 36 | virtual bool open_stdout(); 37 | 38 | /** 39 | * good : test if the io is good 40 | * 41 | * @Return: zero if correct, else zero code 42 | */ 43 | virtual int good(); 44 | 45 | public: 46 | /** 47 | * read_data : read the data from file 48 | * 49 | * @Param dst: container to place the read data 50 | * @Param length: length of data of read in bytes 51 | * 52 | * @Return: true if succeed 53 | */ 54 | virtual bool read_data(char* dst, size_t length); 55 | 56 | /** 57 | * read_line : read a line from disk 58 | * 59 | * @Param dst: container to place the read data 60 | * @Param dst_len: length of dst 61 | * 62 | * @Return: pointer to the read line, null if failed 63 | */ 64 | virtual char* read_line(char* &dst, size_t &dst_len); 65 | 66 | /** 67 | * write_data : write content to disk 68 | * 69 | * @Param src: source of the data 70 | * @Param length: length to write the data 71 | * 72 | * @Return: true if succeed 73 | */ 74 | virtual bool write_data(char* src, size_t length); 75 | }; 76 | } 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /src/data/io_handler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: io_handler.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Wed 06 Nov 2013 03:23:19 PM 5 | > Descriptions: handler for io 6 | ************************************************************************/ 7 | 8 | #include 9 | 10 | 
using namespace std; 11 | namespace SOL{ 12 | class io_handler{ 13 | public: 14 | io_handler(){} 15 | ~io_handler(){} 16 | 17 | public: 18 | bool open_file(const char* filename); 19 | void close_file(); 20 | 21 | public: 22 | int read_data(unsigned char* dst, size_t length); 23 | int write_data(unsigned char* src, size_t length); 24 | }; 25 | } 26 | 27 | -------------------------------------------------------------------------------- /src/data/io_interface.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: io_interface.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Wed 06 Nov 2013 03:26:20 PM 5 | > Descriptions: interface definition for io 6 | ************************************************************************/ 7 | #ifndef HEADER_IO_INTERFACE_ 8 | #define HEADER_IO_INTERFACE_ 9 | 10 | #include 11 | 12 | namespace SOL{ 13 | class io_interface{ 14 | public: 15 | virtual bool open_file(const char* filename, const char* mode) = 0; 16 | // bind_stdin: bind the input to stdin 17 | virtual bool open_stdin() = 0; 18 | // bind_stdin: bind the output to stdout 19 | virtual bool open_stdout() = 0; 20 | 21 | virtual void close_file() = 0; 22 | virtual void rewind() = 0; 23 | /** 24 | * good : test if the io is good 25 | * 26 | * @Return: zero if correct, else zero code 27 | */ 28 | virtual int good() = 0; 29 | 30 | public: 31 | /** 32 | * read_data : read the data from file 33 | * 34 | * @Param dst: container to place the read data 35 | * @Param length: length of data of read in bytes 36 | * 37 | * @Return: true if succeed 38 | */ 39 | virtual bool read_data(char* dst, size_t length) = 0; 40 | 41 | /** 42 | * read_line : read a line from disk 43 | * 44 | * @Param dst: container to place the read data 45 | * @Param dst_len: length of dst 46 | * 47 | * @Return: pointer to the read line, null if failed 48 | */ 49 | virtual char* 
read_line(char* &dst, size_t &dst_len) = 0; 50 | 51 | /** 52 | * write_data : write content to disk 53 | * 54 | * @Param src: source of the data 55 | * @Param length: length to write the data 56 | * 57 | * @Return: true if succeed 58 | */ 59 | virtual bool write_data(char* src, size_t length) = 0; 60 | }; 61 | } 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /src/data/libsvm_binary.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: libsvm_binary.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Sat 21 Sep 2013 10:52:41 PM SGT 5 | > Functions: io for binary libsvm dataset 6 | ************************************************************************/ 7 | 8 | #ifndef HEADER_LIBSVM_BINARY 9 | #define HEADER_LIBSVM_BINARY 10 | 11 | 12 | #include "DataReader.h" 13 | #include "basic_io.h" 14 | //#include "zlib_io.h" 15 | //#include "gzip_io.h" 16 | 17 | #include "comp.h" 18 | 19 | #include 20 | 21 | using namespace std; 22 | 23 | namespace SOL { 24 | template 25 | class libsvm_binary_:public DataReader { 26 | private: 27 | std::string fileName; 28 | basic_io io_handler; 29 | //gzip_io io_handler. 30 | //zlib_io io_handler. 
31 | 32 | //compressed codes of indexes 33 | s_array comp_codes; 34 | 35 | public: 36 | libsvm_binary_(const std::string &fileName) { 37 | this->fileName = fileName; 38 | } 39 | 40 | ~libsvm_binary_() { 41 | this->Close(); 42 | } 43 | const std::string& get_filename() const { 44 | return this->fileName; 45 | } 46 | 47 | //////////////////online mode////////////////// 48 | public: 49 | bool OpenReading() { 50 | this->Close(); 51 | return io_handler.open_file(this->fileName.c_str(), "rb"); 52 | } 53 | 54 | bool OpenWriting() { 55 | this->Close(); 56 | return io_handler.open_file(this->fileName.c_str(), "wb"); 57 | } 58 | 59 | void Rewind() { 60 | io_handler.rewind(); 61 | } 62 | 63 | void Close() { 64 | io_handler.close_file(); 65 | } 66 | 67 | inline bool Good() { 68 | return io_handler.good() == 0 ? true : false; 69 | } 70 | 71 | bool GetNextData(DataPoint &data) { 72 | data.erase(); 73 | if (io_handler.read_data((char*)&(data.label),sizeof(LabelType)) == false){ 74 | if (this->Good() == true){ 75 | return false; 76 | } 77 | else{ 78 | cerr<<"unexpected error occured when loading data!"< 0){ 90 | if(io_handler.read_data((char*)&data.max_index,sizeof(IndexType)) == false){ 91 | cerr<<"load max index failed!"<comp_codes.resize(code_len); 101 | if(io_handler.read_data(this->comp_codes.begin, 102 | code_len) == false){ 103 | cerr<<"read coded index failed!"<comp_codes, data.indexes); 107 | if (data.indexes.size() != featNum){ 108 | cerr<<"decoded index number is not correct!"< &data) { 126 | size_t featNum = data.indexes.size(); 127 | if(io_handler.write_data((char*)&data.label,sizeof(LabelType)) == false){ 128 | cerr<<"write label failed!"< 0){ 137 | if(io_handler.write_data((char*)&data.max_index, 138 | sizeof(IndexType)) == false){ 139 | cerr<<"write max index failed!"<comp_codes.erase(); 143 | comp_index(data.indexes, this->comp_codes); 144 | unsigned int code_len = (unsigned int)(this->comp_codes.size()); 145 | if(io_handler.write_data((char*)&code_len, 146 | 
sizeof(unsigned int)) == false){ 147 | cerr<<"write coded index length failed!"<comp_codes.begin, 151 | code_len) == false){ 152 | cerr<<"write coded index failed!"< libsvm_binary; 171 | } 172 | #endif 173 | -------------------------------------------------------------------------------- /src/data/libsvmread.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: libsvmread.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/8/18 星期日 20:25:28 5 | > Functions: libsvm reader 6 | ************************************************************************/ 7 | #pragma once 8 | 9 | #if _WIN32 10 | #define _CRT_SECURE_NO_WARNINGS 11 | #endif 12 | 13 | #include "DataReader.h" 14 | #include "basic_io.h" 15 | #include "parser.h" 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | using namespace std; 26 | 27 | namespace SOL { 28 | template 29 | class LibSVMReader_: public DataReader { 30 | private: 31 | string fileName; 32 | basic_io reader; 33 | 34 | char *line; 35 | size_t max_line_len; 36 | 37 | public: 38 | LibSVMReader_(const string &fileName) { 39 | this->max_line_len = 4096; 40 | this->fileName = fileName; 41 | line = (char *) malloc(max_line_len*sizeof(char)); 42 | } 43 | ~LibSVMReader_() { 44 | this->Close(); 45 | if (line != NULL) 46 | free(line); 47 | } 48 | 49 | //////////////////online mode////////////////// 50 | public: 51 | virtual bool OpenReading() { 52 | this->Close(); 53 | return reader.open_file(this->fileName.c_str(), "r"); 54 | } 55 | virtual void Rewind() { 56 | reader.rewind(); 57 | } 58 | virtual void Close() { 59 | reader.close_file(); 60 | } 61 | 62 | virtual inline bool Good() { 63 | return reader.good() == 0 ? 
true: false; 64 | } 65 | 66 | virtual bool GetNextData(DataPoint &data) { 67 | if(reader.read_line(line, max_line_len) == NULL) 68 | return false; 69 | 70 | LabelType labelVal; 71 | char* p = line, *endptr = NULL; 72 | if (*p == '\0') 73 | return false; 74 | labelVal = (LabelType)parseInt(p,&endptr); 75 | if (endptr == p) { 76 | return false; 77 | } 78 | 79 | data.erase(); 80 | IndexType index; 81 | FeatType feat; 82 | // features 83 | while(1) { 84 | p = strip_line(endptr); 85 | if (*p == '\0') 86 | break; 87 | index = (IndexType)(parseUint(p,&endptr)); 88 | if (endptr == p) { //parse index failed 89 | fprintf(stderr,"parse index value failed!\n%s", p); 90 | return false; 91 | } 92 | 93 | p = endptr; 94 | feat = parseFloat(p,&endptr); 95 | //feat =(float)(strtod(val,&endptr)); 96 | if (endptr == p) { 97 | fprintf(stderr,"parse feature value failed!\n"); 98 | return false; 99 | } 100 | 101 | data.AddNewFeat(index,feat); 102 | } 103 | data.label = labelVal; 104 | 105 | return true; 106 | } 107 | }; 108 | 109 | //for special definition 110 | typedef LibSVMReader_ LibSVMReader; 111 | } 112 | -------------------------------------------------------------------------------- /src/data/makefile: -------------------------------------------------------------------------------- 1 | FLAGS = -g -Wall 2 | #FLAGS = -O2 -s 3 | 4 | LIBS=basic_io.o zlib_io.o gzip_io.o 5 | TARGETS=test 6 | 7 | all: $(TARGETS) 8 | 9 | test:test.o 10 | g++ test.o -o test 11 | 12 | MNISTConvert:MNISTConvert.cpp 13 | g++ $^ $(FLAGS) -o $@ 14 | 15 | analysis:data_analysis.o $(LIBS) 16 | g++ $^ -lz -o $@ 17 | 18 | Cacher:Cacher.o $(LIBS) 19 | g++ $^ -lz -o $@ 20 | 21 | %.o:%.cpp 22 | g++ -c $^ -o $@ $(FLAGS) 23 | 24 | .PHONY: clean 25 | clean: 26 | -rm $(TARGETS) *.o 27 | -------------------------------------------------------------------------------- /src/data/parser.h: -------------------------------------------------------------------------------- 1 | 
/************************************************************************* 2 | > File Name: parser.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Thu 07 Nov 2013 08:16:26 PM 5 | > Descriptions: public funtions to parse 6 | ************************************************************************/ 7 | 8 | #ifndef HEADER_PARSER 9 | #define HEADER_PARSER 10 | #include 11 | #include 12 | 13 | namespace SOL{ 14 | 15 | inline bool is_space(char* p){ 16 | return (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r'); 17 | } 18 | 19 | inline char* strip_line(char* p){ 20 | while(is_space(p) == true) 21 | p++; 22 | return p; 23 | } 24 | 25 | //The following function is a home made strtoi 26 | inline int parseInt(char * p, char **end) { 27 | *end = p; 28 | p = strip_line(p); 29 | 30 | if (*p == '\0'){ 31 | return 0; 32 | } 33 | int s = 1; 34 | if (*p == '+')p++; 35 | if (*p == '-') { 36 | s = -1; p++; 37 | } 38 | int acc = 0; 39 | while (*p >= '0' && *p <= '9') 40 | acc = acc * 10 + *p++ - '0'; 41 | 42 | int num_dec = 0; 43 | if (*p == '.') { 44 | p++; 45 | while (*p >= '0' && *p <= '9') { 46 | acc = acc *10 + *p++ - '0' ; 47 | num_dec++; 48 | } 49 | } 50 | int exp_acc = 0; 51 | if(*p == 'e' || *p == 'E'){ 52 | p++; 53 | if (*p == '+')p++; 54 | while (*p >= '0' && *p <= '9') 55 | exp_acc = exp_acc * 10 + *p++ - '0'; 56 | 57 | } 58 | if (is_space(p)== true) {//easy case succeeded. 
59 | exp_acc -= num_dec; 60 | if (exp_acc < 0) 61 | return 0; 62 | else 63 | acc *= (int)(powf(10.f,(float)exp_acc)); 64 | 65 | *end = p; 66 | return s * acc; 67 | } 68 | else { 69 | return 0; 70 | } 71 | } 72 | 73 | //The following function is a home made strtoi 74 | inline unsigned int parseUint(char * p, char **end) { 75 | *end = p; 76 | p = strip_line(p); 77 | 78 | if (*p == '\0'){ 79 | return 0; 80 | } 81 | unsigned int acc = 0; 82 | while (*p >= '0' && *p <= '9') 83 | acc = acc * 10 + *p++ - '0'; 84 | 85 | int num_dec = 0; 86 | if (*p == '.') { 87 | p++; 88 | while (*p >= '0' && *p <= '9') { 89 | acc = acc *10 + *p++ - '0' ; 90 | num_dec++; 91 | } 92 | } 93 | int exp_acc = 0; 94 | if(*p == 'e' || *p == 'E'){ 95 | p++; 96 | if (*p == '+')p++; 97 | while (*p >= '0' && *p <= '9') 98 | exp_acc = exp_acc * 10 + *p++ - '0'; 99 | } 100 | if (*p == ':') {//easy case succeeded. 101 | if (exp_acc < num_dec) 102 | return 0; 103 | else 104 | acc *= (unsigned int)(powf(10.f,(float)(exp_acc - num_dec))); 105 | *end = ++p; 106 | return acc; 107 | } 108 | else { 109 | return 0; 110 | } 111 | } 112 | 113 | /* 114 | inline string parseString(char*p, char**end){ 115 | p = strip_line(p); 116 | char* start_pos = p; 117 | char* end_pos = p; 118 | if (*start_pos == '\"'){ 119 | start_pos++; 120 | end_pos = start_pos; 121 | while(*end_pos != '\"' && *end_pos != '\0')end_pos++; 122 | if (*end_pos != '\"'){ 123 | *end = p; 124 | return string(); 125 | } 126 | } 127 | *end = end_pos + 1; 128 | return string(start_pos,end_pos - start_pos - 1); 129 | } 130 | */ 131 | 132 | // The following function is a home made strtof. The 133 | // differences are : 134 | // - much faster (around 50% but depends on the string to parse) 135 | // - less error control, but utilised inside a very strict parser 136 | // in charge of error detection. 
137 | inline float parseFloat(char * p, char **end) { 138 | *end = p; 139 | p = strip_line(p); 140 | 141 | if (*p == '\0'){ 142 | return 0; 143 | } 144 | int s = 1; 145 | if (*p == '+') p++; 146 | if (*p == '-') { 147 | s = -1; p++; 148 | } 149 | 150 | int acc = 0; 151 | while (*p >= '0' && *p <= '9') 152 | acc = acc * 10 + *p++ - '0'; 153 | 154 | int num_dec = 0; 155 | if (*p == '.') { 156 | p++; 157 | while (*p >= '0' && *p <= '9') { 158 | acc = acc *10 + *p++ - '0' ; 159 | num_dec++; 160 | } 161 | } 162 | 163 | int exp_acc = 0; 164 | if(*p == 'e' || *p == 'E'){ 165 | p++; 166 | int exp_s = 1; 167 | if (*p == '+') p++; 168 | if (*p == '-') { 169 | exp_s = -1; p++; 170 | } 171 | while (*p >= '0' && *p <= '9') 172 | exp_acc = exp_acc * 10 + *p++ - '0'; 173 | exp_acc *= exp_s; 174 | } 175 | if (is_space(p) == true || *p == '\0'){//easy case succeeded. 176 | exp_acc -= num_dec; 177 | *end = p; 178 | return s * acc * powf(10.f,(float)(exp_acc)); 179 | } 180 | else 181 | return 0; 182 | } 183 | 184 | } 185 | #endif 186 | -------------------------------------------------------------------------------- /src/data/s_array.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: s_array.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/9/19 15:14:53 5 | > Functions: customized array 6 | ************************************************************************/ 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | namespace SOL { 17 | //the difference of s_array with vector is that vector copies the data, while 18 | //s_array only copies the pointer and increase counter 19 | template class s_array { 20 | public: 21 | T* begin; //point to the first element 22 | T* end; //point to the next postion of the last element 23 | size_t capacity; //capacity of the array 24 | int *count; 25 | 26 | T first() const 
{return *begin;} 27 | T last() const {return *(end - 1);} 28 | T pop() {return *(--end);} 29 | bool empty() const {return begin == end;} 30 | size_t size() const {return end - begin;} 31 | T& operator[] (size_t i) {return begin[i];} 32 | const T& operator[] (size_t i) const { return begin[i];} 33 | 34 | void allocate(size_t new_size){ 35 | T* new_begin = NULL; 36 | try{ 37 | new_begin = new T[new_size]; 38 | }catch(std::bad_alloc &ex){ 39 | std::cerr< 0) { 46 | std::cerr<<"realloc of "<< new_size 47 | <<" failed in resize(). out of memory?\n" 48 | <<__FILE__<<"\n"<<__LINE__<size(); 53 | //copy data 54 | memcpy(new_begin,begin,sizeof(T) * old_len); 55 | if (begin != NULL) 56 | delete []begin; 57 | begin = new_begin; 58 | end = begin + old_len; 59 | capacity = new_size; 60 | } 61 | 62 | void resize(size_t newSize) { 63 | if (capacity < newSize){ //allocate more memory 64 | this->allocate(newSize); 65 | } 66 | end = begin + newSize; 67 | } 68 | void erase(void) { resize(0); } 69 | 70 | void push_back(const T& elem) { 71 | size_t old_len = size(); 72 | if (old_len == capacity) {//full array 73 | this->allocate(2 * old_len + 3); 74 | } 75 | *(end++) = elem; 76 | } 77 | 78 | void reserve(size_t new_size){ 79 | if(this->capacity < new_size){ 80 | size_t alloc_size = this->capacity; 81 | do{ 82 | alloc_size = 2 * alloc_size + 3; 83 | }while(alloc_size < new_size); 84 | this->allocate(alloc_size); 85 | } 86 | } 87 | 88 | s_array& operator= (const s_array &arr) { 89 | if (this->count == arr.count) 90 | return *this; 91 | this->release(); 92 | 93 | this->begin =arr.begin; 94 | this->end = arr.end; 95 | this->capacity = arr.capacity; 96 | this->count = arr.count; 97 | ++(*count); 98 | return *this; 99 | } 100 | 101 | //reset all the elements in the array to zero 102 | void zeros(){ 103 | memset(this->begin, 0, sizeof(T) * this->size()); 104 | } 105 | //reset all the elements in the array to zero 106 | void zeros(T* iter_begin, T* iter_end){ 107 | memset(iter_begin, 0, 
sizeof(T) * (iter_end - iter_begin)); 108 | } 109 | 110 | //set the elements in the array to val 111 | void set_value(const T& val){ 112 | T* p = this->begin; 113 | while(p < this->end){ 114 | *p = val; 115 | p++; 116 | } 117 | } 118 | //set the elements in the given range to the val 119 | void set_value(T* iter_begin, T* iter_end, const T& val){ 120 | while(iter_begin < iter_end){ 121 | *iter_begin = val; 122 | iter_begin++; 123 | } 124 | } 125 | 126 | void release() { 127 | --(*count); 128 | if (*count == 0) { 129 | if (this->begin != NULL) 130 | delete []this->begin; 131 | delete this->count; 132 | } 133 | this->begin = NULL; 134 | this->end = NULL; 135 | this->capacity = 0; 136 | this->count = NULL; 137 | } 138 | 139 | s_array() { 140 | begin = NULL; end = NULL; count = NULL; capacity = 0; 141 | count = new int; 142 | *count = 1; 143 | } 144 | s_array(const s_array &arr) { 145 | this->begin =arr.begin; 146 | this->end = arr.end; 147 | this->capacity = arr.capacity; 148 | this->count = arr.count; 149 | ++(*count); 150 | } 151 | 152 | ~s_array() { this->release(); } 153 | }; 154 | } 155 | -------------------------------------------------------------------------------- /src/data/thread_primitive.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: thread.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Sun 22 Sep 2013 03:22:34 PM SGT 5 | > Functions: Primitives for thread 6 | ************************************************************************/ 7 | #pragma once 8 | 9 | using namespace std; 10 | 11 | namespace SOL 12 | { 13 | #ifdef _WIN32 14 | #include 15 | typedef CRITICAL_SECTION MUTEX; 16 | typedef CONDITION_VARIABLE CV; 17 | #else 18 | typedef pthread_mutex_t MUTEX; 19 | typedef pthread_cond_t CV; 20 | #endif 21 | 22 | void initialize_mutex(MUTEX *pm) 23 | { 24 | #ifdef _WIN32 25 | ::InitializeCriticalSection(pm); 26 | #else 
27 | pthread_mutex_init(pm,NULL); 28 | #endif 29 | } 30 | 31 | void delete_mutex(MUTEX *pm) 32 | { 33 | #ifdef _WIN32 34 | ::DeleteCriticalSection(pm); 35 | #else 36 | //no operation needed here 37 | #endif 38 | } 39 | 40 | void initialize_condition_variable(CV *pcv) 41 | { 42 | #ifdef _WIN32 43 | ::InitializeConditionVariable(pcv); 44 | #else 45 | pthread_cond_init(pcv,NULL); 46 | #endif 47 | } 48 | 49 | void mutex_lock(MUTEX *pm) 50 | { 51 | //cout<<"obtain lock"< File Name: zlib_io.cpp 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Wed 06 Nov 2013 04:15:51 PM 5 | > Descriptions: read and write file in default zlib format 6 | ************************************************************************/ 7 | 8 | #include "zlib_io.h" 9 | #include "../common/init_param.h" 10 | 11 | #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) 12 | # include 13 | # include 14 | # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) 15 | #else 16 | # define SET_BINARY_MODE(file) 17 | #endif 18 | 19 | 20 | #include 21 | 22 | using namespace std; 23 | 24 | namespace SOL{ 25 | bool zlib_io::open_file(const char* filename, const char* mode){ 26 | if (this->alloc_buf() == false){ 27 | this->free_buf(); 28 | return false; 29 | } 30 | 31 | this->close_file(); 32 | switch(mode[0]){ 33 | case 'w': 34 | /* allocate deflate state */ 35 | strm.zalloc = Z_NULL; 36 | strm.zfree = Z_NULL; 37 | strm.opaque = Z_NULL; 38 | if(deflateInit(&strm,zlib_deflate_level) != Z_OK) 39 | return false; 40 | rw_mode = mode_write; 41 | this->de_avail_count = zlib_buf_size; 42 | this->cur_de_pos = this->de_data; 43 | break; 44 | case 'r': 45 | /* allocate deflate state */ 46 | strm.zalloc = Z_NULL; 47 | strm.zfree = Z_NULL; 48 | strm.opaque = Z_NULL; 49 | if(inflateInit(&strm) != Z_OK) 50 | return false; 51 | strm.avail_in = 0; 52 | strm.next_in = NULL; 53 | 54 | rw_mode = mode_read; 55 | this->de_avail_count = 0; 56 | this->cur_de_pos = this->de_data + zlib_buf_size; 57 | 58 | 
break; 59 | default: 60 | cerr<<"unrecognized file open mode!"<good() != 0){ 70 | this->close_file(); 71 | return false; 72 | } 73 | return true; 74 | } 75 | 76 | // bind_stdin: bind the input to stdin 77 | bool zlib_io::open_stdin(){ 78 | file = stdin; 79 | return true; 80 | } 81 | 82 | // bind_stdin: bind the output to stdout 83 | bool zlib_io::open_stdout(){ 84 | file = stdout; 85 | return true; 86 | } 87 | void zlib_io::close_file(){ 88 | if (file != NULL){ 89 | if (rw_mode == mode_write){ 90 | if (this->finalize_write() != 0){ 91 | /* clean up and return */ 92 | (void)deflateEnd(&strm); 93 | } 94 | } 95 | else if (rw_mode == mode_read){ 96 | /* clean up and return */ 97 | (void)inflateEnd(&strm); 98 | } 99 | 100 | fclose(file); 101 | } 102 | file = NULL; 103 | } 104 | 105 | bool zlib_io::alloc_buf(){ 106 | if (this->en_data == NULL){ 107 | try{ 108 | this->en_data = new unsigned char[zlib_buf_size]; 109 | }catch(std::bad_alloc &ex){ 110 | cerr<<"allocate memory for encoded buffer failed\n"; 111 | cerr<de_data == NULL){ 116 | try{ 117 | this->de_data = new unsigned char[zlib_buf_size]; 118 | }catch(std::bad_alloc &ex){ 119 | cerr<<"allocate memory for decoded buffer failed\n"; 120 | cerr<en_data != NULL){ 129 | delete []this->en_data; 130 | this->en_data = NULL; 131 | } 132 | if (this->de_data != NULL){ 133 | delete []this->de_data; 134 | this->de_data = NULL; 135 | } 136 | } 137 | 138 | void zlib_io::rewind(){ 139 | if (file != NULL){ 140 | std::rewind(file); 141 | this->cur_de_pos = this->de_data; 142 | this->de_avail_count = 0; 143 | } 144 | } 145 | 146 | /** 147 | * good : test if the io is good 148 | * 149 | * @Return: zero if correct, else zero code 150 | */ 151 | int zlib_io::good(){ 152 | if (file == NULL) 153 | return -1; 154 | return ferror(file); 155 | } 156 | 157 | /** 158 | * read_data : read the data from file 159 | * 160 | * @Param dst: container to place the read data 161 | * @Param length: length of data of read in bytes 162 | * 163 | * 
@Return: true if succeed 164 | */ 165 | bool zlib_io::read_data(char* dst, size_t len){ 166 | while (this->de_avail_count < len){ 167 | memcpy(dst, this->cur_de_pos, this->de_avail_count); 168 | len -= this->de_avail_count; 169 | dst += this->de_avail_count; 170 | //this->buf_in_pos += this->buf_in_have; //can be ignored 171 | this->de_avail_count = 0; //can be ignored 172 | 173 | if (strm.avail_in == 0){ 174 | strm.avail_in = fread(this->en_data, 1,zlib_buf_size,this->file); 175 | if (ferror(this->file)) { 176 | (void)inflateEnd(&strm); 177 | cerr<<"unexpected error occured when loading cache!"<en_data; 184 | } 185 | 186 | /* run inflate() */ 187 | strm.avail_out = zlib_buf_size; 188 | strm.next_out = this->de_data; 189 | int ret = inflate(&strm, Z_NO_FLUSH); 190 | assert(ret != Z_STREAM_ERROR); /* state not clobbered */ 191 | switch (ret) { 192 | case Z_NEED_DICT: 193 | ret = Z_DATA_ERROR; /* and fall through */ 194 | case Z_DATA_ERROR: 195 | case Z_MEM_ERROR: 196 | (void)inflateEnd(&strm); 197 | cerr<<"error occured when parsing file!"<de_avail_count = zlib_buf_size - strm.avail_out; 201 | 202 | this->cur_de_pos = this->de_data; 203 | if (this->cur_de_pos == 0){ 204 | cerr<<"load compressed content failed!"<cur_de_pos, len); 209 | this->cur_de_pos += len; 210 | this->de_avail_count -= len; 211 | //len -= len; //can be ignored 212 | //dst += len; //can be ignored 213 | return true; 214 | } 215 | 216 | /** 217 | * read_line : read a line from disk 218 | * 219 | * @Param dst: container to place the read data 220 | * @Param dst_len: length of dst 221 | * 222 | * @Return: size of data read in bytes 223 | */ 224 | char* zlib_io::read_line(char* &dst, size_t &dst_len){ 225 | printf("error: no read line is supported in zlib io\n"); 226 | return NULL; 227 | } 228 | 229 | /** 230 | * write_data : write content to disk 231 | * 232 | * @Param src: source of the data 233 | * @Param length: length to write the data 234 | * 235 | * @Return: true if succeed 236 | */ 237 | bool 
zlib_io::write_data(char* src, size_t len){ 238 | while(this->de_avail_count < len){ 239 | memcpy(this->cur_de_pos,src, this->de_avail_count); 240 | src += this->de_avail_count; 241 | len -= this->de_avail_count; 242 | 243 | this->strm.avail_in = zlib_buf_size; 244 | this->strm.next_in = this->de_data; 245 | 246 | // run deflate() 247 | do { 248 | strm.avail_out = zlib_buf_size; 249 | strm.next_out = this->en_data; 250 | int ret = deflate(&(this->strm), Z_NO_FLUSH); //no bad return value 251 | assert(ret != Z_STREAM_ERROR); // state not clobbered 252 | unsigned int have = zlib_buf_size - this->strm.avail_out; 253 | if (fwrite(this->en_data, 1, have,this->file) != have 254 | || ferror(this->file)) { 255 | (void)deflateEnd(&(this->strm)); 256 | cerr<<"unexpected error occured when writing file!"<strm.avail_out == 0); 260 | assert(this->strm.avail_in == 0); // all input will be used 261 | 262 | this->de_avail_count = zlib_buf_size; 263 | this->cur_de_pos = this->de_data; 264 | } 265 | 266 | memcpy(this->cur_de_pos,src, len); 267 | this->cur_de_pos += len; 268 | this->de_avail_count -= len; 269 | this->strm.avail_in += len; 270 | 271 | return true; 272 | } 273 | /** 274 | * finalize_write : finalize write of deflate 275 | * 276 | * @Return: 0 if ok 277 | */ 278 | int zlib_io::finalize_write(){ 279 | this->strm.next_in = this->de_data; 280 | 281 | // run deflate() 282 | do { 283 | strm.avail_out = zlib_buf_size; 284 | strm.next_out = this->en_data; 285 | int ret = deflate(&(this->strm),Z_FINISH); //no bad return value 286 | assert(ret != Z_STREAM_ERROR); // state not clobbered 287 | unsigned int have = zlib_buf_size - this->strm.avail_out; 288 | if (fwrite(this->en_data, 1, have,this->file) != have 289 | || ferror(this->file)) { 290 | (void)deflateEnd(&(this->strm)); 291 | cerr<<"unexpected error occured when writing file!"<strm.avail_out == 0); 295 | assert(this->strm.avail_in == 0); // all input will be used 296 | (void)deflateEnd(&(this->strm)); 297 | return 0; 298 | 
} 299 | 300 | } 301 | 302 | 303 | 304 | -------------------------------------------------------------------------------- /src/data/zlib_io.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: /home/matthew/work/SOL/src/data/zlib_io.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: Wed 06 Nov 2013 05:01:04 PM 5 | > Descriptions: 6 | ************************************************************************/ 7 | #ifndef HEADER_ZLIB_IO 8 | #define HEADER_ZLIB_IO 9 | 10 | #include "io_interface.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include "zlib.h" 16 | 17 | 18 | namespace SOL{ 19 | #define ZLIB_BUF_SIZE 16348 20 | 21 | class zlib_io: public io_interface{ 22 | private: 23 | enum RW_MODE{ 24 | mode_null = 0, 25 | mode_read = 1, 26 | mode_write = 2, 27 | }; 28 | 29 | private: 30 | FILE* file; 31 | z_stream strm; 32 | 33 | unsigned char* en_data; //encoded data 34 | unsigned char* de_data; //decoded data 35 | unsigned char* cur_de_pos; //current read position of decoded data 36 | size_t de_avail_count; //available decoded data count 37 | 38 | int rw_mode; 39 | public: 40 | zlib_io(): 41 | file(NULL), en_data(NULL), de_data(NULL), 42 | cur_de_pos(NULL), de_avail_count(0), rw_mode(mode_null){} 43 | 44 | ~zlib_io(){ 45 | this->free_buf(); 46 | } 47 | 48 | private: 49 | bool alloc_buf(); 50 | void free_buf(); 51 | public: 52 | virtual bool open_file(const char* filename, const char* mode); 53 | // bind_stdin: bind the input to stdin 54 | virtual bool open_stdin(); 55 | // bind_stdin: bind the output to stdout 56 | virtual bool open_stdout(); 57 | 58 | virtual void close_file(); 59 | virtual void rewind(); 60 | 61 | /** 62 | * good : test if the io is good 63 | * 64 | * @Return: zero if correct, else zero code 65 | */ 66 | virtual int good(); 67 | 68 | public: 69 | /** 70 | * read_data : read the data from file 71 | * 72 | * @Param 
dst: container to place the read data 73 | * @Param length: length of data of read in bytes 74 | * 75 | * @Return: true if succeed 76 | */ 77 | virtual bool read_data(char* dst, size_t length); 78 | 79 | /** 80 | * read_line : read a line from disk 81 | * 82 | * @Param dst: container to place the read data 83 | * @Param dst_len: length of dst 84 | * 85 | * @Return: pointer to the read line, null if failed 86 | */ 87 | virtual char* read_line(char* &dst, size_t &dst_len); 88 | 89 | /** 90 | * write_data : write content to disk 91 | * 92 | * @Param src: source of the data 93 | * @Param length: length to write the data 94 | * 95 | * @Return: true if succeed 96 | */ 97 | virtual bool write_data(char* src, size_t length); 98 | 99 | private: 100 | /** 101 | * finalize_write : finalize write of deflate 102 | * 103 | * @Return: 0 if ok 104 | */ 105 | int finalize_write(); 106 | }; 107 | } 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /src/kernel/kernel_RBP.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "kernel_optim.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace SOL 10 | { 11 | template 12 | class kernel_RBP: public Kernel_optim 13 | { 14 | 15 | protected: 16 | int Budget; 17 | public: 18 | kernel_RBP(const Params ¶m,DataSet &dataset, 19 | LossFunction &lossFunc); 20 | virtual ~ kernel_RBP(); 21 | 22 | protected: 23 | //this is the core of different updating algorithms 24 | virtual float UpdateWeightVec(const DataPoint &x); 25 | virtual float Predict(const DataPoint &data); 26 | virtual void begin_test(void){} 27 | }; 28 | 29 | template 30 | kernel_RBP:: kernel_RBP(const Params ¶m, 31 | DataSet &dataset, 32 | LossFunction &lossFunc): Kernel_optim(param,dataset, lossFunc) 33 | { 34 | this->id_str = " kernel_RBP"; 35 | this->Budget=param.Budget_set; 36 | } 37 | 38 | template 39 | kernel_RBP::~ kernel_RBP() 40 | { 41 | } 42 | 43 | 
//update weight vector with stochastic gradient descent 44 | template 45 | float kernel_RBP::UpdateWeightVec(const DataPoint &x) 46 | { 47 | float y = this->Predict(x); 48 | if (y*x.label<=0) 49 | { 50 | 51 | SV* support = new SV(x.label,x); 52 | this->add_SV(support); 53 | 54 | } 55 | //delete SV 56 | if(this->size_SV==Budget+1) 57 | { 58 | srand((unsigned)time(NULL)); 59 | int SV_to_delete=rand() % (Budget);//from 0 to Budget-1 60 | this->delete_SV(SV_to_delete); 61 | } 62 | return y; 63 | } 64 | template 65 | float kernel_RBP::Predict(const DataPoint &data) 66 | { 67 | float predict = 0; 68 | 69 | SV* p_predict = this->SV_begin; 70 | while (p_predict!=NULL) 71 | { 72 | predict+=p_predict->SV_alpha* this->kern(p_predict->SV_data,data); 73 | p_predict=p_predict->next; 74 | } 75 | return predict; 76 | } 77 | 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/kernel/kernel_bogd.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #pragma once 4 | 5 | #include "kernel_optim.h" 6 | 7 | namespace SOL 8 | { 9 | template 10 | class kernel_bogd: public Kernel_optim 11 | { 12 | 13 | protected: 14 | int Budget; 15 | float lambda; 16 | 17 | public: 18 | kernel_bogd(const Params ¶m,DataSet &dataset, 19 | LossFunction &lossFunc); 20 | virtual ~kernel_bogd(); 21 | 22 | protected: 23 | //this is the core of different updating algorithms 24 | virtual float UpdateWeightVec(const DataPoint &x); 25 | virtual float Predict(const DataPoint &data); 26 | virtual void begin_test(void){} 27 | }; 28 | 29 | template 30 | kernel_bogd::kernel_bogd(const Params ¶m, 31 | DataSet &dataset, 32 | LossFunction &lossFunc): Kernel_optim(param,dataset, lossFunc) 33 | { 34 | this->id_str = "kernel_bogd"; 35 | this->Budget=param.Budget_set; 36 | this->lambda=param.lambda; 37 | this->eta0=param.eta; 38 | } 39 | 40 | template 41 | kernel_bogd::~kernel_bogd() 42 | { 43 | } 44 | 45 | //update weight vector with 
stochastic gradient descent 46 | template 47 | float kernel_bogd::UpdateWeightVec(const 48 | DataPoint &x) 49 | { 50 | float y = this->Predict(x); 51 | 52 | float gt_i = this->lossFunc->GetGradient(x.label,y); 53 | 54 | SV* p_alpha=this->SV_begin; 55 | while(p_alpha!=NULL) 56 | { 57 | p_alpha->SV_alpha=p_alpha->SV_alpha*(1-this->eta0*lambda); 58 | p_alpha=p_alpha->next; 59 | } 60 | if(gt_i!=0) 61 | { 62 | SV* support = new SV(-this->eta0 * gt_i,x); 63 | this->add_SV(support); 64 | } 65 | //delete SV 66 | if(this->size_SV==Budget+1) 67 | this->delete_SV(); 68 | 69 | return y; 70 | } 71 | 72 | template 73 | float kernel_bogd::Predict(const DataPoint &data) 74 | { 75 | float predict = 0; 76 | 77 | SV* p_predict = this->SV_begin; 78 | while (p_predict!=NULL) 79 | { 80 | predict+=p_predict->SV_alpha* this->kern(p_predict->SV_data,data); 81 | p_predict=p_predict->next; 82 | } 83 | return predict; 84 | } 85 | 86 | 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/kernel/kernel_bpas.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kernel_optim.h" 4 | #include 5 | 6 | namespace SOL 7 | { 8 | template 9 | class kernel_bpas: public Kernel_optim 10 | { 11 | 12 | protected: 13 | int Budget; 14 | float C_bpas; 15 | public: 16 | kernel_bpas(const Params ¶m,DataSet &dataset, 17 | LossFunction &lossFunc); 18 | virtual ~ kernel_bpas(); 19 | 20 | protected: 21 | //this is the core of different updating algorithms 22 | virtual float UpdateWeightVec(const DataPoint &x); 23 | virtual float Predict(const DataPoint &data); 24 | virtual void begin_test(void){} 25 | }; 26 | 27 | template 28 | kernel_bpas:: kernel_bpas(const Params ¶m, 29 | DataSet &dataset, 30 | LossFunction &lossFunc): Kernel_optim(param, 31 | dataset, lossFunc) 32 | { 33 | this->id_str = " kernel_BPAS"; 34 | this->Budget=param.Budget_set; 35 | this->C_bpas=param.C_bpas; 36 | } 37 | 38 | template 39 | 
kernel_bpas::~kernel_bpas() 40 | { 41 | } 42 | 43 | //update weight vector with stochastic gradient descent 44 | template 45 | float kernel_bpas::UpdateWeightVec( 46 | const DataPoint &x) 47 | { 48 | float y=0; 49 | float *k_t=NULL; 50 | //calculate k_t 51 | if(this->size_SV!=0) 52 | { 53 | SV* p_predict=this->SV_begin; 54 | k_t=new float [this->size_SV]; 55 | int i=0; 56 | while (p_predict!=NULL) 57 | { 58 | k_t[i]=this->kern(p_predict->SV_data,x); 59 | p_predict=p_predict->next; 60 | i++; 61 | } 62 | 63 | //k_t done 64 | 65 | //get prediction 66 | p_predict=this->SV_begin; 67 | i=0; 68 | while (p_predict!=NULL) 69 | { 70 | y+=p_predict->SV_alpha* k_t[i]; 71 | p_predict=p_predict->next; 72 | i++; 73 | } 74 | } 75 | //prediction is in y 76 | float l_t=1-x.label*y; 77 | if(l_t<0) 78 | { 79 | l_t=0; 80 | } 81 | 82 | //get the Hinge Loss 83 | 84 | if (l_t>0) 85 | { 86 | float tao= (std::min)(C_bpas,l_t); 87 | if(this->size_SV* support = new SV(x.label*tao,x); 90 | this->add_SV(support); 91 | } 92 | else //full Budget 93 | { 94 | double Q_star=1000000; 95 | int star=1; 96 | double star_alpha=1.0; 97 | 98 | SV *p_search=this->SV_begin; 99 | 100 | for(int i=0; isize_SV; i++) 101 | { 102 | double k_rt=k_t[i]; 103 | double alpha_r=p_search->SV_alpha; 104 | double beta_t=alpha_r*k_rt+tao*x.label; 105 | double distance=alpha_r*alpha_r+beta_t*beta_t-2*beta_t*alpha_r*k_rt; 106 | double f_rt=y-alpha_r*k_rt+beta_t; 107 | double l_rt=1-x.label*f_rt; 108 | if(l_rt<0) 109 | l_rt=0; 110 | double Q_r=0.5*distance+C_bpas*l_rt; 111 | if(Q_rnext; 118 | } 119 | this->delete_SV(star); 120 | SV* support = new SV(float(star_alpha),x); 121 | this->add_SV(support); 122 | } 123 | } 124 | delete [] k_t; 125 | return y; 126 | } 127 | 128 | template 129 | float kernel_bpas::Predict(const DataPoint &data) 130 | { 131 | float predict = 0; 132 | 133 | SV* p_predict = this->SV_begin; 134 | while (p_predict!=NULL) 135 | { 136 | predict+=p_predict->SV_alpha* this->kern(p_predict->SV_data,data); 137 | 
p_predict=p_predict->next; 138 | } 139 | return predict; 140 | } 141 | 142 | } 143 | -------------------------------------------------------------------------------- /src/kernel/kernel_fogd.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "kernel_optim.h" 5 | #include 6 | #include 7 | 8 | #define B_cos 1.273239 9 | #define P_cos 0.225 10 | #define C_cos -0.40528 11 | #define pi_cos 3.1415926 12 | 13 | namespace SOL 14 | { 15 | template 16 | class kernel_fogd: public Kernel_optim 17 | { 18 | 19 | protected: 20 | int D; 21 | IndexType u_dimension; 22 | s_array w_fogd; 23 | s_array u; 24 | s_array w_fogd_sum; 25 | s_array ux; 26 | s_array ux_cos; 27 | int num_update; 28 | 29 | 30 | 31 | double a; 32 | std::default_random_engine generator; 33 | std::normal_distribution distribution; 34 | 35 | public: 36 | kernel_fogd(const Params ¶m,DataSet &dataset, 37 | LossFunction &lossFunc); 38 | 39 | virtual ~kernel_fogd(); 40 | 41 | protected: 42 | //this is the core of different updating algorithms 43 | virtual float UpdateWeightVec(const DataPoint &x); 44 | virtual float Predict(const DataPoint &data); 45 | virtual void begin_test(void); 46 | }; 47 | 48 | template 49 | kernel_fogd::kernel_fogd(const Params ¶m, 50 | DataSet &dataset, 51 | LossFunction &lossFunc): Kernel_optim(param,dataset, lossFunc) 52 | { 53 | this->id_str = "kernel_fogd"; 54 | this->D=param.D_set; 55 | 56 | w_fogd.resize(2 * D); 57 | w_fogd.zeros(); 58 | w_fogd_sum.resize(2*D); 59 | w_fogd_sum.zeros(); 60 | this->ux.resize(D); 61 | this->ux_cos.resize(2 * D); 62 | num_update=0; 63 | 64 | this->u_dimension=0; 65 | this->distribution=normal_distribution(0.0,sqrt(param.gamma*2)); 66 | this->generator=default_random_engine((unsigned)time(NULL)); 67 | this->eta0 = param.eta; 68 | } 69 | 70 | template 71 | kernel_fogd::~kernel_fogd() 72 | { 73 | } 74 | 75 | //update weight vector with stochastic gradient descent 76 | 
template 77 | float kernel_fogd::UpdateWeightVec(const DataPoint &x) 78 | { 79 | 80 | IndexType x_dimension=x.max_index; 81 | //generate u 82 | if(u_dimensionu.reserve(D * x_dimension); 86 | this->u.resize(D * x_dimension); 87 | for(IndexType i=(D*u_dimension); i<(D*x_dimension); i++) 88 | this->u[i]=distribution(generator); 89 | this->u_dimension=x_dimension; 90 | } 91 | 92 | this->ux.zeros(); 93 | 94 | size_t index_begin; 95 | float feature; 96 | for(size_t j=0; j 3.14159265) 114 | ux[i]-= 6.28318531; 115 | a = B_cos * ux[i] + C_cos * ux[i] * abs(ux[i]); 116 | *p1 = P_cos * (a * abs(a) - a) + a; 117 | 118 | *p2=sqrt(1-(*p1)*(*p1)); 119 | if(ux[i]<0) 120 | *p2=-(*p2); 121 | 122 | p1++; 123 | p2++; 124 | } 125 | 126 | double y=0; 127 | for(int i=0; i<2*D; i++) 128 | y=y+w_fogd[i]*ux_cos[i]; 129 | 130 | if(y*x.label<1) 131 | { 132 | num_update++; 133 | for(int i=0; i<2*D; i++) 134 | { 135 | w_fogd[i]=w_fogd[i]+this->eta0*x.label*ux_cos[i]; 136 | w_fogd_sum[i]=w_fogd_sum[i]+w_fogd[i]; 137 | } 138 | } 139 | return float(y); 140 | } 141 | 142 | template 143 | float kernel_fogd::Predict(const DataPoint &data) 144 | { 145 | 146 | this->ux.zeros(); 147 | 148 | size_t index_begin; 149 | float feature; 150 | for(size_t j=0; j 3.14159265) 168 | ux[i]-= 6.28318531; 169 | a = B_cos * ux[i] + C_cos * ux[i] * abs(ux[i]); 170 | *p1 = P_cos * (a * abs(a) - a) + a; 171 | *p2=sqrt(1-(*p1)*(*p1)); 172 | if(ux[i]<0) 173 | *p2=-(*p2); 174 | p1++; 175 | p2++; 176 | } 177 | 178 | double y=0; 179 | for(int i=0; i<2*D; i++) 180 | y=y+w_fogd[i]*ux_cos[i]; 181 | return float(y); 182 | } 183 | 184 | template 185 | void kernel_fogd::begin_test(void) 186 | { 187 | for(int i=0; i<2*D; i++) 188 | { 189 | w_fogd[i]=w_fogd_sum[i]/num_update; 190 | } 191 | } 192 | 193 | } 194 | -------------------------------------------------------------------------------- /src/kernel/kernel_forgetron.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | 
#include "kernel_optim.h"
#include <cmath>
#include <algorithm>
#include "../data/DataPoint.h"

namespace SOL
{
// Forgetron (Dekel, Shalev-Shwartz, Singer): a budgeted Perceptron that
// shrinks all coefficients and evicts the oldest support vector whenever
// the budget is exceeded.
template <typename FeatType, typename LabelType>
class kernel_forgetron : public Kernel_optim<FeatType, LabelType>
{
protected:
    int Budget;          // maximal number of support vectors kept
    int err_until_now;   // mistake counter used by the shrinking rule
    double Q;            // accumulated damage of past shrinking steps

public:
    kernel_forgetron(const Params &param, DataSet<FeatType, LabelType> &dataset,
                     LossFunction<LabelType> &lossFunc);

    virtual ~kernel_forgetron();

protected:
    // core of the updating algorithm
    virtual float UpdateWeightVec(const DataPoint<FeatType, LabelType> &x);
    virtual float Predict(const DataPoint<FeatType, LabelType> &data);
    virtual void begin_test(void) {}
};

template <typename FeatType, typename LabelType>
kernel_forgetron<FeatType, LabelType>::kernel_forgetron(const Params &param,
        DataSet<FeatType, LabelType> &dataset,
        LossFunction<LabelType> &lossFunc)
    : Kernel_optim<FeatType, LabelType>(param, dataset, lossFunc)
{
    this->id_str = " kernel_forgetron";
    this->Budget = param.Budget_set;
    this->err_until_now = 0;
    this->Q = 0;
}

template <typename FeatType, typename LabelType>
kernel_forgetron<FeatType, LabelType>::~kernel_forgetron()
{
}

// Perceptron step, then (if over budget) shrink every alpha by phi and
// drop the oldest support vector.
template <typename FeatType, typename LabelType>
float kernel_forgetron<FeatType, LabelType>::UpdateWeightVec(const DataPoint<FeatType, LabelType> &x)
{
    float y = this->Predict(x);
    if (y * x.label <= 0)
    {
        err_until_now++;

        SV<FeatType, LabelType> *support = new SV<FeatType, LabelType>(x.label, x);
        this->add_SV(support);
    }

    // budget exceeded: compute the shrinking factor phi and evict
    if (this->size_SV == Budget + 1)
    {
        float predict = this->Predict(this->SV_begin->SV_data);

        double mu = this->SV_begin->SV_data.label * predict;
        double delta = this->SV_begin->SV_alpha / this->SV_begin->SV_data.label;

        // phi solves coeA*phi^2 + coeB*phi + coeC <= 0, clipped to [0, 1]
        double coeA = delta * delta - 2 * delta * mu;
        double coeB = 2 * delta;
        double coeC = Q - (15.0 / 32.0) * err_until_now;

        double phi = 0;
        if (coeA == 0)
            phi = (std::max)(0.0, (std::min)(1.0, -coeC / coeB));
        else if (coeA > 0)
        {
            if (coeA + coeB + coeC <= 0)
                phi = 1;
            else
                phi = (-coeB + sqrt(coeB * coeB - 4 * coeA * coeC)) / (2 * coeA);
        }
        else // coeA < 0
        {
            if (coeA + coeB + coeC <= 0)
                phi = 1;
            else
                phi = (-coeB - sqrt(coeB * coeB - 4 * coeA * coeC)) / (2 * coeA);
        }

        // alpha <- phi * alpha for every support vector
        for (SV<FeatType, LabelType> *sv = this->SV_begin; sv != NULL; sv = sv->next)
            sv->SV_alpha = (float)(sv->SV_alpha * phi);

        Q = Q + (delta * phi) * (delta * phi) + 2 * delta * phi * (1 - phi * mu);
        this->delete_SV(0);
    }
    return y;
}

// Kernel expansion over the current support set.
template <typename FeatType, typename LabelType>
float kernel_forgetron<FeatType, LabelType>::Predict(const DataPoint<FeatType, LabelType> &data)
{
    float predict = 0;
    for (SV<FeatType, LabelType> *node = this->SV_begin; node != NULL; node = node->next)
        predict += node->SV_alpha * this->kern(node->SV_data, data);
    return predict;
}

}
--------------------------------------------------------------------------------
/src/kernel/kernel_nogd.h:
--------------------------------------------------------------------------------


#pragma once

#include "kernel_optim.h"
#include <Eigen/Dense>
#include "cmath"
using namespace Eigen;

namespace SOL
{
// NOGD: Nystrom Online Gradient Descent -- kernel phase until the budget is
// full, then an eigendecomposition of the budget kernel matrix yields a
// linear feature map (M_nogd) used for the remaining stream.
template <typename FeatType, typename LabelType>
class kernel_nogd : public Kernel_optim<FeatType, LabelType>
{
protected:
    int k_nogd;              // rank of the Nystrom approximation
    MatrixXf *K_budget;      // kernel matrix over the budgeted SVs
    virtual ~kernel_nogd();
    int Budget;
    VectorXf *w_nogd;        // linear weights in the Nystrom feature space
    VectorXf *w_nogd_sum;    // running sum for weight averaging
    MatrixXf *M_nogd;        // feature map: kt -> zt
    bool flag;               // 0 = kernel phase, 1 = linear phase
    int num_update;
    float eta1;              // learning rate of the linear phase

public:
    kernel_nogd(const Params &param, DataSet<FeatType, LabelType> &dataset,
                LossFunction<LabelType> &lossFunc);

protected:
    // core of the updating algorithm
    virtual float UpdateWeightVec(const DataPoint<FeatType, LabelType> &x);
    virtual float Predict(const DataPoint<FeatType, LabelType> &data);
    virtual void begin_test(void);
};

template <typename FeatType, typename LabelType>
kernel_nogd<FeatType, LabelType>::kernel_nogd(const Params &param,
        DataSet<FeatType, LabelType> &dataset,
        LossFunction<LabelType> &lossFunc)
    : Kernel_optim<FeatType, LabelType>(param, dataset, lossFunc)
{
    eta1 = param.eta1;
this->eta0=param.eta; 46 | num_update=0; 47 | this->id_str = "kernel_nogd"; 48 | this->k_nogd=param.k_nogd; 49 | this->Budget=param.Budget_set; 50 | this->K_budget=new MatrixXf(Budget,Budget); 51 | for(int i=0; iw_nogd=new VectorXf(k_nogd); 56 | for(int i=0; iw_nogd_sum=new VectorXf(k_nogd); 62 | for(int i=0; iM_nogd= new MatrixXf(k_nogd,Budget); 68 | this->flag=0; 69 | } 70 | 71 | template 72 | kernel_nogd::~kernel_nogd() 73 | { 74 | delete w_nogd; 75 | delete M_nogd; 76 | delete K_budget; 77 | } 78 | 79 | //update weight vector with stochastic gradient descent 80 | template 81 | float kernel_nogd::UpdateWeightVec(const DataPoint &x) 82 | { 83 | float y=0; 84 | VectorXf kt(this->size_SV); 85 | VectorXf zt(k_nogd); 86 | //calculate k_t 87 | if((this->size_SV!=0)&&(flag==0)) 88 | { 89 | SV* p_predict=this->SV_begin; 90 | int i=0; 91 | while (p_predict!=NULL) 92 | { 93 | kt(i)=this->kern(p_predict->SV_data,x); 94 | p_predict=p_predict->next; 95 | i++; 96 | } 97 | //k_t done 98 | 99 | //get prediction 100 | p_predict=this->SV_begin; 101 | i=0; 102 | while (p_predict!=NULL) 103 | { 104 | y+=p_predict->SV_alpha* kt(i); 105 | p_predict=p_predict->next; 106 | i++; 107 | } 108 | } 109 | if(flag!=0) //linear predict 110 | { 111 | SV* p_predict=this->SV_begin; 112 | int i=0; 113 | while (p_predict!=NULL) 114 | { 115 | kt[i]=this->kern(p_predict->SV_data,x); 116 | p_predict=p_predict->next; 117 | i++; 118 | } 119 | zt=(*M_nogd)*kt; 120 | y=(*w_nogd).dot(zt); 121 | } 122 | //update 123 | if(y*x.label<1) 124 | { 125 | if(this->size_SV* support = new SV(x.label*this->eta0,x); 128 | this->add_SV(support); 129 | 130 | for(int i=0; isize_SV-1; i++) 131 | { 132 | (*K_budget)(i,this->size_SV-1)=kt(i); 133 | (*K_budget)(this->size_SV-1,i)=kt(i); 134 | } 135 | 136 | } 137 | else 138 | { 139 | if(flag==0) //SVD 140 | { 141 | this->curIterNum=1; 142 | flag=1; 143 | EigenSolver es(*K_budget); 144 | MatrixXcf V = es.eigenvectors(); 145 | //cout< 185 | void kernel_nogd::begin_test(void) 186 
| { 187 | (*w_nogd)=(*w_nogd_sum)/float(num_update); 188 | } 189 | 190 | template 191 | float kernel_nogd::Predict(const DataPoint &data) 192 | { 193 | float y=0; 194 | SV* p_predict=this->SV_begin; 195 | int i=0; 196 | VectorXf kt(this->size_SV); 197 | VectorXf zt(k_nogd); 198 | while (p_predict!=NULL) 199 | { 200 | kt[i]=this->kern(p_predict->SV_data,data); 201 | p_predict=p_predict->next; 202 | i++; 203 | } 204 | zt=(*M_nogd)*kt; 205 | y=(*w_nogd).dot(zt); 206 | return y; 207 | } 208 | 209 | 210 | 211 | } 212 | -------------------------------------------------------------------------------- /src/kernel/kernel_optim.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #pragma once 4 | #include "../data/DataPoint.h" 5 | #include "../data/DataSet.h" 6 | #include "../loss/LossFunction.h" 7 | #include "../common/init_param.h" 8 | #include "../common/util.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace SOL 16 | { 17 | 18 | /** 19 | * namespace: Sparse Online Learning 20 | */ 21 | template 22 | struct SV 23 | { 24 | public: 25 | float SV_alpha_sum; 26 | float SV_alpha; 27 | DataPoint SV_data; 28 | SV * next; 29 | 30 | SV(float alpha, DataPoint x) 31 | { 32 | SV_alpha_sum=0; 33 | SV_alpha=alpha; 34 | SV_data= x.clone(); 35 | next=NULL; 36 | } 37 | }; 38 | 39 | 40 | template class Kernel_optim 41 | { 42 | //Iteration 43 | protected: 44 | //iteration number 45 | unsigned int curIterNum; 46 | float eta_now; 47 | //parameters 48 | float eta0; //learning rate 49 | float gamma; 50 | int weight; 51 | 52 | bool use_average_weight; 53 | DataSet &dataSet; 54 | 55 | //weight vector 56 | protected: 57 | SV * SV_begin; 58 | SV * SV_end; 59 | public: 60 | int size_SV; 61 | 62 | protected: 63 | LossFunction *lossFunc; 64 | 65 | protected: 66 | string id_str; 67 | 68 | public: 69 | void PrintOptInfo()const 70 | { 71 | printf("--------------------------------------------------\n"); 72 | printf("Algorithm: 
%s\n",this->Id_Str().c_str()); 73 | } 74 | 75 | public: 76 | Kernel_optim(const Params ¶m,DataSet &dataset, LossFunction &lossFunc); 77 | 78 | public: 79 | void SetParameter(float gamma_a=8, float eta_a = -1); 80 | 81 | virtual ~Kernel_optim() 82 | { 83 | SV * SV_free; 84 | for(int i=0;inext; 88 | delete SV_free; 89 | } 90 | } 91 | const string& Id_Str() const 92 | { 93 | return this->id_str; 94 | } 95 | 96 | protected: 97 | //train the data 98 | float Train(); 99 | //predict a new feature 100 | void sum_SV(); 101 | //this is the core of different updating algorithms 102 | //return the predict 103 | virtual float UpdateWeightVec(const DataPoint &x) = 0; 104 | virtual float Predict(const DataPoint &data) = 0; 105 | virtual void begin_test(void)=0; 106 | public: 107 | 108 | float kern( 109 | const DataPoint &SV_data, 110 | const DataPoint &x); 111 | void add_SV(SV *p_newSV); 112 | void delete_SV(int index_SV=0); 113 | public: 114 | //learn a model 115 | inline float Learn(int numOfTimes = 1); 116 | //learn a model and return the mistake rate and its variance 117 | float Learn(float &aveErrRate, float &varErrRate, float &sparseRate, int numOfTimes = 1); 118 | //test the performance on the given set 119 | float Test(const Params ¶m, DataSet &testSet); 120 | }; 121 | 122 | template 123 | void Kernel_optim::SetParameter(float gamma_a , float eta_a) { 124 | this->gamma = gamma_a; 125 | this->eta0 = eta_a; 126 | } 127 | 128 | 129 | template 130 | Kernel_optim::Kernel_optim(const Params ¶m,DataSet &dataset, 131 | LossFunction &lossFunc): dataSet(dataset) 132 | { 133 | this->lossFunc = &lossFunc; 134 | //this->eta0 = init_eta;///////////////////////////////////////// 135 | this->curIterNum = 0; 136 | 137 | this->size_SV=0; 138 | this->SV_begin=NULL; 139 | this->SV_end=NULL; 140 | this->weight=param.weight_sum; 141 | //this->sigma=sigma_kernel; 142 | } 143 | 144 | ////////////////////////////// 145 | 146 | template 147 | float Kernel_optim::Train() 148 | { 149 | float 
errorNum=0; 150 | if(dataSet.Rewind() == false) 151 | return 1.f; 152 | //reset 153 | while(1) 154 | { 155 | const DataChunk &chunk = dataSet.GetChunk(); 156 | //all the data has been processed! 157 | if(chunk.dataNum == 0) 158 | break; 159 | 160 | for (size_t i = 0; i < chunk.dataNum; i++) 161 | { 162 | if(curIterNum%10000==0) 163 | cout<curIterNum++; 166 | const DataPoint &data = chunk.data[i]; 167 | float y = this->UpdateWeightVec(data); 168 | //loss 169 | if (this->lossFunc->IsCorrect(data.label,y) == false) 170 | { 171 | errorNum++; 172 | } 173 | } 174 | dataSet.FinishRead(); 175 | } 176 | cout<<"\n#Training Instances:"< 182 | float Kernel_optim::Learn(float &aveErrRate, float &varErrRate, 183 | float &sparseRate, int numOfTimes) 184 | { 185 | float * errorRateVec = new float[numOfTimes]; 186 | 187 | for (int i = 0; i < numOfTimes; i++) 188 | { 189 | //random order 190 | 191 | errorRateVec[i] = this->Train(); 192 | } 193 | aveErrRate = Average(errorRateVec, numOfTimes); 194 | varErrRate = Variance(errorRateVec, numOfTimes); 195 | sparseRate=1; 196 | 197 | delete []errorRateVec; 198 | 199 | return aveErrRate; 200 | } 201 | 202 | //learn a model 203 | template 204 | float Kernel_optim::Learn(int numOfTimes) 205 | { 206 | float aveErrRate, varErrRate, sparseRate; 207 | return this->Learn(aveErrRate, varErrRate,sparseRate, numOfTimes); 208 | }//??? 209 | 210 | //test the performance on the given set 211 | template 212 | float Kernel_optim::Test(const Params ¶m,DataSet &testSet) 213 | { 214 | if(param.ave==0) 215 | { 216 | begin_test(); 217 | } 218 | if(testSet.Rewind() == false) 219 | exit(0); 220 | float errorRate(0); 221 | //test 222 | while(1) 223 | { 224 | const DataChunk &chunk = testSet.GetChunk(); 225 | if(chunk.dataNum == 0) //"all the data has been processed!" 
226 | break; 227 | for (size_t i = 0; i < chunk.dataNum; i++) 228 | { 229 | const DataPoint &data = chunk.data[i]; 230 | //predict 231 | float predict = this->Predict(data); 232 | if (this->lossFunc->IsCorrect(data.label,predict) == false) 233 | errorRate++; 234 | } 235 | testSet.FinishRead(); 236 | } 237 | errorRate /= testSet.size(); 238 | return errorRate; 239 | } 240 | 241 | 242 | template 243 | float Kernel_optim::kern(const DataPoint &SV_data,const DataPoint &x) 244 | { 245 | float sum=0; 246 | int i=0; 247 | int j=0; 248 | int size_SV_dimension=SV_data.indexes.size(); 249 | int size_data_dimension=x.indexes.size(); 250 | 251 | 252 | while((i!=size_SV_dimension)&&(j!=size_data_dimension)) 253 | { 254 | if((SV_data.indexes[i])>(x.indexes[j])) 255 | { 256 | sum=sum+x.features[j]*x.features[j]; 257 | j++; 258 | } 259 | else if((SV_data.indexes[i])<(x.indexes[j])) 260 | { 261 | sum=sum+SV_data.features[i]*SV_data.features[i]; 262 | i++; 263 | } 264 | else 265 | { 266 | sum=sum+(SV_data.features[i]-x.features[j])*(SV_data.features[i]-x.features[j]); 267 | i++; 268 | j++; 269 | } 270 | } 271 | if(i==size_SV_dimension)//i first reach the end 272 | { 273 | for(int a=j; agamma); 288 | float a=exp(sum); 289 | return a; 290 | 291 | } 292 | 293 | 294 | template 295 | void Kernel_optim::add_SV(SV *p_newSV) 296 | { 297 | if(SV_end!=NULL) 298 | { 299 | SV_end->next=p_newSV; 300 | SV_end=p_newSV; 301 | } 302 | else 303 | { 304 | SV_begin=p_newSV; 305 | SV_end=p_newSV; 306 | } 307 | size_SV++; 308 | } 309 | 310 | template 311 | void Kernel_optim::delete_SV(int index_SV) 312 | { 313 | //index_SV is the index of SV to be deleted from 0 to B-1 314 | SV* p_delete=SV_begin; 315 | SV* q_delete=NULL; 316 | if((index_SV!=0)&&(index_SV!=size_SV-1)) 317 | { 318 | int i=0; 319 | while(inext; 322 | i++; 323 | } 324 | q_delete=p_delete->next; 325 | p_delete->next=q_delete->next; 326 | delete q_delete; 327 | } 328 | else if(index_SV==0) 329 | { 330 | SV_begin=p_delete->next; 331 | delete 
p_delete; 332 | } 333 | else 334 | { 335 | int i=0; 336 | while(inext; 339 | i++; 340 | } 341 | q_delete=p_delete->next; 342 | p_delete->next=NULL; 343 | delete q_delete; 344 | SV_end=p_delete; 345 | } 346 | size_SV--; 347 | } 348 | template 349 | void Kernel_optim::sum_SV() 350 | { 351 | //float weight_now = (float(weight+1))/(float(curIterNum+weight)); 352 | SV* p_sum=SV_begin; 353 | while(p_sum!=NULL) 354 | { 355 | //p_sum->SV_alpha_sum=p_sum->SV_alpha_sum*(1-weight_now)+p_sum->SV_alpha*weight_now; 356 | p_sum->SV_alpha_sum=p_sum->SV_alpha_sum+p_sum->SV_alpha; 357 | p_sum=p_sum->next; 358 | } 359 | } 360 | 361 | 362 | 363 | } 364 | -------------------------------------------------------------------------------- /src/kernel/kernel_pa.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kernel_optim.h" 4 | 5 | 6 | namespace SOL 7 | { 8 | template 9 | class kernel_pa: public Kernel_optim 10 | { 11 | public: 12 | kernel_pa(const Params ¶m,DataSet &dataset, 13 | LossFunction &lossFunc); 14 | virtual ~kernel_pa(); 15 | float C; 16 | protected: 17 | //this is the core of different updating algorithms 18 | virtual float UpdateWeightVec(const DataPoint &x); 19 | virtual float Predict(const DataPoint &data); 20 | virtual void begin_test(void){} 21 | }; 22 | 23 | template 24 | kernel_pa::kernel_pa(const Params ¶m, 25 | DataSet &dataset, 26 | LossFunction &lossFunc): Kernel_optim(param,dataset, lossFunc) 27 | { 28 | this->id_str = "kernel_pa"; 29 | this->C=param.C; 30 | } 31 | 32 | template 33 | kernel_pa::~kernel_pa() 34 | { 35 | } 36 | 37 | //update weight vector with stochastic gradient descent 38 | template 39 | float kernel_pa::UpdateWeightVec(const DataPoint &x) 40 | { 41 | float y = this->Predict(x); 42 | 43 | float lt=1-x.label*y; 44 | //cout<0) 49 | { 50 | SV* support = new SV(x.label*lt,x); 51 | add_SV(support); 52 | } 53 | return y; 54 | } 55 | 56 | 57 | template 58 | float kernel_pa::Predict(const 
DataPoint &data) 59 | { 60 | float predict = 0; 61 | 62 | SV* p_predict = this->SV_begin; 63 | while (p_predict!=NULL) 64 | { 65 | predict+=p_predict->SV_alpha* kern(p_predict->SV_data,data); 66 | p_predict=p_predict->next; 67 | } 68 | return predict; 69 | } 70 | 71 | 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/kernel/kernel_perceptron.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #pragma once 4 | 5 | #include "kernel_optim.h" 6 | 7 | namespace SOL 8 | { 9 | template 10 | class kernel_perceptron: public Kernel_optim 11 | { 12 | public: 13 | kernel_perceptron(const Params ¶m,DataSet &dataset, 14 | LossFunction &lossFunc); 15 | virtual ~ kernel_perceptron(); 16 | 17 | protected: 18 | //this is the core of different updating algorithms 19 | virtual float UpdateWeightVec(const DataPoint &x); 20 | virtual float Predict(const DataPoint &data); 21 | virtual void begin_test(void){} 22 | }; 23 | 24 | template 25 | kernel_perceptron:: kernel_perceptron(const Params ¶m, 26 | DataSet &dataset, 27 | LossFunction &lossFunc): Kernel_optim(param,dataset, lossFunc) 28 | { 29 | this->id_str = " kernel_perceptron"; 30 | } 31 | 32 | template 33 | kernel_perceptron::~ kernel_perceptron() 34 | { 35 | } 36 | 37 | //update weight vector with stochastic gradient descent 38 | template 39 | float kernel_perceptron::UpdateWeightVec(const DataPoint &x) 40 | { 41 | float y = this->Predict(x); 42 | 43 | if (y*x.label<=0) 44 | { 45 | SV* support = new SV(x.label,x); 46 | 47 | this->add_SV(support); 48 | } 49 | return y; 50 | } 51 | template 52 | float kernel_perceptron::Predict(const DataPoint &data) 53 | { 54 | float predict = 0; 55 | 56 | SV* p_predict = this->SV_begin; 57 | while (p_predict!=NULL) 58 | { 59 | predict+=p_predict->SV_alpha* this->kern(p_predict->SV_data,data); 60 | p_predict=p_predict->next; 61 | } 62 | return predict; 63 | } 64 | 65 | 66 | } 67 | 
-------------------------------------------------------------------------------- /src/kernel/kernel_projectron.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kernel_optim.h" 4 | 5 | namespace SOL 6 | { 7 | template 8 | class kernel_projectron: public Kernel_optim 9 | { 10 | 11 | protected: 12 | s_array K_inverse; 13 | int Budget; 14 | public: 15 | kernel_projectron(const Params ¶m,DataSet &dataset, 16 | LossFunction &lossFunc); 17 | virtual ~ kernel_projectron(); 18 | 19 | protected: 20 | //this is the core of different updating algorithms 21 | virtual float UpdateWeightVec(const DataPoint &x); 22 | virtual float Predict(const DataPoint &data); 23 | virtual void begin_test(void){}; 24 | }; 25 | 26 | template 27 | kernel_projectron:: kernel_projectron(const Params ¶m, 28 | DataSet &dataset, 29 | LossFunction &lossFunc): Kernel_optim(param,dataset, lossFunc) 30 | { 31 | this->Budget=param.Budget_set; 32 | this->id_str = " kernel_projectron"; 33 | this->K_inverse.resize(Budget*Budget); 34 | this->K_inverse.zeros(); 35 | } 36 | 37 | template 38 | kernel_projectron::~kernel_projectron() 39 | { 40 | } 41 | 42 | //add by yuewu: 2013/12/11 43 | //Memory optimization 44 | 45 | //update weight vector with stochastic gradient descent 46 | template 47 | float kernel_projectron::UpdateWeightVec(const DataPoint &x) 48 | { 49 | float y=0; 50 | float *k_t=NULL; 51 | //calculate k_t 52 | if(this->size_SV!=0) 53 | { 54 | SV* p_predict=this->SV_begin; 55 | k_t=new float [this->size_SV]; 56 | int i=0; 57 | while (p_predict!=NULL) 58 | { 59 | k_t[i]=this->kern(p_predict->SV_data,x); 60 | p_predict=p_predict->next; 61 | i++; 62 | } 63 | 64 | //k_t done 65 | 66 | //get prediction 67 | p_predict=this->SV_begin; 68 | i=0; 69 | while (p_predict!=NULL) 70 | { 71 | y+=p_predict->SV_alpha* k_t[i]; 72 | p_predict=p_predict->next; 73 | i++; 74 | } 75 | } 76 | //prediction is in y 77 | float l_t=1-x.label*y; 78 | // if there 
is mistake, make update 79 | if (y*x.label<=0) 80 | { 81 | if(this->size_SV==0) 82 | { 83 | 84 | SV* support = new SV(x.label,x); 85 | 86 | this->add_SV(support); 87 | 88 | //ini K_inverse 89 | K_inverse[0]=1; 90 | } 91 | else //have SV 92 | { 93 | // calculate d_star=K_t_inver*k_t; 94 | float * d_star=new float [this->size_SV]; 95 | for(int i=0; isize_SV; i++) 96 | { 97 | d_star[i]=0; 98 | for(int j=0; jsize_SV; j++) 99 | { 100 | d_star[i]=d_star[i]+K_inverse[i*Budget+j]*k_t[j]; 101 | } 102 | } 103 | 104 | //caculate delta 105 | double k_t_d_star=0; 106 | for(int i=0; isize_SV; i++) 107 | { 108 | k_t_d_star=k_t_d_star+k_t[i]*d_star[i]; 109 | } 110 | double delta_project=1-k_t_d_star; 111 | 112 | 113 | //full budget projectron 114 | if(this->size_SV==Budget) 115 | { 116 | SV *p_predict=this->SV_begin; 117 | for(int i=0; isize_SV; i++) 118 | { 119 | p_predict->SV_alpha=p_predict->SV_alpha+x.label*d_star[i]; 120 | p_predict=p_predict->next; 121 | } 122 | } 123 | else // not full 124 | { 125 | //add SV 126 | 127 | SV* support = new SV(x.label,x); 128 | this->add_SV(support); 129 | 130 | 131 | //updata K_inverse 132 | for(int i=0; isize_SV-1; i++) 133 | { 134 | for(int j=0; jsize_SV-1; j++) 135 | { 136 | K_inverse[i*Budget+j]=K_inverse[i*Budget+j]+d_star[i]*d_star[j]/delta_project; 137 | } 138 | } 139 | for(int i=0; isize_SV-1; i++) 140 | { 141 | K_inverse[i*Budget+this->size_SV-1]=(-1)*d_star[i]/delta_project; 142 | K_inverse[(this->size_SV-1)*Budget+i]=(-1)*d_star[i]/delta_project; 143 | } 144 | K_inverse[(this->size_SV-1)*Budget+(this->size_SV-1)]=1/delta_project; 145 | } 146 | delete[] d_star; 147 | } 148 | } 149 | delete[] k_t; 150 | return y; 151 | } 152 | 153 | 154 | template 155 | float kernel_projectron::Predict(const DataPoint &data) 156 | { 157 | float predict = 0; 158 | 159 | SV* p_predict = this->SV_begin; 160 | while (p_predict!=NULL) 161 | { 162 | predict+=p_predict->SV_alpha* this->kern(p_predict->SV_data,data); 163 | p_predict=p_predict->next; 164 | } 
165 | return predict; 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/kernel/kernel_projectronpp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kernel_optim.h" 4 | 5 | namespace SOL 6 | { 7 | template 8 | class kernel_projectronpp: public Kernel_optim 9 | { 10 | 11 | protected: 12 | int Budget; 13 | float U; 14 | s_array K_inverse; 15 | s_array K_t; 16 | 17 | public: 18 | kernel_projectronpp(const Params ¶m,DataSet &dataset, 19 | LossFunction &lossFunc); 20 | virtual ~kernel_projectronpp(); 21 | 22 | protected: 23 | //this is the core of different updating algorithms 24 | virtual float UpdateWeightVec(const DataPoint &x); 25 | virtual float Predict(const DataPoint &data); 26 | virtual void begin_test(void){} 27 | }; 28 | 29 | template 30 | kernel_projectronpp:: kernel_projectronpp(const Params ¶m, 31 | DataSet &dataset, 32 | LossFunction &lossFunc): Kernel_optim(param,dataset, lossFunc) 33 | { 34 | this->id_str = " kernel_projectronpp"; 35 | this->Budget=param.Budget_set; 36 | this->U=(1.f/4.f)*sqrtf((Budget+1.f)/logf(Budget+1.f)); 37 | 38 | this->K_inverse.resize(Budget*Budget); 39 | this->K_inverse.zeros(); 40 | 41 | this->K_t.resize(Budget * Budget); 42 | this->K_t.zeros(); 43 | } 44 | 45 | template 46 | kernel_projectronpp::~ kernel_projectronpp() 47 | { 48 | } 49 | 50 | 51 | //update weight vector with stochastic gradient descent 52 | template 53 | float kernel_projectronpp::UpdateWeightVec(const DataPoint &x) 54 | { 55 | float y=0; 56 | float *k_t=NULL; 57 | //calculate k_t 58 | if(this->size_SV!=0) 59 | { 60 | SV* p_predict=this->SV_begin; 61 | k_t=new float [this->size_SV]; 62 | int i=0; 63 | 64 | while (p_predict!=NULL) 65 | { 66 | k_t[i]=this->kern(p_predict->SV_data,x); 67 | p_predict=p_predict->next; 68 | i++; 69 | } 70 | //k_t done 71 | 72 | //get prediction 73 | p_predict=this->SV_begin; 74 | i=0; 75 | while 
(p_predict!=NULL) 76 | { 77 | y+=p_predict->SV_alpha* k_t[i]; 78 | p_predict=p_predict->next; 79 | i++; 80 | } 81 | } 82 | //prediction is in y 83 | 84 | // if there is mistake, make update 85 | if(this->size_SV==0) 86 | { 87 | 88 | SV* support = new SV(x.label,x); 89 | 90 | this->add_SV(support); 91 | 92 | //ini K_inverse 93 | K_inverse[0]=1; 94 | K_t[0]=1; 95 | } 96 | else //have SV 97 | { 98 | float l_t=1-x.label*y; 99 | if(y*x.label<=0) 100 | { 101 | // calculate d_star=K_t_inver*k_t; 102 | float * d_star=new float [this->size_SV]; 103 | for(int i=0; isize_SV; i++) 104 | { 105 | d_star[i]=0; 106 | for(int j=0; jsize_SV; j++) 107 | { 108 | d_star[i]=d_star[i]+K_inverse[i*Budget+j]*k_t[j]; 109 | } 110 | } 111 | 112 | //caculate delta 113 | float k_t_d_star=0; 114 | for(int i=0; isize_SV; i++) 115 | { 116 | k_t_d_star=k_t_d_star+k_t[i]*d_star[i]; 117 | } 118 | float delta_project=1-k_t_d_star; 119 | 120 | 121 | //full budget projectron 122 | if(this->size_SV==Budget) 123 | { 124 | SV *p_predict=this->SV_begin; 125 | for(int i=0; iSV_alpha=p_predict->SV_alpha+x.label*d_star[i]; 128 | p_predict=p_predict->next; 129 | } 130 | } 131 | else // not full 132 | { 133 | 134 | SV* support = new SV(x.label,x); 135 | this->add_SV(support); 136 | //updata K_inverse 137 | for(int i=0; isize_SV-1; i++) 138 | { 139 | for(int j=0; jsize_SV-1; j++) 140 | { 141 | K_inverse[i*Budget+j]=K_inverse[i*Budget+j]+d_star[i]*d_star[j]/delta_project; 142 | } 143 | } 144 | for(int i=0; isize_SV-1; i++) 145 | { 146 | K_inverse[i*Budget+this->size_SV-1]=(-1)*d_star[i]/delta_project; 147 | K_inverse[(this->size_SV-1)*Budget+i]=(-1)*d_star[i]/delta_project; 148 | } 149 | K_inverse[(this->size_SV-1)*Budget+(this->size_SV-1)]=1/delta_project; 150 | 151 | //updata K_t 152 | for(int i=0; isize_SV-1; i++) 153 | { 154 | K_t[i*Budget+this->size_SV-1]=k_t[i]; 155 | K_t[(this->size_SV-1)*Budget+i]=k_t[i]; 156 | } 157 | K_t[(this->size_SV-1)*Budget+(this->size_SV-1)]=1;/////////////////////// 158 | } 159 | 
delete[] d_star; 160 | }//mistake 161 | else if((l_t<1)&&(l_t>0)) 162 | { 163 | 164 | float * d_star=new float [this->size_SV]; 165 | for(int i=0; isize_SV; i++) 166 | { 167 | d_star[i]=0; 168 | for(int j=0; jsize_SV; j++) 169 | { 170 | d_star[i]=d_star[i]+K_inverse[i*Budget+j]*k_t[j]; 171 | } 172 | } 173 | 174 | //caculate delta 175 | float k_t_d_star=0; 176 | for(int i=0; isize_SV; i++) 177 | { 178 | k_t_d_star=k_t_d_star+k_t[i]*d_star[i]; 179 | } 180 | float delta_project=1-k_t_d_star; 181 | 182 | float power_p_k_t=0; 183 | 184 | for(int i=0; isize_SV; i++) 185 | { 186 | for(int j=0; jsize_SV; j++) 187 | { 188 | power_p_k_t=power_p_k_t+K_t[i*Budget+j]*d_star[i]*d_star[j]; 189 | } 190 | } 191 | 192 | float tau_t= (std::min)(l_t/power_p_k_t,1.f); 193 | float beta_t=tau_t*(2*l_t-tau_t*power_p_k_t-2*U*sqrt(delta_project)); 194 | if(beta_t>=0) 195 | { 196 | SV *p_predict=this->SV_begin; 197 | for(int i=0; isize_SV; i++) 198 | { 199 | p_predict->SV_alpha=p_predict->SV_alpha+tau_t*d_star[i]*x.label; 200 | p_predict=p_predict->next; 201 | } 202 | } 203 | delete[] d_star; 204 | }//margin loss 205 | }//have SV 206 | delete[] k_t; 207 | 208 | return y; 209 | } 210 | template 211 | float kernel_projectronpp::Predict(const DataPoint &data) 212 | { 213 | float predict = 0; 214 | 215 | SV* p_predict = this->SV_begin; 216 | while (p_predict!=NULL) 217 | { 218 | predict+=p_predict->SV_alpha* this->kern(p_predict->SV_data,data); 219 | p_predict=p_predict->next; 220 | } 221 | return predict; 222 | } 223 | 224 | 225 | } 226 | -------------------------------------------------------------------------------- /src/kernel/kernel_sgd.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kernel_optim.h" 4 | 5 | 6 | namespace SOL 7 | { 8 | template 9 | class kernel_sgd: public Kernel_optim 10 | { 11 | public: 12 | kernel_sgd(const Params ¶m,DataSet &dataset, 13 | LossFunction &lossFunc); 14 | virtual ~kernel_sgd(); 15 | 16 | 
protected: 17 | //this is the core of different updating algorithms 18 | virtual float UpdateWeightVec(const DataPoint &x); 19 | virtual float Predict(const DataPoint &data); 20 | virtual void begin_test(void){} 21 | }; 22 | 23 | template 24 | kernel_sgd::kernel_sgd(const Params ¶m, 25 | DataSet &dataset, 26 | LossFunction &lossFunc): Kernel_optim(param,dataset, lossFunc) 27 | { 28 | this->id_str = "kernel_ogd"; 29 | this->eta0=param.eta; 30 | } 31 | 32 | template 33 | kernel_sgd::~kernel_sgd() 34 | { 35 | } 36 | 37 | //update weight vector with stochastic gradient descent 38 | template 39 | float kernel_sgd::UpdateWeightVec(const DataPoint &x) 40 | { 41 | float y = this->Predict(x); 42 | 43 | float gt_i = this->lossFunc->GetGradient(x.label,y); 44 | 45 | if(gt_i!=0) 46 | { 47 | SV* support = new SV(-this->eta0 * gt_i,x); 48 | this->add_SV(support); 49 | } 50 | return y; 51 | } 52 | 53 | 54 | template 55 | float kernel_sgd::Predict(const DataPoint &data) 56 | { 57 | float predict = 0; 58 | 59 | SV* p_predict = this->SV_begin; 60 | while (p_predict!=NULL) 61 | { 62 | predict+=p_predict->SV_alpha* this->kern(p_predict->SV_data,data); 63 | p_predict=p_predict->next; 64 | } 65 | return predict; 66 | } 67 | 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/loss/HingeLoss.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: HingeLoss.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/8/18 星期日 16:58:22 5 | > Functions: Hinge Loss function, for SVM 6 | ************************************************************************/ 7 | 8 | #ifndef HEADER_HINGE_LOSS 9 | #define HEADER_HINGE_LOSS 10 | 11 | #include "LossFunction.h" 12 | 13 | namespace SOL { 14 | template 15 | class HingeLoss: public LossFunction { 16 | public: 17 | virtual float GetLoss(LabelType label, float predict) { 18 | 
return (std::max)(0.0f, 1.f - predict * label); 19 | } 20 | 21 | virtual float GetGradient(LabelType label, float predict) { 22 | if (this->GetLoss(label,predict) > 0) 23 | return (float)(-label); 24 | else 25 | return 0; 26 | } 27 | }; 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/loss/LogisticLoss.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: LogisticLoss.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/8/18 星期日 17:11:42 5 | > Functions: Logistic loss for binary classification 6 | ************************************************************************/ 7 | 8 | #pragma once 9 | #include "LossFunction.h" 10 | 11 | namespace SOL { 12 | template 13 | class LogisticLoss: public LossFunction { 14 | public: 15 | virtual float GetLoss(LabelType label, float predict) { 16 | float tmp = -predict * label; 17 | if (tmp > 100.f) return tmp; 18 | else if (tmp < -100.f) return 0.f; 19 | else 20 | return std::log(1.f + std::exp(tmp)); 21 | } 22 | 23 | //aggressive learning 24 | virtual float GetGradient(LabelType label, float predict) { 25 | float tmp = predict * label; 26 | if (tmp > 100.f) //to reject numeric problems 27 | return 0.f; 28 | else if (tmp < -100.f) 29 | return (float)(-label); 30 | else 31 | return -label / (1.f + std::exp(tmp)); 32 | } 33 | }; 34 | } 35 | -------------------------------------------------------------------------------- /src/loss/LossFunction.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: LossFunction.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/8/18 星期日 16:48:55 5 | > Functions: base class for loss function 6 | ************************************************************************/ 7 | 8 | 
#pragma once 9 | #include 10 | #include "../common/util.h" 11 | 12 | namespace SOL { 13 | template 14 | class LossFunction { 15 | inline char Sign(float x) { 16 | if (x > 0.f) 17 | return 1; 18 | else 19 | return -1; 20 | } 21 | 22 | public: 23 | virtual inline bool IsCorrect(LabelType label, float predict) { 24 | return Sign(predict) == label ? true : false; 25 | } 26 | 27 | virtual float GetLoss(LabelType label, float predict) = 0; 28 | virtual float GetGradient(LabelType label, float predict) = 0; 29 | 30 | public: 31 | virtual ~LossFunction(){} 32 | }; 33 | } 34 | -------------------------------------------------------------------------------- /src/loss/SquareLoss.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: SquareLoss.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/8/18 星期日 17:19:33 5 | > Functions: Square Loss 6 | ************************************************************************/ 7 | 8 | #pragma once 9 | #include "LossFunction.h" 10 | 11 | namespace SOL { 12 | template 13 | class SquareLoss: public LossFunction { 14 | public: 15 | virtual float GetLoss(LabelType label, float predict) { 16 | return (predict - label) * (predict - label); 17 | } 18 | 19 | virtual float GetGradient(LabelType label, float predict) { 20 | return 2 * (predict - label); 21 | } 22 | }; 23 | } 24 | -------------------------------------------------------------------------------- /src/loss/SquaredHingeLoss.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: SquaredHingeLoss.h 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/11/27 11:30:44 5 | > Functions: Squared Hinge loss 6 | ************************************************************************/ 7 | #ifndef HEADER_SQUARE_HINGE_LOSS 8 | #define 
HEADER_SQUARE_HINGE_LOSS 9 | 10 | #include "LossFunction.h" 11 | 12 | namespace SOL { 13 | template 14 | class SquaredHingeLoss: public LossFunction { 15 | public: 16 | virtual float GetLoss(LabelType label, float predict) { 17 | float loss = (std::max)(0.0f, 1.f - predict * label); 18 | return loss * loss; 19 | } 20 | 21 | virtual float GetGradient(LabelType label, float predict) { 22 | float loss = (std::max)(0.0f, 1.f - predict * label); 23 | if (loss > 0) 24 | return -label * loss * 2.f; 25 | else 26 | return 0; 27 | } 28 | }; 29 | } 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | > File Name: main.cpp 3 | > Copyright (C) 2013 Yue Wu 4 | > Created Time: 2013/9/20 13:18:02 5 | > Functions: 6 | ************************************************************************/ 7 | #include "Params.h" 8 | #include "common/util.h" 9 | 10 | #include "data/DataSet.h" 11 | #include "data/libsvmread.h" 12 | 13 | #include "loss/LogisticLoss.h" 14 | #include "loss/HingeLoss.h" 15 | #include "loss/SquareLoss.h" 16 | #include "loss/SquaredHingeLoss.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "kernel/kernel_optim.h" 24 | #include "kernel/kernel_perceptron.h" 25 | #include "kernel/kernel_sgd.h" 26 | #include "kernel/kernel_RBP.h" 27 | #include "kernel/kernel_forgetron.h" 28 | #include "kernel/kernel_projectron.h" 29 | #include "kernel/kernel_projectronpp.h" 30 | #include "kernel/kernel_bogd.h" 31 | #include "kernel/kernel_bpas.h" 32 | #include "kernel/kernel_nogd.h" 33 | #include "kernel/kernel_fogd.h" 34 | #include "kernel/kernel_pa.h" 35 | 36 | using namespace std; 37 | using namespace SOL; 38 | 39 | #define FeatType float 40 | #define LabelType char 41 | 42 | ///////////////////////////function 
declarications///////////////////// 43 | void FakeInput(int &argc, char **args, char** &argv); 44 | template LossFunction* GetLossFunc(const Params ¶m); 45 | template 46 | Kernel_optim* GetOptimizer(const Params ¶m, DataSet &dataset, LossFunction &lossFun); 47 | /////////////////// 48 | int main(int argc, const char** args) { 49 | 50 | //check memory leak in VC++ 51 | #if defined(_MSC_VER) && defined(_DEBUG) 52 | int tmpFlag = _CrtSetDbgFlag( _CRTDBG_REPORT_FLAG ); 53 | tmpFlag |= _CRTDBG_LEAK_CHECK_DF; 54 | _CrtSetDbgFlag( tmpFlag ); 55 | #endif 56 | Params param; 57 | if (param.Parse(argc, args) == false){ 58 | return -1; 59 | } 60 | 61 | LossFunction *lossFunc = GetLossFunc(param); 62 | if(lossFunc == NULL) 63 | return -1; 64 | 65 | DataSet dataset(param.passNum,param.buf_size); 66 | if (dataset.Load(param.fileName, param.cache_fileName) == false){ 67 | cerr<<"ERROR: Load dataset "< *opti = GetOptimizer(param,dataset,*lossFunc); 73 | if (opti == NULL) 74 | return -1; 75 | 76 | opti->SetParameter(param.gamma,param.eta); 77 | 78 | float l_errRate(0), l_varErr(0); //learning error rate 79 | float sparseRate(0); 80 | opti->PrintOptInfo(); 81 | //learning the model 82 | double time1 = get_current_time(); 83 | 84 | opti->Learn(l_errRate,l_varErr,sparseRate); 85 | 86 | double time2 = get_current_time(); 87 | 88 | printf("\nLearn acuracy: %.6f%%\n",(1-l_errRate)* 100); 89 | cout<<"#SV:"<size_SV< 0 || param.test_fileName.length() > 0; 96 | if ( is_test) { 97 | DataSet testset(1,param.buf_size); 98 | if (testset.Load(param.test_fileName, param.test_cache_fileName) == true) { 99 | float t_errRate(0); //test error rate 100 | t_errRate = opti->Test(param,testset); 101 | time3 = get_current_time(); 102 | 103 | printf("Test acuracy: %.6f %%\n",(1-t_errRate) * 100); 104 | } 105 | else 106 | cout<<"load test set failed!"< 120 | LossFunction* GetLossFunc(const Params ¶m) { 121 | if (param.str_loss == "Hinge") 122 | return new HingeLoss(); 123 | else if (param.str_loss == "Logit") 
124 | return new LogisticLoss(); 125 | else if (param.str_loss == "Square") 126 | return new SquareLoss(); 127 | else if (param.str_loss == "SquareHinge") 128 | return new SquaredHingeLoss(); 129 | else{ 130 | cerr<<"ERROR: unrecognized Loss function "< 137 | Kernel_optim* GetOptimizer(const Params ¶m, DataSet &dataset, LossFunction &lossFunc) { 138 | string method = param.str_opt; 139 | ToUpperCase(method); 140 | const char* c_str = method.c_str(); 141 | if (strcmp(c_str, "KERNEL-PERCEPTRON") == 0) 142 | return new kernel_perceptron(param,dataset,lossFunc); 143 | else if (strcmp(c_str, "KERNEL-OGD") == 0) 144 | return new kernel_sgd(param,dataset,lossFunc); 145 | else if (strcmp(c_str, "KERNEL-RBP") == 0) 146 | return new kernel_RBP(param,dataset,lossFunc); 147 | else if (strcmp(c_str, "KERNEL-FORGETRON") == 0) 148 | return new kernel_forgetron(param,dataset,lossFunc); 149 | 150 | else if (strcmp(c_str, "KERNEL-PROJECTRON") == 0) 151 | return new kernel_projectron(param,dataset,lossFunc); 152 | else if (strcmp(c_str, "KERNEL-PROJECTRONPP") == 0) 153 | return new kernel_projectronpp(param,dataset,lossFunc); 154 | else if (strcmp(c_str, "KERNEL-BOGD") == 0) 155 | return new kernel_bogd(param,dataset,lossFunc); 156 | else if (strcmp(c_str, "KERNEL-BPAS") == 0) 157 | return new kernel_bpas(param,dataset,lossFunc); 158 | else if (strcmp(c_str, "KERNEL-FOGD") == 0) 159 | return new kernel_fogd(param,dataset,lossFunc); 160 | else if (strcmp(c_str, "KERNEL-NOGD") == 0) 161 | return new kernel_nogd(param,dataset,lossFunc); 162 | else{ 163 | cerr<<"ERROR: unrecgonized optimization method "<