├── ann
│   ├── ReadMe.md
│   └── neural_network.go
├── core
│   ├── ReadMe.md
│   ├── feature.go
│   ├── vector_test.go
│   ├── matrix_test.go
│   ├── mock_dataset.go
│   ├── label_preprocessing.go
│   ├── matrix.go
│   ├── array_vector.go
│   ├── sample.go
│   ├── vector.go
│   ├── feature_analyze.go
│   └── dataset.go
├── bin
│   ├── hector-preprocessor
│   ├── install
│   ├── test.go
│   ├── hector-feature-combination.go
│   ├── hector-mc-run.go
│   ├── hector-preprocessor.go
│   ├── hector-mc-cv.go
│   └── hector-stack.go
├── .travis.yml
├── algo
│   ├── clustering.go
│   ├── regressor.go
│   └── classifier.go
├── lr
│   ├── diff_function.go
│   ├── terminal_criterion.go
│   ├── minimizer_test.go
│   ├── linear_regression.go
│   ├── lbfgs_minimizer.go
│   ├── logistic_regression.go
│   ├── logistic_regression_streaming.go
│   ├── ftrl_logistic_regression.go
│   ├── quasinewton_helper.go
│   ├── lr_owlqn.go
│   ├── ep_logistic_regression.go
│   └── owlqn_minimizer.go
├── util
│   ├── hash_util.go
│   ├── string_util.go
│   └── math_util.go
├── .gitignore
├── regressor_test.go
├── hectorun
│   └── hector-run.go
├── hectorstream
│   └── hectorstream.go
├── fanaly
│   └── fanaly.go
├── LICENSE
├── eval
│   ├── evaluation_test.go
│   └── evaluation.go
├── gp
│   ├── covariance_function.go
│   └── gaussian_process.go
├── hectorcv
│   └── hector-cv.go
├── svm
│   ├── knn.go
│   ├── l1vm.go
│   ├── linear_svm.go
│   └── svm.go
├── hectorserver
│   └── hectorserver.go
├── sa
│   └── sa_auc.go
├── classifier_test.go
├── dt
│   ├── gbdt.go
│   ├── random_forest.go
│   ├── regression_tree.go
│   ├── random_decision_tree.go
│   └── cart.go
├── fm
│   └── factorize_machine.go
├── README.md
├── mc_runner.go
├── combine
│   └── category_feature_combination.go
├── algo_runner.go
└── params.go

/ann/ReadMe.md:
--------------------------------------------------------------------------------
Neural Network
--------------------------------------------------------------------------------
/core/ReadMe.md:
--------------------------------------------------------------------------------
Defines the base structures, such as Feature, Sample, and DataSet
--------------------------------------------------------------------------------
/bin/hector-preprocessor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xlvector/hector/HEAD/bin/hector-preprocessor
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: go

go:
  - 1.4.1

install:
  - go test github.com/xlvector/hector
--------------------------------------------------------------------------------
/algo/clustering.go:
--------------------------------------------------------------------------------
package algo

import (
    "github.com/xlvector/hector/core"
)

type Clustering interface {
    Init(params map[string]string)
    Cluster(dataset core.DataSet)
}
--------------------------------------------------------------------------------
/bin/install:
--------------------------------------------------------------------------------
#!/bin/bash

go build hector-cv.go
go build hector-run.go
go build hector-mc-cv.go

sudo cp hector-cv /usr/local/bin
sudo cp hector-run /usr/local/bin
sudo cp hector-mc-cv /usr/local/bin
--------------------------------------------------------------------------------
/lr/diff_function.go:
--------------------------------------------------------------------------------
package lr

import (
    "github.com/xlvector/hector/core"
)

// Description: differentiable objective function for minimizers
// such as LBFGS and OWLQN
type DiffFunction interface {
    Value(pos *core.Vector) float64
    Gradient(pos *core.Vector) *core.Vector
}
--------------------------------------------------------------------------------
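Any struct with Value and Gradient methods satisfies DiffFunction and can be handed to the LBFGS/OWLQN minimizers defined later in this package (lr/minimizer_test.go below does exactly that with a weighted quadratic). A minimal sketch, assuming only the sparse core.Vector API used elsewhere in this dump; the paraboloid type is hypothetical:

    package lr

    import "github.com/xlvector/hector/core"

    // paraboloid is f(x) = sum_i (x_i - 1)^2, minimized at x_i = 1.
    type paraboloid struct{}

    func (p *paraboloid) Value(pos *core.Vector) float64 {
        ret := 0.0
        for _, v := range pos.Data {
            ret += (v - 1.0) * (v - 1.0)
        }
        return ret
    }

    func (p *paraboloid) Gradient(pos *core.Vector) *core.Vector {
        g := core.NewVector()
        for k, v := range pos.Data {
            g.SetValue(k, 2.0*(v-1.0))
        }
        return g
    }

Handing &paraboloid{} and a starting core.Vector to NewLBFGSMinimizer().Minimize should then drive every coordinate present in the vector toward 1.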
/bin/test.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "reflect"
)

type A interface {
    f(a int) int
}

type B struct {
    c int
}

func (self *B) f(a int) int {
    return a + self.c
}

func main() {
    var x A
    x = &(B{3})
    fmt.Println(reflect.TypeOf(x))
}
--------------------------------------------------------------------------------
/core/feature.go:
--------------------------------------------------------------------------------
package core

type FeatureType int

var FeatureTypeEnum = struct {
    DISCRETE_FEATURE   FeatureType
    CONTINUOUS_FEATURE FeatureType
}{0, 1}

func GetFeatureType(key string) FeatureType {
    if key[0] == '#' {
        return FeatureTypeEnum.DISCRETE_FEATURE
    } else {
        return FeatureTypeEnum.CONTINUOUS_FEATURE
    }
}

type Feature struct {
    Id    int64
    Value float64
}
--------------------------------------------------------------------------------
/util/hash_util.go:
--------------------------------------------------------------------------------
package util

// CombineFeatures hashes a list of feature ids into a single id.
// It relies on int64 wrap-around and forces the result non-negative.
func CombineFeatures(fids []int64) int64 {
    ret := int64(0)

    for _, fid := range fids {
        ret *= 601840361
        ret += fid
    }
    if ret < 0 {
        ret *= -1
    }
    return ret
}

// Hash maps a string to a non-negative int64 feature id.
func Hash(str string) int64 {
    h := int64(0)

    for _, ch := range str {
        h *= 601840361
        h += int64(ch)
    }
    if h < 0 {
        return -1 * h
    }
    return h
}
--------------------------------------------------------------------------------
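Feature names are not stored as strings at training time; they are hashed to int64 ids (hectorserver.go below does exactly this for incoming JSON features, and feature.go treats names starting with '#' as discrete). A small sketch, with an illustrative feature name:

    package main

    import (
        "fmt"

        "github.com/xlvector/hector/core"
        "github.com/xlvector/hector/util"
    )

    func main() {
        // "#color=red" starts with '#', so it is treated as a discrete feature.
        name := "#color=red"
        fmt.Println(core.GetFeatureType(name) == core.FeatureTypeEnum.DISCRETE_FEATURE) // true

        // Named features become (hashed id, value) pairs.
        f := core.Feature{Id: util.Hash(name), Value: 1.0}
        fmt.Println(f.Id >= 0) // Hash always returns a non-negative id.
    }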
/algo/regressor.go:
--------------------------------------------------------------------------------
package algo

import (
    "github.com/xlvector/hector/core"
)

type Regressor interface {

    // Set training parameters from parameter map
    Init(params map[string]string)

    // Train model on a given dataset
    Train(dataset *core.RealDataSet)

    // Predict the output of an input sample
    Predict(sample *core.RealSample) float64

    SaveModel(path string)
    LoadModel(path string)
}
--------------------------------------------------------------------------------
/core/vector_test.go:
--------------------------------------------------------------------------------
package core

import (
    "math"
    "testing"
)

func TestArrayVector(t *testing.T) {
    a := NewArrayVector()
    precision := 1e-9

    a.AddValue(3, 1.78)

    if math.Abs(a.GetValue(3)-1.78) > precision {
        t.Error("Get wrong value after set value")
    }

    a.AddValue(3, -1.1)

    if math.Abs(a.GetValue(3)-0.68) > precision {
        t.Error("Add value wrong")
    }

    a.Scale(0.5)

    if math.Abs(a.GetValue(3)-0.34) > precision {
        t.Error("Scale wrong")
    }
}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data
hector-mc-cv
hector-mc-run
hector-stack
hector-server
code.google.com
hector-cv
hector-run
hector-feature-combination
data/
*.train
*.test
dwfa
*.csv
*.txt
*.zip
*.py
*.data
dw
dwcv
test
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe

*.tsv
--------------------------------------------------------------------------------
/regressor_test.go:
--------------------------------------------------------------------------------
package hector

import (
    "github.com/xlvector/hector/core"
    "testing"
)

func TestRegressorOnSin(t *testing.T) {
    algos := []string{"gp"}

    params := make(map[string]string)
    params["dim"] = "1"

    for _, algo := range algos {
        train_dataset := core.SinusoidalDataSet(100)
        test_dataset := core.SinusoidalDataSet(50)
        regressor := GetRegressor(algo)
        regressor.Init(params)
        rmse, _ := RegAlgorithmRunOnDataSet(regressor, train_dataset, test_dataset, "", params)

        t.Logf("rmse of %s in sinusoidal dataset is %f", algo, rmse)
        if rmse > 0.1 {
            t.Error("rmse greater than 0.1 in sinusoidal dataset")
        }
    }
}
--------------------------------------------------------------------------------
/core/matrix_test.go:
--------------------------------------------------------------------------------
package core

import (
    "math"
    "testing"
)

func TestMatrix(t *testing.T) {
    a := NewMatrix()
    precision := 1e-9

    a.AddValue(3, 4, 1.78)

    if math.Abs(a.GetValue(3, 4)-1.78) > precision {
        t.Error("Get wrong value after set value")
    }

    a.AddValue(3, 4, -1.1)

    if math.Abs(a.GetValue(3, 4)-0.68) > precision {
        t.Error("Add value wrong")
    }

    b := NewMatrix()

    for i := 0; i < 10; i++ {
        for j := 0; j < 10; j++ {
            b.SetValue(int64(i), int64(j), 1.0)
        }
    }

    c := b.Scale(2.0)

    if math.Abs(c.GetValue(7, 8)-2.0) > precision {
        t.Error("scale function error")
    }
}
--------------------------------------------------------------------------------
/bin/hector-feature-combination.go:
--------------------------------------------------------------------------------
package main

import (
    "github.com/xlvector/hector"
    "github.com/xlvector/hector/combine"
    "github.com/xlvector/hector/core"
    "os"
    "strings"
)

func main() {
    train, _, _, _, params := hector.PrepareParams()

    feature_combination := combine.CategoryFeatureCombination{}
    feature_combination.Init(params)

    dataset := core.NewRawDataSet()
    dataset.Load(train)

    combinations := feature_combination.FindCombination(dataset)

    output := params["output"]

    file, _ := os.Create(output)
    defer file.Close()

    for _, combination := range combinations {
        file.WriteString(strings.Join(combination, "\t") + "\n")
    }
}
--------------------------------------------------------------------------------
/hectorun/hector-run.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "runtime"

    "github.com/xlvector/hector"
)

func main() {
    train, test, pred, method, params := hector.PrepareParams()

    action, _ := params["action"]

    classifier := hector.GetClassifier(method)
    runtime.GOMAXPROCS(runtime.NumCPU())
    if action == "" {
        auc, _, _ := hector.AlgorithmRun(classifier, train, test, pred, params)
        fmt.Println("AUC:")
        fmt.Println(auc)
    } else if action == "train" {
        hector.AlgorithmTrain(classifier, train, params)
    } else if action == "test" {
        auc, _, _ := hector.AlgorithmTest(classifier, test, pred, params)
        fmt.Println("AUC:")
        fmt.Println(auc)
    }
}
--------------------------------------------------------------------------------
/algo/classifier.go:
--------------------------------------------------------------------------------
package algo

import (
    "github.com/xlvector/hector/core"
)

type Classifier interface {

    // Set training parameters from parameter map
    Init(params map[string]string)

    // Train model on a given dataset
    Train(dataset *core.DataSet)

    // Predict the probability of a sample to be positive
    Predict(sample *core.Sample) float64

    SaveModel(path string)
    LoadModel(path string)
}

type MultiClassClassifier interface {
    // Set training parameters from parameter map
    Init(params map[string]string)

    // Train model on a given dataset
    Train(dataset *core.DataSet)

    // Predict a score for each class of a sample
    PredictMultiClass(sample *core.Sample) *core.ArrayVector

    SaveModel(path string)
    LoadModel(path string)
}
--------------------------------------------------------------------------------
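Because every algorithm implements this interface, training code never depends on the concrete model. A hedged sketch using the synthetic dataset from core/mock_dataset.go and parameter values borrowed from classifier_test.go (they are illustrative, not tuned defaults):

    package main

    import (
        "fmt"

        "github.com/xlvector/hector"
        "github.com/xlvector/hector/core"
    )

    func main() {
        params := map[string]string{
            "learning-rate":  "0.05",
            "regularization": "0.0001",
            "steps":          "10",
        }

        classifier := hector.GetClassifier("lr") // any name from classifier_test.go works
        classifier.Init(params)

        dataset := core.LinearDataSet(1000) // synthetic data from core/mock_dataset.go
        classifier.Train(dataset)

        p := classifier.Predict(dataset.Samples[0])
        fmt.Printf("P(label=1) = %f\n", p)
    }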
/hectorstream/hectorstream.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "log"
    "runtime"

    "github.com/xlvector/hector"
    "github.com/xlvector/hector/core"
    "github.com/xlvector/hector/lr"
)

func main() {
    train, test, pred, _, params := hector.PrepareParams()
    log.SetFlags(log.LstdFlags | log.Lshortfile)

    action, _ := params["action"]
    runtime.GOMAXPROCS(runtime.NumCPU())
    if action == "train" {
        classifier := &lr.LogisticRegressionStream{}
        classifier.Init(params)
        data := core.NewStreamingDataSet()
        go data.Load(train, 1)
        classifier.Train(data)
        classifier.SaveModel(params["model"])
    } else if action == "test" {
        classifier := &lr.LogisticRegression{}
        classifier.Init(params)
        auc, _, _ := hector.AlgorithmTest(classifier, test, pred, params)
        fmt.Println("AUC:")
        fmt.Println(auc)
    }
}
--------------------------------------------------------------------------------
/fanaly/fanaly.go:
--------------------------------------------------------------------------------
package main

import (
    "flag"
    "fmt"
    "github.com/xlvector/hector/core"
    "sort"
)

type FeatureValue struct {
    Name  string
    Value float64
}

type FeatureValueList []FeatureValue

func (ms FeatureValueList) Len() int {
    return len(ms)
}

func (ms FeatureValueList) Less(i, j int) bool {
    return ms[i].Value > ms[j].Value
}

func (ms FeatureValueList) Swap(i, j int) {
    ms[i], ms[j] = ms[j], ms[i]
}

func main() {
    path := flag.String("input", "", "path of dataset")
    flag.Parse()

    ds := core.NewDataSet()
    ds.Load(*path, -1)
    iv := core.InformationValue(ds)
    fs := make(FeatureValueList, 0, len(iv))
    for f, v := range iv {
        fs = append(fs, FeatureValue{Name: ds.FeatureNameIdMap[f], Value: v})
    }
    sort.Sort(fs)
    for _, f := range fs {
        fmt.Printf("%s\t%v\n", f.Name, f.Value)
    }
}
--------------------------------------------------------------------------------
/bin/hector-mc-run.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "github.com/xlvector/hector"
    "log"
    "os"
    "runtime/pprof"
)

func main() {
    train, test, pred, method, params := hector.PrepareParams()

    action, _ := params["action"]

    classifier := hector.GetMutliClassClassifier(method)

    profile, _ := params["profile"]
    if profile != "" {
        fmt.Printf("Profile data => %s\n", profile)
        f, err := os.Create(profile)
        if err != nil {
            log.Fatal(err)
        }
        pprof.StartCPUProfile(f)
        defer pprof.StopCPUProfile()
    }

    if action == "" {
        accuracy, _ := hector.MultiClassRun(classifier, train, test, pred, params)
        fmt.Println("accuracy : ", accuracy)
    } else if action == "train" {
        hector.MultiClassTrain(classifier, train, params)
    } else if action == "test" {
        accuracy, _ := hector.MultiClassTest(classifier, test, pred, params)
        fmt.Println("accuracy", accuracy)
    }
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2013 Xiang Liang

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/bin/hector-preprocessor.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "github.com/xlvector/hector"
    "github.com/xlvector/hector/core"
    "log"
    "os"
)

func main() {
    train, test, _, _, params := hector.PrepareParams()

    action, _ := params["action"]

    if action == "encodelabel" {
        fmt.Println("encoding dataset label ... " + train)
        e := core.NewLabelEncoder()
        EncodeLabelAction(e, train)
        fmt.Println("encoding dataset label ... " + test)
        EncodeLabelAction(e, test)
    }
}

func EncodeLabelAction(e *core.LabelEncoder, data_path string) {
    dataset := core.NewDataSet()
    err := dataset.Load(data_path, -1)
    if err != nil {
        log.Fatal(err)
    }

    encoded_label_dataset := e.TransformDataset(dataset)

    output_file, _ := os.Create(data_path + ".hector")
    defer output_file.Close()
    for _, sample := range encoded_label_dataset.Samples {
        output_file.WriteString(string(sample.ToString(false)) + "\n")
    }
}
--------------------------------------------------------------------------------
/eval/evaluation_test.go:
--------------------------------------------------------------------------------
package eval

import (
    "math"
    "math/rand"
    "testing"
)

func TestAUC(t *testing.T) {
    predictions := []*LabelPrediction{}
    for i := 0; i < 1000; i++ {
        predictions = append(predictions, &(LabelPrediction{Label: rand.Int() % 2, Prediction: rand.Float64()}))
    }
    auc := AUC(predictions)
    if math.Abs(auc-0.5) > 0.05 {
        t.Error("Random predictions should have auc around 0.5")
    }

    predictions = nil
    for i := 0; i < 1000; i++ {
        label := rand.Int() % 2
        prediction := rand.Float64()
        if label == 1 {
            prediction += 1.0
        }
        predictions = append(predictions, &(LabelPrediction{Label: label, Prediction: prediction}))
    }
    auc = AUC(predictions)
    if auc < 0.6 {
        t.Error("Ascending predictions should have auc > 0.5")
    }
}

func TestRMSE(t *testing.T) {
    predictions := []*LabelPrediction{}
    for i := 0; i < 1000; i++ {
        predictions = append(predictions, &(LabelPrediction{Label: 1, Prediction: 1.0}))
    }
    rmse := RMSE(predictions)

    if math.Abs(rmse) > 1e-9 {
        t.Error("RMSE Error")
    }
}

func TestErrorRate(t *testing.T) {
    predictions := []*LabelPrediction{}
    for i := 0; i < 1000; i++ {
        p := rand.Intn(2)
        predictions = append(predictions, &(LabelPrediction{Label: p, Prediction: float64(p)}))
    }
    error_rate := ErrorRate(predictions)
    if math.Abs(error_rate) > 1e-9 {
        t.Error("Error Rate Error")
    }
}
--------------------------------------------------------------------------------
/lr/terminal_criterion.go:
--------------------------------------------------------------------------------
package lr

import (
    "math"
)

/**
 * It's based on the paper "Scalable Training of L1-Regularized Log-Linear Models"
 * by Galen Andrew and Jianfeng Gao
 * user: weixuan
 */
type relativeMeanImprCriterion struct {
    minHist     int
    maxHist     int
    tolerance   float64
    improvement float64
    costList    []float64
}

func NewRelativeMeanImprCriterion(tolerance float64) *relativeMeanImprCriterion {
    tc := new(relativeMeanImprCriterion)
    tc.minHist = 5
    tc.maxHist = 10
    tc.costList = make([]float64, 0, tc.maxHist)
    tc.tolerance = tolerance
    return tc
}

func (tc *relativeMeanImprCriterion) calImprovement() float64 {
    sz := len(tc.costList)
    if sz <= tc.minHist {
        return math.MaxFloat32
    }
    first := tc.costList[0]
    last := tc.costList[sz-1]
    impr := (first - last) / float64(sz-1)
    if last != 0 {
        impr = math.Abs(impr / last)
    } else if first != 0 {
        impr = math.Abs(impr / first)
    } else {
        impr = 0
    }
    if sz > tc.maxHist {
        tc.costList = tc.costList[1:]
    }
    return impr
}

func (tc *relativeMeanImprCriterion) addCost(latestCost float64) {
    tc.costList = append(tc.costList, latestCost)
    tc.improvement = tc.calImprovement()
}

func (tc *relativeMeanImprCriterion) isTerminable() bool {
    return tc.improvement <= tc.tolerance
}
--------------------------------------------------------------------------------
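In symbols: with recorded costs c_0, ..., c_t over the sliding window (t = sz - 1), calImprovement returns impr = |((c_0 - c_t) / t) / c_t|, falling back to dividing by c_0 when c_t = 0 and to 0 when both are zero; isTerminable fires once impr <= tolerance. In other words, optimization stops when the average per-iteration cost reduction becomes negligible relative to the current cost.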
/gp/covariance_function.go:
--------------------------------------------------------------------------------
package gp

import (
    "github.com/xlvector/hector/core"
    "math"
)

type CovFunc func(*core.Vector, *core.Vector) float64

func CovMatrix(X []*core.RealSample, cov_func CovFunc) *core.Matrix {
    l := int64(len(X))
    ret := core.NewMatrix()
    for i := int64(0); i < l; i++ {
        for j := i; j < l; j++ {
            c := cov_func(X[i].GetFeatureVector(), X[j].GetFeatureVector())
            ret.SetValue(i, j, c)
            ret.SetValue(j, i, c)
        }
    }
    return ret
}

func CovVector(X []*core.RealSample, y *core.RealSample, cov_func CovFunc) *core.Vector {
    l := int64(len(X))
    ret := core.NewVector()
    for i := int64(0); i < l; i++ {
        ret.SetValue(i, cov_func(X[i].GetFeatureVector(), y.GetFeatureVector()))
    }
    return ret
}

/*
Squared exponential covariance function.
ARD = automatic relevance determination, and here indicates there is a scaling/radius factor per dimension
*/
type CovSEARD struct {
    Radiuses *core.Vector // dim -> radius
    Amp      float64
}

func (cov_func *CovSEARD) Init(radiuses *core.Vector, amp float64) {
    cov_func.Radiuses = radiuses
    cov_func.Amp = amp
}

func (cov_func *CovSEARD) Cov(x1 *core.Vector, x2 *core.Vector) float64 {
    ret := 0.0
    tmp := 0.0
    for key, r := range cov_func.Radiuses.Data {
        v1 := x1.GetValue(key)
        v2 := x2.GetValue(key)
        tmp = (v1 - v2) / r
        ret += tmp * tmp
    }
    ret = cov_func.Amp * math.Exp(-ret)
    return ret
}
--------------------------------------------------------------------------------
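As implemented, CovSEARD.Cov computes k(x1, x2) = Amp * exp(-sum_d ((x1_d - x2_d) / r_d)^2), a squared-exponential kernel with an independent length-scale r_d per dimension; note this variant has no 1/2 factor inside the exponent, and Amp scales the whole exponential. A larger r_d makes dimension d matter less, which is what the ARD in the name refers to.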
/bin/hector-mc-cv.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "github.com/xlvector/hector"
    "github.com/xlvector/hector/core"
    "log"
    "os"
    "runtime"
    "runtime/pprof"
    "strconv"
)

func SplitFile(dataset *core.DataSet, total, part int) (*core.DataSet, *core.DataSet) {
    train := core.NewDataSet()
    test := core.NewDataSet()

    for i, sample := range dataset.Samples {
        if i%total == part {
            test.AddSample(sample)
        } else {
            train.AddSample(sample)
        }
    }
    return train, test
}

func main() {
    train_path, _, _, method, params := hector.PrepareParams()
    global, _ := strconv.ParseInt(params["global"], 10, 64)
    profile, _ := params["profile"]
    dataset := core.NewDataSet()
    dataset.Load(train_path, global)

    cv, _ := strconv.ParseInt(params["cv"], 10, 32)
    total := int(cv)

    if profile != "" {
        f, err := os.Create(profile)
        if err != nil {
            log.Fatal(err)
        }
        pprof.StartCPUProfile(f)
        defer pprof.StopCPUProfile()
    }

    average_accuracy := 0.0
    for part := 0; part < total; part++ {
        train, test := SplitFile(dataset, total, part)
        classifier := hector.GetMutliClassClassifier(method)
        classifier.Init(params)
        accuracy := hector.MultiClassRunOnDataSet(classifier, train, test, "", params)
        fmt.Println("accuracy : ", accuracy)
        average_accuracy += accuracy
        classifier = nil
        train = nil
        test = nil
        runtime.GC()
    }
    fmt.Println(average_accuracy / float64(total))
}
--------------------------------------------------------------------------------
/hectorcv/hector-cv.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "log"
    "os"
    "runtime"
    "runtime/pprof"
    "strconv"

    "github.com/xlvector/hector"
    "github.com/xlvector/hector/core"
)

func SplitFile(dataset *core.DataSet, total, part int) (*core.DataSet, *core.DataSet) {
    train := core.NewDataSet()
    test := core.NewDataSet()

    for i, sample := range dataset.Samples {
        if i%total == part {
            test.AddSample(sample)
        } else {
            train.AddSample(sample)
        }
    }
    return train, test
}

func main() {
    train_path, _, _, method, params := hector.PrepareParams()
    global, _ := strconv.ParseInt(params["global"], 10, 64)
    profile, _ := params["profile"]
    dataset := core.NewDataSet()
    dataset.Load(train_path, global)
    runtime.GOMAXPROCS(runtime.NumCPU())
    cv, _ := strconv.ParseInt(params["cv"], 10, 32)
    total := int(cv)

    if profile != "" {
        fmt.Println(profile)
        f, err := os.Create(profile)
        if err != nil {
            fmt.Printf("%v\n", err)
            log.Fatal(err)
        }
        pprof.StartCPUProfile(f)
        defer pprof.StopCPUProfile()
    }

    average_auc := 0.0
    for part := 0; part < total; part++ {
        train, test := SplitFile(dataset, total, part)
        classifier := hector.GetClassifier(method)
        classifier.Init(params)
        auc, _ := hector.AlgorithmRunOnDataSet(classifier, train, test, "", params)
        fmt.Println("AUC:")
        fmt.Println(auc)
        average_auc += auc
        classifier = nil
    }
    fmt.Println(average_auc / float64(total))
}
--------------------------------------------------------------------------------
/core/mock_dataset.go:
--------------------------------------------------------------------------------
package core

import (
    "math"
    "math/rand"
)

func XORDataSet(n int) *DataSet {
    ret := NewDataSet()
    for i := 0; i < n; i++ {
        x := 2 * (float64(rand.Intn(2)) - 0.5)
        y := 2 * (float64(rand.Intn(2)) - 0.5)

        label := 1

        if x*y < 0.0 {
            label = 0
        }

        sample := NewSample()
        sample.Label = label
        sample.AddFeature(Feature{Id: 1, Value: x})
        sample.AddFeature(Feature{Id: 2, Value: y})
        sample.AddFeature(Feature{Id: 3, Value: 1.0})
        ret.AddSample(sample)
    }
    return ret
}

func LinearDataSet(n int) *DataSet {
    ret := NewDataSet()
    for i := 0; i < n; i++ {
        sample := NewSample()
        sample.Label = 0
        for f := 0; f < 100; f++ {
            if rand.Intn(10) != 1 {
                continue
            }
            if f < 20 {
                sample.Label += 1
            } else if f > 80 {
                sample.Label -= 1
            }
            sample.AddFeature(Feature{Id: int64(f), Value: 1.0})
        }
        if sample.Label > 0 {
            sample.Label = 1
        } else {
            sample.Label = 0
        }
        ret.AddSample(sample)
    }
    return ret
}

func SinusoidalDataSet(n int) *RealDataSet {
    ret := NewRealDataSet()

    min := -5.0
    max := 5.0
    amp := 1.0
    noise := 0.05
    period := 4.0
    interval := (max - min) / float64(n)
    for i := 0; i < n; i++ {
        x := min + interval*float64(i) + 0.5*interval
        y := math.Sin((x-min)*2*math.Pi/period)*amp + rand.NormFloat64()*noise
        sample := NewRealSample()
        sample.AddFeature(Feature{Id: int64(1), Value: x})
        sample.Value = y
        ret.AddSample(sample)
    }

    return ret
}
--------------------------------------------------------------------------------
/svm/knn.go:
--------------------------------------------------------------------------------
package svm

import (
    "github.com/xlvector/hector/core"
    "github.com/xlvector/hector/eval"
    "math"
    "math/rand"
    "strconv"
)

type KNN struct {
    sv     []*core.Vector
    labels []int
    k      int
}

func (self *KNN) SaveModel(path string) {

}

func (self *KNN) LoadModel(path string) {

}

func (c *KNN) Init(params map[string]string) {
    K, _ := strconv.ParseInt(params["k"], 10, 64)
    c.k = int(K)
}

func (c *KNN) Kernel(x, y *core.Vector) float64 {
    z := x.Copy()
    z.AddVector(y, -1.0)
    ret := math.Exp(-1.0 * z.NormL2() / 20.0)
    return ret
}

func (c *KNN) Predict(sample *core.Sample) float64 {
    ret := c.PredictMultiClass(sample)
    return ret.GetValue(1)
}

func (c *KNN) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
    x := sample.GetFeatureVector()
    predictions := []*eval.LabelPrediction{}
    for i, s := range c.sv {
        predictions = append(predictions, &(eval.LabelPrediction{Label: c.labels[i], Prediction: c.Kernel(s, x)}))
    }

    compare := func(p1, p2 *eval.LabelPrediction) bool {
        return p1.Prediction > p2.Prediction
    }

    eval.By(compare).Sort(predictions)

    ret := core.NewArrayVector()
    for i, pred := range predictions {
        if i >= c.k { // vote with exactly the k nearest support vectors
            break
        }
        ret.AddValue(pred.Label, 1.0)
    }
    return ret
}

func (c *KNN) Train(dataset *core.DataSet) {
    // Sample 1000 support vectors at random (with replacement).
    c.sv = []*core.Vector{}
    c.labels = []int{}
    for i := 0; i < 1000; i++ {
        k := rand.Intn(len(dataset.Samples))
        c.sv = append(c.sv, dataset.Samples[k].GetFeatureVector())
        c.labels = append(c.labels, dataset.Samples[k].Label)
    }
}
--------------------------------------------------------------------------------
/hectorserver/hectorserver.go:
--------------------------------------------------------------------------------
package main

import (
    "encoding/json"
    "github.com/xlvector/hector"
    "github.com/xlvector/hector/algo"
    "github.com/xlvector/hector/core"
    "github.com/xlvector/hector/util"
    "log"
    "net/http"
)

type ClassifierHandler struct {
    classifier algo.Classifier
}

func (c *ClassifierHandler) ServeHTTP(w http.ResponseWriter,
    req *http.Request) {
    sample := core.NewSample()
    if req.Method != "POST" {
        http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
        return
    }
    features := req.FormValue("features")
    if len(features) == 0 {
        http.Error(w, "need input features", http.StatusInternalServerError)
        return
    }
    fs := make(map[string]float64)
    err := json.Unmarshal([]byte(features), &fs)
    if err != nil {
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }
    for k, v := range fs {
        f := core.Feature{
            Id:    util.Hash(k),
            Value: v,
        }
        sample.AddFeature(f)
    }
    p := c.classifier.Predict(sample)
    output, err := json.Marshal(map[string]interface{}{
        "prediction": p,
    })
    if err != nil {
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }
    // Write the JSON bytes directly; fmt.Fprint on a []byte would print
    // the slice's numeric values instead of the JSON text.
    w.Write(output)
}

func main() {
    _, _, _, method, params := hector.PrepareParams()
    ch := &ClassifierHandler{
        classifier: hector.GetClassifier(method),
    }
    model, ok := params["model"]
    if !ok {
        log.Fatalln("please input model file")
    }
    ch.classifier.LoadModel(model)
    http.Handle("/predict", ch)
    err := http.ListenAndServe(":"+params["port"], nil)
    if err != nil {
        log.Fatal(err)
    }
}
--------------------------------------------------------------------------------
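The handler expects a POSTed form field named features holding a JSON object of feature name -> value, and answers with a JSON body like {"prediction":0.73}. A minimal Go client sketch, assuming the server was started with port=8080 (the port and feature names are illustrative):

    package main

    import (
        "fmt"
        "io/ioutil"
        "net/http"
        "net/url"
    )

    func main() {
        form := url.Values{}
        form.Set("features", `{"age": 0.5, "#gender=m": 1.0}`)

        resp, err := http.PostForm("http://localhost:8080/predict", form)
        if err != nil {
            panic(err)
        }
        defer resp.Body.Close()

        body, _ := ioutil.ReadAll(resp.Body)
        fmt.Println(string(body)) // e.g. {"prediction":0.73}
    }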
/lr/minimizer_test.go:
--------------------------------------------------------------------------------
package lr

import (
    "github.com/xlvector/hector/core"
    "math"
    "testing"
)

type mseDiffFunction struct {
    center  core.Vector
    weights core.Vector
    grad    core.Vector
    init    core.Vector
}

func getMSECostFunction() *mseDiffFunction {
    f := new(mseDiffFunction)
    f.center.Data = map[int64]float64{}
    f.weights.Data = map[int64]float64{0: 1, 1: 0.01}
    f.init.Data = map[int64]float64{0: 1, 1: 1}
    f.grad.Data = map[int64]float64{0: 0, 1: 0}
    return f
}

func (f *mseDiffFunction) Value(x *core.Vector) float64 {
    var cost float64 = 0
    for n, val := range x.Data {
        diff := val - f.center.GetValue(n)
        cost += f.weights.GetValue(n) * diff * diff
    }
    return 0.5 * cost
}

// Gradients for different points could use the same memory
func (f *mseDiffFunction) Gradient(x *core.Vector) *core.Vector {
    for n, val := range x.Data {
        f.grad.SetValue(n, f.weights.GetValue(n)*(val-f.center.GetValue(n)))
    }
    return &f.grad
}

func (f *mseDiffFunction) testResult(result *core.Vector, tolerance float64, t *testing.T) {
    for n, val := range result.Data {
        if math.Abs(val-f.center.GetValue(n)) > tolerance {
            t.Errorf("Mismatch\nIndex\tTrue\tResult\n%d\t%e\t%e\n", n, f.center.GetValue(n), val)
        }
    }
}

func TestLBFGS(t *testing.T) {
    diffFunc := getMSECostFunction()
    minimizer := NewLBFGSMinimizer()
    result := minimizer.Minimize(diffFunc, &(diffFunc.init))
    diffFunc.testResult(result, 1e-6, t)
}

func TestOWLQN(t *testing.T) {
    diffFunc := getMSECostFunction()
    minimizer := NewOWLQNMinimizer(0.001)
    result := minimizer.Minimize(diffFunc, &(diffFunc.init))
    diffFunc.testResult(result, 0, t)
}
--------------------------------------------------------------------------------
/core/label_preprocessing.go:
--------------------------------------------------------------------------------
package core

import (
    "fmt"
)

type IntEncoder struct {
    Mapping        map[int]int
    InverseMapping map[int]int
}

func NewIntEncoder() *IntEncoder {
    e := IntEncoder{}
    e.Mapping = make(map[int]int)
    e.InverseMapping = make(map[int]int)
    return &e
}

func (e *IntEncoder) Encoded(original int) int {
    if encoded, ok := e.Mapping[original]; ok {
        return encoded
    }

    e.Mapping[original] = len(e.Mapping)
    encoded := e.Mapping[original]
    e.InverseMapping[encoded] = original
    return encoded
}

func (e *IntEncoder) Decoded(encoded int) (int, error) {
    if decoded, ok := e.InverseMapping[encoded]; ok {
        return decoded, nil
    }

    return -1, fmt.Errorf("Can't find %d in dictionary...", encoded)
}

type LabelEncoder struct {
    labelMapper *IntEncoder
}

func NewLabelEncoder() *LabelEncoder {
    e := LabelEncoder{}
    e.labelMapper = NewIntEncoder()
    return &e
}

func (e *LabelEncoder) TransformSample(s *Sample) *Sample {
    ret := s.Clone()
    ret.Label = e.labelMapper.Encoded(ret.Label)
    return ret
}

func (e *LabelEncoder) TransformDataset(dataset *DataSet) *DataSet {
    ret := NewDataSet()
    for _, sample := range dataset.Samples {
        ret.AddSample(e.TransformSample(sample))
    }

    return ret
}

func (e *LabelEncoder) InverseTransformSample(s *Sample) *Sample {
    ret := s.Clone()
    ret.Label, _ = e.labelMapper.Decoded(ret.Label)
    return ret
}

func (e *LabelEncoder) InverseTransformDataset(dataset *DataSet) *DataSet {
    ret := NewDataSet()
    for _, sample := range dataset.Samples {
        ret.AddSample(e.InverseTransformSample(sample))
    }

    return ret
}
--------------------------------------------------------------------------------
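LabelEncoder (via IntEncoder) maps arbitrary integer labels onto a dense 0..n-1 range and back, which is what bin/hector-preprocessor.go relies on for the encodelabel action. A small sketch:

    package main

    import (
        "fmt"

        "github.com/xlvector/hector/core"
    )

    func main() {
        e := core.NewLabelEncoder()

        s := core.NewSample()
        s.Label = 7
        encoded := e.TransformSample(s) // first unseen label is mapped to 0
        fmt.Println(encoded.Label)      // 0

        decoded := e.InverseTransformSample(encoded)
        fmt.Println(decoded.Label) // 7
    }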
/core/matrix.go:
--------------------------------------------------------------------------------
package core

type Matrix struct {
    Data map[int64]*Vector
}

func NewMatrix() *Matrix {
    m := Matrix{}
    m.Data = make(map[int64]*Vector)
    return &m
}

func (m *Matrix) AddValue(k1, k2 int64, v float64) {
    _, ok := m.Data[k1]
    if !ok {
        m.Data[k1] = NewVector()
    }
    m.Data[k1].AddValue(k2, v)
}

func (m *Matrix) SetValue(k1, k2 int64, v float64) {
    row, ok := m.Data[k1]
    if !ok {
        row = NewVector()
        m.Data[k1] = row
    }
    row.SetValue(k2, v)
}

func (m *Matrix) GetValue(k1, k2 int64) float64 {
    row := m.GetRow(k1)
    if row == nil {
        return 0.0
    } else {
        return row.GetValue(k2)
    }
}

func (m *Matrix) GetRow(k1 int64) *Vector {
    row, ok := m.Data[k1]
    if !ok {
        return nil
    } else {
        return row
    }
}

func (m *Matrix) Scale(scale float64) *Matrix {
    ret := NewMatrix()
    for id, vi := range m.Data {
        ret.Data[id] = vi.Scale(scale)
    }
    return ret
}

func (m *Matrix) MultiplyVector(v *Vector) *Vector {
    // This is intended for l-by-m * m-by-1
    // For m-by-1 * 1-by-n, use OuterProduct in vector.go
    // Probably should just have a MatrixMultiply for everything
    ret := NewVector()
    for id, vi := range m.Data {
        ret.SetValue(id, v.Dot(vi))
    }
    return ret
}

func (m *Matrix) Trans() *Matrix {
    ret := NewMatrix()
    for rid, vi := range m.Data {
        for cid, w := range vi.Data {
            ret.SetValue(cid, rid, w)
        }
    }
    return ret
}

func (m *Matrix) ElemWiseAddMatrix(n *Matrix) *Matrix {
    // Note: rows taken over from m (and from n, where m has no row)
    // are shared by reference, not cloned.
    ret := NewMatrix()
    for key, mi := range m.Data {
        ret.Data[key] = mi
    }
    for key, ni := range n.Data {
        if ret.GetRow(key) == nil {
            ret.Data[key] = ni
        } else {
            ret.Data[key] = ni.ElemWiseAddVector(ret.GetRow(key))
        }
    }
    return ret
}
--------------------------------------------------------------------------------
/sa/sa_auc.go:
--------------------------------------------------------------------------------
package sa

import (
    "fmt"
    "github.com/xlvector/hector/core"
    "github.com/xlvector/hector/eval"
    "math/rand"
)

type SAOptAUC struct {
    Model map[int64]float64
}

func (self *SAOptAUC) SaveModel(path string) {

}

func (self *SAOptAUC) LoadModel(path string) {

}

func (algo *SAOptAUC) Init(params map[string]string) {
    algo.Model = make(map[int64]float64)
}

func (algo *SAOptAUC) TrainAUC(samples []*core.Sample) float64 {
    predictions := []*eval.LabelPrediction{}
    for _, sample := range samples {
        pred := algo.Predict(sample)
        predictions = append(predictions, &(eval.LabelPrediction{Label: sample.Label, Prediction: pred}))
    }
    return eval.AUC(predictions)
}

func (algo *SAOptAUC) Train(dataset *core.DataSet) {
    algo.Model = make(map[int64]float64)
    samples := []*core.Sample{}
    for _, sample := range dataset.Samples {
        for _, feature := range sample.Features {
            algo.Model[feature.Id] = 1.0 / float64(len(sample.Features))
        }
        samples = append(samples, sample)
    }

    features := []int64{}
    for fid := range algo.Model {
        features = append(features, fid)
    }

    prev_auc := 0.5
    for i := 0; i < 5000; i++ {
        add := rand.Float64()
        fid := features[rand.Intn(len(features))]
        fweight := algo.Model[fid]
        algo.Model[fid] = add
        auc := algo.TrainAUC(samples)

        if i%500 == 0 {
            fmt.Println(prev_auc)
        }

        if prev_auc < auc {
            prev_auc = auc
        } else {
            algo.Model[fid] = fweight
        }
    }
    fmt.Println(algo.Model)
}

func (algo *SAOptAUC) Predict(sample *core.Sample) float64 {
    ret := 0.0
    for _, feature := range sample.Features {
        model_feature_value, ok := algo.Model[feature.Id]
        if ok {
            ret += model_feature_value * feature.Value
        }
    }
    return ret
}
--------------------------------------------------------------------------------
/classifier_test.go:
--------------------------------------------------------------------------------
package hector

import (
    "github.com/xlvector/hector/core"
    "testing"
)

func TestClassifiers(t *testing.T) {
    train_dataset := core.LinearDataSet(1000)
    test_dataset := core.LinearDataSet(500)

    algos := []string{"ep", "fm", "ftrl", "lr", "linear_svm", "lr_owlqn"}

    params := make(map[string]string)
    params["beta"] = "1.0"
    params["steps"] = "10"
    params["lambda1"] = "0.1"
    params["lambda2"] = "1.0"
    params["alpha"] = "0.1"
    params["max-depth"] = "20"
    params["min-leaf-size"] = "5"
    params["tree-count"] = "10"
    params["learning-rate"] = "0.05"
    params["regularization"] = "0.0001"
    params["e"] = "0.1"
    params["c"] = "0.1"
    params["gini"] = "1.0"
    params["factors"] = "10"

    for _, algo := range algos {
        classifier := GetClassifier(algo)
        classifier.Init(params)
        auc, _ := AlgorithmRunOnDataSet(classifier, train_dataset, test_dataset, "", params)

        t.Logf("auc of %s in linear dataset is %f", algo, auc)
        if auc < 0.9 {
            t.Error("auc less than 0.9 in linear dataset")
        }
    }
}

func TestClassifiersOnXOR(t *testing.T) {
    algos := []string{"ann", "rf", "rdt", "knn"}

    params := make(map[string]string)
    params["steps"] = "30"
    params["max-depth"] = "10"
    params["min-leaf-size"] = "10"
    params["tree-count"] = "100"
    params["learning-rate"] = "0.1"
    params["learning-rate-discount"] = "1.0"
    params["regularization"] = "0.0001"
    params["gini"] = "1.0"
    params["hidden"] = "15"
    params["k"] = "10"
    params["feature-count"] = "1.0"
    params["dt-sample-ratio"] = "1.0"

    for _, algo := range algos {
        train_dataset := core.XORDataSet(1000)
        test_dataset := core.XORDataSet(500)
        classifier := GetClassifier(algo)
        classifier.Init(params)
        auc, _ := AlgorithmRunOnDataSet(classifier, train_dataset, test_dataset, "", params)

        t.Logf("auc of %s in xor dataset is %f", algo, auc)
        if auc < 0.9 {
            t.Error("auc less than 0.9 in xor dataset")
        }
    }
}
--------------------------------------------------------------------------------
dataset is %f", algo, auc) 36 | if auc < 0.9 { 37 | t.Error("auc less than 0.9 in linear dataset") 38 | } 39 | } 40 | } 41 | 42 | func TestClassifiersOnXOR(t *testing.T) { 43 | algos := []string{"ann", "rf", "rdt", "knn"} 44 | 45 | params := make(map[string]string) 46 | params["steps"] = "30" 47 | params["max-depth"] = "10" 48 | params["min-leaf-size"] = "10" 49 | params["tree-count"] = "100" 50 | params["learning-rate"] = "0.1" 51 | params["learning-rate-discount"] = "1.0" 52 | params["regularization"] = "0.0001" 53 | params["gini"] = "1.0" 54 | params["hidden"] = "15" 55 | params["k"] = "10" 56 | params["feature-count"] = "1.0" 57 | params["dt-sample-ratio"] = "1.0" 58 | 59 | for _, algo := range algos { 60 | train_dataset := core.XORDataSet(1000) 61 | test_dataset := core.XORDataSet(500) 62 | classifier := GetClassifier(algo) 63 | classifier.Init(params) 64 | auc, _ := AlgorithmRunOnDataSet(classifier, train_dataset, test_dataset, "", params) 65 | 66 | t.Logf("auc of %s in xor dataset is %f", algo, auc) 67 | if auc < 0.9 { 68 | t.Error("auc less than 0.9 in xor dataset") 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /util/string_util.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "strconv" 8 | "io/ioutil" 9 | ) 10 | 11 | type StringBuilder struct { 12 | buffer bytes.Buffer 13 | } 14 | 15 | func (self *StringBuilder) Write(strings ...string) *StringBuilder { 16 | for _, str := range strings { 17 | self.buffer.WriteString(str) 18 | } 19 | return self 20 | } 21 | 22 | func (self *StringBuilder) Printf(format string, args ...interface{}) *StringBuilder { 23 | fmt.Fprintf(&self.buffer, format, args...) 
/dt/gbdt.go:
--------------------------------------------------------------------------------
package dt

import (
    "bufio"
    "fmt"
    "github.com/xlvector/hector/core"
    "math"
    "os"
    "strconv"
)

type GBDT struct {
    dts        []*RegressionTree
    tree_count int
    shrink     float64
}

func (self *GBDT) SaveModel(path string) {
    file, _ := os.Create(path)
    defer file.Close()
    for _, dt := range self.dts {
        buf := dt.tree.ToString()
        file.Write(buf)
        file.WriteString("\n#\n")
    }
}

func (self *GBDT) LoadModel(path string) {
    file, _ := os.Open(path)
    defer file.Close()

    self.dts = []*RegressionTree{}
    scanner := bufio.NewScanner(file)
    text := ""
    for scanner.Scan() {
        line := scanner.Text()
        if line == "#" {
            tree := Tree{}
            tree.FromString(text)
            dt := RegressionTree{tree: tree}
            self.dts = append(self.dts, &dt)
            text = ""
        } else {
            text += line + "\n"
        }
    }
}

func (c *GBDT) Init(params map[string]string) {
    tree_count, _ := strconv.ParseInt(params["tree-count"], 10, 64)
    c.tree_count = int(tree_count)
    for i := 0; i < c.tree_count; i++ {
        dt := RegressionTree{}
        dt.Init(params)
        c.dts = append(c.dts, &dt)
    }
    c.shrink, _ = strconv.ParseFloat(params["learning-rate"], 64)
}

func (c *GBDT) RMSE(dataset *core.DataSet) float64 {
    rmse := 0.0
    n := 0.0
    for _, sample := range dataset.Samples {
        rmse += (sample.Prediction) * (sample.Prediction)
        n += 1.0
    }
    return math.Sqrt(rmse / n)
}

func (c *GBDT) Train(dataset *core.DataSet) {
    // sample.Prediction holds the current residual: it starts at the label
    // and each fitted tree's shrunken prediction is subtracted from it.
    for _, sample := range dataset.Samples {
        sample.Prediction = sample.LabelDoubleValue()
    }
    for k, dt := range c.dts {
        dt.Train(dataset)
        for _, sample := range dataset.Samples {
            sample.Prediction -= c.shrink * dt.Predict(sample)
        }
        if k%10 == 0 {
            fmt.Println(c.RMSE(dataset))
        }
    }
}

func (c *GBDT) Predict(sample *core.Sample) float64 {
    ret := 0.0
    for _, dt := range c.dts {
        ret += c.shrink * dt.Predict(sample)
    }
    return ret
}
--------------------------------------------------------------------------------
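In formulas: with shrinkage eta = learning-rate, the ensemble predicts F(x) = eta * sum_m f_m(x); during Train, sample.Prediction tracks the running residual r_m = y - eta * sum_{j&lt;m} f_j(x), each new tree f_m is fit to that residual, and the RMSE printed every 10 trees is the root-mean-square of the residuals, so it should shrink as trees are added.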
/lr/linear_regression.go:
--------------------------------------------------------------------------------
package lr

import (
    "bufio"
    "github.com/xlvector/hector/core"
    "github.com/xlvector/hector/util"
    "os"
    "strconv"
    "strings"
)

type LinearRegression struct {
    Model  map[int64]float64
    Params LogisticRegressionParams
}

func (algo *LinearRegression) SaveModel(path string) {
    sb := util.StringBuilder{}
    for f, g := range algo.Model {
        sb.Int64(f)
        sb.Write("\t")
        sb.Float(g)
        sb.Write("\n")
    }
    sb.WriteToFile(path)
}

func (algo *LinearRegression) LoadModel(path string) {
    file, _ := os.Open(path)
    defer file.Close()

    // Initialize the map before filling it; writing to a nil map panics.
    algo.Model = make(map[int64]float64)
    scaner := bufio.NewScanner(file)
    for scaner.Scan() {
        line := scaner.Text()
        tks := strings.Split(line, "\t")
        fid, _ := strconv.ParseInt(tks[0], 10, 64)
        fw, _ := strconv.ParseFloat(tks[1], 64)
        algo.Model[fid] = fw
    }
}

func (algo *LinearRegression) Init(params map[string]string) {
    algo.Model = make(map[int64]float64)

    algo.Params.LearningRate, _ = strconv.ParseFloat(params["learning-rate"], 64)
    algo.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64)
    // Parse steps as LogisticRegression does; otherwise Train loops zero times.
    steps, _ := strconv.ParseInt(params["steps"], 10, 32)
    algo.Params.Steps = int(steps)
}

func (algo *LinearRegression) Train(dataset *core.DataSet) {
    algo.Model = make(map[int64]float64)
    for step := 0; step < algo.Params.Steps; step++ {
        for _, sample := range dataset.Samples {
            prediction := algo.Predict(sample)
            err := sample.LabelDoubleValue() - prediction
            for _, feature := range sample.Features {
                model_feature_value, ok := algo.Model[feature.Id]
                if !ok {
                    model_feature_value = 0.0
                }
                model_feature_value += algo.Params.LearningRate * (err*feature.Value - algo.Params.Regularization*model_feature_value)
                algo.Model[feature.Id] = model_feature_value
            }
        }
        algo.Params.LearningRate *= 0.9
    }
}

func (algo *LinearRegression) Predict(sample *core.Sample) float64 {
    ret := 0.0
    for _, feature := range sample.Features {
        model_feature_value, ok := algo.Model[feature.Id]
        if ok {
            ret += model_feature_value * feature.Value
        }
    }
    return ret
}
--------------------------------------------------------------------------------
/lr/lbfgs_minimizer.go:
--------------------------------------------------------------------------------
package lr

import (
    "fmt"
    "github.com/xlvector/hector/core"
)

/**
 * It's based on the paper "Scalable Training of L1-Regularized Log-Linear Models"
 * by Galen Andrew and Jianfeng Gao
 * user: weixuan
 */
type LBFGSMinimizer struct {
    costFun      DiffFunction
    numHist      int
    maxIteration int
    tolerance    float64
}

var lbfgs_output_switch bool = false

func NewLBFGSMinimizer() *LBFGSMinimizer {
    m := new(LBFGSMinimizer)
    m.numHist = 10
    m.maxIteration = 200
    m.tolerance = 1e-4
    return m
}

func (m *LBFGSMinimizer) Minimize(costfun DiffFunction, init *core.Vector) *core.Vector {
    m.costFun = costfun
    var cost float64 = costfun.Value(init)
    var grad *core.Vector = costfun.Gradient(init).Copy()
    var pos *core.Vector = init.Copy()
    var terminalCriterion *relativeMeanImprCriterion = NewRelativeMeanImprCriterion(m.tolerance)
    terminalCriterion.addCost(cost)

    var helper *QuasiNewtonHelper = NewQuasiNewtonHelper(m.numHist, m, pos, grad)
    if lbfgs_output_switch {
        fmt.Println("Iter\tcost\timprovement")
        fmt.Printf("%d\t%e\tUndefined", 0, cost)
    }
    for iter := 1; iter <= m.maxIteration; iter++ {
        dir := grad.Copy()
        dir.ApplyScale(-1.0)
        helper.ApplyQuasiInverseHession(dir)
        newCost, newPos := helper.BackTrackingLineSearch(cost, pos, grad, dir, iter == 1)
        if lbfgs_output_switch {
            fmt.Println("")
        }
        if cost == newCost {
            break
        }
        cost = newCost
        pos = newPos
        grad = costfun.Gradient(pos).Copy()
        terminalCriterion.addCost(cost)
        if lbfgs_output_switch {
            fmt.Printf("%d\t%e\t%e", iter, newCost, terminalCriterion.improvement)
        }
        if terminalCriterion.isTerminable() || helper.UpdateState(pos, grad) {
            if lbfgs_output_switch {
                fmt.Println("")
            }
            break
        }
    }
    return pos
}

func (m *LBFGSMinimizer) Evaluate(pos *core.Vector) float64 {
    return m.costFun.Value(pos)
}

func (m *LBFGSMinimizer) NextPoint(curPos *core.Vector, dir *core.Vector, alpha float64) *core.Vector {
    if lbfgs_output_switch {
        fmt.Printf(".")
    }
    return curPos.ElemWiseMultiplyAdd(dir, alpha)
}
--------------------------------------------------------------------------------
13 | */ 14 | type LBFGSMinimizer struct { 15 | costFun DiffFunction 16 | numHist int 17 | maxIteration int 18 | tolerance float64 19 | } 20 | 21 | var lbfgs_output_switch bool = false 22 | 23 | func NewLBFGSMinimizer() *LBFGSMinimizer { 24 | m := new(LBFGSMinimizer) 25 | m.numHist = 10 26 | m.maxIteration = 200 27 | m.tolerance = 1e-4 28 | return m 29 | } 30 | 31 | func (m *LBFGSMinimizer) Minimize(costfun DiffFunction, init *core.Vector) *core.Vector { 32 | m.costFun = costfun 33 | var cost float64 = costfun.Value(init) 34 | var grad *core.Vector = costfun.Gradient(init).Copy() 35 | var pos *core.Vector = init.Copy() 36 | var terminalCriterion *relativeMeanImprCriterion = NewRelativeMeanImprCriterion(m.tolerance) 37 | terminalCriterion.addCost(cost) 38 | 39 | var helper *QuasiNewtonHelper = NewQuasiNewtonHelper(m.numHist, m, pos, grad) 40 | if lbfgs_output_switch { 41 | fmt.Println("Iter\tcost\timprovement") 42 | fmt.Printf("%d\t%e\tUndefined", 0, cost) 43 | } 44 | for iter := 1; iter <= m.maxIteration; iter++ { 45 | dir := grad.Copy() 46 | dir.ApplyScale(-1.0) 47 | helper.ApplyQuasiInverseHession(dir) 48 | newCost, newPos := helper.BackTrackingLineSearch(cost, pos, grad, dir, iter == 1) 49 | if lbfgs_output_switch { 50 | fmt.Println("") 51 | } 52 | if cost == newCost { 53 | break 54 | } 55 | cost = newCost 56 | pos = newPos 57 | grad = costfun.Gradient(pos).Copy() 58 | terminalCriterion.addCost(cost) 59 | if lbfgs_output_switch { 60 | fmt.Printf("%d\t%e\t%e", iter, newCost, terminalCriterion.improvement) 61 | } 62 | if terminalCriterion.isTerminable() || helper.UpdateState(pos, grad) { 63 | if lbfgs_output_switch { 64 | fmt.Println("") 65 | } 66 | break 67 | } 68 | } 69 | return pos 70 | } 71 | 72 | func (m *LBFGSMinimizer) Evaluate(pos *core.Vector) float64 { 73 | return m.costFun.Value(pos) 74 | } 75 | 76 | func (m *LBFGSMinimizer) NextPoint(curPos *core.Vector, dir *core.Vector, alpha float64) *core.Vector { 77 | if lbfgs_output_switch { 78 | fmt.Printf(".") 79 | } 80 | return curPos.ElemWiseMultiplyAdd(dir, alpha) 81 | } 82 | -------------------------------------------------------------------------------- /fm/factorize_machine.go: -------------------------------------------------------------------------------- 1 | package fm 2 | 3 | import ( 4 | "github.com/xlvector/hector/core" 5 | "github.com/xlvector/hector/util" 6 | "strconv" 7 | ) 8 | 9 | type FactorizeMachine struct { 10 | w *core.Vector 11 | v []*core.Vector 12 | params FactorizeMachineParams 13 | } 14 | 15 | type FactorizeMachineParams struct { 16 | LearningRate float64 17 | Regularization float64 18 | FactorNumber int 19 | } 20 | 21 | func (self *FactorizeMachine) SaveModel(path string) { 22 | 23 | } 24 | 25 | func (self *FactorizeMachine) LoadModel(path string) { 26 | 27 | } 28 | 29 | func (c *FactorizeMachine) Predict(sample *core.Sample) float64 { 30 | for _, f := range sample.Features { 31 | c.w.RandomInit(f.Id, 0.1) 32 | for k, _ := range c.v { 33 | c.v[k].RandomInit(f.Id, 0.1) 34 | } 35 | } 36 | ret := c.w.DotFeatures(sample.Features) 37 | for k, _ := range c.v { 38 | a := c.v[k].DotFeatures(sample.Features) 39 | b := 0.0 40 | for _, f := range sample.Features { 41 | vkf := c.v[k].GetValue(f.Id) 42 | b += f.Value * f.Value * vkf * vkf 43 | } 44 | ret += 0.5 * (a*a - b) 45 | } 46 | return util.Sigmoid(ret) 47 | } 48 | 49 | func (c *FactorizeMachine) Init(params map[string]string) { 50 | c.w = core.NewVector() 51 | factor_number, _ := strconv.ParseInt(params["factors"], 10, 64) 52 | c.params.FactorNumber = 
/lr/logistic_regression.go:
--------------------------------------------------------------------------------
package lr

import (
    "bufio"
    "os"
    "strconv"
    "strings"

    "github.com/xlvector/hector/core"
    "github.com/xlvector/hector/util"
)

type LogisticRegressionParams struct {
    LearningRate   float64
    Regularization float64
    Steps          int
}

type LogisticRegression struct {
    Model  map[int64]float64
    Params LogisticRegressionParams
}

func (algo *LogisticRegression) SaveModel(path string) {
    sb := util.StringBuilder{}
    for f, g := range algo.Model {
        sb.Int64(f)
        sb.Write("\t")
        sb.Float(g)
        sb.Write("\n")
    }
    sb.WriteToFile(path)
}

func (algo *LogisticRegression) LoadModel(path string) {
    file, _ := os.Open(path)
    defer file.Close()
    algo.Model = make(map[int64]float64)
    scaner := bufio.NewScanner(file)
    for scaner.Scan() {
        line := scaner.Text()
        tks := strings.Split(line, "\t")
        fid, _ := strconv.ParseInt(tks[0], 10, 64)
        fw, _ := strconv.ParseFloat(tks[1], 64)
        algo.Model[fid] = fw
    }
}

func (algo *LogisticRegression) Init(params map[string]string) {
    algo.Model = make(map[int64]float64)

    algo.Params.LearningRate, _ = strconv.ParseFloat(params["learning-rate"], 64)
    algo.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64)
    steps, _ := strconv.ParseInt(params["steps"], 10, 32)
    algo.Params.Steps = int(steps)
}

func (algo *LogisticRegression) Train(dataset *core.DataSet) {
    algo.Model = make(map[int64]float64)
    for step := 0; step < algo.Params.Steps; step++ {
        for _, sample := range dataset.Samples {
            prediction := algo.Predict(sample)
            err := sample.LabelDoubleValue() - prediction
            for _, feature := range sample.Features {
                model_feature_value, ok := algo.Model[feature.Id]
                if !ok {
                    model_feature_value = 0.0
                }
                model_feature_value += algo.Params.LearningRate * (err*feature.Value - algo.Params.Regularization*model_feature_value)
                algo.Model[feature.Id] = model_feature_value
            }
        }
        algo.Params.LearningRate *= 0.9
    }
}

func (algo *LogisticRegression) Predict(sample *core.Sample) float64 {
    ret := 0.0
    for _, feature := range sample.Features {
        model_feature_value, ok := algo.Model[feature.Id]
        if ok {
            ret += model_feature_value * feature.Value
        }
    }
    return util.Sigmoid(ret)
}
--------------------------------------------------------------------------------
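Train is plain stochastic gradient descent on the log-loss with L2 weight decay; for each sample, the per-feature update is w_i &lt;- w_i + eta * ((y - sigmoid(w.x)) * x_i - lambda * w_i), where eta is learning-rate and lambda is regularization, and eta is multiplied by 0.9 after every full pass over the data.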
*core.Sample) float64 { 78 | ret := 0.0 79 | for _, feature := range sample.Features { 80 | model_feature_value, ok := algo.Model[feature.Id] 81 | if ok { 82 | ret += model_feature_value * feature.Value 83 | } 84 | } 85 | return util.Sigmoid(ret) 86 | } 87 | -------------------------------------------------------------------------------- /lr/logistic_regression_streaming.go: -------------------------------------------------------------------------------- 1 | package lr 2 | 3 | import ( 4 | "bufio" 5 | "log" 6 | "math" 7 | "os" 8 | "strconv" 9 | "strings" 10 | 11 | "github.com/xlvector/hector/core" 12 | "github.com/xlvector/hector/util" 13 | ) 14 | 15 | type LogisticRegressionStream struct { 16 | Model map[int64]float64 17 | Params LogisticRegressionParams 18 | } 19 | 20 | func (algo *LogisticRegressionStream) SaveModel(path string) { 21 | sb := util.StringBuilder{} 22 | for f, g := range algo.Model { 23 | sb.Int64(f) 24 | sb.Write("\t") 25 | sb.Float(g) 26 | sb.Write("\n") 27 | } 28 | sb.WriteToFile(path) 29 | } 30 | 31 | func (algo *LogisticRegressionStream) LoadModel(path string) { 32 | file, _ := os.Open(path) 33 | defer file.Close() 34 | 35 | scaner := bufio.NewScanner(file) 36 | for scaner.Scan() { 37 | line := scaner.Text() 38 | tks := strings.Split(line, "\t") 39 | fid, _ := strconv.ParseInt(tks[0], 10, 64) 40 | fw, _ := strconv.ParseFloat(tks[1], 64) 41 | algo.Model[fid] = fw 42 | } 43 | } 44 | 45 | func (algo *LogisticRegressionStream) Init(params map[string]string) { 46 | algo.Model = make(map[int64]float64) 47 | 48 | algo.Params.LearningRate, _ = strconv.ParseFloat(params["learning-rate"], 64) 49 | algo.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64) 50 | steps, _ := strconv.ParseInt(params["steps"], 10, 32) 51 | algo.Params.Steps = int(steps) 52 | } 53 | 54 | func (algo *LogisticRegressionStream) Train(dataset *core.StreamingDataSet) { 55 | algo.Model = make(map[int64]float64) 56 | totalErr := 0.0 57 | n := 0 58 | for sample := range dataset.Samples { 59 | prediction := algo.Predict(sample) 60 | err := sample.LabelDoubleValue() - prediction 61 | totalErr += math.Abs(err) 62 | n += 1 63 | if n%100000 == 0 { 64 | log.Println("proc ", n, totalErr/100000.0, sample.LabelDoubleValue(), prediction) 65 | totalErr = 0.0 66 | } 67 | for _, feature := range sample.Features { 68 | model_feature_value, ok := algo.Model[feature.Id] 69 | if !ok { 70 | model_feature_value = 0.0 71 | } 72 | model_feature_value += algo.Params.LearningRate * (err*feature.Value - algo.Params.Regularization*model_feature_value) 73 | algo.Model[feature.Id] = model_feature_value 74 | } 75 | } 76 | } 77 | 78 | func (algo *LogisticRegressionStream) Predict(sample *core.Sample) float64 { 79 | ret := 0.0 80 | for _, feature := range sample.Features { 81 | model_feature_value, ok := algo.Model[feature.Id] 82 | if ok { 83 | ret += model_feature_value * feature.Value 84 | } 85 | } 86 | return util.Sigmoid(ret) 87 | } 88 | -------------------------------------------------------------------------------- /util/math_util.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "math" 5 | "strconv" 6 | ) 7 | 8 | func Sigmoid(x float64)(y float64) { 9 | y = 1 / (1 + math.Exp(-1 * x)) 10 | return y 11 | } 12 | 13 | func UnSigmoid(x float64) float64 { 14 | x = x * 0.99 + 0.01 15 | y := math.Log(x / (1 - x)) 16 | return y 17 | } 18 | 19 | func Signum(x float64) float64 { 20 | ret := 0.0 21 | if x > 0{ 22 | ret = 1.0 23 | } else if(x < 0) { 
24 | ret = -1.0 25 | } else { 26 | ret = 0.0 27 | } 28 | return ret 29 | } 30 | 31 | func ParseInt64(str string) int64 { 32 | ret, _ := strconv.ParseInt(str, 10, 64) 33 | return ret 34 | } 35 | 36 | func ParseFloat64(str string) float64 { 37 | ret, _ := strconv.ParseFloat(str, 64) 38 | return ret 39 | } 40 | 41 | type Gaussian struct { 42 | Mean, Vari float64 43 | } 44 | 45 | func (g *Gaussian) Integral(x float64) float64{ 46 | a1 := 0.254829592 47 | a2 := -0.284496736 48 | a3 := 1.421413741 49 | a4 := -1.453152027 50 | a5 := 1.061405429 51 | p := 0.3275911 52 | 53 | sign := 1.0 54 | if x < 0{ 55 | sign = -1.0 56 | } 57 | x = math.Abs(x) / math.Sqrt(2.0) 58 | 59 | t := 1.0 / (1.0 + p * x) 60 | y := 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.Exp(-x * x) 61 | return 0.5 * (1.0 + sign * y) 62 | } 63 | 64 | func (g *Gaussian) AddGaussian(g1 *Gaussian){ 65 | g.Mean += g1.Mean 66 | g.Vari += g1.Vari 67 | } 68 | 69 | func (g *Gaussian) MultGaussian(g1 *Gaussian){ 70 | Mean := (g.Mean * g1.Vari + g1.Mean * g.Vari) / (g.Vari + g1.Vari) 71 | Vari := g.Vari * g1.Vari / (g.Vari + g1.Vari) 72 | g.Mean = Mean 73 | g.Vari = Vari 74 | } 75 | 76 | func (g *Gaussian) Func(x float64) float64{ 77 | return math.Exp(-0.5 * x * x) * 0.3989423; 78 | } 79 | 80 | func (g *Gaussian) UpperTruncateGaussian(Mean, Vari, s float64){ 81 | sqrtVari := math.Sqrt(Vari) 82 | a := (s - Mean) / sqrtVari 83 | lambda := a 84 | if a < 4.0 { 85 | lambda = g.Func(a) / g.Integral(-1.0 * a) 86 | } 87 | Mean = Mean + sqrtVari * lambda 88 | if lambda * (lambda - a) > 1{ 89 | Vari = 0.0 90 | } else { 91 | Vari *= 1 - lambda * (lambda - a) 92 | } 93 | g.Mean = Mean 94 | g.Vari = Vari 95 | } 96 | 97 | func (g *Gaussian) LowerTruncateGaussian(Mean, Vari, s float64){ 98 | sqrtVari := math.Sqrt(Vari) 99 | a := (s - Mean) / sqrtVari 100 | delta := -1.0 * a 101 | if a > -4.0 { 102 | delta = g.Func(a) / g.Integral(a) 103 | } 104 | Mean = Mean - sqrtVari * delta 105 | if a * delta + delta * delta > 1.0 { 106 | Vari = 0.0 107 | } else { 108 | Vari *= 1 - a * delta - delta * delta 109 | } 110 | g.Mean = Mean 111 | g.Vari = Vari 112 | } -------------------------------------------------------------------------------- /eval/evaluation.go: -------------------------------------------------------------------------------- 1 | package eval 2 | 3 | import ( 4 | "sort" 5 | "math" 6 | ) 7 | 8 | type LabelPrediction struct { 9 | Prediction float64 10 | Label int 11 | } 12 | 13 | type RealPrediction struct { // Real valued 14 | Prediction float64 15 | Value float64 16 | } 17 | 18 | type By func(p1, p2 *LabelPrediction) bool 19 | 20 | type labelPredictionSorter struct { 21 | predictions []*LabelPrediction 22 | by By 23 | } 24 | 25 | func (s *labelPredictionSorter) Len() int { 26 | return len(s.predictions) 27 | } 28 | 29 | func (s *labelPredictionSorter) Swap(i, j int) { 30 | s.predictions[i], s.predictions[j] = s.predictions[j], s.predictions[i] 31 | } 32 | 33 | func (s *labelPredictionSorter) Less(i, j int) bool { 34 | return s.by(s.predictions[i], s.predictions[j]) 35 | } 36 | 37 | func (by By) Sort(predictions []*LabelPrediction) { 38 | sorter := &labelPredictionSorter{ 39 | predictions: predictions, 40 | by: by, 41 | } 42 | sort.Sort(sorter) 43 | } 44 | 45 | func AUC(predictions0 []*LabelPrediction) float64 { 46 | predictions := []*LabelPrediction{} 47 | for _, pred := range predictions0{ 48 | predictions = append(predictions, pred) 49 | } 50 | prediction := func(p1, p2 *LabelPrediction) bool { 51 | return p1.Prediction > 
p2.Prediction 52 | } 53 | 54 | By(prediction).Sort(predictions) 55 | 56 | pn := 0.0 57 | nn := float64(len(predictions)) 58 | ret := 0.0 59 | count := nn 60 | for i, lp := range predictions{ 61 | if lp.Label > 0 { 62 | pn += 1.0 63 | nn -= 1.0 64 | ret += float64(count) - float64(i) 65 | } 66 | } 67 | ret2 := pn * (pn + 1) / 2.0; 68 | if pn * nn == 0.0{ 69 | return 0.5 70 | } 71 | return (ret - ret2) / (pn * nn) 72 | } 73 | 74 | func RMSE(predictions []*LabelPrediction) float64 { 75 | ret := 0.0 76 | n := 0.0 77 | 78 | for _, pred := range predictions { 79 | ret += (float64(pred.Label) - pred.Prediction) * (float64(pred.Label) - pred.Prediction) 80 | n += 1.0 81 | } 82 | 83 | return math.Sqrt(ret / n) 84 | } 85 | 86 | func ErrorRate(predictions []*LabelPrediction) float64 { 87 | ret := 0.0 88 | n := 0.0 89 | 90 | for _, pred := range predictions { 91 | if (float64(pred.Label) - 0.5) * (pred.Prediction - 0.5) < 0 { 92 | ret += 1.0 93 | } 94 | n += 1.0 95 | } 96 | return ret / n 97 | } 98 | 99 | func RegRMSE(predictions []*RealPrediction) float64 { 100 | ret := 0.0 101 | n := 0.0 102 | 103 | for _, pred := range predictions { 104 | ret += (pred.Value - pred.Prediction) * (pred.Value - pred.Prediction) 105 | n += 1.0 106 | } 107 | 108 | return math.Sqrt(ret / n) 109 | } 110 | 111 | -------------------------------------------------------------------------------- /svm/l1vm.go: -------------------------------------------------------------------------------- 1 | package svm 2 | 3 | import ( 4 | "github.com/xlvector/hector/core" 5 | "github.com/xlvector/hector/lr" 6 | "math" 7 | "math/rand" 8 | "strconv" 9 | ) 10 | 11 | func Distance(x, y *core.Vector) float64 { 12 | z := x.Copy() 13 | z.AddVector(y, -1) 14 | d := z.NormL2() 15 | return d 16 | } 17 | 18 | func RBFKernel(x, y *core.Vector, radius float64) float64 { 19 | d := Distance(x, y) 20 | ret := math.Exp(-1.0 * d / radius) 21 | return ret 22 | } 23 | 24 | type L1VM struct { 25 | sv []*core.Vector 26 | ftrl *lr.FTRLLogisticRegression 27 | radius float64 28 | count int 29 | } 30 | 31 | func (self *L1VM) SaveModel(path string) { 32 | 33 | } 34 | 35 | func (self *L1VM) LoadModel(path string) { 36 | 37 | } 38 | 39 | func (c *L1VM) Init(params map[string]string) { 40 | c.ftrl = &(lr.FTRLLogisticRegression{}) 41 | c.ftrl.Init(params) 42 | c.radius, _ = strconv.ParseFloat(params["radius"], 64) 43 | count, _ := strconv.ParseInt(params["sv"], 10, 64) 44 | c.count = int(count) 45 | } 46 | 47 | func (c *L1VM) Predict(sample *core.Sample) float64 { 48 | x := sample.GetFeatureVector() 49 | return c.PredictVector(x) 50 | } 51 | 52 | func (c *L1VM) PredictVector(x *core.Vector) float64 { 53 | s := core.NewSample() 54 | for k, xs := range c.sv { 55 | 56 | s.AddFeature(core.Feature{Id: int64(k), Value: RBFKernel(xs, x, c.radius)}) 57 | } 58 | return c.ftrl.Predict(s) 59 | } 60 | 61 | func (c *L1VM) Train(dataset *core.DataSet) { 62 | c.sv = []*core.Vector{} 63 | kernel_dataset := core.NewDataSet() 64 | 65 | positive := []int{} 66 | negative := []int{} 67 | for i, si := range dataset.Samples { 68 | if si.Label > 0.0 { 69 | positive = append(positive, i) 70 | } else { 71 | negative = append(negative, i) 72 | } 73 | } 74 | 75 | perm_positive := rand.Perm(len(positive)) 76 | 77 | for i, k := range perm_positive { 78 | if i > c.count { 79 | break 80 | } 81 | c.sv = append(c.sv, dataset.Samples[positive[k]].GetFeatureVector()) 82 | } 83 | 84 | perm_negative := rand.Perm(len(negative)) 85 | 86 | for i, k := range perm_negative { 87 | if i > c.count { 88 | break 89 | 
} 90 | c.sv = append(c.sv, dataset.Samples[negative[k]].GetFeatureVector()) 91 | } 92 | 93 | for _, si := range dataset.Samples { 94 | xi := si.GetFeatureVector() 95 | tsample := core.NewSample() 96 | tsample.Label = si.Label 97 | for j, xj := range c.sv { 98 | tsample.AddFeature(core.Feature{Id: int64(j), Value: RBFKernel(xi, xj, c.radius)}) 99 | } 100 | kernel_dataset.AddSample(tsample) 101 | } 102 | 103 | c.ftrl.Train(kernel_dataset) 104 | } 105 | -------------------------------------------------------------------------------- /svm/linear_svm.go: -------------------------------------------------------------------------------- 1 | package svm 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "github.com/xlvector/hector/core" 7 | "github.com/xlvector/hector/util" 8 | "math" 9 | "math/rand" 10 | "os" 11 | "runtime" 12 | "strconv" 13 | "strings" 14 | ) 15 | 16 | /* 17 | This algorithm implement L1 Linear SVM described in "A Dual Coordinate Descent Method for Large-scale Linear SVM" 18 | You can download the paper from http://ntu.csie.org/~cjlin/papers/cddual.pdf 19 | */ 20 | type LinearSVM struct { 21 | sv []*core.Vector 22 | y []float64 23 | a []float64 24 | b float64 25 | C float64 26 | e float64 27 | w *core.Vector 28 | 29 | xx []float64 30 | } 31 | 32 | func (self *LinearSVM) SaveModel(path string) { 33 | sb := util.StringBuilder{} 34 | for f, g := range self.w.Data { 35 | sb.Int64(f) 36 | sb.Write("\t") 37 | sb.Float(g) 38 | sb.Write("\n") 39 | } 40 | sb.WriteToFile(path) 41 | } 42 | 43 | func (self *LinearSVM) LoadModel(path string) { 44 | file, _ := os.Open(path) 45 | defer file.Close() 46 | 47 | scaner := bufio.NewScanner(file) 48 | for scaner.Scan() { 49 | line := scaner.Text() 50 | tks := strings.Split(line, "\t") 51 | fid, _ := strconv.ParseInt(tks[0], 10, 64) 52 | fw, _ := strconv.ParseFloat(tks[1], 64) 53 | self.w.SetValue(fid, fw) 54 | } 55 | } 56 | 57 | func (c *LinearSVM) Init(params map[string]string) { 58 | c.C, _ = strconv.ParseFloat(params["c"], 64) 59 | c.e, _ = strconv.ParseFloat(params["e"], 64) 60 | 61 | c.w = core.NewVector() 62 | } 63 | 64 | func (c *LinearSVM) Predict(sample *core.Sample) float64 { 65 | x := sample.GetFeatureVector() 66 | return c.PredictVector(x) 67 | } 68 | 69 | func (c *LinearSVM) PredictVector(x *core.Vector) float64 { 70 | ret := c.w.Dot(x) 71 | return ret 72 | } 73 | 74 | func (c *LinearSVM) Train(dataset *core.DataSet) { 75 | c.sv = []*core.Vector{} 76 | c.y = []float64{} 77 | c.a = []float64{} 78 | for k, sample := range dataset.Samples { 79 | x := sample.GetFeatureVector() 80 | c.sv = append(c.sv, x) 81 | c.xx = append(c.xx, x.Dot(x)) 82 | if sample.Label > 0.0 { 83 | c.y = append(c.y, 1.0) 84 | } else { 85 | c.y = append(c.y, -1.0) 86 | } 87 | c.a = append(c.a, c.C*rand.Float64()*0.0) 88 | c.w.AddVector(x, c.y[k]*c.a[k]) 89 | } 90 | 91 | da0 := 0.0 92 | for { 93 | da := 0.0 94 | for i, ai := range c.a { 95 | g := c.y[i]*c.w.Dot(c.sv[i]) - 1.0 96 | pg := g 97 | if ai < 1e-9 { 98 | pg = math.Min(0.0, g) 99 | } else if ai > c.C-1e-9 { 100 | pg = math.Max(0.0, g) 101 | } 102 | 103 | if math.Abs(pg) > 1e-9 { 104 | ai0 := ai 105 | ai = math.Min(math.Max(0, ai-g/c.xx[i]), c.C) 106 | c.w.AddVector(c.sv[i], (ai-ai0)*c.y[i]) 107 | da += math.Abs(ai - ai0) 108 | } 109 | } 110 | da /= float64(len(c.a)) 111 | fmt.Println(da) 112 | if da < c.e || math.Abs(da-da0) < 1e-3 { 113 | break 114 | } 115 | da0 = da 116 | } 117 | 118 | c.sv = nil 119 | runtime.GC() 120 | } 121 | -------------------------------------------------------------------------------- 
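A minimal usage sketch for the LinearSVM above (an illustration, not part of the original sources). The "c" (box constraint C) and "e" (stopping tolerance on the average dual-variable change) parameter keys follow LinearSVM.Init and Train, and DataSet.Load(path, global) follows its use elsewhere in this repository; the training file name and the global value 0 are hypothetical placeholders:

    package main

    import (
    	"fmt"

    	"github.com/xlvector/hector/core"
    	"github.com/xlvector/hector/svm"
    )

    func main() {
    	// Keys read by LinearSVM.Init: "c" is the box constraint, "e" the
    	// stopping tolerance on the mean change of the dual variables.
    	params := map[string]string{"c": "1.0", "e": "0.01"}

    	dataset := core.NewDataSet()
    	// "train.tsv" is a hypothetical path; the second argument mirrors the
    	// global flag parsed by the runners (0 assumed here as a plain default).
    	if err := dataset.Load("train.tsv", 0); err != nil {
    		panic(err)
    	}

    	clf := &svm.LinearSVM{}
    	clf.Init(params)
    	clf.Train(dataset)

    	// Predict returns the raw margin w·x; its sign gives the class.
    	fmt.Println(clf.Predict(dataset.Samples[0]))
    }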
/core/array_vector.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | "github.com/xlvector/hector/util" 5 | "math" 6 | "strconv" 7 | "strings" 8 | ) 9 | 10 | type ArrayVector struct { 11 | data []float64 12 | } 13 | 14 | func NewArrayVector() *ArrayVector { 15 | v := ArrayVector{} 16 | v.data = []float64{} 17 | return &v 18 | } 19 | 20 | func (v *ArrayVector) ToString() []byte { 21 | sb := util.StringBuilder{} 22 | for _, value := range v.data { 23 | sb.Float(value) 24 | sb.Write("|") 25 | } 26 | return sb.Bytes() 27 | } 28 | 29 | func (v *ArrayVector) FromString(buf string) { 30 | tks := strings.Split(buf, "|") 31 | for _, tk := range tks { 32 | if len(tk) == 0 { 33 | continue 34 | } 35 | value, _ := strconv.ParseFloat(tk, 64) 36 | v.data = append(v.data, value) 37 | } 38 | } 39 | 40 | func (v *ArrayVector) Expand(size int) { 41 | for len(v.data) < size { 42 | v.data = append(v.data, 0.0) 43 | } 44 | } 45 | 46 | func (v *ArrayVector) AddValue(key int, value float64) { 47 | v.Expand(key + 1) 48 | v.data[key] += value 49 | } 50 | 51 | func (v *ArrayVector) GetValue(key int) float64 { 52 | if key >= len(v.data) { 53 | return 0.0 54 | } else { 55 | return v.data[key] 56 | } 57 | } 58 | 59 | func (v *ArrayVector) SetValue(key int, value float64) { 60 | v.Expand(key + 1) 61 | v.data[key] = value 62 | } 63 | 64 | func (v *ArrayVector) AddVector(v2 *ArrayVector, alpha float64) { 65 | for key, value := range v2.data { 66 | v.AddValue(key, value*alpha) 67 | } 68 | } 69 | 70 | func (v *ArrayVector) NormL2() float64 { 71 | ret := 0.0 72 | for _, val := range v.data { 73 | ret += val * val 74 | } 75 | return ret 76 | } 77 | 78 | func (v *ArrayVector) Copy() *ArrayVector { 79 | ret := NewArrayVector() 80 | for key, val := range v.data { 81 | ret.SetValue(key, val) 82 | } 83 | return ret 84 | } 85 | 86 | func (v *ArrayVector) KeyWithMaxValue() (int, float64) { 87 | ret := 0 88 | max_val := 0.0 89 | for key, val := range v.data { 90 | max_val = val 91 | ret = key 92 | break 93 | } 94 | for key, val := range v.data { 95 | if max_val < val { 96 | max_val = val 97 | ret = key 98 | } 99 | } 100 | return ret, max_val 101 | } 102 | 103 | func (v *ArrayVector) Sum() float64 { 104 | ret := 0.0 105 | for _, val := range v.data { 106 | ret += val 107 | } 108 | return ret 109 | } 110 | 111 | func (v *ArrayVector) Dot(v2 *ArrayVector) float64 { 112 | va := v 113 | vb := v2 114 | 115 | if len(v2.data) < len(v.data) { 116 | va = v2 117 | vb = v 118 | } 119 | ret := 0.0 120 | for key, a := range va.data { 121 | b := vb.data[key] 122 | ret += a * b 123 | } 124 | return ret 125 | } 126 | 127 | func (v *ArrayVector) Scale(s float64) { 128 | for i, _ := range v.data { 129 | v.data[i] *= s 130 | } 131 | } 132 | 133 | func (v *ArrayVector) SoftMaxNorm() *ArrayVector { 134 | sum := 0.0 135 | for _, val := range v.data { 136 | sum += math.Exp(val) 137 | } 138 | ret := NewArrayVector() 139 | for key, val := range v.data { 140 | ret.SetValue(key, math.Exp(val)/sum) 141 | } 142 | return ret 143 | } 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | hector 2 | ====== 3 | 4 | Golang machine learning lib. Currently, it can be used to solve binary classification problems. 5 | 6 | # Supported Algorithms 7 | 8 | 1. Logistic Regression 9 | 2. Factorized Machine 10 | 3. 
CART, Random Forest, Random Decision Tree, Gradient Boosting Decision Tree 11 | 4. Neural Network 12 | 13 | # Dataset Format 14 | 15 | Hector supports a libsvm-like data format. The following is a sample dataset: 16 | 17 | 1 1:0.7 3:0.1 9:0.4 18 | 0 2:0.3 4:0.9 7:0.5 19 | 0 2:0.7 5:0.3 20 | ... 21 | 22 | # How to Run 23 | 24 | ## Run as tools 25 | 26 | hector-cv.go helps you evaluate one algorithm by cross validation on a dataset; you can run it with the following steps: 27 | 28 | go get github.com/xlvector/hector 29 | go install github.com/xlvector/hector/hectorcv 30 | hectorcv --method [Method] --train [Data Path] --cv 10 31 | 32 | Here, Method is one of 33 | 34 | 1. lr : logistic regression with SGD and L2 regularization. 35 | 2. ftrl : FTRL-proximal logistic regression with L1 regularization. Please see the paper "Ad Click Prediction: a View from the Trenches" for more details. 36 | 3. ep : bayesian logistic regression with expectation propagation. Please see the paper "Web-Scale Bayesian Click-Through Rate Prediction for Sponsored Search Advertising in Microsoft’s Bing Search Engine" for more details. 37 | 4. fm : factorization machine 38 | 5. cart : classification tree 39 | 6. cart-regression : regression tree 40 | 7. rf : random forest 41 | 8. rdt : random decision trees 42 | 9. gbdt : gradient boosting decision tree 43 | 10. linear-svm : linear SVM with L1 regularization 44 | 11. svm : SVM optimized by SMO (currently, it is a linear SVM) 45 | 12. l1vm : vector machine with L1 regularization and an RBF kernel 46 | 13. knn : k-nearest neighbor classification 47 | 48 | hector-run.go helps you train one algorithm on a train dataset and test it on a test dataset; you can run it with the following steps: 49 | 50 | cd src 51 | go build hector-run.go 52 | ./hector-run --method [Method] --train [Data Path] --test [Data Path] 53 | 54 | The above command trains the algorithm on the train dataset and then tests it on the test dataset directly. If you want to train the algorithm and keep the model file, run the following: 55 | 56 | ./hector-run --method [Method] --action train --train [Data Path] --model [Model Path] 57 | 58 | Then, you can use the model file to test any test dataset: 59 | 60 | ./hector-run --method [Method] --action test --test [Data Path] --model [Model Path] 61 | 62 | # Benchmark 63 | 64 | ## Binary Classification 65 | 66 | The following datasets are used in the benchmarks; you can find them in the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/) 67 | 68 | 1. heart 69 | 2. fourclass 70 | 71 | I do 5-fold cross validation on each dataset and use AUC as the evaluation metric. The results are as follows: 72 | 73 | DataSet | Method | AUC 74 | ------- | ------ | --- 75 | heart | FTRL-LR | 0.9109 76 | heart | EP-LR | 0.8982 77 | heart | CART | 0.8231 78 | heart | RDT | 0.9155 79 | heart | RF | 0.9019 80 | heart | GBDT | 0.9061 81 | fourclass | FTRL-LR | 0.8281 82 | fourclass | EP-LR | 0.7986 83 | fourclass | CART | 0.9832 84 | fourclass | RDT | 0.9925 85 | fourclass | RF | 0.9947 86 | fourclass | GBDT | 0.9958 87 | 88 | -------------------------------------------------------------------------------- /mc_runner.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package hector is a Go based machine learning lib. It intends to implement famous machine learning algorithms in Go. 3 | Currently, it only supports algorithms which can solve binary classification problems. Supported algorithms include: 4 | 1. Decision Tree (CART, Random Forest, GBDT) 5 | 2. 
Logistic Regression 6 | 3. SVM 7 | 4. Neural Network 8 | */ 9 | package hector 10 | 11 | import ( 12 | "github.com/xlvector/hector/algo" 13 | "github.com/xlvector/hector/core" 14 | "os" 15 | "strconv" 16 | ) 17 | 18 | func MultiClassRun(classifier algo.MultiClassClassifier, train_path string, test_path string, pred_path string, params map[string]string) (float64, error) { 19 | global, _ := strconv.ParseInt(params["global"], 10, 64) 20 | train_dataset := core.NewDataSet() 21 | 22 | err := train_dataset.Load(train_path, global) 23 | 24 | if err != nil { 25 | return 0.5, err 26 | } 27 | 28 | test_dataset := core.NewDataSet() 29 | err = test_dataset.Load(test_path, global) 30 | if err != nil { 31 | return 0.5, err 32 | } 33 | classifier.Init(params) 34 | accuracy := MultiClassRunOnDataSet(classifier, train_dataset, test_dataset, pred_path, params) 35 | 36 | return accuracy, nil 37 | } 38 | 39 | func MultiClassTrain(classifier algo.MultiClassClassifier, train_path string, params map[string]string) error { 40 | global, _ := strconv.ParseInt(params["global"], 10, 64) 41 | train_dataset := core.NewDataSet() 42 | 43 | err := train_dataset.Load(train_path, global) 44 | 45 | if err != nil { 46 | return err 47 | } 48 | 49 | classifier.Init(params) 50 | classifier.Train(train_dataset) 51 | 52 | model_path, _ := params["model"] 53 | 54 | if model_path != "" { 55 | classifier.SaveModel(model_path) 56 | } 57 | 58 | return nil 59 | } 60 | 61 | func MultiClassTest(classifier algo.MultiClassClassifier, test_path string, pred_path string, params map[string]string) (float64, error) { 62 | global, _ := strconv.ParseInt(params["global"], 10, 64) 63 | 64 | model_path, _ := params["model"] 65 | classifier.Init(params) 66 | if model_path != "" { 67 | classifier.LoadModel(model_path) 68 | } else { 69 | return 0.0, nil 70 | } 71 | 72 | test_dataset := core.NewDataSet() 73 | err := test_dataset.Load(test_path, global) 74 | if err != nil { 75 | return 0.0, err 76 | } 77 | 78 | accuracy := MultiClassRunOnDataSet(classifier, nil, test_dataset, pred_path, params) 79 | 80 | return accuracy, nil 81 | } 82 | 83 | func MultiClassRunOnDataSet(classifier algo.MultiClassClassifier, train_dataset, test_dataset *core.DataSet, pred_path string, params map[string]string) float64 { 84 | 85 | if train_dataset != nil { 86 | classifier.Train(train_dataset) 87 | } 88 | 89 | var pred_file *os.File 90 | if pred_path != "" { 91 | pred_file, _ = os.Create(pred_path) 92 | } 93 | accuracy := 0.0 94 | total := 0.0 95 | for _, sample := range test_dataset.Samples { 96 | prediction := classifier.PredictMultiClass(sample) 97 | label, _ := prediction.KeyWithMaxValue() 98 | if int(label) == sample.Label { 99 | accuracy += 1.0 100 | } 101 | total += 1.0 102 | if pred_file != nil { 103 | pred_file.WriteString(strconv.Itoa(label) + "\n") 104 | } 105 | } 106 | if pred_path != "" { 107 | defer pred_file.Close() 108 | } 109 | 110 | return accuracy / total 111 | } 112 | -------------------------------------------------------------------------------- /core/sample.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | "github.com/xlvector/hector/util" 5 | ) 6 | 7 | /* 8 | Sample - for classification 9 | Here, label should be int value started from 0 10 | */ 11 | type Sample struct { 12 | Features []Feature 13 | Label int 14 | Prediction float64 15 | } 16 | 17 | func NewSample() *Sample { 18 | ret := Sample{} 19 | ret.Features = []Feature{} 20 | ret.Label = 0 21 | ret.Prediction = 0.0 22 | 
return &ret 23 | } 24 | 25 | func (s *Sample) Clone() *Sample { 26 | ret := NewSample() 27 | ret.Label = s.Label 28 | ret.Prediction = s.Prediction 29 | for _, feature := range s.Features { 30 | clone_feature := Feature{feature.Id, feature.Value} 31 | ret.Features = append(ret.Features, clone_feature) 32 | } 33 | 34 | return ret 35 | } 36 | 37 | func (s *Sample) ToString(includePrediction bool) []byte { 38 | sb := util.StringBuilder{} 39 | sb.Int(s.Label) 40 | sb.Write(" ") 41 | if includePrediction { 42 | sb.Float(s.Prediction) 43 | sb.Write(" ") 44 | } 45 | for _, feature := range s.Features { 46 | sb.Int64(feature.Id) 47 | sb.Write(":") 48 | sb.Float(feature.Value) 49 | sb.Write(" ") 50 | } 51 | return sb.Bytes() 52 | } 53 | 54 | func (s *Sample) LabelDoubleValue() float64 { 55 | if s.Label > 0 { 56 | return 1.0 57 | } else { 58 | return 0.0 59 | } 60 | } 61 | 62 | func (s *Sample) AddFeature(f Feature) { 63 | s.Features = append(s.Features, f) 64 | } 65 | 66 | /* RawSample */ 67 | type RawSample struct { 68 | Label int 69 | Prediction float64 70 | Features map[string]string 71 | } 72 | 73 | func NewRawSample() *RawSample { 74 | ret := RawSample{} 75 | ret.Features = make(map[string]string) 76 | ret.Label = 0 77 | ret.Prediction = 0.0 78 | return &ret 79 | } 80 | 81 | func (s *RawSample) GetFeatureValue(key string) string { 82 | value, ok := s.Features[key] 83 | if ok { 84 | return value 85 | } else { 86 | return "nil" 87 | } 88 | } 89 | 90 | /* MapBasedSample */ 91 | type MapBasedSample struct { 92 | Label int 93 | Prediction float64 94 | Features map[int64]float64 95 | } 96 | 97 | func (s *MapBasedSample) LabelDoubleValue() float64 { 98 | return float64(s.Label) 99 | } 100 | 101 | func (s *Sample) ToMapBasedSample() *MapBasedSample { 102 | ret := MapBasedSample{} 103 | ret.Features = make(map[int64]float64) 104 | ret.Label = s.Label 105 | ret.Prediction = s.Prediction 106 | for _, feature := range s.Features { 107 | ret.Features[feature.Id] = feature.Value 108 | } 109 | return &ret 110 | } 111 | 112 | func (s *Sample) GetFeatureVector() *Vector { 113 | ret := NewVector() 114 | for _, f := range s.Features { 115 | ret.SetValue(f.Id, f.Value) 116 | } 117 | return ret 118 | } 119 | 120 | /* 121 | RealSample 122 | Real valued samples for regression 123 | */ 124 | type RealSample struct { 125 | Features []Feature 126 | Prediction float64 127 | Value float64 128 | } 129 | 130 | func NewRealSample() *RealSample { 131 | ret := RealSample{} 132 | ret.Features = []Feature{} 133 | ret.Value = 0.0 134 | ret.Prediction = 0.0 135 | return &ret 136 | } 137 | 138 | func (rs *RealSample) GetFeatureVector() *Vector { 139 | ret := NewVector() 140 | for _, f := range rs.Features { 141 | ret.SetValue(f.Id, f.Value) 142 | } 143 | return ret 144 | } 145 | 146 | func (s *RealSample) AddFeature(f Feature) { 147 | s.Features = append(s.Features, f) 148 | } 149 | -------------------------------------------------------------------------------- /lr/ftrl_logistic_regression.go: -------------------------------------------------------------------------------- 1 | package lr 2 | 3 | import ( 4 | "bufio" 5 | "github.com/xlvector/hector/core" 6 | "github.com/xlvector/hector/util" 7 | "math" 8 | "os" 9 | "strconv" 10 | "strings" 11 | ) 12 | 13 | type FTRLLogisticRegressionParams struct { 14 | Alpha, Beta, Lambda1, Lambda2 float64 15 | Steps int 16 | } 17 | 18 | type FTRLFeatureWeight struct { 19 | ni, zi float64 20 | } 21 | 22 | func (w *FTRLFeatureWeight) Wi(p FTRLLogisticRegressionParams) float64 { 23 | wi := 0.0 24 | 
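// FTRL-proximal closed form for the lazily computed weight: w_i stays exactly zero unless |z_i| exceeds Lambda1; this per-coordinate L1 threshold is what makes the learned model sparse.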
if math.Abs(w.zi) > p.Lambda1 { 25 | wi = (util.Signum(w.zi)*p.Lambda1 - w.zi) / (p.Lambda2 + (p.Beta+math.Sqrt(w.ni))/p.Alpha) 26 | } 27 | return wi 28 | } 29 | 30 | type FTRLLogisticRegression struct { 31 | Model map[int64]FTRLFeatureWeight 32 | Params FTRLLogisticRegressionParams 33 | } 34 | 35 | func (algo *FTRLLogisticRegression) SaveModel(path string) { 36 | sb := util.StringBuilder{} 37 | for f, g := range algo.Model { 38 | sb.Int64(f) 39 | sb.Write("\t") 40 | sb.Float(g.ni) 41 | sb.Write("\t") 42 | sb.Float(g.zi) 43 | sb.Write("\n") 44 | } 45 | sb.WriteToFile(path) 46 | } 47 | 48 | func (algo *FTRLLogisticRegression) LoadModel(path string) { 49 | file, _ := os.Open(path) 50 | defer file.Close() 51 | 52 | scaner := bufio.NewScanner(file) 53 | for scaner.Scan() { 54 | line := scaner.Text() 55 | tks := strings.Split(line, "\t") 56 | fid, _ := strconv.ParseInt(tks[0], 10, 64) 57 | ni, _ := strconv.ParseFloat(tks[1], 64) 58 | zi, _ := strconv.ParseFloat(tks[2], 64) 59 | g := FTRLFeatureWeight{ni: ni, zi: zi} 60 | algo.Model[fid] = g 61 | } 62 | } 63 | 64 | func (algo *FTRLLogisticRegression) Predict(sample *core.Sample) float64 { 65 | ret := 0.0 66 | for _, feature := range sample.Features { 67 | model_feature_value, ok := algo.Model[feature.Id] 68 | if ok { 69 | ret += model_feature_value.Wi(algo.Params) * feature.Value 70 | } 71 | } 72 | return util.Sigmoid(ret) 73 | } 74 | 75 | func (algo *FTRLLogisticRegression) Init(params map[string]string) { 76 | algo.Model = make(map[int64]FTRLFeatureWeight) 77 | algo.Params.Alpha, _ = strconv.ParseFloat(params["alpha"], 64) 78 | algo.Params.Lambda1, _ = strconv.ParseFloat(params["lambda1"], 64) 79 | algo.Params.Lambda2, _ = strconv.ParseFloat(params["lambda2"], 64) 80 | algo.Params.Beta, _ = strconv.ParseFloat(params["beta"], 64) 81 | steps, _ := strconv.ParseInt(params["steps"], 10, 32) 82 | algo.Params.Steps = int(steps) 83 | } 84 | 85 | func (algo *FTRLLogisticRegression) Clear() { 86 | algo.Model = nil 87 | algo.Model = make(map[int64]FTRLFeatureWeight) 88 | } 89 | 90 | func (algo *FTRLLogisticRegression) Train(dataset *core.DataSet) { 91 | for step := 0; step < algo.Params.Steps; step++ { 92 | for _, sample := range dataset.Samples { 93 | prediction := algo.Predict(sample) 94 | err := sample.LabelDoubleValue() - prediction 95 | for _, feature := range sample.Features { 96 | model_feature_value, ok := algo.Model[feature.Id] 97 | if !ok { 98 | model_feature_value = FTRLFeatureWeight{0.0, 0.0} 99 | } 100 | zi := model_feature_value.zi 101 | ni := model_feature_value.ni 102 | gi := -1 * err * feature.Value 103 | sigma := (math.Sqrt(ni+gi*gi) - math.Sqrt(ni)) / algo.Params.Alpha 104 | wi := model_feature_value.Wi(algo.Params) 105 | zi += gi - sigma*wi 106 | ni += gi * gi 107 | algo.Model[feature.Id] = FTRLFeatureWeight{zi: zi, ni: ni} 108 | } 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /combine/category_feature_combination.go: -------------------------------------------------------------------------------- 1 | package combine 2 | 3 | import ( 4 | "fmt" 5 | "github.com/xlvector/hector/core" 6 | "github.com/xlvector/hector/eval" 7 | "github.com/xlvector/hector/lr" 8 | "math/rand" 9 | ) 10 | 11 | type CategoryFeatureCombination struct { 12 | algo *lr.EPLogisticRegression 13 | feature_combinations []core.CombinedFeature 14 | output string 15 | } 16 | 17 | func (c *CategoryFeatureCombination) Init(params map[string]string) { 18 | c.algo = &(lr.EPLogisticRegression{}) 19 | 
c.algo.Init(params) 20 | c.output = params["output"] 21 | } 22 | 23 | func (c *CategoryFeatureCombination) OneCVAUC(dataset0 *core.RawDataSet, combines []core.CombinedFeature, total_cv, cv int) float64 { 24 | dataset := dataset0.ToDataSet(nil, combines) 25 | 26 | train := dataset.Split(func(i int) bool { return i%total_cv != cv }) 27 | 28 | c.algo.Train(train) 29 | 30 | test := dataset.Split(func(i int) bool { return i%total_cv == cv }) 31 | 32 | predictions := []*eval.LabelPrediction{} 33 | for _, sample := range test.Samples { 34 | pred := c.algo.Predict(sample) 35 | lp := eval.LabelPrediction{Label: sample.Label, Prediction: pred} 36 | predictions = append(predictions, &lp) 37 | } 38 | auc := eval.AUC(predictions) 39 | c.algo.Clear() 40 | return auc 41 | } 42 | 43 | func (c *CategoryFeatureCombination) FindCombination(dataset *core.RawDataSet) []core.CombinedFeature { 44 | features := []string{} 45 | for fkey, _ := range dataset.FeatureKeys { 46 | features = append(features, fkey) 47 | } 48 | candidate_column_combines := []core.CombinedFeature{} 49 | c.feature_combinations = []core.CombinedFeature{} 50 | 51 | for i, fi := range features { 52 | c.feature_combinations = append(c.feature_combinations, core.CombinedFeature{fi}) 53 | for j, fj := range features[i+1:] { 54 | candidate_column_combines = append(candidate_column_combines, core.CombinedFeature{fi, fj}) 55 | for k, fk := range features[i+j+1:] { 56 | candidate_column_combines = append(candidate_column_combines, core.CombinedFeature{fi, fj, fk}) 57 | for _, ft := range features[i+j+k+1:] { 58 | candidate_column_combines = append(candidate_column_combines, core.CombinedFeature{fi, fj, fk, ft}) 59 | } 60 | } 61 | } 62 | } 63 | fmt.Printf("candidates %d\n", len(candidate_column_combines)) 64 | used_combines := make(map[int]bool) 65 | 66 | total_cv := 3 67 | 68 | best_auc := 0.0 69 | best_combines := -1 70 | for { 71 | if len(used_combines) == len(candidate_column_combines) { 72 | break 73 | } 74 | ok := false 75 | for i, column_combines := range candidate_column_combines { 76 | _, used := used_combines[i] 77 | if used { 78 | continue 79 | } 80 | temp_combines := c.feature_combinations 81 | temp_combines = append(temp_combines, column_combines) 82 | 83 | ave_auc := 0.0 84 | for cv := 0; cv < total_cv; cv++ { 85 | ave_auc += c.OneCVAUC(dataset, temp_combines, total_cv, cv) 86 | } 87 | ave_auc /= float64(total_cv) 88 | if best_auc < ave_auc { 89 | best_auc = ave_auc 90 | best_combines = i 91 | ok = true 92 | if rand.Intn(10) == 1 { 93 | break 94 | } 95 | } 96 | } 97 | if !ok { 98 | break 99 | } 100 | used_combines[best_combines] = true 101 | c.feature_combinations = append(c.feature_combinations, candidate_column_combines[best_combines]) 102 | fmt.Println(best_auc) 103 | fmt.Println(c.feature_combinations) 104 | } 105 | 106 | return c.feature_combinations 107 | } 108 | -------------------------------------------------------------------------------- /lr/quasinewton_helper.go: -------------------------------------------------------------------------------- 1 | package lr 2 | 3 | import ( 4 | "github.com/xlvector/hector/core" 5 | "math" 6 | ) 7 | 8 | /** 9 | * It's based the paper "Scalable Training of L1-Regularized Log-Linear Models" 10 | * by Galen Andrew and Jianfeng Gao 11 | * user: weixuan 12 | * To change this template use File | Settings | File Templates. 
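 * QuasiNewtonHelper keeps the recent (s, y) position/gradient difference pairs and provides the L-BFGS two-loop recursion (ApplyQuasiInverseHession) plus the backtracking line search shared by the LBFGS and OWLQN minimizers.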
13 | */ 14 | type QuasiNewtonHelper struct { 15 | // config 16 | numHist int64 17 | minimizer Minimizer 18 | // historical data 19 | sList, yList []*core.Vector 20 | roList []float64 21 | curPos, curGrad *core.Vector 22 | } 23 | 24 | type Minimizer interface { 25 | NextPoint(curPos *core.Vector, dir *core.Vector, alpha float64) *core.Vector 26 | Evaluate(curPos *core.Vector) float64 27 | } 28 | 29 | const MAX_BACKTRACKING_ITER = 50 30 | 31 | // Description: the pos and gradient arguments should NOT be modified outside 32 | func NewQuasiNewtonHelper(numHist int, minimizer Minimizer, curPos *core.Vector, curGrad *core.Vector) *QuasiNewtonHelper { 33 | h := new(QuasiNewtonHelper) 34 | h.numHist = int64(numHist) 35 | h.minimizer = minimizer 36 | h.curPos = curPos 37 | h.curGrad = curGrad 38 | h.sList = make([]*core.Vector, 0) 39 | h.yList = make([]*core.Vector, 0) 40 | h.roList = make([]float64, 0) 41 | return h 42 | } 43 | 44 | // Description: Update the dir from -grad to optimal direction 45 | // Dir will be modified directly 46 | func (h *QuasiNewtonHelper) ApplyQuasiInverseHession(dir *core.Vector) { 47 | count := len(h.sList) 48 | if count == 0 { 49 | return 50 | } 51 | alphas := make([]float64, count, count) 52 | for n := count - 1; n >= 0; n-- { 53 | alphas[n] = -dir.Dot(h.sList[n]) / h.roList[n] 54 | dir.ApplyElemWiseMultiplyAccumulation(h.yList[n], alphas[n]) 55 | } 56 | lastY := h.yList[count-1] 57 | yDotY := lastY.Dot(lastY) 58 | scalar := h.roList[count-1] / yDotY 59 | dir.ApplyScale(scalar) 60 | 61 | for n := 0; n < count; n++ { 62 | beta := dir.Dot(h.yList[n]) / h.roList[n] 63 | dir.ApplyElemWiseMultiplyAccumulation(h.sList[n], -alphas[n]-beta) 64 | } 65 | return 66 | } 67 | 68 | func (h *QuasiNewtonHelper) BackTrackingLineSearch(cost float64, pos *core.Vector, grad *core.Vector, dir *core.Vector, isInit bool) (nextCost float64, nextPos *core.Vector) { 69 | dotGradDir := grad.Dot(dir) 70 | if dotGradDir == 0 { 71 | return cost, pos 72 | } 73 | if dotGradDir > 0 { 74 | panic("BackTracking: to the opposite direction of grad") 75 | } 76 | 77 | alpha := 1.0 78 | backoff := 0.5 79 | if isInit { 80 | normDir := math.Sqrt(dir.Dot(dir)) 81 | alpha = (1 / normDir) 82 | backoff = 0.1 83 | } 84 | 85 | var c1 float64 = 1e-4 86 | for cntItr := 0; cntItr <= MAX_BACKTRACKING_ITER; cntItr++ { 87 | nextPos = h.minimizer.NextPoint(pos, dir, alpha) 88 | nextCost = h.minimizer.Evaluate(nextPos) 89 | if nextCost <= cost+c1*dotGradDir*alpha { 90 | break 91 | } 92 | alpha *= backoff 93 | } 94 | return nextCost, nextPos 95 | } 96 | 97 | // Description: the pos and gradient arguments should NOT be modified outside 98 | func (h *QuasiNewtonHelper) UpdateState(nextPos *core.Vector, nextGrad *core.Vector) (isOptimal bool) { 99 | if int64(len(h.sList)) >= h.numHist { 100 | h.sList = h.sList[1:] 101 | h.yList = h.yList[1:] 102 | h.roList = h.roList[1:] 103 | } 104 | newS := nextPos.ElemWiseMultiplyAdd(h.curPos, -1) 105 | newY := nextGrad.ElemWiseMultiplyAdd(h.curGrad, -1) 106 | ro := newS.Dot(newY) 107 | h.sList = append(h.sList, newS) 108 | h.yList = append(h.yList, newY) 109 | h.roList = append(h.roList, ro) 110 | h.curPos = nextPos 111 | h.curGrad = nextGrad 112 | return ro == 0 113 | } 114 | -------------------------------------------------------------------------------- /dt/random_forest.go: -------------------------------------------------------------------------------- 1 | package dt 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "log" 7 | "os" 8 | "strconv" 9 | "strings" 10 | "sync" 11 | 12 | 
"github.com/xlvector/hector/core" 13 | ) 14 | 15 | type RandomForestParams struct { 16 | TreeCount int 17 | FeatureCount float64 18 | } 19 | 20 | type RandomForest struct { 21 | trees []*Tree 22 | params RandomForestParams 23 | cart CART 24 | continuous_features bool 25 | } 26 | 27 | func (self *RandomForest) SaveModel(path string) { 28 | file, _ := os.Create(path) 29 | defer file.Close() 30 | for _, tree := range self.trees { 31 | buf := tree.ToString() 32 | file.Write(buf) 33 | file.WriteString("\n#\n") 34 | } 35 | } 36 | 37 | func (self *RandomForest) LoadModel(path string) { 38 | file, _ := os.Open(path) 39 | defer file.Close() 40 | 41 | self.trees = []*Tree{} 42 | reader := bufio.NewReader(file) 43 | text := []string{} 44 | for { 45 | line, err := reader.ReadString('\n') 46 | if err != nil { 47 | break 48 | } 49 | line = strings.TrimSpace(line) 50 | if line == "#" { 51 | tree := Tree{} 52 | tree.fromString(text) 53 | self.trees = append(self.trees, &tree) 54 | text = []string{} 55 | } else { 56 | text = append(text, line) 57 | } 58 | } 59 | log.Println("rf tree count :", len(self.trees)) 60 | } 61 | 62 | func (dt *RandomForest) Init(params map[string]string) { 63 | dt.trees = []*Tree{} 64 | dt.cart.Init(params) 65 | tree_count, _ := strconv.ParseInt(params["tree-count"], 10, 64) 66 | feature_count, _ := strconv.ParseFloat(params["feature-count"], 64) 67 | dt.params.TreeCount = int(tree_count) 68 | dt.params.FeatureCount = feature_count 69 | } 70 | 71 | func (dt *RandomForest) Train(dataset *core.DataSet) { 72 | samples := []*core.MapBasedSample{} 73 | feature_weights := make(map[int64]float64) 74 | for _, sample := range dataset.Samples { 75 | if !dt.continuous_features { 76 | for _, f := range sample.Features { 77 | _, ok := feature_weights[f.Id] 78 | if !ok { 79 | feature_weights[f.Id] = f.Value 80 | } 81 | if feature_weights[f.Id] != f.Value { 82 | dt.continuous_features = true 83 | } 84 | } 85 | } 86 | msample := sample.ToMapBasedSample() 87 | samples = append(samples, msample) 88 | } 89 | dt.cart.continuous_features = dt.continuous_features 90 | 91 | trees := make(chan *Tree, dt.params.TreeCount) 92 | var wait sync.WaitGroup 93 | wait.Add(dt.params.TreeCount) 94 | 95 | for i := 0; i < dt.params.TreeCount; i++ { 96 | 97 | go func() { 98 | tree := dt.cart.SingleTreeBuild(samples, dt.params.FeatureCount, true) 99 | trees <- &tree 100 | fmt.Printf(".") 101 | wait.Done() 102 | }() 103 | } 104 | wait.Wait() 105 | fmt.Println() 106 | close(trees) 107 | for tree := range trees { 108 | dt.trees = append(dt.trees, tree) 109 | } 110 | } 111 | 112 | func (dt *RandomForest) Predict(sample *core.Sample) float64 { 113 | msample := sample.ToMapBasedSample() 114 | predictions := 0.0 115 | total := 0.0 116 | for _, tree := range dt.trees { 117 | node, _ := PredictBySingleTree(tree, msample) 118 | predictions += node.prediction.GetValue(1) 119 | total += 1.0 120 | } 121 | return predictions / total 122 | } 123 | 124 | func (dt *RandomForest) PredictMultiClass(sample *core.Sample) *core.ArrayVector { 125 | msample := sample.ToMapBasedSample() 126 | predictions := core.NewArrayVector() 127 | total := 0.0 128 | for _, tree := range dt.trees { 129 | node, _ := PredictBySingleTree(tree, msample) 130 | predictions.AddVector(node.prediction, 1.0) 131 | total += 1.0 132 | } 133 | predictions.Scale(1.0 / total) 134 | return predictions 135 | } 136 | -------------------------------------------------------------------------------- /lr/lr_owlqn.go: 
-------------------------------------------------------------------------------- 1 | package lr 2 | 3 | import ( 4 | "bufio" 5 | "github.com/xlvector/hector/core" 6 | "github.com/xlvector/hector/util" 7 | "math" 8 | "os" 9 | "strconv" 10 | "strings" 11 | ) 12 | 13 | type LROWLQNParams struct { 14 | Regularization float64 15 | } 16 | 17 | type LROWLQN struct { 18 | Model *core.Vector 19 | Params LROWLQNParams 20 | // for training 21 | dataSet *core.DataSet 22 | lastPos *core.Vector 23 | lastCost float64 24 | lastGrad *core.Vector 25 | } 26 | 27 | func (lr *LROWLQN) SaveModel(path string) { 28 | sb := util.StringBuilder{} 29 | for key, val := range lr.Model.Data { 30 | sb.Int64(key) 31 | sb.Write("\t") 32 | sb.Float(val) 33 | sb.Write("\n") 34 | } 35 | sb.WriteToFile(path) 36 | } 37 | 38 | func (lr *LROWLQN) LoadModel(path string) { 39 | file, _ := os.Open(path) 40 | defer file.Close() 41 | 42 | scaner := bufio.NewScanner(file) 43 | for scaner.Scan() { 44 | line := scaner.Text() 45 | tks := strings.Split(line, "\t") 46 | key, _ := strconv.ParseInt(tks[0], 10, 64) 47 | val, _ := strconv.ParseFloat(tks[1], 64) 48 | lr.Model.SetValue(key, val) 49 | } 50 | } 51 | 52 | func (lr *LROWLQN) Init(params map[string]string) { 53 | lr.Model = core.NewVector() 54 | lr.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64) 55 | } 56 | 57 | func (lr *LROWLQN) updateValueGrad(pos *core.Vector, dataset *core.DataSet) { 58 | var totalLoss float64 = 0.0 59 | var grad *core.Vector = core.NewVector() 60 | for _, sample := range dataset.Samples { 61 | var score float64 = lr.getScore(pos, sample) 62 | var signScore float64 = score 63 | if sample.Label == 0 { 64 | signScore = -score 65 | } 66 | var prob float64 67 | var lnProb float64 68 | if signScore < -30 { 69 | prob = 0 70 | lnProb = signScore 71 | } else if signScore > 30 { 72 | prob = 1 73 | lnProb = 0 74 | } else { 75 | prob = 1.0 / (1.0 + math.Exp(-signScore)) 76 | lnProb = math.Log(prob) 77 | } 78 | var scale float64 79 | if sample.Label == 0 { 80 | scale = (1 - prob) 81 | } else { 82 | scale = -(1 - prob) 83 | } 84 | totalLoss += -lnProb 85 | for _, fea := range sample.Features { 86 | grad.AddValue(fea.Id, scale*fea.Value) 87 | } 88 | } 89 | lr.lastPos = pos.Copy() 90 | lr.lastCost = totalLoss 91 | lr.lastGrad = grad 92 | } 93 | 94 | func (lr *LROWLQN) Equals(x *core.Vector, y *core.Vector) bool { 95 | if y == nil && x == nil { 96 | return true 97 | } 98 | if y == nil || x == nil { 99 | return false 100 | } 101 | for key, val := range x.Data { 102 | if y.GetValue(key) != val { 103 | return false 104 | } 105 | } 106 | for key, val := range y.Data { 107 | if x.GetValue(key) != val { 108 | return false 109 | } 110 | } 111 | return true 112 | } 113 | 114 | func (lr *LROWLQN) Value(pos *core.Vector) float64 { 115 | if lr.Equals(pos, lr.lastPos) { 116 | return lr.lastCost 117 | } 118 | lr.updateValueGrad(pos, lr.dataSet) 119 | return lr.lastCost 120 | } 121 | 122 | func (lr *LROWLQN) Gradient(pos *core.Vector) *core.Vector { 123 | if lr.Equals(pos, lr.lastPos) { 124 | return lr.lastGrad 125 | } 126 | lr.updateValueGrad(pos, lr.dataSet) 127 | return lr.lastGrad 128 | } 129 | 130 | func (lr *LROWLQN) Train(dataset *core.DataSet) { 131 | lr.dataSet = dataset 132 | minimizer := NewOWLQNMinimizer(lr.Params.Regularization) 133 | lr.Model = minimizer.Minimize(lr, core.NewVector()) 134 | } 135 | 136 | func (lr *LROWLQN) getScore(model *core.Vector, sample *core.Sample) float64 { 137 | var score float64 = 0 138 | for _, fea := range sample.Features 
{ 139 | score += model.GetValue(fea.Id) * fea.Value 140 | } 141 | return score 142 | } 143 | 144 | func (lr *LROWLQN) Predict(sample *core.Sample) float64 { 145 | score := lr.getScore(lr.Model, sample) 146 | score = 1.0 / (1.0 + math.Exp(-score)) 147 | return score 148 | } 149 | -------------------------------------------------------------------------------- /lr/ep_logistic_regression.go: -------------------------------------------------------------------------------- 1 | package lr 2 | 3 | import ( 4 | "bufio" 5 | "github.com/xlvector/hector/core" 6 | "github.com/xlvector/hector/util" 7 | "math" 8 | "os" 9 | "strconv" 10 | "strings" 11 | ) 12 | 13 | type EPLogisticRegressionParams struct { 14 | init_var, beta float64 15 | } 16 | 17 | type EPLogisticRegression struct { 18 | Model map[int64]*util.Gaussian 19 | params EPLogisticRegressionParams 20 | } 21 | 22 | func (algo *EPLogisticRegression) SaveModel(path string) { 23 | sb := util.StringBuilder{} 24 | for f, g := range algo.Model { 25 | sb.Int64(f) 26 | sb.Write("\t") 27 | sb.Float(g.Mean) 28 | sb.Write("\t") 29 | sb.Float(g.Vari) 30 | sb.Write("\n") 31 | } 32 | sb.WriteToFile(path) 33 | } 34 | 35 | func (algo *EPLogisticRegression) LoadModel(path string) { 36 | file, _ := os.Open(path) 37 | defer file.Close() 38 | 39 | scaner := bufio.NewScanner(file) 40 | for scaner.Scan() { 41 | line := scaner.Text() 42 | tks := strings.Split(line, "\t") 43 | fid, _ := strconv.ParseInt(tks[0], 10, 64) 44 | mean, _ := strconv.ParseFloat(tks[1], 64) 45 | vari, _ := strconv.ParseFloat(tks[2], 64) 46 | g := util.Gaussian{Mean: mean, Vari: vari} 47 | algo.Model[fid] = &g 48 | } 49 | } 50 | 51 | func (algo *EPLogisticRegression) Predict(sample *core.Sample) float64 { 52 | s := util.Gaussian{Mean: 0.0, Vari: 0.0} 53 | for _, feature := range sample.Features { 54 | if feature.Value == 0.0 { 55 | continue 56 | } 57 | wi, ok := algo.Model[feature.Id] 58 | if !ok { 59 | wi = &(util.Gaussian{Mean: 0.0, Vari: algo.params.init_var}) 60 | } 61 | s.Mean += feature.Value * wi.Mean 62 | s.Vari += feature.Value * feature.Value * wi.Vari 63 | } 64 | 65 | t := s 66 | t.Vari += algo.params.beta 67 | return t.Integral(t.Mean / math.Sqrt(t.Vari)) 68 | } 69 | 70 | func (algo *EPLogisticRegression) Init(params map[string]string) { 71 | algo.Model = make(map[int64]*util.Gaussian) 72 | algo.params.beta, _ = strconv.ParseFloat(params["beta"], 64) 73 | algo.params.init_var = 1.0 74 | } 75 | 76 | func (algo *EPLogisticRegression) Clear() { 77 | algo.Model = nil 78 | algo.Model = make(map[int64]*util.Gaussian) 79 | } 80 | 81 | func (algo *EPLogisticRegression) Train(dataset *core.DataSet) { 82 | 83 | for _, sample := range dataset.Samples { 84 | s := util.Gaussian{Mean: 0.0, Vari: 0.0} 85 | for _, feature := range sample.Features { 86 | if feature.Value == 0.0 { 87 | continue 88 | } 89 | wi, ok := algo.Model[feature.Id] 90 | if !ok { 91 | wi = &(util.Gaussian{Mean: 0.0, Vari: algo.params.init_var}) 92 | algo.Model[feature.Id] = wi 93 | } 94 | s.Mean += feature.Value * wi.Mean 95 | s.Vari += feature.Value * feature.Value * wi.Vari 96 | } 97 | 98 | t := s 99 | t.Vari += algo.params.beta 100 | 101 | t2 := util.Gaussian{Mean: 0.0, Vari: 0.0} 102 | if sample.Label > 0.0 { 103 | t2.UpperTruncateGaussian(t.Mean, t.Vari, 0.0) 104 | } else { 105 | t2.LowerTruncateGaussian(t.Mean, t.Vari, 0.0) 106 | } 107 | t.MultGaussian(&t2) 108 | s2 := t 109 | s2.Vari += algo.params.beta 110 | s0 := s 111 | s.MultGaussian(&s2) 112 | 113 | for _, feature := range sample.Features { 114 | if feature.Value == 
0.0 { 115 | continue 116 | } 117 | wi0 := util.Gaussian{Mean: 0.0, Vari: algo.params.init_var} 118 | w2 := util.Gaussian{Mean: 0.0, Vari: 0.0} 119 | wi, _ := algo.Model[feature.Id] 120 | w2.Mean = (s.Mean - (s0.Mean - wi.Mean*feature.Value)) / feature.Value 121 | w2.Vari = (s.Vari + (s0.Vari - wi.Vari*feature.Value*feature.Value)) / (feature.Value * feature.Value) 122 | wi.MultGaussian(&w2) 123 | wi_vari := wi.Vari 124 | wi_new_vari := wi_vari * wi0.Vari / (0.99*wi0.Vari + 0.01*wi.Vari) 125 | wi.Vari = wi_new_vari 126 | wi.Mean = wi.Vari * (0.99*wi.Mean/wi_vari + 0.01*wi0.Mean/wi.Vari) 127 | if wi.Vari < algo.params.init_var*0.01 { 128 | wi.Vari = algo.params.init_var * 0.01 129 | } 130 | algo.Model[feature.Id] = wi 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /bin/hector-stack.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "github.com/xlvector/hector" 7 | "github.com/xlvector/hector/core" 8 | "github.com/xlvector/hector/eval" 9 | "github.com/xlvector/hector/lr" 10 | "os" 11 | "strconv" 12 | "sync" 13 | ) 14 | 15 | func SplitFile(input string, total int, part int) (string, string, error) { 16 | file, err := os.Open(input) 17 | if err != nil { 18 | return "", "", err 19 | } 20 | defer file.Close() 21 | 22 | train_path := input + ".train." + strconv.Itoa(part) 23 | train_file, err := os.Create(train_path) 24 | if err != nil { 25 | return "", "", err 26 | } 27 | defer train_file.Close() 28 | 29 | test_path := input + ".test." + strconv.Itoa(part) 30 | test_file, err := os.Create(test_path) 31 | if err != nil { 32 | return "", "", err 33 | } 34 | defer test_file.Close() 35 | 36 | scanner := bufio.NewScanner(file) 37 | k := 0 38 | for scanner.Scan() { 39 | if k%total == part { 40 | test_file.WriteString(scanner.Text() + "\n") 41 | } else { 42 | train_file.WriteString(scanner.Text() + "\n") 43 | } 44 | k += 1 45 | } 46 | return train_path, test_path, nil 47 | } 48 | 49 | func main() { 50 | train_path, test_path, pred_path, _, params := hector.PrepareParams() 51 | total := 5 52 | methods := []string{"ftrl", "fm"} 53 | all_methods_predictions := [][]*eval.LabelPrediction{} 54 | all_methods_test_predictions := [][]*eval.LabelPrediction{} 55 | for _, method := range methods { 56 | fmt.Println(method) 57 | average_auc := 0.0 58 | all_predictions := []*eval.LabelPrediction{} 59 | for part := 0; part < total; part++ { 60 | train, test, _ := SplitFile(train_path, total, part) 61 | classifier := hector.GetClassifier(method) 62 | 63 | auc, predictions, _ := hector.AlgorithmRun(classifier, train, test, "", params) 64 | fmt.Println("AUC:") 65 | fmt.Println(auc) 66 | average_auc += auc 67 | os.Remove(train) 68 | os.Remove(test) 69 | classifier = nil 70 | for _, pred := range predictions { 71 | all_predictions = append(all_predictions, pred) 72 | } 73 | } 74 | all_methods_predictions = append(all_methods_predictions, all_predictions) 75 | fmt.Println(average_auc / float64(total)) 76 | 77 | classifier := hector.GetClassifier(method) 78 | fmt.Println(test_path) 79 | _, test_predictions, _ := hector.AlgorithmRun(classifier, train_path, test_path, "", params) 80 | all_methods_test_predictions = append(all_methods_test_predictions, test_predictions) 81 | } 82 | 83 | var wait sync.WaitGroup 84 | wait.Add(2) 85 | dataset := core.NewDataSet() 86 | go func() { 87 | for i, _ := range all_methods_predictions[0] { 88 | sample := core.NewSample() 89 | 
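// Build the level-1 (stacking) training sample: keep the original label and add one feature per base method, holding that method's out-of-fold prediction from the 5-way split above.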
sample.Label = all_methods_predictions[0][i].Label 90 | for j, _ := range all_methods_predictions { 91 | feature := core.Feature{Id: int64(j), Value: all_methods_predictions[j][i].Prediction} 92 | sample.AddFeature(feature) 93 | } 94 | dataset.Samples <- sample 95 | } 96 | close(dataset.Samples) 97 | wait.Done() 98 | }() 99 | 100 | ensembler := lr.LinearRegression{} 101 | go func() { 102 | ensembler.Init(params) 103 | ensembler.Train(dataset) 104 | wait.Done() 105 | }() 106 | wait.Wait() 107 | 108 | fmt.Println(ensembler.Model) 109 | 110 | wait.Add(2) 111 | test_dataset := hector.NewDataSet() 112 | go func() { 113 | for i, _ := range all_methods_test_predictions[0] { 114 | sample := hector.NewSample() 115 | sample.Label = all_methods_test_predictions[0][i].Prediction 116 | for j, _ := range all_methods_test_predictions { 117 | feature := hector.Feature{Id: int64(j), Value: all_methods_test_predictions[j][i].Prediction} 118 | sample.AddFeature(feature) 119 | } 120 | test_dataset.Samples <- sample 121 | } 122 | close(test_dataset.Samples) 123 | wait.Done() 124 | }() 125 | 126 | go func() { 127 | pred_file, _ := os.Create(test_path + ".out") 128 | for sample := range test_dataset.Samples { 129 | prediction := sample.Label //ensembler.Predict(sample) 130 | pred_file.WriteString(strconv.FormatFloat(prediction, 'g', 5, 64) + "\n") 131 | } 132 | defer pred_file.Close() 133 | wait.Done() 134 | }() 135 | wait.Wait() 136 | } 137 | -------------------------------------------------------------------------------- /svm/svm.go: -------------------------------------------------------------------------------- 1 | package svm 2 | 3 | import ( 4 | "fmt" 5 | "github.com/xlvector/hector/core" 6 | "math" 7 | "math/rand" 8 | "strconv" 9 | ) 10 | 11 | type SVM struct { 12 | sv []*core.Vector 13 | y []float64 14 | a []float64 15 | b float64 16 | C float64 17 | e float64 18 | w *core.Vector 19 | 20 | xx []float64 21 | } 22 | 23 | func (self *SVM) SaveModel(path string) { 24 | 25 | } 26 | 27 | func (self *SVM) LoadModel(path string) { 28 | 29 | } 30 | 31 | type SVMValues struct { 32 | a1, a2, e1, e2, k11, k12, k22 float64 33 | i1, i2 int 34 | } 35 | 36 | func (c *SVM) Init(params map[string]string) { 37 | c.C, _ = strconv.ParseFloat(params["c"], 64) 38 | c.e, _ = strconv.ParseFloat(params["e"], 64) 39 | 40 | c.w = core.NewVector() 41 | } 42 | 43 | func (c *SVM) Predict(sample *core.Sample) float64 { 44 | x := sample.GetFeatureVector() 45 | return c.PredictVector(x) 46 | } 47 | 48 | func (c *SVM) PredictVector(x *core.Vector) float64 { 49 | ret := c.w.Dot(x) - c.b 50 | return ret 51 | } 52 | 53 | func (c *SVM) MatchKKT(y, f, a float64) bool { 54 | ep := c.C * 0.01 55 | if a < ep && y*f > 1.0 { 56 | return true 57 | } 58 | 59 | if a > c.C-ep && y*f < 1.0 { 60 | return true 61 | } 62 | 63 | if a > ep && a < c.C-ep && y*f == 1.0 { 64 | return true 65 | } 66 | 67 | return false 68 | } 69 | 70 | func (c *SVM) Train(dataset *core.DataSet) { 71 | c.sv = []*core.Vector{} 72 | c.y = []float64{} 73 | c.a = []float64{} 74 | for k, sample := range dataset.Samples { 75 | x := sample.GetFeatureVector() 76 | c.sv = append(c.sv, x) 77 | c.xx = append(c.xx, x.Dot(x)) 78 | if sample.Label > 0.0 { 79 | c.y = append(c.y, 1.0) 80 | } else { 81 | c.y = append(c.y, -1.0) 82 | } 83 | c.a = append(c.a, c.C*rand.Float64()) 84 | c.w.AddVector(x, c.y[k]*c.a[k]) 85 | } 86 | 87 | c.b = 0.0 88 | for k, x := range c.sv { 89 | c.b += c.PredictVector(x) - c.y[k] 90 | } 91 | c.b /= float64(len(c.sv)) 92 | fmt.Println(c.b) 93 | 94 | for step := 0; step 
< 100; step++ { 95 | da := 0.0 96 | for i1 := 0; i1 < len(c.sv); i1++ { 97 | a1 := c.a[i1] 98 | x1 := c.sv[i1] 99 | y1 := c.y[i1] 100 | p1 := c.PredictVector(x1) 101 | if c.MatchKKT(y1, p1, a1) { 102 | continue 103 | } 104 | maxde := 0.0 105 | best_values := SVMValues{} 106 | for k2 := 0; k2 < 10; k2++ { 107 | i2 := rand.Intn(len(c.sv)) 108 | if i1 == i2 { 109 | continue 110 | } 111 | 112 | x2 := c.sv[i2] 113 | y2 := c.y[i2] 114 | p2 := c.PredictVector(x2) 115 | k11 := c.xx[i1] 116 | k12 := x1.Dot(x2) 117 | k22 := c.xx[i2] 118 | 119 | a2 := c.a[i2] 120 | 121 | u := math.Max(0, a2-a1) 122 | v := math.Min(c.C, c.C+a2-a1) 123 | if y1*y2 > 0.0 { 124 | u = math.Max(0, a2+a1-c.C) 125 | v = math.Min(c.C, a1+a2) 126 | } 127 | 128 | e1 := p1 - y1 129 | e2 := p2 - y2 130 | 131 | a2old := a2 132 | a2 += y2 * (e1 - e2) / (k11 + k22 - 2*k12) 133 | 134 | a2 = math.Max(u, math.Min(a2, v)) 135 | 136 | a1 += y1 * y2 * (a2old - a2) 137 | 138 | if math.Abs(e1-e2) > maxde { 139 | maxde = math.Abs(e1 - e2) 140 | best_values.a1 = a1 141 | best_values.a2 = a2 142 | best_values.i1 = i1 143 | best_values.i2 = i2 144 | best_values.e1 = e1 145 | best_values.e2 = e2 146 | } 147 | if maxde >= 4.0 { 148 | break 149 | } 150 | } 151 | da += math.Abs(c.a[best_values.i1] - best_values.a1) 152 | c.w.AddVector(c.sv[best_values.i1], c.y[best_values.i1]*(best_values.a1-c.a[best_values.i1])) 153 | c.w.AddVector(c.sv[best_values.i2], c.y[best_values.i2]*(best_values.a2-c.a[best_values.i2])) 154 | /* 155 | b1 := c.b - best_values.e1 - c.y[best_values.i1] * (best_values.a1 - c.a[best_values.i1]) * best_values.k11 - c.y[best_values.i2] * (best_values.a2 - c.a[best_values.i2]) * best_values.k12 156 | b2 := c.b - best_values.e2 - c.y[best_values.i1] * (best_values.a1 - c.a[best_values.i1]) * best_values.k12 - c.y[best_values.i2] * (best_values.a2 - c.a[best_values.i2]) * best_values.k22 157 | if best_values.a1 > 0.0 && best_values.a1 < c.C{ 158 | c.b = b1 159 | } else { 160 | if best_values.a2 > 0.0 && best_values.a2 < c.C { 161 | c.b = b2 162 | } else { 163 | c.b = 0.5 * (b1 + b2) 164 | } 165 | }*/ 166 | c.a[best_values.i1] = best_values.a1 167 | c.a[best_values.i2] = best_values.a2 168 | } 169 | da /= float64(len(c.sv)) 170 | fmt.Printf(".. %f %f\n", da, c.b) 171 | if da < c.e { 172 | break 173 | } 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /lr/owlqn_minimizer.go: -------------------------------------------------------------------------------- 1 | package lr 2 | 3 | import ( 4 | "fmt" 5 | "github.com/xlvector/hector/core" 6 | "math" 7 | ) 8 | 9 | /** 10 | * It's based the paper "Scalable Training of L1-Regularized Log-Linear Models" 11 | * by Galen Andrew and Jianfeng Gao 12 | * user: weixuan 13 | * To change this template use File | Settings | File Templates. 
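 * OWLQNMinimizer adapts L-BFGS to the L1 term: updateGrad forms the orthant-wise pseudo-gradient, fixDirSign zeroes direction components that disagree with it, and NextPoint projects coordinates that cross zero back to zero.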
14 | */ 15 | type OWLQNMinimizer struct { 16 | l1reg float64 17 | costFun DiffFunction 18 | numHist int 19 | maxIteration int 20 | tolerance float64 21 | } 22 | 23 | var owlqn_output_switch bool = false 24 | 25 | func NewOWLQNMinimizer(l1reg float64) *OWLQNMinimizer { 26 | m := new(OWLQNMinimizer) 27 | m.l1reg = l1reg 28 | m.numHist = 10 29 | m.maxIteration = 20 30 | m.tolerance = 1e-4 31 | return m 32 | } 33 | 34 | func (m *OWLQNMinimizer) Minimize(costfun DiffFunction, init *core.Vector) *core.Vector { 35 | m.costFun = costfun 36 | var cost float64 = m.Evaluate(init) 37 | var grad *core.Vector = costfun.Gradient(init).Copy() 38 | var pos *core.Vector = init.Copy() 39 | var terminalCriterion *relativeMeanImprCriterion = NewRelativeMeanImprCriterion(m.tolerance) 40 | terminalCriterion.addCost(cost) 41 | 42 | var helper *QuasiNewtonHelper = NewQuasiNewtonHelper(m.numHist, m, pos, grad) 43 | if owlqn_output_switch { 44 | fmt.Println("Iter\tcost\timprovement") 45 | fmt.Printf("%d\t%e\tUndefined", 0, cost) 46 | } 47 | for iter := 1; iter <= m.maxIteration; iter++ { 48 | // customed steepest descending dir 49 | steepestDescDir := grad.Copy() 50 | m.updateGrad(pos, steepestDescDir) 51 | steepestDescDir.ApplyScale(-1.0) 52 | dir := steepestDescDir.Copy() 53 | // quasi-newton dir 54 | helper.ApplyQuasiInverseHession(dir) 55 | m.fixDirSign(dir, steepestDescDir) 56 | // customed grad for the new position 57 | potentialGrad := grad.Copy() 58 | m.updateGradForNewPos(pos, potentialGrad, dir) 59 | newCost, newPos := helper.BackTrackingLineSearch(cost, pos, potentialGrad, dir, iter == 1) 60 | if owlqn_output_switch { 61 | fmt.Println("") 62 | } 63 | if cost == newCost { 64 | break 65 | } 66 | cost = newCost 67 | pos = newPos 68 | grad = costfun.Gradient(pos).Copy() 69 | terminalCriterion.addCost(cost) 70 | if owlqn_output_switch { 71 | fmt.Printf("%d\t%e\t%e", iter, newCost, terminalCriterion.improvement) 72 | } 73 | if terminalCriterion.isTerminable() || helper.UpdateState(pos, grad) { 74 | if owlqn_output_switch { 75 | fmt.Println("") 76 | } 77 | break 78 | } 79 | } 80 | return pos 81 | } 82 | 83 | // Description: assume all the features in x also appears in grad 84 | // all the features in dir must be in grad 85 | func (m *OWLQNMinimizer) updateGradForNewPos(x *core.Vector, grad *core.Vector, dir *core.Vector) { 86 | if m.l1reg == 0 { 87 | return 88 | } 89 | for key, val := range grad.Data { 90 | xval := x.GetValue(key) 91 | if xval < 0 { 92 | grad.SetValue(key, val-m.l1reg) 93 | } else if xval > 0 { 94 | grad.SetValue(key, val+m.l1reg) 95 | } else { 96 | dirval := dir.GetValue(key) 97 | if dirval < 0 { 98 | grad.SetValue(key, val-m.l1reg) 99 | } else if dirval > 0 { 100 | grad.SetValue(key, val+m.l1reg) 101 | } 102 | } 103 | } 104 | return 105 | } 106 | 107 | // Description: assume all the features in x also appears in grad 108 | func (m *OWLQNMinimizer) updateGrad(x *core.Vector, grad *core.Vector) { 109 | if m.l1reg == 0 { 110 | return 111 | } 112 | for key, val := range grad.Data { 113 | xval := x.GetValue(key) 114 | if xval < 0 { 115 | grad.SetValue(key, val-m.l1reg) 116 | } else if xval > 0 { 117 | grad.SetValue(key, val+m.l1reg) 118 | } else { 119 | if val < -m.l1reg { 120 | grad.SetValue(key, val+m.l1reg) 121 | } else if val > m.l1reg { 122 | grad.SetValue(key, val-m.l1reg) 123 | } 124 | } 125 | } 126 | return 127 | } 128 | 129 | func (m *OWLQNMinimizer) fixDirSign(dir *core.Vector, steepestDescDir *core.Vector) { 130 | if m.l1reg == 0 { 131 | return 132 | } 133 | for key, val := range 
dir.Data { 134 | if val*steepestDescDir.GetValue(key) <= 0 { 135 | dir.SetValue(key, 0) 136 | } 137 | } 138 | } 139 | 140 | func (m *OWLQNMinimizer) Evaluate(pos *core.Vector) float64 { 141 | cost := m.costFun.Value(pos) 142 | for _, val := range pos.Data { 143 | cost += math.Abs(val) * m.l1reg 144 | } 145 | return cost 146 | } 147 | 148 | func (m *OWLQNMinimizer) NextPoint(curPos *core.Vector, dir *core.Vector, alpha float64) *core.Vector { 149 | if owlqn_output_switch { 150 | fmt.Printf(".") 151 | } 152 | newPos := curPos.ElemWiseMultiplyAdd(dir, alpha) 153 | if m.l1reg > 0 { 154 | for key, val := range curPos.Data { 155 | if val*newPos.GetValue(key) < 0 { 156 | newPos.SetValue(key, 0) 157 | } 158 | } 159 | } 160 | return newPos 161 | } 162 | -------------------------------------------------------------------------------- /core/vector.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | "github.com/xlvector/hector/util" 5 | "math" 6 | "math/rand" 7 | "strconv" 8 | "strings" 9 | ) 10 | 11 | type Vector struct { 12 | Data map[int64]float64 13 | } 14 | 15 | func NewVector() *Vector { 16 | v := Vector{} 17 | v.Data = make(map[int64]float64) 18 | return &v 19 | } 20 | 21 | func (v *Vector) ToString() []byte { 22 | sb := util.StringBuilder{} 23 | for key, value := range v.Data { 24 | sb.Int64(key) 25 | sb.Write(":") 26 | sb.Float(value) 27 | sb.Write("|") 28 | } 29 | return sb.Bytes() 30 | } 31 | 32 | func (v *Vector) FromString(buf string) { 33 | tks := strings.Split(buf, "|") 34 | for _, tk := range tks { 35 | if len(tk) == 0 { 36 | continue 37 | } 38 | kv := strings.Split(tk, ":") 39 | key, _ := strconv.ParseInt(kv[0], 10, 64) 40 | value, _ := strconv.ParseFloat(kv[1], 64) 41 | v.Data[key] = value 42 | } 43 | } 44 | 45 | func (v *Vector) AddValue(key int64, value float64) { 46 | _, ok := v.Data[key] 47 | if ok { 48 | v.Data[key] += value 49 | } else { 50 | v.Data[key] = value 51 | } 52 | } 53 | 54 | func (v *Vector) GetValue(key int64) float64 { 55 | value, ok := v.Data[key] 56 | if !ok { 57 | return 0.0 58 | } else { 59 | return value 60 | } 61 | } 62 | 63 | func (v *Vector) RandomInit(key int64, c float64) { 64 | value, ok := v.Data[key] 65 | if !ok { 66 | value = rand.NormFloat64() * c 67 | v.Data[key] = value 68 | } 69 | } 70 | 71 | func (v *Vector) SetValue(key int64, value float64) { 72 | v.Data[key] = value 73 | } 74 | 75 | func (v *Vector) AddVector(v2 *Vector, alpha float64) { 76 | for key, value := range v2.Data { 77 | v.AddValue(key, value*alpha) 78 | } 79 | } 80 | 81 | func (v *Vector) NormL2() float64 { 82 | ret := 0.0 83 | for _, val := range v.Data { 84 | ret += val * val 85 | } 86 | return ret 87 | } 88 | 89 | func (v *Vector) Copy() *Vector { 90 | ret := NewVector() 91 | for key, val := range v.Data { 92 | ret.SetValue(key, val) 93 | } 94 | return ret 95 | } 96 | 97 | func (v *Vector) KeyWithMaxValue() (int64, float64) { 98 | ret := int64(0) 99 | max_val := 0.0 100 | for key, val := range v.Data { 101 | max_val = val 102 | ret = key 103 | break 104 | } 105 | for key, val := range v.Data { 106 | if max_val < val { 107 | max_val = val 108 | ret = key 109 | } 110 | } 111 | return ret, max_val 112 | } 113 | 114 | func (v *Vector) Sum() float64 { 115 | ret := 0.0 116 | for _, val := range v.Data { 117 | ret += val 118 | } 119 | return ret 120 | } 121 | 122 | func (v *Vector) Dot(v2 *Vector) float64 { 123 | va := v 124 | vb := v2 125 | 126 | if len(v2.Data) < len(v.Data) { 127 | va = v2 128 | vb = v 129 | } 130 
| ret := 0.0 131 | for key, a := range va.Data { 132 | b, ok := vb.Data[key] 133 | if ok { 134 | ret += a * b 135 | } 136 | } 137 | return ret 138 | } 139 | 140 | func (v *Vector) DotFeatures(fs []Feature) float64 { 141 | ret := 0.0 142 | for _, f := range fs { 143 | ret += f.Value * v.GetValue(f.Id) 144 | } 145 | return ret 146 | } 147 | 148 | type ElemOperation func(float64) float64 149 | 150 | func (v *Vector) ApplyOnElem(fn ElemOperation) *Vector { 151 | ret := NewVector() 152 | for key, val := range v.Data { 153 | ret.SetValue(key, fn(val)) 154 | } 155 | return ret 156 | } 157 | 158 | func (v *Vector) Scale(scale float64) *Vector { 159 | ret := NewVector() 160 | for key, val := range v.Data { 161 | ret.SetValue(key, val*scale) 162 | } 163 | return ret 164 | } 165 | 166 | func (v *Vector) ApplyScale(scale float64) { 167 | for key, val := range v.Data { 168 | v.Data[key] = val * scale 169 | } 170 | } 171 | 172 | func (v *Vector) SoftMaxNorm() *Vector { 173 | sum := 0.0 174 | for _, val := range v.Data { 175 | sum += math.Exp(val) 176 | } 177 | ret := NewVector() 178 | for key, val := range v.Data { 179 | ret.SetValue(key, math.Exp(val)/sum) 180 | } 181 | return ret 182 | } 183 | 184 | func (v *Vector) ElemWiseAddVector(u *Vector) *Vector { 185 | ret := NewVector() 186 | for key, vi := range v.Data { 187 | ret.SetValue(key, vi) 188 | } 189 | for key, ui := range u.Data { 190 | ret.AddValue(key, ui) 191 | } 192 | return ret 193 | } 194 | 195 | func (v *Vector) ElemWiseMultiply(u *Vector) *Vector { 196 | ret := NewVector() 197 | for key, val := range v.Data { 198 | ual := u.GetValue(key) 199 | if ual != 0 && val != 0 { 200 | ret.SetValue(key, val*ual) 201 | } 202 | } 203 | return ret 204 | } 205 | 206 | func (v *Vector) ElemWiseMultiplyAdd(u *Vector, s float64) *Vector { 207 | ret := NewVector() 208 | for key, val := range v.Data { 209 | ret.SetValue(key, val) 210 | } 211 | for key, val := range u.Data { 212 | ret.AddValue(key, val*s) 213 | } 214 | return ret 215 | } 216 | 217 | func (v *Vector) ApplyElemWiseMultiplyAccumulation(u *Vector, s float64) { 218 | for key, val := range u.Data { 219 | v.AddValue(key, val*s) 220 | } 221 | } 222 | 223 | func (v *Vector) OuterProduct(u *Vector) *Matrix { 224 | ret := NewMatrix() 225 | for key, vi := range v.Data { 226 | ret.Data[key] = u.Scale(vi) 227 | } 228 | return ret 229 | } 230 | 231 | func (v *Vector) MultiplyMatrix(m *Matrix) *Vector { 232 | ret := NewVector() 233 | for k, v := range v.Data { 234 | u, ok := m.Data[k] 235 | if ok { 236 | for ki, ui := range u.Data { 237 | ret.Data[ki] += v * ui 238 | } 239 | } 240 | } 241 | return ret 242 | } 243 | -------------------------------------------------------------------------------- /gp/gaussian_process.go: -------------------------------------------------------------------------------- 1 | package gp 2 | 3 | import ( 4 | "github.com/xlvector/hector/core" 5 | "math" 6 | "strconv" 7 | ) 8 | 9 | type GaussianProcessParameters struct { 10 | Dim int64 11 | Theta float64 12 | } 13 | 14 | type GaussianProcess struct { 15 | Params GaussianProcessParameters 16 | CovarianceFunc CovFunc 17 | CovMatrix *core.Matrix 18 | TargetValues *core.Vector 19 | InvCovTarget *core.Vector // inv(CovMatrix)*TargetValues 20 | DataSet *core.RealDataSet 21 | TrainingDataCount int64 22 | } 23 | 24 | func (self *GaussianProcess) SaveModel(path string) { 25 | 26 | } 27 | 28 | func (self *GaussianProcess) LoadModel(path string) { 29 | 30 | } 31 | 32 | /* 33 | Given matrix m and vector v, compute inv(m)*v. 
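
In outline (matching the loop below), two conjugate-gradient recursions run
side by side. One maximizes the lower-bound functional

    Q_l(y) = y'u - 0.5 * y'Cy

whose maximizer is exactly inv(C)*u; the other drives an upper bound Q_u
computed through A. The iteration stops once the gap

    dQ = (u'u/2 - Q_u)/theta - Q_l

falls below tol, and the current lower-bound iterate y_l is returned.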
34 | Based on Gibbs and MacKay 1997, and Mark N. Gibbs's PhD dissertation 35 | 36 | Details: 37 | A - positive semidefinite matrix 38 | u - a vector 39 | theta - positive number 40 | C = A + I*theta 41 | Returns inv(C)*u - so the covariance matrix needs a diagonal noise term. 42 | However, the algorithm is numerically stable: the noise term can be very small and the inversion can still be calculated. 43 | */ 44 | func (algo *GaussianProcess) ApproximateInversion(A *core.Matrix, u *core.Vector, theta float64, dim int64) *core.Vector { 45 | max_itr := 500 46 | tol := 0.01 47 | 48 | C := core.NewMatrix() 49 | for key, val := range A.Data { 50 | C.Data[key] = val.Copy() 51 | } 52 | 53 | // Add theta to diagonal elements 54 | for i := int64(0); i < dim; i++ { 55 | _, ok := C.Data[i] 56 | if !ok { 57 | C.Data[i] = core.NewVector() 58 | } 59 | C.Data[i].Data[i] = C.Data[i].Data[i] + theta 60 | } 61 | 62 | var Q_l float64 63 | var Q_u float64 64 | var dQ float64 65 | u_norm := u.Dot(u) / 2 66 | 67 | // Lower bound 68 | y_l := core.NewVector() 69 | g_l := u.Copy() 70 | h_l := u.Copy() 71 | lambda_l := float64(0) 72 | gamma_l := float64(0) 73 | var tmp_f1 float64 74 | var tmp_f2 float64 75 | var tmp_v1 *core.Vector 76 | tmp_f1 = g_l.Dot(g_l) 77 | tmp_v1 = C.MultiplyVector(h_l) 78 | 79 | // Upper bound 80 | y_u := core.NewVector() 81 | g_u := u.Copy() 82 | h_u := u.Copy() 83 | lambda_u := float64(0) 84 | gamma_u := float64(0) 85 | var tmp_f3 float64 86 | var tmp_f4 float64 87 | var tmp_v3 *core.Vector 88 | var tmp_v4 *core.Vector 89 | tmp_v3 = g_u.MultiplyMatrix(A) 90 | tmp_v4 = C.MultiplyVector(h_u) 91 | tmp_f3 = tmp_v1.Dot(g_u) 92 | 93 | for i := 0; i < max_itr; i++ { 94 | // Lower bound 95 | lambda_l = tmp_f1 / h_l.Dot(tmp_v1) 96 | y_l.AddVector(h_l, lambda_l) //y_l next 97 | Q_l = y_l.Dot(u) - 0.5*(y_l.MultiplyMatrix(C)).Dot(y_l) 98 | 99 | // Upper bound 100 | lambda_u = tmp_f3 / tmp_v3.Dot(tmp_v4) 101 | y_u.AddVector(h_u, lambda_u) //y_u next 102 | Q_u = (y_u.MultiplyMatrix(A)).Dot(u) - 0.5*((y_u.MultiplyMatrix(C)).MultiplyMatrix(A)).Dot(y_u) 103 | 104 | dQ = (u_norm-Q_u)/theta - Q_l 105 | if dQ < tol { 106 | break 107 | } 108 | 109 | // Lower bound var updates 110 | g_l.AddVector(tmp_v1, -lambda_l) //g_l next 111 | tmp_f2 = g_l.Dot(g_l) 112 | gamma_l = tmp_f2 / tmp_f1 113 | for key, val := range h_l.Data { 114 | h_l.SetValue(key, val*gamma_l) 115 | } 116 | h_l.AddVector(g_l, 1) //h_l next 117 | tmp_f1 = tmp_f2 //tmp_f1 next 118 | tmp_v1 = C.MultiplyVector(h_l) //tmp_v1 next 119 | 120 | // Upper bound var updates 121 | g_u.AddVector(tmp_v4, -lambda_u) //g_u next 122 | tmp_v3 = g_u.MultiplyMatrix(A) //tmp_v3 next 123 | tmp_f4 = tmp_v3.Dot(g_u) 124 | gamma_u = tmp_f4 / tmp_f3 125 | for key, val := range h_u.Data { 126 | h_u.SetValue(key, val*gamma_u) 127 | } 128 | h_u.AddVector(g_u, 1) //h_u next 129 | tmp_v4 = C.MultiplyVector(h_u) //tmp_v4 next 130 | tmp_f3 = tmp_f4 // tmp_f3 next 131 | } 132 | 133 | return y_l 134 | } 135 | 136 | func (algo *GaussianProcess) ExtractTargetValuesAsVector(samples []*core.RealSample) *core.Vector { 137 | targets := core.NewVector() 138 | for i := 0; i < len(samples); i++ { 139 | targets.SetValue(int64(i), samples[i].Value) 140 | } 141 | return targets 142 | } 143 | 144 | func (algo *GaussianProcess) Init(params map[string]string) { 145 | 146 | dim, _ := strconv.ParseInt(params["dim"], 10, 64) 147 | 148 | algo.Params = GaussianProcessParameters{} 149 | algo.Params.Dim = dim // Pass in dim as a param, and require the feature space to be continuous.
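// A note on the kernel built below: by its name, CovSEARD is assumed to be
// the standard squared-exponential ARD covariance (its definition lives in
// gp/covariance_function.go, not reproduced in this dump), i.e. roughly
//
//   k(x, x') = amp * exp(-0.5 * sum_d (x_d - x'_d)^2 / r_d^2)
//
// Init fills every length-scale r_d with the same radius (0.1) and sets
// amp = 40.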
150 | algo.Params.Theta = 1e-7 // Used by approximate inversion as the diagonal noise 151 | 152 | radius := 0.1 153 | camp := 40.0 154 | cf := CovSEARD{} 155 | radiuses := core.NewVector() 156 | for i := int64(1); i <= dim; i++ { 157 | radiuses.SetValue(i, radius) 158 | } 159 | cf.Init(radiuses, camp) 160 | 161 | algo.CovarianceFunc = cf.Cov 162 | } 163 | 164 | func (algo *GaussianProcess) Train(dataset *core.RealDataSet) { 165 | algo.DataSet = dataset 166 | algo.TrainingDataCount = int64(len(dataset.Samples)) 167 | algo.CovMatrix = CovMatrix(algo.DataSet.Samples, algo.CovarianceFunc) 168 | algo.TargetValues = algo.ExtractTargetValuesAsVector(algo.DataSet.Samples) 169 | algo.InvCovTarget = algo.ApproximateInversion(algo.CovMatrix, algo.TargetValues, algo.Params.Theta, algo.TrainingDataCount) 170 | } 171 | 172 | func (algo *GaussianProcess) Predict(sample *core.RealSample) float64 { 173 | k := CovVector(algo.DataSet.Samples, sample, algo.CovarianceFunc) 174 | pred := k.Dot(algo.InvCovTarget) 175 | 176 | return pred 177 | } 178 | 179 | func (algo *GaussianProcess) PredictStd(sample *core.RealSample) float64 { 180 | k := CovVector(algo.DataSet.Samples, sample, algo.CovarianceFunc) 181 | C_inv_k := algo.ApproximateInversion(algo.CovMatrix, k, algo.Params.Theta, algo.TrainingDataCount) 182 | std := math.Sqrt(algo.CovarianceFunc(sample.GetFeatureVector(), sample.GetFeatureVector()) - k.Dot(C_inv_k)) 183 | return std 184 | } 185 | -------------------------------------------------------------------------------- /ann/neural_network.go: -------------------------------------------------------------------------------- 1 | package ann 2 | 3 | import ( 4 | "fmt" 5 | "github.com/xlvector/hector/core" 6 | "github.com/xlvector/hector/util" 7 | "math" 8 | "math/rand" 9 | "strconv" 10 | ) 11 | 12 | type NeuralNetworkParams struct { 13 | LearningRate float64 14 | LearningRateDiscount float64 15 | Regularization float64 16 | Hidden int64 17 | Steps int 18 | Verbose int 19 | } 20 | 21 | type TwoLayerWeights struct { 22 | L1 *core.Matrix 23 | L2 *core.Matrix 24 | } 25 | 26 | /* 27 | Please refer to this chapter to know algorithm details : 28 | http://www4.rgu.ac.uk/files/chapter3%20-%20bp.pdf 29 | */ 30 | type NeuralNetwork struct { 31 | Model TwoLayerWeights 32 | MaxLabel int64 33 | Params NeuralNetworkParams 34 | } 35 | 36 | func RandomInitVector(dim int64) *core.Vector { 37 | v := core.NewVector() 38 | var i int64 39 | for i = 0; i < dim; i++ { 40 | v.Data[i] = (rand.Float64() - 0.5) / math.Sqrt(float64(dim)) 41 | } 42 | return v 43 | } 44 | 45 | func (self *NeuralNetwork) SaveModel(path string) { 46 | 47 | } 48 | 49 | func (self *NeuralNetwork) LoadModel(path string) { 50 | 51 | } 52 | 53 | func (algo *NeuralNetwork) Init(params map[string]string) { 54 | algo.Params.LearningRate, _ = strconv.ParseFloat(params["learning-rate"], 64) 55 | algo.Params.LearningRateDiscount, _ = strconv.ParseFloat(params["learning-rate-discount"], 64) 56 | algo.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64) 57 | steps, _ := strconv.ParseInt(params["steps"], 10, 32) 58 | hidden, _ := strconv.ParseInt(params["hidden"], 10, 64) 59 | verbose, _ := strconv.ParseInt(params["verbose"], 10, 32) 60 | 61 | algo.Params.Steps = int(steps) 62 | algo.Params.Hidden = int64(hidden) 63 | algo.Params.Verbose = int(verbose) 64 | } 65 | 66 | func (algo *NeuralNetwork) Train(dataset *core.DataSet) { 67 | algo.Model = TwoLayerWeights{} 68 | algo.Model.L1 = core.NewMatrix() 69 | algo.Model.L2 = core.NewMatrix() 70 | 71 | for 
i := int64(0); i < algo.Params.Hidden; i++ { 72 | algo.Model.L1.Data[i] = core.NewVector() 73 | } 74 | 75 | initalized := make(map[int64]int) 76 | max_label := 0 77 | for _, sample := range dataset.Samples { 78 | if max_label < sample.Label { 79 | max_label = sample.Label 80 | } 81 | for _, f := range sample.Features { 82 | _, ok := initalized[f.Id] 83 | if !ok { 84 | for i := int64(0); i < algo.Params.Hidden; i++ { 85 | algo.Model.L1.SetValue(i, f.Id, (rand.Float64()-0.5)/math.Sqrt(float64(algo.Params.Hidden))) 86 | } 87 | initalized[f.Id] = 1 88 | } 89 | } 90 | } 91 | algo.MaxLabel = int64(max_label) 92 | 93 | for i := int64(0); i <= algo.Params.Hidden; i++ { 94 | for j := int64(0); j <= algo.MaxLabel; j++ { 95 | algo.Model.L2.SetValue(i, j, (rand.NormFloat64() / math.Sqrt(float64(algo.MaxLabel)+1.0))) 96 | } 97 | } 98 | 99 | for step := 0; step < algo.Params.Steps; step++ { 100 | if algo.Params.Verbose <= 0 { 101 | fmt.Printf(".") 102 | } 103 | total := len(dataset.Samples) 104 | counter := 0 105 | for _, sample := range dataset.Samples { 106 | y := core.NewVector() 107 | z := core.NewVector() 108 | e := core.NewVector() 109 | delta_hidden := core.NewVector() 110 | 111 | for i := int64(0); i < algo.Params.Hidden; i++ { 112 | sum := float64(0) 113 | wi := algo.Model.L1.Data[i] 114 | for _, f := range sample.Features { 115 | sum += f.Value * wi.GetValue(f.Id) 116 | } 117 | y.Data[i] = util.Sigmoid(sum) 118 | } 119 | y.Data[algo.Params.Hidden] = 1.0 120 | for i := int64(0); i <= algo.MaxLabel; i++ { 121 | sum := float64(0) 122 | for j := int64(0); j <= algo.Params.Hidden; j++ { 123 | sum += y.GetValue(j) * algo.Model.L2.GetValue(j, i) 124 | } 125 | z.SetValue(i, sum) 126 | } 127 | z = z.SoftMaxNorm() 128 | e.SetValue(int64(sample.Label), 1.0) 129 | e.AddVector(z, -1.0) 130 | 131 | for i := int64(0); i <= algo.Params.Hidden; i++ { 132 | delta := float64(0) 133 | for j := int64(0); j <= algo.MaxLabel; j++ { 134 | wij := algo.Model.L2.GetValue(i, j) 135 | sig_ij := e.GetValue(j) * (1 - z.GetValue(j)) * z.GetValue(j) 136 | delta += sig_ij * wij 137 | wij += algo.Params.LearningRate * (y.GetValue(i)*sig_ij - algo.Params.Regularization*wij) 138 | algo.Model.L2.SetValue(i, j, wij) 139 | } 140 | delta_hidden.SetValue(i, delta) 141 | } 142 | 143 | for i := int64(0); i < algo.Params.Hidden; i++ { 144 | wi := algo.Model.L1.Data[i] 145 | for _, f := range sample.Features { 146 | wji := wi.GetValue(f.Id) 147 | wji += algo.Params.LearningRate * (delta_hidden.GetValue(i)*f.Value*y.GetValue(i)*(1-y.GetValue(i)) - algo.Params.Regularization*wji) 148 | wi.SetValue(f.Id, wji) 149 | } 150 | } 151 | counter++ 152 | if algo.Params.Verbose > 0 && counter%2000 == 0 { 153 | fmt.Printf("Epoch %d %f%%\n", step+1, float64(counter)/float64(total)*100) 154 | } 155 | } 156 | 157 | if algo.Params.Verbose > 0 { 158 | algo.Evaluate(dataset) 159 | } 160 | algo.Params.LearningRate *= algo.Params.LearningRateDiscount 161 | } 162 | fmt.Println() 163 | } 164 | 165 | func (algo *NeuralNetwork) PredictMultiClass(sample *core.Sample) *core.ArrayVector { 166 | y := core.NewVector() 167 | z := core.NewArrayVector() 168 | for i := int64(0); i < algo.Params.Hidden; i++ { 169 | sum := float64(0) 170 | for _, f := range sample.Features { 171 | sum += f.Value * algo.Model.L1.Data[i].GetValue(f.Id) 172 | } 173 | y.Data[i] = util.Sigmoid(sum) 174 | } 175 | y.Data[algo.Params.Hidden] = 1 176 | for i := 0; i <= int(algo.MaxLabel); i++ { 177 | sum := float64(0) 178 | for j := int64(0); j <= algo.Params.Hidden; j++ { 179 | sum += y.GetValue(j) 
* algo.Model.L2.GetValue(j, int64(i)) 180 | } 181 | z.SetValue(i, sum) 182 | } 183 | z = z.SoftMaxNorm() 184 | return z 185 | } 186 | 187 | func (algo *NeuralNetwork) Predict(sample *core.Sample) float64 { 188 | z := algo.PredictMultiClass(sample) 189 | return z.GetValue(1) 190 | } 191 | 192 | func (algo *NeuralNetwork) Evaluate(dataset *core.DataSet) { 193 | accuracy := 0.0 194 | total := 0.0 195 | for _, sample := range dataset.Samples { 196 | prediction := algo.PredictMultiClass(sample) 197 | label, _ := prediction.KeyWithMaxValue() 198 | if int(label) == sample.Label { 199 | accuracy += 1.0 200 | } 201 | total += 1.0 202 | } 203 | fmt.Printf("accuracy %f%%\n", accuracy/total*100) 204 | } 205 | -------------------------------------------------------------------------------- /algo_runner.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package hector is a Go-based machine learning library. It intends to implement well-known machine learning algorithms in Go. 3 | Currently, it only supports algorithms that solve binary classification problems. Supported algorithms include: 4 | 1. Decision Tree (CART, Random Forest, GBDT) 5 | 2. Logistic Regression 6 | 3. SVM 7 | 4. Neural Network 8 | */ 9 | package hector 10 | 11 | import ( 12 | "github.com/xlvector/hector/algo" 13 | "github.com/xlvector/hector/core" 14 | "github.com/xlvector/hector/eval" 15 | "os" 16 | "strconv" 17 | ) 18 | 19 | func AlgorithmRun(classifier algo.Classifier, 20 | train_path string, test_path string, pred_path string, 21 | params map[string]string) (float64, []*eval.LabelPrediction, error) { 22 | global, _ := strconv.ParseInt(params["global"], 10, 64) 23 | train_dataset := core.NewDataSet() 24 | 25 | err := train_dataset.Load(train_path, global) 26 | 27 | if err != nil { 28 | return 0.5, nil, err 29 | } 30 | 31 | test_dataset := core.NewDataSet() 32 | err = test_dataset.Load(test_path, global) 33 | if err != nil { 34 | return 0.5, nil, err 35 | } 36 | classifier.Init(params) 37 | auc, predictions := AlgorithmRunOnDataSet(classifier, train_dataset, 38 | test_dataset, pred_path, params) 39 | 40 | return auc, predictions, nil 41 | } 42 | 43 | func AlgorithmTrain(classifier algo.Classifier, train_path string, 44 | params map[string]string) error { 45 | global, _ := strconv.ParseInt(params["global"], 10, 64) 46 | train_dataset := core.NewDataSet() 47 | 48 | err := train_dataset.Load(train_path, global) 49 | 50 | if err != nil { 51 | return err 52 | } 53 | 54 | classifier.Init(params) 55 | classifier.Train(train_dataset) 56 | 57 | model_path, _ := params["model"] 58 | 59 | if model_path != "" { 60 | classifier.SaveModel(model_path) 61 | } 62 | 63 | return nil 64 | } 65 | 66 | func AlgorithmTest(classifier algo.Classifier, test_path string, pred_path string, params map[string]string) (float64, []*eval.LabelPrediction, error) { 67 | global, _ := strconv.ParseInt(params["global"], 10, 64) 68 | 69 | model_path, _ := params["model"] 70 | classifier.Init(params) 71 | if model_path != "" { 72 | classifier.LoadModel(model_path) 73 | } else { 74 | return 0.0, nil, nil 75 | } 76 | 77 | test_dataset := core.NewDataSet() 78 | err := test_dataset.Load(test_path, global) 79 | if err != nil { 80 | return 0.0, nil, err 81 | } 82 | 83 | auc, predictions := AlgorithmRunOnDataSet(classifier, nil, test_dataset, pred_path, params) 84 | 85 | return auc, predictions, nil 86 | } 87 | 88 | func AlgorithmRunOnDataSet(classifier algo.Classifier, train_dataset, test_dataset *core.DataSet, pred_path
string, params map[string]string) (float64, []*eval.LabelPrediction) { 89 | 90 | if train_dataset != nil { 91 | classifier.Train(train_dataset) 92 | } 93 | 94 | predictions := []*eval.LabelPrediction{} 95 | var pred_file *os.File 96 | if pred_path != "" { 97 | pred_file, _ = os.Create(pred_path) 98 | } 99 | for _, sample := range test_dataset.Samples { 100 | prediction := classifier.Predict(sample) 101 | if pred_file != nil { 102 | pred_file.WriteString(strconv.FormatFloat(prediction, 'g', 5, 64) + "\n") 103 | } 104 | predictions = append(predictions, &(eval.LabelPrediction{Label: sample.Label, Prediction: prediction})) 105 | } 106 | if pred_path != "" { 107 | defer pred_file.Close() 108 | } 109 | 110 | auc := eval.AUC(predictions) 111 | return auc, predictions 112 | } 113 | 114 | /* Regression */ 115 | func RegAlgorithmRun(regressor algo.Regressor, train_path string, test_path string, pred_path string, params map[string]string) (float64, []*eval.RealPrediction, error) { 116 | global, _ := strconv.ParseInt(params["global"], 10, 64) 117 | train_dataset := core.NewRealDataSet() 118 | 119 | err := train_dataset.Load(train_path, global) 120 | 121 | if err != nil { 122 | return 0.5, nil, err 123 | } 124 | 125 | test_dataset := core.NewRealDataSet() 126 | err = test_dataset.Load(test_path, global) 127 | if err != nil { 128 | return 0.5, nil, err 129 | } 130 | regressor.Init(params) 131 | rmse, predictions := RegAlgorithmRunOnDataSet(regressor, train_dataset, test_dataset, pred_path, params) 132 | 133 | return rmse, predictions, nil 134 | } 135 | 136 | func RegAlgorithmTrain(regressor algo.Regressor, train_path string, params map[string]string) error { 137 | global, _ := strconv.ParseInt(params["global"], 10, 64) 138 | train_dataset := core.NewRealDataSet() 139 | 140 | err := train_dataset.Load(train_path, global) 141 | 142 | if err != nil { 143 | return err 144 | } 145 | 146 | regressor.Init(params) 147 | regressor.Train(train_dataset) 148 | 149 | model_path, _ := params["model"] 150 | 151 | if model_path != "" { 152 | regressor.SaveModel(model_path) 153 | } 154 | 155 | return nil 156 | } 157 | 158 | func RegAlgorithmTest(regressor algo.Regressor, test_path string, pred_path string, params map[string]string) (float64, []*eval.RealPrediction, error) { 159 | global, _ := strconv.ParseInt(params["global"], 10, 64) 160 | 161 | model_path, _ := params["model"] 162 | regressor.Init(params) 163 | if model_path != "" { 164 | regressor.LoadModel(model_path) 165 | } else { 166 | return 0.0, nil, nil 167 | } 168 | 169 | test_dataset := core.NewRealDataSet() 170 | err := test_dataset.Load(test_path, global) 171 | if err != nil { 172 | return 0.0, nil, err 173 | } 174 | 175 | rmse, predictions := RegAlgorithmRunOnDataSet(regressor, nil, test_dataset, pred_path, params) 176 | 177 | return rmse, predictions, nil 178 | } 179 | 180 | func RegAlgorithmRunOnDataSet(regressor algo.Regressor, train_dataset, test_dataset *core.RealDataSet, pred_path string, params map[string]string) (float64, []*eval.RealPrediction) { 181 | 182 | if train_dataset != nil { 183 | regressor.Train(train_dataset) 184 | } 185 | 186 | predictions := []*eval.RealPrediction{} 187 | var pred_file *os.File 188 | if pred_path != "" { 189 | pred_file, _ = os.Create(pred_path) 190 | } 191 | for _, sample := range test_dataset.Samples { 192 | prediction := regressor.Predict(sample) 193 | if pred_file != nil { 194 | pred_file.WriteString(strconv.FormatFloat(prediction, 'g', 5, 64) + "\n") 195 | } 196 | predictions = append(predictions, 
&eval.RealPrediction{Value: sample.Value, Prediction: prediction}) 197 | } 198 | if pred_path != "" { 199 | defer pred_file.Close() 200 | } 201 | 202 | rmse := eval.RegRMSE(predictions) 203 | return rmse, predictions 204 | } 205 | -------------------------------------------------------------------------------- /params.go: -------------------------------------------------------------------------------- 1 | package hector 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "math/rand" 7 | "runtime" 8 | "strconv" 9 | "time" 10 | 11 | "github.com/xlvector/hector/algo" 12 | "github.com/xlvector/hector/ann" 13 | "github.com/xlvector/hector/dt" 14 | "github.com/xlvector/hector/fm" 15 | "github.com/xlvector/hector/gp" 16 | "github.com/xlvector/hector/lr" 17 | "github.com/xlvector/hector/sa" 18 | "github.com/xlvector/hector/svm" 19 | ) 20 | 21 | func GetMutliClassClassifier(method string) algo.MultiClassClassifier { 22 | rand.Seed(time.Now().UTC().UnixNano()) 23 | var classifier algo.MultiClassClassifier 24 | 25 | if method == "rf" { 26 | classifier = &(dt.RandomForest{}) 27 | } else if method == "cart" { 28 | classifier = &(dt.CART{}) 29 | } else if method == "rdt" { 30 | classifier = &(dt.RandomDecisionTree{}) 31 | } else if method == "knn" { 32 | classifier = &(svm.KNN{}) 33 | } else if method == "ann" { 34 | classifier = &(ann.NeuralNetwork{}) 35 | } 36 | return classifier 37 | } 38 | 39 | func GetClassifier(method string) algo.Classifier { 40 | rand.Seed(time.Now().UTC().UnixNano()) 41 | var classifier algo.Classifier 42 | 43 | if method == "lr" { 44 | classifier = &(lr.LogisticRegression{}) 45 | } else if method == "ftrl" { 46 | classifier = &(lr.FTRLLogisticRegression{}) 47 | } else if method == "ep" { 48 | classifier = &(lr.EPLogisticRegression{}) 49 | } else if method == "rdt" { 50 | classifier = &(dt.RandomDecisionTree{}) 51 | } else if method == "cart" { 52 | classifier = &(dt.CART{}) 53 | } else if method == "cart-regression" { 54 | classifier = &(dt.RegressionTree{}) 55 | } else if method == "rf" { 56 | classifier = &(dt.RandomForest{}) 57 | } else if method == "fm" { 58 | classifier = &(fm.FactorizeMachine{}) 59 | } else if method == "sa" { 60 | classifier = &(sa.SAOptAUC{}) 61 | } else if method == "gbdt" { 62 | classifier = &(dt.GBDT{}) 63 | } else if method == "svm" { 64 | classifier = &(svm.SVM{}) 65 | } else if method == "linear_svm" { 66 | classifier = &(svm.LinearSVM{}) 67 | } else if method == "l1vm" { 68 | classifier = &(svm.L1VM{}) 69 | } else if method == "knn" { 70 | classifier = &(svm.KNN{}) 71 | } else if method == "ann" { 72 | classifier = &(ann.NeuralNetwork{}) 73 | } else if method == "lr_owlqn" { 74 | classifier = &(lr.LROWLQN{}) 75 | } else { 76 | classifier = &(lr.LogisticRegression{}) 77 | } 78 | return classifier 79 | } 80 | 81 | func GetRegressor(method string) algo.Regressor { 82 | rand.Seed(time.Now().UTC().UnixNano()) 83 | 84 | var regressor algo.Regressor 85 | 86 | if method == "gp" { 87 | regressor = &(gp.GaussianProcess{}) 88 | } 89 | return regressor 90 | } 91 | 92 | func PrepareParams() (string, string, string, string, map[string]string) { 93 | params := make(map[string]string) 94 | train_path := flag.String("train", "train.tsv", "path of training file") 95 | test_path := flag.String("test", "test.tsv", "path of testing file") 96 | pred_path := flag.String("pred", "", "path of pred file") 97 | output := flag.String("output", "", "output file path") 98 | verbose := flag.Int("v", 0, "verbose output if 1") 99 | learning_rate := flag.String("learning-rate", "0.01", 
"learning rate") 100 | learning_rate_discount := flag.String("learning-rate-discount", "1.0", "discount rate of learning rate per training step") 101 | regularization := flag.String("regularization", "0.01", "regularization") 102 | alpha := flag.String("alpha", "0.1", "alpha of ftrl") 103 | beta := flag.String("beta", "1", "beta of ftrl") 104 | c := flag.String("c", "1", "C in svm") 105 | e := flag.String("e", "0.01", "stop threshold") 106 | lambda1 := flag.String("lambda1", "0.1", "lambda1 of ftrl") 107 | lambda2 := flag.String("lambda2", "0.1", "lambda2 of ftrl") 108 | tree_count := flag.String("tree-count", "10", "tree count in rdt/rf") 109 | feature_count := flag.String("feature-count", "1.0", "feature count in rdt/rf") 110 | gini := flag.String("gini", "1.0", "gini threshold, between (0, 0.5]") 111 | min_leaf_size := flag.String("min-leaf-size", "10", "min leaf size in dt") 112 | max_depth := flag.String("max-depth", "10", "max depth of dt") 113 | factors := flag.String("factors", "10", "factor number in factorized machine") 114 | steps := flag.Int("steps", 1, "steps before convergent") 115 | global := flag.Int64("global", -1, "feature id of global bias") 116 | method := flag.String("method", "lr", "algorithm name") 117 | cv := flag.Int("cv", 7, "cross validation folder count") 118 | k := flag.String("k", "3", "neighborhood size of knn") 119 | radius := flag.String("radius", "1.0", "radius of RBF kernel") 120 | sv := flag.String("sv", "8", "support vector count for l1vm") 121 | hidden := flag.Int64("hidden", 1, "hidden neuron number") 122 | profile := flag.String("profile", "", "profile file name") 123 | model := flag.String("model", "", "model file name") 124 | action := flag.String("action", "", "train or test, do both if action is empty string") 125 | core := flag.Int("core", 1, "core number when run program") 126 | dt_sample_ratio := flag.String("dt-sample-ratio", "1.0", "sampling ratio when split feature in decision tree") 127 | dim := flag.String("dim", "1", "input space dimension") 128 | port := flag.String("port", "8080", "port") 129 | 130 | flag.Parse() 131 | runtime.GOMAXPROCS(*core) 132 | fmt.Println(*train_path) 133 | fmt.Println(*test_path) 134 | fmt.Println(*method) 135 | params["port"] = *port 136 | params["verbose"] = strconv.FormatInt(int64(*verbose), 10) 137 | params["learning-rate"] = *learning_rate 138 | params["learning-rate-discount"] = *learning_rate_discount 139 | params["regularization"] = *regularization 140 | params["alpha"] = *alpha 141 | params["beta"] = *beta 142 | params["lambda1"] = *lambda1 143 | params["lambda2"] = *lambda2 144 | params["tree-count"] = *tree_count 145 | params["feature-count"] = *feature_count 146 | params["max-depth"] = *max_depth 147 | params["min-leaf-size"] = *min_leaf_size 148 | params["steps"] = strconv.FormatInt(int64(*steps), 10) 149 | params["global"] = strconv.FormatInt(*global, 10) 150 | params["gini"] = *gini 151 | params["factors"] = *factors 152 | params["output"] = *output 153 | params["c"] = *c 154 | params["e"] = *e 155 | params["k"] = *k 156 | params["cv"] = strconv.FormatInt(int64(*cv), 10) 157 | params["radius"] = *radius 158 | params["sv"] = *sv 159 | params["hidden"] = strconv.FormatInt(int64(*hidden), 10) 160 | params["profile"] = *profile 161 | params["action"] = *action 162 | params["model"] = *model 163 | params["method"] = *method 164 | params["dt-sample-ratio"] = *dt_sample_ratio 165 | params["dim"] = *dim 166 | 167 | fmt.Println(params) 168 | return *train_path, *test_path, *pred_path, *method, params 169 
| } 170 | -------------------------------------------------------------------------------- /dt/regression_tree.go: -------------------------------------------------------------------------------- 1 | package dt 2 | 3 | import ( 4 | "bufio" 5 | "container/list" 6 | "github.com/xlvector/hector/core" 7 | "io/ioutil" 8 | "os" 9 | "sort" 10 | "strconv" 11 | ) 12 | 13 | type RegressionTree struct { 14 | tree Tree 15 | params CARTParams 16 | } 17 | 18 | func (self *RegressionTree) SaveModel(path string) { 19 | ioutil.WriteFile(path, self.tree.ToString(), 0600) 20 | } 21 | 22 | func (self *RegressionTree) LoadModel(path string) { 23 | file, _ := os.Open(path) 24 | defer file.Close() 25 | text := "" 26 | scanner := bufio.NewScanner(file) 27 | for scanner.Scan() { 28 | text += scanner.Text() + "\n" 29 | } 30 | self.tree.FromString(string(text)) 31 | } 32 | 33 | func (dt *RegressionTree) GoLeft(sample *core.MapBasedSample, feature_split core.Feature) bool { 34 | value, ok := sample.Features[feature_split.Id] 35 | if ok && value >= feature_split.Value { 36 | return true 37 | } else { 38 | return false 39 | } 40 | } 41 | 42 | func (dt *RegressionTree) GetElementFromQueue(queue *list.List, n int) []*TreeNode { 43 | ret := []*TreeNode{} 44 | for i := 0; i < n; i++ { 45 | node := queue.Front() 46 | if node == nil { 47 | break 48 | } 49 | ret = append(ret, (node.Value.(*TreeNode))) 50 | queue.Remove(node) 51 | } 52 | return ret 53 | } 54 | 55 | func (dt *RegressionTree) FindBestSplit(samples []*core.MapBasedSample, node *TreeNode, select_features map[int64]bool) { 56 | feature_weight_labels := make(map[int64]*core.FeatureGoalDistribution) 57 | sum_total := 0.0 58 | sum_total2 := 0.0 59 | count_total := 0.0 60 | for _, k := range node.samples { 61 | sum_total += samples[k].Prediction 62 | sum_total2 += samples[k].Prediction * samples[k].Prediction 63 | count_total += 1.0 64 | } 65 | 66 | feature_sum_right := core.NewVector() 67 | feature_sum_right2 := core.NewVector() 68 | feature_count_right := core.NewVector() 69 | 70 | for _, k := range node.samples { 71 | for fid, fvalue := range samples[k].Features { 72 | feature_count_right.AddValue(fid, 1.0) 73 | feature_sum_right.AddValue(fid, samples[k].Prediction) 74 | feature_sum_right2.AddValue(fid, samples[k].Prediction*samples[k].Prediction) 75 | _, ok := feature_weight_labels[fid] 76 | if !ok { 77 | feature_weight_labels[fid] = core.NewFeatureGoalDistribution() 78 | } 79 | feature_weight_labels[fid].AddWeightGoal(fvalue, samples[k].Prediction) 80 | } 81 | } 82 | 83 | min_vari := 1e20 84 | node.feature_split = core.Feature{Id: -1, Value: 0} 85 | for fid, distribution := range feature_weight_labels { 86 | sort.Sort(distribution) 87 | split, vari := distribution.BestSplitByVariance(sum_total-feature_sum_right.GetValue(fid), 88 | sum_total2-feature_sum_right2.GetValue(fid), 89 | count_total-feature_count_right.GetValue(fid), 90 | feature_sum_right.GetValue(fid), 91 | feature_sum_right2.GetValue(fid), 92 | feature_count_right.GetValue(fid)) 93 | if min_vari > vari { 94 | min_vari = vari 95 | node.feature_split.Id = fid 96 | node.feature_split.Value = split 97 | } 98 | } 99 | } 100 | 101 | func (dt *RegressionTree) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree, select_features map[int64]bool) { 102 | if node.depth >= dt.params.MaxDepth { 103 | return 104 | } 105 | 106 | dt.FindBestSplit(samples, node, select_features) 107 | 108 | if node.feature_split.Id < 0 { 109 | return 110 | } 111 | left_node := TreeNode{depth: 
node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}} 112 | right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}} 113 | 114 | left_positive := 0.0 115 | left_total := 0.0 116 | right_positive := 0.0 117 | right_total := 0.0 118 | for _, k := range node.samples { 119 | if dt.GoLeft(samples[k], node.feature_split) { 120 | left_node.samples = append(left_node.samples, k) 121 | left_positive += samples[k].Prediction 122 | left_total += 1.0 123 | } else { 124 | right_node.samples = append(right_node.samples, k) 125 | right_positive += samples[k].Prediction 126 | right_total += 1.0 127 | } 128 | } 129 | node.samples = nil 130 | 131 | if len(left_node.samples) > dt.params.MinLeafSize { 132 | left_node.sample_count = len(left_node.samples) 133 | left_node.prediction.SetValue(0, left_positive/left_total) 134 | queue.PushBack(&left_node) 135 | node.left = len(tree.nodes) 136 | tree.AddTreeNode(&left_node) 137 | } 138 | 139 | if len(right_node.samples) > dt.params.MinLeafSize { 140 | right_node.sample_count = len(right_node.samples) 141 | right_node.prediction.SetValue(0, right_positive/right_total) 142 | queue.PushBack(&right_node) 143 | node.right = len(tree.nodes) 144 | tree.AddTreeNode(&right_node) 145 | } 146 | } 147 | 148 | func (dt *RegressionTree) SingleTreeBuild(samples []*core.MapBasedSample, select_features map[int64]bool) Tree { 149 | tree := Tree{} 150 | queue := list.New() 151 | root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}} 152 | total := 0.0 153 | positive := 0.0 154 | for i, sample := range samples { 155 | root.AddSample(i) 156 | total += 1.0 157 | positive += sample.Prediction 158 | } 159 | root.sample_count = len(root.samples) 160 | root.prediction.SetValue(0, positive/total) 161 | 162 | queue.PushBack(&root) 163 | tree.AddTreeNode(&root) 164 | for { 165 | nodes := dt.GetElementFromQueue(queue, 10) 166 | if len(nodes) == 0 { 167 | break 168 | } 169 | 170 | for _, node := range nodes { 171 | dt.AppendNodeToTree(samples, node, queue, &tree, select_features) 172 | } 173 | } 174 | return tree 175 | } 176 | 177 | func (dt *RegressionTree) PredictBySingleTree(tree *Tree, sample *core.MapBasedSample) (*TreeNode, string) { 178 | path := "" 179 | node := tree.GetNode(0) 180 | path += node.ToString() 181 | for { 182 | if dt.GoLeft(sample, node.feature_split) { 183 | if node.left >= 0 && node.left < tree.Size() { 184 | node = tree.GetNode(node.left) 185 | path += "-" + node.ToString() 186 | } else { 187 | break 188 | } 189 | } else { 190 | if node.right >= 0 && node.right < tree.Size() { 191 | node = tree.GetNode(node.right) 192 | path += "+" + node.ToString() 193 | } else { 194 | break 195 | } 196 | } 197 | } 198 | return node, path 199 | } 200 | 201 | func (dt *RegressionTree) Train(dataset *core.DataSet) { 202 | samples := []*core.MapBasedSample{} 203 | for _, sample := range dataset.Samples { 204 | msample := sample.ToMapBasedSample() 205 | samples = append(samples, msample) 206 | } 207 | dt.tree = dt.SingleTreeBuild(samples, nil) 208 | } 209 | 210 | func (dt *RegressionTree) Predict(sample *core.Sample) float64 { 211 | msample := sample.ToMapBasedSample() 212 | node, _ := dt.PredictBySingleTree(&dt.tree, msample) 213 | return node.prediction.GetValue(0) 214 | } 215 | 216 | func (dt *RegressionTree) Init(params map[string]string) { 217 | dt.tree = Tree{} 218 | min_leaf_size, _ := 
strconv.ParseInt(params["min-leaf-size"], 10, 32) 219 | max_depth, _ := strconv.ParseInt(params["max-depth"], 10, 32) 220 | 221 | dt.params.MinLeafSize = int(min_leaf_size) 222 | dt.params.MaxDepth = int(max_depth) 223 | dt.params.GiniThreshold, _ = strconv.ParseFloat(params["gini"], 64) 224 | } 225 | -------------------------------------------------------------------------------- /dt/random_decision_tree.go: -------------------------------------------------------------------------------- 1 | package dt 2 | 3 | import ( 4 | "container/list" 5 | "fmt" 6 | "math/rand" 7 | "strconv" 8 | "strings" 9 | "sync" 10 | 11 | "github.com/xlvector/hector/core" 12 | "github.com/xlvector/hector/util" 13 | ) 14 | 15 | type TreeNode struct { 16 | left, right, depth int 17 | prediction *core.ArrayVector 18 | sample_count int 19 | samples []int 20 | feature_split core.Feature 21 | } 22 | 23 | func (t *TreeNode) ToString() string { 24 | return strconv.FormatInt(t.feature_split.Id, 10) + ":" + strconv.FormatFloat(t.feature_split.Value, 'g', 3, 64) 25 | } 26 | 27 | func (t *TreeNode) AddSample(k int) { 28 | t.samples = append(t.samples, k) 29 | } 30 | 31 | type Tree struct { 32 | nodes []*TreeNode 33 | } 34 | 35 | func (t *Tree) AddTreeNode(n *TreeNode) { 36 | t.nodes = append(t.nodes, n) 37 | } 38 | 39 | func (t *Tree) Size() int { 40 | return len(t.nodes) 41 | } 42 | 43 | func (t *Tree) GetNode(i int) *TreeNode { 44 | return t.nodes[i] 45 | } 46 | 47 | func (t *Tree) ToString() []byte { 48 | sb := util.StringBuilder{} 49 | sb.Int(len(t.nodes)) 50 | sb.Write("\n") 51 | for i, node := range t.nodes { 52 | sb.Int(i) 53 | sb.Write("\t") 54 | sb.Int(node.left) 55 | sb.Write("\t") 56 | sb.Int(node.right) 57 | sb.Write("\t") 58 | sb.Int(node.depth) 59 | sb.Write("\t") 60 | sb.WriteBytes(node.prediction.ToString()) 61 | sb.Write("\t") 62 | sb.Int(node.sample_count) 63 | sb.Write("\t") 64 | sb.Int64(node.feature_split.Id) 65 | sb.Write("\t") 66 | sb.Float(node.feature_split.Value) 67 | sb.Write("\n") 68 | } 69 | return sb.Bytes() 70 | } 71 | 72 | func (t *Tree) fromString(lines []string) { 73 | size, _ := strconv.Atoi(lines[0]) 74 | t.nodes = make([]*TreeNode, size+1, size+1) 75 | for _, line := range lines[1:] { 76 | if len(line) == 0 { 77 | break 78 | } 79 | tks := strings.Split(line, "\t") 80 | node := TreeNode{} 81 | i, _ := strconv.Atoi(tks[0]) 82 | node.left, _ = strconv.Atoi(tks[1]) 83 | node.right, _ = strconv.Atoi(tks[2]) 84 | node.depth, _ = strconv.Atoi(tks[3]) 85 | node.prediction = core.NewArrayVector() 86 | node.prediction.FromString(tks[4]) 87 | node.sample_count, _ = strconv.Atoi(tks[5]) 88 | node.feature_split = core.Feature{} 89 | node.feature_split.Id, _ = strconv.ParseInt(tks[6], 10, 64) 90 | node.feature_split.Value, _ = strconv.ParseFloat(tks[7], 64) 91 | t.nodes[i] = &node 92 | } 93 | } 94 | 95 | func (t *Tree) FromString(buf string) { 96 | lines := strings.Split(buf, "\n") 97 | t.fromString(lines) 98 | } 99 | 100 | type RDTParams struct { 101 | TreeCount int 102 | MinLeafSize int 103 | MaxDepth int 104 | } 105 | 106 | type RandomDecisionTree struct { 107 | trees []*Tree 108 | params RDTParams 109 | } 110 | 111 | func (self *RandomDecisionTree) SaveModel(path string) { 112 | 113 | } 114 | 115 | func (self *RandomDecisionTree) LoadModel(path string) { 116 | 117 | } 118 | 119 | func (rdt *RandomDecisionTree) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree) { 120 | node.prediction = core.NewArrayVector() 121 | for _, k := range node.samples { 122 | 
node.prediction.AddValue(samples[k].Label, 1.0) 123 | } 124 | node.prediction.Scale(1.0 / node.prediction.Sum()) 125 | 126 | random_sample := samples[node.samples[rand.Intn(len(node.samples))]] 127 | 128 | split := core.Feature{Id: -1, Value: -1.0} 129 | for fid, fvalue := range random_sample.Features { 130 | if split.Id < 0 || rand.Intn(len(random_sample.Features)) == 0 { 131 | split.Id = fid 132 | split.Value = fvalue 133 | } 134 | } 135 | 136 | if split.Id < 0 || node.depth > rdt.params.MaxDepth { 137 | return 138 | } 139 | 140 | node.feature_split = split 141 | left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}} 142 | right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}} 143 | 144 | for _, k := range node.samples { 145 | if DTGoLeft(samples[k], node.feature_split) { 146 | left_node.samples = append(left_node.samples, k) 147 | } else { 148 | right_node.samples = append(right_node.samples, k) 149 | } 150 | } 151 | node.samples = nil 152 | 153 | if len(left_node.samples) == 0 || len(right_node.samples) == 0 { 154 | return 155 | } 156 | 157 | if len(left_node.samples) > rdt.params.MinLeafSize { 158 | queue.PushBack(&left_node) 159 | node.left = len(tree.nodes) 160 | tree.AddTreeNode(&left_node) 161 | } 162 | 163 | if len(right_node.samples) > rdt.params.MinLeafSize { 164 | queue.PushBack(&right_node) 165 | node.right = len(tree.nodes) 166 | tree.AddTreeNode(&right_node) 167 | } 168 | } 169 | 170 | func (rdt *RandomDecisionTree) SingleTreeBuild(samples []*core.MapBasedSample) Tree { 171 | tree := Tree{} 172 | queue := list.New() 173 | root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}} 174 | 175 | for i := 0; i < len(samples); i++ { 176 | k := rand.Intn(len(samples)) 177 | root.AddSample(k) 178 | root.prediction.AddValue(samples[k].Label, 1.0) 179 | } 180 | root.sample_count = len(root.samples) 181 | root.prediction.Scale(1.0 / root.prediction.Sum()) 182 | 183 | queue.PushBack(&root) 184 | tree.AddTreeNode(&root) 185 | for { 186 | nodes := DTGetElementFromQueue(queue, 10) 187 | if len(nodes) == 0 { 188 | break 189 | } 190 | 191 | for _, node := range nodes { 192 | rdt.AppendNodeToTree(samples, node, queue, &tree) 193 | } 194 | } 195 | return tree 196 | } 197 | 198 | func (rdt *RandomDecisionTree) RandomShuffle(features []core.Feature) { 199 | for i := range features { 200 | j := rand.Intn(i + 1) 201 | features[i], features[j] = features[j], features[i] 202 | } 203 | } 204 | 205 | func (rdt *RandomDecisionTree) Train(dataset *core.DataSet) { 206 | samples := []*core.MapBasedSample{} 207 | for _, sample := range dataset.Samples { 208 | samples = append(samples, sample.ToMapBasedSample()) 209 | } 210 | dataset.Samples = nil 211 | 212 | forest := make(chan *Tree, rdt.params.TreeCount) 213 | var wait sync.WaitGroup 214 | wait.Add(rdt.params.TreeCount) 215 | for k := 0; k < rdt.params.TreeCount; k++ { 216 | go func() { 217 | tree := rdt.SingleTreeBuild(samples) 218 | forest <- &tree 219 | fmt.Printf(".") 220 | wait.Done() 221 | }() 222 | } 223 | wait.Wait() 224 | fmt.Println() 225 | close(forest) 226 | for tree := range forest { 227 | rdt.trees = append(rdt.trees, tree) 228 | } 229 | } 230 | 231 | func (rdt *RandomDecisionTree) Predict(sample *core.Sample) float64 { 232 | ret := 0.0 233 | total := 0.0 234 | msample := sample.ToMapBasedSample() 235 | for _, tree := range rdt.trees { 236 | node, _ := 
PredictBySingleTree(tree, msample) 237 | ret += node.prediction.GetValue(1) 238 | total += 1.0 239 | } 240 | return ret / total 241 | } 242 | 243 | func (rdt *RandomDecisionTree) PredictMultiClass(sample *core.Sample) *core.ArrayVector { 244 | msample := sample.ToMapBasedSample() 245 | predictions := core.NewArrayVector() 246 | total := 0.0 247 | for _, tree := range rdt.trees { 248 | node, _ := PredictBySingleTree(tree, msample) 249 | predictions.AddVector(node.prediction, 1.0) 250 | total += 1.0 251 | } 252 | predictions.Scale(1.0 / total) 253 | return predictions 254 | } 255 | 256 | func (rdt *RandomDecisionTree) Init(params map[string]string) { 257 | rdt.trees = []*Tree{} 258 | rdt.params.MinLeafSize, _ = strconv.Atoi(params["min-leaf-size"]) 259 | rdt.params.TreeCount, _ = strconv.Atoi(params["tree-count"]) 260 | rdt.params.MaxDepth, _ = strconv.Atoi(params["max-depth"]) 261 | } 262 | -------------------------------------------------------------------------------- /core/feature_analyze.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import( 4 | "sort" 5 | "math" 6 | ) 7 | 8 | type WeightLabel struct { 9 | weight float64 10 | label int 11 | } 12 | 13 | func (self *WeightLabel) LabelDoubleValue() float64{ 14 | return float64(self.label) 15 | } 16 | 17 | type FeatureLabelDistribution struct { 18 | weight_label []WeightLabel 19 | } 20 | 21 | type WeightGoal struct { 22 | weight float64 23 | goal float64 24 | } 25 | 26 | type FeatureGoalDistribution struct { 27 | weight_goal []WeightGoal 28 | } 29 | 30 | func NewFeatureLabelDistribution() *FeatureLabelDistribution{ 31 | ret := FeatureLabelDistribution{} 32 | ret.weight_label = []WeightLabel{} 33 | return &ret 34 | } 35 | 36 | func NewFeatureGoalDistribution() *FeatureGoalDistribution{ 37 | ret := FeatureGoalDistribution{} 38 | ret.weight_goal = []WeightGoal{} 39 | return &ret 40 | } 41 | 42 | func (f *FeatureLabelDistribution) AddWeightLabel(weight float64, label int){ 43 | wl := WeightLabel{weight:weight, label:label} 44 | f.weight_label = append(f.weight_label, wl) 45 | } 46 | 47 | func (f *FeatureGoalDistribution) AddWeightGoal(weight float64, goal float64){ 48 | wl := WeightGoal{weight:weight, goal:goal} 49 | f.weight_goal = append(f.weight_goal, wl) 50 | } 51 | 52 | func (f *FeatureLabelDistribution) Len() int { 53 | return len(f.weight_label) 54 | } 55 | 56 | func (f *FeatureLabelDistribution) Swap(i, j int) { 57 | f.weight_label[i], f.weight_label[j] = f.weight_label[j], f.weight_label[i] 58 | } 59 | 60 | func (f *FeatureLabelDistribution) Less(i, j int) bool { 61 | return (f.weight_label[i].weight < f.weight_label[j].weight) 62 | } 63 | 64 | func (f *FeatureGoalDistribution) Len() int { 65 | return len(f.weight_goal) 66 | } 67 | 68 | func (f *FeatureGoalDistribution) Swap(i, j int) { 69 | f.weight_goal[i], f.weight_goal[j] = f.weight_goal[j], f.weight_goal[i] 70 | } 71 | 72 | func (f *FeatureGoalDistribution) Less(i, j int) bool { 73 | return (f.weight_goal[i].weight < f.weight_goal[j].weight) 74 | } 75 | 76 | func (f *FeatureLabelDistribution) PositiveCount() int { 77 | ret := 0 78 | for _, e := range f.weight_label{ 79 | ret += int(e.label) 80 | } 81 | return ret 82 | } 83 | 84 | func (f *FeatureLabelDistribution) LabelDistribution() *ArrayVector { 85 | ret := NewArrayVector() 86 | for _, e := range f.weight_label { 87 | ret.AddValue(e.label, 1.0) 88 | } 89 | return ret 90 | } 91 | 92 | func (f *FeatureGoalDistribution) Variance(sum_left, sum_left2, count_left, 
sum_right, sum_right2, count_right float64) float64 { 93 | mean_left := sum_left / count_left 94 | mean_right := sum_right / count_right 95 | 96 | return sum_left2 + sum_right2 - mean_left * mean_left * count_left - mean_right * mean_right * count_right 97 | } 98 | 99 | func (f *FeatureGoalDistribution) BestSplitByVariance(sum_left, sum_left2, count_left, sum_right, sum_right2, count_right float64) (float64, float64) { 100 | 101 | min_vari := 100000.0 102 | split := f.weight_goal[0].weight - 1.0 103 | prev_weight := f.weight_goal[0].weight - 1.0 104 | for _, wl := range f.weight_goal{ 105 | if prev_weight != wl.weight{ 106 | vari := f.Variance(sum_left, sum_left2, count_left, sum_right, sum_right2, count_right) 107 | if vari < min_vari{ 108 | min_vari = vari 109 | split = wl.weight 110 | } 111 | } 112 | prev_weight = wl.weight 113 | sum_left += wl.goal 114 | sum_left2 += wl.goal * wl.goal 115 | count_left += 1.0 116 | 117 | sum_right -= wl.goal 118 | sum_right2 -= wl.goal * wl.goal 119 | count_right -= 1.0 120 | } 121 | return split, min_vari 122 | } 123 | 124 | /* 125 | func Gini(pleft, tleft, pright, tright float64) float64 { 126 | if tleft == 0.0 || tright == 0.0{ 127 | return 1.0 128 | } 129 | p11 := pleft / tleft 130 | g1 := 1 - p11 * p11 - (1 - p11) * (1 - p11) 131 | p21 := pright / tright 132 | g2 := 1 - p21 * p21 - (1 - p21) * (1 - p21) 133 | ret := tleft * g1 / (tleft + tright) + tright * g2 / (tleft + tright) 134 | return ret 135 | } 136 | */ 137 | 138 | func Gini(left_dis, right_dis *ArrayVector) float64 { 139 | left_sum := left_dis.Sum() 140 | right_sum := right_dis.Sum() 141 | 142 | if left_sum == 0.0 || right_sum == 0.0 { 143 | return 1.0 144 | } 145 | 146 | left_gini := 1.0 147 | for _, p := range left_dis.data { 148 | left_gini -= (p / left_sum) * (p / left_sum) 149 | } 150 | 151 | right_gini := 1.0 152 | for _, p := range right_dis.data { 153 | right_gini -= (p / right_sum) * (p / right_sum) 154 | } 155 | return (left_sum * left_gini + right_sum * right_gini) / (left_sum + right_sum) 156 | } 157 | 158 | /* 159 | func (f *FeatureLabelDistribution) BestSplitByGini(total, positive int) (float64, float64) { 160 | pright := float64(f.PositiveCount()) 161 | tright := float64(len(f.weight_label)) 162 | pleft := float64(positive) - pright 163 | tleft := float64(total) - tright 164 | min_gini := Gini(pleft, tleft, pright, tright) 165 | split := f.weight_label[0].weight 166 | prev_weight := f.weight_label[0].weight 167 | for _, wl := range f.weight_label{ 168 | if prev_weight != wl.weight{ 169 | gini := Gini(pleft, tleft, pright, tright) 170 | if gini < min_gini{ 171 | min_gini = gini 172 | split = wl.weight 173 | } 174 | } 175 | prev_weight = wl.weight 176 | tleft += 1.0 177 | tright -= 1.0 178 | pleft += float64(wl.label) 179 | pright -= float64(wl.label) 180 | } 181 | return split, min_gini 182 | } 183 | */ 184 | 185 | func (self *FeatureLabelDistribution) BestSplitByGini(total_dis *ArrayVector) (float64, float64) { 186 | left_dis := total_dis.Copy() 187 | right_dis := self.LabelDistribution() 188 | left_dis.AddVector(right_dis, -1.0) 189 | 190 | min_gini := Gini(left_dis, right_dis) 191 | split := self. 
weight_label[0].weight 192 | prev_weight := self.weight_label[0].weight 193 | for _, wl := range self.weight_label { 194 | if prev_weight != wl.weight { 195 | gini := Gini(left_dis, right_dis) 196 | if gini < min_gini { 197 | min_gini = gini 198 | split = wl.weight 199 | } 200 | } 201 | prev_weight = wl.weight 202 | left_dis.AddValue(wl.label, 1.0) 203 | right_dis.AddValue(wl.label, -1.0) 204 | } 205 | return split, min_gini 206 | } 207 | 208 | func (f *FeatureLabelDistribution) InformationValue(global_total, global_positive int) float64 { 209 | with_total := len(f.weight_label) 210 | with_positive := f.PositiveCount() 211 | 212 | positives := []int{} 213 | negatives := []int{} 214 | 215 | positives = append(positives, global_positive - with_positive) 216 | negatives = append(negatives, (global_total - global_positive) - (with_total - with_positive)) 217 | 218 | sort.Sort(f) 219 | 220 | prev_c := -1 221 | pos := 0 222 | total := 0 223 | for i, e := range f.weight_label { 224 | c := int(200.0 * float64(i) / float64(with_total)) 225 | if c != prev_c { 226 | if total > 0 { 227 | positives = append(positives, pos) 228 | negatives = append(negatives, total - pos) 229 | pos = 0 230 | total = 0 231 | } 232 | } 233 | prev_c = c 234 | pos += int(e.label) 235 | total += 1 236 | } 237 | if total > 0 { 238 | positives = append(positives, pos) 239 | negatives = append(negatives, total - pos) 240 | } 241 | 242 | sum_positive := 0 243 | sum_negative := 0 244 | for _, v := range positives { 245 | sum_positive += v 246 | } 247 | for _, v := range negatives { 248 | sum_negative += v 249 | } 250 | iv := 0.0 251 | for i := range positives { 252 | positive_ratio := float64(positives[i]) / float64(sum_positive) 253 | negative_ratio := float64(negatives[i]) / float64(sum_negative) 254 | iv += (positive_ratio - negative_ratio) * math.Log((0.00001 + positive_ratio) / (0.00001 + negative_ratio)) 255 | } 256 | return iv 257 | } 258 | 259 | func InformationValue(dataset *DataSet) map[int64]float64 { 260 | feature_weight_labels := make(map[int64]*FeatureLabelDistribution) 261 | total := 0 262 | positive := 0 263 | for _, sample := range dataset.Samples { 264 | total += 1 265 | positive += int(sample.Label) 266 | for _, feature := range sample.Features { 267 | _, ok := feature_weight_labels[feature.Id] 268 | if !ok { 269 | feature_weight_labels[feature.Id] = NewFeatureLabelDistribution() 270 | } 271 | feature_weight_labels[feature.Id].AddWeightLabel(feature.Value, sample.Label) 272 | } 273 | } 274 | 275 | ret := make(map[int64]float64) 276 | 277 | for fid, distribution := range feature_weight_labels { 278 | ret[fid] = distribution.InformationValue(total, positive) 279 | } 280 | return ret 281 | } 282 | -------------------------------------------------------------------------------- /dt/cart.go: -------------------------------------------------------------------------------- 1 | package dt 2 | 3 | import ( 4 | "bufio" 5 | "container/list" 6 | "fmt" 7 | "github.com/xlvector/hector/core" 8 | "io/ioutil" 9 | "math" 10 | "math/rand" 11 | "os" 12 | "sort" 13 | "strconv" 14 | ) 15 | 16 | /* 17 | CART stands for classification and regression tree. This class implements the classification tree and uses Gini 18 | impurity to split features. 19 | */ 20 | type CART struct { 21 | tree Tree 22 | params CARTParams 23 | continuous_features bool 24 | salt int64 25 | } 26 | 27 | func DTGoLeft(sample *core.MapBasedSample, feature_split core.Feature) bool { 28 | value, ok := sample.Features[feature_split.Id] 29 | if ok && value >= feature_split.Value { 30 | return true
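// Note: a sample that lacks the split feature always falls through to the
// right branch below, so the right child also absorbs every "feature absent"
// case.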
--------------------------------------------------------------------------------
/dt/cart.go:
--------------------------------------------------------------------------------
1 | package dt
2 |
3 | import (
4 |     "bufio"
5 |     "container/list"
6 |     "fmt"
7 |     "github.com/xlvector/hector/core"
8 |     "io/ioutil"
9 |     "math"
10 |     "math/rand"
11 |     "os"
12 |     "sort"
13 |     "strconv"
14 | )
15 |
16 | /*
17 | CART (Classification And Regression Tree). This type implements the
18 | classification tree and uses Gini impurity to choose feature splits.
19 | */
20 | type CART struct {
21 |     tree                Tree
22 |     params              CARTParams
23 |     continuous_features bool
24 |     salt                int64
25 | }
26 |
27 | func DTGoLeft(sample *core.MapBasedSample, feature_split core.Feature) bool {
28 |     value, ok := sample.Features[feature_split.Id]
29 |     if ok && value >= feature_split.Value { // go left when the feature is present at or above the split value
30 |         return true
31 |     } else {
32 |         return false
33 |     }
34 | }
35 |
36 | func DTGetElementFromQueue(queue *list.List, n int) []*TreeNode {
37 |     ret := []*TreeNode{}
38 |     for i := 0; i < n; i++ {
39 |         node := queue.Front()
40 |         if node == nil {
41 |             break
42 |         }
43 |         ret = append(ret, node.Value.(*TreeNode))
44 |         queue.Remove(node)
45 |     }
46 |     return ret
47 | }
48 |
49 | func (dt *CART) RandByFeatureId(fid int64) float64 {
50 |     ret := fid*19857 + dt.salt // deterministic per-feature value in [0, 1), salted per tree
51 |     r := math.Abs(float64(ret%1000) / 1000.0)
52 |     return r
53 | }
54 |
55 | func (dt *CART) FindBestSplitOfContinuousFeature(samples []*core.MapBasedSample, node *TreeNode, feature_select_prob float64) {
56 |     feature_weight_labels := make(map[int64]*core.FeatureLabelDistribution)
57 |     total_dis := core.NewArrayVector()
58 |     for i, k := range node.samples {
59 |         if i > 10 && rand.Float64() > dt.params.SamplingRatio { // always keep the first samples, then subsample
60 |             continue
61 |         }
62 |         total_dis.AddValue(samples[k].Label, 1.0)
63 |         for fid, fvalue := range samples[k].Features {
64 |             if dt.RandByFeatureId(fid) > feature_select_prob { // per-tree feature subsampling
65 |                 continue
66 |             }
67 |             _, ok := feature_weight_labels[fid]
68 |             if !ok {
69 |                 feature_weight_labels[fid] = core.NewFeatureLabelDistribution()
70 |             }
71 |             feature_weight_labels[fid].AddWeightLabel(fvalue, samples[k].Label)
72 |         }
73 |     }
74 |
75 |     min_gini := 1.0
76 |     node.feature_split = core.Feature{Id: -1, Value: 0}
77 |     for fid, distribution := range feature_weight_labels {
78 |         sort.Sort(distribution)
79 |         split, gini := distribution.BestSplitByGini(total_dis)
80 |         if min_gini > gini {
81 |             min_gini = gini
82 |             node.feature_split.Id = fid
83 |             node.feature_split.Value = split
84 |         }
85 |     }
86 |     if min_gini > dt.params.GiniThreshold { // no split good enough: mark the node as a leaf
87 |         node.feature_split.Id = -1
88 |         node.feature_split.Value = 0.0
89 |     }
90 | }
91 |
92 | func (dt *CART) FindBestSplitOfBinaryFeature(samples []*core.MapBasedSample, node *TreeNode, feature_select_prob float64) {
93 |     feature_right_dis := make(map[int64]*core.ArrayVector) // label counts of samples that contain each feature
94 |     total_dis := core.NewArrayVector()
95 |     for i, k := range node.samples {
96 |         if i > 10 && rand.Float64() > dt.params.SamplingRatio {
97 |             continue
98 |         }
99 |         total_dis.AddValue(samples[k].Label, 1.0)
100 |         for fid := range samples[k].Features {
101 |             if dt.RandByFeatureId(fid) > feature_select_prob {
102 |                 continue
103 |             }
104 |             _, ok := feature_right_dis[fid]
105 |             if !ok {
106 |                 feature_right_dis[fid] = core.NewArrayVector()
107 |             }
108 |             feature_right_dis[fid].AddValue(samples[k].Label, 1.0)
109 |         }
110 |     }
111 |
112 |     min_gini := 1.0
113 |     node.feature_split = core.Feature{Id: -1, Value: 0}
114 |     for fid, right_dis := range feature_right_dis {
115 |         left_dis := total_dis.Copy()
116 |         left_dis.AddVector(right_dis, -1.0) // left = total - right: samples without this feature
117 |         gini := core.Gini(left_dis, right_dis)
118 |         if min_gini > gini {
119 |             min_gini = gini
120 |             node.feature_split.Id = fid
121 |             node.feature_split.Value = 1.0
122 |         }
123 |     }
124 |     if min_gini > dt.params.GiniThreshold {
125 |         node.feature_split.Id = -1
126 |         node.feature_split.Value = 0.0
127 |     }
128 | }
129 |
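Both split finders rank candidate splits with core.Gini(left_dis, right_dis); for binary features only the right-hand (feature-present) distribution is accumulated, and the left side is recovered as total minus right. core.Gini itself lives outside this excerpt, so the following is a self-contained sketch of the size-weighted Gini impurity it is expected to compute, written over plain slices instead of core.ArrayVector:

    // giniImpurity scores a two-way split. left[c] and right[c] hold per-class
    // sample counts; 0 means both sides are pure, and lower is better.
    func giniImpurity(left, right []float64) float64 {
    	side := func(dist []float64) (g, n float64) {
    		for _, v := range dist {
    			n += v
    		}
    		if n == 0 {
    			return 0, 0
    		}
    		g = 1.0
    		for _, v := range dist {
    			p := v / n
    			g -= p * p // 1 - sum(p_c^2)
    		}
    		return g, n
    	}
    	gl, nl := side(left)
    	gr, nr := side(right)
    	if nl+nr == 0 {
    		return 0
    	}
    	// Weight each side's impurity by its share of the samples.
    	return (nl*gl + nr*gr) / (nl + nr)
    }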
130 | func (dt *CART) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree, feature_select_prob float64) {
131 |     if node.depth >= dt.params.MaxDepth {
132 |         return
133 |     }
134 |
135 |     if dt.continuous_features {
136 |         dt.FindBestSplitOfContinuousFeature(samples, node, feature_select_prob)
137 |     } else {
138 |         dt.FindBestSplitOfBinaryFeature(samples, node, feature_select_prob)
139 |     }
140 |     if node.feature_split.Id < 0 {
141 |         return
142 |     }
143 |     left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}
144 |     right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}
145 |
146 |     left_node.prediction = core.NewArrayVector()
147 |     right_node.prediction = core.NewArrayVector()
148 |     for _, k := range node.samples {
149 |         if DTGoLeft(samples[k], node.feature_split) {
150 |             left_node.samples = append(left_node.samples, k)
151 |             left_node.prediction.AddValue(samples[k].Label, 1.0)
152 |         } else {
153 |             right_node.samples = append(right_node.samples, k)
154 |             right_node.prediction.AddValue(samples[k].Label, 1.0)
155 |         }
156 |     }
157 |     node.samples = nil // children own the sample indices now; free the parent's copy
158 |
159 |     if len(left_node.samples) > dt.params.MinLeafSize {
160 |         left_node.sample_count = len(left_node.samples)
161 |         left_node.prediction.Scale(1.0 / left_node.prediction.Sum()) // normalize counts into a distribution
162 |         queue.PushBack(&left_node)
163 |         node.left = len(tree.nodes)
164 |         tree.AddTreeNode(&left_node)
165 |     }
166 |
167 |     if len(right_node.samples) > dt.params.MinLeafSize {
168 |         right_node.sample_count = len(right_node.samples)
169 |         right_node.prediction.Scale(1.0 / right_node.prediction.Sum())
170 |         queue.PushBack(&right_node)
171 |         node.right = len(tree.nodes)
172 |         tree.AddTreeNode(&right_node)
173 |     }
174 | }
175 |
176 | func (dt *CART) SingleTreeBuild(samples []*core.MapBasedSample, feature_select_prob float64, bootstrap bool) Tree {
177 |     tree := Tree{}
178 |     queue := list.New()
179 |     root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}
180 |
181 |     if !bootstrap {
182 |         for i, sample := range samples {
183 |             root.AddSample(i)
184 |             root.prediction.AddValue(sample.Label, 1.0)
185 |         }
186 |     } else {
187 |         for i := 0; i < len(samples); i++ {
188 |             k := rand.Intn(len(samples)) // bootstrap: draw sample indices with replacement
189 |             root.AddSample(k)
190 |             root.prediction.AddValue(samples[k].Label, 1.0)
191 |         }
192 |     }
193 |     root.sample_count = len(root.samples)
194 |     root.prediction.Scale(1.0 / root.prediction.Sum())
195 |
196 |     queue.PushBack(&root)
197 |     tree.AddTreeNode(&root)
198 |     for {
199 |         nodes := DTGetElementFromQueue(queue, 10) // expand the frontier in batches of 10
200 |         if len(nodes) == 0 {
201 |             break
202 |         }
203 |
204 |         for _, node := range nodes {
205 |             dt.AppendNodeToTree(samples, node, queue, &tree, feature_select_prob)
206 |         }
207 |     }
208 |     return tree
209 | }
210 |
211 | func PredictBySingleTree(tree *Tree, sample *core.MapBasedSample) (*TreeNode, string) {
212 |     path := ""
213 |     node := tree.GetNode(0)
214 |     path += node.ToString()
215 |     for {
216 |         if DTGoLeft(sample, node.feature_split) {
217 |             if node.left >= 0 && node.left < tree.Size() {
218 |                 node = tree.GetNode(node.left)
219 |                 path += "-" + node.ToString() // "-" records a left branch
220 |             } else {
221 |                 break
222 |             }
223 |         } else {
224 |             if node.right >= 0 && node.right < tree.Size() {
225 |                 node = tree.GetNode(node.right)
226 |                 path += "+" + node.ToString() // "+" records a right branch
227 |             } else {
228 |                 break
229 |             }
230 |         }
231 |     }
232 |     return node, path
233 | }
234 |
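PredictBySingleTree returns both the leaf and the traversal path, with "-" marking a left branch and "+" a right branch, which makes it easy to see why a sample landed where it did. A hypothetical in-package helper (e.g., in a cart_test.go, with fmt and core imported as above; debugPaths is not part of hector):

    func debugPaths(dt *CART, samples []*core.Sample) {
    	for _, s := range samples {
    		node, path := PredictBySingleTree(&dt.tree, s.ToMapBasedSample())
    		// Route through the tree, then the leaf's probability of class 1.
    		fmt.Println(path, node.prediction.GetValue(1))
    	}
    }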
235 | func (dt *CART) Train(dataset *core.DataSet) {
236 |     samples := []*core.MapBasedSample{}
237 |     feature_weights := make(map[int64]float64)
238 |     for _, sample := range dataset.Samples {
239 |         if !dt.continuous_features {
240 |             for _, f := range sample.Features {
241 |                 _, ok := feature_weights[f.Id]
242 |                 if !ok {
243 |                     feature_weights[f.Id] = f.Value
244 |                 }
245 |                 if feature_weights[f.Id] != f.Value { // a feature seen with two values => continuous dataset
246 |                     dt.continuous_features = true
247 |                 }
248 |             }
249 |         }
250 |         msample := sample.ToMapBasedSample()
251 |         samples = append(samples, msample)
252 |     }
253 |     if dt.continuous_features {
254 |         fmt.Println("Continuous DataSet")
255 |     } else {
256 |         fmt.Println("Binary DataSet")
257 |     }
258 |     dt.tree = dt.SingleTreeBuild(samples, 1.0, false) // single tree: no feature subsampling, no bootstrap
259 | }
260 |
261 | func (dt *CART) Predict(sample *core.Sample) float64 {
262 |     msample := sample.ToMapBasedSample()
263 |     node, _ := PredictBySingleTree(&dt.tree, msample)
264 |     return node.prediction.GetValue(1) // probability mass of class 1 at the leaf
265 | }
266 |
267 | func (dt *CART) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
268 |     msample := sample.ToMapBasedSample()
269 |     node, _ := PredictBySingleTree(&dt.tree, msample)
270 |     return node.prediction
271 | }
272 |
273 | func (dt *CART) SaveModel(path string) {
274 |     ioutil.WriteFile(path, dt.tree.ToString(), 0600)
275 | }
276 |
277 | func (dt *CART) LoadModel(path string) {
278 |     file, err := os.Open(path)
279 |     if err != nil { // fix: do not scan a nil file when the model is missing
280 |         return
281 |     }
282 |     defer file.Close()
283 |     text := ""
284 |     scanner := bufio.NewScanner(file)
285 |     for scanner.Scan() {
286 |         text += scanner.Text() + "\n"
287 |     }
288 |     dt.tree.FromString(text)
289 | }
290 |
291 | type CARTParams struct {
292 |     MaxDepth      int
293 |     MinLeafSize   int
294 |     GiniThreshold float64
295 |     SamplingRatio float64
296 | }
297 |
298 | func (dt *CART) Init(params map[string]string) {
299 |     dt.tree = Tree{}
300 |     dt.continuous_features = false
301 |     min_leaf_size, _ := strconv.ParseInt(params["min-leaf-size"], 10, 32)
302 |     max_depth, _ := strconv.ParseInt(params["max-depth"], 10, 32)
303 |
304 |     dt.params.MinLeafSize = int(min_leaf_size)
305 |     dt.params.MaxDepth = int(max_depth)
306 |     dt.params.GiniThreshold, _ = strconv.ParseFloat(params["gini"], 64)
307 |     dt.salt = rand.Int63n(10000000000)
308 |     dt.params.SamplingRatio, _ = strconv.ParseFloat(params["dt-sample-ratio"], 64)
309 | }
310 |
--------------------------------------------------------------------------------
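End to end, CART is driven through Init / Train / Predict like hector's other classifiers. A minimal sketch; the parameter values and the file path are illustrative, not recommended defaults:

    package main

    import (
    	"fmt"

    	"github.com/xlvector/hector/core"
    	"github.com/xlvector/hector/dt"
    )

    func main() {
    	cart := &dt.CART{}
    	cart.Init(map[string]string{
    		"max-depth":       "10",  // stop splitting at this depth
    		"min-leaf-size":   "10",  // drop children with too few samples
    		"gini":            "1.0", // accept any improving split
    		"dt-sample-ratio": "1.0", // examine every sample at every node
    	})

    	dataset := core.NewDataSet()
    	if err := dataset.Load("train.tsv", -1); err != nil { // placeholder path
    		panic(err)
    	}
    	cart.Train(dataset)
    	fmt.Println(cart.Predict(dataset.Samples[0]))
    }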
--------------------------------------------------------------------------------
/core/dataset.go:
--------------------------------------------------------------------------------
1 | package core
2 |
3 | import (
4 |     "bufio"
5 |     "fmt"
6 |     "log"
7 |     "os"
8 |     "sort"
9 |     "strconv"
10 |     "strings"
11 |
12 |     "github.com/xlvector/hector/util"
13 | )
14 |
15 | type CombinedFeature []string
16 |
17 | type FeatureSplit []float64
18 |
19 | func FindCategory(split []float64, value float64) int {
20 |     return sort.Search(len(split), func(i int) bool { return split[i] >= value })
21 | }
22 |
23 | /* RawDataSet */
24 | type RawDataSet struct {
25 |     Samples     []*RawSample
26 |     FeatureKeys map[string]bool
27 | }
28 |
29 | func NewRawDataSet() *RawDataSet {
30 |     ret := RawDataSet{}
31 |     ret.Samples = []*RawSample{}
32 |     ret.FeatureKeys = make(map[string]bool)
33 |     return &ret
34 | }
35 |
36 | func (d *RawDataSet) AddSample(sample *RawSample) {
37 |     d.Samples = append(d.Samples, sample)
38 | }
39 |
40 | func (d *RawDataSet) ToDataSet(splits map[string][]float64, combinations []CombinedFeature) *DataSet {
41 |     out_data := NewDataSet()
42 |     fm := make(map[string]int64)
43 |     for _, sample := range d.Samples {
44 |         out_sample := NewSample()
45 |         out_sample.Label = sample.Label
46 |         if splits != nil {
47 |             for fkey_str, fvalue_str := range sample.Features {
48 |                 fkey := ""
49 |                 fvalue := 0.0
50 |                 if GetFeatureType(fkey_str) == FeatureTypeEnum.CONTINUOUS_FEATURE {
51 |                     split, ok := splits[fkey_str]
52 |                     if ok {
53 |                         cat := FindCategory(split, util.ParseFloat64(fvalue_str))
54 |                         fkey = fkey_str + "_" + strconv.FormatInt(int64(cat), 10) // discretized bucket key
55 |                         fvalue = 1.0
56 |                     } else {
57 |                         fkey, fvalue = fkey_str, util.ParseFloat64(fvalue_str) // fix: keep the raw key when no split table exists
58 |                     }
59 |                     fm[fkey] = util.Hash(fkey)
60 |                     out_sample.AddFeature(Feature{Id: util.Hash(fkey), Value: fvalue})
61 |                 }
62 |             }
63 |         }
64 |         for _, combination := range combinations {
65 |             fkey := ""
66 |             for _, ckey := range combination {
67 |                 fkey += ckey
68 |                 fkey += ":"
69 |                 fkey += sample.GetFeatureValue(ckey)
70 |                 fkey += "_"
71 |             }
72 |             fm[fkey] = util.Hash(fkey)
73 |             out_sample.AddFeature(Feature{Id: util.Hash(fkey), Value: 1.0})
74 |         }
75 |         out_data.AddSample(out_sample)
76 |     }
77 |     f, _ := os.Create("features.tsv")
78 |     defer f.Close()
79 |     w := bufio.NewWriter(f)
80 |     for k, v := range fm {
81 |         w.WriteString(k + "\t" + strconv.FormatInt(v, 10) + "\n")
82 |     }
83 |     w.Flush() // fix: flush the buffered feature map before returning
84 |     return out_data
85 | }
86 |
87 | func (d *RawDataSet) Load(path string) error {
88 |     file, err := os.Open(path)
89 |     if err != nil {
90 |         return err
91 |     }
92 |     defer file.Close()
93 |     ch := make(chan string, 1000)
94 |     go func() {
95 |         reader := bufio.NewReader(file)
96 |         for {
97 |             line, err := reader.ReadString('\n')
98 |             if err != nil {
99 |                 break
100 |             }
101 |             ch <- line
102 |         }
103 |         close(ch)
104 |     }()
105 |
106 |     n := 0
107 |     for line := range ch {
108 |         n += 1
109 |         if n%10000 == 0 {
110 |             fmt.Println(n, len(ch))
111 |         }
112 |         line = strings.Replace(strings.TrimSpace(line), " ", "\t", -1) // fix: trim the trailing newline before splitting
113 |         tks := strings.Split(line, "\t")
114 |         sample := NewRawSample()
115 |         for i, tk := range tks {
116 |             if i == 0 {
117 |                 label, err := strconv.ParseInt(tk, 10, 16)
118 |                 if err != nil {
119 |                     break
120 |                 }
121 |                 if label > 0 {
122 |                     sample.Label = 1.0
123 |                 } else {
124 |                     sample.Label = 0.0
125 |                 }
126 |             } else {
127 |                 kv := strings.Split(tk, ":")
128 |                 sample.Features[kv[0]] = kv[1]
129 |                 d.FeatureKeys[kv[0]] = true
130 |             }
131 |         }
132 |         d.AddSample(sample)
133 |     }
134 |     return nil
135 | }
136 |
137 | /* Streaming */
138 | type StreamingDataSet struct {
139 |     Samples chan *Sample
140 | }
141 |
142 | func NewStreamingDataSet() *StreamingDataSet {
143 |     return &StreamingDataSet{
144 |         Samples: make(chan *Sample, 10000),
145 |     }
146 | }
147 |
148 | func (d *StreamingDataSet) AddSample(sample *Sample) {
149 |     d.Samples <- sample
150 | }
151 |
152 | func (d *StreamingDataSet) Load(path string, global_bias_feature_id int64) error {
153 |     for step := 0; step < 2; step++ { // stream the file twice, feeding every sample to the channel two times
154 |         file, err := os.Open(path)
155 |         if err != nil {
156 |             log.Fatalln("load file fail: ", err)
157 |         }
158 |         defer file.Close() // fix: check the error before deferring Close
159 |         reader := bufio.NewReader(file)
160 |         for {
161 |             line, err := reader.ReadString('\n')
162 |             if err != nil {
163 |                 break
164 |             }
165 |             tks := strings.Split(strings.TrimSpace(line), "\t")
166 |             sample := Sample{Features: make([]Feature, 0, 20), Label: 0}
167 |             for i, tk := range tks {
168 |                 if i == 0 {
169 |                     label, _ := strconv.Atoi(tk)
170 |                     sample.Label = label
171 |                 } else {
172 |                     kv := strings.Split(tk, ":")
173 |                     feature_id, err := strconv.ParseInt(kv[0], 10, 64)
174 |                     if err != nil {
175 |                         log.Fatalln("wrong feature: ", kv[0])
176 |                     }
177 |                     feature_value := 1.0
178 |                     if len(kv) > 1 {
179 |                         feature_value, err = strconv.ParseFloat(kv[1], 64)
180 |                         if err != nil {
181 |                             log.Fatalln("wrong value: ", kv[1])
182 |                         }
183 |                     }
184 |                     feature := Feature{feature_id, feature_value}
185 |                     sample.Features = append(sample.Features, feature)
186 |                 }
187 |             }
188 |             if global_bias_feature_id >= 0 {
189 |                 sample.Features = append(sample.Features, Feature{global_bias_feature_id, 1.0})
190 |             }
191 |             d.AddSample(&sample)
192 |         }
193 |     }
194 |     close(d.Samples)
195 |     return nil
196 | }
197 |
198 | /* DataSet */
199 | type DataSet struct {
200 |     Samples          []*Sample
201 |     FeatureNameIdMap map[int64]string
202 |     max_label        int
203 | }
204 |
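DataSet.Load below expects one sample per line: a label, then feature tokens separated by tabs (spaces are converted to tabs first). Each token is fid:value, where fid may be an integer id or an arbitrary string that gets hashed via util.Hash, and a missing :value defaults to 1.0. An illustrative line such as

    1	25:1.0	103:0.5	city_beijing

parses to label 1 with features (25, 1.0), (103, 0.5), and (Hash("city_beijing"), 1.0).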
205 | func NewDataSet() *DataSet {
206 |     ret := DataSet{}
207 |     ret.Samples = []*Sample{}
208 |     ret.FeatureNameIdMap = make(map[int64]string)
209 |     return &ret
210 | }
211 |
212 | func (d *DataSet) AddSample(sample *Sample) {
213 |     d.Samples = append(d.Samples, sample)
214 |     if d.max_label < sample.Label {
215 |         d.max_label = sample.Label
216 |     }
217 | }
218 |
219 | func (d *DataSet) Load(path string, global_bias_feature_id int64) error {
220 |     fm := make(map[string]int64)
221 |
222 |     ch := make(chan string, 1000)
223 |     go func() {
224 |         file, err := os.Open(path)
225 |         defer close(ch)
226 |         if err != nil {
227 |             log.Println("load file fail: ", err)
228 |             return
229 |         }
230 |         defer file.Close() // fix: only defer Close once Open has succeeded
231 |
232 |         scanner := bufio.NewScanner(file)
233 |
234 |         for scanner.Scan() {
235 |             line := strings.Replace(scanner.Text(), " ", "\t", -1)
236 |             ch <- line
237 |         }
238 |     }()
239 |
240 |     for line := range ch {
241 |         tks := strings.Split(line, "\t")
242 |         sample := Sample{Features: make([]Feature, 0, 20), Label: 0}
243 |         for i, tk := range tks {
244 |             if i == 0 {
245 |                 label, _ := strconv.Atoi(tk) // unparseable labels default to 0
246 |                 sample.Label = label
247 |                 if d.max_label < label {
248 |                     d.max_label = label
249 |                 }
250 |             } else {
251 |                 kv := strings.Split(tk, ":")
252 |                 feature_id, err := strconv.ParseInt(kv[0], 10, 64)
253 |                 if err != nil {
254 |                     feature_id = util.Hash(kv[0]) // string keys are hashed to int64 ids
255 |                     fm[kv[0]] = feature_id
256 |                 }
257 |                 d.FeatureNameIdMap[feature_id] = kv[0]
258 |                 feature_value := 1.0
259 |                 if len(kv) > 1 {
260 |                     feature_value, err = strconv.ParseFloat(kv[1], 64)
261 |                     if err != nil {
262 |                         break
263 |                     }
264 |                 }
265 |                 feature := Feature{feature_id, feature_value}
266 |                 sample.Features = append(sample.Features, feature)
267 |             }
268 |         }
269 |         if global_bias_feature_id >= 0 {
270 |             sample.Features = append(sample.Features, Feature{global_bias_feature_id, 1.0})
271 |         }
272 |         d.AddSample(&sample)
273 |     }
274 |     f, _ := os.Create("features.tsv")
275 |     defer f.Close()
276 |     w := bufio.NewWriter(f)
277 |     for k, v := range fm {
278 |         w.WriteString(k + "\t" + strconv.FormatInt(v, 10) + "\n")
279 |     }
280 |     w.Flush() // fix: flush buffered writes before the deferred Close
281 |     log.Println("dataset size : ", len(d.Samples))
282 |     return nil
283 | }
284 |
285 | func RemoveLowFreqFeatures(dataset *DataSet, threshold float64) {
286 |     freq := NewVector()
287 |
288 |     for _, sample := range dataset.Samples {
289 |         for _, feature := range sample.Features {
290 |             freq.AddValue(feature.Id, 1.0)
291 |         }
292 |     }
293 |
294 |     for _, sample := range dataset.Samples {
295 |         features := []Feature{}
296 |         for _, feature := range sample.Features {
297 |             if freq.GetValue(feature.Id) > threshold {
298 |                 features = append(features, feature)
299 |             }
300 |         }
301 |         sample.Features = features
302 |     }
303 | }
304 |
305 | func (d *DataSet) Split(f func(int) bool) *DataSet {
306 |     out_data := NewDataSet()
307 |     for i, sample := range d.Samples {
308 |         if f(i) {
309 |             out_data.AddSample(sample)
310 |         }
311 |     }
312 |     return out_data
313 | }
314 |
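Because Split takes an index predicate, ad-hoc partitions are one-liners; an illustrative 80/20 train/test split:

    train := dataset.Split(func(i int) bool { return i%5 != 0 })
    test := dataset.Split(func(i int) bool { return i%5 == 0 })

Note that both subsets share the underlying *Sample pointers, so mutating samples in one (e.g., via RemoveLowFreqFeatures) also affects the other.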
315 | /* Real valued DataSet */
316 | type RealDataSet struct {
317 |     Samples []*RealSample
318 | }
319 |
320 | func NewRealDataSet() *RealDataSet {
321 |     ret := RealDataSet{}
322 |     ret.Samples = []*RealSample{}
323 |     return &ret
324 | }
325 |
326 | func (d *RealDataSet) AddSample(sample *RealSample) {
327 |     d.Samples = append(d.Samples, sample)
328 | }
329 |
330 | func (d *RealDataSet) Load(path string, global_bias_feature_id int64) error {
331 |     file, err := os.Open(path)
332 |     if err != nil {
333 |         return err
334 |     }
335 |     defer file.Close()
336 |
337 |     scanner := bufio.NewScanner(file)
338 |
339 |     for scanner.Scan() {
340 |         line := strings.Replace(scanner.Text(), " ", "\t", -1)
341 |         tks := strings.Split(line, "\t")
342 |         sample := RealSample{Features: []Feature{}, Value: 0.0}
343 |         for i, tk := range tks {
344 |             if i == 0 {
345 |                 value := util.ParseFloat64(tk)
346 |                 sample.Value = value // regression target, not a class label
347 |             } else {
348 |                 kv := strings.Split(tk, ":")
349 |                 feature_id, err := strconv.ParseInt(kv[0], 10, 64)
350 |                 if err != nil {
351 |                     break
352 |                 }
353 |                 feature_value := 1.0
354 |                 if len(kv) > 1 {
355 |                     feature_value, err = strconv.ParseFloat(kv[1], 64)
356 |                     if err != nil {
357 |                         break
358 |                     }
359 |                 }
360 |                 feature := Feature{feature_id, feature_value}
361 |                 sample.Features = append(sample.Features, feature)
362 |             }
363 |         }
364 |         if global_bias_feature_id >= 0 {
365 |             sample.Features = append(sample.Features, Feature{global_bias_feature_id, 1.0})
366 |         }
367 |         d.AddSample(&sample)
368 |     }
369 |     return scanner.Err() // nil after a clean EOF
370 | }
371 |
--------------------------------------------------------------------------------
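RealDataSet mirrors DataSet for regression: the same line format, but the first column is parsed as a float64 target and feature ids must be integers (string keys are not hashed here; an unparseable id ends that line's feature list). An illustrative input line and load call, with a placeholder path:

    0.73	25:1.0	103:0.5

    rds := core.NewRealDataSet()
    if err := rds.Load("train.tsv", -1); err != nil {
    	panic(err)
    }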