├── ann
    ├── ReadMe.md
    └── neural_network.go
├── core
    ├── ReadMe.md
    ├── feature.go
    ├── vector_test.go
    ├── matrix_test.go
    ├── mock_dataset.go
    ├── label_preprocessing.go
    ├── matrix.go
    ├── array_vector.go
    ├── sample.go
    ├── vector.go
    ├── feature_analyze.go
    └── dataset.go
├── bin
    ├── hector-preprocessor
    ├── install
    ├── test.go
    ├── hector-feature-combination.go
    ├── hector-mc-run.go
    ├── hector-preprocessor.go
    ├── hector-mc-cv.go
    └── hector-stack.go
├── .travis.yml
├── algo
    ├── clustering.go
    ├── regressor.go
    └── classifier.go
├── lr
    ├── diff_function.go
    ├── terminal_criterion.go
    ├── minimizer_test.go
    ├── linear_regression.go
    ├── lbfgs_minimizer.go
    ├── logistic_regression.go
    ├── logistic_regression_streaming.go
    ├── ftrl_logistic_regression.go
    ├── quasinewton_helper.go
    ├── lr_owlqn.go
    ├── ep_logistic_regression.go
    └── owlqn_minimizer.go
├── util
    ├── hash_util.go
    ├── string_util.go
    └── math_util.go
├── .gitignore
├── regressor_test.go
├── hectorun
    └── hector-run.go
├── hectorstream
    └── hectorstream.go
├── fanaly
    └── fanaly.go
├── LICENSE
├── eval
    ├── evaluation_test.go
    └── evaluation.go
├── gp
    ├── covariance_function.go
    └── gaussian_process.go
├── hectorcv
    └── hector-cv.go
├── svm
    ├── knn.go
    ├── l1vm.go
    ├── linear_svm.go
    └── svm.go
├── hectorserver
    └── hectorserver.go
├── sa
    └── sa_auc.go
├── classifier_test.go
├── dt
    ├── gbdt.go
    ├── random_forest.go
    ├── regression_tree.go
    ├── random_decision_tree.go
    └── cart.go
├── fm
    └── factorize_machine.go
├── README.md
├── mc_runner.go
├── combine
    └── category_feature_combination.go
├── algo_runner.go
└── params.go


/ann/ReadMe.md:
--------------------------------------------------------------------------------
1 | Neural Network


--------------------------------------------------------------------------------
/core/ReadMe.md:
--------------------------------------------------------------------------------
1 | Define base structure such as Feature, Sample, DataSet


--------------------------------------------------------------------------------
/bin/hector-preprocessor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xlvector/hector/HEAD/bin/hector-preprocessor


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 | 
3 | go:
4 |     - 1.4.1
5 | 
6 | install:
7 |     - go test github.com/xlvector/hector
8 | 


--------------------------------------------------------------------------------
/algo/clustering.go:
--------------------------------------------------------------------------------
 1 | package algo
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | )
 6 | 
 7 | type Clustering interface {
 8 | 	Init(params map[string]string)
 9 | 	Cluster(dataset core.DataSet)
10 | }
11 | 


--------------------------------------------------------------------------------
/bin/install:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | go build hector-cv.go
 4 | go build hector-run.go
 5 | go build hector-mc-cv.go
 6 | 
 7 | sudo cp hector-cv /usr/local/bin
 8 | sudo cp hector-run /usr/local/bin
 9 | sudo cp hector-mc-cv /usr/local/bin
10 | 


--------------------------------------------------------------------------------
/lr/diff_function.go:
--------------------------------------------------------------------------------
 1 | package lr
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | )
 6 | 
 7 | // Description: function for minimizer such as LBFGS and OWLQN
 8 | type DiffFunction interface {
 9 | 	Value(pos *core.Vector) float64
10 | 	Gradient(pos *core.Vector) *core.Vector
11 | }
12 | 


--------------------------------------------------------------------------------
/bin/test.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import(
 4 | 	"reflect"
 5 | 	"fmt"
 6 | )
 7 | 
 8 | type A interface {
 9 | 	f(a int) int
10 | }
11 | 
12 | type B struct {
13 | 	c int
14 | }
15 | 
16 | func (self *B) f(a int) int {
17 | 	return a + self.c
18 | }
19 | 
20 | func main(){
21 | 	var x A
22 | 	x = &(B{3})
23 | 	fmt.Println(reflect.TypeOf(x))
24 | }


--------------------------------------------------------------------------------
/core/feature.go:
--------------------------------------------------------------------------------
 1 | package core
 2 | 
 3 | type FeatureType int
 4 | 
 5 | var FeatureTypeEnum = struct {
 6 | 	DISCRETE_FEATURE FeatureType
 7 | 	CONTINUOUS_FEATURE FeatureType
 8 | }{0, 1}
 9 | 
10 | func GetFeatureType(key string) FeatureType {
11 | 	if key[0] == '#' {
12 | 		return FeatureTypeEnum.DISCRETE_FEATURE
13 | 	} else {
14 | 		return FeatureTypeEnum.CONTINUOUS_FEATURE
15 | 	}
16 | }
17 | 
18 | type Feature struct {
19 | 	Id int64
20 | 	Value float64
21 | }


--------------------------------------------------------------------------------
/util/hash_util.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | 
 4 | func CombineFeatures(fids []int64) int64{
 5 | 	ret := int64(0)
 6 | 	
 7 | 	for _, fid := range fids{
 8 | 		ret *= 601840361
 9 | 		ret += fid
10 | 	}	
11 | 	if ret < 0 {
12 | 		ret *= -1
13 | 	}
14 | 	return ret
15 | }
16 | 
17 | func Hash(str string) int64 {
18 | 	h := int64(0)
19 | 
20 | 	for _, ch := range str {
21 | 		h *= 601840361
22 | 		h += int64(ch)
23 | 	}
24 | 	if h < 0 {
25 | 		return -1 * h;
26 | 	}
27 | 	return h
28 | }


--------------------------------------------------------------------------------
/algo/regressor.go:
--------------------------------------------------------------------------------
 1 | package algo
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | )
 6 | 
 7 | type Regressor interface {
 8 | 
 9 | 	//Set training parameters from parameter map
10 | 	Init(params map[string]string)
11 | 
12 | 	//Train model on a given dataset
13 | 	Train(dataset *core.RealDataSet)
14 | 
15 | 	//Predict the output of an input sample
16 | 	Predict(sample *core.RealSample) float64
17 | 
18 | 	SaveModel(path string)
19 | 	LoadModel(path string)
20 | }
21 | 


--------------------------------------------------------------------------------
/core/vector_test.go:
--------------------------------------------------------------------------------
 1 | package core
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 	"math"
 6 | )
 7 | 
 8 | func TestArrayVector(t *testing.T){
 9 | 	a := NewArrayVector()
10 | 	precision := 1e-9
11 | 
12 | 	a.AddValue(3, 1.78)
13 | 
14 | 	if math.Abs(a.GetValue(3) - 1.78) > precision {
15 | 		t.Error("Get wrong value after set value")
16 | 	}
17 | 
18 | 	a.AddValue(3, -1.1)
19 | 
20 | 	if math.Abs(a.GetValue(3) - 0.68) > precision {
21 | 		t.Error("Add value wrong")
22 | 	}
23 | 
24 | 	a.Scale(0.5)
25 | 
26 | 	if math.Abs(a.GetValue(3) - 0.34) > precision {
27 | 		t.Error("Scale wrong")
28 | 	}
29 | }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | data
 2 | hector-mc-cv
 3 | hector-mc-run
 4 | hector-stack
 5 | hector-server
 6 | code.google.com
 7 | hector-cv
 8 | hector-run
 9 | hector-feature-combination
10 | data/
11 | *.train
12 | *.test
13 | dwfa
14 | *.csv
15 | *.txt
16 | *.zip
17 | *.py
18 | *.data
19 | dw
20 | dwcv
21 | test
22 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
23 | *.o
24 | *.a
25 | *.so
26 | 
27 | # Folders
28 | _obj
29 | _test
30 | 
31 | # Architecture specific extensions/prefixes
32 | *.[568vq]
33 | [568vq].out
34 | 
35 | *.cgo1.go
36 | *.cgo2.c
37 | _cgo_defun.c
38 | _cgo_gotypes.go
39 | _cgo_export.*
40 | 
41 | _testmain.go
42 | 
43 | *.exe
44 | 
45 | *.tsv


--------------------------------------------------------------------------------
/regressor_test.go:
--------------------------------------------------------------------------------
 1 | package hector
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func TestRegressorOnSin(t *testing.T) {
 9 | 	algos := []string{"gp"}
10 | 
11 | 	params := make(map[string]string)
12 | 	params["dim"] = "1"
13 | 
14 | 	for _, algo := range algos {
15 | 		train_dataset := core.SinusoidalDataSet(100)
16 | 		test_dataset := core.SinusoidalDataSet(50)
17 | 		regressor := GetRegressor(algo)
18 | 		regressor.Init(params)
19 | 		rmse, _ := RegAlgorithmRunOnDataSet(regressor, train_dataset, test_dataset, "", params)
20 | 
21 | 		t.Logf("rmse of %s in sinusoidal dataset is %f", algo, rmse)
22 | 		if rmse > 0.1 {
23 | 			t.Error("rmse less than 0.1 in sinusoidal dataset")
24 | 		}
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/core/matrix_test.go:
--------------------------------------------------------------------------------
 1 | package core
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 	"math"
 6 | )
 7 | 
 8 | func TestMatrix(t *testing.T){
 9 | 	a := NewMatrix()
10 | 	precision := 1e-9
11 | 
12 | 	a.AddValue(3, 4, 1.78)
13 | 
14 | 	if math.Abs(a.GetValue(3, 4) - 1.78) > precision {
15 | 		t.Error("Get wrong value after set value")
16 | 	}
17 | 
18 | 	a.AddValue(3, 4, -1.1)
19 | 
20 | 	if math.Abs(a.GetValue(3, 4) - 0.68) > precision {
21 | 		t.Error("Add value wrong")
22 | 	}
23 | 
24 | 	b := NewMatrix()
25 | 
26 | 	for i := 0; i < 10; i++ {
27 | 		for j := 0; j < 10; j++ {
28 | 			b.SetValue(int64(i), int64(j), 1.0)
29 | 		}
30 | 	}
31 | 
32 | 	c := b.Scale(2.0)
33 | 
34 | 	if math.Abs(c.GetValue(7,8) - 2.0) > precision {
35 | 		t.Error("scale function error")
36 | 	}
37 | }


--------------------------------------------------------------------------------
/bin/hector-feature-combination.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector"
 5 | 	"github.com/xlvector/hector/combine"
 6 | 	"github.com/xlvector/hector/core"
 7 | 	"os"
 8 | 	"strings"
 9 | )
10 | 
11 | func main() {
12 | 	train, _, _, _, params := hector.PrepareParams()
13 | 
14 | 	feature_combination := combine.CategoryFeatureCombination{}
15 | 	feature_combination.Init(params)
16 | 
17 | 	dataset := core.NewRawDataSet()
18 | 	dataset.Load(train)
19 | 
20 | 	combinations := feature_combination.FindCombination(dataset)
21 | 
22 | 	output := params["output"]
23 | 
24 | 	file, _ := os.Create(output)
25 | 	defer file.Close()
26 | 
27 | 	for _, combination := range combinations {
28 | 		file.WriteString(strings.Join(combination, "\t") + "\n")
29 | 	}
30 | }
31 | 


--------------------------------------------------------------------------------
/hectorun/hector-run.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"runtime"
 6 | 
 7 | 	"github.com/xlvector/hector"
 8 | )
 9 | 
10 | func main() {
11 | 	train, test, pred, method, params := hector.PrepareParams()
12 | 
13 | 	action, _ := params["action"]
14 | 
15 | 	classifier := hector.GetClassifier(method)
16 | 	runtime.GOMAXPROCS(runtime.NumCPU())
17 | 	if action == "" {
18 | 		auc, _, _ := hector.AlgorithmRun(classifier, train, test, pred, params)
19 | 		fmt.Println("AUC:")
20 | 		fmt.Println(auc)
21 | 	} else if action == "train" {
22 | 		hector.AlgorithmTrain(classifier, train, params)
23 | 
24 | 	} else if action == "test" {
25 | 		auc, _, _ := hector.AlgorithmTest(classifier, test, pred, params)
26 | 		fmt.Println("AUC:")
27 | 		fmt.Println(auc)
28 | 	}
29 | }
30 | 


--------------------------------------------------------------------------------
/algo/classifier.go:
--------------------------------------------------------------------------------
 1 | package algo
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | )
 6 | 
 7 | type Classifier interface {
 8 | 
 9 | 	//Set training parameters from parameter map
10 | 	Init(params map[string]string)
11 | 
12 | 	//Train model on a given dataset
13 | 	Train(dataset *core.DataSet)
14 | 
15 | 	//Predict the probability of a sample to be positive sample
16 | 	Predict(sample *core.Sample) float64
17 | 
18 | 	SaveModel(path string)
19 | 	LoadModel(path string)
20 | }
21 | 
22 | type MultiClassClassifier interface {
23 | 	//Set training parameters from parameter map
24 | 	Init(params map[string]string)
25 | 
26 | 	//Train model on a given dataset
27 | 	Train(dataset *core.DataSet)
28 | 
29 | 	//Predict the probability of a sample to be positive sample
30 | 	PredictMultiClass(sample *core.Sample) *core.ArrayVector
31 | 
32 | 	SaveModel(path string)
33 | 	LoadModel(path string)
34 | }
35 | 


--------------------------------------------------------------------------------
/hectorstream/hectorstream.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 	"runtime"
 7 | 
 8 | 	"github.com/xlvector/hector"
 9 | 	"github.com/xlvector/hector/core"
10 | 	"github.com/xlvector/hector/lr"
11 | )
12 | 
13 | func main() {
14 | 	train, test, pred, _, params := hector.PrepareParams()
15 | 	log.SetFlags(log.LstdFlags | log.Lshortfile)
16 | 
17 | 	action, _ := params["action"]
18 | 	runtime.GOMAXPROCS(runtime.NumCPU())
19 | 	if action == "train" {
20 | 		classifier := &lr.LogisticRegressionStream{}
21 | 		classifier.Init(params)
22 | 		data := core.NewStreamingDataSet()
23 | 		go data.Load(train, 1)
24 | 		classifier.Train(data)
25 | 		classifier.SaveModel(params["model"])
26 | 	} else if action == "test" {
27 | 		classifier := &lr.LogisticRegression{}
28 | 		classifier.Init(params)
29 | 		auc, _, _ := hector.AlgorithmTest(classifier, test, pred, params)
30 | 		fmt.Println("AUC:")
31 | 		fmt.Println(auc)
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/fanaly/fanaly.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"flag"
 5 | 	"fmt"
 6 | 	"github.com/xlvector/hector/core"
 7 | 	"sort"
 8 | )
 9 | 
10 | type FeatureValue struct {
11 | 	Name  string
12 | 	Value float64
13 | }
14 | 
15 | type FeatureValueList []FeatureValue
16 | 
17 | func (ms FeatureValueList) Len() int {
18 | 	return len(ms)
19 | }
20 | 
21 | func (ms FeatureValueList) Less(i, j int) bool {
22 | 	return ms[i].Value > ms[j].Value
23 | }
24 | 
25 | func (ms FeatureValueList) Swap(i, j int) {
26 | 	ms[i], ms[j] = ms[j], ms[i]
27 | }
28 | 
29 | func main() {
30 | 	path := flag.String("input", "", "path of dataset")
31 | 	flag.Parse()
32 | 
33 | 	ds := core.NewDataSet()
34 | 	ds.Load(*path, -1)
35 | 	iv := core.InformationValue(ds)
36 | 	fs := make(FeatureValueList, 0, len(iv))
37 | 	for f, v := range iv {
38 | 		fs = append(fs, FeatureValue{Name: ds.FeatureNameIdMap[f], Value: v})
39 | 	}
40 | 	sort.Sort(fs)
41 | 	for _, f := range fs {
42 | 		fmt.Printf("%s\t%v\n", f.Name, f.Value)
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/bin/hector-mc-run.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"github.com/xlvector/hector"
 6 | 	"log"
 7 | 	"os"
 8 | 	"runtime/pprof"
 9 | )
10 | 
11 | func main() {
12 | 	train, test, pred, method, params := hector.PrepareParams()
13 | 
14 | 	action, _ := params["action"]
15 | 
16 | 	classifier := hector.GetMutliClassClassifier(method)
17 | 
18 | 	profile, _ := params["profile"]
19 | 	if profile != "" {
20 | 		fmt.Printf("Profile data => %s\n", profile)
21 | 		f, err := os.Create(profile)
22 | 		if err != nil {
23 | 			log.Fatal(err)
24 | 		}
25 | 		pprof.StartCPUProfile(f)
26 | 		defer pprof.StopCPUProfile()
27 | 	}
28 | 
29 | 	if action == "" {
30 | 		accuracy, _ := hector.MultiClassRun(classifier, train, test, pred, params)
31 | 		fmt.Println("accuracy : ", accuracy)
32 | 	} else if action == "train" {
33 | 		hector.MultiClassTrain(classifier, train, params)
34 | 
35 | 	} else if action == "test" {
36 | 		accuracy, _ := hector.MultiClassTest(classifier, test, pred, params)
37 | 		fmt.Println("accuracy", accuracy)
38 | 	}
39 | }
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2013 Xiang Liang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/bin/hector-preprocessor.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"github.com/xlvector/hector"
 6 | 	"github.com/xlvector/hector/core"
 7 | 	"log"
 8 | 	"os"
 9 | )
10 | 
11 | func main() {
12 | 	train, test, _, _, params := hector.PrepareParams()
13 | 
14 | 	action, _ := params["action"]
15 | 
16 | 	if action == "encodelabel" {
17 | 
18 | 		fmt.Println("encoded dataset label ..." + train)
19 | 		e := core.NewLabelEncoder()
20 | 		EncodeLabelAction(e, train)
21 | 		fmt.Println("encoded dataset label ..." + test)
22 | 		EncodeLabelAction(e, test)
23 | 	}
24 | 
25 | }
26 | 
27 | func EncodeLabelAction(e *core.LabelEncoder, data_path string) {
28 | 
29 | 	dataset := core.NewDataSet()
30 | 	err := dataset.Load(data_path, -1)
31 | 
32 | 	if err != nil {
33 | 		log.Fatal(err)
34 | 		return
35 | 	}
36 | 
37 | 	encoded_label_dataset := e.TransformDataset(dataset)
38 | 	var output_file *os.File
39 | 
40 | 	output_file, _ = os.Create(data_path + ".hector")
41 | 	for _, sample := range encoded_label_dataset.Samples {
42 | 		output_file.WriteString(string(sample.ToString(false)) + "\n")
43 | 	}
44 | 
45 | 	if output_file != nil {
46 | 		defer output_file.Close()
47 | 	}
48 | }
49 | 


--------------------------------------------------------------------------------
/eval/evaluation_test.go:
--------------------------------------------------------------------------------
 1 | package eval
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 	"math/rand"
 6 | 	"math"
 7 | )
 8 | 
 9 | func TestAUC(t *testing.T){
10 | 	predictions := []*LabelPrediction{}
11 | 	for i := 0; i < 1000; i++ {
12 | 		predictions = append(predictions, &(LabelPrediction{Label: rand.Int() % 2, Prediction: rand.Float64()}))
13 | 	}
14 | 	auc := AUC(predictions)
15 | 	if math.Abs(auc - 0.5) > 0.05{
16 | 		t.Error("Random predictions should have auc arround 0.5")
17 | 	}
18 | 	
19 | 	predictions = nil
20 | 	for i := 0; i < 1000; i++ {
21 | 		label := rand.Int() % 2
22 | 		prediction := rand.Float64()
23 | 		if label == 1 {
24 | 			prediction += 1.0
25 | 		}
26 | 		predictions = append(predictions, &(LabelPrediction{Label: label, Prediction: prediction}))
27 | 	}
28 | 	auc = AUC(predictions)
29 | 	if auc < 0.6 {
30 | 		t.Error("Asending predictions should have auc > 0.5")
31 | 	}
32 | 
33 | }
34 | 
35 | func TestRMSE(t *testing.T){
36 | 	predictions := []*LabelPrediction{}
37 | 	for i := 0; i < 1000; i++ {
38 | 		predictions = append(predictions, &(LabelPrediction{Label: 1, Prediction: 1.0}))
39 | 		rmse := RMSE(predictions)
40 | 
41 | 		if math.Abs(rmse) > 1e-9 {
42 | 			t.Error("RMSE Error")
43 | 		}
44 | 	}
45 | }
46 | 
47 | func TestErrorRate(t *testing.T) {
48 | 	predictions := []*LabelPrediction{}
49 | 	for i := 0; i < 1000; i++ {
50 | 		p := rand.Intn(2)
51 | 		predictions = append(predictions, &(LabelPrediction{Label: p, Prediction: float64(p)}))
52 | 	}
53 | 	error_rate := ErrorRate(predictions)
54 | 	if math.Abs(error_rate) > 1e-9{
55 | 		t.Error("Error Rate Error")
56 | 	}
57 | }


--------------------------------------------------------------------------------
/lr/terminal_criterion.go:
--------------------------------------------------------------------------------
 1 | package lr 
 2 | 
 3 | import ("math")
 4 | 
 5 | /**
 6 |  * It's based the paper "Scalable Training of L1-Regularized Log-Linear Models"
 7 |  * by Galen Andrew and Jianfeng Gao
 8 |  * user: weixuan
 9 |  */
10 | type relativeMeanImprCriterion struct {
11 |     minHist int
12 |     maxHist int
13 |     tolerance float64
14 |     improvement float64
15 |     costList []float64
16 | }
17 | 
18 | func NewRelativeMeanImprCriterion(tolerance float64) *relativeMeanImprCriterion {
19 |     tc := new(relativeMeanImprCriterion)
20 |     tc.minHist = 5
21 |     tc.maxHist = 10
22 |     tc.costList = make([]float64, 0, tc.maxHist)
23 |     tc.tolerance = tolerance
24 |     return tc
25 | }
26 | 
27 | func (tc *relativeMeanImprCriterion) calImprovement() float64{
28 |     sz := len(tc.costList)
29 |     if sz <= tc.minHist {
30 |         return math.MaxFloat32
31 |     }
32 |     first := tc.costList[0]
33 |     last := tc.costList[sz-1]
34 |     impr := (first - last) /float64(sz-1)
35 |     if last != 0 {
36 |         impr = math.Abs(impr / last)
37 |     } else if first != 0 {
38 |         impr = math.Abs(impr / first)
39 |     } else {
40 |         impr = 0
41 |     }
42 |     if sz > tc.maxHist {
43 |         tc.costList = tc.costList[1:]
44 |     }
45 |     return impr
46 | }
47 | 
48 | func (tc *relativeMeanImprCriterion) addCost(latestCost float64) {
49 |     tc.costList = append(tc.costList, latestCost)
50 |     tc.improvement = tc.calImprovement()
51 | }
52 | 
53 | func (tc *relativeMeanImprCriterion) isTerminable() bool {
54 |     return tc.improvement  <= tc.tolerance
55 | }
56 | 


--------------------------------------------------------------------------------
/gp/covariance_function.go:
--------------------------------------------------------------------------------
 1 | package gp
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | 	"math"
 6 | )
 7 | 
 8 | type CovFunc func(*core.Vector, *core.Vector) float64
 9 | 
10 | func CovMatrix(X []*core.RealSample, cov_func CovFunc) *core.Matrix {
11 | 	l := int64(len(X))
12 | 	ret := core.NewMatrix()
13 | 	for i := int64(0); i < l; i++ {
14 | 		for j := i; j < l; j++ {
15 | 			c := cov_func(X[i].GetFeatureVector(), X[j].GetFeatureVector())
16 | 			ret.SetValue(i, j, c)
17 | 			ret.SetValue(j, i, c)
18 | 		}
19 | 	}
20 | 	return ret
21 | }
22 | 
23 | func CovVector(X []*core.RealSample, y *core.RealSample, cov_func CovFunc) *core.Vector {
24 | 	l := int64(len(X))
25 | 	ret := core.NewVector()
26 | 	for i := int64(0); i < l; i++ {
27 | 		ret.SetValue(i, cov_func(X[i].GetFeatureVector(), y.GetFeatureVector()))
28 | 	}
29 | 	return ret
30 | }
31 | 
32 | /*
33 |  Squared error covariance function
34 |  ARD = auto relevance detection, and here indicates there is a scaling/radius factor per dimension
35 | */
36 | type CovSEARD struct {
37 | 	Radiuses *core.Vector // dim -> radius
38 | 	Amp      float64
39 | }
40 | 
41 | func (cov_func *CovSEARD) Init(radiuses *core.Vector, amp float64) {
42 | 	cov_func.Radiuses = radiuses
43 | 	cov_func.Amp = amp
44 | }
45 | 
46 | func (cov_func *CovSEARD) Cov(x1 *core.Vector, x2 *core.Vector) float64 {
47 | 	ret := 0.0
48 | 	tmp := 0.0
49 | 	for key, r := range cov_func.Radiuses.Data {
50 | 		v1 := x1.GetValue(key)
51 | 		v2 := x2.GetValue(key)
52 | 		tmp = (v1 - v2) / r
53 | 		ret += tmp * tmp
54 | 	}
55 | 	ret = cov_func.Amp * math.Exp(-ret)
56 | 	return ret
57 | }
58 | 


--------------------------------------------------------------------------------
/bin/hector-mc-cv.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"github.com/xlvector/hector"
 6 | 	"github.com/xlvector/hector/core"
 7 | 	"log"
 8 | 	"os"
 9 | 	"runtime"
10 | 	"runtime/pprof"
11 | 	"strconv"
12 | )
13 | 
14 | func SplitFile(dataset *core.DataSet, total, part int) (*core.DataSet, *core.DataSet) {
15 | 
16 | 	train := core.NewDataSet()
17 | 	test := core.NewDataSet()
18 | 
19 | 	for i, sample := range dataset.Samples {
20 | 		if i%total == part {
21 | 			test.AddSample(sample)
22 | 		} else {
23 | 			train.AddSample(sample)
24 | 		}
25 | 	}
26 | 	return train, test
27 | }
28 | 
29 | func main() {
30 | 	train_path, _, _, method, params := hector.PrepareParams()
31 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
32 | 	profile, _ := params["profile"]
33 | 	dataset := core.NewDataSet()
34 | 	dataset.Load(train_path, global)
35 | 
36 | 	cv, _ := strconv.ParseInt(params["cv"], 10, 32)
37 | 	total := int(cv)
38 | 
39 | 	if profile != "" {
40 | 		f, err := os.Create(profile)
41 | 		if err != nil {
42 | 			log.Fatal(err)
43 | 		}
44 | 		pprof.StartCPUProfile(f)
45 | 		defer pprof.StopCPUProfile()
46 | 	}
47 | 
48 | 	average_accuracy := 0.0
49 | 	for part := 0; part < total; part++ {
50 | 		train, test := SplitFile(dataset, total, part)
51 | 		classifier := hector.GetMutliClassClassifier(method)
52 | 		classifier.Init(params)
53 | 		accuracy := hector.MultiClassRunOnDataSet(classifier, train, test, "", params)
54 | 		fmt.Println("accuracy : ", accuracy)
55 | 		average_accuracy += accuracy
56 | 		classifier = nil
57 | 		train = nil
58 | 		test = nil
59 | 		runtime.GC()
60 | 	}
61 | 	fmt.Println(average_accuracy / float64(total))
62 | }
63 | 


--------------------------------------------------------------------------------
/hectorcv/hector-cv.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 	"os"
 7 | 	"runtime"
 8 | 	"runtime/pprof"
 9 | 	"strconv"
10 | 
11 | 	"github.com/xlvector/hector"
12 | 	"github.com/xlvector/hector/core"
13 | )
14 | 
15 | func SplitFile(dataset *core.DataSet, total, part int) (*core.DataSet, *core.DataSet) {
16 | 
17 | 	train := core.NewDataSet()
18 | 	test := core.NewDataSet()
19 | 
20 | 	for i, sample := range dataset.Samples {
21 | 		if i%total == part {
22 | 			test.AddSample(sample)
23 | 		} else {
24 | 			train.AddSample(sample)
25 | 		}
26 | 	}
27 | 	return train, test
28 | }
29 | 
30 | func main() {
31 | 	train_path, _, _, method, params := hector.PrepareParams()
32 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
33 | 	profile, _ := params["profile"]
34 | 	dataset := core.NewDataSet()
35 | 	dataset.Load(train_path, global)
36 | 	runtime.GOMAXPROCS(runtime.NumCPU())
37 | 	cv, _ := strconv.ParseInt(params["cv"], 10, 32)
38 | 	total := int(cv)
39 | 
40 | 	if profile != "" {
41 | 		fmt.Println(profile)
42 | 		f, err := os.Create(profile)
43 | 		if err != nil {
44 | 			fmt.Println("%v", err)
45 | 			log.Fatal(err)
46 | 		}
47 | 		pprof.StartCPUProfile(f)
48 | 		defer pprof.StopCPUProfile()
49 | 	}
50 | 
51 | 	average_auc := 0.0
52 | 	for part := 0; part < total; part++ {
53 | 		train, test := SplitFile(dataset, total, part)
54 | 		classifier := hector.GetClassifier(method)
55 | 		classifier.Init(params)
56 | 		auc, _ := hector.AlgorithmRunOnDataSet(classifier, train, test, "", params)
57 | 		fmt.Println("AUC:")
58 | 		fmt.Println(auc)
59 | 		average_auc += auc
60 | 		classifier = nil
61 | 	}
62 | 	fmt.Println(average_auc / float64(total))
63 | }
64 | 


--------------------------------------------------------------------------------
/core/mock_dataset.go:
--------------------------------------------------------------------------------
 1 | package core
 2 | 
 3 | import (
 4 | 	"math"
 5 | 	"math/rand"
 6 | )
 7 | 
 8 | func XORDataSet(n int) *DataSet{
 9 | 	ret := NewDataSet()
10 | 	for i := 0; i < n; i++ {
11 | 		x := 2 * (float64(rand.Intn(2)) - 0.5)
12 | 		y := 2 * (float64(rand.Intn(2)) - 0.5)
13 | 
14 | 		label := 1
15 | 
16 | 		if x * y < 0.0 {
17 | 			label = 0
18 | 		}
19 | 
20 | 		sample := NewSample()
21 | 		sample.Label = label
22 | 		sample.AddFeature(Feature{Id: 1, Value: x})
23 | 		sample.AddFeature(Feature{Id: 2, Value: y})
24 | 		sample.AddFeature(Feature{Id: 3, Value: 1.0})
25 | 		ret.AddSample(sample)
26 | 	}
27 | 	return ret
28 | }
29 | 
30 | func LinearDataSet(n int) *DataSet {
31 | 	ret := NewDataSet()
32 | 	for i := 0; i < n; i++{
33 | 		sample := NewSample()
34 | 		sample.Label = 0
35 | 		for f := 0; f < 100; f++{
36 | 			if rand.Intn(10) != 1 {
37 | 				continue
38 | 			}
39 | 			if f < 20 {
40 | 				sample.Label += 1
41 | 			} else if f > 80 {
42 | 				sample.Label -= 1
43 | 			}
44 | 			sample.AddFeature(Feature{Id: int64(f), Value: 1.0})
45 | 		}
46 | 		if sample.Label > 0 {
47 | 			sample.Label = 1
48 | 		} else {
49 | 			sample.Label = 0
50 | 		}
51 | 		ret.AddSample(sample)
52 | 	}
53 | 	return ret
54 | }
55 | 
56 | func SinusoidalDataSet(n int) *RealDataSet {
57 | 	ret := NewRealDataSet()
58 | 
59 | 	min := -5.0
60 | 	max := 5.0
61 | 	amp := 1.0
62 | 	noise := 0.05
63 | 	period := 4.0
64 | 	interval := (max - min) / float64(n)
65 | 	for i := 0; i < n; i++ {
66 | 		x := min + interval * float64(i) + 0.5*interval
67 | 		y := math.Sin((x-min)*2*math.Pi/period) * amp + rand.NormFloat64()*noise
68 | 		sample := NewRealSample()
69 | 		sample.AddFeature(Feature{Id: int64(1), Value: x})
70 | 		sample.Value = y
71 | 		ret.AddSample(sample)
72 | 	}
73 | 
74 | 	return ret
75 | }
76 | 


--------------------------------------------------------------------------------
/svm/knn.go:
--------------------------------------------------------------------------------
 1 | package svm
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | 	"github.com/xlvector/hector/eval"
 6 | 	"math"
 7 | 	"math/rand"
 8 | 	"strconv"
 9 | )
10 | 
11 | type KNN struct {
12 | 	sv     []*core.Vector
13 | 	labels []int
14 | 	k      int
15 | }
16 | 
17 | func (self *KNN) SaveModel(path string) {
18 | 
19 | }
20 | 
21 | func (self *KNN) LoadModel(path string) {
22 | 
23 | }
24 | 
25 | func (c *KNN) Init(params map[string]string) {
26 | 	K, _ := strconv.ParseInt(params["k"], 10, 64)
27 | 	c.k = int(K)
28 | }
29 | 
30 | func (c *KNN) Kernel(x, y *core.Vector) float64 {
31 | 	z := x.Copy()
32 | 	z.AddVector(y, -1.0)
33 | 	ret := math.Exp(-1.0 * z.NormL2() / 20.0)
34 | 	return ret
35 | }
36 | 
37 | func (c *KNN) Predict(sample *core.Sample) float64 {
38 | 	ret := c.PredictMultiClass(sample)
39 | 	return ret.GetValue(1)
40 | }
41 | 
42 | func (c *KNN) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
43 | 	x := sample.GetFeatureVector()
44 | 	predictions := []*eval.LabelPrediction{}
45 | 	for i, s := range c.sv {
46 | 		predictions = append(predictions, &(eval.LabelPrediction{Label: c.labels[i], Prediction: c.Kernel(s, x)}))
47 | 	}
48 | 
49 | 	compare := func(p1, p2 *eval.LabelPrediction) bool {
50 | 		return p1.Prediction > p2.Prediction
51 | 	}
52 | 
53 | 	eval.By(compare).Sort(predictions)
54 | 
55 | 	ret := core.NewArrayVector()
56 | 	for i, pred := range predictions {
57 | 		if i > c.k {
58 | 			break
59 | 		}
60 | 		ret.AddValue(pred.Label, 1.0)
61 | 	}
62 | 	return ret
63 | }
64 | 
65 | func (c *KNN) Train(dataset *core.DataSet) {
66 | 	c.sv = []*core.Vector{}
67 | 	c.labels = []int{}
68 | 	for i := 0; i < 1000; i++ {
69 | 		k := rand.Intn(len(dataset.Samples))
70 | 		c.sv = append(c.sv, dataset.Samples[k].GetFeatureVector())
71 | 		c.labels = append(c.labels, dataset.Samples[k].Label)
72 | 	}
73 | }
74 | 


--------------------------------------------------------------------------------
/hectorserver/hectorserver.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"fmt"
 6 | 	"github.com/xlvector/hector"
 7 | 	"github.com/xlvector/hector/algo"
 8 | 	"github.com/xlvector/hector/core"
 9 | 	"github.com/xlvector/hector/util"
10 | 	"log"
11 | 	"net/http"
12 | )
13 | 
14 | type ClassifierHandler struct {
15 | 	classifier algo.Classifier
16 | }
17 | 
18 | func (c *ClassifierHandler) ServeHTTP(w http.ResponseWriter,
19 | 	req *http.Request) {
20 | 	sample := core.NewSample()
21 | 	if req.Method != "POST" {
22 | 		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
23 | 		return
24 | 	}
25 | 	features := req.FormValue("features")
26 | 	if len(features) == 0 {
27 | 		http.Error(w, "need input features", http.StatusInternalServerError)
28 | 		return
29 | 	}
30 | 	fs := make(map[string]float64)
31 | 	err := json.Unmarshal([]byte(features), &fs)
32 | 	if err != nil {
33 | 		http.Error(w, err.Error(), http.StatusInternalServerError)
34 | 		return
35 | 	}
36 | 	for k, v := range fs {
37 | 		f := core.Feature{
38 | 			Id:    util.Hash(k),
39 | 			Value: v,
40 | 		}
41 | 		sample.AddFeature(f)
42 | 	}
43 | 	p := c.classifier.Predict(sample)
44 | 	output, err := json.Marshal(map[string]interface{}{
45 | 		"prediction": p,
46 | 	})
47 | 	if err != nil {
48 | 		http.Error(w, err.Error(), http.StatusInternalServerError)
49 | 		return
50 | 	}
51 | 	fmt.Fprint(w, output)
52 | }
53 | 
54 | func main() {
55 | 	_, _, _, method, params := hector.PrepareParams()
56 | 	ch := &ClassifierHandler{
57 | 		classifier: hector.GetClassifier(method),
58 | 	}
59 | 	model, ok := params["model"]
60 | 	if !ok {
61 | 		log.Fatalln("please input model file")
62 | 	}
63 | 	ch.classifier.LoadModel(model)
64 | 	http.Handle("/predict", ch)
65 | 	err := http.ListenAndServe(":"+params["port"], nil)
66 | 	if err != nil {
67 | 		log.Fatal(err)
68 | 	}
69 | }
70 | 


--------------------------------------------------------------------------------
/lr/minimizer_test.go:
--------------------------------------------------------------------------------
 1 | package lr
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | 	"math"
 6 | 	"testing"
 7 | )
 8 | 
 9 | type mseDiffFunction struct {
10 | 	center  core.Vector
11 | 	weights core.Vector
12 | 	grad    core.Vector
13 | 	init    core.Vector
14 | }
15 | 
16 | func getMSECostFunction() *mseDiffFunction {
17 | 	f := new(mseDiffFunction)
18 | 	f.center.Data = map[int64]float64{}
19 | 	f.weights.Data = map[int64]float64{0: 1, 1: 0.01}
20 | 	f.init.Data = map[int64]float64{0: 1, 1: 1}
21 | 	f.grad.Data = map[int64]float64{0: 0, 1: 0}
22 | 	return f
23 | }
24 | 
25 | func (f *mseDiffFunction) Value(x *core.Vector) float64 {
26 | 	var cost float64 = 0
27 | 	for n, val := range x.Data {
28 | 		diff := val - f.center.GetValue(n)
29 | 		cost += f.weights.GetValue(n) * diff * diff
30 | 	}
31 | 	return 0.5 * cost
32 | }
33 | 
34 | // Gradients for different points could use the same memory
35 | func (f *mseDiffFunction) Gradient(x *core.Vector) *core.Vector {
36 | 	for n, val := range x.Data {
37 | 		f.grad.SetValue(n, f.weights.GetValue(n)*(val-f.center.GetValue(n)))
38 | 	}
39 | 	return &f.grad
40 | }
41 | 
42 | func (f *mseDiffFunction) testResult(result *core.Vector, tolerance float64, t *testing.T) {
43 | 	for n, val := range result.Data {
44 | 		if math.Abs(val-f.center.GetValue(n)) > tolerance {
45 | 			t.Errorf("Mismatch\nIndex\tTrue\tResult\n%d\t%e\t%e\n", n, f.center.GetValue(n), val)
46 | 		}
47 | 	}
48 | }
49 | 
50 | func TestLBFGS(t *testing.T) {
51 | 	diffFunc := getMSECostFunction()
52 | 	minimizer := NewLBFGSMinimizer()
53 | 	result := minimizer.Minimize(diffFunc, &(diffFunc.init))
54 | 	diffFunc.testResult(result, 1e-6, t)
55 | }
56 | 
57 | func TestOWLQN(t *testing.T) {
58 | 	diffFunc := getMSECostFunction()
59 | 	minimizer := NewOWLQNMinimizer(0.001)
60 | 	result := minimizer.Minimize(diffFunc, &(diffFunc.init))
61 | 	diffFunc.testResult(result, 0, t)
62 | }
63 | 


--------------------------------------------------------------------------------
/core/label_preprocessing.go:
--------------------------------------------------------------------------------
 1 | package core
 2 | 
 3 | import(
 4 |     "fmt"
 5 | )
 6 | 
 7 | type IntEncoder struct {
 8 |     Mapping map[int]int
 9 |     InverseMapping map[int]int
10 | }
11 | 
12 | func NewIntEncoder() *IntEncoder {
13 |     e := IntEncoder{}
14 |     e.Mapping = make(map[int]int)
15 |     e.InverseMapping = make(map[int]int)
16 |     return &e
17 | }
18 | 
19 | func (e *IntEncoder) Encoded(original int) int{
20 |     if encoded, ok := e.Mapping[original]; ok {
21 |         return encoded    
22 |     }
23 | 
24 |     e.Mapping[original] = len(e.Mapping)
25 |     encoded := e.Mapping[original]
26 |     e.InverseMapping[encoded] = original            
27 |     return encoded
28 | }
29 | 
30 | func (e *IntEncoder) Decoded(encoded int) (int, error){
31 |     if decoded, ok := e.InverseMapping[encoded]; ok {
32 |         return decoded, nil  
33 |     }
34 | 
35 |     return -1, fmt.Errorf("Can't find %d in dictionary...", encoded)
36 | }
37 | 
38 | type LabelEncoder struct { 
39 |     labelMapper *IntEncoder
40 | }
41 | 
42 | func NewLabelEncoder() *LabelEncoder {
43 |     e := LabelEncoder{}
44 |     e.labelMapper = NewIntEncoder()
45 |     return &e
46 | }
47 | 
48 | func (e *LabelEncoder) TransformSample(s *Sample) *Sample{
49 |     ret := s.Clone()
50 |     ret.Label = e.labelMapper.Encoded(ret.Label)
51 |     return ret
52 | }
53 | 
54 | func (e *LabelEncoder) TransformDataset(dataset *DataSet) *DataSet{
55 |     ret := NewDataSet()
56 |     for _, sample := range dataset.Samples {
57 |         ret.AddSample(e.TransformSample(sample))
58 |     }
59 | 
60 |     return ret
61 | }
62 | 
63 | func (e *LabelEncoder) InverseTransformSample(s *Sample) *Sample{
64 |     ret := s.Clone()
65 |     ret.Label, _ = e.labelMapper.Decoded(ret.Label)
66 |     return ret
67 | }
68 | 
69 | func (e *LabelEncoder) InverseTransformDataset(dataset *DataSet) *DataSet{
70 |     ret := NewDataSet()
71 |     for _, sample := range dataset.Samples {
72 |         ret.AddSample(e.InverseTransformSample(sample))
73 |     }
74 | 
75 |     return ret
76 | }


--------------------------------------------------------------------------------
/core/matrix.go:
--------------------------------------------------------------------------------
 1 | package core
 2 | 
 3 | type Matrix struct {
 4 | 	Data map[int64]*Vector
 5 | }
 6 | 
 7 | func NewMatrix() *Matrix {
 8 | 	m := Matrix{}
 9 | 	m.Data = make(map[int64]*Vector)
10 | 	return &m
11 | }
12 | 
13 | func (m *Matrix) AddValue(k1, k2 int64, v float64){
14 | 	_, ok := m.Data[k1]
15 | 	if !ok {
16 | 		m.Data[k1] = NewVector()
17 | 	}
18 | 	m.Data[k1].AddValue(k2, v)
19 | }
20 | 
21 | func (m *Matrix) SetValue(k1, k2 int64, v float64){
22 | 	row, ok := m.Data[k1]
23 | 	if !ok {
24 | 		row = NewVector()
25 | 		m.Data[k1] = row
26 | 	}
27 | 	row.SetValue(k2, v)
28 | }
29 | 
30 | func (m *Matrix) GetValue(k1, k2 int64) float64 {
31 | 	row := m.GetRow(k1)
32 | 	if row == nil {
33 | 		return 0.0
34 | 	} else {
35 | 		return row.GetValue(k2)
36 | 	}
37 | }
38 | 
39 | func (m *Matrix) GetRow(k1 int64) *Vector {
40 | 	row, ok := m.Data[k1]
41 | 	if !ok {
42 | 		return nil
43 | 	} else {
44 | 		return row
45 | 	}
46 | }
47 | 
48 | func (m *Matrix) Scale(scale float64) *Matrix {
49 | 	ret := NewMatrix()
50 | 	for id, vi := range m.Data {
51 | 		ret.Data[id] = vi.Scale(scale)
52 | 	}
53 | 	return ret
54 | }
55 | 
56 | func (m *Matrix) MultiplyVector(v *Vector) *Vector {
57 | 	// This is intended for l-by-m * m-by-1
58 | 	// For m-by-1 * 1-by-n, use OuterProduct in vector.go
59 | 	// Probably should just have a MatrixMultiply for everything
60 | 	ret := NewVector()
61 | 	for id, vi := range m.Data {
62 | 		ret.SetValue(id, v.Dot(vi))
63 | 	}
64 | 	return ret
65 | }
66 | 
67 | func (m *Matrix) Trans() *Matrix {
68 | 	ret := NewMatrix()
69 | 	for rid, vi := range m.Data {
70 | 		for cid, w := range vi.Data {
71 | 			ret.SetValue(cid, rid, w)
72 | 		}
73 | 	}
74 | 	return ret
75 | }
76 | 
77 | func (m *Matrix) ElemWiseAddMatrix(n *Matrix) *Matrix {
78 | 	ret := NewMatrix()
79 | 	for key, mi := range m.Data{
80 | 		ret.Data[key] = mi
81 | 	}
82 | 	for key, ni := range n.Data{
83 | 		if ret.GetRow(key) == nil{
84 | 			ret.Data[key] = ni
85 | 		} else {
86 | 			ret.Data[key] = ni.ElemWiseAddVector(ret.GetRow(key))
87 | 		}
88 | 	}
89 | 	return ret
90 | }
91 | 


--------------------------------------------------------------------------------
/sa/sa_auc.go:
--------------------------------------------------------------------------------
 1 | package sa
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"github.com/xlvector/hector/core"
 6 | 	"github.com/xlvector/hector/eval"
 7 | 	"math/rand"
 8 | )
 9 | 
10 | type SAOptAUC struct {
11 | 	Model map[int64]float64
12 | }
13 | 
14 | func (self *SAOptAUC) SaveModel(path string) {
15 | 
16 | }
17 | 
18 | func (self *SAOptAUC) LoadModel(path string) {
19 | 
20 | }
21 | 
22 | func (algo *SAOptAUC) Init(params map[string]string) {
23 | 	algo.Model = make(map[int64]float64)
24 | }
25 | 
26 | func (algo *SAOptAUC) TrainAUC(samples []*core.Sample) float64 {
27 | 	predictions := []*eval.LabelPrediction{}
28 | 	for _, sample := range samples {
29 | 		pred := algo.Predict(sample)
30 | 		predictions = append(predictions, &(eval.LabelPrediction{Label: sample.Label, Prediction: pred}))
31 | 	}
32 | 	return eval.AUC(predictions)
33 | }
34 | 
35 | func (algo *SAOptAUC) Train(dataset *core.DataSet) {
36 | 	algo.Model = make(map[int64]float64)
37 | 	samples := []*core.Sample{}
38 | 	for _, sample := range dataset.Samples {
39 | 		for _, feature := range sample.Features {
40 | 			algo.Model[feature.Id] = 1.0 / float64(len(sample.Features))
41 | 		}
42 | 		samples = append(samples, sample)
43 | 	}
44 | 
45 | 	features := []int64{}
46 | 	for fid, _ := range algo.Model {
47 | 		features = append(features, fid)
48 | 	}
49 | 
50 | 	prev_auc := 0.5
51 | 	for i := 0; i < 5000; i++ {
52 | 		add := rand.Float64()
53 | 		fid := features[rand.Intn(len(features))]
54 | 		fweight := algo.Model[fid]
55 | 		algo.Model[fid] = add
56 | 		auc := algo.TrainAUC(samples)
57 | 
58 | 		if i%500 == 0 {
59 | 			fmt.Println(prev_auc)
60 | 		}
61 | 
62 | 		if prev_auc < auc {
63 | 			prev_auc = auc
64 | 		} else {
65 | 			algo.Model[fid] = fweight
66 | 		}
67 | 	}
68 | 	fmt.Println(algo.Model)
69 | }
70 | 
71 | func (algo *SAOptAUC) Predict(sample *core.Sample) float64 {
72 | 	ret := 0.0
73 | 	for _, feature := range sample.Features {
74 | 		model_feature_value, ok := algo.Model[feature.Id]
75 | 		if ok {
76 | 			ret += model_feature_value * feature.Value
77 | 		}
78 | 	}
79 | 	return ret
80 | }
81 | 


--------------------------------------------------------------------------------
/classifier_test.go:
--------------------------------------------------------------------------------
 1 | package hector
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func TestClassifiers(t *testing.T) {
 9 | 	train_dataset := core.LinearDataSet(1000)
10 | 	test_dataset := core.LinearDataSet(500)
11 | 
12 | 	algos := []string{"ep", "fm", "ftrl", "lr", "linear_svm", "lr_owlqn"}
13 | 
14 | 	params := make(map[string]string)
15 | 	params["beta"] = "1.0"
16 | 	params["steps"] = "10"
17 | 	params["lambda1"] = "0.1"
18 | 	params["lambda2"] = "1.0"
19 | 	params["alpha"] = "0.1"
20 | 	params["max-depth"] = "20"
21 | 	params["min-leaf-size"] = "5"
22 | 	params["tree-count"] = "10"
23 | 	params["learning-rate"] = "0.05"
24 | 	params["regularization"] = "0.0001"
25 | 	params["e"] = "0.1"
26 | 	params["c"] = "0.1"
27 | 	params["gini"] = "1.0"
28 | 	params["factors"] = "10"
29 | 
30 | 	for _, algo := range algos {
31 | 		classifier := GetClassifier(algo)
32 | 		classifier.Init(params)
33 | 		auc, _ := AlgorithmRunOnDataSet(classifier, train_dataset, test_dataset, "", params)
34 | 
35 | 		t.Logf("auc of %s in linear dataset is %f", algo, auc)
36 | 		if auc < 0.9 {
37 | 			t.Error("auc less than 0.9 in linear dataset")
38 | 		}
39 | 	}
40 | }
41 | 
42 | func TestClassifiersOnXOR(t *testing.T) {
43 | 	algos := []string{"ann", "rf", "rdt", "knn"}
44 | 
45 | 	params := make(map[string]string)
46 | 	params["steps"] = "30"
47 | 	params["max-depth"] = "10"
48 | 	params["min-leaf-size"] = "10"
49 | 	params["tree-count"] = "100"
50 | 	params["learning-rate"] = "0.1"
51 | 	params["learning-rate-discount"] = "1.0"
52 | 	params["regularization"] = "0.0001"
53 | 	params["gini"] = "1.0"
54 | 	params["hidden"] = "15"
55 | 	params["k"] = "10"
56 | 	params["feature-count"] = "1.0"
57 | 	params["dt-sample-ratio"] = "1.0"
58 | 
59 | 	for _, algo := range algos {
60 | 		train_dataset := core.XORDataSet(1000)
61 | 		test_dataset := core.XORDataSet(500)
62 | 		classifier := GetClassifier(algo)
63 | 		classifier.Init(params)
64 | 		auc, _ := AlgorithmRunOnDataSet(classifier, train_dataset, test_dataset, "", params)
65 | 
66 | 		t.Logf("auc of %s in xor dataset is %f", algo, auc)
67 | 		if auc < 0.9 {
68 | 			t.Error("auc less than 0.9 in xor dataset")
69 | 		}
70 | 	}
71 | }
72 | 


--------------------------------------------------------------------------------
/util/string_util.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"fmt"
 6 | 	"io"
 7 | 	"strconv"
 8 | 	"io/ioutil"
 9 | )
10 | 
11 | type StringBuilder struct {
12 | 	buffer bytes.Buffer
13 | }
14 | 
15 | func (self *StringBuilder) Write(strings ...string) *StringBuilder {
16 | 	for _, str := range strings {
17 | 		self.buffer.WriteString(str)
18 | 	}
19 | 	return self
20 | }
21 | 
22 | func (self *StringBuilder) Printf(format string, args ...interface{}) *StringBuilder {
23 | 	fmt.Fprintf(&self.buffer, format, args...)
24 | 	return self
25 | }
26 | 
27 | func (self *StringBuilder) Byte(value byte) *StringBuilder {
28 | 	self.buffer.WriteByte(value)
29 | 	return self
30 | }
31 | 
32 | func (self *StringBuilder) WriteBytes(bytes []byte) *StringBuilder {
33 | 	self.buffer.Write(bytes)
34 | 	return self
35 | }
36 | 
37 | func (self *StringBuilder) Int(value int) *StringBuilder {
38 | 	self.buffer.WriteString(strconv.Itoa(value))
39 | 	return self
40 | }
41 | 
42 | func (self *StringBuilder) Int64(value int64) *StringBuilder {
43 | 	self.buffer.WriteString(strconv.FormatInt(value, 10))
44 | 	return self
45 | }
46 | 
47 | func (self *StringBuilder) UInt64(value uint64) *StringBuilder {
48 | 	self.buffer.WriteString(strconv.FormatUint(value, 10))
49 | 	return self
50 | }
51 | 
52 | func (self *StringBuilder) Uint(value uint) *StringBuilder {
53 | 	self.buffer.WriteString(strconv.FormatUint(uint64(value), 10))
54 | 	return self
55 | }
56 | 
57 | func (self *StringBuilder) Float(value float64) *StringBuilder {
58 | 	self.buffer.WriteString(strconv.FormatFloat(value, 'f', -1, 64))
59 | 	return self
60 | }
61 | 
62 | func (self *StringBuilder) Bool(value bool) *StringBuilder {
63 | 	self.buffer.WriteString(strconv.FormatBool(value))
64 | 	return self
65 | }
66 | 
67 | func (self *StringBuilder) WriteTo(writer io.Writer) (n int64, err error) {
68 | 	return self.buffer.WriteTo(writer)
69 | }
70 | 
71 | func (self *StringBuilder) WriteToFile(filename string) error {
72 | 	return ioutil.WriteFile(filename, self.buffer.Bytes(), 0600)
73 | }
74 | 
75 | func (self *StringBuilder) Bytes() []byte {
76 | 	return self.buffer.Bytes()
77 | }
78 | 
79 | func (self *StringBuilder) String() string {
80 | 	return self.buffer.String()
81 | }
82 | 


--------------------------------------------------------------------------------
/dt/gbdt.go:
--------------------------------------------------------------------------------
 1 | package dt
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"fmt"
 6 | 	"github.com/xlvector/hector/core"
 7 | 	"math"
 8 | 	"os"
 9 | 	"strconv"
10 | )
11 | 
12 | type GBDT struct {
13 | 	dts        []*RegressionTree
14 | 	tree_count int
15 | 	shrink     float64
16 | }
17 | 
18 | func (self *GBDT) SaveModel(path string) {
19 | 	file, _ := os.Create(path)
20 | 	defer file.Close()
21 | 	for _, dt := range self.dts {
22 | 		buf := dt.tree.ToString()
23 | 		file.Write(buf)
24 | 		file.WriteString("\n#\n")
25 | 	}
26 | }
27 | 
28 | func (self *GBDT) LoadModel(path string) {
29 | 	file, _ := os.Open(path)
30 | 	defer file.Close()
31 | 
32 | 	self.dts = []*RegressionTree{}
33 | 	scanner := bufio.NewScanner(file)
34 | 	text := ""
35 | 	for scanner.Scan() {
36 | 		line := scanner.Text()
37 | 		if line == "#" {
38 | 			tree := Tree{}
39 | 			tree.FromString(text)
40 | 			dt := RegressionTree{tree: tree}
41 | 			self.dts = append(self.dts, &dt)
42 | 			text = ""
43 | 		} else {
44 | 			text += line + "\n"
45 | 		}
46 | 	}
47 | }
48 | 
49 | func (c *GBDT) Init(params map[string]string) {
50 | 	tree_count, _ := strconv.ParseInt(params["tree-count"], 10, 64)
51 | 	c.tree_count = int(tree_count)
52 | 	for i := 0; i < c.tree_count; i++ {
53 | 		dt := RegressionTree{}
54 | 		dt.Init(params)
55 | 		c.dts = append(c.dts, &dt)
56 | 	}
57 | 	c.shrink, _ = strconv.ParseFloat(params["learning-rate"], 64)
58 | }
59 | 
60 | func (c *GBDT) RMSE(dataset *core.DataSet) float64 {
61 | 	rmse := 0.0
62 | 	n := 0.0
63 | 	for _, sample := range dataset.Samples {
64 | 		rmse += (sample.Prediction) * (sample.Prediction)
65 | 		n += 1.0
66 | 	}
67 | 	return math.Sqrt(rmse / n)
68 | }
69 | 
70 | func (c *GBDT) Train(dataset *core.DataSet) {
71 | 	for _, sample := range dataset.Samples {
72 | 		sample.Prediction = sample.LabelDoubleValue()
73 | 	}
74 | 	for k, dt := range c.dts {
75 | 		dt.Train(dataset)
76 | 		for _, sample := range dataset.Samples {
77 | 			sample.Prediction -= c.shrink * dt.Predict(sample)
78 | 		}
79 | 		if k%10 == 0 {
80 | 			fmt.Println(c.RMSE(dataset))
81 | 		}
82 | 	}
83 | }
84 | 
85 | func (c *GBDT) Predict(sample *core.Sample) float64 {
86 | 	ret := 0.0
87 | 	for _, dt := range c.dts {
88 | 		ret += c.shrink * dt.Predict(sample)
89 | 	}
90 | 	return ret
91 | }
92 | 


--------------------------------------------------------------------------------
/lr/linear_regression.go:
--------------------------------------------------------------------------------
 1 | package lr
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"github.com/xlvector/hector/core"
 6 | 	"github.com/xlvector/hector/util"
 7 | 	"os"
 8 | 	"strconv"
 9 | 	"strings"
10 | )
11 | 
12 | type LinearRegression struct {
13 | 	Model  map[int64]float64
14 | 	Params LogisticRegressionParams
15 | }
16 | 
17 | func (algo *LinearRegression) SaveModel(path string) {
18 | 	sb := util.StringBuilder{}
19 | 	for f, g := range algo.Model {
20 | 		sb.Int64(f)
21 | 		sb.Write("\t")
22 | 		sb.Float(g)
23 | 		sb.Write("\n")
24 | 	}
25 | 	sb.WriteToFile(path)
26 | }
27 | 
28 | func (algo *LinearRegression) LoadModel(path string) {
29 | 	file, _ := os.Open(path)
30 | 	defer file.Close()
31 | 
32 | 	scaner := bufio.NewScanner(file)
33 | 	for scaner.Scan() {
34 | 		line := scaner.Text()
35 | 		tks := strings.Split(line, "\t")
36 | 		fid, _ := strconv.ParseInt(tks[0], 10, 64)
37 | 		fw, _ := strconv.ParseFloat(tks[1], 64)
38 | 		algo.Model[fid] = fw
39 | 	}
40 | }
41 | 
42 | func (algo *LinearRegression) Init(params map[string]string) {
43 | 	algo.Model = make(map[int64]float64)
44 | 
45 | 	algo.Params.LearningRate, _ = strconv.ParseFloat(params["learning-rate"], 64)
46 | 	algo.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64)
47 | }
48 | 
49 | func (algo *LinearRegression) Train(dataset *core.DataSet) {
50 | 	algo.Model = make(map[int64]float64)
51 | 	for step := 0; step < algo.Params.Steps; step++ {
52 | 		for _, sample := range dataset.Samples {
53 | 			prediction := algo.Predict(sample)
54 | 			err := sample.LabelDoubleValue() - prediction
55 | 			for _, feature := range sample.Features {
56 | 				model_feature_value, ok := algo.Model[feature.Id]
57 | 				if !ok {
58 | 					model_feature_value = 0.0
59 | 				}
60 | 				model_feature_value += algo.Params.LearningRate * (err*feature.Value - algo.Params.Regularization*model_feature_value)
61 | 				algo.Model[feature.Id] = model_feature_value
62 | 			}
63 | 		}
64 | 		algo.Params.LearningRate *= 0.9
65 | 	}
66 | }
67 | 
68 | func (algo *LinearRegression) Predict(sample *core.Sample) float64 {
69 | 	ret := 0.0
70 | 	for _, feature := range sample.Features {
71 | 		model_feature_value, ok := algo.Model[feature.Id]
72 | 		if ok {
73 | 			ret += model_feature_value * feature.Value
74 | 		}
75 | 	}
76 | 	return ret
77 | }
78 | 


--------------------------------------------------------------------------------
/lr/lbfgs_minimizer.go:
--------------------------------------------------------------------------------
 1 | package lr
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"github.com/xlvector/hector/core"
 6 | )
 7 | 
 8 | /**
 9 |  * It's based the paper "Scalable Training of L1-Regularized Log-Linear Models"
10 |  * by Galen Andrew and Jianfeng Gao
11 |  * user: weixuan
12 |  * To change this template use File | Settings | File Templates.
13 |  */
14 | type LBFGSMinimizer struct {
15 | 	costFun      DiffFunction
16 | 	numHist      int
17 | 	maxIteration int
18 | 	tolerance    float64
19 | }
20 | 
21 | var lbfgs_output_switch bool = false
22 | 
23 | func NewLBFGSMinimizer() *LBFGSMinimizer {
24 | 	m := new(LBFGSMinimizer)
25 | 	m.numHist = 10
26 | 	m.maxIteration = 200
27 | 	m.tolerance = 1e-4
28 | 	return m
29 | }
30 | 
31 | func (m *LBFGSMinimizer) Minimize(costfun DiffFunction, init *core.Vector) *core.Vector {
32 | 	m.costFun = costfun
33 | 	var cost float64 = costfun.Value(init)
34 | 	var grad *core.Vector = costfun.Gradient(init).Copy()
35 | 	var pos *core.Vector = init.Copy()
36 | 	var terminalCriterion *relativeMeanImprCriterion = NewRelativeMeanImprCriterion(m.tolerance)
37 | 	terminalCriterion.addCost(cost)
38 | 
39 | 	var helper *QuasiNewtonHelper = NewQuasiNewtonHelper(m.numHist, m, pos, grad)
40 | 	if lbfgs_output_switch {
41 | 		fmt.Println("Iter\tcost\timprovement")
42 | 		fmt.Printf("%d\t%e\tUndefined", 0, cost)
43 | 	}
44 | 	for iter := 1; iter <= m.maxIteration; iter++ {
45 | 		dir := grad.Copy()
46 | 		dir.ApplyScale(-1.0)
47 | 		helper.ApplyQuasiInverseHession(dir)
48 | 		newCost, newPos := helper.BackTrackingLineSearch(cost, pos, grad, dir, iter == 1)
49 | 		if lbfgs_output_switch {
50 | 			fmt.Println("")
51 | 		}
52 | 		if cost == newCost {
53 | 			break
54 | 		}
55 | 		cost = newCost
56 | 		pos = newPos
57 | 		grad = costfun.Gradient(pos).Copy()
58 | 		terminalCriterion.addCost(cost)
59 | 		if lbfgs_output_switch {
60 | 			fmt.Printf("%d\t%e\t%e", iter, newCost, terminalCriterion.improvement)
61 | 		}
62 | 		if terminalCriterion.isTerminable() || helper.UpdateState(pos, grad) {
63 | 			if lbfgs_output_switch {
64 | 				fmt.Println("")
65 | 			}
66 | 			break
67 | 		}
68 | 	}
69 | 	return pos
70 | }
71 | 
72 | func (m *LBFGSMinimizer) Evaluate(pos *core.Vector) float64 {
73 | 	return m.costFun.Value(pos)
74 | }
75 | 
76 | func (m *LBFGSMinimizer) NextPoint(curPos *core.Vector, dir *core.Vector, alpha float64) *core.Vector {
77 | 	if lbfgs_output_switch {
78 | 		fmt.Printf(".")
79 | 	}
80 | 	return curPos.ElemWiseMultiplyAdd(dir, alpha)
81 | }
82 | 


--------------------------------------------------------------------------------
/fm/factorize_machine.go:
--------------------------------------------------------------------------------
 1 | package fm
 2 | 
 3 | import (
 4 | 	"github.com/xlvector/hector/core"
 5 | 	"github.com/xlvector/hector/util"
 6 | 	"strconv"
 7 | )
 8 | 
 9 | type FactorizeMachine struct {
10 | 	w      *core.Vector
11 | 	v      []*core.Vector
12 | 	params FactorizeMachineParams
13 | }
14 | 
15 | type FactorizeMachineParams struct {
16 | 	LearningRate   float64
17 | 	Regularization float64
18 | 	FactorNumber   int
19 | }
20 | 
21 | func (self *FactorizeMachine) SaveModel(path string) {
22 | 
23 | }
24 | 
25 | func (self *FactorizeMachine) LoadModel(path string) {
26 | 
27 | }
28 | 
29 | func (c *FactorizeMachine) Predict(sample *core.Sample) float64 {
30 | 	for _, f := range sample.Features {
31 | 		c.w.RandomInit(f.Id, 0.1)
32 | 		for k, _ := range c.v {
33 | 			c.v[k].RandomInit(f.Id, 0.1)
34 | 		}
35 | 	}
36 | 	ret := c.w.DotFeatures(sample.Features)
37 | 	for k, _ := range c.v {
38 | 		a := c.v[k].DotFeatures(sample.Features)
39 | 		b := 0.0
40 | 		for _, f := range sample.Features {
41 | 			vkf := c.v[k].GetValue(f.Id)
42 | 			b += f.Value * f.Value * vkf * vkf
43 | 		}
44 | 		ret += 0.5 * (a*a - b)
45 | 	}
46 | 	return util.Sigmoid(ret)
47 | }
48 | 
49 | func (c *FactorizeMachine) Init(params map[string]string) {
50 | 	c.w = core.NewVector()
51 | 	factor_number, _ := strconv.ParseInt(params["factors"], 10, 64)
52 | 	c.params.FactorNumber = int(factor_number)
53 | 	c.params.LearningRate, _ = strconv.ParseFloat(params["learning-rate"], 64)
54 | 	c.params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64)
55 | 
56 | 	for i := 0; i < c.params.FactorNumber; i++ {
57 | 		c.v = append(c.v, core.NewVector())
58 | 	}
59 | }
60 | 
61 | func (c *FactorizeMachine) Train(dataset *core.DataSet) {
62 | 	n := 0
63 | 	for _, sample := range dataset.Samples {
64 | 		n += 1
65 | 		if n%10000 == 0 {
66 | 			c.params.LearningRate *= 0.9
67 | 		}
68 | 		pred := c.Predict(sample)
69 | 		err := sample.LabelDoubleValue() - pred
70 | 
71 | 		vx := []float64{}
72 | 		for _, vf := range c.v {
73 | 			vx = append(vx, vf.DotFeatures(sample.Features))
74 | 		}
75 | 		for _, f := range sample.Features {
76 | 			fweight := c.w.GetValue(f.Id)
77 | 			fweight += c.params.LearningRate * (err*f.Value - c.params.Regularization*fweight)
78 | 			c.w.SetValue(f.Id, fweight)
79 | 
80 | 			for k, _ := range c.v {
81 | 				vkx := c.v[k].GetValue(f.Id)
82 | 				vkx += c.params.LearningRate * (err*(f.Value*vx[k]-f.Value*f.Value*vkx) - c.params.Regularization*vkx)
83 | 				c.v[k].SetValue(f.Id, vkx)
84 | 			}
85 | 		}
86 | 	}
87 | }
88 | 


--------------------------------------------------------------------------------
/lr/logistic_regression.go:
--------------------------------------------------------------------------------
 1 | package lr
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"os"
 6 | 	"strconv"
 7 | 	"strings"
 8 | 
 9 | 	"github.com/xlvector/hector/core"
10 | 	"github.com/xlvector/hector/util"
11 | )
12 | 
13 | type LogisticRegressionParams struct {
14 | 	LearningRate   float64
15 | 	Regularization float64
16 | 	Steps          int
17 | }
18 | 
19 | type LogisticRegression struct {
20 | 	Model  map[int64]float64
21 | 	Params LogisticRegressionParams
22 | }
23 | 
24 | func (algo *LogisticRegression) SaveModel(path string) {
25 | 	sb := util.StringBuilder{}
26 | 	for f, g := range algo.Model {
27 | 		sb.Int64(f)
28 | 		sb.Write("\t")
29 | 		sb.Float(g)
30 | 		sb.Write("\n")
31 | 	}
32 | 	sb.WriteToFile(path)
33 | }
34 | 
35 | func (algo *LogisticRegression) LoadModel(path string) {
36 | 	file, _ := os.Open(path)
37 | 	defer file.Close()
38 | 	algo.Model = make(map[int64]float64)
39 | 	scaner := bufio.NewScanner(file)
40 | 	for scaner.Scan() {
41 | 		line := scaner.Text()
42 | 		tks := strings.Split(line, "\t")
43 | 		fid, _ := strconv.ParseInt(tks[0], 10, 64)
44 | 		fw, _ := strconv.ParseFloat(tks[1], 64)
45 | 		algo.Model[fid] = fw
46 | 	}
47 | }
48 | 
49 | func (algo *LogisticRegression) Init(params map[string]string) {
50 | 	algo.Model = make(map[int64]float64)
51 | 
52 | 	algo.Params.LearningRate, _ = strconv.ParseFloat(params["learning-rate"], 64)
53 | 	algo.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64)
54 | 	steps, _ := strconv.ParseInt(params["steps"], 10, 32)
55 | 	algo.Params.Steps = int(steps)
56 | }
57 | 
58 | func (algo *LogisticRegression) Train(dataset *core.DataSet) {
59 | 	algo.Model = make(map[int64]float64)
60 | 	for step := 0; step < algo.Params.Steps; step++ {
61 | 		for _, sample := range dataset.Samples {
62 | 			prediction := algo.Predict(sample)
63 | 			err := sample.LabelDoubleValue() - prediction
64 | 			for _, feature := range sample.Features {
65 | 				model_feature_value, ok := algo.Model[feature.Id]
66 | 				if !ok {
67 | 					model_feature_value = 0.0
68 | 				}
69 | 				model_feature_value += algo.Params.LearningRate * (err*feature.Value - algo.Params.Regularization*model_feature_value)
70 | 				algo.Model[feature.Id] = model_feature_value
71 | 			}
72 | 		}
73 | 		algo.Params.LearningRate *= 0.9
74 | 	}
75 | }
76 | 
77 | func (algo *LogisticRegression) Predict(sample *core.Sample) float64 {
78 | 	ret := 0.0
79 | 	for _, feature := range sample.Features {
80 | 		model_feature_value, ok := algo.Model[feature.Id]
81 | 		if ok {
82 | 			ret += model_feature_value * feature.Value
83 | 		}
84 | 	}
85 | 	return util.Sigmoid(ret)
86 | }
87 | 


--------------------------------------------------------------------------------
/lr/logistic_regression_streaming.go:
--------------------------------------------------------------------------------
 1 | package lr
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"log"
 6 | 	"math"
 7 | 	"os"
 8 | 	"strconv"
 9 | 	"strings"
10 | 
11 | 	"github.com/xlvector/hector/core"
12 | 	"github.com/xlvector/hector/util"
13 | )
14 | 
15 | type LogisticRegressionStream struct {
16 | 	Model  map[int64]float64
17 | 	Params LogisticRegressionParams
18 | }
19 | 
20 | func (algo *LogisticRegressionStream) SaveModel(path string) {
21 | 	sb := util.StringBuilder{}
22 | 	for f, g := range algo.Model {
23 | 		sb.Int64(f)
24 | 		sb.Write("\t")
25 | 		sb.Float(g)
26 | 		sb.Write("\n")
27 | 	}
28 | 	sb.WriteToFile(path)
29 | }
30 | 
31 | func (algo *LogisticRegressionStream) LoadModel(path string) {
32 | 	file, _ := os.Open(path)
33 | 	defer file.Close()
34 | 
35 | 	scaner := bufio.NewScanner(file)
36 | 	for scaner.Scan() {
37 | 		line := scaner.Text()
38 | 		tks := strings.Split(line, "\t")
39 | 		fid, _ := strconv.ParseInt(tks[0], 10, 64)
40 | 		fw, _ := strconv.ParseFloat(tks[1], 64)
41 | 		algo.Model[fid] = fw
42 | 	}
43 | }
44 | 
45 | func (algo *LogisticRegressionStream) Init(params map[string]string) {
46 | 	algo.Model = make(map[int64]float64)
47 | 
48 | 	algo.Params.LearningRate, _ = strconv.ParseFloat(params["learning-rate"], 64)
49 | 	algo.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64)
50 | 	steps, _ := strconv.ParseInt(params["steps"], 10, 32)
51 | 	algo.Params.Steps = int(steps)
52 | }
53 | 
54 | func (algo *LogisticRegressionStream) Train(dataset *core.StreamingDataSet) {
55 | 	algo.Model = make(map[int64]float64)
56 | 	totalErr := 0.0
57 | 	n := 0
58 | 	for sample := range dataset.Samples {
59 | 		prediction := algo.Predict(sample)
60 | 		err := sample.LabelDoubleValue() - prediction
61 | 		totalErr += math.Abs(err)
62 | 		n += 1
63 | 		if n%100000 == 0 {
64 | 			log.Println("proc ", n, totalErr/100000.0, sample.LabelDoubleValue(), prediction)
65 | 			totalErr = 0.0
66 | 		}
67 | 		for _, feature := range sample.Features {
68 | 			model_feature_value, ok := algo.Model[feature.Id]
69 | 			if !ok {
70 | 				model_feature_value = 0.0
71 | 			}
72 | 			model_feature_value += algo.Params.LearningRate * (err*feature.Value - algo.Params.Regularization*model_feature_value)
73 | 			algo.Model[feature.Id] = model_feature_value
74 | 		}
75 | 	}
76 | }
77 | 
78 | func (algo *LogisticRegressionStream) Predict(sample *core.Sample) float64 {
79 | 	ret := 0.0
80 | 	for _, feature := range sample.Features {
81 | 		model_feature_value, ok := algo.Model[feature.Id]
82 | 		if ok {
83 | 			ret += model_feature_value * feature.Value
84 | 		}
85 | 	}
86 | 	return util.Sigmoid(ret)
87 | }
88 | 


--------------------------------------------------------------------------------
/util/math_util.go:
--------------------------------------------------------------------------------
  1 | package util
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"strconv"
  6 | )
  7 | 
  8 | func Sigmoid(x float64)(y float64) {
  9 | 	y = 1 / (1 + math.Exp(-1 * x))
 10 | 	return y
 11 | }
 12 | 
 13 | func UnSigmoid(x float64) float64 {
 14 | 	x = x * 0.99 + 0.01
 15 | 	y := math.Log(x / (1 - x))
 16 | 	return y
 17 | }
 18 | 
 19 | func Signum(x float64) float64 {
 20 | 	ret := 0.0
 21 | 	if x > 0{
 22 | 		ret = 1.0
 23 | 	} else if(x < 0) {
 24 | 		ret = -1.0
 25 | 	} else {
 26 | 		ret = 0.0
 27 | 	}
 28 | 	return ret
 29 | }
 30 | 
 31 | func ParseInt64(str string) int64 {
 32 | 	ret, _ := strconv.ParseInt(str, 10, 64)
 33 | 	return ret
 34 | }
 35 | 
 36 | func ParseFloat64(str string) float64 {
 37 | 	ret, _ := strconv.ParseFloat(str, 64)
 38 | 	return ret
 39 | }
 40 | 
 41 | type Gaussian struct {
 42 | 	Mean, Vari float64
 43 | }
 44 | 
 45 | func (g *Gaussian) Integral(x float64) float64{
 46 | 	a1 := 0.254829592
 47 | 	a2 := -0.284496736
 48 | 	a3 := 1.421413741
 49 | 	a4 := -1.453152027
 50 | 	a5 := 1.061405429
 51 | 	p := 0.3275911
 52 | 
 53 | 	sign := 1.0
 54 | 	if x < 0{
 55 | 		sign = -1.0
 56 | 	}
 57 | 	x = math.Abs(x) / math.Sqrt(2.0)
 58 | 
 59 | 	t := 1.0 / (1.0 + p * x)
 60 | 	y := 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.Exp(-x * x)
 61 | 	return 0.5 * (1.0 + sign * y)
 62 | }
 63 | 
 64 | func (g *Gaussian) AddGaussian(g1 *Gaussian){
 65 | 	g.Mean += g1.Mean
 66 | 	g.Vari += g1.Vari
 67 | }
 68 | 
 69 | func (g *Gaussian) MultGaussian(g1 *Gaussian){
 70 | 	Mean := (g.Mean * g1.Vari + g1.Mean * g.Vari) / (g.Vari + g1.Vari)
 71 | 	Vari := g.Vari * g1.Vari / (g.Vari + g1.Vari)
 72 | 	g.Mean = Mean
 73 | 	g.Vari = Vari
 74 | }
 75 | 
 76 | func (g *Gaussian) Func(x float64) float64{
 77 | 	return math.Exp(-0.5 * x * x) * 0.3989423;
 78 | }
 79 | 
 80 | func (g *Gaussian) UpperTruncateGaussian(Mean, Vari, s float64){
 81 | 	sqrtVari := math.Sqrt(Vari)
 82 | 	a := (s - Mean) / sqrtVari
 83 | 	lambda := a
 84 | 	if a < 4.0 {
 85 | 		lambda = g.Func(a) / g.Integral(-1.0 * a)
 86 | 	}
 87 | 	Mean = Mean + sqrtVari * lambda
 88 | 	if lambda * (lambda - a) > 1{
 89 | 		Vari = 0.0
 90 | 	} else {
 91 | 		Vari *= 1 - lambda * (lambda - a)
 92 | 	}
 93 | 	g.Mean = Mean
 94 | 	g.Vari = Vari
 95 | }
 96 | 
 97 | func (g *Gaussian) LowerTruncateGaussian(Mean, Vari, s float64){
 98 | 	sqrtVari := math.Sqrt(Vari)
 99 | 	a := (s - Mean) / sqrtVari
100 | 	delta := -1.0 * a
101 | 	if a > -4.0 {
102 | 		delta = g.Func(a) / g.Integral(a)
103 | 	}
104 | 	Mean = Mean - sqrtVari * delta
105 | 	if a * delta + delta * delta > 1.0 {
106 | 		Vari = 0.0
107 | 	} else {
108 | 		Vari *= 1 - a * delta - delta * delta
109 | 	}
110 | 	g.Mean = Mean
111 | 	g.Vari = Vari
112 | }


--------------------------------------------------------------------------------
/eval/evaluation.go:
--------------------------------------------------------------------------------
  1 | package eval
  2 | 
  3 | import (
  4 | 	"sort"
  5 | 	"math"
  6 | )
  7 | 
  8 | type LabelPrediction struct {
  9 | 	Prediction float64
 10 | 	Label int
 11 | }
 12 | 
 13 | type RealPrediction struct { // Real valued
 14 | 	Prediction float64
 15 | 	Value float64
 16 | }
 17 | 
 18 | type By func(p1, p2 *LabelPrediction) bool
 19 | 
 20 | type labelPredictionSorter struct {
 21 | 	predictions []*LabelPrediction
 22 | 	by      By
 23 | }
 24 | 
 25 | func (s *labelPredictionSorter) Len() int {
 26 | 	return len(s.predictions)
 27 | }
 28 | 
 29 | func (s *labelPredictionSorter) Swap(i, j int) {
 30 | 	s.predictions[i], s.predictions[j] = s.predictions[j], s.predictions[i]
 31 | }
 32 | 
 33 | func (s *labelPredictionSorter) Less(i, j int) bool {
 34 | 	return s.by(s.predictions[i], s.predictions[j])
 35 | }
 36 | 
 37 | func (by By) Sort(predictions []*LabelPrediction) {
 38 | 	sorter := &labelPredictionSorter{
 39 | 		predictions: predictions,
 40 | 		by:      by,
 41 | 	}
 42 | 	sort.Sort(sorter)
 43 | }
 44 | 
 45 | func AUC(predictions0 []*LabelPrediction) float64 {
 46 | 	predictions := []*LabelPrediction{}
 47 | 	for _, pred := range predictions0{
 48 | 		predictions = append(predictions, pred)
 49 | 	}
 50 | 	prediction := func(p1, p2 *LabelPrediction) bool {
 51 | 		return p1.Prediction > p2.Prediction
 52 | 	}
 53 | 	
 54 | 	By(prediction).Sort(predictions)
 55 | 	
 56 | 	pn := 0.0
 57 | 	nn := float64(len(predictions))
 58 | 	ret := 0.0
 59 | 	count := nn
 60 | 	for i, lp := range predictions{
 61 | 		if lp.Label > 0 {
 62 | 			pn += 1.0
 63 | 			nn -= 1.0
 64 | 			ret += float64(count) - float64(i)
 65 | 		}
 66 | 	}
 67 | 	ret2 := pn * (pn + 1) / 2.0;
 68 | 	if pn * nn == 0.0{
 69 | 		return 0.5
 70 | 	}
 71 | 	return (ret - ret2) / (pn * nn)
 72 | }
 73 | 
 74 | func RMSE(predictions []*LabelPrediction) float64 {
 75 | 	ret := 0.0
 76 | 	n := 0.0
 77 | 
 78 | 	for _, pred := range predictions {
 79 | 		ret += (float64(pred.Label) - pred.Prediction) * (float64(pred.Label) - pred.Prediction)
 80 | 		n += 1.0
 81 | 	}
 82 | 
 83 | 	return math.Sqrt(ret / n)
 84 | }
 85 | 
 86 | func ErrorRate(predictions []*LabelPrediction) float64 {
 87 | 	ret := 0.0
 88 | 	n := 0.0
 89 | 
 90 | 	for _, pred := range predictions {
 91 | 		if (float64(pred.Label) - 0.5) * (pred.Prediction - 0.5) < 0 {
 92 | 			ret += 1.0
 93 | 		}
 94 | 		n += 1.0
 95 | 	}
 96 | 	return ret / n
 97 | }
 98 | 
 99 | func RegRMSE(predictions []*RealPrediction) float64 {
100 | 	ret := 0.0
101 | 	n := 0.0
102 | 
103 | 	for _, pred := range predictions {
104 | 		ret += (pred.Value - pred.Prediction) * (pred.Value - pred.Prediction)
105 | 		n += 1.0
106 | 	}
107 | 
108 | 	return math.Sqrt(ret / n)
109 | }
110 | 
111 | 


--------------------------------------------------------------------------------
/svm/l1vm.go:
--------------------------------------------------------------------------------
  1 | package svm
  2 | 
  3 | import (
  4 | 	"github.com/xlvector/hector/core"
  5 | 	"github.com/xlvector/hector/lr"
  6 | 	"math"
  7 | 	"math/rand"
  8 | 	"strconv"
  9 | )
 10 | 
 11 | func Distance(x, y *core.Vector) float64 {
 12 | 	z := x.Copy()
 13 | 	z.AddVector(y, -1)
 14 | 	d := z.NormL2()
 15 | 	return d
 16 | }
 17 | 
 18 | func RBFKernel(x, y *core.Vector, radius float64) float64 {
 19 | 	d := Distance(x, y)
 20 | 	ret := math.Exp(-1.0 * d / radius)
 21 | 	return ret
 22 | }
 23 | 
 24 | type L1VM struct {
 25 | 	sv     []*core.Vector
 26 | 	ftrl   *lr.FTRLLogisticRegression
 27 | 	radius float64
 28 | 	count  int
 29 | }
 30 | 
 31 | func (self *L1VM) SaveModel(path string) {
 32 | 
 33 | }
 34 | 
 35 | func (self *L1VM) LoadModel(path string) {
 36 | 
 37 | }
 38 | 
 39 | func (c *L1VM) Init(params map[string]string) {
 40 | 	c.ftrl = &(lr.FTRLLogisticRegression{})
 41 | 	c.ftrl.Init(params)
 42 | 	c.radius, _ = strconv.ParseFloat(params["radius"], 64)
 43 | 	count, _ := strconv.ParseInt(params["sv"], 10, 64)
 44 | 	c.count = int(count)
 45 | }
 46 | 
 47 | func (c *L1VM) Predict(sample *core.Sample) float64 {
 48 | 	x := sample.GetFeatureVector()
 49 | 	return c.PredictVector(x)
 50 | }
 51 | 
 52 | func (c *L1VM) PredictVector(x *core.Vector) float64 {
 53 | 	s := core.NewSample()
 54 | 	for k, xs := range c.sv {
 55 | 
 56 | 		s.AddFeature(core.Feature{Id: int64(k), Value: RBFKernel(xs, x, c.radius)})
 57 | 	}
 58 | 	return c.ftrl.Predict(s)
 59 | }
 60 | 
 61 | func (c *L1VM) Train(dataset *core.DataSet) {
 62 | 	c.sv = []*core.Vector{}
 63 | 	kernel_dataset := core.NewDataSet()
 64 | 
 65 | 	positive := []int{}
 66 | 	negative := []int{}
 67 | 	for i, si := range dataset.Samples {
 68 | 		if si.Label > 0.0 {
 69 | 			positive = append(positive, i)
 70 | 		} else {
 71 | 			negative = append(negative, i)
 72 | 		}
 73 | 	}
 74 | 
 75 | 	perm_positive := rand.Perm(len(positive))
 76 | 
 77 | 	for i, k := range perm_positive {
 78 | 		if i > c.count {
 79 | 			break
 80 | 		}
 81 | 		c.sv = append(c.sv, dataset.Samples[positive[k]].GetFeatureVector())
 82 | 	}
 83 | 
 84 | 	perm_negative := rand.Perm(len(negative))
 85 | 
 86 | 	for i, k := range perm_negative {
 87 | 		if i > c.count {
 88 | 			break
 89 | 		}
 90 | 		c.sv = append(c.sv, dataset.Samples[negative[k]].GetFeatureVector())
 91 | 	}
 92 | 
 93 | 	for _, si := range dataset.Samples {
 94 | 		xi := si.GetFeatureVector()
 95 | 		tsample := core.NewSample()
 96 | 		tsample.Label = si.Label
 97 | 		for j, xj := range c.sv {
 98 | 			tsample.AddFeature(core.Feature{Id: int64(j), Value: RBFKernel(xi, xj, c.radius)})
 99 | 		}
100 | 		kernel_dataset.AddSample(tsample)
101 | 	}
102 | 
103 | 	c.ftrl.Train(kernel_dataset)
104 | }
105 | 


--------------------------------------------------------------------------------
/svm/linear_svm.go:
--------------------------------------------------------------------------------
  1 | package svm
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"github.com/xlvector/hector/core"
  7 | 	"github.com/xlvector/hector/util"
  8 | 	"math"
  9 | 	"math/rand"
 10 | 	"os"
 11 | 	"runtime"
 12 | 	"strconv"
 13 | 	"strings"
 14 | )
 15 | 
 16 | /*
 17 | This algorithm implement L1 Linear SVM described in "A Dual Coordinate Descent Method for Large-scale Linear SVM"
 18 | You can download the paper from http://ntu.csie.org/~cjlin/papers/cddual.pdf
 19 | */
 20 | type LinearSVM struct {
 21 | 	sv []*core.Vector
 22 | 	y  []float64
 23 | 	a  []float64
 24 | 	b  float64
 25 | 	C  float64
 26 | 	e  float64
 27 | 	w  *core.Vector
 28 | 
 29 | 	xx []float64
 30 | }
 31 | 
 32 | func (self *LinearSVM) SaveModel(path string) {
 33 | 	sb := util.StringBuilder{}
 34 | 	for f, g := range self.w.Data {
 35 | 		sb.Int64(f)
 36 | 		sb.Write("\t")
 37 | 		sb.Float(g)
 38 | 		sb.Write("\n")
 39 | 	}
 40 | 	sb.WriteToFile(path)
 41 | }
 42 | 
 43 | func (self *LinearSVM) LoadModel(path string) {
 44 | 	file, _ := os.Open(path)
 45 | 	defer file.Close()
 46 | 
 47 | 	scaner := bufio.NewScanner(file)
 48 | 	for scaner.Scan() {
 49 | 		line := scaner.Text()
 50 | 		tks := strings.Split(line, "\t")
 51 | 		fid, _ := strconv.ParseInt(tks[0], 10, 64)
 52 | 		fw, _ := strconv.ParseFloat(tks[1], 64)
 53 | 		self.w.SetValue(fid, fw)
 54 | 	}
 55 | }
 56 | 
 57 | func (c *LinearSVM) Init(params map[string]string) {
 58 | 	c.C, _ = strconv.ParseFloat(params["c"], 64)
 59 | 	c.e, _ = strconv.ParseFloat(params["e"], 64)
 60 | 
 61 | 	c.w = core.NewVector()
 62 | }
 63 | 
 64 | func (c *LinearSVM) Predict(sample *core.Sample) float64 {
 65 | 	x := sample.GetFeatureVector()
 66 | 	return c.PredictVector(x)
 67 | }
 68 | 
 69 | func (c *LinearSVM) PredictVector(x *core.Vector) float64 {
 70 | 	ret := c.w.Dot(x)
 71 | 	return ret
 72 | }
 73 | 
 74 | func (c *LinearSVM) Train(dataset *core.DataSet) {
 75 | 	c.sv = []*core.Vector{}
 76 | 	c.y = []float64{}
 77 | 	c.a = []float64{}
 78 | 	for k, sample := range dataset.Samples {
 79 | 		x := sample.GetFeatureVector()
 80 | 		c.sv = append(c.sv, x)
 81 | 		c.xx = append(c.xx, x.Dot(x))
 82 | 		if sample.Label > 0.0 {
 83 | 			c.y = append(c.y, 1.0)
 84 | 		} else {
 85 | 			c.y = append(c.y, -1.0)
 86 | 		}
 87 | 		c.a = append(c.a, c.C*rand.Float64()*0.0)
 88 | 		c.w.AddVector(x, c.y[k]*c.a[k])
 89 | 	}
 90 | 
 91 | 	da0 := 0.0
 92 | 	for {
 93 | 		da := 0.0
 94 | 		for i, ai := range c.a {
 95 | 			g := c.y[i]*c.w.Dot(c.sv[i]) - 1.0
 96 | 			pg := g
 97 | 			if ai < 1e-9 {
 98 | 				pg = math.Min(0.0, g)
 99 | 			} else if ai > c.C-1e-9 {
100 | 				pg = math.Max(0.0, g)
101 | 			}
102 | 
103 | 			if math.Abs(pg) > 1e-9 {
104 | 				ai0 := ai
105 | 				ai = math.Min(math.Max(0, ai-g/c.xx[i]), c.C)
106 | 				c.w.AddVector(c.sv[i], (ai-ai0)*c.y[i])
107 | 				da += math.Abs(ai - ai0)
108 | 			}
109 | 		}
110 | 		da /= float64(len(c.a))
111 | 		fmt.Println(da)
112 | 		if da < c.e || math.Abs(da-da0) < 1e-3 {
113 | 			break
114 | 		}
115 | 		da0 = da
116 | 	}
117 | 
118 | 	c.sv = nil
119 | 	runtime.GC()
120 | }
121 | 


--------------------------------------------------------------------------------
/core/array_vector.go:
--------------------------------------------------------------------------------
  1 | package core
  2 | 
  3 | import (
  4 | 	"github.com/xlvector/hector/util"
  5 | 	"math"
  6 | 	"strconv"
  7 | 	"strings"
  8 | )
  9 | 
 10 | type ArrayVector struct {
 11 | 	data []float64
 12 | }
 13 | 
 14 | func NewArrayVector() *ArrayVector {
 15 | 	v := ArrayVector{}
 16 | 	v.data = []float64{}
 17 | 	return &v
 18 | }
 19 | 
 20 | func (v *ArrayVector) ToString() []byte {
 21 | 	sb := util.StringBuilder{}
 22 | 	for _, value := range v.data {
 23 | 		sb.Float(value)
 24 | 		sb.Write("|")
 25 | 	}
 26 | 	return sb.Bytes()
 27 | }
 28 | 
 29 | func (v *ArrayVector) FromString(buf string) {
 30 | 	tks := strings.Split(buf, "|")
 31 | 	for _, tk := range tks {
 32 | 		if len(tk) == 0 {
 33 | 			continue
 34 | 		}
 35 | 		value, _ := strconv.ParseFloat(tk, 64)
 36 | 		v.data = append(v.data, value)
 37 | 	}
 38 | }
 39 | 
 40 | func (v *ArrayVector) Expand(size int) {
 41 | 	for len(v.data) < size {
 42 | 		v.data = append(v.data, 0.0)
 43 | 	}
 44 | }
 45 | 
 46 | func (v *ArrayVector) AddValue(key int, value float64) {
 47 | 	v.Expand(key + 1)
 48 | 	v.data[key] += value
 49 | }
 50 | 
 51 | func (v *ArrayVector) GetValue(key int) float64 {
 52 | 	if key >= len(v.data) {
 53 | 		return 0.0
 54 | 	} else {
 55 | 		return v.data[key]
 56 | 	}
 57 | }
 58 | 
 59 | func (v *ArrayVector) SetValue(key int, value float64) {
 60 | 	v.Expand(key + 1)
 61 | 	v.data[key] = value
 62 | }
 63 | 
 64 | func (v *ArrayVector) AddVector(v2 *ArrayVector, alpha float64) {
 65 | 	for key, value := range v2.data {
 66 | 		v.AddValue(key, value*alpha)
 67 | 	}
 68 | }
 69 | 
 70 | func (v *ArrayVector) NormL2() float64 {
 71 | 	ret := 0.0
 72 | 	for _, val := range v.data {
 73 | 		ret += val * val
 74 | 	}
 75 | 	return ret
 76 | }
 77 | 
 78 | func (v *ArrayVector) Copy() *ArrayVector {
 79 | 	ret := NewArrayVector()
 80 | 	for key, val := range v.data {
 81 | 		ret.SetValue(key, val)
 82 | 	}
 83 | 	return ret
 84 | }
 85 | 
 86 | func (v *ArrayVector) KeyWithMaxValue() (int, float64) {
 87 | 	ret := 0
 88 | 	max_val := 0.0
 89 | 	for key, val := range v.data {
 90 | 		max_val = val
 91 | 		ret = key
 92 | 		break
 93 | 	}
 94 | 	for key, val := range v.data {
 95 | 		if max_val < val {
 96 | 			max_val = val
 97 | 			ret = key
 98 | 		}
 99 | 	}
100 | 	return ret, max_val
101 | }
102 | 
103 | func (v *ArrayVector) Sum() float64 {
104 | 	ret := 0.0
105 | 	for _, val := range v.data {
106 | 		ret += val
107 | 	}
108 | 	return ret
109 | }
110 | 
111 | func (v *ArrayVector) Dot(v2 *ArrayVector) float64 {
112 | 	va := v
113 | 	vb := v2
114 | 
115 | 	if len(v2.data) < len(v.data) {
116 | 		va = v2
117 | 		vb = v
118 | 	}
119 | 	ret := 0.0
120 | 	for key, a := range va.data {
121 | 		b := vb.data[key]
122 | 		ret += a * b
123 | 	}
124 | 	return ret
125 | }
126 | 
127 | func (v *ArrayVector) Scale(s float64) {
128 | 	for i, _ := range v.data {
129 | 		v.data[i] *= s
130 | 	}
131 | }
132 | 
133 | func (v *ArrayVector) SoftMaxNorm() *ArrayVector {
134 | 	sum := 0.0
135 | 	for _, val := range v.data {
136 | 		sum += math.Exp(val)
137 | 	}
138 | 	ret := NewArrayVector()
139 | 	for key, val := range v.data {
140 | 		ret.SetValue(key, math.Exp(val)/sum)
141 | 	}
142 | 	return ret
143 | }
144 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | hector
 2 | ======
 3 | 
 4 | Golang machine learning lib. Currently, it can be used to solve binary classification problems.
 5 | 
 6 | # Supported Algorithms
 7 | 
 8 | 1. Logistic Regression
 9 | 2. Factorized Machine
10 | 3. CART, Random Forest, Random Decision Tree, Gradient Boosting Decision Tree
11 | 4. Neural Network
12 | 
13 | # Dataset Format
14 | 
15 | Hector support libsvm-like data format. Following is an sample dataset
16 | 
17 | 	1 	1:0.7 3:0.1 9:0.4
18 | 	0	2:0.3 4:0.9 7:0.5
19 | 	0	2:0.7 5:0.3
20 | 	...
21 | 
22 | # How to Run
23 | 
24 | ## Run as tools
25 | 
26 | hector-cv.go will help you test one algorithm by cross validation in some dataset, you can run it by following steps:
27 | 
28 | 	go get github.com/xlvector/hector
29 | 	go install github.com/xlvector/hector/hectorcv
30 | 	hectorcv --method [Method] --train [Data Path] --cv 10
31 | 
32 | Here, Method include
33 | 
34 | 1. lr : logistic regression with SGD and L2 regularization.
35 | 2. ftrl : FTRL-proximal logistic regreesion with L1 regularization. Please review this paper for more details "Ad Click Prediction: a View from the Trenches".
36 | 3. ep : bayesian logistic regression with expectation propagation. Please review this paper for more details "Web-Scale Bayesian Click-Through Rate Prediction for Sponsored Search Advertising in Microsoft’s Bing Search Engine"
37 | 4. fm : factorization machine
38 | 5. cart : classifiaction tree
39 | 6. cart-regression : regression tree
40 | 7. rf : random forest
41 | 8. rdt : random decision trees
42 | 9. gbdt : gradient boosting decisio tree
43 | 10. linear-svm : linear svm with L1 regularization
44 | 11. svm : svm optimizaed by SMO (current, its linear svm)
45 | 12. l1vm : vector machine with L1 regularization by RBF kernel
46 | 13. knn : k-nearest neighbor classification
47 | 
48 | hector-run.go will help you train one algorithm on train dataset and test it on test dataset, you can run it by following steps:
49 | 
50 | 	cd src
51 | 	go build hector-run.go
52 | 	./hector-run --method [Method] --train [Data Path] --test [Data Path]
53 | 
54 | Above methods will direct train algorithm on train dataset and then test on test dataset. If you want to train algorithm and get the model file, you can run it by following steps:
55 | 
56 | 	./hector-run --method [Method] --action train --train [Data Path] --model [Model Path]
57 | 
58 | Then, you can use model file to test any test dataset:
59 | 
60 | 	./hector-run --method [Method] --action test --test [Data Path] --model [Model Path]
61 | 
62 | # Benchmark
63 | 
64 | ## Binary Classification
65 | 
66 | Following are datasets used in benchmarks, You can find them from [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/)
67 | 
68 | 1. heart
69 | 2. fourclass
70 | 
71 | I will do 5-fold cross validation on the dataset, and use AUC as evaluation metric. Following are the results:
72 | 
73 | DataSet | Method | AUC
74 | ------- | ------ | ---
75 | heart   | FTRL-LR   |0.9109
76 | heart   | EP-LR | 0.8982
77 | heart | CART | 0.8231
78 | heart | RDT | 0.9155
79 | heart | RF | 0.9019
80 | heart | GBDT | 0.9061
81 | fourclass | FTRL-LR | 0.8281
82 | fourclass | EP-LR | 0.7986
83 | fourclass | CART | 0.9832
84 | fourclass | RDT | 0.9925
85 | fourclass | RF | 0.9947
86 | fourclass | GBDT | 0.9958
87 | 
88 | 


--------------------------------------------------------------------------------
/mc_runner.go:
--------------------------------------------------------------------------------
  1 | /*
  2 | Package hector is a golang based machine learning lib. It intend to implement all famous machine learning algoirhtms by golang.
  3 | Currently, it only support algorithms which can solve binary classification problems. Supported algorithms include:
  4 | 1. Decision Tree (CART, Random Forest, GBDT)
  5 | 2. Logistic Regression
  6 | 3. SVM
  7 | 4. Neural Network
  8 | */
  9 | package hector
 10 | 
 11 | import (
 12 | 	"github.com/xlvector/hector/algo"
 13 | 	"github.com/xlvector/hector/core"
 14 | 	"os"
 15 | 	"strconv"
 16 | )
 17 | 
 18 | func MultiClassRun(classifier algo.MultiClassClassifier, train_path string, test_path string, pred_path string, params map[string]string) (float64, error) {
 19 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
 20 | 	train_dataset := core.NewDataSet()
 21 | 
 22 | 	err := train_dataset.Load(train_path, global)
 23 | 
 24 | 	if err != nil {
 25 | 		return 0.5, err
 26 | 	}
 27 | 
 28 | 	test_dataset := core.NewDataSet()
 29 | 	err = test_dataset.Load(test_path, global)
 30 | 	if err != nil {
 31 | 		return 0.5, err
 32 | 	}
 33 | 	classifier.Init(params)
 34 | 	accuracy := MultiClassRunOnDataSet(classifier, train_dataset, test_dataset, pred_path, params)
 35 | 
 36 | 	return accuracy, nil
 37 | }
 38 | 
 39 | func MultiClassTrain(classifier algo.MultiClassClassifier, train_path string, params map[string]string) error {
 40 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
 41 | 	train_dataset := core.NewDataSet()
 42 | 
 43 | 	err := train_dataset.Load(train_path, global)
 44 | 
 45 | 	if err != nil {
 46 | 		return err
 47 | 	}
 48 | 
 49 | 	classifier.Init(params)
 50 | 	classifier.Train(train_dataset)
 51 | 
 52 | 	model_path, _ := params["model"]
 53 | 
 54 | 	if model_path != "" {
 55 | 		classifier.SaveModel(model_path)
 56 | 	}
 57 | 
 58 | 	return nil
 59 | }
 60 | 
 61 | func MultiClassTest(classifier algo.MultiClassClassifier, test_path string, pred_path string, params map[string]string) (float64, error) {
 62 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
 63 | 
 64 | 	model_path, _ := params["model"]
 65 | 	classifier.Init(params)
 66 | 	if model_path != "" {
 67 | 		classifier.LoadModel(model_path)
 68 | 	} else {
 69 | 		return 0.0, nil
 70 | 	}
 71 | 
 72 | 	test_dataset := core.NewDataSet()
 73 | 	err := test_dataset.Load(test_path, global)
 74 | 	if err != nil {
 75 | 		return 0.0, err
 76 | 	}
 77 | 
 78 | 	accuracy := MultiClassRunOnDataSet(classifier, nil, test_dataset, pred_path, params)
 79 | 
 80 | 	return accuracy, nil
 81 | }
 82 | 
 83 | func MultiClassRunOnDataSet(classifier algo.MultiClassClassifier, train_dataset, test_dataset *core.DataSet, pred_path string, params map[string]string) float64 {
 84 | 
 85 | 	if train_dataset != nil {
 86 | 		classifier.Train(train_dataset)
 87 | 	}
 88 | 
 89 | 	var pred_file *os.File
 90 | 	if pred_path != "" {
 91 | 		pred_file, _ = os.Create(pred_path)
 92 | 	}
 93 | 	accuracy := 0.0
 94 | 	total := 0.0
 95 | 	for _, sample := range test_dataset.Samples {
 96 | 		prediction := classifier.PredictMultiClass(sample)
 97 | 		label, _ := prediction.KeyWithMaxValue()
 98 | 		if int(label) == sample.Label {
 99 | 			accuracy += 1.0
100 | 		}
101 | 		total += 1.0
102 | 		if pred_file != nil {
103 | 			pred_file.WriteString(strconv.Itoa(label) + "\n")
104 | 		}
105 | 	}
106 | 	if pred_path != "" {
107 | 		defer pred_file.Close()
108 | 	}
109 | 
110 | 	return accuracy / total
111 | }
112 | 


--------------------------------------------------------------------------------
/core/sample.go:
--------------------------------------------------------------------------------
  1 | package core
  2 | 
  3 | import (
  4 | 	"github.com/xlvector/hector/util"
  5 | )
  6 | 
  7 | /*
  8 | Sample - for classification
  9 | Here, label should be int value started from 0
 10 | */
 11 | type Sample struct {
 12 | 	Features   []Feature
 13 | 	Label      int
 14 | 	Prediction float64
 15 | }
 16 | 
 17 | func NewSample() *Sample {
 18 | 	ret := Sample{}
 19 | 	ret.Features = []Feature{}
 20 | 	ret.Label = 0
 21 | 	ret.Prediction = 0.0
 22 | 	return &ret
 23 | }
 24 | 
 25 | func (s *Sample) Clone() *Sample {
 26 | 	ret := NewSample()
 27 | 	ret.Label = s.Label
 28 | 	ret.Prediction = s.Prediction
 29 | 	for _, feature := range s.Features {
 30 | 		clone_feature := Feature{feature.Id, feature.Value}
 31 | 		ret.Features = append(ret.Features, clone_feature)
 32 | 	}
 33 | 
 34 | 	return ret
 35 | }
 36 | 
 37 | func (s *Sample) ToString(includePrediction bool) []byte {
 38 | 	sb := util.StringBuilder{}
 39 | 	sb.Int(s.Label)
 40 | 	sb.Write(" ")
 41 | 	if includePrediction {
 42 | 		sb.Float(s.Prediction)
 43 | 		sb.Write(" ")
 44 | 	}
 45 | 	for _, feature := range s.Features {
 46 | 		sb.Int64(feature.Id)
 47 | 		sb.Write(":")
 48 | 		sb.Float(feature.Value)
 49 | 		sb.Write(" ")
 50 | 	}
 51 | 	return sb.Bytes()
 52 | }
 53 | 
 54 | func (s *Sample) LabelDoubleValue() float64 {
 55 | 	if s.Label > 0 {
 56 | 		return 1.0
 57 | 	} else {
 58 | 		return 0.0
 59 | 	}
 60 | }
 61 | 
 62 | func (s *Sample) AddFeature(f Feature) {
 63 | 	s.Features = append(s.Features, f)
 64 | }
 65 | 
 66 | /* RawSample */
 67 | type RawSample struct {
 68 | 	Label      int
 69 | 	Prediction float64
 70 | 	Features   map[string]string
 71 | }
 72 | 
 73 | func NewRawSample() *RawSample {
 74 | 	ret := RawSample{}
 75 | 	ret.Features = make(map[string]string)
 76 | 	ret.Label = 0
 77 | 	ret.Prediction = 0.0
 78 | 	return &ret
 79 | }
 80 | 
 81 | func (s *RawSample) GetFeatureValue(key string) string {
 82 | 	value, ok := s.Features[key]
 83 | 	if ok {
 84 | 		return value
 85 | 	} else {
 86 | 		return "nil"
 87 | 	}
 88 | }
 89 | 
 90 | /* MapBasedSample */
 91 | type MapBasedSample struct {
 92 | 	Label      int
 93 | 	Prediction float64
 94 | 	Features   map[int64]float64
 95 | }
 96 | 
 97 | func (s *MapBasedSample) LabelDoubleValue() float64 {
 98 | 	return float64(s.Label)
 99 | }
100 | 
101 | func (s *Sample) ToMapBasedSample() *MapBasedSample {
102 | 	ret := MapBasedSample{}
103 | 	ret.Features = make(map[int64]float64)
104 | 	ret.Label = s.Label
105 | 	ret.Prediction = s.Prediction
106 | 	for _, feature := range s.Features {
107 | 		ret.Features[feature.Id] = feature.Value
108 | 	}
109 | 	return &ret
110 | }
111 | 
112 | func (s *Sample) GetFeatureVector() *Vector {
113 | 	ret := NewVector()
114 | 	for _, f := range s.Features {
115 | 		ret.SetValue(f.Id, f.Value)
116 | 	}
117 | 	return ret
118 | }
119 | 
120 | /*
121 | RealSample
122 | Real valued samples for regression
123 | */
124 | type RealSample struct {
125 | 	Features   []Feature
126 | 	Prediction float64
127 | 	Value      float64
128 | }
129 | 
130 | func NewRealSample() *RealSample {
131 | 	ret := RealSample{}
132 | 	ret.Features = []Feature{}
133 | 	ret.Value = 0.0
134 | 	ret.Prediction = 0.0
135 | 	return &ret
136 | }
137 | 
138 | func (rs *RealSample) GetFeatureVector() *Vector {
139 | 	ret := NewVector()
140 | 	for _, f := range rs.Features {
141 | 		ret.SetValue(f.Id, f.Value)
142 | 	}
143 | 	return ret
144 | }
145 | 
146 | func (s *RealSample) AddFeature(f Feature) {
147 | 	s.Features = append(s.Features, f)
148 | }
149 | 


--------------------------------------------------------------------------------
/lr/ftrl_logistic_regression.go:
--------------------------------------------------------------------------------
  1 | package lr
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"github.com/xlvector/hector/core"
  6 | 	"github.com/xlvector/hector/util"
  7 | 	"math"
  8 | 	"os"
  9 | 	"strconv"
 10 | 	"strings"
 11 | )
 12 | 
 13 | type FTRLLogisticRegressionParams struct {
 14 | 	Alpha, Beta, Lambda1, Lambda2 float64
 15 | 	Steps                         int
 16 | }
 17 | 
 18 | type FTRLFeatureWeight struct {
 19 | 	ni, zi float64
 20 | }
 21 | 
 22 | func (w *FTRLFeatureWeight) Wi(p FTRLLogisticRegressionParams) float64 {
 23 | 	wi := 0.0
 24 | 	if math.Abs(w.zi) > p.Lambda1 {
 25 | 		wi = (util.Signum(w.zi)*p.Lambda1 - w.zi) / (p.Lambda2 + (p.Beta+math.Sqrt(w.ni))/p.Alpha)
 26 | 	}
 27 | 	return wi
 28 | }
 29 | 
 30 | type FTRLLogisticRegression struct {
 31 | 	Model  map[int64]FTRLFeatureWeight
 32 | 	Params FTRLLogisticRegressionParams
 33 | }
 34 | 
 35 | func (algo *FTRLLogisticRegression) SaveModel(path string) {
 36 | 	sb := util.StringBuilder{}
 37 | 	for f, g := range algo.Model {
 38 | 		sb.Int64(f)
 39 | 		sb.Write("\t")
 40 | 		sb.Float(g.ni)
 41 | 		sb.Write("\t")
 42 | 		sb.Float(g.zi)
 43 | 		sb.Write("\n")
 44 | 	}
 45 | 	sb.WriteToFile(path)
 46 | }
 47 | 
 48 | func (algo *FTRLLogisticRegression) LoadModel(path string) {
 49 | 	file, _ := os.Open(path)
 50 | 	defer file.Close()
 51 | 
 52 | 	scaner := bufio.NewScanner(file)
 53 | 	for scaner.Scan() {
 54 | 		line := scaner.Text()
 55 | 		tks := strings.Split(line, "\t")
 56 | 		fid, _ := strconv.ParseInt(tks[0], 10, 64)
 57 | 		ni, _ := strconv.ParseFloat(tks[1], 64)
 58 | 		zi, _ := strconv.ParseFloat(tks[2], 64)
 59 | 		g := FTRLFeatureWeight{ni: ni, zi: zi}
 60 | 		algo.Model[fid] = g
 61 | 	}
 62 | }
 63 | 
 64 | func (algo *FTRLLogisticRegression) Predict(sample *core.Sample) float64 {
 65 | 	ret := 0.0
 66 | 	for _, feature := range sample.Features {
 67 | 		model_feature_value, ok := algo.Model[feature.Id]
 68 | 		if ok {
 69 | 			ret += model_feature_value.Wi(algo.Params) * feature.Value
 70 | 		}
 71 | 	}
 72 | 	return util.Sigmoid(ret)
 73 | }
 74 | 
 75 | func (algo *FTRLLogisticRegression) Init(params map[string]string) {
 76 | 	algo.Model = make(map[int64]FTRLFeatureWeight)
 77 | 	algo.Params.Alpha, _ = strconv.ParseFloat(params["alpha"], 64)
 78 | 	algo.Params.Lambda1, _ = strconv.ParseFloat(params["lambda1"], 64)
 79 | 	algo.Params.Lambda2, _ = strconv.ParseFloat(params["lambda2"], 64)
 80 | 	algo.Params.Beta, _ = strconv.ParseFloat(params["beta"], 64)
 81 | 	steps, _ := strconv.ParseInt(params["steps"], 10, 32)
 82 | 	algo.Params.Steps = int(steps)
 83 | }
 84 | 
 85 | func (algo *FTRLLogisticRegression) Clear() {
 86 | 	algo.Model = nil
 87 | 	algo.Model = make(map[int64]FTRLFeatureWeight)
 88 | }
 89 | 
 90 | func (algo *FTRLLogisticRegression) Train(dataset *core.DataSet) {
 91 | 	for step := 0; step < algo.Params.Steps; step++ {
 92 | 		for _, sample := range dataset.Samples {
 93 | 			prediction := algo.Predict(sample)
 94 | 			err := sample.LabelDoubleValue() - prediction
 95 | 			for _, feature := range sample.Features {
 96 | 				model_feature_value, ok := algo.Model[feature.Id]
 97 | 				if !ok {
 98 | 					model_feature_value = FTRLFeatureWeight{0.0, 0.0}
 99 | 				}
100 | 				zi := model_feature_value.zi
101 | 				ni := model_feature_value.ni
102 | 				gi := -1 * err * feature.Value
103 | 				sigma := (math.Sqrt(ni+gi*gi) - math.Sqrt(ni)) / algo.Params.Alpha
104 | 				wi := model_feature_value.Wi(algo.Params)
105 | 				zi += gi - sigma*wi
106 | 				ni += gi * gi
107 | 				algo.Model[feature.Id] = FTRLFeatureWeight{zi: zi, ni: ni}
108 | 			}
109 | 		}
110 | 	}
111 | }
112 | 


--------------------------------------------------------------------------------
/combine/category_feature_combination.go:
--------------------------------------------------------------------------------
  1 | package combine
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"github.com/xlvector/hector/core"
  6 | 	"github.com/xlvector/hector/eval"
  7 | 	"github.com/xlvector/hector/lr"
  8 | 	"math/rand"
  9 | )
 10 | 
 11 | type CategoryFeatureCombination struct {
 12 | 	algo                 *lr.EPLogisticRegression
 13 | 	feature_combinations []core.CombinedFeature
 14 | 	output               string
 15 | }
 16 | 
 17 | func (c *CategoryFeatureCombination) Init(params map[string]string) {
 18 | 	c.algo = &(lr.EPLogisticRegression{})
 19 | 	c.algo.Init(params)
 20 | 	c.output = params["output"]
 21 | }
 22 | 
 23 | func (c *CategoryFeatureCombination) OneCVAUC(dataset0 *core.RawDataSet, combines []core.CombinedFeature, total_cv, cv int) float64 {
 24 | 	dataset := dataset0.ToDataSet(nil, combines)
 25 | 
 26 | 	train := dataset.Split(func(i int) bool { return i%total_cv != cv })
 27 | 
 28 | 	c.algo.Train(train)
 29 | 
 30 | 	test := dataset.Split(func(i int) bool { return i%total_cv == cv })
 31 | 
 32 | 	predictions := []*eval.LabelPrediction{}
 33 | 	for _, sample := range test.Samples {
 34 | 		pred := c.algo.Predict(sample)
 35 | 		lp := eval.LabelPrediction{Label: sample.Label, Prediction: pred}
 36 | 		predictions = append(predictions, &lp)
 37 | 	}
 38 | 	auc := eval.AUC(predictions)
 39 | 	c.algo.Clear()
 40 | 	return auc
 41 | }
 42 | 
 43 | func (c *CategoryFeatureCombination) FindCombination(dataset *core.RawDataSet) []core.CombinedFeature {
 44 | 	features := []string{}
 45 | 	for fkey, _ := range dataset.FeatureKeys {
 46 | 		features = append(features, fkey)
 47 | 	}
 48 | 	candidate_column_combines := []core.CombinedFeature{}
 49 | 	c.feature_combinations = []core.CombinedFeature{}
 50 | 
 51 | 	for i, fi := range features {
 52 | 		c.feature_combinations = append(c.feature_combinations, core.CombinedFeature{fi})
 53 | 		for j, fj := range features[i+1:] {
 54 | 			candidate_column_combines = append(candidate_column_combines, core.CombinedFeature{fi, fj})
 55 | 			for k, fk := range features[i+j+1:] {
 56 | 				candidate_column_combines = append(candidate_column_combines, core.CombinedFeature{fi, fj, fk})
 57 | 				for _, ft := range features[i+j+k+1:] {
 58 | 					candidate_column_combines = append(candidate_column_combines, core.CombinedFeature{fi, fj, fk, ft})
 59 | 				}
 60 | 			}
 61 | 		}
 62 | 	}
 63 | 	fmt.Printf("candidates %d\n", len(candidate_column_combines))
 64 | 	used_combines := make(map[int]bool)
 65 | 
 66 | 	total_cv := 3
 67 | 
 68 | 	best_auc := 0.0
 69 | 	best_combines := -1
 70 | 	for {
 71 | 		if len(used_combines) == len(candidate_column_combines) {
 72 | 			break
 73 | 		}
 74 | 		ok := false
 75 | 		for i, column_combines := range candidate_column_combines {
 76 | 			_, used := used_combines[i]
 77 | 			if used {
 78 | 				continue
 79 | 			}
 80 | 			temp_combines := c.feature_combinations
 81 | 			temp_combines = append(temp_combines, column_combines)
 82 | 
 83 | 			ave_auc := 0.0
 84 | 			for cv := 0; cv < total_cv; cv++ {
 85 | 				ave_auc += c.OneCVAUC(dataset, temp_combines, total_cv, cv)
 86 | 			}
 87 | 			ave_auc /= float64(total_cv)
 88 | 			if best_auc < ave_auc {
 89 | 				best_auc = ave_auc
 90 | 				best_combines = i
 91 | 				ok = true
 92 | 				if rand.Intn(10) == 1 {
 93 | 					break
 94 | 				}
 95 | 			}
 96 | 		}
 97 | 		if !ok {
 98 | 			break
 99 | 		}
100 | 		used_combines[best_combines] = true
101 | 		c.feature_combinations = append(c.feature_combinations, candidate_column_combines[best_combines])
102 | 		fmt.Println(best_auc)
103 | 		fmt.Println(c.feature_combinations)
104 | 	}
105 | 
106 | 	return c.feature_combinations
107 | }
108 | 


--------------------------------------------------------------------------------
/lr/quasinewton_helper.go:
--------------------------------------------------------------------------------
  1 | package lr
  2 | 
  3 | import (
  4 | 	"github.com/xlvector/hector/core"
  5 | 	"math"
  6 | )
  7 | 
  8 | /**
  9 |  * It's based the paper "Scalable Training of L1-Regularized Log-Linear Models"
 10 |  * by Galen Andrew and Jianfeng Gao
 11 |  * user: weixuan
 12 |  * To change this template use File | Settings | File Templates.
 13 |  */
 14 | type QuasiNewtonHelper struct {
 15 | 	// config
 16 | 	numHist   int64
 17 | 	minimizer Minimizer
 18 | 	// historical data
 19 | 	sList, yList    []*core.Vector
 20 | 	roList          []float64
 21 | 	curPos, curGrad *core.Vector
 22 | }
 23 | 
 24 | type Minimizer interface {
 25 | 	NextPoint(curPos *core.Vector, dir *core.Vector, alpha float64) *core.Vector
 26 | 	Evaluate(curPos *core.Vector) float64
 27 | }
 28 | 
 29 | const MAX_BACKTRACKING_ITER = 50
 30 | 
 31 | // Description: the pos and gradient arguments should NOT be modified outside
 32 | func NewQuasiNewtonHelper(numHist int, minimizer Minimizer, curPos *core.Vector, curGrad *core.Vector) *QuasiNewtonHelper {
 33 | 	h := new(QuasiNewtonHelper)
 34 | 	h.numHist = int64(numHist)
 35 | 	h.minimizer = minimizer
 36 | 	h.curPos = curPos
 37 | 	h.curGrad = curGrad
 38 | 	h.sList = make([]*core.Vector, 0)
 39 | 	h.yList = make([]*core.Vector, 0)
 40 | 	h.roList = make([]float64, 0)
 41 | 	return h
 42 | }
 43 | 
 44 | // Description: Update the dir from -grad to optimal direction
 45 | //              Dir will be modified directly
 46 | func (h *QuasiNewtonHelper) ApplyQuasiInverseHession(dir *core.Vector) {
 47 | 	count := len(h.sList)
 48 | 	if count == 0 {
 49 | 		return
 50 | 	}
 51 | 	alphas := make([]float64, count, count)
 52 | 	for n := count - 1; n >= 0; n-- {
 53 | 		alphas[n] = -dir.Dot(h.sList[n]) / h.roList[n]
 54 | 		dir.ApplyElemWiseMultiplyAccumulation(h.yList[n], alphas[n])
 55 | 	}
 56 | 	lastY := h.yList[count-1]
 57 | 	yDotY := lastY.Dot(lastY)
 58 | 	scalar := h.roList[count-1] / yDotY
 59 | 	dir.ApplyScale(scalar)
 60 | 
 61 | 	for n := 0; n < count; n++ {
 62 | 		beta := dir.Dot(h.yList[n]) / h.roList[n]
 63 | 		dir.ApplyElemWiseMultiplyAccumulation(h.sList[n], -alphas[n]-beta)
 64 | 	}
 65 | 	return
 66 | }
 67 | 
 68 | func (h *QuasiNewtonHelper) BackTrackingLineSearch(cost float64, pos *core.Vector, grad *core.Vector, dir *core.Vector, isInit bool) (nextCost float64, nextPos *core.Vector) {
 69 | 	dotGradDir := grad.Dot(dir)
 70 | 	if dotGradDir == 0 {
 71 | 		return cost, pos
 72 | 	}
 73 | 	if dotGradDir > 0 {
 74 | 		panic("BackTracking: to the opposite direction of grad")
 75 | 	}
 76 | 
 77 | 	alpha := 1.0
 78 | 	backoff := 0.5
 79 | 	if isInit {
 80 | 		normDir := math.Sqrt(dir.Dot(dir))
 81 | 		alpha = (1 / normDir)
 82 | 		backoff = 0.1
 83 | 	}
 84 | 
 85 | 	var c1 float64 = 1e-4
 86 | 	for cntItr := 0; cntItr <= MAX_BACKTRACKING_ITER; cntItr++ {
 87 | 		nextPos = h.minimizer.NextPoint(pos, dir, alpha)
 88 | 		nextCost = h.minimizer.Evaluate(nextPos)
 89 | 		if nextCost <= cost+c1*dotGradDir*alpha {
 90 | 			break
 91 | 		}
 92 | 		alpha *= backoff
 93 | 	}
 94 | 	return nextCost, nextPos
 95 | }
 96 | 
 97 | // Description: the pos and gradient arguments should NOT be modified outside
 98 | func (h *QuasiNewtonHelper) UpdateState(nextPos *core.Vector, nextGrad *core.Vector) (isOptimal bool) {
 99 | 	if int64(len(h.sList)) >= h.numHist {
100 | 		h.sList = h.sList[1:]
101 | 		h.yList = h.yList[1:]
102 | 		h.roList = h.roList[1:]
103 | 	}
104 | 	newS := nextPos.ElemWiseMultiplyAdd(h.curPos, -1)
105 | 	newY := nextGrad.ElemWiseMultiplyAdd(h.curGrad, -1)
106 | 	ro := newS.Dot(newY)
107 | 	h.sList = append(h.sList, newS)
108 | 	h.yList = append(h.yList, newY)
109 | 	h.roList = append(h.roList, ro)
110 | 	h.curPos = nextPos
111 | 	h.curGrad = nextGrad
112 | 	return ro == 0
113 | }
114 | 


--------------------------------------------------------------------------------
/dt/random_forest.go:
--------------------------------------------------------------------------------
  1 | package dt
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"log"
  7 | 	"os"
  8 | 	"strconv"
  9 | 	"strings"
 10 | 	"sync"
 11 | 
 12 | 	"github.com/xlvector/hector/core"
 13 | )
 14 | 
 15 | type RandomForestParams struct {
 16 | 	TreeCount    int
 17 | 	FeatureCount float64
 18 | }
 19 | 
 20 | type RandomForest struct {
 21 | 	trees               []*Tree
 22 | 	params              RandomForestParams
 23 | 	cart                CART
 24 | 	continuous_features bool
 25 | }
 26 | 
 27 | func (self *RandomForest) SaveModel(path string) {
 28 | 	file, _ := os.Create(path)
 29 | 	defer file.Close()
 30 | 	for _, tree := range self.trees {
 31 | 		buf := tree.ToString()
 32 | 		file.Write(buf)
 33 | 		file.WriteString("\n#\n")
 34 | 	}
 35 | }
 36 | 
 37 | func (self *RandomForest) LoadModel(path string) {
 38 | 	file, _ := os.Open(path)
 39 | 	defer file.Close()
 40 | 
 41 | 	self.trees = []*Tree{}
 42 | 	reader := bufio.NewReader(file)
 43 | 	text := []string{}
 44 | 	for {
 45 | 		line, err := reader.ReadString('\n')
 46 | 		if err != nil {
 47 | 			break
 48 | 		}
 49 | 		line = strings.TrimSpace(line)
 50 | 		if line == "#" {
 51 | 			tree := Tree{}
 52 | 			tree.fromString(text)
 53 | 			self.trees = append(self.trees, &tree)
 54 | 			text = []string{}
 55 | 		} else {
 56 | 			text = append(text, line)
 57 | 		}
 58 | 	}
 59 | 	log.Println("rf tree count :", len(self.trees))
 60 | }
 61 | 
 62 | func (dt *RandomForest) Init(params map[string]string) {
 63 | 	dt.trees = []*Tree{}
 64 | 	dt.cart.Init(params)
 65 | 	tree_count, _ := strconv.ParseInt(params["tree-count"], 10, 64)
 66 | 	feature_count, _ := strconv.ParseFloat(params["feature-count"], 64)
 67 | 	dt.params.TreeCount = int(tree_count)
 68 | 	dt.params.FeatureCount = feature_count
 69 | }
 70 | 
 71 | func (dt *RandomForest) Train(dataset *core.DataSet) {
 72 | 	samples := []*core.MapBasedSample{}
 73 | 	feature_weights := make(map[int64]float64)
 74 | 	for _, sample := range dataset.Samples {
 75 | 		if !dt.continuous_features {
 76 | 			for _, f := range sample.Features {
 77 | 				_, ok := feature_weights[f.Id]
 78 | 				if !ok {
 79 | 					feature_weights[f.Id] = f.Value
 80 | 				}
 81 | 				if feature_weights[f.Id] != f.Value {
 82 | 					dt.continuous_features = true
 83 | 				}
 84 | 			}
 85 | 		}
 86 | 		msample := sample.ToMapBasedSample()
 87 | 		samples = append(samples, msample)
 88 | 	}
 89 | 	dt.cart.continuous_features = dt.continuous_features
 90 | 
 91 | 	trees := make(chan *Tree, dt.params.TreeCount)
 92 | 	var wait sync.WaitGroup
 93 | 	wait.Add(dt.params.TreeCount)
 94 | 
 95 | 	for i := 0; i < dt.params.TreeCount; i++ {
 96 | 
 97 | 		go func() {
 98 | 			tree := dt.cart.SingleTreeBuild(samples, dt.params.FeatureCount, true)
 99 | 			trees <- &tree
100 | 			fmt.Printf(".")
101 | 			wait.Done()
102 | 		}()
103 | 	}
104 | 	wait.Wait()
105 | 	fmt.Println()
106 | 	close(trees)
107 | 	for tree := range trees {
108 | 		dt.trees = append(dt.trees, tree)
109 | 	}
110 | }
111 | 
112 | func (dt *RandomForest) Predict(sample *core.Sample) float64 {
113 | 	msample := sample.ToMapBasedSample()
114 | 	predictions := 0.0
115 | 	total := 0.0
116 | 	for _, tree := range dt.trees {
117 | 		node, _ := PredictBySingleTree(tree, msample)
118 | 		predictions += node.prediction.GetValue(1)
119 | 		total += 1.0
120 | 	}
121 | 	return predictions / total
122 | }
123 | 
124 | func (dt *RandomForest) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
125 | 	msample := sample.ToMapBasedSample()
126 | 	predictions := core.NewArrayVector()
127 | 	total := 0.0
128 | 	for _, tree := range dt.trees {
129 | 		node, _ := PredictBySingleTree(tree, msample)
130 | 		predictions.AddVector(node.prediction, 1.0)
131 | 		total += 1.0
132 | 	}
133 | 	predictions.Scale(1.0 / total)
134 | 	return predictions
135 | }
136 | 


--------------------------------------------------------------------------------
/lr/lr_owlqn.go:
--------------------------------------------------------------------------------
  1 | package lr
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"github.com/xlvector/hector/core"
  6 | 	"github.com/xlvector/hector/util"
  7 | 	"math"
  8 | 	"os"
  9 | 	"strconv"
 10 | 	"strings"
 11 | )
 12 | 
 13 | type LROWLQNParams struct {
 14 | 	Regularization float64
 15 | }
 16 | 
 17 | type LROWLQN struct {
 18 | 	Model  *core.Vector
 19 | 	Params LROWLQNParams
 20 | 	// for training
 21 | 	dataSet  *core.DataSet
 22 | 	lastPos  *core.Vector
 23 | 	lastCost float64
 24 | 	lastGrad *core.Vector
 25 | }
 26 | 
 27 | func (lr *LROWLQN) SaveModel(path string) {
 28 | 	sb := util.StringBuilder{}
 29 | 	for key, val := range lr.Model.Data {
 30 | 		sb.Int64(key)
 31 | 		sb.Write("\t")
 32 | 		sb.Float(val)
 33 | 		sb.Write("\n")
 34 | 	}
 35 | 	sb.WriteToFile(path)
 36 | }
 37 | 
 38 | func (lr *LROWLQN) LoadModel(path string) {
 39 | 	file, _ := os.Open(path)
 40 | 	defer file.Close()
 41 | 
 42 | 	scaner := bufio.NewScanner(file)
 43 | 	for scaner.Scan() {
 44 | 		line := scaner.Text()
 45 | 		tks := strings.Split(line, "\t")
 46 | 		key, _ := strconv.ParseInt(tks[0], 10, 64)
 47 | 		val, _ := strconv.ParseFloat(tks[1], 64)
 48 | 		lr.Model.SetValue(key, val)
 49 | 	}
 50 | }
 51 | 
 52 | func (lr *LROWLQN) Init(params map[string]string) {
 53 | 	lr.Model = core.NewVector()
 54 | 	lr.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64)
 55 | }
 56 | 
 57 | func (lr *LROWLQN) updateValueGrad(pos *core.Vector, dataset *core.DataSet) {
 58 | 	var totalLoss float64 = 0.0
 59 | 	var grad *core.Vector = core.NewVector()
 60 | 	for _, sample := range dataset.Samples {
 61 | 		var score float64 = lr.getScore(pos, sample)
 62 | 		var signScore float64 = score
 63 | 		if sample.Label == 0 {
 64 | 			signScore = -score
 65 | 		}
 66 | 		var prob float64
 67 | 		var lnProb float64
 68 | 		if signScore < -30 {
 69 | 			prob = 0
 70 | 			lnProb = signScore
 71 | 		} else if signScore > 30 {
 72 | 			prob = 1
 73 | 			lnProb = 0
 74 | 		} else {
 75 | 			prob = 1.0 / (1.0 + math.Exp(-signScore))
 76 | 			lnProb = math.Log(prob)
 77 | 		}
 78 | 		var scale float64
 79 | 		if sample.Label == 0 {
 80 | 			scale = (1 - prob)
 81 | 		} else {
 82 | 			scale = -(1 - prob)
 83 | 		}
 84 | 		totalLoss += -lnProb
 85 | 		for _, fea := range sample.Features {
 86 | 			grad.AddValue(fea.Id, scale*fea.Value)
 87 | 		}
 88 | 	}
 89 | 	lr.lastPos = pos.Copy()
 90 | 	lr.lastCost = totalLoss
 91 | 	lr.lastGrad = grad
 92 | }
 93 | 
 94 | func (lr *LROWLQN) Equals(x *core.Vector, y *core.Vector) bool {
 95 | 	if y == nil && x == nil {
 96 | 		return true
 97 | 	}
 98 | 	if y == nil || x == nil {
 99 | 		return false
100 | 	}
101 | 	for key, val := range x.Data {
102 | 		if y.GetValue(key) != val {
103 | 			return false
104 | 		}
105 | 	}
106 | 	for key, val := range y.Data {
107 | 		if x.GetValue(key) != val {
108 | 			return false
109 | 		}
110 | 	}
111 | 	return true
112 | }
113 | 
114 | func (lr *LROWLQN) Value(pos *core.Vector) float64 {
115 | 	if lr.Equals(pos, lr.lastPos) {
116 | 		return lr.lastCost
117 | 	}
118 | 	lr.updateValueGrad(pos, lr.dataSet)
119 | 	return lr.lastCost
120 | }
121 | 
122 | func (lr *LROWLQN) Gradient(pos *core.Vector) *core.Vector {
123 | 	if lr.Equals(pos, lr.lastPos) {
124 | 		return lr.lastGrad
125 | 	}
126 | 	lr.updateValueGrad(pos, lr.dataSet)
127 | 	return lr.lastGrad
128 | }
129 | 
130 | func (lr *LROWLQN) Train(dataset *core.DataSet) {
131 | 	lr.dataSet = dataset
132 | 	minimizer := NewOWLQNMinimizer(lr.Params.Regularization)
133 | 	lr.Model = minimizer.Minimize(lr, core.NewVector())
134 | }
135 | 
136 | func (lr *LROWLQN) getScore(model *core.Vector, sample *core.Sample) float64 {
137 | 	var score float64 = 0
138 | 	for _, fea := range sample.Features {
139 | 		score += model.GetValue(fea.Id) * fea.Value
140 | 	}
141 | 	return score
142 | }
143 | 
144 | func (lr *LROWLQN) Predict(sample *core.Sample) float64 {
145 | 	score := lr.getScore(lr.Model, sample)
146 | 	score = 1.0 / (1.0 + math.Exp(-score))
147 | 	return score
148 | }
149 | 


--------------------------------------------------------------------------------
/lr/ep_logistic_regression.go:
--------------------------------------------------------------------------------
  1 | package lr
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"github.com/xlvector/hector/core"
  6 | 	"github.com/xlvector/hector/util"
  7 | 	"math"
  8 | 	"os"
  9 | 	"strconv"
 10 | 	"strings"
 11 | )
 12 | 
 13 | type EPLogisticRegressionParams struct {
 14 | 	init_var, beta float64
 15 | }
 16 | 
 17 | type EPLogisticRegression struct {
 18 | 	Model  map[int64]*util.Gaussian
 19 | 	params EPLogisticRegressionParams
 20 | }
 21 | 
 22 | func (algo *EPLogisticRegression) SaveModel(path string) {
 23 | 	sb := util.StringBuilder{}
 24 | 	for f, g := range algo.Model {
 25 | 		sb.Int64(f)
 26 | 		sb.Write("\t")
 27 | 		sb.Float(g.Mean)
 28 | 		sb.Write("\t")
 29 | 		sb.Float(g.Vari)
 30 | 		sb.Write("\n")
 31 | 	}
 32 | 	sb.WriteToFile(path)
 33 | }
 34 | 
 35 | func (algo *EPLogisticRegression) LoadModel(path string) {
 36 | 	file, _ := os.Open(path)
 37 | 	defer file.Close()
 38 | 
 39 | 	scaner := bufio.NewScanner(file)
 40 | 	for scaner.Scan() {
 41 | 		line := scaner.Text()
 42 | 		tks := strings.Split(line, "\t")
 43 | 		fid, _ := strconv.ParseInt(tks[0], 10, 64)
 44 | 		mean, _ := strconv.ParseFloat(tks[1], 64)
 45 | 		vari, _ := strconv.ParseFloat(tks[2], 64)
 46 | 		g := util.Gaussian{Mean: mean, Vari: vari}
 47 | 		algo.Model[fid] = &g
 48 | 	}
 49 | }
 50 | 
 51 | func (algo *EPLogisticRegression) Predict(sample *core.Sample) float64 {
 52 | 	s := util.Gaussian{Mean: 0.0, Vari: 0.0}
 53 | 	for _, feature := range sample.Features {
 54 | 		if feature.Value == 0.0 {
 55 | 			continue
 56 | 		}
 57 | 		wi, ok := algo.Model[feature.Id]
 58 | 		if !ok {
 59 | 			wi = &(util.Gaussian{Mean: 0.0, Vari: algo.params.init_var})
 60 | 		}
 61 | 		s.Mean += feature.Value * wi.Mean
 62 | 		s.Vari += feature.Value * feature.Value * wi.Vari
 63 | 	}
 64 | 
 65 | 	t := s
 66 | 	t.Vari += algo.params.beta
 67 | 	return t.Integral(t.Mean / math.Sqrt(t.Vari))
 68 | }
 69 | 
 70 | func (algo *EPLogisticRegression) Init(params map[string]string) {
 71 | 	algo.Model = make(map[int64]*util.Gaussian)
 72 | 	algo.params.beta, _ = strconv.ParseFloat(params["beta"], 64)
 73 | 	algo.params.init_var = 1.0
 74 | }
 75 | 
 76 | func (algo *EPLogisticRegression) Clear() {
 77 | 	algo.Model = nil
 78 | 	algo.Model = make(map[int64]*util.Gaussian)
 79 | }
 80 | 
 81 | func (algo *EPLogisticRegression) Train(dataset *core.DataSet) {
 82 | 
 83 | 	for _, sample := range dataset.Samples {
 84 | 		s := util.Gaussian{Mean: 0.0, Vari: 0.0}
 85 | 		for _, feature := range sample.Features {
 86 | 			if feature.Value == 0.0 {
 87 | 				continue
 88 | 			}
 89 | 			wi, ok := algo.Model[feature.Id]
 90 | 			if !ok {
 91 | 				wi = &(util.Gaussian{Mean: 0.0, Vari: algo.params.init_var})
 92 | 				algo.Model[feature.Id] = wi
 93 | 			}
 94 | 			s.Mean += feature.Value * wi.Mean
 95 | 			s.Vari += feature.Value * feature.Value * wi.Vari
 96 | 		}
 97 | 
 98 | 		t := s
 99 | 		t.Vari += algo.params.beta
100 | 
101 | 		t2 := util.Gaussian{Mean: 0.0, Vari: 0.0}
102 | 		if sample.Label > 0.0 {
103 | 			t2.UpperTruncateGaussian(t.Mean, t.Vari, 0.0)
104 | 		} else {
105 | 			t2.LowerTruncateGaussian(t.Mean, t.Vari, 0.0)
106 | 		}
107 | 		t.MultGaussian(&t2)
108 | 		s2 := t
109 | 		s2.Vari += algo.params.beta
110 | 		s0 := s
111 | 		s.MultGaussian(&s2)
112 | 
113 | 		for _, feature := range sample.Features {
114 | 			if feature.Value == 0.0 {
115 | 				continue
116 | 			}
117 | 			wi0 := util.Gaussian{Mean: 0.0, Vari: algo.params.init_var}
118 | 			w2 := util.Gaussian{Mean: 0.0, Vari: 0.0}
119 | 			wi, _ := algo.Model[feature.Id]
120 | 			w2.Mean = (s.Mean - (s0.Mean - wi.Mean*feature.Value)) / feature.Value
121 | 			w2.Vari = (s.Vari + (s0.Vari - wi.Vari*feature.Value*feature.Value)) / (feature.Value * feature.Value)
122 | 			wi.MultGaussian(&w2)
123 | 			wi_vari := wi.Vari
124 | 			wi_new_vari := wi_vari * wi0.Vari / (0.99*wi0.Vari + 0.01*wi.Vari)
125 | 			wi.Vari = wi_new_vari
126 | 			wi.Mean = wi.Vari * (0.99*wi.Mean/wi_vari + 0.01*wi0.Mean/wi.Vari)
127 | 			if wi.Vari < algo.params.init_var*0.01 {
128 | 				wi.Vari = algo.params.init_var * 0.01
129 | 			}
130 | 			algo.Model[feature.Id] = wi
131 | 		}
132 | 	}
133 | }
134 | 


--------------------------------------------------------------------------------
/bin/hector-stack.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"github.com/xlvector/hector"
  7 | 	"github.com/xlvector/hector/core"
  8 | 	"github.com/xlvector/hector/eval"
  9 | 	"github.com/xlvector/hector/lr"
 10 | 	"os"
 11 | 	"strconv"
 12 | 	"sync"
 13 | )
 14 | 
 15 | func SplitFile(input string, total int, part int) (string, string, error) {
 16 | 	file, err := os.Open(input)
 17 | 	if err != nil {
 18 | 		return "", "", err
 19 | 	}
 20 | 	defer file.Close()
 21 | 
 22 | 	train_path := input + ".train." + strconv.Itoa(part)
 23 | 	train_file, err := os.Create(train_path)
 24 | 	if err != nil {
 25 | 		return "", "", err
 26 | 	}
 27 | 	defer train_file.Close()
 28 | 
 29 | 	test_path := input + ".test." + strconv.Itoa(part)
 30 | 	test_file, err := os.Create(test_path)
 31 | 	if err != nil {
 32 | 		return "", "", err
 33 | 	}
 34 | 	defer test_file.Close()
 35 | 
 36 | 	scanner := bufio.NewScanner(file)
 37 | 	k := 0
 38 | 	for scanner.Scan() {
 39 | 		if k%total == part {
 40 | 			test_file.WriteString(scanner.Text() + "\n")
 41 | 		} else {
 42 | 			train_file.WriteString(scanner.Text() + "\n")
 43 | 		}
 44 | 		k += 1
 45 | 	}
 46 | 	return train_path, test_path, nil
 47 | }
 48 | 
 49 | func main() {
 50 | 	train_path, test_path, pred_path, _, params := hector.PrepareParams()
 51 | 	total := 5
 52 | 	methods := []string{"ftrl", "fm"}
 53 | 	all_methods_predictions := [][]*eval.LabelPrediction{}
 54 | 	all_methods_test_predictions := [][]*eval.LabelPrediction{}
 55 | 	for _, method := range methods {
 56 | 		fmt.Println(method)
 57 | 		average_auc := 0.0
 58 | 		all_predictions := []*eval.LabelPrediction{}
 59 | 		for part := 0; part < total; part++ {
 60 | 			train, test, _ := SplitFile(train_path, total, part)
 61 | 			classifier := hector.GetClassifier(method)
 62 | 
 63 | 			auc, predictions, _ := hector.AlgorithmRun(classifier, train, test, "", params)
 64 | 			fmt.Println("AUC:")
 65 | 			fmt.Println(auc)
 66 | 			average_auc += auc
 67 | 			os.Remove(train)
 68 | 			os.Remove(test)
 69 | 			classifier = nil
 70 | 			for _, pred := range predictions {
 71 | 				all_predictions = append(all_predictions, pred)
 72 | 			}
 73 | 		}
 74 | 		all_methods_predictions = append(all_methods_predictions, all_predictions)
 75 | 		fmt.Println(average_auc / float64(total))
 76 | 
 77 | 		classifier := hector.GetClassifier(method)
 78 | 		fmt.Println(test_path)
 79 | 		_, test_predictions, _ := hector.AlgorithmRun(classifier, train_path, test_path, "", params)
 80 | 		all_methods_test_predictions = append(all_methods_test_predictions, test_predictions)
 81 | 	}
 82 | 
 83 | 	var wait sync.WaitGroup
 84 | 	wait.Add(2)
 85 | 	dataset := core.NewDataSet()
 86 | 	go func() {
 87 | 		for i, _ := range all_methods_predictions[0] {
 88 | 			sample := core.NewSample()
 89 | 			sample.Label = all_methods_predictions[0][i].Label
 90 | 			for j, _ := range all_methods_predictions {
 91 | 				feature := core.Feature{Id: int64(j), Value: all_methods_predictions[j][i].Prediction}
 92 | 				sample.AddFeature(feature)
 93 | 			}
 94 | 			dataset.Samples <- sample
 95 | 		}
 96 | 		close(dataset.Samples)
 97 | 		wait.Done()
 98 | 	}()
 99 | 
100 | 	ensembler := lr.LinearRegression{}
101 | 	go func() {
102 | 		ensembler.Init(params)
103 | 		ensembler.Train(dataset)
104 | 		wait.Done()
105 | 	}()
106 | 	wait.Wait()
107 | 
108 | 	fmt.Println(ensembler.Model)
109 | 
110 | 	wait.Add(2)
111 | 	test_dataset := hector.NewDataSet()
112 | 	go func() {
113 | 		for i, _ := range all_methods_test_predictions[0] {
114 | 			sample := hector.NewSample()
115 | 			sample.Label = all_methods_test_predictions[0][i].Prediction
116 | 			for j, _ := range all_methods_test_predictions {
117 | 				feature := hector.Feature{Id: int64(j), Value: all_methods_test_predictions[j][i].Prediction}
118 | 				sample.AddFeature(feature)
119 | 			}
120 | 			test_dataset.Samples <- sample
121 | 		}
122 | 		close(test_dataset.Samples)
123 | 		wait.Done()
124 | 	}()
125 | 
126 | 	go func() {
127 | 		pred_file, _ := os.Create(test_path + ".out")
128 | 		for sample := range test_dataset.Samples {
129 | 			prediction := sample.Label //ensembler.Predict(sample)
130 | 			pred_file.WriteString(strconv.FormatFloat(prediction, 'g', 5, 64) + "\n")
131 | 		}
132 | 		defer pred_file.Close()
133 | 		wait.Done()
134 | 	}()
135 | 	wait.Wait()
136 | }
137 | 


--------------------------------------------------------------------------------
/svm/svm.go:
--------------------------------------------------------------------------------
  1 | package svm
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"github.com/xlvector/hector/core"
  6 | 	"math"
  7 | 	"math/rand"
  8 | 	"strconv"
  9 | )
 10 | 
 11 | type SVM struct {
 12 | 	sv []*core.Vector
 13 | 	y  []float64
 14 | 	a  []float64
 15 | 	b  float64
 16 | 	C  float64
 17 | 	e  float64
 18 | 	w  *core.Vector
 19 | 
 20 | 	xx []float64
 21 | }
 22 | 
 23 | func (self *SVM) SaveModel(path string) {
 24 | 
 25 | }
 26 | 
 27 | func (self *SVM) LoadModel(path string) {
 28 | 
 29 | }
 30 | 
 31 | type SVMValues struct {
 32 | 	a1, a2, e1, e2, k11, k12, k22 float64
 33 | 	i1, i2                        int
 34 | }
 35 | 
 36 | func (c *SVM) Init(params map[string]string) {
 37 | 	c.C, _ = strconv.ParseFloat(params["c"], 64)
 38 | 	c.e, _ = strconv.ParseFloat(params["e"], 64)
 39 | 
 40 | 	c.w = core.NewVector()
 41 | }
 42 | 
 43 | func (c *SVM) Predict(sample *core.Sample) float64 {
 44 | 	x := sample.GetFeatureVector()
 45 | 	return c.PredictVector(x)
 46 | }
 47 | 
 48 | func (c *SVM) PredictVector(x *core.Vector) float64 {
 49 | 	ret := c.w.Dot(x) - c.b
 50 | 	return ret
 51 | }
 52 | 
 53 | func (c *SVM) MatchKKT(y, f, a float64) bool {
 54 | 	ep := c.C * 0.01
 55 | 	if a < ep && y*f > 1.0 {
 56 | 		return true
 57 | 	}
 58 | 
 59 | 	if a > c.C-ep && y*f < 1.0 {
 60 | 		return true
 61 | 	}
 62 | 
 63 | 	if a > ep && a < c.C-ep && y*f == 1.0 {
 64 | 		return true
 65 | 	}
 66 | 
 67 | 	return false
 68 | }
 69 | 
 70 | func (c *SVM) Train(dataset *core.DataSet) {
 71 | 	c.sv = []*core.Vector{}
 72 | 	c.y = []float64{}
 73 | 	c.a = []float64{}
 74 | 	for k, sample := range dataset.Samples {
 75 | 		x := sample.GetFeatureVector()
 76 | 		c.sv = append(c.sv, x)
 77 | 		c.xx = append(c.xx, x.Dot(x))
 78 | 		if sample.Label > 0.0 {
 79 | 			c.y = append(c.y, 1.0)
 80 | 		} else {
 81 | 			c.y = append(c.y, -1.0)
 82 | 		}
 83 | 		c.a = append(c.a, c.C*rand.Float64())
 84 | 		c.w.AddVector(x, c.y[k]*c.a[k])
 85 | 	}
 86 | 
 87 | 	c.b = 0.0
 88 | 	for k, x := range c.sv {
 89 | 		c.b += c.PredictVector(x) - c.y[k]
 90 | 	}
 91 | 	c.b /= float64(len(c.sv))
 92 | 	fmt.Println(c.b)
 93 | 
 94 | 	for step := 0; step < 100; step++ {
 95 | 		da := 0.0
 96 | 		for i1 := 0; i1 < len(c.sv); i1++ {
 97 | 			a1 := c.a[i1]
 98 | 			x1 := c.sv[i1]
 99 | 			y1 := c.y[i1]
100 | 			p1 := c.PredictVector(x1)
101 | 			if c.MatchKKT(y1, p1, a1) {
102 | 				continue
103 | 			}
104 | 			maxde := 0.0
105 | 			best_values := SVMValues{}
106 | 			for k2 := 0; k2 < 10; k2++ {
107 | 				i2 := rand.Intn(len(c.sv))
108 | 				if i1 == i2 {
109 | 					continue
110 | 				}
111 | 
112 | 				x2 := c.sv[i2]
113 | 				y2 := c.y[i2]
114 | 				p2 := c.PredictVector(x2)
115 | 				k11 := c.xx[i1]
116 | 				k12 := x1.Dot(x2)
117 | 				k22 := c.xx[i2]
118 | 
119 | 				a2 := c.a[i2]
120 | 
121 | 				u := math.Max(0, a2-a1)
122 | 				v := math.Min(c.C, c.C+a2-a1)
123 | 				if y1*y2 > 0.0 {
124 | 					u = math.Max(0, a2+a1-c.C)
125 | 					v = math.Min(c.C, a1+a2)
126 | 				}
127 | 
128 | 				e1 := p1 - y1
129 | 				e2 := p2 - y2
130 | 
131 | 				a2old := a2
132 | 				a2 += y2 * (e1 - e2) / (k11 + k22 - 2*k12)
133 | 
134 | 				a2 = math.Max(u, math.Min(a2, v))
135 | 
136 | 				a1 += y1 * y2 * (a2old - a2)
137 | 
138 | 				if math.Abs(e1-e2) > maxde {
139 | 					maxde = math.Abs(e1 - e2)
140 | 					best_values.a1 = a1
141 | 					best_values.a2 = a2
142 | 					best_values.i1 = i1
143 | 					best_values.i2 = i2
144 | 					best_values.e1 = e1
145 | 					best_values.e2 = e2
146 | 				}
147 | 				if maxde >= 4.0 {
148 | 					break
149 | 				}
150 | 			}
151 | 			da += math.Abs(c.a[best_values.i1] - best_values.a1)
152 | 			c.w.AddVector(c.sv[best_values.i1], c.y[best_values.i1]*(best_values.a1-c.a[best_values.i1]))
153 | 			c.w.AddVector(c.sv[best_values.i2], c.y[best_values.i2]*(best_values.a2-c.a[best_values.i2]))
154 | 			/*
155 | 				b1 := c.b - best_values.e1 - c.y[best_values.i1] * (best_values.a1 - c.a[best_values.i1]) * best_values.k11 - c.y[best_values.i2] * (best_values.a2 - c.a[best_values.i2]) * best_values.k12
156 | 				b2 := c.b - best_values.e2 - c.y[best_values.i1] * (best_values.a1 - c.a[best_values.i1]) * best_values.k12 - c.y[best_values.i2] * (best_values.a2 - c.a[best_values.i2]) * best_values.k22
157 | 				if best_values.a1 > 0.0 && best_values.a1 < c.C{
158 | 					c.b = b1
159 | 				} else {
160 | 					if best_values.a2 > 0.0 && best_values.a2 < c.C {
161 | 						c.b = b2
162 | 					} else {
163 | 						c.b = 0.5 * (b1 + b2)
164 | 					}
165 | 				}*/
166 | 			c.a[best_values.i1] = best_values.a1
167 | 			c.a[best_values.i2] = best_values.a2
168 | 		}
169 | 		da /= float64(len(c.sv))
170 | 		fmt.Printf(".. %f %f\n", da, c.b)
171 | 		if da < c.e {
172 | 			break
173 | 		}
174 | 	}
175 | }
176 | 


--------------------------------------------------------------------------------
/lr/owlqn_minimizer.go:
--------------------------------------------------------------------------------
  1 | package lr
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"github.com/xlvector/hector/core"
  6 | 	"math"
  7 | )
  8 | 
  9 | /**
 10 |  * It's based the paper "Scalable Training of L1-Regularized Log-Linear Models"
 11 |  * by Galen Andrew and Jianfeng Gao
 12 |  * user: weixuan
 13 |  * To change this template use File | Settings | File Templates.
 14 |  */
 15 | type OWLQNMinimizer struct {
 16 | 	l1reg        float64
 17 | 	costFun      DiffFunction
 18 | 	numHist      int
 19 | 	maxIteration int
 20 | 	tolerance    float64
 21 | }
 22 | 
 23 | var owlqn_output_switch bool = false
 24 | 
 25 | func NewOWLQNMinimizer(l1reg float64) *OWLQNMinimizer {
 26 | 	m := new(OWLQNMinimizer)
 27 | 	m.l1reg = l1reg
 28 | 	m.numHist = 10
 29 | 	m.maxIteration = 20
 30 | 	m.tolerance = 1e-4
 31 | 	return m
 32 | }
 33 | 
 34 | func (m *OWLQNMinimizer) Minimize(costfun DiffFunction, init *core.Vector) *core.Vector {
 35 | 	m.costFun = costfun
 36 | 	var cost float64 = m.Evaluate(init)
 37 | 	var grad *core.Vector = costfun.Gradient(init).Copy()
 38 | 	var pos *core.Vector = init.Copy()
 39 | 	var terminalCriterion *relativeMeanImprCriterion = NewRelativeMeanImprCriterion(m.tolerance)
 40 | 	terminalCriterion.addCost(cost)
 41 | 
 42 | 	var helper *QuasiNewtonHelper = NewQuasiNewtonHelper(m.numHist, m, pos, grad)
 43 | 	if owlqn_output_switch {
 44 | 		fmt.Println("Iter\tcost\timprovement")
 45 | 		fmt.Printf("%d\t%e\tUndefined", 0, cost)
 46 | 	}
 47 | 	for iter := 1; iter <= m.maxIteration; iter++ {
 48 | 		// customed steepest descending dir
 49 | 		steepestDescDir := grad.Copy()
 50 | 		m.updateGrad(pos, steepestDescDir)
 51 | 		steepestDescDir.ApplyScale(-1.0)
 52 | 		dir := steepestDescDir.Copy()
 53 | 		// quasi-newton dir
 54 | 		helper.ApplyQuasiInverseHession(dir)
 55 | 		m.fixDirSign(dir, steepestDescDir)
 56 | 		// customed grad for the new position
 57 | 		potentialGrad := grad.Copy()
 58 | 		m.updateGradForNewPos(pos, potentialGrad, dir)
 59 | 		newCost, newPos := helper.BackTrackingLineSearch(cost, pos, potentialGrad, dir, iter == 1)
 60 | 		if owlqn_output_switch {
 61 | 			fmt.Println("")
 62 | 		}
 63 | 		if cost == newCost {
 64 | 			break
 65 | 		}
 66 | 		cost = newCost
 67 | 		pos = newPos
 68 | 		grad = costfun.Gradient(pos).Copy()
 69 | 		terminalCriterion.addCost(cost)
 70 | 		if owlqn_output_switch {
 71 | 			fmt.Printf("%d\t%e\t%e", iter, newCost, terminalCriterion.improvement)
 72 | 		}
 73 | 		if terminalCriterion.isTerminable() || helper.UpdateState(pos, grad) {
 74 | 			if owlqn_output_switch {
 75 | 				fmt.Println("")
 76 | 			}
 77 | 			break
 78 | 		}
 79 | 	}
 80 | 	return pos
 81 | }
 82 | 
 83 | // Description: assume all the features in x also appears in grad
 84 | //              all the features in dir must be in grad
 85 | func (m *OWLQNMinimizer) updateGradForNewPos(x *core.Vector, grad *core.Vector, dir *core.Vector) {
 86 | 	if m.l1reg == 0 {
 87 | 		return
 88 | 	}
 89 | 	for key, val := range grad.Data {
 90 | 		xval := x.GetValue(key)
 91 | 		if xval < 0 {
 92 | 			grad.SetValue(key, val-m.l1reg)
 93 | 		} else if xval > 0 {
 94 | 			grad.SetValue(key, val+m.l1reg)
 95 | 		} else {
 96 | 			dirval := dir.GetValue(key)
 97 | 			if dirval < 0 {
 98 | 				grad.SetValue(key, val-m.l1reg)
 99 | 			} else if dirval > 0 {
100 | 				grad.SetValue(key, val+m.l1reg)
101 | 			}
102 | 		}
103 | 	}
104 | 	return
105 | }
106 | 
107 | // Description: assume all the features in x also appears in grad
108 | func (m *OWLQNMinimizer) updateGrad(x *core.Vector, grad *core.Vector) {
109 | 	if m.l1reg == 0 {
110 | 		return
111 | 	}
112 | 	for key, val := range grad.Data {
113 | 		xval := x.GetValue(key)
114 | 		if xval < 0 {
115 | 			grad.SetValue(key, val-m.l1reg)
116 | 		} else if xval > 0 {
117 | 			grad.SetValue(key, val+m.l1reg)
118 | 		} else {
119 | 			if val < -m.l1reg {
120 | 				grad.SetValue(key, val+m.l1reg)
121 | 			} else if val > m.l1reg {
122 | 				grad.SetValue(key, val-m.l1reg)
123 | 			}
124 | 		}
125 | 	}
126 | 	return
127 | }
128 | 
129 | func (m *OWLQNMinimizer) fixDirSign(dir *core.Vector, steepestDescDir *core.Vector) {
130 | 	if m.l1reg == 0 {
131 | 		return
132 | 	}
133 | 	for key, val := range dir.Data {
134 | 		if val*steepestDescDir.GetValue(key) <= 0 {
135 | 			dir.SetValue(key, 0)
136 | 		}
137 | 	}
138 | }
139 | 
140 | func (m *OWLQNMinimizer) Evaluate(pos *core.Vector) float64 {
141 | 	cost := m.costFun.Value(pos)
142 | 	for _, val := range pos.Data {
143 | 		cost += math.Abs(val) * m.l1reg
144 | 	}
145 | 	return cost
146 | }
147 | 
148 | func (m *OWLQNMinimizer) NextPoint(curPos *core.Vector, dir *core.Vector, alpha float64) *core.Vector {
149 | 	if owlqn_output_switch {
150 | 		fmt.Printf(".")
151 | 	}
152 | 	newPos := curPos.ElemWiseMultiplyAdd(dir, alpha)
153 | 	if m.l1reg > 0 {
154 | 		for key, val := range curPos.Data {
155 | 			if val*newPos.GetValue(key) < 0 {
156 | 				newPos.SetValue(key, 0)
157 | 			}
158 | 		}
159 | 	}
160 | 	return newPos
161 | }
162 | 


--------------------------------------------------------------------------------
/core/vector.go:
--------------------------------------------------------------------------------
  1 | package core
  2 | 
  3 | import (
  4 | 	"github.com/xlvector/hector/util"
  5 | 	"math"
  6 | 	"math/rand"
  7 | 	"strconv"
  8 | 	"strings"
  9 | )
 10 | 
 11 | type Vector struct {
 12 | 	Data map[int64]float64
 13 | }
 14 | 
 15 | func NewVector() *Vector {
 16 | 	v := Vector{}
 17 | 	v.Data = make(map[int64]float64)
 18 | 	return &v
 19 | }
 20 | 
 21 | func (v *Vector) ToString() []byte {
 22 | 	sb := util.StringBuilder{}
 23 | 	for key, value := range v.Data {
 24 | 		sb.Int64(key)
 25 | 		sb.Write(":")
 26 | 		sb.Float(value)
 27 | 		sb.Write("|")
 28 | 	}
 29 | 	return sb.Bytes()
 30 | }
 31 | 
 32 | func (v *Vector) FromString(buf string) {
 33 | 	tks := strings.Split(buf, "|")
 34 | 	for _, tk := range tks {
 35 | 		if len(tk) == 0 {
 36 | 			continue
 37 | 		}
 38 | 		kv := strings.Split(tk, ":")
 39 | 		key, _ := strconv.ParseInt(kv[0], 10, 64)
 40 | 		value, _ := strconv.ParseFloat(kv[1], 64)
 41 | 		v.Data[key] = value
 42 | 	}
 43 | }
 44 | 
 45 | func (v *Vector) AddValue(key int64, value float64) {
 46 | 	_, ok := v.Data[key]
 47 | 	if ok {
 48 | 		v.Data[key] += value
 49 | 	} else {
 50 | 		v.Data[key] = value
 51 | 	}
 52 | }
 53 | 
 54 | func (v *Vector) GetValue(key int64) float64 {
 55 | 	value, ok := v.Data[key]
 56 | 	if !ok {
 57 | 		return 0.0
 58 | 	} else {
 59 | 		return value
 60 | 	}
 61 | }
 62 | 
 63 | func (v *Vector) RandomInit(key int64, c float64) {
 64 | 	value, ok := v.Data[key]
 65 | 	if !ok {
 66 | 		value = rand.NormFloat64() * c
 67 | 		v.Data[key] = value
 68 | 	}
 69 | }
 70 | 
 71 | func (v *Vector) SetValue(key int64, value float64) {
 72 | 	v.Data[key] = value
 73 | }
 74 | 
 75 | func (v *Vector) AddVector(v2 *Vector, alpha float64) {
 76 | 	for key, value := range v2.Data {
 77 | 		v.AddValue(key, value*alpha)
 78 | 	}
 79 | }
 80 | 
 81 | func (v *Vector) NormL2() float64 {
 82 | 	ret := 0.0
 83 | 	for _, val := range v.Data {
 84 | 		ret += val * val
 85 | 	}
 86 | 	return ret
 87 | }
 88 | 
 89 | func (v *Vector) Copy() *Vector {
 90 | 	ret := NewVector()
 91 | 	for key, val := range v.Data {
 92 | 		ret.SetValue(key, val)
 93 | 	}
 94 | 	return ret
 95 | }
 96 | 
 97 | func (v *Vector) KeyWithMaxValue() (int64, float64) {
 98 | 	ret := int64(0)
 99 | 	max_val := 0.0
100 | 	for key, val := range v.Data {
101 | 		max_val = val
102 | 		ret = key
103 | 		break
104 | 	}
105 | 	for key, val := range v.Data {
106 | 		if max_val < val {
107 | 			max_val = val
108 | 			ret = key
109 | 		}
110 | 	}
111 | 	return ret, max_val
112 | }
113 | 
114 | func (v *Vector) Sum() float64 {
115 | 	ret := 0.0
116 | 	for _, val := range v.Data {
117 | 		ret += val
118 | 	}
119 | 	return ret
120 | }
121 | 
122 | func (v *Vector) Dot(v2 *Vector) float64 {
123 | 	va := v
124 | 	vb := v2
125 | 
126 | 	if len(v2.Data) < len(v.Data) {
127 | 		va = v2
128 | 		vb = v
129 | 	}
130 | 	ret := 0.0
131 | 	for key, a := range va.Data {
132 | 		b, ok := vb.Data[key]
133 | 		if ok {
134 | 			ret += a * b
135 | 		}
136 | 	}
137 | 	return ret
138 | }
139 | 
140 | func (v *Vector) DotFeatures(fs []Feature) float64 {
141 | 	ret := 0.0
142 | 	for _, f := range fs {
143 | 		ret += f.Value * v.GetValue(f.Id)
144 | 	}
145 | 	return ret
146 | }
147 | 
148 | type ElemOperation func(float64) float64
149 | 
150 | func (v *Vector) ApplyOnElem(fn ElemOperation) *Vector {
151 | 	ret := NewVector()
152 | 	for key, val := range v.Data {
153 | 		ret.SetValue(key, fn(val))
154 | 	}
155 | 	return ret
156 | }
157 | 
158 | func (v *Vector) Scale(scale float64) *Vector {
159 | 	ret := NewVector()
160 | 	for key, val := range v.Data {
161 | 		ret.SetValue(key, val*scale)
162 | 	}
163 | 	return ret
164 | }
165 | 
166 | func (v *Vector) ApplyScale(scale float64) {
167 | 	for key, val := range v.Data {
168 | 		v.Data[key] = val * scale
169 | 	}
170 | }
171 | 
172 | func (v *Vector) SoftMaxNorm() *Vector {
173 | 	sum := 0.0
174 | 	for _, val := range v.Data {
175 | 		sum += math.Exp(val)
176 | 	}
177 | 	ret := NewVector()
178 | 	for key, val := range v.Data {
179 | 		ret.SetValue(key, math.Exp(val)/sum)
180 | 	}
181 | 	return ret
182 | }
183 | 
184 | func (v *Vector) ElemWiseAddVector(u *Vector) *Vector {
185 | 	ret := NewVector()
186 | 	for key, vi := range v.Data {
187 | 		ret.SetValue(key, vi)
188 | 	}
189 | 	for key, ui := range u.Data {
190 | 		ret.AddValue(key, ui)
191 | 	}
192 | 	return ret
193 | }
194 | 
195 | func (v *Vector) ElemWiseMultiply(u *Vector) *Vector {
196 | 	ret := NewVector()
197 | 	for key, val := range v.Data {
198 | 		ual := u.GetValue(key)
199 | 		if ual != 0 && val != 0 {
200 | 			ret.SetValue(key, val*ual)
201 | 		}
202 | 	}
203 | 	return ret
204 | }
205 | 
206 | func (v *Vector) ElemWiseMultiplyAdd(u *Vector, s float64) *Vector {
207 | 	ret := NewVector()
208 | 	for key, val := range v.Data {
209 | 		ret.SetValue(key, val)
210 | 	}
211 | 	for key, val := range u.Data {
212 | 		ret.AddValue(key, val*s)
213 | 	}
214 | 	return ret
215 | }
216 | 
217 | func (v *Vector) ApplyElemWiseMultiplyAccumulation(u *Vector, s float64) {
218 | 	for key, val := range u.Data {
219 | 		v.AddValue(key, val*s)
220 | 	}
221 | }
222 | 
223 | func (v *Vector) OuterProduct(u *Vector) *Matrix {
224 | 	ret := NewMatrix()
225 | 	for key, vi := range v.Data {
226 | 		ret.Data[key] = u.Scale(vi)
227 | 	}
228 | 	return ret
229 | }
230 | 
231 | func (v *Vector) MultiplyMatrix(m *Matrix) *Vector {
232 | 	ret := NewVector()
233 | 	for k, v := range v.Data {
234 | 		u, ok := m.Data[k]
235 | 		if ok {
236 | 			for ki, ui := range u.Data {
237 | 				ret.Data[ki] += v * ui
238 | 			}
239 | 		}
240 | 	}
241 | 	return ret
242 | }
243 | 


--------------------------------------------------------------------------------
/gp/gaussian_process.go:
--------------------------------------------------------------------------------
  1 | package gp
  2 | 
  3 | import (
  4 | 	"github.com/xlvector/hector/core"
  5 | 	"math"
  6 | 	"strconv"
  7 | )
  8 | 
  9 | type GaussianProcessParameters struct {
 10 | 	Dim   int64
 11 | 	Theta float64
 12 | }
 13 | 
 14 | type GaussianProcess struct {
 15 | 	Params            GaussianProcessParameters
 16 | 	CovarianceFunc    CovFunc
 17 | 	CovMatrix         *core.Matrix
 18 | 	TargetValues      *core.Vector
 19 | 	InvCovTarget      *core.Vector // inv(CovMatrix)*TargetValues
 20 | 	DataSet           *core.RealDataSet
 21 | 	TrainingDataCount int64
 22 | }
 23 | 
 24 | func (self *GaussianProcess) SaveModel(path string) {
 25 | 
 26 | }
 27 | 
 28 | func (self *GaussianProcess) LoadModel(path string) {
 29 | 
 30 | }
 31 | 
 32 | /*
 33 |    Given matrix m and vector v, compute inv(m)*v.
 34 |    Based on Gibbs and MacKay 1997, and Mark N. Gibbs's PhD dissertation
 35 | 
 36 |    Details:
 37 |    A - positive seminidefinite matrix
 38 |    u - a vector
 39 |    theta - positive number
 40 |    C = A + I*theta
 41 |    Returns inv(C)*u - So you need the diagonal noise term for covariance matrix in a sense.
 42 |    However, this algorithm is numerically stable, the noise term can be very small and the inversion can still be calculated...
 43 | */
 44 | func (algo *GaussianProcess) ApproximateInversion(A *core.Matrix, u *core.Vector, theta float64, dim int64) *core.Vector {
 45 | 	max_itr := 500
 46 | 	tol := 0.01
 47 | 
 48 | 	C := core.NewMatrix()
 49 | 	for key, val := range A.Data {
 50 | 		C.Data[key] = val.Copy()
 51 | 	}
 52 | 
 53 | 	// Add theta to diagonal elements
 54 | 	for i := int64(0); i < dim; i++ {
 55 | 		_, ok := C.Data[i]
 56 | 		if !ok {
 57 | 			C.Data[i] = core.NewVector()
 58 | 		}
 59 | 		C.Data[i].Data[i] = C.Data[i].Data[i] + theta
 60 | 	}
 61 | 
 62 | 	var Q_l float64
 63 | 	var Q_u float64
 64 | 	var dQ float64
 65 | 	u_norm := u.Dot(u) / 2
 66 | 
 67 | 	// Lower bound
 68 | 	y_l := core.NewVector()
 69 | 	g_l := u.Copy()
 70 | 	h_l := u.Copy()
 71 | 	lambda_l := float64(0)
 72 | 	gamma_l := float64(0)
 73 | 	var tmp_f1 float64
 74 | 	var tmp_f2 float64
 75 | 	var tmp_v1 *core.Vector
 76 | 	tmp_f1 = g_l.Dot(g_l)
 77 | 	tmp_v1 = C.MultiplyVector(h_l)
 78 | 
 79 | 	// Upper bound
 80 | 	y_u := core.NewVector()
 81 | 	g_u := u.Copy()
 82 | 	h_u := u.Copy()
 83 | 	lambda_u := float64(0)
 84 | 	gamma_u := float64(0)
 85 | 	var tmp_f3 float64
 86 | 	var tmp_f4 float64
 87 | 	var tmp_v3 *core.Vector
 88 | 	var tmp_v4 *core.Vector
 89 | 	tmp_v3 = g_u.MultiplyMatrix(A)
 90 | 	tmp_v4 = C.MultiplyVector(h_u)
 91 | 	tmp_f3 = tmp_v1.Dot(g_u)
 92 | 
 93 | 	for i := 0; i < max_itr; i++ {
 94 | 		// Lower bound
 95 | 		lambda_l = tmp_f1 / h_l.Dot(tmp_v1)
 96 | 		y_l.AddVector(h_l, lambda_l) //y_l next
 97 | 		Q_l = y_l.Dot(u) - 0.5*(y_l.MultiplyMatrix(C)).Dot(y_l)
 98 | 
 99 | 		// Upper bound
100 | 		lambda_u = tmp_f3 / tmp_v3.Dot(tmp_v4)
101 | 		y_u.AddVector(h_u, lambda_u) //y_u next
102 | 		Q_u = (y_u.MultiplyMatrix(A)).Dot(u) - 0.5*((y_u.MultiplyMatrix(C)).MultiplyMatrix(A)).Dot(y_u)
103 | 
104 | 		dQ = (u_norm-Q_u)/theta - Q_l
105 | 		if dQ < tol {
106 | 			break
107 | 		}
108 | 
109 | 		// Lower bound var updates
110 | 		g_l.AddVector(tmp_v1, -lambda_l) //g_l next
111 | 		tmp_f2 = g_l.Dot(g_l)
112 | 		gamma_l = tmp_f2 / tmp_f1
113 | 		for key, val := range h_l.Data {
114 | 			h_l.SetValue(key, val*gamma_l)
115 | 		}
116 | 		h_l.AddVector(g_l, 1)          //h_l next
117 | 		tmp_f1 = tmp_f2                //tmp_f1 next
118 | 		tmp_v1 = C.MultiplyVector(h_l) //tmp_v1 next
119 | 
120 | 		// Upper bound var updates
121 | 		g_u.AddVector(tmp_v4, -lambda_u) //g_u next
122 | 		tmp_v3 = g_u.MultiplyMatrix(A)   //tmp_v3 next
123 | 		tmp_f4 = tmp_v3.Dot(g_u)
124 | 		gamma_u = tmp_f4 / tmp_f3
125 | 		for key, val := range h_u.Data {
126 | 			h_u.SetValue(key, val*gamma_u)
127 | 		}
128 | 		h_u.AddVector(g_u, 1)          //h_u next
129 | 		tmp_v4 = C.MultiplyVector(h_u) //tmp_v4 next
130 | 		tmp_f3 = tmp_f4                // tmp_f3 next
131 | 	}
132 | 
133 | 	return y_l
134 | }
135 | 
136 | func (algo *GaussianProcess) ExtractTargetValuesAsVector(samples []*core.RealSample) *core.Vector {
137 | 	targets := core.NewVector()
138 | 	for i := 0; i < len(samples); i++ {
139 | 		targets.SetValue(int64(i), samples[i].Value)
140 | 	}
141 | 	return targets
142 | }
143 | 
144 | func (algo *GaussianProcess) Init(params map[string]string) {
145 | 
146 | 	dim, _ := strconv.ParseInt(params["dim"], 10, 64)
147 | 
148 | 	algo.Params = GaussianProcessParameters{}
149 | 	algo.Params.Dim = dim    // Pass in dim as a param.. and require feature space to be continous.
150 | 	algo.Params.Theta = 1e-7 // Used by approximate inversion as the diagonal noise
151 | 
152 | 	radius := 0.1
153 | 	camp := 40.0
154 | 	cf := CovSEARD{}
155 | 	radiuses := core.NewVector()
156 | 	for i := int64(1); i <= dim; i++ {
157 | 		radiuses.SetValue(i, radius)
158 | 	}
159 | 	cf.Init(radiuses, camp)
160 | 
161 | 	algo.CovarianceFunc = cf.Cov
162 | }
163 | 
164 | func (algo *GaussianProcess) Train(dataset *core.RealDataSet) {
165 | 	algo.DataSet = dataset
166 | 	algo.TrainingDataCount = int64(len(dataset.Samples))
167 | 	algo.CovMatrix = CovMatrix(algo.DataSet.Samples, algo.CovarianceFunc)
168 | 	algo.TargetValues = algo.ExtractTargetValuesAsVector(algo.DataSet.Samples)
169 | 	algo.InvCovTarget = algo.ApproximateInversion(algo.CovMatrix, algo.TargetValues, algo.Params.Theta, algo.TrainingDataCount)
170 | }
171 | 
172 | func (algo *GaussianProcess) Predict(sample *core.RealSample) float64 {
173 | 	k := CovVector(algo.DataSet.Samples, sample, algo.CovarianceFunc)
174 | 	pred := k.Dot(algo.InvCovTarget)
175 | 
176 | 	return pred
177 | }
178 | 
179 | func (algo *GaussianProcess) PredictStd(sample *core.RealSample) float64 {
180 | 	k := CovVector(algo.DataSet.Samples, sample, algo.CovarianceFunc)
181 | 	C_inv_k := algo.ApproximateInversion(algo.CovMatrix, k, algo.Params.Theta, algo.TrainingDataCount)
182 | 	std := math.Sqrt(algo.CovarianceFunc(sample.GetFeatureVector(), sample.GetFeatureVector()) - k.Dot(C_inv_k))
183 | 	return std
184 | }
185 | 


--------------------------------------------------------------------------------
/ann/neural_network.go:
--------------------------------------------------------------------------------
  1 | package ann
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"github.com/xlvector/hector/core"
  6 | 	"github.com/xlvector/hector/util"
  7 | 	"math"
  8 | 	"math/rand"
  9 | 	"strconv"
 10 | )
 11 | 
 12 | type NeuralNetworkParams struct {
 13 | 	LearningRate         float64
 14 | 	LearningRateDiscount float64
 15 | 	Regularization       float64
 16 | 	Hidden               int64
 17 | 	Steps                int
 18 | 	Verbose              int
 19 | }
 20 | 
 21 | type TwoLayerWeights struct {
 22 | 	L1 *core.Matrix
 23 | 	L2 *core.Matrix
 24 | }
 25 | 
 26 | /*
 27 | Please refer to this chapter to know algorithm details :
 28 | http://www4.rgu.ac.uk/files/chapter3%20-%20bp.pdf
 29 | */
 30 | type NeuralNetwork struct {
 31 | 	Model    TwoLayerWeights
 32 | 	MaxLabel int64
 33 | 	Params   NeuralNetworkParams
 34 | }
 35 | 
 36 | func RandomInitVector(dim int64) *core.Vector {
 37 | 	v := core.NewVector()
 38 | 	var i int64
 39 | 	for i = 0; i < dim; i++ {
 40 | 		v.Data[i] = (rand.Float64() - 0.5) / math.Sqrt(float64(dim))
 41 | 	}
 42 | 	return v
 43 | }
 44 | 
 45 | func (self *NeuralNetwork) SaveModel(path string) {
 46 | 
 47 | }
 48 | 
 49 | func (self *NeuralNetwork) LoadModel(path string) {
 50 | 
 51 | }
 52 | 
 53 | func (algo *NeuralNetwork) Init(params map[string]string) {
 54 | 	algo.Params.LearningRate, _ = strconv.ParseFloat(params["learning-rate"], 64)
 55 | 	algo.Params.LearningRateDiscount, _ = strconv.ParseFloat(params["learning-rate-discount"], 64)
 56 | 	algo.Params.Regularization, _ = strconv.ParseFloat(params["regularization"], 64)
 57 | 	steps, _ := strconv.ParseInt(params["steps"], 10, 32)
 58 | 	hidden, _ := strconv.ParseInt(params["hidden"], 10, 64)
 59 | 	verbose, _ := strconv.ParseInt(params["verbose"], 10, 32)
 60 | 
 61 | 	algo.Params.Steps = int(steps)
 62 | 	algo.Params.Hidden = int64(hidden)
 63 | 	algo.Params.Verbose = int(verbose)
 64 | }
 65 | 
 66 | func (algo *NeuralNetwork) Train(dataset *core.DataSet) {
 67 | 	algo.Model = TwoLayerWeights{}
 68 | 	algo.Model.L1 = core.NewMatrix()
 69 | 	algo.Model.L2 = core.NewMatrix()
 70 | 
 71 | 	for i := int64(0); i < algo.Params.Hidden; i++ {
 72 | 		algo.Model.L1.Data[i] = core.NewVector()
 73 | 	}
 74 | 
 75 | 	initalized := make(map[int64]int)
 76 | 	max_label := 0
 77 | 	for _, sample := range dataset.Samples {
 78 | 		if max_label < sample.Label {
 79 | 			max_label = sample.Label
 80 | 		}
 81 | 		for _, f := range sample.Features {
 82 | 			_, ok := initalized[f.Id]
 83 | 			if !ok {
 84 | 				for i := int64(0); i < algo.Params.Hidden; i++ {
 85 | 					algo.Model.L1.SetValue(i, f.Id, (rand.Float64()-0.5)/math.Sqrt(float64(algo.Params.Hidden)))
 86 | 				}
 87 | 				initalized[f.Id] = 1
 88 | 			}
 89 | 		}
 90 | 	}
 91 | 	algo.MaxLabel = int64(max_label)
 92 | 
 93 | 	for i := int64(0); i <= algo.Params.Hidden; i++ {
 94 | 		for j := int64(0); j <= algo.MaxLabel; j++ {
 95 | 			algo.Model.L2.SetValue(i, j, (rand.NormFloat64() / math.Sqrt(float64(algo.MaxLabel)+1.0)))
 96 | 		}
 97 | 	}
 98 | 
 99 | 	for step := 0; step < algo.Params.Steps; step++ {
100 | 		if algo.Params.Verbose <= 0 {
101 | 			fmt.Printf(".")
102 | 		}
103 | 		total := len(dataset.Samples)
104 | 		counter := 0
105 | 		for _, sample := range dataset.Samples {
106 | 			y := core.NewVector()
107 | 			z := core.NewVector()
108 | 			e := core.NewVector()
109 | 			delta_hidden := core.NewVector()
110 | 
111 | 			for i := int64(0); i < algo.Params.Hidden; i++ {
112 | 				sum := float64(0)
113 | 				wi := algo.Model.L1.Data[i]
114 | 				for _, f := range sample.Features {
115 | 					sum += f.Value * wi.GetValue(f.Id)
116 | 				}
117 | 				y.Data[i] = util.Sigmoid(sum)
118 | 			}
119 | 			y.Data[algo.Params.Hidden] = 1.0
120 | 			for i := int64(0); i <= algo.MaxLabel; i++ {
121 | 				sum := float64(0)
122 | 				for j := int64(0); j <= algo.Params.Hidden; j++ {
123 | 					sum += y.GetValue(j) * algo.Model.L2.GetValue(j, i)
124 | 				}
125 | 				z.SetValue(i, sum)
126 | 			}
127 | 			z = z.SoftMaxNorm()
128 | 			e.SetValue(int64(sample.Label), 1.0)
129 | 			e.AddVector(z, -1.0)
130 | 
131 | 			for i := int64(0); i <= algo.Params.Hidden; i++ {
132 | 				delta := float64(0)
133 | 				for j := int64(0); j <= algo.MaxLabel; j++ {
134 | 					wij := algo.Model.L2.GetValue(i, j)
135 | 					sig_ij := e.GetValue(j) * (1 - z.GetValue(j)) * z.GetValue(j)
136 | 					delta += sig_ij * wij
137 | 					wij += algo.Params.LearningRate * (y.GetValue(i)*sig_ij - algo.Params.Regularization*wij)
138 | 					algo.Model.L2.SetValue(i, j, wij)
139 | 				}
140 | 				delta_hidden.SetValue(i, delta)
141 | 			}
142 | 
143 | 			for i := int64(0); i < algo.Params.Hidden; i++ {
144 | 				wi := algo.Model.L1.Data[i]
145 | 				for _, f := range sample.Features {
146 | 					wji := wi.GetValue(f.Id)
147 | 					wji += algo.Params.LearningRate * (delta_hidden.GetValue(i)*f.Value*y.GetValue(i)*(1-y.GetValue(i)) - algo.Params.Regularization*wji)
148 | 					wi.SetValue(f.Id, wji)
149 | 				}
150 | 			}
151 | 			counter++
152 | 			if algo.Params.Verbose > 0 && counter%2000 == 0 {
153 | 				fmt.Printf("Epoch %d %f%%\n", step+1, float64(counter)/float64(total)*100)
154 | 			}
155 | 		}
156 | 
157 | 		if algo.Params.Verbose > 0 {
158 | 			algo.Evaluate(dataset)
159 | 		}
160 | 		algo.Params.LearningRate *= algo.Params.LearningRateDiscount
161 | 	}
162 | 	fmt.Println()
163 | }
164 | 
165 | func (algo *NeuralNetwork) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
166 | 	y := core.NewVector()
167 | 	z := core.NewArrayVector()
168 | 	for i := int64(0); i < algo.Params.Hidden; i++ {
169 | 		sum := float64(0)
170 | 		for _, f := range sample.Features {
171 | 			sum += f.Value * algo.Model.L1.Data[i].GetValue(f.Id)
172 | 		}
173 | 		y.Data[i] = util.Sigmoid(sum)
174 | 	}
175 | 	y.Data[algo.Params.Hidden] = 1
176 | 	for i := 0; i <= int(algo.MaxLabel); i++ {
177 | 		sum := float64(0)
178 | 		for j := int64(0); j <= algo.Params.Hidden; j++ {
179 | 			sum += y.GetValue(j) * algo.Model.L2.GetValue(j, int64(i))
180 | 		}
181 | 		z.SetValue(i, sum)
182 | 	}
183 | 	z = z.SoftMaxNorm()
184 | 	return z
185 | }
186 | 
187 | func (algo *NeuralNetwork) Predict(sample *core.Sample) float64 {
188 | 	z := algo.PredictMultiClass(sample)
189 | 	return z.GetValue(1)
190 | }
191 | 
192 | func (algo *NeuralNetwork) Evaluate(dataset *core.DataSet) {
193 | 	accuracy := 0.0
194 | 	total := 0.0
195 | 	for _, sample := range dataset.Samples {
196 | 		prediction := algo.PredictMultiClass(sample)
197 | 		label, _ := prediction.KeyWithMaxValue()
198 | 		if int(label) == sample.Label {
199 | 			accuracy += 1.0
200 | 		}
201 | 		total += 1.0
202 | 	}
203 | 	fmt.Printf("accuracy %f%%\n", accuracy/total*100)
204 | }
205 | 


--------------------------------------------------------------------------------
/algo_runner.go:
--------------------------------------------------------------------------------
  1 | /*
  2 | Package hector is a golang based machine learning lib. It intend to implement all famous machine learning algoirhtms by golang.
  3 | Currently, it only support algorithms which can solve binary classification problems. Supported algorithms include:
  4 | 1. Decision Tree (CART, Random Forest, GBDT)
  5 | 2. Logistic Regression
  6 | 3. SVM
  7 | 4. Neural Network
  8 | */
  9 | package hector
 10 | 
 11 | import (
 12 | 	"github.com/xlvector/hector/algo"
 13 | 	"github.com/xlvector/hector/core"
 14 | 	"github.com/xlvector/hector/eval"
 15 | 	"os"
 16 | 	"strconv"
 17 | )
 18 | 
 19 | func AlgorithmRun(classifier algo.Classifier,
 20 | 	train_path string, test_path string, pred_path string,
 21 | 	params map[string]string) (float64, []*eval.LabelPrediction, error) {
 22 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
 23 | 	train_dataset := core.NewDataSet()
 24 | 
 25 | 	err := train_dataset.Load(train_path, global)
 26 | 
 27 | 	if err != nil {
 28 | 		return 0.5, nil, err
 29 | 	}
 30 | 
 31 | 	test_dataset := core.NewDataSet()
 32 | 	err = test_dataset.Load(test_path, global)
 33 | 	if err != nil {
 34 | 		return 0.5, nil, err
 35 | 	}
 36 | 	classifier.Init(params)
 37 | 	auc, predictions := AlgorithmRunOnDataSet(classifier, train_dataset,
 38 | 		test_dataset, pred_path, params)
 39 | 
 40 | 	return auc, predictions, nil
 41 | }
 42 | 
 43 | func AlgorithmTrain(classifier algo.Classifier, train_path string,
 44 | 	params map[string]string) error {
 45 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
 46 | 	train_dataset := core.NewDataSet()
 47 | 
 48 | 	err := train_dataset.Load(train_path, global)
 49 | 
 50 | 	if err != nil {
 51 | 		return err
 52 | 	}
 53 | 
 54 | 	classifier.Init(params)
 55 | 	classifier.Train(train_dataset)
 56 | 
 57 | 	model_path, _ := params["model"]
 58 | 
 59 | 	if model_path != "" {
 60 | 		classifier.SaveModel(model_path)
 61 | 	}
 62 | 
 63 | 	return nil
 64 | }
 65 | 
 66 | func AlgorithmTest(classifier algo.Classifier, test_path string, pred_path string, params map[string]string) (float64, []*eval.LabelPrediction, error) {
 67 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
 68 | 
 69 | 	model_path, _ := params["model"]
 70 | 	classifier.Init(params)
 71 | 	if model_path != "" {
 72 | 		classifier.LoadModel(model_path)
 73 | 	} else {
 74 | 		return 0.0, nil, nil
 75 | 	}
 76 | 
 77 | 	test_dataset := core.NewDataSet()
 78 | 	err := test_dataset.Load(test_path, global)
 79 | 	if err != nil {
 80 | 		return 0.0, nil, err
 81 | 	}
 82 | 
 83 | 	auc, predictions := AlgorithmRunOnDataSet(classifier, nil, test_dataset, pred_path, params)
 84 | 
 85 | 	return auc, predictions, nil
 86 | }
 87 | 
 88 | func AlgorithmRunOnDataSet(classifier algo.Classifier, train_dataset, test_dataset *core.DataSet, pred_path string, params map[string]string) (float64, []*eval.LabelPrediction) {
 89 | 
 90 | 	if train_dataset != nil {
 91 | 		classifier.Train(train_dataset)
 92 | 	}
 93 | 
 94 | 	predictions := []*eval.LabelPrediction{}
 95 | 	var pred_file *os.File
 96 | 	if pred_path != "" {
 97 | 		pred_file, _ = os.Create(pred_path)
 98 | 	}
 99 | 	for _, sample := range test_dataset.Samples {
100 | 		prediction := classifier.Predict(sample)
101 | 		if pred_file != nil {
102 | 			pred_file.WriteString(strconv.FormatFloat(prediction, 'g', 5, 64) + "\n")
103 | 		}
104 | 		predictions = append(predictions, &(eval.LabelPrediction{Label: sample.Label, Prediction: prediction}))
105 | 	}
106 | 	if pred_path != "" {
107 | 		defer pred_file.Close()
108 | 	}
109 | 
110 | 	auc := eval.AUC(predictions)
111 | 	return auc, predictions
112 | }
113 | 
114 | /* Regression */
115 | func RegAlgorithmRun(regressor algo.Regressor, train_path string, test_path string, pred_path string, params map[string]string) (float64, []*eval.RealPrediction, error) {
116 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
117 | 	train_dataset := core.NewRealDataSet()
118 | 
119 | 	err := train_dataset.Load(train_path, global)
120 | 
121 | 	if err != nil {
122 | 		return 0.5, nil, err
123 | 	}
124 | 
125 | 	test_dataset := core.NewRealDataSet()
126 | 	err = test_dataset.Load(test_path, global)
127 | 	if err != nil {
128 | 		return 0.5, nil, err
129 | 	}
130 | 	regressor.Init(params)
131 | 	rmse, predictions := RegAlgorithmRunOnDataSet(regressor, train_dataset, test_dataset, pred_path, params)
132 | 
133 | 	return rmse, predictions, nil
134 | }
135 | 
136 | func RegAlgorithmTrain(regressor algo.Regressor, train_path string, params map[string]string) error {
137 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
138 | 	train_dataset := core.NewRealDataSet()
139 | 
140 | 	err := train_dataset.Load(train_path, global)
141 | 
142 | 	if err != nil {
143 | 		return err
144 | 	}
145 | 
146 | 	regressor.Init(params)
147 | 	regressor.Train(train_dataset)
148 | 
149 | 	model_path, _ := params["model"]
150 | 
151 | 	if model_path != "" {
152 | 		regressor.SaveModel(model_path)
153 | 	}
154 | 
155 | 	return nil
156 | }
157 | 
158 | func RegAlgorithmTest(regressor algo.Regressor, test_path string, pred_path string, params map[string]string) (float64, []*eval.RealPrediction, error) {
159 | 	global, _ := strconv.ParseInt(params["global"], 10, 64)
160 | 
161 | 	model_path, _ := params["model"]
162 | 	regressor.Init(params)
163 | 	if model_path != "" {
164 | 		regressor.LoadModel(model_path)
165 | 	} else {
166 | 		return 0.0, nil, nil
167 | 	}
168 | 
169 | 	test_dataset := core.NewRealDataSet()
170 | 	err := test_dataset.Load(test_path, global)
171 | 	if err != nil {
172 | 		return 0.0, nil, err
173 | 	}
174 | 
175 | 	rmse, predictions := RegAlgorithmRunOnDataSet(regressor, nil, test_dataset, pred_path, params)
176 | 
177 | 	return rmse, predictions, nil
178 | }
179 | 
180 | func RegAlgorithmRunOnDataSet(regressor algo.Regressor, train_dataset, test_dataset *core.RealDataSet, pred_path string, params map[string]string) (float64, []*eval.RealPrediction) {
181 | 
182 | 	if train_dataset != nil {
183 | 		regressor.Train(train_dataset)
184 | 	}
185 | 
186 | 	predictions := []*eval.RealPrediction{}
187 | 	var pred_file *os.File
188 | 	if pred_path != "" {
189 | 		pred_file, _ = os.Create(pred_path)
190 | 	}
191 | 	for _, sample := range test_dataset.Samples {
192 | 		prediction := regressor.Predict(sample)
193 | 		if pred_file != nil {
194 | 			pred_file.WriteString(strconv.FormatFloat(prediction, 'g', 5, 64) + "\n")
195 | 		}
196 | 		predictions = append(predictions, &eval.RealPrediction{Value: sample.Value, Prediction: prediction})
197 | 	}
198 | 	if pred_path != "" {
199 | 		defer pred_file.Close()
200 | 	}
201 | 
202 | 	rmse := eval.RegRMSE(predictions)
203 | 	return rmse, predictions
204 | }
205 | 


--------------------------------------------------------------------------------
/params.go:
--------------------------------------------------------------------------------
  1 | package hector
  2 | 
  3 | import (
  4 | 	"flag"
  5 | 	"fmt"
  6 | 	"math/rand"
  7 | 	"runtime"
  8 | 	"strconv"
  9 | 	"time"
 10 | 
 11 | 	"github.com/xlvector/hector/algo"
 12 | 	"github.com/xlvector/hector/ann"
 13 | 	"github.com/xlvector/hector/dt"
 14 | 	"github.com/xlvector/hector/fm"
 15 | 	"github.com/xlvector/hector/gp"
 16 | 	"github.com/xlvector/hector/lr"
 17 | 	"github.com/xlvector/hector/sa"
 18 | 	"github.com/xlvector/hector/svm"
 19 | )
 20 | 
 21 | func GetMutliClassClassifier(method string) algo.MultiClassClassifier {
 22 | 	rand.Seed(time.Now().UTC().UnixNano())
 23 | 	var classifier algo.MultiClassClassifier
 24 | 
 25 | 	if method == "rf" {
 26 | 		classifier = &(dt.RandomForest{})
 27 | 	} else if method == "cart" {
 28 | 		classifier = &(dt.CART{})
 29 | 	} else if method == "rdt" {
 30 | 		classifier = &(dt.RandomDecisionTree{})
 31 | 	} else if method == "knn" {
 32 | 		classifier = &(svm.KNN{})
 33 | 	} else if method == "ann" {
 34 | 		classifier = &(ann.NeuralNetwork{})
 35 | 	}
 36 | 	return classifier
 37 | }
 38 | 
 39 | func GetClassifier(method string) algo.Classifier {
 40 | 	rand.Seed(time.Now().UTC().UnixNano())
 41 | 	var classifier algo.Classifier
 42 | 
 43 | 	if method == "lr" {
 44 | 		classifier = &(lr.LogisticRegression{})
 45 | 	} else if method == "ftrl" {
 46 | 		classifier = &(lr.FTRLLogisticRegression{})
 47 | 	} else if method == "ep" {
 48 | 		classifier = &(lr.EPLogisticRegression{})
 49 | 	} else if method == "rdt" {
 50 | 		classifier = &(dt.RandomDecisionTree{})
 51 | 	} else if method == "cart" {
 52 | 		classifier = &(dt.CART{})
 53 | 	} else if method == "cart-regression" {
 54 | 		classifier = &(dt.RegressionTree{})
 55 | 	} else if method == "rf" {
 56 | 		classifier = &(dt.RandomForest{})
 57 | 	} else if method == "fm" {
 58 | 		classifier = &(fm.FactorizeMachine{})
 59 | 	} else if method == "sa" {
 60 | 		classifier = &(sa.SAOptAUC{})
 61 | 	} else if method == "gbdt" {
 62 | 		classifier = &(dt.GBDT{})
 63 | 	} else if method == "svm" {
 64 | 		classifier = &(svm.SVM{})
 65 | 	} else if method == "linear_svm" {
 66 | 		classifier = &(svm.LinearSVM{})
 67 | 	} else if method == "l1vm" {
 68 | 		classifier = &(svm.L1VM{})
 69 | 	} else if method == "knn" {
 70 | 		classifier = &(svm.KNN{})
 71 | 	} else if method == "ann" {
 72 | 		classifier = &(ann.NeuralNetwork{})
 73 | 	} else if method == "lr_owlqn" {
 74 | 		classifier = &(lr.LROWLQN{})
 75 | 	} else {
 76 | 		classifier = &(lr.LogisticRegression{})
 77 | 	}
 78 | 	return classifier
 79 | }
 80 | 
 81 | func GetRegressor(method string) algo.Regressor {
 82 | 	rand.Seed(time.Now().UTC().UnixNano())
 83 | 
 84 | 	var regressor algo.Regressor
 85 | 
 86 | 	if method == "gp" {
 87 | 		regressor = &(gp.GaussianProcess{})
 88 | 	}
 89 | 	return regressor
 90 | }
 91 | 
 92 | func PrepareParams() (string, string, string, string, map[string]string) {
 93 | 	params := make(map[string]string)
 94 | 	train_path := flag.String("train", "train.tsv", "path of training file")
 95 | 	test_path := flag.String("test", "test.tsv", "path of testing file")
 96 | 	pred_path := flag.String("pred", "", "path of pred file")
 97 | 	output := flag.String("output", "", "output file path")
 98 | 	verbose := flag.Int("v", 0, "verbose output if 1")
 99 | 	learning_rate := flag.String("learning-rate", "0.01", "learning rate")
100 | 	learning_rate_discount := flag.String("learning-rate-discount", "1.0", "discount rate of learning rate per training step")
101 | 	regularization := flag.String("regularization", "0.01", "regularization")
102 | 	alpha := flag.String("alpha", "0.1", "alpha of ftrl")
103 | 	beta := flag.String("beta", "1", "beta of ftrl")
104 | 	c := flag.String("c", "1", "C in svm")
105 | 	e := flag.String("e", "0.01", "stop threshold")
106 | 	lambda1 := flag.String("lambda1", "0.1", "lambda1 of ftrl")
107 | 	lambda2 := flag.String("lambda2", "0.1", "lambda2 of ftrl")
108 | 	tree_count := flag.String("tree-count", "10", "tree count in rdt/rf")
109 | 	feature_count := flag.String("feature-count", "1.0", "feature count in rdt/rf")
110 | 	gini := flag.String("gini", "1.0", "gini threshold, between (0, 0.5]")
111 | 	min_leaf_size := flag.String("min-leaf-size", "10", "min leaf size in dt")
112 | 	max_depth := flag.String("max-depth", "10", "max depth of dt")
113 | 	factors := flag.String("factors", "10", "factor number in factorized machine")
114 | 	steps := flag.Int("steps", 1, "steps before convergent")
115 | 	global := flag.Int64("global", -1, "feature id of global bias")
116 | 	method := flag.String("method", "lr", "algorithm name")
117 | 	cv := flag.Int("cv", 7, "cross validation folder count")
118 | 	k := flag.String("k", "3", "neighborhood size of knn")
119 | 	radius := flag.String("radius", "1.0", "radius of RBF kernel")
120 | 	sv := flag.String("sv", "8", "support vector count for l1vm")
121 | 	hidden := flag.Int64("hidden", 1, "hidden neuron number")
122 | 	profile := flag.String("profile", "", "profile file name")
123 | 	model := flag.String("model", "", "model file name")
124 | 	action := flag.String("action", "", "train or test, do both if action is empty string")
125 | 	core := flag.Int("core", 1, "core number when run program")
126 | 	dt_sample_ratio := flag.String("dt-sample-ratio", "1.0", "sampling ratio when split feature in decision tree")
127 | 	dim := flag.String("dim", "1", "input space dimension")
128 | 	port := flag.String("port", "8080", "port")
129 | 
130 | 	flag.Parse()
131 | 	runtime.GOMAXPROCS(*core)
132 | 	fmt.Println(*train_path)
133 | 	fmt.Println(*test_path)
134 | 	fmt.Println(*method)
135 | 	params["port"] = *port
136 | 	params["verbose"] = strconv.FormatInt(int64(*verbose), 10)
137 | 	params["learning-rate"] = *learning_rate
138 | 	params["learning-rate-discount"] = *learning_rate_discount
139 | 	params["regularization"] = *regularization
140 | 	params["alpha"] = *alpha
141 | 	params["beta"] = *beta
142 | 	params["lambda1"] = *lambda1
143 | 	params["lambda2"] = *lambda2
144 | 	params["tree-count"] = *tree_count
145 | 	params["feature-count"] = *feature_count
146 | 	params["max-depth"] = *max_depth
147 | 	params["min-leaf-size"] = *min_leaf_size
148 | 	params["steps"] = strconv.FormatInt(int64(*steps), 10)
149 | 	params["global"] = strconv.FormatInt(*global, 10)
150 | 	params["gini"] = *gini
151 | 	params["factors"] = *factors
152 | 	params["output"] = *output
153 | 	params["c"] = *c
154 | 	params["e"] = *e
155 | 	params["k"] = *k
156 | 	params["cv"] = strconv.FormatInt(int64(*cv), 10)
157 | 	params["radius"] = *radius
158 | 	params["sv"] = *sv
159 | 	params["hidden"] = strconv.FormatInt(int64(*hidden), 10)
160 | 	params["profile"] = *profile
161 | 	params["action"] = *action
162 | 	params["model"] = *model
163 | 	params["method"] = *method
164 | 	params["dt-sample-ratio"] = *dt_sample_ratio
165 | 	params["dim"] = *dim
166 | 
167 | 	fmt.Println(params)
168 | 	return *train_path, *test_path, *pred_path, *method, params
169 | }
170 | 


--------------------------------------------------------------------------------
/dt/regression_tree.go:
--------------------------------------------------------------------------------
  1 | package dt
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"container/list"
  6 | 	"github.com/xlvector/hector/core"
  7 | 	"io/ioutil"
  8 | 	"os"
  9 | 	"sort"
 10 | 	"strconv"
 11 | )
 12 | 
 13 | type RegressionTree struct {
 14 | 	tree   Tree
 15 | 	params CARTParams
 16 | }
 17 | 
 18 | func (self *RegressionTree) SaveModel(path string) {
 19 | 	ioutil.WriteFile(path, self.tree.ToString(), 0600)
 20 | }
 21 | 
 22 | func (self *RegressionTree) LoadModel(path string) {
 23 | 	file, _ := os.Open(path)
 24 | 	defer file.Close()
 25 | 	text := ""
 26 | 	scanner := bufio.NewScanner(file)
 27 | 	for scanner.Scan() {
 28 | 		text += scanner.Text() + "\n"
 29 | 	}
 30 | 	self.tree.FromString(string(text))
 31 | }
 32 | 
 33 | func (dt *RegressionTree) GoLeft(sample *core.MapBasedSample, feature_split core.Feature) bool {
 34 | 	value, ok := sample.Features[feature_split.Id]
 35 | 	if ok && value >= feature_split.Value {
 36 | 		return true
 37 | 	} else {
 38 | 		return false
 39 | 	}
 40 | }
 41 | 
 42 | func (dt *RegressionTree) GetElementFromQueue(queue *list.List, n int) []*TreeNode {
 43 | 	ret := []*TreeNode{}
 44 | 	for i := 0; i < n; i++ {
 45 | 		node := queue.Front()
 46 | 		if node == nil {
 47 | 			break
 48 | 		}
 49 | 		ret = append(ret, (node.Value.(*TreeNode)))
 50 | 		queue.Remove(node)
 51 | 	}
 52 | 	return ret
 53 | }
 54 | 
 55 | func (dt *RegressionTree) FindBestSplit(samples []*core.MapBasedSample, node *TreeNode, select_features map[int64]bool) {
 56 | 	feature_weight_labels := make(map[int64]*core.FeatureGoalDistribution)
 57 | 	sum_total := 0.0
 58 | 	sum_total2 := 0.0
 59 | 	count_total := 0.0
 60 | 	for _, k := range node.samples {
 61 | 		sum_total += samples[k].Prediction
 62 | 		sum_total2 += samples[k].Prediction * samples[k].Prediction
 63 | 		count_total += 1.0
 64 | 	}
 65 | 
 66 | 	feature_sum_right := core.NewVector()
 67 | 	feature_sum_right2 := core.NewVector()
 68 | 	feature_count_right := core.NewVector()
 69 | 
 70 | 	for _, k := range node.samples {
 71 | 		for fid, fvalue := range samples[k].Features {
 72 | 			feature_count_right.AddValue(fid, 1.0)
 73 | 			feature_sum_right.AddValue(fid, samples[k].Prediction)
 74 | 			feature_sum_right2.AddValue(fid, samples[k].Prediction*samples[k].Prediction)
 75 | 			_, ok := feature_weight_labels[fid]
 76 | 			if !ok {
 77 | 				feature_weight_labels[fid] = core.NewFeatureGoalDistribution()
 78 | 			}
 79 | 			feature_weight_labels[fid].AddWeightGoal(fvalue, samples[k].Prediction)
 80 | 		}
 81 | 	}
 82 | 
 83 | 	min_vari := 1e20
 84 | 	node.feature_split = core.Feature{Id: -1, Value: 0}
 85 | 	for fid, distribution := range feature_weight_labels {
 86 | 		sort.Sort(distribution)
 87 | 		split, vari := distribution.BestSplitByVariance(sum_total-feature_sum_right.GetValue(fid),
 88 | 			sum_total2-feature_sum_right2.GetValue(fid),
 89 | 			count_total-feature_count_right.GetValue(fid),
 90 | 			feature_sum_right.GetValue(fid),
 91 | 			feature_sum_right2.GetValue(fid),
 92 | 			feature_count_right.GetValue(fid))
 93 | 		if min_vari > vari {
 94 | 			min_vari = vari
 95 | 			node.feature_split.Id = fid
 96 | 			node.feature_split.Value = split
 97 | 		}
 98 | 	}
 99 | }
100 | 
101 | func (dt *RegressionTree) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree, select_features map[int64]bool) {
102 | 	if node.depth >= dt.params.MaxDepth {
103 | 		return
104 | 	}
105 | 
106 | 	dt.FindBestSplit(samples, node, select_features)
107 | 
108 | 	if node.feature_split.Id < 0 {
109 | 		return
110 | 	}
111 | 	left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}
112 | 	right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}
113 | 
114 | 	left_positive := 0.0
115 | 	left_total := 0.0
116 | 	right_positive := 0.0
117 | 	right_total := 0.0
118 | 	for _, k := range node.samples {
119 | 		if dt.GoLeft(samples[k], node.feature_split) {
120 | 			left_node.samples = append(left_node.samples, k)
121 | 			left_positive += samples[k].Prediction
122 | 			left_total += 1.0
123 | 		} else {
124 | 			right_node.samples = append(right_node.samples, k)
125 | 			right_positive += samples[k].Prediction
126 | 			right_total += 1.0
127 | 		}
128 | 	}
129 | 	node.samples = nil
130 | 
131 | 	if len(left_node.samples) > dt.params.MinLeafSize {
132 | 		left_node.sample_count = len(left_node.samples)
133 | 		left_node.prediction.SetValue(0, left_positive/left_total)
134 | 		queue.PushBack(&left_node)
135 | 		node.left = len(tree.nodes)
136 | 		tree.AddTreeNode(&left_node)
137 | 	}
138 | 
139 | 	if len(right_node.samples) > dt.params.MinLeafSize {
140 | 		right_node.sample_count = len(right_node.samples)
141 | 		right_node.prediction.SetValue(0, right_positive/right_total)
142 | 		queue.PushBack(&right_node)
143 | 		node.right = len(tree.nodes)
144 | 		tree.AddTreeNode(&right_node)
145 | 	}
146 | }
147 | 
148 | func (dt *RegressionTree) SingleTreeBuild(samples []*core.MapBasedSample, select_features map[int64]bool) Tree {
149 | 	tree := Tree{}
150 | 	queue := list.New()
151 | 	root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}
152 | 	total := 0.0
153 | 	positive := 0.0
154 | 	for i, sample := range samples {
155 | 		root.AddSample(i)
156 | 		total += 1.0
157 | 		positive += sample.Prediction
158 | 	}
159 | 	root.sample_count = len(root.samples)
160 | 	root.prediction.SetValue(0, positive/total)
161 | 
162 | 	queue.PushBack(&root)
163 | 	tree.AddTreeNode(&root)
164 | 	for {
165 | 		nodes := dt.GetElementFromQueue(queue, 10)
166 | 		if len(nodes) == 0 {
167 | 			break
168 | 		}
169 | 
170 | 		for _, node := range nodes {
171 | 			dt.AppendNodeToTree(samples, node, queue, &tree, select_features)
172 | 		}
173 | 	}
174 | 	return tree
175 | }
176 | 
177 | func (dt *RegressionTree) PredictBySingleTree(tree *Tree, sample *core.MapBasedSample) (*TreeNode, string) {
178 | 	path := ""
179 | 	node := tree.GetNode(0)
180 | 	path += node.ToString()
181 | 	for {
182 | 		if dt.GoLeft(sample, node.feature_split) {
183 | 			if node.left >= 0 && node.left < tree.Size() {
184 | 				node = tree.GetNode(node.left)
185 | 				path += "-" + node.ToString()
186 | 			} else {
187 | 				break
188 | 			}
189 | 		} else {
190 | 			if node.right >= 0 && node.right < tree.Size() {
191 | 				node = tree.GetNode(node.right)
192 | 				path += "+" + node.ToString()
193 | 			} else {
194 | 				break
195 | 			}
196 | 		}
197 | 	}
198 | 	return node, path
199 | }
200 | 
201 | func (dt *RegressionTree) Train(dataset *core.DataSet) {
202 | 	samples := []*core.MapBasedSample{}
203 | 	for _, sample := range dataset.Samples {
204 | 		msample := sample.ToMapBasedSample()
205 | 		samples = append(samples, msample)
206 | 	}
207 | 	dt.tree = dt.SingleTreeBuild(samples, nil)
208 | }
209 | 
210 | func (dt *RegressionTree) Predict(sample *core.Sample) float64 {
211 | 	msample := sample.ToMapBasedSample()
212 | 	node, _ := dt.PredictBySingleTree(&dt.tree, msample)
213 | 	return node.prediction.GetValue(0)
214 | }
215 | 
216 | func (dt *RegressionTree) Init(params map[string]string) {
217 | 	dt.tree = Tree{}
218 | 	min_leaf_size, _ := strconv.ParseInt(params["min-leaf-size"], 10, 32)
219 | 	max_depth, _ := strconv.ParseInt(params["max-depth"], 10, 32)
220 | 
221 | 	dt.params.MinLeafSize = int(min_leaf_size)
222 | 	dt.params.MaxDepth = int(max_depth)
223 | 	dt.params.GiniThreshold, _ = strconv.ParseFloat(params["gini"], 64)
224 | }
225 | 


--------------------------------------------------------------------------------
/dt/random_decision_tree.go:
--------------------------------------------------------------------------------
  1 | package dt
  2 | 
  3 | import (
  4 | 	"container/list"
  5 | 	"fmt"
  6 | 	"math/rand"
  7 | 	"strconv"
  8 | 	"strings"
  9 | 	"sync"
 10 | 
 11 | 	"github.com/xlvector/hector/core"
 12 | 	"github.com/xlvector/hector/util"
 13 | )
 14 | 
 15 | type TreeNode struct {
 16 | 	left, right, depth int
 17 | 	prediction         *core.ArrayVector
 18 | 	sample_count       int
 19 | 	samples            []int
 20 | 	feature_split      core.Feature
 21 | }
 22 | 
 23 | func (t *TreeNode) ToString() string {
 24 | 	return strconv.FormatInt(t.feature_split.Id, 10) + ":" + strconv.FormatFloat(t.feature_split.Value, 'g', 3, 64)
 25 | }
 26 | 
 27 | func (t *TreeNode) AddSample(k int) {
 28 | 	t.samples = append(t.samples, k)
 29 | }
 30 | 
 31 | type Tree struct {
 32 | 	nodes []*TreeNode
 33 | }
 34 | 
 35 | func (t *Tree) AddTreeNode(n *TreeNode) {
 36 | 	t.nodes = append(t.nodes, n)
 37 | }
 38 | 
 39 | func (t *Tree) Size() int {
 40 | 	return len(t.nodes)
 41 | }
 42 | 
 43 | func (t *Tree) GetNode(i int) *TreeNode {
 44 | 	return t.nodes[i]
 45 | }
 46 | 
 47 | func (t *Tree) ToString() []byte {
 48 | 	sb := util.StringBuilder{}
 49 | 	sb.Int(len(t.nodes))
 50 | 	sb.Write("\n")
 51 | 	for i, node := range t.nodes {
 52 | 		sb.Int(i)
 53 | 		sb.Write("\t")
 54 | 		sb.Int(node.left)
 55 | 		sb.Write("\t")
 56 | 		sb.Int(node.right)
 57 | 		sb.Write("\t")
 58 | 		sb.Int(node.depth)
 59 | 		sb.Write("\t")
 60 | 		sb.WriteBytes(node.prediction.ToString())
 61 | 		sb.Write("\t")
 62 | 		sb.Int(node.sample_count)
 63 | 		sb.Write("\t")
 64 | 		sb.Int64(node.feature_split.Id)
 65 | 		sb.Write("\t")
 66 | 		sb.Float(node.feature_split.Value)
 67 | 		sb.Write("\n")
 68 | 	}
 69 | 	return sb.Bytes()
 70 | }
 71 | 
 72 | func (t *Tree) fromString(lines []string) {
 73 | 	size, _ := strconv.Atoi(lines[0])
 74 | 	t.nodes = make([]*TreeNode, size+1, size+1)
 75 | 	for _, line := range lines[1:] {
 76 | 		if len(line) == 0 {
 77 | 			break
 78 | 		}
 79 | 		tks := strings.Split(line, "\t")
 80 | 		node := TreeNode{}
 81 | 		i, _ := strconv.Atoi(tks[0])
 82 | 		node.left, _ = strconv.Atoi(tks[1])
 83 | 		node.right, _ = strconv.Atoi(tks[2])
 84 | 		node.depth, _ = strconv.Atoi(tks[3])
 85 | 		node.prediction = core.NewArrayVector()
 86 | 		node.prediction.FromString(tks[4])
 87 | 		node.sample_count, _ = strconv.Atoi(tks[5])
 88 | 		node.feature_split = core.Feature{}
 89 | 		node.feature_split.Id, _ = strconv.ParseInt(tks[6], 10, 64)
 90 | 		node.feature_split.Value, _ = strconv.ParseFloat(tks[7], 64)
 91 | 		t.nodes[i] = &node
 92 | 	}
 93 | }
 94 | 
 95 | func (t *Tree) FromString(buf string) {
 96 | 	lines := strings.Split(buf, "\n")
 97 | 	t.fromString(lines)
 98 | }
 99 | 
100 | type RDTParams struct {
101 | 	TreeCount   int
102 | 	MinLeafSize int
103 | 	MaxDepth    int
104 | }
105 | 
106 | type RandomDecisionTree struct {
107 | 	trees  []*Tree
108 | 	params RDTParams
109 | }
110 | 
111 | func (self *RandomDecisionTree) SaveModel(path string) {
112 | 
113 | }
114 | 
115 | func (self *RandomDecisionTree) LoadModel(path string) {
116 | 
117 | }
118 | 
119 | func (rdt *RandomDecisionTree) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree) {
120 | 	node.prediction = core.NewArrayVector()
121 | 	for _, k := range node.samples {
122 | 		node.prediction.AddValue(samples[k].Label, 1.0)
123 | 	}
124 | 	node.prediction.Scale(1.0 / node.prediction.Sum())
125 | 
126 | 	random_sample := samples[node.samples[rand.Intn(len(node.samples))]]
127 | 
128 | 	split := core.Feature{Id: -1, Value: -1.0}
129 | 	for fid, fvalue := range random_sample.Features {
130 | 		if split.Id < 0 || rand.Intn(len(random_sample.Features)) == 0 {
131 | 			split.Id = fid
132 | 			split.Value = fvalue
133 | 		}
134 | 	}
135 | 
136 | 	if split.Id < 0 || node.depth > rdt.params.MaxDepth {
137 | 		return
138 | 	}
139 | 
140 | 	node.feature_split = split
141 | 	left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}
142 | 	right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}
143 | 
144 | 	for _, k := range node.samples {
145 | 		if DTGoLeft(samples[k], node.feature_split) {
146 | 			left_node.samples = append(left_node.samples, k)
147 | 		} else {
148 | 			right_node.samples = append(right_node.samples, k)
149 | 		}
150 | 	}
151 | 	node.samples = nil
152 | 
153 | 	if len(left_node.samples) == 0 || len(right_node.samples) == 0 {
154 | 		return
155 | 	}
156 | 
157 | 	if len(left_node.samples) > rdt.params.MinLeafSize {
158 | 		queue.PushBack(&left_node)
159 | 		node.left = len(tree.nodes)
160 | 		tree.AddTreeNode(&left_node)
161 | 	}
162 | 
163 | 	if len(right_node.samples) > rdt.params.MinLeafSize {
164 | 		queue.PushBack(&right_node)
165 | 		node.right = len(tree.nodes)
166 | 		tree.AddTreeNode(&right_node)
167 | 	}
168 | }
169 | 
170 | func (rdt *RandomDecisionTree) SingleTreeBuild(samples []*core.MapBasedSample) Tree {
171 | 	tree := Tree{}
172 | 	queue := list.New()
173 | 	root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}
174 | 
175 | 	for i := 0; i < len(samples); i++ {
176 | 		k := rand.Intn(len(samples))
177 | 		root.AddSample(k)
178 | 		root.prediction.AddValue(samples[k].Label, 1.0)
179 | 	}
180 | 	root.sample_count = len(root.samples)
181 | 	root.prediction.Scale(1.0 / root.prediction.Sum())
182 | 
183 | 	queue.PushBack(&root)
184 | 	tree.AddTreeNode(&root)
185 | 	for {
186 | 		nodes := DTGetElementFromQueue(queue, 10)
187 | 		if len(nodes) == 0 {
188 | 			break
189 | 		}
190 | 
191 | 		for _, node := range nodes {
192 | 			rdt.AppendNodeToTree(samples, node, queue, &tree)
193 | 		}
194 | 	}
195 | 	return tree
196 | }
197 | 
198 | func (rdt *RandomDecisionTree) RandomShuffle(features []core.Feature) {
199 | 	for i := range features {
200 | 		j := rand.Intn(i + 1)
201 | 		features[i], features[j] = features[j], features[i]
202 | 	}
203 | }
204 | 
205 | func (rdt *RandomDecisionTree) Train(dataset *core.DataSet) {
206 | 	samples := []*core.MapBasedSample{}
207 | 	for _, sample := range dataset.Samples {
208 | 		samples = append(samples, sample.ToMapBasedSample())
209 | 	}
210 | 	dataset.Samples = nil
211 | 
212 | 	forest := make(chan *Tree, rdt.params.TreeCount)
213 | 	var wait sync.WaitGroup
214 | 	wait.Add(rdt.params.TreeCount)
215 | 	for k := 0; k < rdt.params.TreeCount; k++ {
216 | 		go func() {
217 | 			tree := rdt.SingleTreeBuild(samples)
218 | 			forest <- &tree
219 | 			fmt.Printf(".")
220 | 			wait.Done()
221 | 		}()
222 | 	}
223 | 	wait.Wait()
224 | 	fmt.Println()
225 | 	close(forest)
226 | 	for tree := range forest {
227 | 		rdt.trees = append(rdt.trees, tree)
228 | 	}
229 | }
230 | 
231 | func (rdt *RandomDecisionTree) Predict(sample *core.Sample) float64 {
232 | 	ret := 0.0
233 | 	total := 0.0
234 | 	msample := sample.ToMapBasedSample()
235 | 	for _, tree := range rdt.trees {
236 | 		node, _ := PredictBySingleTree(tree, msample)
237 | 		ret += node.prediction.GetValue(1)
238 | 		total += 1.0
239 | 	}
240 | 	return ret / total
241 | }
242 | 
243 | func (rdt *RandomDecisionTree) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
244 | 	msample := sample.ToMapBasedSample()
245 | 	predictions := core.NewArrayVector()
246 | 	total := 0.0
247 | 	for _, tree := range rdt.trees {
248 | 		node, _ := PredictBySingleTree(tree, msample)
249 | 		predictions.AddVector(node.prediction, 1.0)
250 | 		total += 1.0
251 | 	}
252 | 	predictions.Scale(1.0 / total)
253 | 	return predictions
254 | }
255 | 
256 | func (rdt *RandomDecisionTree) Init(params map[string]string) {
257 | 	rdt.trees = []*Tree{}
258 | 	rdt.params.MinLeafSize, _ = strconv.Atoi(params["min-leaf-size"])
259 | 	rdt.params.TreeCount, _ = strconv.Atoi(params["tree-count"])
260 | 	rdt.params.MaxDepth, _ = strconv.Atoi(params["max-depth"])
261 | }
262 | 


--------------------------------------------------------------------------------
/core/feature_analyze.go:
--------------------------------------------------------------------------------
  1 | package core
  2 | 
  3 | import(
  4 | 	"sort"
  5 | 	"math"
  6 | )
  7 | 
  8 | type WeightLabel struct {
  9 | 	weight float64
 10 | 	label int
 11 | }
 12 | 
 13 | func (self *WeightLabel) LabelDoubleValue() float64{
 14 | 	return float64(self.label)
 15 | }
 16 | 
 17 | type FeatureLabelDistribution struct {
 18 | 	weight_label []WeightLabel
 19 | }
 20 | 
 21 | type WeightGoal struct {
 22 | 	weight float64
 23 | 	goal float64
 24 | }
 25 | 
 26 | type FeatureGoalDistribution struct {
 27 | 	weight_goal []WeightGoal
 28 | }
 29 | 
 30 | func NewFeatureLabelDistribution() *FeatureLabelDistribution{
 31 | 	ret := FeatureLabelDistribution{}
 32 | 	ret.weight_label = []WeightLabel{}
 33 | 	return &ret
 34 | }
 35 | 
 36 | func NewFeatureGoalDistribution() *FeatureGoalDistribution{
 37 | 	ret := FeatureGoalDistribution{}
 38 | 	ret.weight_goal = []WeightGoal{}
 39 | 	return &ret
 40 | }
 41 | 
 42 | func (f *FeatureLabelDistribution) AddWeightLabel(weight float64, label int){
 43 | 	wl := WeightLabel{weight:weight, label:label}
 44 | 	f.weight_label = append(f.weight_label, wl)
 45 | }
 46 | 
 47 | func (f *FeatureGoalDistribution) AddWeightGoal(weight float64, goal float64){
 48 | 	wl := WeightGoal{weight:weight, goal:goal}
 49 | 	f.weight_goal = append(f.weight_goal, wl)
 50 | }
 51 | 
 52 | func (f *FeatureLabelDistribution) Len() int {
 53 | 	return len(f.weight_label)
 54 | }
 55 | 
 56 | func (f *FeatureLabelDistribution) Swap(i, j int) {
 57 | 	f.weight_label[i], f.weight_label[j] = f.weight_label[j], f.weight_label[i]
 58 | }
 59 | 
 60 | func (f *FeatureLabelDistribution) Less(i, j int) bool {
 61 | 	return (f.weight_label[i].weight < f.weight_label[j].weight)
 62 | }
 63 | 
 64 | func (f *FeatureGoalDistribution) Len() int {
 65 | 	return len(f.weight_goal)
 66 | }
 67 | 
 68 | func (f *FeatureGoalDistribution) Swap(i, j int) {
 69 | 	f.weight_goal[i], f.weight_goal[j] = f.weight_goal[j], f.weight_goal[i]
 70 | }
 71 | 
 72 | func (f *FeatureGoalDistribution) Less(i, j int) bool {
 73 | 	return (f.weight_goal[i].weight < f.weight_goal[j].weight)
 74 | }
 75 | 
 76 | func (f *FeatureLabelDistribution) PositiveCount() int {
 77 | 	ret := 0
 78 | 	for _, e := range f.weight_label{
 79 | 		ret += int(e.label)
 80 | 	}
 81 | 	return ret
 82 | }
 83 | 
 84 | func (f *FeatureLabelDistribution) LabelDistribution() *ArrayVector {
 85 | 	ret := NewArrayVector()
 86 | 	for _, e := range f.weight_label {
 87 | 		ret.AddValue(e.label, 1.0)
 88 | 	}
 89 | 	return ret
 90 | }
 91 | 
 92 | func (f *FeatureGoalDistribution) Variance(sum_left, sum_left2, count_left, sum_right, sum_right2, count_right float64) float64 {
 93 | 	mean_left := sum_left / count_left
 94 | 	mean_right := sum_right / count_right
 95 | 
 96 | 	return sum_left2 + sum_right2 - mean_left * mean_left * count_left - mean_right * mean_right * count_right
 97 | }
 98 | 
 99 | func (f *FeatureGoalDistribution) BestSplitByVariance(sum_left, sum_left2, count_left, sum_right, sum_right2, count_right float64) (float64, float64) {
100 | 
101 | 	min_vari := 100000.0
102 | 	split := f.weight_goal[0].weight - 1.0
103 | 	prev_weight := f.weight_goal[0].weight - 1.0
104 | 	for _, wl := range f.weight_goal{
105 | 		if prev_weight != wl.weight{
106 | 			vari := f.Variance(sum_left, sum_left2, count_left, sum_right, sum_right2, count_right)
107 | 			if vari < min_vari{
108 | 				min_vari = vari
109 | 				split = wl.weight
110 | 			}	
111 | 		}
112 | 		prev_weight = wl.weight
113 | 		sum_left += wl.goal
114 | 		sum_left2 += wl.goal * wl.goal
115 | 		count_left += 1.0
116 | 
117 | 		sum_right -= wl.goal
118 | 		sum_right2 -= wl.goal * wl.goal
119 | 		count_right -= 1.0
120 | 	}
121 | 	return split, min_vari
122 | }
123 | 
124 | /*
125 | func Gini(pleft, tleft, pright, tright float64) float64 {
126 | 	if tleft == 0.0 || tright == 0.0{
127 | 		return 1.0
128 | 	}
129 | 	p11 := pleft / tleft
130 | 	g1 := 1 - p11 * p11 - (1 - p11) * (1 - p11)
131 | 	p21 := pright / tright
132 | 	g2 := 1 - p21 * p21 - (1 - p21) * (1 - p21)
133 | 	ret := tleft * g1 / (tleft + tright) + tright * g2 / (tleft + tright)
134 | 	return ret	
135 | }
136 | */
137 | 
138 | func Gini(left_dis, right_dis *ArrayVector) float64 {
139 | 	left_sum := left_dis.Sum()
140 | 	right_sum := right_dis.Sum()
141 | 
142 | 	if left_sum == 0.0 || right_sum == 0.0 {
143 | 		return 1.0
144 | 	}
145 | 
146 | 	left_gini := 1.0
147 | 	for _, p := range left_dis.data {
148 | 		left_gini -= (p / left_sum) * (p / left_sum)
149 | 	}
150 | 
151 | 	right_gini := 1.0
152 | 	for _, p := range right_dis.data {
153 | 		right_gini -= (p / right_sum) * (p / right_sum)
154 | 	}
155 | 	return (left_sum * left_gini + right_sum * right_gini) / (left_sum + right_sum)
156 | }
157 | 
158 | /*
159 | func (f *FeatureLabelDistribution) BestSplitByGini(total, positive int) (float64, float64) {
160 | 	pright := float64(f.PositiveCount())
161 | 	tright := float64(len(f.weight_label))
162 | 	pleft := float64(positive) - pright
163 | 	tleft := float64(total) - tright
164 | 	min_gini := Gini(pleft, tleft, pright, tright)
165 | 	split := f.weight_label[0].weight
166 | 	prev_weight := f.weight_label[0].weight
167 | 	for _, wl := range f.weight_label{
168 | 		if prev_weight != wl.weight{
169 | 			gini := Gini(pleft, tleft, pright, tright)
170 | 			if gini < min_gini{
171 | 				min_gini = gini
172 | 				split = wl.weight
173 | 			}	
174 | 		}
175 | 		prev_weight = wl.weight
176 | 		tleft += 1.0
177 | 		tright -= 1.0
178 | 		pleft += float64(wl.label)
179 | 		pright -= float64(wl.label)
180 | 	}
181 | 	return split, min_gini	
182 | }
183 | */
184 | 
185 | func (self *FeatureLabelDistribution) BestSplitByGini(total_dis *ArrayVector) (float64, float64) {
186 | 	left_dis := total_dis.Copy()
187 | 	right_dis := self.LabelDistribution()
188 | 	left_dis.AddVector(right_dis, -1.0)
189 | 
190 | 	min_gini := Gini(left_dis, right_dis)
191 | 	split := self. weight_label[0].weight
192 | 	prev_weight := self.weight_label[0].weight
193 | 	for _, wl := range self.weight_label {
194 | 		if prev_weight != wl.weight {
195 | 			gini := Gini(left_dis, right_dis)
196 | 			if gini < min_gini{
197 | 				min_gini = gini
198 | 				split = wl.weight
199 | 			}
200 | 		}
201 | 		prev_weight = wl.weight
202 | 		left_dis.AddValue(wl.label, 1.0)
203 | 		right_dis.AddValue(wl.label, -1.0)
204 | 	}
205 | 	return split, min_gini
206 | }
207 | 
208 | func (f *FeatureLabelDistribution) InformationValue(global_total, global_positive int) float64 {
209 | 	with_total := len(f.weight_label)
210 | 	with_positive := f.PositiveCount()
211 | 	
212 | 	positives := []int{}
213 | 	negatives := []int{}
214 | 	
215 | 	positives = append(positives, global_positive - with_positive)
216 | 	negatives = append(negatives, (global_total - global_positive) - (with_total - with_positive))
217 | 	
218 | 	sort.Sort(f)
219 | 	
220 | 	prev_c := -1
221 | 	pos := 0
222 | 	total := 0
223 | 	for i, e := range f.weight_label {
224 | 		c := int(200.0 * float64(i) / float64(with_total))
225 | 		if c != prev_c {
226 | 			if total > 0{
227 | 				positives = append(positives, pos)
228 | 				negatives = append(negatives, total - pos)
229 | 				pos = 0
230 | 				total = 0
231 | 			}	
232 | 		}
233 | 		prev_c = c
234 | 		pos += int(e.label)
235 | 		total += 1
236 | 	}
237 | 	if total > 0{
238 | 		positives = append(positives, pos)
239 | 		negatives = append(negatives, total - pos)
240 | 	}
241 | 	
242 | 	sum_positive := 0
243 | 	sum_negative := 0
244 | 	for _, v := range positives{
245 | 		sum_positive += v
246 | 	}
247 | 	for _, v := range negatives{
248 | 		sum_negative += v
249 | 	}
250 | 	iv := 0.0
251 | 	for i := range positives{
252 | 		positive_ratio := float64(positives[i]) / float64(sum_positive)
253 | 		negative_ratio := float64(negatives[i]) / float64(sum_negative)
254 | 		iv += (positive_ratio - negative_ratio) * math.Log((0.00001 + positive_ratio) / (0.00001 + negative_ratio))
255 | 	}
256 | 	return iv
257 | }
258 | 
259 | func InformationValue(dataset *DataSet) map[int64]float64 {
260 | 	feature_weight_labels := make(map[int64]*FeatureLabelDistribution)
261 | 	total := 0
262 | 	positive := 0
263 | 	for _,sample := range dataset.Samples {
264 | 		total += 1
265 | 		positive += int(sample.Label)
266 | 		for _, feature := range sample.Features {
267 | 			_, ok := feature_weight_labels[feature.Id]
268 | 			if !ok {
269 | 				feature_weight_labels[feature.Id] = NewFeatureLabelDistribution()
270 | 			}
271 | 			feature_weight_labels[feature.Id].AddWeightLabel(feature.Value, sample.Label)
272 | 		}
273 | 	}
274 | 	
275 | 	ret := make(map[int64]float64)
276 | 	
277 | 	for fid, distribution := range feature_weight_labels{
278 | 		ret[fid] = distribution.InformationValue(total, positive)
279 | 	}
280 | 	return ret
281 | }
282 | 


--------------------------------------------------------------------------------
/dt/cart.go:
--------------------------------------------------------------------------------
  1 | package dt
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"container/list"
  6 | 	"fmt"
  7 | 	"github.com/xlvector/hector/core"
  8 | 	"io/ioutil"
  9 | 	"math"
 10 | 	"math/rand"
 11 | 	"os"
 12 | 	"sort"
 13 | 	"strconv"
 14 | )
 15 | 
 16 | /*
 17 | CART is classification and regression tree, this class implement classification tree and use gini
 18 | to split features
 19 | */
 20 | type CART struct {
 21 | 	tree                Tree
 22 | 	params              CARTParams
 23 | 	continuous_features bool
 24 | 	salt                int64
 25 | }
 26 | 
 27 | func DTGoLeft(sample *core.MapBasedSample, feature_split core.Feature) bool {
 28 | 	value, ok := sample.Features[feature_split.Id]
 29 | 	if ok && value >= feature_split.Value {
 30 | 		return true
 31 | 	} else {
 32 | 		return false
 33 | 	}
 34 | }
 35 | 
 36 | func DTGetElementFromQueue(queue *list.List, n int) []*TreeNode {
 37 | 	ret := []*TreeNode{}
 38 | 	for i := 0; i < n; i++ {
 39 | 		node := queue.Front()
 40 | 		if node == nil {
 41 | 			break
 42 | 		}
 43 | 		ret = append(ret, (node.Value.(*TreeNode)))
 44 | 		queue.Remove(node)
 45 | 	}
 46 | 	return ret
 47 | }
 48 | 
 49 | func (dt *CART) RandByFeatureId(fid int64) float64 {
 50 | 	ret := fid*19857 + dt.salt
 51 | 	r := math.Abs(float64(ret%1000) / 1000.0)
 52 | 	return r
 53 | }
 54 | 
 55 | func (dt *CART) FindBestSplitOfContinusousFeature(samples []*core.MapBasedSample, node *TreeNode, feature_select_prob float64) {
 56 | 	feature_weight_labels := make(map[int64]*core.FeatureLabelDistribution)
 57 | 	total_dis := core.NewArrayVector()
 58 | 	for i, k := range node.samples {
 59 | 		if i > 10 && rand.Float64() > dt.params.SamplingRatio {
 60 | 			continue
 61 | 		}
 62 | 		total_dis.AddValue(samples[k].Label, 1.0)
 63 | 		for fid, fvalue := range samples[k].Features {
 64 | 			if dt.RandByFeatureId(fid) > feature_select_prob {
 65 | 				continue
 66 | 			}
 67 | 			_, ok := feature_weight_labels[fid]
 68 | 			if !ok {
 69 | 				feature_weight_labels[fid] = core.NewFeatureLabelDistribution()
 70 | 			}
 71 | 			feature_weight_labels[fid].AddWeightLabel(fvalue, samples[k].Label)
 72 | 		}
 73 | 	}
 74 | 
 75 | 	min_gini := 1.0
 76 | 	node.feature_split = core.Feature{Id: -1, Value: 0}
 77 | 	for fid, distribution := range feature_weight_labels {
 78 | 		sort.Sort(distribution)
 79 | 		split, gini := distribution.BestSplitByGini(total_dis)
 80 | 		if min_gini > gini {
 81 | 			min_gini = gini
 82 | 			node.feature_split.Id = fid
 83 | 			node.feature_split.Value = split
 84 | 		}
 85 | 	}
 86 | 	if min_gini > dt.params.GiniThreshold {
 87 | 		node.feature_split.Id = -1
 88 | 		node.feature_split.Value = 0.0
 89 | 	}
 90 | }
 91 | 
 92 | func (dt *CART) FindBestSplitOfBinaryFeature(samples []*core.MapBasedSample, node *TreeNode, feature_select_prob float64) {
 93 | 	feature_right_dis := make(map[int64]*core.ArrayVector)
 94 | 	total_dis := core.NewArrayVector()
 95 | 	for i, k := range node.samples {
 96 | 		if i > 10 && rand.Float64() > dt.params.SamplingRatio {
 97 | 			continue
 98 | 		}
 99 | 		total_dis.AddValue(samples[k].Label, 1.0)
100 | 		for fid, _ := range samples[k].Features {
101 | 			if dt.RandByFeatureId(fid) > feature_select_prob {
102 | 				continue
103 | 			}
104 | 			_, ok := feature_right_dis[fid]
105 | 			if !ok {
106 | 				feature_right_dis[fid] = core.NewArrayVector()
107 | 			}
108 | 			feature_right_dis[fid].AddValue(samples[k].Label, 1.0)
109 | 		}
110 | 	}
111 | 
112 | 	min_gini := 1.0
113 | 	node.feature_split = core.Feature{Id: -1, Value: 0}
114 | 	for fid, right_dis := range feature_right_dis {
115 | 		left_dis := total_dis.Copy()
116 | 		left_dis.AddVector(right_dis, -1.0)
117 | 		gini := core.Gini(left_dis, right_dis)
118 | 		if min_gini > gini {
119 | 			min_gini = gini
120 | 			node.feature_split.Id = fid
121 | 			node.feature_split.Value = 1.0
122 | 		}
123 | 	}
124 | 	if min_gini > dt.params.GiniThreshold {
125 | 		node.feature_split.Id = -1
126 | 		node.feature_split.Value = 0.0
127 | 	}
128 | }
129 | 
130 | func (dt *CART) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree, feature_select_prob float64) {
131 | 	if node.depth >= dt.params.MaxDepth {
132 | 		return
133 | 	}
134 | 
135 | 	if dt.continuous_features {
136 | 		dt.FindBestSplitOfContinusousFeature(samples, node, feature_select_prob)
137 | 	} else {
138 | 		dt.FindBestSplitOfBinaryFeature(samples, node, feature_select_prob)
139 | 	}
140 | 	if node.feature_split.Id < 0 {
141 | 		return
142 | 	}
143 | 	left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}
144 | 	right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}
145 | 
146 | 	left_node.prediction = core.NewArrayVector()
147 | 	right_node.prediction = core.NewArrayVector()
148 | 	for _, k := range node.samples {
149 | 		if DTGoLeft(samples[k], node.feature_split) {
150 | 			left_node.samples = append(left_node.samples, k)
151 | 			left_node.prediction.AddValue(samples[k].Label, 1.0)
152 | 		} else {
153 | 			right_node.samples = append(right_node.samples, k)
154 | 			right_node.prediction.AddValue(samples[k].Label, 1.0)
155 | 		}
156 | 	}
157 | 	node.samples = nil
158 | 
159 | 	if len(left_node.samples) > dt.params.MinLeafSize {
160 | 		left_node.sample_count = len(left_node.samples)
161 | 		left_node.prediction.Scale(1.0 / left_node.prediction.Sum())
162 | 		queue.PushBack(&left_node)
163 | 		node.left = len(tree.nodes)
164 | 		tree.AddTreeNode(&left_node)
165 | 	}
166 | 
167 | 	if len(right_node.samples) > dt.params.MinLeafSize {
168 | 		right_node.sample_count = len(right_node.samples)
169 | 		right_node.prediction.Scale(1.0 / right_node.prediction.Sum())
170 | 		queue.PushBack(&right_node)
171 | 		node.right = len(tree.nodes)
172 | 		tree.AddTreeNode(&right_node)
173 | 	}
174 | }
175 | 
176 | func (dt *CART) SingleTreeBuild(samples []*core.MapBasedSample, feature_select_prob float64, bootstrap bool) Tree {
177 | 	tree := Tree{}
178 | 	queue := list.New()
179 | 	root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}
180 | 
181 | 	if !bootstrap {
182 | 		for i, sample := range samples {
183 | 			root.AddSample(i)
184 | 			root.prediction.AddValue(sample.Label, 1.0)
185 | 		}
186 | 	} else {
187 | 		for i := 0; i < len(samples); i++ {
188 | 			k := rand.Intn(len(samples))
189 | 			root.AddSample(k)
190 | 			root.prediction.AddValue(samples[k].Label, 1.0)
191 | 		}
192 | 	}
193 | 	root.sample_count = len(root.samples)
194 | 	root.prediction.Scale(1.0 / root.prediction.Sum())
195 | 
196 | 	queue.PushBack(&root)
197 | 	tree.AddTreeNode(&root)
198 | 	for {
199 | 		nodes := DTGetElementFromQueue(queue, 10)
200 | 		if len(nodes) == 0 {
201 | 			break
202 | 		}
203 | 
204 | 		for _, node := range nodes {
205 | 			dt.AppendNodeToTree(samples, node, queue, &tree, feature_select_prob)
206 | 		}
207 | 	}
208 | 	return tree
209 | }
210 | 
211 | func PredictBySingleTree(tree *Tree, sample *core.MapBasedSample) (*TreeNode, string) {
212 | 	path := ""
213 | 	node := tree.GetNode(0)
214 | 	path += node.ToString()
215 | 	for {
216 | 		if DTGoLeft(sample, node.feature_split) {
217 | 			if node.left >= 0 && node.left < tree.Size() {
218 | 				node = tree.GetNode(node.left)
219 | 				path += "-" + node.ToString()
220 | 			} else {
221 | 				break
222 | 			}
223 | 		} else {
224 | 			if node.right >= 0 && node.right < tree.Size() {
225 | 				node = tree.GetNode(node.right)
226 | 				path += "+" + node.ToString()
227 | 			} else {
228 | 				break
229 | 			}
230 | 		}
231 | 	}
232 | 	return node, path
233 | }
234 | 
235 | func (dt *CART) Train(dataset *core.DataSet) {
236 | 	samples := []*core.MapBasedSample{}
237 | 	feature_weights := make(map[int64]float64)
238 | 	for _, sample := range dataset.Samples {
239 | 		if !dt.continuous_features {
240 | 			for _, f := range sample.Features {
241 | 				_, ok := feature_weights[f.Id]
242 | 				if !ok {
243 | 					feature_weights[f.Id] = f.Value
244 | 				}
245 | 				if feature_weights[f.Id] != f.Value {
246 | 					dt.continuous_features = true
247 | 				}
248 | 			}
249 | 		}
250 | 		msample := sample.ToMapBasedSample()
251 | 		samples = append(samples, msample)
252 | 	}
253 | 	if dt.continuous_features {
254 | 		fmt.Println("Continuous DataSet")
255 | 	} else {
256 | 		fmt.Println("Binary DataSet")
257 | 	}
258 | 	dt.tree = dt.SingleTreeBuild(samples, 1.0, false)
259 | }
260 | 
261 | func (dt *CART) Predict(sample *core.Sample) float64 {
262 | 	msample := sample.ToMapBasedSample()
263 | 	node, _ := PredictBySingleTree(&dt.tree, msample)
264 | 	return node.prediction.GetValue(1)
265 | }
266 | 
267 | func (dt *CART) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
268 | 	msample := sample.ToMapBasedSample()
269 | 	node, _ := PredictBySingleTree(&dt.tree, msample)
270 | 	return node.prediction
271 | }
272 | 
273 | func (self *CART) SaveModel(path string) {
274 | 	ioutil.WriteFile(path, self.tree.ToString(), 0600)
275 | }
276 | 
277 | func (self *CART) LoadModel(path string) {
278 | 	file, _ := os.Open(path)
279 | 	defer file.Close()
280 | 	text := ""
281 | 	scanner := bufio.NewScanner(file)
282 | 	for scanner.Scan() {
283 | 		text += scanner.Text() + "\n"
284 | 	}
285 | 	self.tree.FromString(string(text))
286 | }
287 | 
288 | type CARTParams struct {
289 | 	MaxDepth      int
290 | 	MinLeafSize   int
291 | 	GiniThreshold float64
292 | 	SamplingRatio float64
293 | }
294 | 
295 | func (dt *CART) Init(params map[string]string) {
296 | 	dt.tree = Tree{}
297 | 	dt.continuous_features = false
298 | 	min_leaf_size, _ := strconv.ParseInt(params["min-leaf-size"], 10, 32)
299 | 	max_depth, _ := strconv.ParseInt(params["max-depth"], 10, 32)
300 | 
301 | 	dt.params.MinLeafSize = int(min_leaf_size)
302 | 	dt.params.MaxDepth = int(max_depth)
303 | 	dt.params.GiniThreshold, _ = strconv.ParseFloat(params["gini"], 64)
304 | 	dt.salt = rand.Int63n(10000000000)
305 | 	dt.params.SamplingRatio, _ = strconv.ParseFloat(params["dt-sample-ratio"], 64)
306 | }
307 | 


--------------------------------------------------------------------------------
/core/dataset.go:
--------------------------------------------------------------------------------
  1 | package core
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"log"
  7 | 	"os"
  8 | 	"sort"
  9 | 	"strconv"
 10 | 	"strings"
 11 | 
 12 | 	"github.com/xlvector/hector/util"
 13 | )
 14 | 
 15 | type CombinedFeature []string
 16 | 
 17 | type FeatureSplit []float64
 18 | 
 19 | func FindCategory(split []float64, value float64) int {
 20 | 	return sort.Search(len(split), func(i int) bool { return split[i] >= value })
 21 | }
 22 | 
 23 | /* RawDataSet */
 24 | type RawDataSet struct {
 25 | 	Samples     []*RawSample
 26 | 	FeatureKeys map[string]bool
 27 | }
 28 | 
 29 | func NewRawDataSet() *RawDataSet {
 30 | 	ret := RawDataSet{}
 31 | 	ret.Samples = []*RawSample{}
 32 | 	ret.FeatureKeys = make(map[string]bool)
 33 | 	return &ret
 34 | }
 35 | 
 36 | func (d *RawDataSet) AddSample(sample *RawSample) {
 37 | 	d.Samples = append(d.Samples, sample)
 38 | }
 39 | 
 40 | func (d *RawDataSet) ToDataSet(splits map[string][]float64, combinations []CombinedFeature) *DataSet {
 41 | 	out_data := NewDataSet()
 42 | 	fm := make(map[string]int64)
 43 | 	for _, sample := range d.Samples {
 44 | 		out_sample := NewSample()
 45 | 		out_sample.Label = sample.Label
 46 | 		if splits != nil {
 47 | 			for fkey_str, fvalue_str := range sample.Features {
 48 | 				fkey := ""
 49 | 				fvalue := 0.0
 50 | 				if GetFeatureType(fkey_str) == FeatureTypeEnum.CONTINUOUS_FEATURE {
 51 | 					split, ok := splits[fkey_str]
 52 | 					if ok {
 53 | 						cat := FindCategory(split, util.ParseFloat64(fvalue_str))
 54 | 						fkey = fkey_str + "_" + strconv.FormatInt(int64(cat), 10)
 55 | 						fvalue = 1.0
 56 | 					} else {
 57 | 						fvalue = util.ParseFloat64(fvalue_str)
 58 | 					}
 59 | 					fm[fkey] = util.Hash(fkey)
 60 | 					out_sample.AddFeature(Feature{Id: util.Hash(fkey), Value: fvalue})
 61 | 				}
 62 | 			}
 63 | 		}
 64 | 		for _, combination := range combinations {
 65 | 			fkey := ""
 66 | 			for _, ckey := range combination {
 67 | 				fkey += ckey
 68 | 				fkey += ":"
 69 | 				fkey += sample.GetFeatureValue(ckey)
 70 | 				fkey += "_"
 71 | 			}
 72 | 			fm[fkey] = util.Hash(fkey)
 73 | 			out_sample.AddFeature(Feature{Id: util.Hash(fkey), Value: 1.0})
 74 | 		}
 75 | 		out_data.AddSample(out_sample)
 76 | 	}
 77 | 	f, _ := os.Create("features.tsv")
 78 | 	defer f.Close()
 79 | 	w := bufio.NewWriter(f)
 80 | 	for k, v := range fm {
 81 | 		w.WriteString(k + "\t" + strconv.FormatInt(v, 10) + "\n")
 82 | 	}
 83 | 
 84 | 	return out_data
 85 | }
 86 | 
 87 | func (d *RawDataSet) Load(path string) error {
 88 | 	file, err := os.Open(path)
 89 | 	if err != nil {
 90 | 		return err
 91 | 	}
 92 | 	defer file.Close()
 93 | 	ch := make(chan string, 1000)
 94 | 	go func() {
 95 | 		reader := bufio.NewReader(file)
 96 | 		for {
 97 | 			line, err := reader.ReadString('\n')
 98 | 			if err != nil {
 99 | 				break
100 | 			}
101 | 			ch <- line
102 | 		}
103 | 		close(ch)
104 | 	}()
105 | 
106 | 	n := 0
107 | 	for line := range ch {
108 | 		n += 1
109 | 		if n%10000 == 0 {
110 | 			fmt.Println(n, len(ch))
111 | 		}
112 | 		line = strings.Replace(line, " ", "\t", -1)
113 | 		tks := strings.Split(line, "\t")
114 | 		sample := NewRawSample()
115 | 		for i, tk := range tks {
116 | 			if i == 0 {
117 | 				label, err := strconv.ParseInt(tk, 10, 16)
118 | 				if err != nil {
119 | 					break
120 | 				}
121 | 				if label > 0 {
122 | 					sample.Label = 1.0
123 | 				} else {
124 | 					sample.Label = 0.0
125 | 				}
126 | 			} else {
127 | 				kv := strings.Split(tk, ":")
128 | 				sample.Features[kv[0]] = kv[1]
129 | 				d.FeatureKeys[kv[0]] = true
130 | 			}
131 | 		}
132 | 		d.AddSample(sample)
133 | 	}
134 | 	return nil
135 | }
136 | 
137 | /*Streaming*/
138 | type StreamingDataSet struct {
139 | 	Samples chan *Sample
140 | }
141 | 
142 | func NewStreamingDataSet() *StreamingDataSet {
143 | 	return &StreamingDataSet{
144 | 		Samples: make(chan *Sample, 10000),
145 | 	}
146 | }
147 | 
148 | func (d *StreamingDataSet) AddSample(sample *Sample) {
149 | 	d.Samples <- sample
150 | }
151 | 
152 | func (d *StreamingDataSet) Load(path string, global_bias_feature_id int64) error {
153 | 	for step := 0; step < 2; step++ {
154 | 		file, err := os.Open(path)
155 | 		defer file.Close()
156 | 		if err != nil {
157 | 			log.Fatalln("load file fail: ", err)
158 | 		}
159 | 		reader := bufio.NewReader(file)
160 | 		for {
161 | 			line, err := reader.ReadString('\n')
162 | 			if err != nil {
163 | 				break
164 | 			}
165 | 			tks := strings.Split(strings.TrimSpace(line), "\t")
166 | 			sample := Sample{Features: make([]Feature, 0, 20), Label: 0}
167 | 			for i, tk := range tks {
168 | 				if i == 0 {
169 | 					label, _ := strconv.Atoi(tk)
170 | 					sample.Label = label
171 | 				} else {
172 | 					kv := strings.Split(tk, ":")
173 | 					feature_id, err := strconv.ParseInt(kv[0], 10, 64)
174 | 					if err != nil {
175 | 						log.Fatalln("wrong feature: ", kv[0])
176 | 					}
177 | 					feature_value := 1.0
178 | 					if len(kv) > 1 {
179 | 						feature_value, err = strconv.ParseFloat(kv[1], 64)
180 | 						if err != nil {
181 | 							log.Fatalln("wrong value: ", kv[1])
182 | 						}
183 | 					}
184 | 					feature := Feature{feature_id, feature_value}
185 | 					sample.Features = append(sample.Features, feature)
186 | 				}
187 | 			}
188 | 			if global_bias_feature_id >= 0 {
189 | 				sample.Features = append(sample.Features, Feature{global_bias_feature_id, 1.0})
190 | 			}
191 | 			d.AddSample(&sample)
192 | 		}
193 | 	}
194 | 	close(d.Samples)
195 | 	return nil
196 | }
197 | 
198 | /* DataSet */
199 | type DataSet struct {
200 | 	Samples          []*Sample
201 | 	FeatureNameIdMap map[int64]string
202 | 	max_label        int
203 | }
204 | 
205 | func NewDataSet() *DataSet {
206 | 	ret := DataSet{}
207 | 	ret.Samples = []*Sample{}
208 | 	ret.FeatureNameIdMap = make(map[int64]string)
209 | 	return &ret
210 | }
211 | 
212 | func (d *DataSet) AddSample(sample *Sample) {
213 | 	d.Samples = append(d.Samples, sample)
214 | 	if d.max_label < sample.Label {
215 | 		d.max_label = sample.Label
216 | 	}
217 | }
218 | 
219 | func (d *DataSet) Load(path string, global_bias_feature_id int64) error {
220 | 	fm := make(map[string]int64)
221 | 
222 | 	ch := make(chan string, 1000)
223 | 	go func() {
224 | 		file, err := os.Open(path)
225 | 		defer file.Close()
226 | 		defer close(ch)
227 | 		if err != nil {
228 | 			log.Println("load file fail: ", err)
229 | 			return
230 | 		}
231 | 
232 | 		scanner := bufio.NewScanner(file)
233 | 
234 | 		for scanner.Scan() {
235 | 			line := strings.Replace(scanner.Text(), " ", "\t", -1)
236 | 			ch <- line
237 | 		}
238 | 	}()
239 | 
240 | 	for line := range ch {
241 | 		tks := strings.Split(line, "\t")
242 | 		sample := Sample{Features: make([]Feature, 0, 20), Label: 0}
243 | 		for i, tk := range tks {
244 | 			if i == 0 {
245 | 				label, _ := strconv.Atoi(tk)
246 | 				sample.Label = label
247 | 				if d.max_label < label {
248 | 					d.max_label = label
249 | 				}
250 | 			} else {
251 | 				kv := strings.Split(tk, ":")
252 | 				feature_id, err := strconv.ParseInt(kv[0], 10, 64)
253 | 				if err != nil {
254 | 					feature_id = util.Hash(kv[0])
255 | 					fm[kv[0]] = feature_id
256 | 				}
257 | 				d.FeatureNameIdMap[feature_id] = kv[0]
258 | 				feature_value := 1.0
259 | 				if len(kv) > 1 {
260 | 					feature_value, err = strconv.ParseFloat(kv[1], 64)
261 | 					if err != nil {
262 | 						break
263 | 					}
264 | 				}
265 | 				feature := Feature{feature_id, feature_value}
266 | 				sample.Features = append(sample.Features, feature)
267 | 			}
268 | 		}
269 | 		if global_bias_feature_id >= 0 {
270 | 			sample.Features = append(sample.Features, Feature{global_bias_feature_id, 1.0})
271 | 		}
272 | 		d.AddSample(&sample)
273 | 	}
274 | 	f, _ := os.Create("features.tsv")
275 | 	defer f.Close()
276 | 	w := bufio.NewWriter(f)
277 | 	for k, v := range fm {
278 | 		w.WriteString(k + "\t" + strconv.FormatInt(v, 10) + "\n")
279 | 	}
280 | 
281 | 	log.Println("dataset size : ", len(d.Samples))
282 | 	return nil
283 | }
284 | 
285 | func RemoveLowFreqFeatures(dataset *DataSet, threshold float64) {
286 | 	freq := NewVector()
287 | 
288 | 	for _, sample := range dataset.Samples {
289 | 		for _, feature := range sample.Features {
290 | 			freq.AddValue(feature.Id, 1.0)
291 | 		}
292 | 	}
293 | 
294 | 	for _, sample := range dataset.Samples {
295 | 		features := []Feature{}
296 | 		for _, feature := range sample.Features {
297 | 			if freq.GetValue(feature.Id) > threshold {
298 | 				features = append(features, feature)
299 | 			}
300 | 		}
301 | 		sample.Features = features
302 | 	}
303 | }
304 | 
305 | func (d *DataSet) Split(f func(int) bool) *DataSet {
306 | 	out_data := NewDataSet()
307 | 	for i, sample := range d.Samples {
308 | 		if f(i) {
309 | 			out_data.AddSample(sample)
310 | 		}
311 | 	}
312 | 	return out_data
313 | }
314 | 
315 | /* Real valued DataSet */
316 | type RealDataSet struct {
317 | 	Samples []*RealSample
318 | }
319 | 
320 | func NewRealDataSet() *RealDataSet {
321 | 	ret := RealDataSet{}
322 | 	ret.Samples = []*RealSample{}
323 | 	return &ret
324 | }
325 | 
326 | func (d *RealDataSet) AddSample(sample *RealSample) {
327 | 	d.Samples = append(d.Samples, sample)
328 | }
329 | 
330 | func (d *RealDataSet) Load(path string, global_bias_feature_id int64) error {
331 | 	file, err := os.Open(path)
332 | 	if err != nil {
333 | 		return err
334 | 	}
335 | 	defer file.Close()
336 | 
337 | 	scanner := bufio.NewScanner(file)
338 | 
339 | 	for scanner.Scan() {
340 | 		line := strings.Replace(scanner.Text(), " ", "\t", -1)
341 | 		tks := strings.Split(line, "\t")
342 | 		sample := RealSample{Features: []Feature{}, Value: 0.0}
343 | 		for i, tk := range tks {
344 | 			if i == 0 {
345 | 				value := util.ParseFloat64(tk)
346 | 				sample.Value = value
347 | 			} else {
348 | 				kv := strings.Split(tk, ":")
349 | 				feature_id, err := strconv.ParseInt(kv[0], 10, 64)
350 | 				if err != nil {
351 | 					break
352 | 				}
353 | 				feature_value := 1.0
354 | 				if len(kv) > 1 {
355 | 					feature_value, err = strconv.ParseFloat(kv[1], 64)
356 | 					if err != nil {
357 | 						break
358 | 					}
359 | 				}
360 | 				feature := Feature{feature_id, feature_value}
361 | 				sample.Features = append(sample.Features, feature)
362 | 			}
363 | 		}
364 | 		if global_bias_feature_id >= 0 {
365 | 			sample.Features = append(sample.Features, Feature{global_bias_feature_id, 1.0})
366 | 		}
367 | 		d.AddSample(&sample)
368 | 	}
369 | 	if scanner.Err() != nil {
370 | 		return scanner.Err()
371 | 	}
372 | 	return nil
373 | }
374 | 


--------------------------------------------------------------------------------