├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Dockerfile ├── README.md ├── REFERENCES.md ├── clusterer ├── kmeans.go └── kmeansstream.go ├── clusters.png ├── dataset.csv ├── decoder ├── multidecoder.go └── spherical.go ├── defaults └── defaults.go ├── demo ├── README.md ├── data │ └── MNISTnumImages5000.txt ├── main.go └── plots │ ├── centroid-dimensions-0.png │ ├── centroid-dimensions-1.png │ ├── centroid-dimensions-2.png │ ├── centroid-dimensions-3.png │ ├── centroid-dimensions-4.png │ ├── centroid-dimensions-5.png │ ├── centroid-dimensions-6.png │ ├── centroid-dimensions-7.png │ ├── centroid-dimensions-8.png │ ├── centroid-dimensions-9.png │ ├── centroid-drawing-0.png │ ├── centroid-drawing-1.png │ ├── centroid-drawing-2.png │ ├── centroid-drawing-3.png │ ├── centroid-drawing-4.png │ ├── centroid-drawing-5.png │ ├── centroid-drawing-6.png │ ├── centroid-drawing-7.png │ ├── centroid-drawing-8.png │ └── centroid-drawing-9.png ├── hash └── murmur.go ├── itemset ├── centroid.go ├── khhcentroidcounter.go └── khhcountminsketch.go ├── lsh └── lsh.go ├── overview.png ├── parse └── parser.go ├── plots ├── kmeans │ ├── centroid-0.png │ ├── centroid-1.png │ ├── centroid-2.png │ ├── centroid-3.png │ ├── centroid-4.png │ ├── centroid-5.png │ ├── centroid-6.png │ ├── centroid-7.png │ ├── centroid-8.png │ ├── centroid-9.png │ ├── heat0.png │ ├── heat1.png │ ├── heat2.png │ ├── heat3.png │ ├── heat4.png │ ├── heat5.png │ ├── heat6.png │ ├── heat7.png │ ├── heat8.png │ ├── heat9.png │ ├── paint0.png │ ├── paint1.png │ ├── paint2.png │ ├── paint3.png │ ├── paint4.png │ ├── paint5.png │ ├── paint6.png │ ├── paint7.png │ ├── paint8.png │ └── paint9.png └── rphash │ ├── centroid-0.png │ ├── centroid-1.png │ ├── centroid-2.png │ ├── centroid-3.png │ ├── centroid-4.png │ ├── centroid-5.png │ ├── centroid-6.png │ ├── centroid-7.png │ ├── centroid-8.png │ ├── centroid-9.png │ ├── heat0.png │ ├── heat1.png │ ├── heat2.png │ ├── heat3.png │ ├── heat4.png │ ├── heat5.png │ ├── heat6.png │ ├── heat7.png │ ├── heat8.png │ ├── heat9.png │ ├── paint0.png │ ├── paint1.png │ ├── paint2.png │ ├── paint3.png │ ├── paint4.png │ ├── paint5.png │ ├── paint6.png │ ├── paint7.png │ ├── paint8.png │ └── paint9.png ├── projector ├── dbfriendly.go └── fjlt.go ├── reader ├── simplearray.go └── streamobject.go ├── results.txt ├── rphash ├── rphash.go ├── rphash.png ├── simple └── simple.go ├── stream └── stream.go ├── tests ├── clusterer_test.go ├── data │ ├── fake_data_500_100.txt │ ├── fake_data_500_1000.txt │ └── people.json ├── decoder_test.go ├── hash_test.go ├── itemset_test.go ├── lsh_test.go ├── parser_test.go ├── projector_test.go ├── simple_test.go ├── simplearray_test.go ├── stream_test.go ├── streamobject_test.go ├── tests.go └── util_test.go ├── types └── types.go └── utils ├── bit-shift.go ├── centriod-priority-queue.go ├── file-reader.go ├── generate-data.go ├── hashsets.go ├── int64-priority-queue.go ├── iterator.go ├── plot-tool.go ├── stattest.go └── vectors.go /.gitignore: -------------------------------------------------------------------------------- 1 | tests/fjlt_test.go 2 | projector/fjlt.go 3 | ideas 4 | .DS_STORE 5 | save 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.3 4 | - tip 5 | script: 6 | - sh install 7 | - go test ./tests 8 | -------------------------------------------------------------------------------- /CHANGELOG.md: 
-------------------------------------------------------------------------------- 1 | ## September 1st, 2015 ## 2 | + Start of project. 3 | 4 | ## October 6th, 2015 ## 5 | + Meeting with Lee Carraher about implementation. 6 | 7 | ## October 9th, 2015 ## 8 | + Stable Stream build. 9 | + Working towards a cleaner code base 10 | 11 | ## October 12th, 2015 ## 12 | + Unstable Stream build. 13 | + Official Go structure. 14 | 15 | ## October 15th, 2015 ## 16 | + Add Simple. 17 | + Add API for exposing simple and stream. 18 | + Work towards a working build. 19 | 20 | ## October 23rd, 2015 ## 21 | + All code compiles without error. 22 | + Refer to TODO for implementation in order to fill the gaps. 23 | + Starting Benchmarking 24 | 25 | ## November 17th, 2015 ## 26 | + Running tests and benchmarks for expected outputs. 27 | 28 | ## March 2nd, 2015 ## 29 | + Tests passing 30 | + Streaming and Simple algorithms both in parallel. 31 | 32 | ## May 14th, 2016 ## 33 | + Initial Paper submission. 34 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # grog dockerIp 172.17.0.2 2 | # docker build -t rphash . 3 | # docker run -i -t rphash /bin/bash 4 | 5 | FROM golang:latest 6 | RUN go get github.com/chrislusf/glow \ 7 | github.com/wilseypa/rphash-golang/demo 8 | 9 | EXPOSE 8080 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RPHash 2 | [![Build Status](https://travis-ci.org/wilseypa/rphash-golang.svg)](https://travis-ci.org/wilseypa/rphash-golang) [![Release Status](https://img.shields.io/badge/version-1.0.0-blue.svg)](https://github.com/wilseypa/rphash-golang/releases) ![RPHash](https://github.com/wilseypa/rphash-golang/blob/master/rphash.png) 3 | 4 | RPHash takes clustering and unsupervised learning problems and solves them in an embarrassingly parallel manner. 5 | 6 | **Clustering** is a core concept in data analysis. Issues arise with scalability and dimensionality, ever changing environments and compatibility, insecure communications and data movement. 7 | 8 | **The solution** is secure, reliable, and fast data for large-scale distributed systems. 9 | 10 | # Random Projection Hash (RPHash) 11 | The algorithm was created for maximizing parallel computation while providing scalability for large scale deployment. It's suitable for high dimensional data sets and is scalable and streamline. 12 | 13 | ![Overview](https://github.com/wilseypa/rphash-golang/blob/master/overview.png) 14 | 15 | # Installing 16 | Ensure you have **Go**, **git**, and **mercurial** installed on your system. Additionally, ensure that you have your Go environment setup. 
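For reference, a typical GOPATH-style setup (what `go get` assumed in Go 1.x) looks like the sketch below; the paths are illustrative, so adjust them to your own workspace.

```sh
# Illustrative GOPATH layout; adapt the paths to your machine.
export GOPATH=$HOME/go
export PATH=$PATH:$GOPATH/bin
# After `go get`, the sources live under $GOPATH/src/github.com/wilseypa/rphash-golang
```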
17 | 18 | ```sh 19 | go get github.com/wilseypa/rphash-golang 20 | # or, clone from source 21 | git clone https://github.com/wilseypa/rphash-golang.git 22 | ``` 23 | 24 | # API 25 | 26 | ```sh 27 | rphash-golang # Streaming command for clustering 28 | --num.clusters <#> # Number of clusters -> output centroids 29 | --num.shards <#> # Number of shards on the data 30 | --local.file # Filename to cluster 31 | --cluster # Cluster algorithm 32 | --centroid.plots # Enable plots 33 | --centroid.plots.file # Output dimension plot path 34 | --centroid.paint # Output of a NxN matrix (experimental) 35 | --centroid.heat # Output of a 3D heatmap (experimental) 36 | --hdfs.enable # Enable hdfs 37 | --hdfs.dir # hdfs directory 38 | [glow flags] # All other glow flags 39 | ``` 40 | 41 | # Test 42 | 43 | ```sh 44 | go test ./tests -v -bench=. 45 | ``` 46 | 47 | # Developers 48 | - Sam Wenke (**wenkesj**) 49 | - Jacob Franklin (**frankljbe**) 50 | 51 | # Documentation 52 | - Sadiq Quasem (**quasemsm**) 53 | -------------------------------------------------------------------------------- /REFERENCES.md: -------------------------------------------------------------------------------- 1 | # References # 2 | [Database-friendly random projections: Johnson-Lindenstrauss with binary coins](https://users.soe.ucsc.edu/~optas/papers/jl.pdf) 3 | 4 | [A Framework for Clustering Massive-Domain Data Streams](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=4812395) 5 | 6 | [On Classification of High-Cardinality Data Streams](http://charuaggarwal.net/16_Aggarwal.pdf) 7 | 8 | [Beyond Locality-Sensitive Hashing](http://arxiv.org/pdf/1306.1547v3.pdf) 9 | 10 | [Dimensionality Reduction: beyond the Johnson-Lindenstrauss bound](http://www.eecs.berkeley.edu/~brecht/papers/07.BRS.local-dim-REVISED.pdf) 11 | 12 | [Random projection in dimensionality reduction: Applications to image and text data](http://users.ics.aalto.fi/ella/publications/randproj_kdd.pdf) 13 | 14 | [Spherical LSH for Approximate Nearest Neighbor Search on Unit Hypersphere](https://lib-repos.fun.ac.jp/dspace/bitstream/10445/3004/4/kterasaw_2007_01_wads.pdf) 15 | -------------------------------------------------------------------------------- /clusterer/kmeans.go: -------------------------------------------------------------------------------- 1 | package clusterer 2 | 3 | import ( 4 | "fmt" 5 | "github.com/wilseypa/rphash-golang/projector" 6 | "github.com/wilseypa/rphash-golang/types" 7 | "github.com/wilseypa/rphash-golang/utils" 8 | "log" 9 | "math/rand" 10 | ) 11 | 12 | type KMeans struct { 13 | k int 14 | n int 15 | data [][]float64 16 | projectionDimension int 17 | means [][]float64 18 | clusters [][]int //Each row of clusters contatins all vectors in the data currently assigned to it. 
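  // weights holds one entry per input vector; NewKMeansSimple below gives every vector a uniform weight of 1, while NewKMeansWeighted accepts caller-supplied weights.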
19 | weights []int64 20 | } 21 | 22 | func NewKMeansSimple(k int, data [][]float64) *KMeans { 23 | weights := make([]int64, len(data), len(data)) 24 | for i := 0; i < len(data); i++ { 25 | weights[i] = int64(1) 26 | } 27 | return NewKMeansWeighted(k, data, weights) 28 | } 29 | 30 | func NewKMeansWeighted(k int, data [][]float64, weights []int64) *KMeans { 31 | if len(data) == 0 { 32 | log.Panic(data) 33 | } 34 | return &KMeans{ 35 | k: k, 36 | data: data, 37 | projectionDimension: 0, 38 | clusters: nil, 39 | weights: weights, 40 | } 41 | } 42 | 43 | //Vectors is a list of all assignedVectors currently assigned to the centriod we are computing 44 | func (this *KMeans) ComputeCentroid(assignedVectors []int, data [][]float64) []float64 { 45 | d := len(data[0]) 46 | centroid := make([]float64, d, d) 47 | for i := 0; i < d; i++ { 48 | centroid[i] = 0.0 49 | } 50 | var w_total int64 = 0 51 | for _, v := range assignedVectors { 52 | w_total += this.weights[v] 53 | } 54 | for _, v := range assignedVectors { 55 | vec := data[v] 56 | weight := float64(this.weights[v]) / float64(w_total) 57 | for i := 0; i < d; i++ { 58 | centroid[i] += (vec[i] * weight) 59 | } 60 | } 61 | return centroid 62 | } 63 | 64 | func (this *KMeans) UpdateMeans(data [][]float64) { 65 | for i := 0; i < this.k; i++ { 66 | this.means[i] = this.ComputeCentroid(this.clusters[i], data) 67 | } 68 | } 69 | 70 | func (this *KMeans) AssignClusters(data [][]float64) int { 71 | swaps := 0 72 | newClusters := [][]int{} 73 | for j := 0; j < this.k; j++ { 74 | newClusterList := []int{} 75 | newClusters = append(newClusters, newClusterList) 76 | } 77 | for clusterid := 0; clusterid < this.k; clusterid++ { 78 | for _, member := range this.clusters[clusterid] { 79 | nearest, _ := utils.FindNearestDistance(data[member], this.means) 80 | newClusters[nearest] = append(newClusters[nearest], member) 81 | if nearest != clusterid { 82 | swaps++ 83 | } 84 | } 85 | } 86 | this.clusters = newClusters 87 | return swaps 88 | } 89 | 90 | func (this *KMeans) Run() { 91 | //This is a condition to avoid infinite Run.. 
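  // Standard k-means iteration below: recompute the means, then reassign points, and stop once fewer than three points change cluster (swaps <= 2) or maxiters is exhausted.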
92 | maxiters := 10000 93 | swaps := 3 94 | fulldata := this.data 95 | data := make([][]float64, 0) 96 | var p types.Projector = nil 97 | if this.projectionDimension != 0 { 98 | p = projector.NewDBFriendly(len(fulldata[0]), this.projectionDimension, rand.Int63()) 99 | } 100 | for _, v := range fulldata { 101 | if p != nil { 102 | data = append(data, p.Project(v)) 103 | } else { 104 | data = append(data, v) 105 | } 106 | } 107 | this.n = len(data) 108 | this.means = make([][]float64, this.k) 109 | for i := 0; i < this.k; i++ { 110 | this.means[i] = data[i*(this.n/this.k)] 111 | } 112 | this.clusters = make([][]int, this.k) 113 | //initilize cluster lists to be evenly diveded sequentailly 114 | for i := 0; i < this.k; i++ { 115 | cluster := make([]int, this.n/this.k) 116 | clusterStart := i * (this.n / this.k) 117 | for j := 0; j < this.n/this.k; j++ { 118 | cluster[j] = j + clusterStart 119 | } 120 | this.clusters[i] = cluster 121 | } 122 | for swaps > 2 && maxiters > 0 { 123 | maxiters-- 124 | this.UpdateMeans(data) 125 | swaps = this.AssignClusters(data) 126 | } 127 | if maxiters == 0 { 128 | fmt.Println("Warning: Max Iterations Reached") 129 | } 130 | data = fulldata 131 | this.UpdateMeans(data) 132 | } 133 | 134 | func (this *KMeans) GetCentroids() [][]float64 { 135 | if this.means == nil { 136 | this.Run() 137 | } 138 | return this.means 139 | } 140 | -------------------------------------------------------------------------------- /clusterer/kmeansstream.go: -------------------------------------------------------------------------------- 1 | package clusterer 2 | import ( 3 | "math" 4 | "github.com/wilseypa/rphash-golang/utils" 5 | "github.com/wilseypa/rphash-golang/itemset" 6 | "math/rand" 7 | ) 8 | //Implements clusterer type 9 | type KMeansStream struct { 10 | debug int 11 | k int 12 | n int 13 | dataCount int 14 | dimensionality int 15 | maxCandidateClusters int 16 | frequency float64 17 | frequencyChange float64 18 | random rand.Source 19 | candidateClusters []itemset.Centroid 20 | } 21 | 22 | func NewKMeansStream(k int, n int, dimensionality int) *KMeansStream { 23 | frequency := 1.0 / (float64(k) * (1 + math.Log10(float64(n)))) 24 | maxCandidateClusters := int(math.Log10(float64(n)) * float64(k)) 25 | candidateClusters := []itemset.Centroid{} 26 | return &KMeansStream{ 27 | debug: 0, 28 | k: k, 29 | n: n, 30 | dataCount: 0, 31 | dimensionality: dimensionality, 32 | maxCandidateClusters: maxCandidateClusters, 33 | frequency: frequency, 34 | frequencyChange: 1.1, 35 | candidateClusters: candidateClusters, 36 | } 37 | } 38 | 39 | func (this *KMeansStream) AddDataPoint(data []float64) { 40 | this.addDataPointWeighted(data, 1); 41 | } 42 | //Add a new data point to the stream 43 | func (this *KMeansStream) addDataPointWeighted(data []float64, weight int64) { 44 | if len(data) != this.dimensionality { 45 | return 46 | // panic("The input data does not have the correct dimenstionality") 47 | } 48 | minIndex := 0; 49 | minDist := 0.0; 50 | for i, centriod := range this.candidateClusters { 51 | currDist := utils.Distance(data,centriod.Centroid()) 52 | if i == 0 || minDist > currDist { 53 | minDist = currDist; 54 | minIndex = i; 55 | } 56 | } 57 | minDistSquared := minDist * minDist 58 | if len(this.candidateClusters) < this.k || rand.Float64() < float64(weight) * (minDistSquared/this.frequency) { 59 | this.candidateClusters = append(this.candidateClusters, *itemset.NewCentroidWeighted(data, weight)) 60 | }else{ 61 | this.candidateClusters[minIndex].UpdateVector(data) 62 | } 63 | if 
len(this.candidateClusters) > this.maxCandidateClusters { 64 | this.reduceCandidateClusters() 65 | } 66 | } 67 | 68 | func (this *KMeansStream) reduceCandidateClusters() { 69 | this.frequency = this.frequency * this.frequencyChange; 70 | oldCandidateClusters := make([]itemset.Centroid, len(this.candidateClusters), len(this.candidateClusters)) 71 | copy(oldCandidateClusters, this.candidateClusters) 72 | this.candidateClusters = []itemset.Centroid{}; 73 | for _, centriod := range oldCandidateClusters { 74 | this.addDataPointWeighted(centriod.Centroid(), centriod.GetCount()); 75 | } 76 | } 77 | 78 | func (this *KMeansStream) GetCentroids() [][]float64 { 79 | data := make([][]float64, len(this.candidateClusters), len(this.candidateClusters)) 80 | for i := 0; i < len(this.candidateClusters); i++ { 81 | data[i] = this.candidateClusters[i].Centroid(); 82 | } 83 | simple := NewKMeansSimple(this.k, data); 84 | return simple.GetCentroids(); 85 | } 86 | -------------------------------------------------------------------------------- /clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/clusters.png -------------------------------------------------------------------------------- /decoder/multidecoder.go: -------------------------------------------------------------------------------- 1 | package decoder 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/types" 5 | "math" 6 | ) 7 | 8 | type MultiDecoder struct { 9 | innerDec types.Decoder 10 | dimension int 11 | rounds int 12 | distance float64 13 | } 14 | 15 | func NewMultiDecoder(dimension int, innerDec types.Decoder) *MultiDecoder { 16 | rounds := int(math.Ceil(float64(dimension) / float64(innerDec.GetDimensionality()))) 17 | return &MultiDecoder{ 18 | dimension: dimension, 19 | rounds: rounds, 20 | innerDec: innerDec, 21 | distance: -1.0, 22 | } 23 | } 24 | 25 | func (this *MultiDecoder) GetDimensionality() int { 26 | return this.dimension 27 | } 28 | 29 | func (this *MultiDecoder) Decode(f []float64) []int64 { 30 | if this.innerDec.GetDimensionality() == len(f) { 31 | return this.innerDec.Decode(f) 32 | } 33 | innerpartition := make([]float64, this.innerDec.GetDimensionality()) 34 | copy(innerpartition[:int(math.Min(float64(len(f)), float64(len(innerpartition))))], f[:int(math.Min(float64(len(f)), float64(len(innerpartition))))]) 35 | tmp := this.innerDec.Decode(innerpartition) 36 | retLength := len(tmp) 37 | ret := make([]int64, retLength*this.rounds) 38 | copy(ret[:retLength], tmp[:retLength]) 39 | this.distance = this.innerDec.GetDistance() 40 | for i := 1; i < this.rounds; i++ { 41 | copy(innerpartition[0:int(math.Min(float64(len(f)-i*this.innerDec.GetDimensionality()), float64(len(innerpartition))))], f[i*this.innerDec.GetDimensionality():i*this.innerDec.GetDimensionality()+int(math.Min(float64(len(f)-i*this.innerDec.GetDimensionality()), float64(len(innerpartition))))]) 42 | tmp = this.innerDec.Decode(innerpartition) 43 | this.distance += this.innerDec.GetDistance() 44 | copy(ret[i*retLength:i*retLength+retLength], tmp[0:retLength]) 45 | } 46 | return ret 47 | } 48 | 49 | func (this *MultiDecoder) GetErrorRadius() float64 { 50 | return this.innerDec.GetErrorRadius() 51 | } 52 | 53 | func (this *MultiDecoder) GetDistance() float64 { 54 | return this.distance 55 | } 56 | 57 | func (this *MultiDecoder) GetVariance() float64 { 58 | return this.innerDec.GetVariance() 59 | } 60 | 61 | func (this 
*MultiDecoder) SetVariance(parameterObject float64) { 62 | this.innerDec.SetVariance(parameterObject) 63 | } 64 | -------------------------------------------------------------------------------- /decoder/spherical.go: -------------------------------------------------------------------------------- 1 | package decoder 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/utils" 5 | "math" 6 | "math/rand" 7 | ) 8 | 9 | var HashBits int = 64 10 | 11 | type Spherical struct { 12 | vAll [][][]float64 13 | hashbits int 14 | numDimensions int 15 | numHashFuncs int 16 | numSearchCopies int 17 | distance float64 18 | variance float64 19 | } 20 | 21 | func NewSpherical(numDimensions, numHashFuncs, numSearchCopies int) *Spherical { 22 | nvertex := 2.0 * numDimensions 23 | hashbits := int(math.Ceil(math.Log(float64(nvertex)) / math.Log(2))) 24 | kmax := int(HashBits / hashbits) 25 | if numHashFuncs > kmax { 26 | numHashFuncs = kmax 27 | } 28 | vAll := make([][][]float64, numHashFuncs*numSearchCopies) 29 | r := make([]*rand.Rand, numDimensions) 30 | 31 | for i := 0; i < numDimensions; i++ { 32 | r[i] = rand.New(rand.NewSource(int64(i))) 33 | } 34 | 35 | rotationMatrices := vAll 36 | for i := 0; i < numHashFuncs*numSearchCopies; i++ { 37 | rotationMatrices[i] = utils.RandomRotation(numDimensions, r) 38 | } 39 | vAll = rotationMatrices 40 | return &Spherical{ 41 | vAll: vAll, 42 | hashbits: hashbits, 43 | numDimensions: numDimensions, 44 | numHashFuncs: numHashFuncs, 45 | numSearchCopies: numSearchCopies, 46 | distance: 0.0, 47 | variance: 1.0, 48 | } 49 | } 50 | 51 | func (this *Spherical) GetDimensionality() int { 52 | return this.numDimensions 53 | } 54 | 55 | func (this *Spherical) GetErrorRadius() float64 { 56 | return float64(this.numDimensions) 57 | } 58 | 59 | func (this *Spherical) GetDistance() float64 { 60 | return this.distance 61 | } 62 | 63 | func (this *Spherical) Hash(p []float64) []int64 { 64 | ri := 0 65 | var h int64 66 | g := make([]int64, this.numSearchCopies) 67 | for i := 0; i < this.numSearchCopies; i++ { 68 | g[i] = 0 69 | for j := 0; j < this.numHashFuncs; j++ { 70 | vs := this.vAll[ri] 71 | h = utils.Argmaxi(p, vs, this.numDimensions) 72 | g[i] |= (h << (uint(this.hashbits * j))) 73 | ri++ 74 | } 75 | } 76 | return g 77 | } 78 | 79 | func (this *Spherical) GetVariance() float64 { 80 | return this.variance 81 | } 82 | 83 | func (this *Spherical) SetVariance(parameterObject float64) { 84 | this.variance = parameterObject 85 | } 86 | 87 | func (this *Spherical) Decode(f []float64) []int64 { 88 | return this.Hash(utils.Normalize(f)) 89 | } 90 | 91 | func InnerDecoder() *Spherical { 92 | return NewSpherical(32, 3, 1) 93 | } 94 | -------------------------------------------------------------------------------- /defaults/defaults.go: -------------------------------------------------------------------------------- 1 | package defaults 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/clusterer" 5 | "github.com/wilseypa/rphash-golang/decoder" 6 | "github.com/wilseypa/rphash-golang/hash" 7 | "github.com/wilseypa/rphash-golang/itemset" 8 | "github.com/wilseypa/rphash-golang/lsh" 9 | "github.com/wilseypa/rphash-golang/projector" 10 | "github.com/wilseypa/rphash-golang/reader" 11 | "github.com/wilseypa/rphash-golang/types" 12 | "github.com/wilseypa/rphash-golang/utils" 13 | ) 14 | 15 | func NewDecoder(dimension, rotations, numberOfSearches int) types.Decoder { 16 | return decoder.NewSpherical(dimension, rotations, numberOfSearches) 17 | } 18 | 19 | func NewProjector(n, t int, 
randomseed int64) types.Projector { 20 | return projector.NewDBFriendly(n, t, randomseed) 21 | } 22 | 23 | func NewHash(hashMod int64) types.Hash { 24 | return hash.NewMurmur(hashMod) 25 | } 26 | 27 | func NewKMeansWeighted(k int, centroids [][]float64, counts []int64) types.Clusterer { 28 | return clusterer.NewKMeansWeighted(k, centroids, counts) 29 | } 30 | 31 | func NewKMeansStream(k int, n int, dimenstionality int) types.Clusterer { 32 | return clusterer.NewKMeansStream(k, n, dimenstionality) 33 | } 34 | 35 | func NewKMeansSimple(k int, centroids [][]float64) types.Clusterer { 36 | return clusterer.NewKMeansSimple(k, centroids) 37 | } 38 | 39 | func NewCentroidStream(vec []float64) types.Centroid { 40 | return itemset.NewCentroidStream(vec) 41 | } 42 | 43 | func NewCentroidSimple(dim int, id int64) types.Centroid { 44 | return itemset.NewCentroidSimple(dim, id) 45 | } 46 | 47 | func NewCountMinSketch(k int) types.CountItemSet { 48 | return itemset.NewKHHCountMinSketch(k) 49 | } 50 | 51 | func NewCentroidCounter(k int) types.CentroidItemSet { 52 | return itemset.NewKHHCentroidCounter(k) 53 | } 54 | 55 | func NewLSH(hash types.Hash, decoder types.Decoder, projector types.Projector) types.LSH { 56 | return lsh.NewLSH(hash, decoder, projector) 57 | } 58 | 59 | func NewStatTest(vari float64) types.StatTest { 60 | return utils.NewStatTest(vari) 61 | } 62 | 63 | func NewSimpleArray(k int, data [][]float64) types.RPHashObject { 64 | return reader.NewSimpleArray(data, k) 65 | } 66 | 67 | func NewRPHashObject(dimension, k int) types.RPHashObject { 68 | return reader.NewStreamObject(dimension, k) 69 | } 70 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # RPHash – Clustering the MNIST Dataset 2 | This demo clusters the MNIST dataset into 10 clusters, one centroid for each digit classifier. 3 | 4 | 5 | ## How It Works 6 | 7 | The dataset is clustered with the RPHash algorithm. The input vectors representing digits are mapped to the clusterer and then centroids are found. 8 | The centroids are then plotted in the `demo/plots` directory where `centroid-drawing-#` are 28x28 plots of the centroids and `centroid-dimensions-#` is a plot of the strength of each dimension of the centroid vector. 9 | 10 | To run the demo, 11 | 12 | ```sh 13 | go run main.go 14 | ``` 15 | 16 | The output will look like this, 17 | 18 | ```sh 19 | # Benchmark output after clustering... 20 | # Time it takes to cluster 5,000 vectors with 784 dimensions. 
21 | 2016/04/07 13:31:09 Time: 2.358581704s 22 | ``` 23 | 24 | ```sh 25 | # locaiton of the plots 26 | demo/plots 27 | # location of the data 28 | demo/data 29 | ``` 30 | -------------------------------------------------------------------------------- /demo/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "time" 5 | "log" 6 | "bufio" 7 | "github.com/chrislusf/glow/flow" 8 | "github.com/gonum/plot" 9 | "github.com/gonum/plot/plotter" 10 | "github.com/gonum/plot/plotutil" 11 | "github.com/gonum/plot/vg" 12 | "github.com/wilseypa/rphash-golang/parse" 13 | "github.com/wilseypa/rphash-golang/reader" 14 | "github.com/wilseypa/rphash-golang/stream" 15 | "github.com/wilseypa/rphash-golang/types" 16 | "github.com/wilseypa/rphash-golang/utils" 17 | "math" 18 | // _ "github.com/chrislusf/glow/driver" 19 | "bytes" 20 | "io" 21 | "os" 22 | "strconv" 23 | "strings" 24 | ) 25 | 26 | var ( 27 | dataFilePath = "data/MNISTnumImages5000.txt" 28 | f = flow.New() 29 | expectedDimensions = -1 30 | numClusters = 10 31 | numShards = 8 32 | ) 33 | 34 | type Vector struct { 35 | Data []float64 36 | } 37 | 38 | func GeneratePlots(x, y [][]float64, title, xLabel, yLabel, fileName string, legendLabel []string) { 39 | outPlotPoints := make([]plotter.XYs, len(x)) 40 | outPlots := make([]*plot.Plot, len(x)) 41 | 42 | for i, _ := range outPlotPoints { 43 | outPlot, err := plot.New() 44 | outPlots[i] = outPlot 45 | outPlots[i].Title.Text = title 46 | outPlots[i].X.Label.Text = xLabel 47 | outPlots[i].Y.Label.Text = yLabel 48 | outPlotPoints[i] = make(plotter.XYs, len(x[0])) 49 | for j, _ := range x[0] { 50 | outPlotPoints[i][j].X = x[i][j] 51 | outPlotPoints[i][j].Y = y[i][j] 52 | } 53 | err = plotutil.AddLines(outPlots[i], 54 | legendLabel[i], outPlotPoints[i]) 55 | if err != nil { 56 | panic(err) 57 | } 58 | 59 | if err = outPlot.Save(6*vg.Inch, 6*vg.Inch, (fileName+strconv.FormatInt(int64(i), 16))+".png"); err != nil { 60 | panic(err) 61 | } 62 | } 63 | } 64 | 65 | // 784 Bits 66 | func Paint(image []float64, imageId int) { 67 | outPlotPoints := make(plotter.XYs, len(image)) 68 | outPlot, err := plot.New() 69 | if err != nil { 70 | panic(err) 71 | } 72 | x := 0 73 | y := 0 74 | for i, bit := range image { 75 | outPlotPoints[i].X = float64(x) 76 | if bit > 0.4 { 77 | outPlotPoints[i].Y = float64(y) 78 | } else { 79 | outPlotPoints[i].Y = 0 80 | } 81 | if i%int(math.Sqrt(float64(len(image)))) == 0 { 82 | x = 0 83 | y++ 84 | } else { 85 | x++ 86 | } 87 | } 88 | outPlot.Add(plotter.NewGrid()) 89 | s, _ := plotter.NewScatter(outPlotPoints) 90 | outPlot.Add(s) 91 | if err = outPlot.Save(6*vg.Inch, 6*vg.Inch, "plots/centroid-drawing-"+strconv.FormatInt(int64(imageId), 16)+".png"); err != nil { 92 | panic(err) 93 | } 94 | } 95 | 96 | func main() { 97 | var rphashObject *reader.StreamObject 98 | var rphashStream *stream.Stream 99 | var centroids []types.Centroid 100 | t1 := time.Now() 101 | // Split the data into shards and send them to the Agents to work on. 102 | f.Source(func(out chan Vector) { 103 | records, err := utils.ReadLines(dataFilePath) 104 | if err != nil { 105 | panic(err) 106 | } 107 | // Convert the record to standard floating points. 108 | for i, record := range records { 109 | if i == 0 { 110 | // Create a new RPHash stream. 
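        // The first record determines the stream's dimensionality and target cluster count; later records are assumed to have the same number of columns.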
111 | rphashObject = reader.NewStreamObject(len(record), numClusters) 112 | rphashStream = stream.NewStream(rphashObject) 113 | rphashStream.RunCount = 1 114 | } 115 | data := make([]float64, len(record)) 116 | for j, entry := range record { 117 | f, err := strconv.ParseFloat(entry, 64) 118 | f = parse.Normalize(f) 119 | if err != nil { 120 | panic(err) 121 | } 122 | data[j] = f 123 | } 124 | out <- Vector{Data: data} 125 | } 126 | }, numShards).Map(func(vec Vector) { 127 | centroids = append(centroids, rphashStream.AddVectorOnlineStep(vec.Data)) 128 | }).Run() 129 | 130 | for _, cent := range centroids { 131 | rphashStream.CentroidCounter.Add(cent) 132 | } 133 | normalizedResults := rphashStream.GetCentroids() 134 | t2 := time.Now() 135 | log.Println("Time: ", t2.Sub(t1)) 136 | 137 | denormalizedResults := make([][]float64, len(normalizedResults)) 138 | for i, result := range normalizedResults { 139 | row := make([]float64, len(result)) 140 | for j, dimension := range result { 141 | row[j] = parse.DeNormalize(dimension) 142 | } 143 | denormalizedResults[i] = row 144 | } 145 | labels := make([]string, len(denormalizedResults)) 146 | xPlotValues := make([][]float64, len(denormalizedResults)) 147 | yPlotValues := make([][]float64, len(denormalizedResults)) 148 | for i, result := range denormalizedResults { 149 | xPlotValues[i] = make([]float64, len(result)) 150 | yPlotValues[i] = make([]float64, len(result)) 151 | for j, val := range result { 152 | xPlotValues[i][j] = float64(j) 153 | yPlotValues[i][j] = val 154 | } 155 | Paint(result, i) 156 | sI := strconv.FormatInt(int64(i), 16) 157 | labels[i] = "Digit " + sI + " (by Classifier Centroid)" 158 | } 159 | GeneratePlots(xPlotValues, yPlotValues, "High Dimension Handwritting Digits 0-9 Classification", "Dimension", "Strength of Visual Pixel Recognition (0-1000)", "plots/centroid-dimensions-", labels) 160 | } 161 | -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-0.png -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-1.png -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-2.png -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-3.png -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-4.png -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-5.png -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-6.png -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-7.png -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-8.png -------------------------------------------------------------------------------- /demo/plots/centroid-dimensions-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-dimensions-9.png -------------------------------------------------------------------------------- /demo/plots/centroid-drawing-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-0.png -------------------------------------------------------------------------------- /demo/plots/centroid-drawing-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-1.png -------------------------------------------------------------------------------- /demo/plots/centroid-drawing-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-2.png -------------------------------------------------------------------------------- /demo/plots/centroid-drawing-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-3.png -------------------------------------------------------------------------------- /demo/plots/centroid-drawing-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-4.png -------------------------------------------------------------------------------- 
/demo/plots/centroid-drawing-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-5.png -------------------------------------------------------------------------------- /demo/plots/centroid-drawing-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-6.png -------------------------------------------------------------------------------- /demo/plots/centroid-drawing-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-7.png -------------------------------------------------------------------------------- /demo/plots/centroid-drawing-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-8.png -------------------------------------------------------------------------------- /demo/plots/centroid-drawing-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/demo/plots/centroid-drawing-9.png -------------------------------------------------------------------------------- /hash/murmur.go: -------------------------------------------------------------------------------- 1 | package hash 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/utils" 5 | ) 6 | 7 | const ( 8 | seed = int(0 >> 1) 9 | ) 10 | 11 | type Murmur struct { 12 | tablesize int64 13 | } 14 | 15 | func NewMurmur(tablesize int64) *Murmur { 16 | return &Murmur{ 17 | tablesize: tablesize, 18 | } 19 | } 20 | 21 | func (this *Murmur) Hash(data1 []int64) int64 { 22 | data := make([]byte, len(data1)*8) 23 | var ct = 0 24 | for _, d := range data1 { 25 | data[ct] = byte(uint64(d) >> 56) 26 | ct++ 27 | data[ct] = byte(uint64(d) >> 48) 28 | ct++ 29 | data[ct] = byte(uint64(d) >> 40) 30 | ct++ 31 | data[ct] = byte(uint64(d) >> 32) 32 | ct++ 33 | data[ct] = byte(uint64(d) >> 24) 34 | ct++ 35 | data[ct] = byte(uint64(d) >> 16) 36 | ct++ 37 | data[ct] = byte(uint64(d) >> 8) 38 | ct++ 39 | data[ct] = byte(uint64(d)) 40 | ct++ 41 | } 42 | m := 1540483477 43 | r := uint(24) 44 | h := seed ^ len(data) 45 | len := len(data) 46 | len_4 := len >> 2 47 | 48 | for i := 0; i < len_4; i++ { 49 | i_4 := i << 2 50 | k := int(data[i_4+3]) 51 | k = k << 8 52 | k = k | int(data[i_4+2]&0xff) 53 | k = k << 8 54 | k = k | int(data[i_4+1]&0xff) 55 | k = k << 8 56 | k = k | int(data[i_4+0]&0xff) 57 | k *= m 58 | k ^= int(uint64(k) >> r) 59 | k *= m 60 | h *= m 61 | h ^= k 62 | } 63 | 64 | len_m := len_4 << 2 65 | left := len - len_m 66 | 67 | if left != 0 { 68 | if left >= 3 { 69 | h ^= int(data[len-3] << 16) 70 | } 71 | if left >= 2 { 72 | h ^= int(data[len-2] << 8) 73 | } 74 | if left >= 1 { 75 | h ^= int(data[len-1]) 76 | } 77 | 78 | h *= m 79 | } 80 | h64 := int64(h) 81 | h64 = h64 ^ utils.RightShiftZeroExtension(h64, 13) 82 | h64 *= int64(m) 83 | h64 = h64 ^ utils.RightShiftZeroExtension(h64, 15) 84 | 85 | return h64 % this.tablesize 86 | } 87 | 
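To make the hash's role concrete, here is a minimal, hypothetical usage sketch: the table size 100003 is an arbitrary illustration (the project supplies its own hashMod through defaults.NewHash), and the input slice stands in for a decoded LSH bucket ID.

```go
package main

import (
	"fmt"

	"github.com/wilseypa/rphash-golang/hash"
)

func main() {
	// Map a decoded LSH ID (a slice of int64) onto a table with 100003 slots.
	// Go's % keeps the sign of the dividend, so the result may be negative.
	h := hash.NewMurmur(100003)
	bucket := h.Hash([]int64{42, 7, 19})
	fmt.Println("bucket:", bucket)
}
```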
-------------------------------------------------------------------------------- /itemset/centroid.go: -------------------------------------------------------------------------------- 1 | package itemset 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/types" 5 | "github.com/wilseypa/rphash-golang/utils" 6 | ) 7 | 8 | type Centroid struct { 9 | Vec []float64 10 | Count int64 11 | Ids *utils.Hash64Set 12 | Id int64 13 | } 14 | 15 | func NewCentroidStream(data []float64) *Centroid { 16 | return NewCentroidWeighted(data, 1); 17 | } 18 | 19 | func NewCentroidWeighted(data []float64, weight int64) *Centroid { 20 | return &Centroid{ 21 | Vec: data, 22 | Ids: utils.NewHash64Set(), 23 | Count: weight, 24 | Id: 0, 25 | } 26 | } 27 | 28 | func NewCentroidSimple(dim int, lsh int64) *Centroid { 29 | data := make([]float64, dim) 30 | Ids := utils.NewHash64Set() 31 | Ids.Add(lsh) 32 | return &Centroid{ 33 | Vec: data, 34 | Ids: Ids, 35 | Count: 0, 36 | Id: lsh, 37 | } 38 | } 39 | 40 | func (this *Centroid) UpdateVector(data []float64) { 41 | var delta, x float64 42 | this.Count++ 43 | for i := 0; i < len(data); i++ { 44 | x = data[i] 45 | delta = x - this.Vec[i] 46 | this.Vec[i] = this.Vec[i] + delta/float64(this.Count) 47 | } 48 | } 49 | 50 | func (this *Centroid) Centroid() []float64 { 51 | return this.Vec 52 | } 53 | 54 | func (this *Centroid) GetCount() int64 { 55 | return this.Count 56 | } 57 | 58 | func (this *Centroid) GetID() int64 { 59 | return this.Id 60 | } 61 | 62 | func (this *Centroid) GetIDs() types.HashSet { 63 | return this.Ids 64 | } 65 | 66 | func (this *Centroid) AddID(h int64) { 67 | if this.Ids.Length() == 0 { 68 | this.Id = h 69 | } 70 | this.Ids.Add(h) 71 | } 72 | -------------------------------------------------------------------------------- /itemset/khhcentroidcounter.go: -------------------------------------------------------------------------------- 1 | package itemset 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/types" 5 | "github.com/wilseypa/rphash-golang/utils" 6 | "math" 7 | "math/rand" 8 | "time" 9 | ) 10 | 11 | const ( 12 | width = 200000 13 | depth = 7 14 | ) 15 | 16 | type KHHCentroidCounter struct { 17 | depth int 18 | width int 19 | sketchTable [depth][width]int 20 | hashVector []int64 21 | count int64 22 | k int 23 | frequentItems map[int64]types.Centroid 24 | countlist map[int64]int64 25 | priorityQueue *utils.CentriodPriorityQueue 26 | topCentroid []types.Centroid 27 | counts []int64 28 | } 29 | 30 | func NewKHHCentroidCounter(k int) *KHHCentroidCounter { 31 | newK := int(float64(k)*math.Log(float64(k))) * 4 32 | seed := int64(time.Now().UnixNano() / int64(time.Millisecond)) 33 | priorityQueue := utils.NewCentroidPriorityQueue() 34 | frequentItems := make(map[int64]types.Centroid) 35 | countlist := make(map[int64]int64) 36 | var sketchTable [depth][width]int 37 | hashVector := make([]int64, depth) 38 | random := rand.New(rand.NewSource(seed)) 39 | for i := 0; i < depth; i++ { 40 | hashVector[i] = random.Int63() 41 | } 42 | var result = &KHHCentroidCounter{ 43 | depth: depth, 44 | width: width, 45 | sketchTable: sketchTable, 46 | } 47 | result.hashVector = hashVector 48 | result.k = newK 49 | result.countlist = countlist 50 | result.priorityQueue = priorityQueue 51 | result.frequentItems = frequentItems 52 | return result 53 | } 54 | 55 | func (this *KHHCentroidCounter) Add(centroid types.Centroid) { 56 | this.count++ 57 | count := this.AddLong(centroid.GetID(), 1) 58 | probed := this.frequentItems[centroid.GetID()] 59 | 
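    // Drop any existing entry for this ID so it can be merged and re-enqueued below; the lookup above tells us whether a merge (probed != nil) or a fresh insert is needed.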
delete(this.frequentItems, centroid.GetID()) 60 | 61 | /*for i := 0; i < centroid.GetIDs().Length(); i++ { 62 | if probed != nil { 63 | break; 64 | } 65 | if centroid.GetIDs().Get(int64(i)) { 66 | delete(this.frequentItems, int64(i)); 67 | probed = this.frequentItems[int64(i)]; 68 | } 69 | }*/ 70 | 71 | if probed == nil { 72 | this.countlist[centroid.GetID()] = count 73 | this.frequentItems[centroid.GetID()] = centroid 74 | this.priorityQueue.Enqueue(centroid) 75 | } else { 76 | this.priorityQueue.Remove(centroid.GetID()) 77 | probed.UpdateVector(centroid.Centroid()) 78 | probed.GetIDs().AddAll(centroid.GetIDs()) 79 | this.frequentItems[probed.GetID()] = probed 80 | this.countlist[probed.GetID()] = count 81 | this.priorityQueue.Enqueue(probed) 82 | } 83 | 84 | if this.priorityQueue.Size() > this.k { 85 | removed := this.priorityQueue.Poll() 86 | delete(this.frequentItems, removed.GetID()) 87 | delete(this.countlist, removed.GetID()) 88 | } 89 | } 90 | 91 | func (this *KHHCentroidCounter) Hash(item int64, i int) int { 92 | const PRIME_MODULUS = uint64((int64(1) << 31) - 1) 93 | hash := uint64(this.hashVector[i] * item) 94 | hash += hash >> 32 95 | hash &= PRIME_MODULUS 96 | return int(hash % uint64(this.width)) 97 | } 98 | 99 | /** 100 | * Add item hashed to a long value to count min sketch table add long comes 101 | * from streaminer documentation 102 | * @param item 103 | * @param count 104 | * @return size of min count bucket 105 | */ 106 | func (this *KHHCentroidCounter) AddLong(item, count int64) int64 { 107 | this.sketchTable[0][int(this.Hash(item, 0))] += int(count) 108 | min := this.sketchTable[0][int(this.Hash(item, 0))] 109 | for i := 1; i < depth; i++ { 110 | this.sketchTable[i][int(this.Hash(item, i))] += int(count) 111 | if this.sketchTable[i][int(this.Hash(item, i))] < min { 112 | min = this.sketchTable[i][int(this.Hash(item, i))] 113 | } 114 | } 115 | return int64(min) 116 | } 117 | 118 | func (this *KHHCentroidCounter) Count(item int64) int64 { 119 | min := this.sketchTable[0][int(this.Hash(item, 0))] 120 | for i := 1; i < this.depth; i++ { 121 | if this.sketchTable[i][int(this.Hash(item, i))] < min { 122 | min = this.sketchTable[i][int(this.Hash(item, i))] 123 | } 124 | } 125 | return int64(min) 126 | } 127 | 128 | func (this *KHHCentroidCounter) GetTop() []types.Centroid { 129 | if this.topCentroid != nil { 130 | return this.topCentroid 131 | } 132 | this.topCentroid = []types.Centroid{} 133 | this.counts = []int64{} 134 | for !this.priorityQueue.IsEmpty() { 135 | tmp := this.priorityQueue.Poll() 136 | this.topCentroid = append(this.topCentroid, tmp) 137 | this.counts = append(this.counts, this.Count(tmp.GetID())) 138 | } 139 | return this.topCentroid 140 | } 141 | 142 | func (this *KHHCentroidCounter) GetCount() int64 { 143 | return this.count 144 | } 145 | 146 | func (this *KHHCentroidCounter) GetCounts() []int64 { 147 | if this.counts != nil { 148 | return this.counts 149 | } 150 | this.GetTop() 151 | return this.counts 152 | } 153 | -------------------------------------------------------------------------------- /itemset/khhcountminsketch.go: -------------------------------------------------------------------------------- 1 | package itemset 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/utils" 5 | "math" 6 | "math/rand" 7 | "time" 8 | ) 9 | 10 | type KHHCountMinSketch struct { 11 | depth int 12 | width int 13 | sketchTable [depth][width]int64 14 | hashVector []int64 15 | size int64 16 | priorityQueue *utils.Int64PriorityQueue 17 | k int 18 | items 
map[int64]int64 19 | count int64 20 | counts []int64 21 | topCentroid []int64 22 | } 23 | 24 | func NewKHHCountMinSketch(m int) *KHHCountMinSketch { 25 | k := int(float64(m) * math.Log(float64(m))) 26 | seed := int64(time.Now().UnixNano() / int64(time.Millisecond)) 27 | items := make(map[int64]int64) 28 | var sketchTable [depth][width]int64 29 | hashVector := make([]int64, depth) 30 | random := rand.New(rand.NewSource(seed)) 31 | for i := 0; i < depth; i++ { 32 | hashVector[i] = random.Int63n(math.MaxInt64) 33 | } 34 | result := new(KHHCountMinSketch) 35 | result.k = k 36 | result.items = items 37 | result.sketchTable = sketchTable 38 | result.width = width 39 | result.depth = depth 40 | result.size = 0 41 | result.hashVector = hashVector 42 | result.priorityQueue = utils.NewInt64PriorityQueue() 43 | result.topCentroid = nil 44 | return result 45 | } 46 | 47 | func (this *KHHCountMinSketch) Hash(item int64, i int) int { 48 | PRIME_MODULUS := int64(math.MaxInt64) 49 | hash := this.hashVector[i] * item 50 | hash += hash >> 64 51 | hash &= PRIME_MODULUS 52 | return int(hash) % this.width 53 | } 54 | 55 | func (this *KHHCountMinSketch) Add(e int64) { 56 | var hashCode = utils.HashCode(e) 57 | count := this.AddLong(hashCode, 1) 58 | if this.items[hashCode] != 0 { 59 | this.priorityQueue.Remove(e) 60 | } 61 | this.items[hashCode] = e 62 | this.priorityQueue.Enqueue(e, count) 63 | if this.priorityQueue.Size() > this.k { 64 | removed := this.priorityQueue.Poll() 65 | delete(this.items, removed) 66 | } 67 | } 68 | 69 | func (this *KHHCountMinSketch) AddLong(item, count int64) int64 { 70 | this.sketchTable[0][this.Hash(item, 0)] += count 71 | min := int64(this.sketchTable[0][this.Hash(item, 0)]) 72 | for i := 1; i < this.depth; i++ { 73 | this.sketchTable[i][this.Hash(item, i)] += count 74 | if this.sketchTable[i][this.Hash(item, i)] < min { 75 | min = int64(this.sketchTable[i][this.Hash(item, i)]) 76 | } 77 | } 78 | this.size += count 79 | return min 80 | } 81 | 82 | func (this *KHHCountMinSketch) GetCount() int64 { 83 | return this.count 84 | } 85 | 86 | func (this *KHHCountMinSketch) GetCounts() []int64 { 87 | if this.counts != nil { 88 | return this.counts 89 | } 90 | this.GetTop() 91 | return this.counts 92 | } 93 | 94 | func (this *KHHCountMinSketch) GetTop() []int64 { 95 | if this.topCentroid != nil { 96 | return this.topCentroid 97 | } 98 | this.topCentroid = []int64{} 99 | this.counts = []int64{} 100 | for !this.priorityQueue.IsEmpty() { 101 | this.counts = append(this.counts, this.priorityQueue.PeakMinPriority()) 102 | tmp := this.priorityQueue.Poll() 103 | this.topCentroid = append(this.topCentroid, tmp) 104 | } 105 | return this.topCentroid 106 | } 107 | -------------------------------------------------------------------------------- /lsh/lsh.go: -------------------------------------------------------------------------------- 1 | package lsh 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/types" 5 | "math/rand" 6 | ) 7 | 8 | type LSH struct { 9 | hash types.Hash 10 | decoder types.Decoder 11 | projector types.Projector 12 | distance float64 13 | noise [][]float64 14 | radius float64 15 | } 16 | 17 | func NewLSH(hash types.Hash, 18 | decoder types.Decoder, 19 | projector types.Projector) *LSH { 20 | return &LSH{ 21 | hash: hash, 22 | decoder: decoder, 23 | projector: projector, 24 | distance: 0.0, 25 | noise: nil, 26 | radius: decoder.GetErrorRadius() / float64(decoder.GetDimensionality()), 27 | } 28 | } 29 | 30 | func (this *LSH) GenerateNoiseTable(len, times int) { 31 | 
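  // Build times-1 Gaussian noise vectors, each component scaled by the decoder's per-dimension error radius; LSHHashStream adds them to the projected query to produce additional decodings of nearby points.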
this.noise = [][]float64{} 32 | for j := 1; j < times; j++ { 33 | tmp := make([]float64, len) 34 | for k := 0; k < len; k++ { 35 | tmp[k] = rand.NormFloat64() * this.radius 36 | } 37 | this.noise = append(this.noise, tmp) 38 | } 39 | } 40 | 41 | func (this *LSH) LSHHashStream(r []float64, times int) []int64 { 42 | if this.noise == nil { 43 | this.GenerateNoiseTable(len(r), times) 44 | } 45 | pr_r := this.projector.Project(r) 46 | nonoise := this.decoder.Decode(pr_r) 47 | ret := make([]int64, times*len(nonoise)) 48 | copy(ret[0:len(nonoise)], nonoise[0:]) 49 | 50 | rtmp := make([]float64, len(pr_r)) 51 | var tmp []float64 52 | for j := 1; j < times; j++ { 53 | copy(rtmp[0:len(pr_r)], pr_r[0:]) 54 | tmp = this.noise[j-1] 55 | for k := 0; k < len(pr_r); k++ { 56 | rtmp[k] = rtmp[k] + tmp[k] 57 | } 58 | nonoise = this.decoder.Decode(rtmp) 59 | copy(ret[j*len(nonoise):j*len(nonoise)+len(nonoise)], nonoise[0:]) 60 | } 61 | return ret 62 | } 63 | 64 | func (this *LSH) LSHHashSimple(r []float64) int64 { 65 | projectedSpace := this.projector.Project(r) 66 | decodedSpace := this.decoder.Decode(projectedSpace) 67 | hashedResult := this.hash.Hash(decodedSpace) 68 | return hashedResult 69 | } 70 | 71 | func (this *LSH) Distance() float64 { 72 | return this.distance 73 | } 74 | 75 | func (this *LSH) UpdateDecoderVariance(vari float64) { 76 | this.decoder.SetVariance(vari) 77 | } 78 | -------------------------------------------------------------------------------- /overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/overview.png -------------------------------------------------------------------------------- /parse/parser.go: -------------------------------------------------------------------------------- 1 | package parse 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "math" 7 | "reflect" 8 | ) 9 | 10 | var ( 11 | fixedDecimalPoint = 18 12 | floatType = reflect.TypeOf(float64(0)) 13 | weightMax = math.Abs(ToFixed(math.MaxFloat64, fixedDecimalPoint)) 14 | weightMin = float64(0) 15 | ) 16 | 17 | type Schema struct { 18 | dataType reflect.Type 19 | max float64 20 | min float64 21 | } 22 | 23 | func NewSchema(value float64) *Schema { 24 | return &Schema{ 25 | dataType: reflect.TypeOf(value), 26 | max: value, 27 | min: value, 28 | } 29 | } 30 | 31 | func (this *Schema) SetMax(floatValue float64) { 32 | this.max = floatValue 33 | } 34 | 35 | func (this *Schema) SetMin(floatValue float64) { 36 | this.min = floatValue 37 | } 38 | 39 | func (this *Schema) GetMax() float64 { 40 | return this.max 41 | } 42 | 43 | func (this *Schema) GetMin() float64 { 44 | return this.min 45 | } 46 | 47 | func Round(num float64) int { 48 | return int(num + math.Copysign(0.5, num)) 49 | } 50 | 51 | func ToFixed(num float64, precision int) float64 { 52 | output := math.Pow(10, float64(precision)) 53 | return float64(Round(num*output)) / output 54 | } 55 | 56 | func Normalize(value float64) float64 { 57 | return (value - weightMin) / (weightMax - weightMin) 58 | } 59 | 60 | func DeNormalize(normalized float64) float64 { 61 | return (normalized*(weightMax-weightMin) + weightMin) 62 | } 63 | 64 | type Parser struct { 65 | schemaKeys []string 66 | schema map[string]*Schema 67 | label string 68 | } 69 | 70 | func NewParser() *Parser { 71 | var schemaKeys []string 72 | return &Parser{ 73 | label: "", 74 | schema: nil, 75 | schemaKeys: schemaKeys, 76 | } 77 | } 78 | 79 | // Convert an array of 
bytes to a JSON struct. 80 | func (this *Parser) BytesToJSON(bytesContents []byte) map[string]interface{} { 81 | var data map[string]interface{} 82 | if err := json.Unmarshal(bytesContents, &data); err != nil { 83 | panic(err) 84 | } 85 | return data 86 | } 87 | 88 | func (this *Parser) JSONToBytes(jsonMap interface{}) []byte { 89 | bytesContents, _ := json.MarshalIndent(jsonMap, "", " ") 90 | return bytesContents 91 | } 92 | 93 | // Convert a json object with a schema to an array of 64 bit floats. 94 | func (this *Parser) JSONToFloat64(jsonMap map[string]interface{}) []float64 { 95 | 96 | // Create an array of 64 bit floats of the same size. 97 | result := make([]float64, len(this.schemaKeys)) 98 | 99 | // Iterate over the json fields and assign floating point values to each field value. 100 | for i := 0; i < len(this.schemaKeys); i++ { 101 | // Normalize the mapped value 102 | key := this.schemaKeys[i] 103 | float, _ := this.ConvertInterfaceToFloat64(jsonMap[key]) 104 | result[i] = Normalize(float) 105 | } 106 | return result 107 | } 108 | 109 | // Convert an array of 64 bit floats to JSON according to a schema. 110 | func (this *Parser) Float64ToJSON(floats []float64) map[string]interface{} { 111 | // Create an JSON object. 112 | jsonMap := make(map[string]interface{}) 113 | 114 | for i := 0; i < len(this.schemaKeys); i++ { 115 | // DeNormalize the mapped value 116 | jsonMap[this.schemaKeys[i]] = DeNormalize(floats[i]) 117 | } 118 | return jsonMap 119 | } 120 | 121 | // Convert a JSON table to a array of float64 arrays. 122 | // The data should come in like this: 123 | // { 124 | // "label": [{ 125 | // "field-1": "value-1", 126 | // ... 127 | // }, { 128 | // "field-1": "value-1", 129 | // ... 130 | // }] 131 | // } 132 | func (this *Parser) JSONToFloat64Matrix(label string, dataSet map[string]interface{}) [][]float64 { 133 | // Assign a label to the specific schema. 134 | this.label = label 135 | 136 | // Read the data in as an array of json objects. 137 | data := dataSet[label].([]interface{}) 138 | count := len(data) 139 | 140 | // Allocate an array of arrays for the return. 141 | matrix := make([][]float64, count, count) 142 | 143 | // Create a schema based on an entry in the data. 144 | this.schema = this.CreateSchema(data) 145 | 146 | // Convert the json data to weighted float values. 147 | for i := 0; i < count; i++ { 148 | matrix[i] = this.JSONToFloat64(data[i].(map[string]interface{})) 149 | } 150 | return matrix 151 | } 152 | 153 | // Convert a matrix of 64 bit floats to JSON according to a json schema. 154 | // label - string associated with JSON data set schema. 155 | // data - the array of arrays associated with the entries of data. 156 | func (this *Parser) Float64MatrixToJSON(label string, dataSet [][]float64) map[string]interface{} { 157 | count := len(dataSet) 158 | 159 | // Create an array of JSON objects. 160 | data := make([]interface{}, count) 161 | 162 | // Create a json object to hold the array of JSON objects with the specific label. 163 | result := make(map[string]interface{}) 164 | 165 | // Convert the weighted float values back to the JSON using the schema. 166 | for i := 0; i < count; i++ { 167 | data[i] = this.Float64ToJSON(dataSet[i]) 168 | } 169 | 170 | // Assign the label to the json data. 171 | result[this.label] = data 172 | return result 173 | } 174 | 175 | // Convert an unknown interface to a 64 bit floating point. 
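// Any numeric value (JSON numbers arrive as float64, but ints and other numeric kinds work too) is converted via reflection; non-convertible values return an error rather than panicking.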
176 | // From stackoverflow.com 177 | func (this *Parser) ConvertInterfaceToFloat64(unk interface{}) (float64, error) { 178 | v := reflect.ValueOf(unk) 179 | v = reflect.Indirect(v) 180 | if !v.Type().ConvertibleTo(floatType) { 181 | return 0, errors.New("Cannot convert" + v.Type().String() + "to float64") 182 | } 183 | fv := v.Convert(floatType) 184 | return fv.Float(), nil 185 | } 186 | 187 | // Create a schema based on a JSON object. 188 | func (this *Parser) CreateSchema(data []interface{}) map[string]*Schema { 189 | count := len(data) 190 | 191 | // Set up a base schema. 192 | schema := make(map[string]*Schema) 193 | 194 | // Loop over each JSON object in the array update the schema associated schema. 195 | for i := 0; i < count; i++ { 196 | // Convert the data to a json object. 197 | jsonMap := data[i].(map[string]interface{}) 198 | 199 | // Loop over its key -> value pairs. 200 | for key, value := range jsonMap { 201 | floatValue, _ := this.ConvertInterfaceToFloat64(value) 202 | // Has the schema not been added for the key? 203 | if _, ok := schema[key]; !ok { 204 | // Assign the key associated with the JSON field to its value type max and min. 205 | schema[key] = NewSchema(floatValue) 206 | 207 | // Assure the keys are in the proper order. 208 | this.schemaKeys = append(this.schemaKeys, key) 209 | continue 210 | } 211 | 212 | // Check if the next value is less than the current minimum 213 | // Check if the next value is greater than the current maximum 214 | if floatValue < schema[key].GetMin() { 215 | schema[key].SetMin(floatValue) 216 | } else if floatValue > schema[key].GetMax() { 217 | schema[key].SetMax(floatValue) 218 | } 219 | } 220 | } 221 | return schema 222 | } 223 | -------------------------------------------------------------------------------- /plots/kmeans/centroid-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-0.png -------------------------------------------------------------------------------- /plots/kmeans/centroid-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-1.png -------------------------------------------------------------------------------- /plots/kmeans/centroid-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-2.png -------------------------------------------------------------------------------- /plots/kmeans/centroid-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-3.png -------------------------------------------------------------------------------- /plots/kmeans/centroid-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-4.png -------------------------------------------------------------------------------- /plots/kmeans/centroid-5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-5.png -------------------------------------------------------------------------------- /plots/kmeans/centroid-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-6.png -------------------------------------------------------------------------------- /plots/kmeans/centroid-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-7.png -------------------------------------------------------------------------------- /plots/kmeans/centroid-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-8.png -------------------------------------------------------------------------------- /plots/kmeans/centroid-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/centroid-9.png -------------------------------------------------------------------------------- /plots/kmeans/heat0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat0.png -------------------------------------------------------------------------------- /plots/kmeans/heat1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat1.png -------------------------------------------------------------------------------- /plots/kmeans/heat2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat2.png -------------------------------------------------------------------------------- /plots/kmeans/heat3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat3.png -------------------------------------------------------------------------------- /plots/kmeans/heat4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat4.png -------------------------------------------------------------------------------- /plots/kmeans/heat5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat5.png -------------------------------------------------------------------------------- /plots/kmeans/heat6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat6.png 
-------------------------------------------------------------------------------- /plots/kmeans/heat7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat7.png -------------------------------------------------------------------------------- /plots/kmeans/heat8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat8.png -------------------------------------------------------------------------------- /plots/kmeans/heat9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/heat9.png -------------------------------------------------------------------------------- /plots/kmeans/paint0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint0.png -------------------------------------------------------------------------------- /plots/kmeans/paint1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint1.png -------------------------------------------------------------------------------- /plots/kmeans/paint2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint2.png -------------------------------------------------------------------------------- /plots/kmeans/paint3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint3.png -------------------------------------------------------------------------------- /plots/kmeans/paint4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint4.png -------------------------------------------------------------------------------- /plots/kmeans/paint5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint5.png -------------------------------------------------------------------------------- /plots/kmeans/paint6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint6.png -------------------------------------------------------------------------------- /plots/kmeans/paint7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint7.png -------------------------------------------------------------------------------- /plots/kmeans/paint8.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint8.png -------------------------------------------------------------------------------- /plots/kmeans/paint9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/kmeans/paint9.png -------------------------------------------------------------------------------- /plots/rphash/centroid-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-0.png -------------------------------------------------------------------------------- /plots/rphash/centroid-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-1.png -------------------------------------------------------------------------------- /plots/rphash/centroid-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-2.png -------------------------------------------------------------------------------- /plots/rphash/centroid-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-3.png -------------------------------------------------------------------------------- /plots/rphash/centroid-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-4.png -------------------------------------------------------------------------------- /plots/rphash/centroid-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-5.png -------------------------------------------------------------------------------- /plots/rphash/centroid-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-6.png -------------------------------------------------------------------------------- /plots/rphash/centroid-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-7.png -------------------------------------------------------------------------------- /plots/rphash/centroid-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-8.png -------------------------------------------------------------------------------- /plots/rphash/centroid-9.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/centroid-9.png -------------------------------------------------------------------------------- /plots/rphash/heat0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat0.png -------------------------------------------------------------------------------- /plots/rphash/heat1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat1.png -------------------------------------------------------------------------------- /plots/rphash/heat2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat2.png -------------------------------------------------------------------------------- /plots/rphash/heat3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat3.png -------------------------------------------------------------------------------- /plots/rphash/heat4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat4.png -------------------------------------------------------------------------------- /plots/rphash/heat5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat5.png -------------------------------------------------------------------------------- /plots/rphash/heat6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat6.png -------------------------------------------------------------------------------- /plots/rphash/heat7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat7.png -------------------------------------------------------------------------------- /plots/rphash/heat8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat8.png -------------------------------------------------------------------------------- /plots/rphash/heat9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/heat9.png -------------------------------------------------------------------------------- /plots/rphash/paint0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint0.png -------------------------------------------------------------------------------- /plots/rphash/paint1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint1.png -------------------------------------------------------------------------------- /plots/rphash/paint2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint2.png -------------------------------------------------------------------------------- /plots/rphash/paint3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint3.png -------------------------------------------------------------------------------- /plots/rphash/paint4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint4.png -------------------------------------------------------------------------------- /plots/rphash/paint5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint5.png -------------------------------------------------------------------------------- /plots/rphash/paint6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint6.png -------------------------------------------------------------------------------- /plots/rphash/paint7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint7.png -------------------------------------------------------------------------------- /plots/rphash/paint8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint8.png -------------------------------------------------------------------------------- /plots/rphash/paint9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/plots/rphash/paint9.png -------------------------------------------------------------------------------- /projector/dbfriendly.go: -------------------------------------------------------------------------------- 1 | package projector 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | ) 7 | 8 | type DBFriendly struct { 9 | negativeVectorIndices [][]int 10 | positiveVectorIndices [][]int 11 | inputDimensionality int 12 | targetDimensionality int 13 | random *rand.Rand 14 | } 15 | 16 | func NewDBFriendly(inputDimensionality, targetDimensionality int, randomseed int64) *DBFriendly { 17 | const NONZEROINDICESCHANCE = 6 18 | rando := 
rand.New(rand.NewSource(randomseed)) 19 | negativeVectorIndices, positiveVectorIndices := make([][]int, targetDimensionality), make([][]int, targetDimensionality) 20 | rM, rP := 0, 0 21 | probability := inputDimensionality / NONZEROINDICESCHANCE 22 | for i := 0; i < targetDimensionality; i++ { 23 | orderedNegativeIndices, orderedPositiveIndices := make([]int, probability), make([]int, probability) 24 | for j := 0; j < inputDimensionality; j++ { 25 | rM, rP = rando.Intn(NONZEROINDICESCHANCE), rando.Intn(NONZEROINDICESCHANCE) 26 | if rM == 0 { 27 | orderedNegativeIndices = append(orderedNegativeIndices, int(j)) 28 | } else if rP == 0 { 29 | orderedPositiveIndices = append(orderedPositiveIndices, int(j)) 30 | } 31 | } 32 | negativeRow, positiveRow := make([]int, len(orderedNegativeIndices)), make([]int, len(orderedPositiveIndices)) 33 | for k, val := range orderedNegativeIndices { 34 | negativeRow[k] = val 35 | } 36 | for k, val := range orderedPositiveIndices { 37 | positiveRow[k] = val 38 | } 39 | negativeVectorIndices[i], positiveVectorIndices[i] = negativeRow, positiveRow 40 | } 41 | 42 | return &DBFriendly{ 43 | negativeVectorIndices: negativeVectorIndices, 44 | positiveVectorIndices: positiveVectorIndices, 45 | inputDimensionality: inputDimensionality, 46 | targetDimensionality: targetDimensionality, 47 | random: rando, 48 | } 49 | } 50 | 51 | func (this *DBFriendly) Project(inputVector []float64) []float64 { 52 | var sum float64 53 | reducedVector := make([]float64, this.targetDimensionality) 54 | scale := math.Sqrt(3 / float64(this.targetDimensionality)) 55 | for i := 0; i < this.targetDimensionality; i++ { 56 | sum = 0 57 | for _, val := range this.negativeVectorIndices[i] { 58 | if val >= len(inputVector) || val < 0 { 59 | continue 60 | } 61 | sum -= inputVector[val] * scale 62 | } 63 | for _, val := range this.positiveVectorIndices[i] { 64 | if val >= len(inputVector) || val < 0 { 65 | continue 66 | } 67 | sum += inputVector[val] * scale 68 | } 69 | reducedVector[i] = sum 70 | } 71 | return reducedVector 72 | } 73 | -------------------------------------------------------------------------------- /projector/fjlt.go: -------------------------------------------------------------------------------- 1 | package projector; 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | ); 7 | 8 | type FJLT struct { 9 | D []float64; 10 | P []float64; 11 | n int64; 12 | k int64; 13 | d int64; 14 | random *rand.Rand; 15 | }; 16 | 17 | /** 18 | * Allocate a new instance of the FJLT. 19 | * @return {*FJLT}. 
20 | */ 21 | func NewFJLT(d, k, n int64) *FJLT { 22 | random := rand.New(rand.NewSource(n)); 23 | epsilon := float64(math.Sqrt(math.Log(float64(n)) / float64(k))); 24 | P := GenerateP(n, k, d, 2, epsilon, random); 25 | D := GenerateD(d, random); 26 | return &FJLT{ 27 | D: D, 28 | P: P, 29 | n: n, 30 | k: k, 31 | d: d, 32 | }; 33 | }; 34 | 35 | func SGEMV(t, n, startpoint, startoutput int64, M, v, result []float64, alpha float64) { 36 | var sum float64; 37 | var i, j int64; 38 | for i = 0; i < t; i++ { 39 | sum = 0.0; 40 | for j = 0; j < n; j++ { 41 | sum += v[j+startpoint] * M[i*n+j]; 42 | result[startoutput+i] = sum * alpha; 43 | } 44 | } 45 | }; 46 | 47 | func GenerateP(numberOfPoints, numRows, numCols, embeddingType int64, epsilon float64, random *rand.Rand) []float64 { 48 | data := make([]float64, numRows*numCols); 49 | probability := float64((math.Pow(epsilon, float64(embeddingType-2)) * math.Pow(math.Log(float64(numberOfPoints)), float64(embeddingType))) / float64(numCols)); 50 | if !(probability < 1) { 51 | probability = 1; 52 | } 53 | rdata := make([]float64, numRows*numCols); 54 | InvRandN(data, numRows, numCols, 0, 1/float64(probability), random); 55 | RandU(rdata, numRows, numCols, random); 56 | var i, j int64; 57 | for i = 0; i < numRows; i++ { 58 | for j = 0; j < numCols; j++ { 59 | if rdata[i*numCols+j] < probability { 60 | data[i*numCols+j] *= 0; 61 | } else { 62 | data[i*numCols+j] *= 1; 63 | } 64 | } 65 | } 66 | return data; 67 | }; 68 | 69 | func GenerateD(d int64, random *rand.Rand) []float64 { 70 | var i, j, l int64; 71 | data := make([]float64, d); 72 | for i = 0; i < d; { 73 | l = random.Int63(); 74 | for j = 0; j < 32 && i < d; j++ { 75 | if (l & 1) == 1 { 76 | data[i] = 1; 77 | } else { 78 | data[i] = -1; 79 | } 80 | l = l >> 1; 81 | i++; 82 | } 83 | } 84 | return data; 85 | }; 86 | 87 | func InvRandN(data []float64, m, n int64, mu, vari float64, random *rand.Rand) { 88 | var i, j int64; 89 | sd := float64(math.Sqrt(vari)); 90 | for i = 0; i < m; i++ { 91 | for j = 0; j < n; j++ { 92 | data[i*n+j] = mu + sd*float64(MoroInvCND(random.Float64())); 93 | } 94 | } 95 | }; 96 | 97 | func RandU(data []float64, m, n int64, random *rand.Rand) { 98 | var i, j int64; 99 | for i = 0; i < m; i++ { 100 | for j = 0; j < n; j++ { 101 | data[i*n+j] = random.Float64(); 102 | } 103 | } 104 | }; 105 | 106 | func MoroInvCND(P float64) float64 { 107 | var z float64; 108 | a1 := 2.50662823884; 109 | a2 := -18.61500062529; 110 | a3 := 41.39119773534; 111 | a4 := -25.44106049637; 112 | b1 := -8.4735109309; 113 | b2 := 23.08336743743; 114 | b3 := -21.06224101826; 115 | b4 := 3.13082909833; 116 | c1 := 0.337475482272615; 117 | c2 := 0.976169019091719; 118 | c3 := 0.160797971491821; 119 | c4 := 2.76438810333863E-02; 120 | c5 := 3.8405729373609E-03; 121 | c6 := 3.951896511919E-04; 122 | c7 := 3.21767881768E-05; 123 | c8 := 2.888167364E-07; 124 | c9 := 3.960315187E-07; 125 | 126 | if P <= 0 || P >= 1.0 { 127 | /* Caused by numerical instability of random */ 128 | P = 0.9999; 129 | } 130 | y := P - 0.5; 131 | if math.Abs(y) < 0.42 { 132 | z = y * y; 133 | z = y * (((a4*z+a3)*z+a2)*z + a1) / ((((b4*z+b3)*z+b2)*z+b1)*z + 1); 134 | } else { 135 | if y > 0 { 136 | z = float64(math.Log(-math.Log(1.0 - P))); 137 | } else { 138 | z = float64(math.Log(-math.Log(P))); 139 | z = c1 + z*(c2+z*(c3+z*(c4+z*(c5+z*(c6+z*(c7+z*(c8+z*c9))))))); 140 | if y < 0 { 141 | z = -z; 142 | } 143 | } 144 | } 145 | return z; 146 | }; 147 | 148 | /** 149 | * Performs the FJLT on a matrix. 150 | * @class {FJLT} this. 
151 | * @param {[]float64} input, Matrix. 152 | * @return {[]float64} new Matrix. 153 | */ 154 | func (this *FJLT) FJLT(input []float64) []float64 { 155 | var a, b, c uint64; 156 | var curr int64; 157 | result := make([]float64, this.n*this.k); 158 | for curr = 0; curr < this.n; curr++ { 159 | startpoint := curr * this.d; 160 | startoutput := this.k * curr; 161 | for a = 0; a < uint64(this.d); a++ { 162 | input[int64(a)+startpoint] *= this.D[a]; 163 | } 164 | l2 := uint64(math.Log(float64(this.d)) / math.Log(2)); 165 | for a = 0; a < l2; a++ { 166 | for b = 0; b < (1 << l2); b += (1 << (a + 1)) { 167 | for c = 0; c < (1 << a); c++ { 168 | temp := input[startpoint+int64(b+c)]; 169 | input[startpoint+int64(b+c)] += input[startpoint+int64(b+c+(1<> 1)), 37 | numberOfProjections: 1, 38 | numberOfBlurs: 1, 39 | k: k, 40 | topIDs: topIDs, 41 | centroids: centroids, 42 | numDataPoints: 0, 43 | } 44 | } 45 | 46 | func (this *StreamObject) GetK() int { 47 | return this.k 48 | } 49 | func (this *StreamObject) NumDataPoints() int { 50 | return this.numDataPoints 51 | } 52 | 53 | func (this *StreamObject) GetDimensions() int { 54 | return this.dimension 55 | } 56 | 57 | func (this *StreamObject) GetRandomSeed() int64 { 58 | return this.randomSeed 59 | } 60 | 61 | func (this *StreamObject) GetNumberOfBlurs() int { 62 | return this.numberOfBlurs 63 | } 64 | 65 | func (this *StreamObject) GetVectorIterator() types.Iterator { 66 | return this.data 67 | } 68 | 69 | func (this *StreamObject) AppendVector(vector []float64) { 70 | this.numDataPoints++ 71 | this.data.Append(vector) 72 | } 73 | 74 | func (this *StreamObject) GetCentroids() [][]float64 { 75 | return this.centroids 76 | } 77 | 78 | func (this *StreamObject) GetPreviousTopID() []int64 { 79 | return this.topIDs 80 | } 81 | 82 | func (this *StreamObject) SetPreviousTopID(top []int64) { 83 | this.topIDs = top 84 | } 85 | 86 | func (this *StreamObject) AddCentroid(v []float64) { 87 | this.centroids = append(this.centroids, v) 88 | } 89 | 90 | func (this *StreamObject) SetCentroids(l [][]float64) { 91 | this.centroids = l 92 | } 93 | 94 | func (this *StreamObject) GetNumberOfProjections() int { 95 | return this.numberOfProjections 96 | } 97 | 98 | func (this *StreamObject) SetNumberOfProjections(probes int) { 99 | this.numberOfProjections = probes 100 | } 101 | 102 | func (this *StreamObject) SetNumberOfBlurs(parseInt int) { 103 | this.numberOfBlurs = parseInt 104 | } 105 | 106 | func (this *StreamObject) SetRandomSeed(parseLong int64) { 107 | this.randomSeed = parseLong 108 | } 109 | 110 | func (this *StreamObject) GetHashModulus() int64 { 111 | return this.hashModulus 112 | } 113 | 114 | func (this *StreamObject) SetHashModulus(parseLong int64) { 115 | this.hashModulus = int64(parseLong) 116 | } 117 | 118 | func (this *StreamObject) SetDecoderType(dec types.Decoder) { 119 | this.decoder = dec 120 | } 121 | 122 | func (this *StreamObject) GetDecoderType() types.Decoder { 123 | return this.decoder 124 | } 125 | 126 | func (this *StreamObject) SetVariance(data [][]float64) { 127 | this.decoder.SetVariance(utils.VarianceSample(data, 0.01)) 128 | } 129 | 130 | func (this *StreamObject) GetVariance() float64 { 131 | return this.decoder.GetVariance() 132 | } 133 | -------------------------------------------------------------------------------- /rphash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/rphash 
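For reference, the streaming API exposed by reader and stream can also be driven directly, without the glow pipeline used in rphash.go below. The following is a minimal, illustrative sketch only; the CSV path and cluster count are placeholders, not part of the library.

package main

import (
  "fmt"

  "github.com/wilseypa/rphash-golang/reader"
  "github.com/wilseypa/rphash-golang/stream"
  "github.com/wilseypa/rphash-golang/utils"
)

func main() {
  numClusters := 6                          // placeholder value
  records := utils.ReadCSV("./dataset.csv") // placeholder path; any dense numeric CSV
  object := reader.NewStreamObject(len(records[0]), numClusters)
  clusterer := stream.NewStream(object)
  for _, record := range records {
    clusterer.AppendVector(record) // LSH-hashes each vector on a background goroutine
  }
  // GetCentroids drains the pending hashes, then runs weighted k-means on the top centroids.
  for _, centroid := range clusterer.GetCentroids() {
    fmt.Println(centroid)
  }
}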
-------------------------------------------------------------------------------- /rphash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/gob" 5 | "sync" 6 | "flag" 7 | "fmt" 8 | _ "github.com/chrislusf/glow/driver" 9 | "github.com/chrislusf/glow/flow" 10 | "github.com/wilseypa/rphash-golang/parse" 11 | "github.com/wilseypa/rphash-golang/reader" 12 | "github.com/wilseypa/rphash-golang/stream" 13 | "github.com/wilseypa/rphash-golang/itemset" 14 | "github.com/wilseypa/rphash-golang/utils" 15 | "os" 16 | "time" 17 | ) 18 | 19 | var ( 20 | f = flow.New() 21 | expectedDimensions = -1 22 | numClusters = 6 23 | ) 24 | 25 | type Centroid struct { 26 | C *itemset.Centroid 27 | } 28 | 29 | func goStart(wg *sync.WaitGroup, fn func()) { 30 | wg.Add(1) 31 | go func() { 32 | defer wg.Done() 33 | fn() 34 | }() 35 | } 36 | 37 | func main() { 38 | gob.Register(Centroid{}) 39 | gob.Register(itemset.Centroid{}) 40 | gob.Register(utils.Hash64Set{}) 41 | flag.Parse() 42 | 43 | t1 := time.Now() 44 | records := utils.ReadCSV("./dataset.csv") 45 | 46 | Object := reader.NewStreamObject(len(records[0]), numClusters) 47 | Stream := stream.NewStream(Object) 48 | 49 | outChannel := make(chan Centroid) 50 | 51 | ch := make(chan []float64) 52 | 53 | source := f.Channel(ch) 54 | 55 | f1 := source.Map(func(record []float64) Centroid { 56 | return Centroid{C:Stream.AddVectorOnlineStep(record)} 57 | }).AddOutput(outChannel) 58 | 59 | flow.Ready() 60 | 61 | var wg sync.WaitGroup 62 | 63 | goStart(&wg, func() { 64 | f1.Run() 65 | }) 66 | 67 | goStart(&wg, func() { 68 | for out := range outChannel { 69 | Stream.CentroidCounter.Add(out.C) 70 | } 71 | }) 72 | 73 | for _, record := range records { 74 | ch <- record 75 | } 76 | 77 | close(ch) 78 | wg.Wait() 79 | 80 | normalizedResults := Stream.GetCentroids() 81 | ts := time.Since(t1) 82 | 83 | file, err := os.OpenFile("./results.txt", os.O_WRONLY|os.O_CREATE, 0644) 84 | if err != nil { 85 | panic(err) 86 | } 87 | defer file.Close() 88 | for _, result := range normalizedResults { 89 | for _, dimension := range result { 90 | file.WriteString(fmt.Sprintf("%f ", parse.DeNormalize(dimension))) 91 | } 92 | file.WriteString("\n") 93 | } 94 | file.WriteString("Time: " + ts.String()) 95 | } 96 | -------------------------------------------------------------------------------- /rphash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilseypa/rphash-golang/114931fd03c3cb07bb6bbc6b2d3c9f236e854844/rphash.png -------------------------------------------------------------------------------- /simple/simple.go: -------------------------------------------------------------------------------- 1 | package simple 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/defaults" 5 | "github.com/wilseypa/rphash-golang/types" 6 | "math" 7 | "runtime" 8 | ) 9 | 10 | type Simple struct { 11 | centroids [][]float64 12 | variance float64 13 | rphashObject types.RPHashObject 14 | } 15 | 16 | func NewSimple(_rphashObject types.RPHashObject) *Simple { 17 | return &Simple{ 18 | variance: 0, 19 | centroids: nil, 20 | rphashObject: _rphashObject, 21 | } 22 | } 23 | 24 | // Map is doing the count. 
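// Each incoming vector is projected down to roughly half its original dimensionality, decoded,
// and hashed to a single int64 by the LSH; a count-min sketch then tracks the most frequent
// hash IDs, and the top IDs become the candidate centroids that Reduce fills in below.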
25 | func (this *Simple) Map() *Simple { 26 | runtime.GOMAXPROCS(runtime.NumCPU()) 27 | vecs := this.rphashObject.GetVectorIterator() 28 | //var hashResult int64; 29 | targetDimension := int(math.Floor(float64(this.rphashObject.GetDimensions() / 2))) 30 | numberOfRotations := 6 31 | numberOfSearches := 1 32 | vec := vecs.Next() 33 | hash := defaults.NewHash(this.rphashObject.GetHashModulus()) 34 | decoder := defaults.NewDecoder(targetDimension, numberOfRotations, numberOfSearches) 35 | projector := defaults.NewProjector(this.rphashObject.GetDimensions(), decoder.GetDimensionality(), this.rphashObject.GetRandomSeed()) 36 | LSH := defaults.NewLSH(hash, decoder, projector) 37 | // k := int(float64(this.rphashObject.GetK()) * math.Log(float64(this.rphashObject.GetK()))); 38 | CountMinSketch := defaults.NewCountMinSketch(this.rphashObject.GetK()) 39 | var vecCount = 0 40 | //1000 is an arbitrary comprise between speed and size should be tweeked later. 41 | hashChannel := make(chan int64, this.rphashObject.NumDataPoints()) 42 | hashValues := make([]int64, this.rphashObject.NumDataPoints(), this.rphashObject.NumDataPoints()) 43 | for vecs.HasNext() { 44 | go func(vec []float64, index int) { 45 | // Project the Vector to lower dimension. 46 | // Decode the new vector for meaningful integers 47 | // Hash the new vector into a 64 bit int. 48 | value := LSH.LSHHashSimple(vec) 49 | hashValues[index] = value 50 | hashChannel <- value 51 | //hashResult = LSH.LSHHashSimple(vec); 52 | // Add it to the count min sketch to update frequencies. 53 | }(vec, vecCount) 54 | vecCount++ 55 | vec = vecs.Next() 56 | } 57 | vecs.StoreLSHValues(hashValues) 58 | //TODO should we Paralelize this? slowest loop but also have to wait for LSH Loops 59 | for i := 0; i < vecCount; i++ { 60 | hashResult := <-hashChannel 61 | CountMinSketch.Add(hashResult) 62 | } 63 | this.rphashObject.SetPreviousTopID(CountMinSketch.GetTop()) 64 | vecs.Reset() 65 | return this 66 | } 67 | 68 | // Reduce is finding out where the centroids are in respect to the real data. 69 | func (this *Simple) Reduce() *Simple { 70 | vecs := this.rphashObject.GetVectorIterator() 71 | if !vecs.HasNext() { 72 | return this 73 | } 74 | 75 | var centroids []types.Centroid 76 | for i := 0; i < this.rphashObject.GetK(); i++ { 77 | // Get the top centroids. 78 | previousTop := this.rphashObject.GetPreviousTopID() 79 | centroid := defaults.NewCentroidSimple(this.rphashObject.GetDimensions(), previousTop[i]) 80 | centroids = append(centroids, centroid) 81 | } 82 | 83 | // Iterate over the dataset and check CountMinSketch. 84 | //Paralelize loop 85 | var centriodChannels []chan []float64 86 | for i, _ := range centroids { 87 | centriodChannels = append(centriodChannels, make(chan []float64, 10000)) 88 | go func(id int) { 89 | for true { 90 | newVec, ok := <-centriodChannels[id] 91 | if !ok { 92 | return 93 | } 94 | centroids[id].UpdateVector(newVec) 95 | } 96 | }(i) 97 | } 98 | vec := vecs.Next() 99 | var hashResult = int64(0) 100 | for vecs.HasNext() { 101 | hashResult = vecs.PeakLSH() 102 | // For each vector, check if it is a centroid. 103 | for i, cent := range centroids { 104 | // Get an idea where the LSH is in respect to the vector. 
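// A vector is routed to a centroid's channel when its stored LSH value matches one of that
// centroid's hash IDs; the first matching centroid wins and the loop moves on to the next vector.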
105 | if cent.GetIDs().Contains(hashResult) { 106 | //centriodChannels[i] <- vec; 107 | centriodChannels[i] <- vec 108 | break 109 | } 110 | } 111 | vec = vecs.Next() 112 | } 113 | for _, channel := range centriodChannels { 114 | close(channel) 115 | } 116 | 117 | for _, cent := range centroids { 118 | this.rphashObject.AddCentroid(cent.Centroid()) 119 | } 120 | 121 | vecs.Reset() 122 | return this 123 | } 124 | 125 | func (this *Simple) GetCentroids() [][]float64 { 126 | if this.centroids == nil { 127 | this.Run() 128 | } 129 | // Perform the KMeans on the centroids. 130 | result := defaults.NewKMeansSimple(this.rphashObject.GetK(), this.centroids).GetCentroids() 131 | return result 132 | } 133 | 134 | func (this *Simple) Run() { 135 | this.Map().Reduce() 136 | this.centroids = this.rphashObject.GetCentroids() 137 | } 138 | 139 | func (this *Simple) GetRPHash() types.RPHashObject { 140 | return this.rphashObject 141 | } 142 | -------------------------------------------------------------------------------- /stream/stream.go: -------------------------------------------------------------------------------- 1 | package stream 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/defaults" 5 | "github.com/wilseypa/rphash-golang/types" 6 | "github.com/wilseypa/rphash-golang/itemset" 7 | "math/rand" 8 | ) 9 | 10 | type Stream struct { 11 | processedCount int 12 | vectorCount int 13 | counts []int64 14 | centroids [][]float64 15 | variance float64 16 | CentroidCounter types.CentroidItemSet 17 | randomSeedGenerator *rand.Rand 18 | rphashObject types.RPHashObject 19 | lshGroup []types.LSH 20 | decoder types.Decoder 21 | projector types.Projector 22 | hash types.Hash 23 | varianceTracker types.StatTest 24 | lshChannel chan *itemset.Centroid 25 | } 26 | 27 | func NewStream(rphashObject types.RPHashObject) *Stream { 28 | randomSeedGenerator := rand.New(rand.NewSource(rphashObject.GetRandomSeed())) 29 | hash := defaults.NewHash(rphashObject.GetHashModulus()) 30 | decoder := rphashObject.GetDecoderType() 31 | varianceTracker := defaults.NewStatTest(0.01) 32 | projections := rphashObject.GetNumberOfProjections() 33 | k := rphashObject.GetK() * projections 34 | CentroidCounter := defaults.NewCentroidCounter(k) 35 | lshGroup := make([]types.LSH, projections) 36 | lshChannel := make(chan *itemset.Centroid, 10000) 37 | var projector types.Projector 38 | for i := 0; i < projections; i++ { 39 | projector = defaults.NewProjector(rphashObject.GetDimensions(), decoder.GetDimensionality(), randomSeedGenerator.Int63()) 40 | lshGroup[i] = defaults.NewLSH(hash, decoder, projector) 41 | } 42 | return &Stream{ 43 | counts: nil, 44 | centroids: nil, 45 | variance: 0, 46 | processedCount: 0, 47 | vectorCount: 0, 48 | CentroidCounter: CentroidCounter, 49 | randomSeedGenerator: randomSeedGenerator, 50 | rphashObject: rphashObject, 51 | lshGroup: lshGroup, 52 | hash: hash, 53 | decoder: decoder, 54 | projector: projector, 55 | varianceTracker: varianceTracker, 56 | lshChannel: lshChannel, 57 | } 58 | } 59 | 60 | func (this *Stream) AddVectorOnlineStep(vec []float64) *itemset.Centroid { 61 | c := itemset.NewCentroidStream(vec) 62 | tmpvar := this.varianceTracker.UpdateVarianceSample(vec) 63 | 64 | if this.variance != tmpvar { 65 | for _, lsh := range this.lshGroup { 66 | lsh.UpdateDecoderVariance(tmpvar) 67 | } 68 | this.variance = tmpvar 69 | } 70 | 71 | for _, lsh := range this.lshGroup { 72 | hash := lsh.LSHHashStream(vec, this.rphashObject.GetNumberOfBlurs()) 73 | 74 | for _, h := range hash { 75 | c.AddID(h) 76 | } 77 
| } 78 | return c 79 | } 80 | 81 | func (this *Stream) GetCentroids() [][]float64 { 82 | if this.centroids == nil { 83 | this.Run() 84 | var centroids [][]float64 85 | for _, cent := range this.CentroidCounter.GetTop() { 86 | centroids = append(centroids, cent.Centroid()) 87 | } 88 | this.centroids = defaults.NewKMeansWeighted(this.rphashObject.GetK(), centroids, this.CentroidCounter.GetCounts()).GetCentroids() 89 | } 90 | return this.centroids 91 | } 92 | 93 | func (this *Stream) AppendVector(vector []float64) { 94 | //JF this check is required to stop from overflowing memory in the lshChannel with very large data sets. 95 | if (this.vectorCount - this.processedCount) > 100000 { 96 | this.Run() 97 | } 98 | this.vectorCount++ 99 | go func(vector []float64) { 100 | this.lshChannel <- this.AddVectorOnlineStep(vector) 101 | return; 102 | }(vector) 103 | } 104 | 105 | func (this *Stream) Run() { 106 | for this.processedCount < this.vectorCount { 107 | cent := <- this.lshChannel 108 | this.CentroidCounter.Add(cent) 109 | this.processedCount++ 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /tests/clusterer_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/clusterer" 5 | "testing" 6 | ) 7 | 8 | func TestClustererUniformVectors(t *testing.T) { 9 | //initilize data 10 | var numClusters = 2 11 | var numDataPoints = 8 12 | var dimensionality = 4 13 | data := make([][]float64, numDataPoints) 14 | for i := 0; i < numDataPoints; i++ { 15 | data[i] = make([]float64, dimensionality) 16 | for j := 0; j < dimensionality; j++ { 17 | data[i][j] = float64(i) 18 | } 19 | } 20 | 21 | //run test 22 | clusterer := clusterer.NewKMeansSimple(numClusters, data) 23 | clusterer.Run() 24 | var result = clusterer.GetCentroids() 25 | 26 | //Test Results 27 | if len(result) != numClusters { 28 | t.Errorf("Clusterer created %v clusters. When %v was input for k.", len(result), numClusters) 29 | } 30 | if len(result[0]) != dimensionality { 31 | t.Errorf("Cluster dimensionalioty of %v does not match the dimensionality of the input data, %v.", len(result[0]), dimensionality) 32 | } 33 | expectedResults := make([]float64, numClusters) 34 | expectedResults[0] = 1.5 // (0+1+2+3)/4 = 1.5 35 | expectedResults[1] = 5.5 // (4+5+6+7)/4 = 5.5 36 | for i := 0; i < numClusters; i++ { 37 | for j := 0; j < dimensionality; j++ { 38 | if result[i][j] != expectedResults[i] { 39 | t.Errorf("Data did not cluster as expected. Data: %v, Clusters: %v. 
Failure at %v, %v.", data, result, i, j) 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tests/data/people.json: -------------------------------------------------------------------------------- 1 | { 2 | "people": [{ 3 | "Height": 5.2, 4 | "Weight": 182 5 | }, { 6 | "Height": 4.51, 7 | "Weight": 158 8 | }, { 9 | "Height": 5.36, 10 | "Weight": 124 11 | }, { 12 | "Height": 6.68, 13 | "Weight": 115 14 | }, { 15 | "Height": 4.96, 16 | "Weight": 143 17 | }, { 18 | "Height": 6.37, 19 | "Weight": 118 20 | }, { 21 | "Height": 4.98, 22 | "Weight": 195 23 | }, { 24 | "Height": 6.03, 25 | "Weight": 191 26 | }, { 27 | "Height": 4.92, 28 | "Weight": 196 29 | }, { 30 | "Height": 5.52, 31 | "Weight": 119 32 | }, { 33 | "Height": 5.64, 34 | "Weight": 116 35 | }, { 36 | "Height": 4.09, 37 | "Weight": 146 38 | }, { 39 | "Height": 4.28, 40 | "Weight": 219 41 | }, { 42 | "Height": 4.61, 43 | "Weight": 291 44 | }, { 45 | "Height": 6.34, 46 | "Weight": 203 47 | }, { 48 | "Height": 4.47, 49 | "Weight": 285 50 | }, { 51 | "Height": 5.33, 52 | "Weight": 160 53 | }, { 54 | "Height": 6.41, 55 | "Weight": 197 56 | }, { 57 | "Height": 6.73, 58 | "Weight": 189 59 | }, { 60 | "Height": 4.38, 61 | "Weight": 205 62 | }, { 63 | "Height": 6.93, 64 | "Weight": 196 65 | }, { 66 | "Height": 5.83, 67 | "Weight": 155 68 | }, { 69 | "Height": 5.24, 70 | "Weight": 221 71 | }, { 72 | "Height": 5.47, 73 | "Weight": 115 74 | }, { 75 | "Height": 6.3, 76 | "Weight": 173 77 | }, { 78 | "Height": 5.09, 79 | "Weight": 132 80 | }, { 81 | "Height": 5.46, 82 | "Weight": 146 83 | }, { 84 | "Height": 5.2, 85 | "Weight": 172 86 | }, { 87 | "Height": 4.13, 88 | "Weight": 212 89 | }, { 90 | "Height": 5.47, 91 | "Weight": 219 92 | }, { 93 | "Height": 5.53, 94 | "Weight": 257 95 | }, { 96 | "Height": 6.07, 97 | "Weight": 197 98 | }, { 99 | "Height": 5.81, 100 | "Weight": 173 101 | }, { 102 | "Height": 4.51, 103 | "Weight": 157 104 | }, { 105 | "Height": 5.45, 106 | "Weight": 278 107 | }, { 108 | "Height": 4.87, 109 | "Weight": 292 110 | }, { 111 | "Height": 6.65, 112 | "Weight": 185 113 | }, { 114 | "Height": 6.1, 115 | "Weight": 105 116 | }, { 117 | "Height": 5.36, 118 | "Weight": 286 119 | }, { 120 | "Height": 6.08, 121 | "Weight": 196 122 | }, { 123 | "Height": 6.83, 124 | "Weight": 233 125 | }, { 126 | "Height": 5.47, 127 | "Weight": 102 128 | }, { 129 | "Height": 5.94, 130 | "Weight": 149 131 | }, { 132 | "Height": 6.17, 133 | "Weight": 133 134 | }, { 135 | "Height": 5.49, 136 | "Weight": 158 137 | }, { 138 | "Height": 5.56, 139 | "Weight": 260 140 | }, { 141 | "Height": 5.64, 142 | "Weight": 123 143 | }, { 144 | "Height": 5.65, 145 | "Weight": 118 146 | }, { 147 | "Height": 4.94, 148 | "Weight": 172 149 | }, { 150 | "Height": 5.08, 151 | "Weight": 211 152 | }, { 153 | "Height": 6.81, 154 | "Weight": 242 155 | }, { 156 | "Height": 4.15, 157 | "Weight": 296 158 | }, { 159 | "Height": 4.45, 160 | "Weight": 294 161 | }, { 162 | "Height": 4.33, 163 | "Weight": 272 164 | }, { 165 | "Height": 6.71, 166 | "Weight": 185 167 | }, { 168 | "Height": 5.67, 169 | "Weight": 113 170 | }, { 171 | "Height": 5.03, 172 | "Weight": 106 173 | }, { 174 | "Height": 5.47, 175 | "Weight": 280 176 | }, { 177 | "Height": 6.79, 178 | "Weight": 267 179 | }, { 180 | "Height": 5.07, 181 | "Weight": 288 182 | }, { 183 | "Height": 4.22, 184 | "Weight": 139 185 | }, { 186 | "Height": 5.13, 187 | "Weight": 198 188 | }, { 189 | "Height": 5.26, 190 | "Weight": 148 191 | }, { 192 | "Height": 6.4, 193 | "Weight": 
297 194 | }, { 195 | "Height": 6.98, 196 | "Weight": 283 197 | }, { 198 | "Height": 5.21, 199 | "Weight": 282 200 | }, { 201 | "Height": 4.18, 202 | "Weight": 121 203 | }, { 204 | "Height": 6.02, 205 | "Weight": 124 206 | }, { 207 | "Height": 4.17, 208 | "Weight": 284 209 | }, { 210 | "Height": 5.72, 211 | "Weight": 233 212 | }, { 213 | "Height": 5.86, 214 | "Weight": 297 215 | }, { 216 | "Height": 5.96, 217 | "Weight": 278 218 | }, { 219 | "Height": 4.62, 220 | "Weight": 169 221 | }, { 222 | "Height": 6.7, 223 | "Weight": 202 224 | }, { 225 | "Height": 5.34, 226 | "Weight": 235 227 | }, { 228 | "Height": 5.32, 229 | "Weight": 264 230 | }, { 231 | "Height": 6.18, 232 | "Weight": 274 233 | }, { 234 | "Height": 6.32, 235 | "Weight": 201 236 | }, { 237 | "Height": 5.93, 238 | "Weight": 158 239 | }, { 240 | "Height": 5.1, 241 | "Weight": 291 242 | }, { 243 | "Height": 5.4, 244 | "Weight": 267 245 | }, { 246 | "Height": 5.51, 247 | "Weight": 146 248 | }, { 249 | "Height": 6.66, 250 | "Weight": 264 251 | }, { 252 | "Height": 5.17, 253 | "Weight": 122 254 | }, { 255 | "Height": 5.49, 256 | "Weight": 219 257 | }, { 258 | "Height": 5.45, 259 | "Weight": 142 260 | }, { 261 | "Height": 6.16, 262 | "Weight": 230 263 | }, { 264 | "Height": 4.28, 265 | "Weight": 235 266 | }, { 267 | "Height": 4.22, 268 | "Weight": 187 269 | }, { 270 | "Height": 6.94, 271 | "Weight": 229 272 | }, { 273 | "Height": 5.43, 274 | "Weight": 257 275 | }, { 276 | "Height": 6.7, 277 | "Weight": 113 278 | }, { 279 | "Height": 4.07, 280 | "Weight": 204 281 | }, { 282 | "Height": 6.21, 283 | "Weight": 159 284 | }, { 285 | "Height": 5.41, 286 | "Weight": 234 287 | }, { 288 | "Height": 5.73, 289 | "Weight": 201 290 | }, { 291 | "Height": 5.48, 292 | "Weight": 147 293 | }, { 294 | "Height": 5.09, 295 | "Weight": 171 296 | }, { 297 | "Height": 5.09, 298 | "Weight": 206 299 | }, { 300 | "Height": 5.05, 301 | "Weight": 113 302 | }] 303 | } 304 | -------------------------------------------------------------------------------- /tests/decoder_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/decoder" 5 | "github.com/wilseypa/rphash-golang/hash" 6 | "github.com/wilseypa/rphash-golang/utils" 7 | "math/rand" 8 | "testing" 9 | "time" 10 | ) 11 | 12 | func TestSpherical(t *testing.T) { 13 | var dimension, k, l, iterations int = 64, 6, 4, 10000 14 | sphere := decoder.NewSpherical(dimension, k, l) 15 | var collisions int = 0 16 | var distavg float64 = 0.0 17 | for j := 0; j < iterations; j++ { 18 | p1, p2 := make([]float64, dimension), make([]float64, dimension) 19 | for k := 0; k < dimension; k++ { 20 | p1[k] = rand.Float64()*2 - 1 21 | p2[k] = rand.Float64()*2 - 1 22 | } 23 | /* Get the distance of each vector from eachother. */ 24 | distavg += utils.Distance(p1, p2) 25 | mh := hash.NewMurmur(1<<63 - 1) 26 | /* Decode from 24-dimensions -> 1-dimensional integer */ 27 | hp1, hp2 := sphere.Hash(utils.Normalize(p1)), sphere.Hash(utils.Normalize(p2)) 28 | /* Blurring the integers into a smaller space. */ 29 | hash1, hash2 := mh.Hash(hp1), mh.Hash(hp2) 30 | if hash1 == hash2 { 31 | collisions++ 32 | } 33 | } 34 | if collisions > (iterations / 100) { 35 | t.Errorf("More than 1 percent of the iterations resulted in collisions. 
%v collisions in %v iterations.", 36 | collisions, iterations) 37 | } 38 | t.Log("Average Distance: ", distavg/float64(iterations)) 39 | t.Log("Percent collisions : ", float64(collisions)/float64(iterations)) 40 | t.Log("√ Spherical Decoder test complete") 41 | } 42 | 43 | func BenchmarkSpherical(b *testing.B) { 44 | b.StopTimer() 45 | randomSeed := rand.New(rand.NewSource(time.Now().UnixNano())) 46 | var d, k, l int = 64, 6, 4 47 | sphere := decoder.NewSpherical(d, k, l) 48 | p1, p2 := make([]float64, d), make([]float64, d) 49 | for i := 0; i < b.N; i++ { 50 | for j := 0; j < d; j++ { 51 | p1[j], p2[j] = randomSeed.NormFloat64(), randomSeed.NormFloat64() 52 | } 53 | b.StartTimer() 54 | hp1, hp2 := sphere.Hash(utils.Normalize(p1)), sphere.Hash(utils.Normalize(p2)) 55 | b.StopTimer() 56 | if hp1 == nil || hp2 == nil { 57 | b.Error("Spherical hashes are null") 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /tests/hash_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/hash" 5 | "testing" 6 | ) 7 | 8 | func TestMurmur(t *testing.T) { 9 | var hashSize = 100 10 | 11 | testHash := hash.NewMurmur(int64(hashSize)) 12 | resultCount := make([]int64, hashSize) 13 | for i := 0; i < hashSize*100; i++ { 14 | fakeArray := make([]int64, 1) 15 | fakeArray[0] = int64(i) 16 | resultCount[testHash.Hash(fakeArray)]++ 17 | } 18 | var maxCount = hashSize * 13 / 10 19 | var minCount = hashSize * 7 / 10 20 | for index, indexCount := range resultCount { 21 | if indexCount > int64(maxCount) || indexCount < int64(minCount) { 22 | t.Errorf("X - Expected between %d - %d results, for index %d got %d", minCount, maxCount, index, indexCount) 23 | } 24 | } 25 | t.Log("√ Murmur Hash test complete") 26 | } 27 | 28 | func BenchmarkLargeArray(b *testing.B) { 29 | var hashSize = 1000 30 | testHash := hash.NewMurmur(int64(hashSize)) 31 | testArray := make([]int64, 1000) 32 | for i := 0; i < b.N; i++ { 33 | b.StopTimer() 34 | for j := 0; j < len(testArray); j++ { 35 | testArray[j] = int64(i * j) 36 | } 37 | b.StartTimer() 38 | testHash.Hash(testArray) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tests/itemset_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/itemset" 5 | "math/rand" 6 | "testing" 7 | ) 8 | 9 | func TestCountMinSketchCounts(t *testing.T) { 10 | var numToAdd = 10000 11 | var rangeOfValues = 100 12 | k := 100 13 | khh := itemset.NewKHHCountMinSketch(k) 14 | for i := 1; i < numToAdd; i++ { 15 | khh.Add(int64(i % rangeOfValues)) 16 | } 17 | var counts = khh.GetCounts() 18 | for count := range counts { 19 | // Count Min Sketch gaurentees that the count it returns for any value will be equal to or greater than the actual value 20 | if counts[count] > int64(numToAdd/rangeOfValues) { 21 | t.Errorf("All values in the count min sketch should be greater than or equal to the actual value. 
\n"+ 22 | "Actual value was %d, but returned value for entry %d was %d.", numToAdd/rangeOfValues, count, counts[count]) 23 | } 24 | } 25 | } 26 | 27 | func TestCountMinSketchGetTop(t *testing.T) { 28 | var rangeOfValues = 100 29 | k := 10 30 | khh := itemset.NewKHHCountMinSketch(k) 31 | for i := 1; i < rangeOfValues; i++ { 32 | for j := 1; j < i; j++ { 33 | khh.Add(int64(i % rangeOfValues)) 34 | } 35 | } 36 | //expected result [99 98 97 96 95 ect] 37 | //There is a bug here 38 | counts := khh.GetCounts() 39 | 40 | for i, value := range khh.GetTop() { 41 | //Due to initilization the time each number is entered is one less than the number itself 42 | if counts[i] != (value - 1) { 43 | t.Errorf("The count for element %v was not correct. Expected %v, Actual %v.", i, (value - 1), counts[i]) 44 | } 45 | } 46 | 47 | var previousValue = khh.GetTop()[0] - 1 // initilize to expected value so first loop doesn't fail 48 | //since each value is entered onece more than the previous value the minqueue 49 | //should be the last k values entered, in acesnding order, ending with the max value, 99. 50 | for i, value := range khh.GetTop() { 51 | if value != (previousValue + 1) { 52 | t.Errorf("The element at %v was not the expected value. Expected %v, Actual %v.", i, (previousValue + 1), value) 53 | } 54 | previousValue = value 55 | } 56 | if khh.GetTop()[len(khh.GetTop())-1] != 99 { 57 | t.Errorf("The top count was not the correct the correct value. Expected %v, Actual %v", 99, khh.GetTop()[len(khh.GetTop())-1]) 58 | } 59 | } 60 | 61 | func BenchmarkCountMinSketchAdd(b *testing.B) { 62 | k := 100 63 | khh := itemset.NewKHHCountMinSketch(k) 64 | for i := 0; i < b.N; i++ { 65 | khh.Add(rand.Int63()) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tests/lsh_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/decoder" 5 | "github.com/wilseypa/rphash-golang/hash" 6 | "github.com/wilseypa/rphash-golang/lsh" 7 | "github.com/wilseypa/rphash-golang/projector" 8 | "math" 9 | "testing" 10 | ) 11 | 12 | // The datapoints are seeded in so that the first two data points are near eachother in euclidian geometery and the 3rd and 4th datapoint are 13 | // near eachother in euclidian geometery. So the result1Cluster1 and result2Cluster1 should be closer together than the other two points. 14 | // The same is true for the points in cluster two vs either point in cluster one. 15 | func TestLSHSimple(t *testing.T) { 16 | var seed int64 = 0 17 | // We want to limit the dimension reduction because it causes a lot of noise. 
18 | var inDimensions, outDimentions, numberOfClusters, numberOfSearches int = 10, 5, 3, 1 19 | dataPoint1Cluster1 := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} 20 | dataPoint2Cluster1 := []float64{2.0, 3.0, 2.0, 6.0, 5.5, 2.0, 8.0, 3.1, 2.0, 0.0} 21 | 22 | dataPoint1Cluster2 := []float64{100.0, -120.0, 6.0, 18.0, 209.0, 0.0, -2.0, 1036.0, 15.0, 123.0} 23 | dataPoint2Cluster2 := []float64{99.0, -119.0, 2.0, 18.0, 208.5, 0.0, -3.0, 1048.0, 13.0, 122.0} 24 | 25 | hash := hash.NewMurmur(1<<63 - 1) 26 | decoder := decoder.NewSpherical(inDimensions, numberOfClusters, numberOfSearches) 27 | projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) 28 | lsh := lsh.NewLSH(hash, decoder, projector) 29 | result1Cluster1 := lsh.LSHHashSimple(dataPoint1Cluster1) 30 | result2Cluster1 := lsh.LSHHashSimple(dataPoint2Cluster1) 31 | result1Cluster2 := lsh.LSHHashSimple(dataPoint1Cluster2) 32 | result2Cluster2 := lsh.LSHHashSimple(dataPoint2Cluster2) 33 | // Assert that results are still localy sensetive based on the original euclidian geometry 34 | if math.Abs(float64(result1Cluster1-result2Cluster1)) > math.Abs(float64(result1Cluster1-result1Cluster2)) { 35 | t.Errorf("\nThe first datapoint in cluster two is closer to the first data point in cluster one than the second data point in cluster one"+ 36 | "\ndatapoint cluster one datapoint one: %d, \ndatapoint cluster one datapoint two: %d, \ndatapoint cluster two datapoint one: %d", 37 | result1Cluster1, result2Cluster1, result1Cluster2) 38 | } 39 | if math.Abs(float64(result1Cluster1-result2Cluster1)) > math.Abs(float64(result1Cluster1-result2Cluster2)) { 40 | t.Errorf("\nThe second datapoint in cluster two is closer to the first data point in cluster one than the second data point in cluster one"+ 41 | "\nCluster one datapoint one: %d, \nCluster one datapoint two: %d, \nCluster two datapoint two: %d", 42 | result1Cluster1, result2Cluster1, result2Cluster2) 43 | } 44 | if math.Abs(float64(result1Cluster2-result2Cluster2)) > math.Abs(float64(result1Cluster1-result1Cluster2)) { 45 | t.Errorf("\nThe first datapoint in cluster one is closer to the first data point in cluster two than the second data point in cluster two"+ 46 | "\nCluster one datapoint one: %d, \nCluster two datapoint one: %d, \nCluster two datapoint two: %d", 47 | result1Cluster1, result1Cluster2, result2Cluster2) 48 | } 49 | 50 | t.Log("√ LSH Simple test complete") 51 | } 52 | 53 | func TestLSHStream(t *testing.T) { 54 | var seed int64 = 0 55 | var d, k, l int = 64, 6, 4 56 | data := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} 57 | var inDimensions, outDimentions int = 10, 2 58 | hash := hash.NewMurmur(1<<63 - 1) 59 | decoder := decoder.NewSpherical(d, k, l) 60 | projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) 61 | lsh := lsh.NewLSH(hash, decoder, projector) 62 | lsh.LSHHashStream(data, 1) 63 | t.Log("√ LSH Stream test complete") 64 | } 65 | 66 | func BenchmarkSimpleLSH(b *testing.B) { 67 | var seed int64 = 0 68 | var d, k, l int = 64, 6, 4 69 | data := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} 70 | var inDimensions, outDimentions int = 10, 2 71 | hash := hash.NewMurmur(1<<63 - 1) 72 | decoder := decoder.NewSpherical(d, k, l) 73 | projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) 74 | for i := 0; i < b.N; i++ { 75 | lsh := lsh.NewLSH(hash, decoder, projector) 76 | b.StopTimer() 77 | lsh.LSHHashSimple(data) 78 | b.StartTimer() 79 | } 80 | } 81 | 82 | func BenchmarkStreamLSH(b 
*testing.B) { 83 | var seed int64 = 0 84 | var d, k, l int = 64, 6, 4 85 | data := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} 86 | var inDimensions, outDimentions int = 10, 2 87 | hash := hash.NewMurmur(1<<63 - 1) 88 | decoder := decoder.NewSpherical(d, k, l) 89 | projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) 90 | for i := 0; i < b.N; i++ { 91 | lsh := lsh.NewLSH(hash, decoder, projector) 92 | b.StopTimer() 93 | lsh.LSHHashStream(data, 1) 94 | b.StartTimer() 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /tests/parser_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/parse" 5 | "io/ioutil" 6 | "testing" 7 | ) 8 | 9 | const ( 10 | dataPath = "./data/" 11 | dataFileName = "people.json" 12 | dataLabel = "people" 13 | ) 14 | 15 | func TestParser(t *testing.T) { 16 | parser := parse.NewParser() 17 | oldBytes, _ := ioutil.ReadFile(dataPath + dataFileName) 18 | oldJSON := parser.BytesToJSON(oldBytes) 19 | jsonFloats := parser.JSONToFloat64Matrix(dataLabel, oldJSON) 20 | newJSON := parser.Float64MatrixToJSON(dataLabel, jsonFloats) 21 | oldJSONData := oldJSON[dataLabel].([]interface{}) 22 | newJSONData := newJSON[dataLabel].([]interface{}) 23 | 24 | // Iterate over all the data and check consistency. 25 | for i, _ := range oldJSONData { 26 | oldJSONObject := oldJSONData[i].(map[string]interface{}) 27 | newJSONObject := newJSONData[i].(map[string]interface{}) 28 | for key, value := range oldJSONObject { 29 | newJSONValue, _ := parser.ConvertInterfaceToFloat64(newJSONObject[key]) 30 | oldJSONValue, _ := parser.ConvertInterfaceToFloat64(value) 31 | if oldJSONValue >= newJSONValue-float64(0.01) || oldJSONValue <= newJSONValue+float64(0.01) { 32 | t.Log("√ Matched key and normalized precision") 33 | } else { 34 | t.Log("Mismatched key or normalized precision off!") 35 | } 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/projector_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "fmt" 5 | "github.com/wilseypa/rphash-golang/projector" 6 | "math/rand" 7 | "testing" 8 | "time" 9 | ) 10 | 11 | func TestDBFriendly(t *testing.T) { 12 | //There is probably a better way to test this than hard coding. 
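// The expected values below are simply the output of Project for seed 0 and the 10 -> 2
// reduction used in this test; they are not meaningful on their own and change with the seed.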
13 | data := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} 14 | expectedResult := []float64{1.224744871391589, 13.472193585307478} 15 | var inDimensions, outDimentions int = 10, 2 16 | //Use a uniform seed for testing 17 | var seed int64 = 0 18 | RP := projector.NewDBFriendly(inDimensions, outDimentions, seed) 19 | result := RP.Project(data) 20 | if len(result) != len(expectedResult) { 21 | t.Error("The result and expected result are not the same length.") 22 | } 23 | for i := 0; i < len(result); i++ { 24 | if result[i] != expectedResult[i] { 25 | t.Error(fmt.Sprintf("The result at index %d: %f did not match the expected result: %f", i, result[i], expectedResult[i])) 26 | } 27 | } 28 | t.Log("√ DBFriendly Projector test complete") 29 | } 30 | 31 | func BenchmarkDBFriendlyProjection(b *testing.B) { 32 | var inDimensions, outDimentions int = 10, 2 33 | for i := 0; i < b.N; i++ { 34 | b.StopTimer() 35 | var randomGen = rand.New(rand.NewSource(int64(time.Now().Nanosecond()))) 36 | data := make([]float64, inDimensions) 37 | for i := 0; i < inDimensions; i++ { 38 | data[i] = randomGen.Float64() 39 | } 40 | b.StartTimer() 41 | var seed int64 = int64(time.Now().Nanosecond()) 42 | RP := projector.NewDBFriendly(inDimensions, outDimentions, seed) 43 | RP.Project(data) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /tests/simple_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "fmt" 5 | "github.com/wilseypa/rphash-golang/clusterer" 6 | "github.com/wilseypa/rphash-golang/reader" 7 | "github.com/wilseypa/rphash-golang/simple" 8 | "github.com/wilseypa/rphash-golang/utils" 9 | "math/rand" 10 | "testing" 11 | "time" 12 | ) 13 | 14 | func TestKMeansOnNumImagesData(t *testing.T) { 15 | numClusters := 10 16 | lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt"); 17 | if err != nil { 18 | panic(err); 19 | } 20 | data := utils.StringArrayToFloatArray(lines); 21 | 22 | start := time.Now(); 23 | clusterer := clusterer.NewKMeansSimple(numClusters, data) 24 | clusterer.Run() 25 | 26 | result := clusterer.GetCentroids() 27 | time := time.Since(start); 28 | 29 | 30 | totalSqDist := float64(0) 31 | for _, vector := range data { 32 | _, dist := utils.FindNearestDistance(vector, result) 33 | totalSqDist += dist * dist 34 | } 35 | 36 | t.Log("Total Square Distance: ", totalSqDist); 37 | t.Log("Average Square Distance: ", totalSqDist/float64(len(data))); 38 | t.Log("Runtime(seconds): ", time.Seconds()); 39 | 40 | if len(result) != numClusters { 41 | t.Errorf("RPHash Stream did not present the correct number of clusters.") 42 | } 43 | } 44 | 45 | func TestRPHashSimpleOnNumImagesData(t *testing.T) { 46 | numClusters := 10 47 | lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt"); 48 | if err != nil { 49 | panic(err); 50 | } 51 | data := utils.StringArrayToFloatArray(lines); 52 | 53 | start := time.Now(); 54 | RPHashObject := reader.NewSimpleArray(data, numClusters) 55 | simpleObject := simple.NewSimple(RPHashObject) 56 | simpleObject.Run() 57 | 58 | result := RPHashObject.GetCentroids() 59 | time := time.Since(start); 60 | 61 | 62 | totalSqDist := float64(0) 63 | for _, vector := range data { 64 | _, dist := utils.FindNearestDistance(vector, result) 65 | totalSqDist += dist * dist 66 | } 67 | 68 | t.Log("Total Square Distance: ", totalSqDist); 69 | t.Log("Average Square Distance: ", totalSqDist/float64(len(data))); 70 | t.Log("Runtime(seconds): ", 
time.Seconds()); 71 | 72 | if len(result) != numClusters { 73 | t.Errorf("RPHash Stream did not present the correct number of clusters.") 74 | } 75 | } 76 | 77 | func TestSimpleLeastDistanceVsKmeans(t *testing.T) { 78 | 79 | //Create fake data 80 | var numClusters = 5 81 | var numRows = 400 82 | var dimensionality = 1000 83 | data := make([][]float64, numRows, numRows) 84 | for i := 0; i < numRows; i++ { 85 | row := make([]float64, dimensionality, dimensionality) 86 | for j := 0; j < dimensionality; j++ { 87 | row[j] = rand.Float64() 88 | } 89 | data[i] = row 90 | } 91 | 92 | start := time.Now() 93 | //Test RPHash with Fake Object 94 | RPHashObject := reader.NewSimpleArray(data, numClusters) 95 | simpleObject := simple.NewSimple(RPHashObject) 96 | simpleObject.Run() 97 | 98 | if len(RPHashObject.GetCentroids()) != numClusters { 99 | t.Errorf("Requested %v centriods. But RPHashSimple returned %v.", numClusters, len(RPHashObject.GetCentroids())) 100 | } 101 | rpHashResult := RPHashObject.GetCentroids() 102 | fmt.Println("RPHash: ", time.Since(start)) 103 | //Find clusters using KMeans 104 | start = time.Now() 105 | clusterer := clusterer.NewKMeansSimple(numClusters, data) 106 | clusterer.Run() 107 | 108 | kMeansResult := clusterer.GetCentroids() 109 | fmt.Println("kMeans: ", time.Since(start)) 110 | 111 | var kMeansAssignment = 0 112 | var rpHashAssignment = 0 113 | var matchingAssignmentCount = 0 114 | var kMeansTotalDist = float64(0) 115 | var rpHashTotalDist = float64(0) 116 | for _, vector := range data { 117 | rpHashAssignment, _ = utils.FindNearestDistance(vector, rpHashResult) 118 | kMeansAssignment, _ = utils.FindNearestDistance(vector, kMeansResult) 119 | kMeansTotalDist += utils.Distance(vector, kMeansResult[kMeansAssignment]) 120 | rpHashTotalDist += utils.Distance(vector, rpHashResult[rpHashAssignment]) 121 | //t.Log(rpHashAssignments[i], kMeansAssignments[i]); 122 | if rpHashAssignment == kMeansAssignment { 123 | matchingAssignmentCount += 1 124 | } 125 | } 126 | t.Log("RPHash:", rpHashTotalDist) 127 | t.Log("KMeans:", kMeansTotalDist) 128 | t.Log("Ratio: ", kMeansTotalDist/rpHashTotalDist) 129 | } 130 | 131 | func BenchmarkKMeans(b *testing.B) { 132 | var numClusters = 5 133 | var numRows = 4000 134 | var dimensionality = 1000 135 | data := make([][]float64, numRows, numRows) 136 | for i := 0; i < numRows; i++ { 137 | row := make([]float64, dimensionality, dimensionality) 138 | for j := 0; j < dimensionality; j++ { 139 | row[j] = rand.Float64() 140 | } 141 | data[i] = row 142 | } 143 | for i := 0; i < b.N; i++ { 144 | clusterer := clusterer.NewKMeansSimple(numClusters, data) 145 | clusterer.Run() 146 | 147 | clusterer.GetCentroids() 148 | } 149 | } 150 | 151 | func BenchmarkSimple(b *testing.B) { 152 | var numClusters = 5 153 | var numRows = 4000 154 | var dimensionality = 1000 155 | data := make([][]float64, numRows, numRows) 156 | for i := 0; i < numRows; i++ { 157 | row := make([]float64, dimensionality, dimensionality) 158 | for j := 0; j < dimensionality; j++ { 159 | row[j] = rand.Float64() 160 | } 161 | data[i] = row 162 | } 163 | for i := 0; i < b.N; i++ { 164 | RPHashObject := reader.NewSimpleArray(data, numClusters) 165 | simpleObject := simple.NewSimple(RPHashObject) 166 | simpleObject.Run() 167 | RPHashObject.GetCentroids() 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /tests/simplearray_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 
4 | "github.com/stretchr/testify/assert" 5 | "github.com/wilseypa/rphash-golang/reader" 6 | "github.com/wilseypa/rphash-golang/types" 7 | "github.com/wilseypa/rphash-golang/utils" 8 | "math" 9 | "math/rand" 10 | "reflect" 11 | "testing" 12 | ) 13 | 14 | func TestSimpleArray(t *testing.T) { 15 | var k = 4 16 | var dimensionality = 100 17 | var numBlurs = 2 18 | var numProjections = 2 19 | var numDataPoints = 8 20 | var origVariance float64 = 1 21 | var testDecoderType types.Decoder 22 | var newNumProjections = 4 23 | var newHashModulus int64 = rand.Int63() 24 | 25 | newVarianceSample, newCentroidList := make([][]float64, numDataPoints), make([][]float64, numDataPoints) 26 | for i := 0; i < numDataPoints; i++ { 27 | newVarianceSample[i], newCentroidList[i] = make([]float64, dimensionality), make([]float64, dimensionality) 28 | for j := 0; j < dimensionality; j++ { 29 | newVarianceSample[i][j], newCentroidList[i][j] = float64(i), float64(i) 30 | } 31 | } 32 | 33 | newCentroid := make([]float64, dimensionality) 34 | for i := 0; i < dimensionality; i++ { 35 | newCentroid[i] = float64(i) 36 | } 37 | 38 | newTopId := make([]int64, dimensionality) 39 | for i := 0; i < dimensionality; i++ { 40 | newTopId[i] = int64(i) 41 | } 42 | 43 | RPHashObject := reader.NewSimpleArray(newCentroidList, k) 44 | 45 | // K. 46 | assert.Equal(t, k, RPHashObject.GetK(), "Expected K equal to Stream K.") 47 | 48 | // Dimensionality. 49 | assert.Equal(t, dimensionality, RPHashObject.GetDimensions(), "Expected dimensionality equal to Stream dimensionality.") 50 | 51 | // Iterator. 52 | assert.NotNil(t, RPHashObject.GetVectorIterator(), "Vector iterator should be initially not be nil.") 53 | 54 | // Blurs. 55 | assert.Equal(t, numBlurs, RPHashObject.GetNumberOfBlurs(), "Number of blurs should be initially 2.") 56 | 57 | // Variance. 58 | assert.Equal(t, origVariance, RPHashObject.GetVariance(), "Variance should be equal to the new variance value.") 59 | RPHashObject.SetVariance(newVarianceSample) 60 | newVariance := utils.VarianceSample(newVarianceSample, 0.01) 61 | assert.Equal(t, newVariance, RPHashObject.GetVariance(), "Variance should be equal to the new variance value.") 62 | 63 | // Decoders. 64 | origDecoderType := RPHashObject.GetDecoderType() 65 | assert.NotNil(t, origDecoderType) 66 | assert.Equal(t, reflect.ValueOf(&testDecoderType).Elem().Type(), reflect.ValueOf(&origDecoderType).Elem().Type(), "Decoder should implement the Decoder interface.") 67 | RPHashObject.SetDecoderType(testDecoderType) 68 | assert.Equal(t, testDecoderType, RPHashObject.GetDecoderType(), "Decoder should be set to a new Decoder.") 69 | 70 | // Projections. 71 | assert.Equal(t, numProjections, RPHashObject.GetNumberOfProjections(), "Number of projections should be initially 2.") 72 | RPHashObject.SetNumberOfProjections(newNumProjections) 73 | assert.Equal(t, newNumProjections, RPHashObject.GetNumberOfProjections(), "Number of projections should be equal to the new number of projections.") 74 | 75 | // Hash modulus. 76 | assert.Equal(t, int64(math.MaxInt64), RPHashObject.GetHashModulus(), "Hash modulus should be equal to the maximum 32 bit integer value.") 77 | RPHashObject.SetHashModulus(newHashModulus) 78 | assert.Equal(t, newHashModulus, RPHashObject.GetHashModulus(), "Hash modulus should be equal to the new hash modulus.") 79 | 80 | // Centroids. 
81 | assert.Empty(t, RPHashObject.GetCentroids(), "Centroids should initially be empty.") 82 | RPHashObject.AddCentroid(newCentroid) 83 | assert.Equal(t, newCentroid, RPHashObject.GetCentroids()[0], "First centroid should be the new centroid.") 84 | RPHashObject.SetCentroids(newCentroidList) 85 | assert.Equal(t, newCentroidList, RPHashObject.GetCentroids(), "Centroids should be equal to the new centroid list.") 86 | 87 | // Top IDs 88 | assert.Empty(t, RPHashObject.GetPreviousTopID(), "Previous top ID should initially be empty.") 89 | RPHashObject.SetPreviousTopID(newTopId) 90 | assert.Equal(t, newTopId, RPHashObject.GetPreviousTopID(), "Previous top ID should be equal to the new top centroid.") 91 | } 92 | -------------------------------------------------------------------------------- /tests/stream_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/clusterer" 5 | "github.com/wilseypa/rphash-golang/reader" 6 | "github.com/wilseypa/rphash-golang/stream" 7 | "github.com/wilseypa/rphash-golang/utils" 8 | "testing" 9 | "time" 10 | ) 11 | var filePath = "data/fake_data_500_1000.txt" 12 | var numClusters = 10 13 | var dimensionality = 500 14 | var numDataPoints = float64(1000) 15 | func TestStreamingKMeansOnRandomData(t *testing.T) { 16 | filereader := utils.NewDataFileReader(filePath); 17 | 18 | start := time.Now(); 19 | kmeansStream := clusterer.NewKMeansStream(numClusters, 10, dimensionality) 20 | elapsedtime := time.Since(start); 21 | for { 22 | vector := filereader.Next() 23 | if vector == nil { 24 | break 25 | } 26 | start := time.Now(); 27 | kmeansStream.AddDataPoint(vector); 28 | elapsedtime = elapsedtime + time.Since(start); 29 | } 30 | start = time.Now(); 31 | result := kmeansStream.GetCentroids() 32 | elapsedtime = elapsedtime + time.Since(start); 33 | totalSqDist := float64(0) 34 | filereader = utils.NewDataFileReader(filePath); 35 | for { 36 | vector := filereader.Next() 37 | if vector == nil { 38 | break 39 | } 40 | _, dist := utils.FindNearestDistance(vector, result) 41 | totalSqDist += dist * dist 42 | } 43 | 44 | t.Log("Total Square Distance: ", totalSqDist); 45 | t.Log("Average Square Distance: ", totalSqDist/numDataPoints); 46 | t.Log("Runtime(seconds): ", elapsedtime.Seconds()); 47 | 48 | if len(result) != numClusters { 49 | t.Errorf("RPHash Stream did not present the correct number of clusters.") 50 | } 51 | } 52 | 53 | func TestStreamingRPHashOnRandomData(t *testing.T) { 54 | filereader := utils.NewDataFileReader(filePath); 55 | 56 | start := time.Now(); 57 | rphashObject := reader.NewStreamObject(dimensionality, numClusters) 58 | rphashStream := stream.NewStream(rphashObject) 59 | elapsedtime := time.Since(start); 60 | for { 61 | vector := filereader.Next() 62 | if vector == nil { 63 | break 64 | } 65 | start := time.Now(); 66 | rphashStream.AppendVector(vector); 67 | elapsedtime = elapsedtime + time.Since(start); 68 | } 69 | start = time.Now(); 70 | result := rphashStream.GetCentroids() 71 | elapsedtime = elapsedtime + time.Since(start); 72 | totalSqDist := float64(0) 73 | filereader = utils.NewDataFileReader(filePath); 74 | for { 75 | vector := filereader.Next() 76 | if vector == nil { 77 | break 78 | } 79 | _, dist := utils.FindNearestDistance(vector, result) 80 | totalSqDist += dist * dist 81 | } 82 | 83 | t.Log("Total Square Distance: ", totalSqDist); 84 | t.Log("Average Square Distance: ", totalSqDist/numDataPoints); 85 | t.Log("Runtime(seconds): ", 
elapsedtime.Seconds()); 86 | 87 | if len(result) != numClusters { 88 | t.Errorf("RPHash Stream did not present the correct number of clusters.") 89 | } 90 | } 91 | 92 | 93 | func TestStreamingKMeansOnNumImagesData(t *testing.T) { 94 | numClusters := 10 95 | lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt"); 96 | if err != nil { 97 | panic(err); 98 | } 99 | dimensionality := len(lines[0]); 100 | data := utils.StringArrayToFloatArray(lines); 101 | 102 | start := time.Now(); 103 | kmeansStream := clusterer.NewKMeansStream(numClusters, 10, dimensionality) 104 | for _, vector := range data { 105 | kmeansStream.AddDataPoint(vector); 106 | } 107 | 108 | result := kmeansStream.GetCentroids() 109 | time := time.Since(start); 110 | totalSqDist := float64(0) 111 | for _, vector := range data { 112 | _, dist := utils.FindNearestDistance(vector, result) 113 | totalSqDist += dist * dist 114 | } 115 | 116 | t.Log("Total Square Distance: ", totalSqDist); 117 | t.Log("Average Square Distance: ", totalSqDist/float64(len(data))); 118 | t.Log("Runtime(seconds): ", time.Seconds()); 119 | 120 | if len(result) != numClusters { 121 | t.Errorf("RPHash Stream did not present the correct number of clusters.") 122 | } 123 | } 124 | 125 | func TestStreamingRPHashOnNumImagesData(t *testing.T) { 126 | numClusters := 10 127 | lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt"); 128 | if err != nil { 129 | panic(err); 130 | } 131 | dimensionality := len(lines[0]); 132 | data := utils.StringArrayToFloatArray(lines); 133 | 134 | start := time.Now(); 135 | rphashObject := reader.NewStreamObject(dimensionality, numClusters) 136 | rphashStream := stream.NewStream(rphashObject) 137 | for _, vector := range data { 138 | rphashStream.AppendVector(vector) 139 | } 140 | rpHashResult := rphashStream.GetCentroids() 141 | time := time.Since(start); 142 | totalSqDist := float64(0) 143 | for _, vector := range data { 144 | _, dist := utils.FindNearestDistance(vector, rpHashResult) 145 | totalSqDist += dist * dist 146 | } 147 | 148 | t.Log("Total Square Distance: ", totalSqDist); 149 | t.Log("Average Square Distance: ", totalSqDist/float64(len(data))); 150 | t.Log("Runtime(seconds): ", time.Seconds()); 151 | 152 | if len(rpHashResult) != numClusters { 153 | t.Errorf("RPHash Stream did not present the correct number of clusters.") 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /tests/streamobject_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "github.com/stretchr/testify/assert" 5 | "github.com/wilseypa/rphash-golang/reader" 6 | "github.com/wilseypa/rphash-golang/types" 7 | "github.com/wilseypa/rphash-golang/utils" 8 | "math/rand" 9 | "reflect" 10 | "testing" 11 | ) 12 | 13 | func TestStreamObject(t *testing.T) { 14 | var k = 4 15 | var dimensionality = 100 16 | var numBlurs = 2 17 | var numProjections = 2 18 | var numDataPoints = 8 19 | var origVariance float64 = 1 20 | var testDecoderType types.Decoder 21 | var newNumProjections = 4 22 | var newHashModulus int64 = rand.Int63() 23 | //var newRandomSeed int64 = rand.Int63() 24 | 25 | newVarianceSample, newCentroidList := make([][]float64, numDataPoints), make([][]float64, numDataPoints) 26 | for i := 0; i < numDataPoints; i++ { 27 | newVarianceSample[i], newCentroidList[i] = make([]float64, dimensionality), make([]float64, dimensionality) 28 | for j := 0; j < dimensionality; j++ { 29 | newVarianceSample[i][j], 
newCentroidList[i][j] = float64(i), float64(i) 30 | } 31 | } 32 | 33 | newCentroid := make([]float64, dimensionality) 34 | for i := 0; i < dimensionality; i++ { 35 | newCentroid[i] = float64(i) 36 | } 37 | 38 | newTopId := make([]int64, dimensionality) 39 | for i := 0; i < dimensionality; i++ { 40 | newTopId[i] = int64(i) 41 | } 42 | 43 | RPHashObject := reader.NewStreamObject(dimensionality, k) 44 | 45 | // K. 46 | assert.Equal(t, k, RPHashObject.GetK(), "Expected K equal to Stream K.") 47 | 48 | // Dimensionality. 49 | assert.Equal(t, dimensionality, RPHashObject.GetDimensions(), "Expected dimensionality equal to Stream dimensionality.") 50 | 51 | // Iterator. 52 | assert.Equal(t, RPHashObject.GetVectorIterator().HasNext(), false, "Vector iterator should be initially empty.") 53 | 54 | // Blurs. 55 | assert.Equal(t, numBlurs, RPHashObject.GetNumberOfBlurs(), "Number of blurs should be initially 2.") 56 | 57 | // Variance. 58 | assert.Equal(t, origVariance, RPHashObject.GetVariance(), "Variance should be equal to the new variance value.") 59 | RPHashObject.SetVariance(newVarianceSample) 60 | newVariance := utils.VarianceSample(newVarianceSample, 0.01) 61 | assert.Equal(t, newVariance, RPHashObject.GetVariance(), "Variance should be equal to the new variance value.") 62 | 63 | // Decoders. 64 | origDecoderType := RPHashObject.GetDecoderType() 65 | assert.NotNil(t, origDecoderType) 66 | assert.Equal(t, reflect.ValueOf(&testDecoderType).Elem().Type(), reflect.ValueOf(&origDecoderType).Elem().Type(), "Decoder should implement the Decoder interface.") 67 | RPHashObject.SetDecoderType(testDecoderType) 68 | assert.Equal(t, testDecoderType, RPHashObject.GetDecoderType(), "Decoder should be set to a new Decoder.") 69 | 70 | // Projections. 71 | assert.Equal(t, numProjections, RPHashObject.GetNumberOfProjections(), "Number of projections should be initially 2.") 72 | RPHashObject.SetNumberOfProjections(newNumProjections) 73 | assert.Equal(t, newNumProjections, RPHashObject.GetNumberOfProjections(), "Number of projections should be equal to the new number of projections.") 74 | 75 | // Hash modulus. 76 | RPHashObject.SetHashModulus(newHashModulus) 77 | assert.Equal(t, newHashModulus, RPHashObject.GetHashModulus(), "Hash modulus should be equal to the new hash modulus.") 78 | 79 | // Centroids. 
80 | assert.Empty(t, RPHashObject.GetCentroids(), "Centroids should initially be empty.") 81 | RPHashObject.AddCentroid(newCentroid) 82 | assert.Equal(t, newCentroid, RPHashObject.GetCentroids()[0], "First centroid should be the new centroid.") 83 | RPHashObject.SetCentroids(newCentroidList) 84 | assert.Equal(t, newCentroidList, RPHashObject.GetCentroids(), "Centroids should be equal to the new centroid list.") 85 | 86 | // Top IDs 87 | assert.Empty(t, RPHashObject.GetPreviousTopID(), "Previous top ID should initially be empty.") 88 | RPHashObject.SetPreviousTopID(newTopId) 89 | assert.Equal(t, newTopId, RPHashObject.GetPreviousTopID(), "Previous top ID should be equal to the new top centroid.") 90 | } 91 | -------------------------------------------------------------------------------- /tests/tests.go: -------------------------------------------------------------------------------- 1 | package tests; 2 | -------------------------------------------------------------------------------- /tests/util_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/itemset" 5 | "github.com/wilseypa/rphash-golang/types" 6 | "github.com/wilseypa/rphash-golang/utils" 7 | "testing" 8 | ) 9 | 10 | func TestNormalizeVector(t *testing.T) { 11 | input := make([]float64, 2, 2) 12 | input[0] = 3 13 | input[1] = 4 14 | expectedResult := make([]float64, 2, 2) 15 | expectedResult[0] = 3.0 / 5.0 16 | expectedResult[1] = 4.0 / 5.0 17 | result := utils.Normalize(input) 18 | if len(result) != len(expectedResult) { 19 | t.Errorf("Result vector has dimensionality %v, Expected Result has dimensionality %v.", len(result), len(expectedResult)) 20 | } 21 | for i := 0; i < len(result); i++ { 22 | if result[i] != expectedResult[i] { 23 | t.Errorf("Dimension %v, equals %v in the result and %v in the expectedResult", i, result[i], expectedResult[i]) 24 | } 25 | } 26 | } 27 | func TestPriorityQueueEnquque(t *testing.T) { 28 | input := make([]int64, 5, 5) 29 | input[0] = 3 30 | input[1] = 4 31 | input[2] = 1 32 | input[3] = 20 33 | input[4] = 13 34 | expectedResult := make([]int64, 5, 5) 35 | expectedResult[0] = 1 36 | expectedResult[1] = 3 37 | expectedResult[2] = 4 38 | expectedResult[3] = 13 39 | expectedResult[4] = 20 40 | testQueue := utils.NewInt64PriorityQueue() 41 | for _, value := range input { 42 | testQueue.Enqueue(value, value) 43 | } 44 | if testQueue.Size() != len(expectedResult) { 45 | t.Errorf("priorityQueue is not the correct size expected length: %v, actual length: %v", len(input), testQueue.Size()) 46 | } 47 | for i, expectedValue := range expectedResult { 48 | actualValue := testQueue.Poll() 49 | if actualValue != expectedValue { 50 | t.Errorf("priorityQueue did not output the correct value at index: %v, expected: %v, actual %v.", i, expectedValue, actualValue) 51 | } 52 | } 53 | } 54 | 55 | func TestPriorityQueueRemove(t *testing.T) { 56 | input := make([]int64, 5, 5) 57 | input[0] = 3 58 | input[1] = 4 59 | input[2] = 1 60 | input[3] = 20 61 | input[4] = 13 62 | expectedResult := make([]int64, 3, 3) 63 | expectedResult[0] = 3 64 | expectedResult[1] = 4 65 | expectedResult[2] = 20 66 | testQueue := utils.NewInt64PriorityQueue() 67 | for _, value := range input { 68 | testQueue.Enqueue(value, value) 69 | } 70 | testQueue.Remove(1) 71 | testQueue.Remove(13) 72 | if testQueue.Size() != len(expectedResult) { 73 | t.Errorf("priorityQueue is not the correct size expected length: %v, actual length: %v", 
len(input), testQueue.Size()) 74 | } 75 | for i, expectedValue := range expectedResult { 76 | actualValue := testQueue.Poll() 77 | if actualValue != expectedValue { 78 | t.Errorf("priorityQueue did not output the correct value at index: %v, expected: %v, actual %v.", i, expectedValue, actualValue) 79 | } 80 | } 81 | } 82 | 83 | func TestCentriodQueue(t *testing.T) { 84 | testQueue := utils.NewCentroidPriorityQueue() 85 | var dimensionality = 20 86 | fakeData := make([]float64, dimensionality, dimensionality) 87 | input := make([]types.Centroid, 5, 5) 88 | for i := range input { 89 | nextCentriod := itemset.NewCentroidSimple(dimensionality, int64(i)) 90 | input[i] = nextCentriod 91 | for j := 0; j <= i; j++ { 92 | input[i].UpdateVector(fakeData) 93 | } 94 | testQueue.Enqueue(input[i]) 95 | } 96 | //Remove the third centriod from the testQueue and the input 97 | testQueue.Remove(3) 98 | input = append(input[:3], input[4:]...) 99 | 100 | //Add another centoid to both the input and the testQueue 101 | nextCentriod := itemset.NewCentroidSimple(dimensionality, int64(9)) 102 | input = append([]types.Centroid{nextCentriod}, input...) 103 | 104 | testQueue.Enqueue(nextCentriod) 105 | if testQueue.Size() != len(input) { 106 | t.Errorf("priorityQueue is not the correct size expected length: %v, actual length: %v", len(input), testQueue.Size()) 107 | } 108 | for i, expectedValue := range input { 109 | actualValue := testQueue.Poll() 110 | if actualValue.GetID() != expectedValue.GetID() { 111 | t.Errorf("priorityQueue did not output the correct value at index: %v, expected: %v, actual %v.", i, expectedValue.GetID(), actualValue.GetID()) 112 | } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /types/types.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | type Iterator interface { 8 | StoreLSHValues([]int64) 9 | Append(vector []float64) 10 | PeakLSH() int64 11 | Next() (value []float64) 12 | HasNext() (ok bool) 13 | Reset() 14 | Size() (count int) 15 | } 16 | 17 | type PQueue interface { 18 | IsEmpty() bool 19 | Poll(i interface{}) 20 | Push(i interface{}) 21 | Pop() interface{} 22 | Length() int 23 | Less(i, j int) bool 24 | Swap(i, j int) 25 | String() string 26 | } 27 | 28 | type Decoder interface { 29 | SetVariance(parameterObject float64) 30 | GetDimensionality() int 31 | Decode(f []float64) []int64 32 | GetErrorRadius() float64 33 | GetDistance() float64 34 | GetVariance() float64 35 | } 36 | 37 | type Projector interface { 38 | Project(v []float64) []float64 39 | } 40 | 41 | type HashSet interface { 42 | Add(i int64) bool 43 | Get(i int64) bool 44 | AddAll(i HashSet) 45 | GetS() map[int64]bool 46 | Remove(i int64) 47 | Length() int 48 | Contains(i int64) bool 49 | } 50 | 51 | type Hash interface { 52 | Hash(k []int64) int64 53 | } 54 | 55 | type Centroid interface { 56 | Centroid() []float64 57 | UpdateVector(rp []float64) 58 | GetCount() int64 59 | GetID() int64 60 | GetIDs() HashSet 61 | AddID(h int64) 62 | } 63 | 64 | type CountItemSet interface { 65 | Add(c int64) 66 | GetCounts() []int64 67 | GetTop() []int64 68 | GetCount() int64 69 | } 70 | 71 | type CentroidItemSet interface { 72 | Add(c Centroid) 73 | GetCounts() []int64 74 | GetTop() []Centroid 75 | GetCount() int64 76 | } 77 | 78 | type LSH interface { 79 | LSHHashSimple(r []float64) int64 80 | LSHHashStream(r []float64, a int) []int64 81 | UpdateDecoderVariance(vari float64) 82 | } 83 | 
84 | type StatTest interface { 85 | UpdateVarianceSample(vec []float64) float64 86 | } 87 | 88 | type RPHashObject interface { 89 | GetK() int 90 | NumDataPoints() int 91 | GetDimensions() int 92 | GetRandomSeed() int64 93 | GetNumberOfBlurs() int 94 | AppendVector(vector []float64) 95 | GetVectorIterator() Iterator 96 | GetCentroids() [][]float64 97 | GetPreviousTopID() []int64 98 | SetPreviousTopID(i []int64) 99 | AddCentroid(v []float64) 100 | SetCentroids(l [][]float64) 101 | GetNumberOfProjections() int 102 | SetNumberOfProjections(probes int) 103 | SetRandomSeed(parseLong int64) 104 | GetHashModulus() int64 105 | SetHashModulus(parseLong int64) 106 | SetDecoderType(dec Decoder) 107 | GetDecoderType() Decoder 108 | SetVariance(data [][]float64) 109 | } 110 | 111 | type Clusterer interface { 112 | GetCentroids() [][]float64 113 | } 114 | 115 | type StreamClusterer interface { 116 | AddVectorOnlineStep(x []float64, wg *sync.WaitGroup) Centroid 117 | GetCentroidsOfflineStep() [][]float64 118 | } 119 | -------------------------------------------------------------------------------- /utils/bit-shift.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // Java like zero extension shift see http://docs.oracle.com/javase/specs/jls/se7/html/jls-15.html#jls-15.19 >>> operator 4 | func RightShiftZeroExtension(leftOperand int64, rightOperand int64) int64 { 5 | if leftOperand > -1 { 6 | return leftOperand >> uint64(rightOperand) 7 | } else { 8 | return (leftOperand >> uint64(rightOperand)) + (2 << uint64(^rightOperand)) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /utils/centriod-priority-queue.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "errors" 5 | "github.com/wilseypa/rphash-golang/types" 6 | ) 7 | 8 | type CentriodPriorityQueue struct { 9 | heap []types.Centroid 10 | heapSize int 11 | } 12 | 13 | func NewCentroidPriorityQueue() *CentriodPriorityQueue { 14 | heap := make([]types.Centroid, 16, 16) 15 | return &CentriodPriorityQueue{ 16 | heapSize: 0, 17 | heap: heap, 18 | } 19 | } 20 | 21 | func (this *CentriodPriorityQueue) Dequeue() (types.Centroid, error) { 22 | if this.heapSize < 1 { 23 | err := errors.New("Queue contains no centroids") 24 | return nil, err 25 | } 26 | var result = this.heap[1] 27 | this.heap[1] = this.heap[this.heapSize] 28 | this.percolateDown(1) 29 | this.heapSize-- 30 | return result, nil 31 | } 32 | 33 | func (this *CentriodPriorityQueue) Remove(idToRemove int64) bool { 34 | for i := 1; i < this.heapSize; i++ { 35 | if this.heap[i].GetID() == idToRemove { 36 | this.heap[i] = this.heap[this.heapSize] 37 | this.heap[this.heapSize] = nil 38 | this.heapSize-- 39 | this.percolateDown(i) 40 | this.percolateUp(i) 41 | return true 42 | } 43 | } 44 | return false 45 | } 46 | 47 | func (this *CentriodPriorityQueue) IsEmpty() bool { 48 | return this.heapSize == 0 49 | } 50 | 51 | func (this *CentriodPriorityQueue) Poll() types.Centroid { 52 | var result, error = this.Dequeue() 53 | if error != nil { 54 | return nil 55 | } 56 | return result 57 | } 58 | 59 | func (this *CentriodPriorityQueue) Enqueue(newCentriod types.Centroid) { 60 | this.heapSize++ 61 | if this.heapSize == len(this.heap) { 62 | var newHeap = make([]types.Centroid, len(this.heap)*2) 63 | copy(newHeap, this.heap) 64 | this.heap = newHeap 65 | } 66 | this.heap[this.heapSize] = newCentriod 67 | this.percolateUp(this.heapSize) 68 
| } 69 | 70 | func (this *CentriodPriorityQueue) Size() int { 71 | return this.heapSize 72 | } 73 | 74 | //The children of index X are (2*X) and (2*X) + 1 75 | func (this *CentriodPriorityQueue) percolateUp(lowerIndex int) { 76 | if lowerIndex < 2 { 77 | return 78 | } 79 | var upperIndex = lowerIndex / 2 80 | if this.compareAtPositions(lowerIndex, upperIndex) < 0 { 81 | this.swap(lowerIndex, upperIndex) 82 | this.percolateUp(upperIndex) 83 | } 84 | //Else we have fixed the priorityQueue; 85 | //Else we have fixed the priorityQueue; 86 | } 87 | 88 | func (this *CentriodPriorityQueue) percolateDown(upperIndex int) { 89 | var lowerIndex = 2 * upperIndex 90 | if lowerIndex > this.heapSize { 91 | return // If this node has no children we are done. 92 | } 93 | if this.compareAtPositions(lowerIndex, upperIndex) < 0 { 94 | this.swap(lowerIndex, upperIndex) 95 | this.percolateDown(lowerIndex) 96 | this.percolateDown(1) 97 | } else if lowerIndex+1 <= this.heapSize && this.compareAtPositions(lowerIndex+1, upperIndex) < 0 { 98 | this.swap(lowerIndex+1, upperIndex) 99 | this.percolateDown(lowerIndex + 1) 100 | } 101 | //Else we have fixed the priorityQueue; 102 | } 103 | 104 | func (this *CentriodPriorityQueue) swap(index1 int, index2 int) { 105 | var temp = this.heap[index1] 106 | this.heap[index1] = this.heap[index2] 107 | this.heap[index2] = temp 108 | } 109 | 110 | func (this *CentriodPriorityQueue) compare(centroid1 types.Centroid, centroid2 types.Centroid) int { 111 | count1 := centroid1.GetCount() 112 | count2 := centroid2.GetCount() 113 | if count1 > count2 { 114 | return 1 115 | } else if count1 < count2 { 116 | return -1 117 | } 118 | return 0 119 | } 120 | 121 | func (this *CentriodPriorityQueue) compareAtPositions(index1 int, index2 int) int { 122 | return this.compare(this.heap[index1], this.heap[index2]) 123 | } 124 | -------------------------------------------------------------------------------- /utils/file-reader.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/csv" 7 | "github.com/wilseypa/rphash-golang/parse" 8 | "io" 9 | "math" 10 | "os" 11 | "strconv" 12 | "strings" 13 | ) 14 | type DataFileReader struct { 15 | reader *bufio.Reader 16 | file *os.File 17 | buffer *bytes.Buffer 18 | hasNext bool 19 | part []byte 20 | prefix bool 21 | err error 22 | } 23 | 24 | 25 | func NewDataFileReader(path string) *DataFileReader { 26 | var ( 27 | file *os.File 28 | err error 29 | ) 30 | if file, err = os.Open(path); err != nil { 31 | panic(err) 32 | } 33 | 34 | reader := bufio.NewReader(file) 35 | buffer := bytes.NewBuffer(make([]byte, 0)) 36 | hasNext := true 37 | return &DataFileReader{ 38 | reader: reader, 39 | buffer: buffer, 40 | hasNext: hasNext, 41 | file: file, 42 | } 43 | } 44 | 45 | func (this *DataFileReader) HasNext() bool { 46 | return this.hasNext 47 | } 48 | 49 | func (this *DataFileReader) Next() []float64 { 50 | for { 51 | if this.part, this.prefix, this.err = this.reader.ReadLine(); this.err != nil { 52 | this.hasNext = false 53 | return nil 54 | } 55 | this.buffer.Write(this.part) 56 | if !this.prefix { 57 | line := strings.Fields(this.buffer.String()) 58 | this.buffer.Reset() 59 | return StringLineToFloatLine(line) 60 | } 61 | } 62 | } 63 | 64 | var ( 65 | fixedDecimalPoint = 18 66 | weightMax = math.Abs(parse.ToFixed(math.MaxFloat64, fixedDecimalPoint)) 67 | weightMin = float64(0) 68 | ) 69 | 70 | func NormalizeSlice(records [][]float64) [][]float64 { 71 | data := 
make([][]float64, len(records)) 72 | for i, record := range records { 73 | data[i] = make([]float64, len(record)) 74 | for j, entry := range record { 75 | data[i][j] = parse.Normalize(entry) 76 | } 77 | } 78 | return data 79 | } 80 | 81 | func ReadCSV(path string) [][]float64 { 82 | file, err := os.Open(path) 83 | if err != nil { 84 | panic(err) 85 | } 86 | 87 | r := csv.NewReader(file) 88 | r.FieldsPerRecord = -1 89 | 90 | lines, err := r.ReadAll() 91 | if err != nil { 92 | panic(err) 93 | } 94 | lines = lines[1:] 95 | return NormalizeSlice(StringArrayToFloatArray(lines)) 96 | } 97 | 98 | func ReadXLines(path string, x int) (lines [][]string, err error) { 99 | // Read a whole file into the memory and store it as array of lines 100 | var ( 101 | file *os.File 102 | part []byte 103 | prefix bool 104 | ) 105 | if file, err = os.Open(path); err != nil { 106 | return 107 | } 108 | defer file.Close() 109 | 110 | reader := bufio.NewReader(file) 111 | buffer := bytes.NewBuffer(make([]byte, 0)) 112 | for i := 0; i < x; i++ { 113 | if part, prefix, err = reader.ReadLine(); err != nil { 114 | break 115 | } 116 | buffer.Write(part) 117 | if !prefix { 118 | line := strings.Fields(buffer.String()) 119 | lines = append(lines, line) 120 | buffer.Reset() 121 | } 122 | } 123 | if err == io.EOF { 124 | err = nil 125 | } 126 | return 127 | } 128 | 129 | func StringArrayToFloatArray(lines [][]string) (result [][]float64) { 130 | result = make([][]float64, len(lines), len(lines)) 131 | for i, line := range lines { 132 | result[i] = make([]float64, len(lines[i]), len(lines[i])) 133 | for j, toFloat := range line { 134 | float, err := strconv.ParseFloat(toFloat, 64) 135 | if err != nil { 136 | continue 137 | } 138 | result[i][j] = float 139 | } 140 | } 141 | return result 142 | } 143 | 144 | func StringLineToFloatLine(line []string) (result []float64) { 145 | result = make([]float64, len(line), len(line)) 146 | for j, toFloat := range line { 147 | float, err := strconv.ParseFloat(toFloat, 64) 148 | if err != nil { 149 | panic(err) 150 | } 151 | result[j] = float 152 | } 153 | return result; 154 | } 155 | 156 | func ReadLines(path string) (lines [][]string, err error) { 157 | // Read a whole file into the memory and store it as array of lines 158 | var ( 159 | file *os.File 160 | part []byte 161 | prefix bool 162 | ) 163 | if file, err = os.Open(path); err != nil { 164 | return 165 | } 166 | defer file.Close() 167 | 168 | reader := bufio.NewReader(file) 169 | buffer := bytes.NewBuffer(make([]byte, 0)) 170 | for { 171 | if part, prefix, err = reader.ReadLine(); err != nil { 172 | break 173 | } 174 | buffer.Write(part) 175 | if !prefix { 176 | line := strings.Fields(buffer.String()) 177 | lines = append(lines, line) 178 | buffer.Reset() 179 | } 180 | } 181 | if err == io.EOF { 182 | err = nil 183 | } 184 | return 185 | } 186 | -------------------------------------------------------------------------------- /utils/generate-data.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "os" 5 | "bufio" 6 | "math/rand" 7 | "strconv" 8 | ) 9 | 10 | func GenerateData(path string, dimensionality int, numRows int) () { 11 | file, err := os.Create(path) 12 | defer file.Close(); 13 | if err != nil { 14 | panic(err); 15 | } 16 | w := bufio.NewWriter(file) 17 | for i := 0; i < numRows; i++ { 18 | for j := 0; j < dimensionality; j++ { 19 | w.WriteString(strconv.FormatFloat(rand.Float64(), 'f', -1, 64)) 20 | w.WriteString(" ") 21 | } 22 | w.WriteString("\n") 23 | 
} 24 | w.Flush() 25 | } -------------------------------------------------------------------------------- /utils/hashsets.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "github.com/wilseypa/rphash-golang/types" 5 | ) 6 | 7 | type Hash64Set struct { 8 | Set map[int64]bool 9 | } 10 | 11 | func NewHash64Set() *Hash64Set { 12 | return &Hash64Set{make(map[int64]bool)} 13 | } 14 | 15 | func (Set *Hash64Set) AddAll(other types.HashSet) { 16 | for k, v := range other.GetS() { 17 | Set.Set[k] = v 18 | } 19 | } 20 | 21 | func (Set *Hash64Set) Add(i int64) bool { 22 | _, found := Set.Set[i] 23 | Set.Set[i] = true 24 | return !found 25 | } 26 | 27 | func (Set *Hash64Set) Contains(i int64) bool { 28 | _, found := Set.Set[i] 29 | return found 30 | } 31 | 32 | func (Set *Hash64Set) GetS() map[int64]bool { 33 | return Set.Set 34 | } 35 | 36 | func (Set *Hash64Set) Get(i int64) bool { 37 | _, found := Set.Set[i] 38 | return found 39 | } 40 | 41 | func (Set *Hash64Set) Remove(i int64) { 42 | delete(Set.Set, i) 43 | } 44 | 45 | func (Set *Hash64Set) Length() int { 46 | return len(Set.Set) 47 | } 48 | -------------------------------------------------------------------------------- /utils/int64-priority-queue.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "errors" 5 | ) 6 | 7 | //Since we are going to remove the smallest count we need this to be a min priorityQueue 8 | 9 | type int64WithPriority struct { 10 | actualInt int64 11 | priority int64 12 | } 13 | type Int64PriorityQueue struct { 14 | heap []int64WithPriority 15 | heapSize int 16 | } 17 | 18 | func NewInt64PriorityQueue() *Int64PriorityQueue { 19 | heap := make([]int64WithPriority, 16) 20 | return &Int64PriorityQueue{ 21 | heapSize: 0, 22 | heap: heap, 23 | } 24 | } 25 | 26 | func (this *Int64PriorityQueue) Dequeue() (int64, error) { 27 | if this.heapSize < 1 { 28 | err := errors.New("Queue contains no int64s") 29 | return 0, err 30 | } 31 | var result = this.heap[1] 32 | this.heap[1] = this.heap[this.heapSize] 33 | this.heapSize-- 34 | this.percolateDown(1) 35 | return result.actualInt, nil 36 | } 37 | 38 | func (this *Int64PriorityQueue) IsEmpty() bool { 39 | return this.heapSize == 0 40 | } 41 | 42 | func (this *Int64PriorityQueue) PeakMinPriority() int64 { 43 | return this.heap[1].priority 44 | } 45 | 46 | func (this *Int64PriorityQueue) Poll() int64 { 47 | var result, error = this.Dequeue() 48 | if error != nil { 49 | return 0 50 | } 51 | return result 52 | } 53 | 54 | func (this *Int64PriorityQueue) Remove(toRemove int64) bool { 55 | for i := 1; i <= this.heapSize; i++ { 56 | if this.heap[i].actualInt == toRemove { 57 | this.heap[i] = this.heap[this.heapSize] 58 | this.heapSize-- 59 | //We dont know if we need to percolate up or down so do both 60 | this.percolateUp(i) 61 | this.percolateDown(i) 62 | return true 63 | } 64 | } 65 | return false 66 | } 67 | 68 | func (this *Int64PriorityQueue) Enqueue(newInt int64, priority int64) { 69 | newIntObj := int64WithPriority{actualInt: newInt, priority: priority} 70 | this.heapSize++ 71 | if this.heapSize == len(this.heap) { 72 | var newHeap = make([]int64WithPriority, len(this.heap)*2) 73 | copy(newHeap, this.heap) 74 | this.heap = newHeap 75 | } 76 | this.heap[this.heapSize] = newIntObj 77 | this.percolateUp(this.heapSize) 78 | } 79 | 80 | func (this *Int64PriorityQueue) Size() int { 81 | return this.heapSize 82 | } 83 | 84 | func (this 
*Int64PriorityQueue) percolateUp(lowerIndex int) { 85 | if lowerIndex < 2 { 86 | return 87 | } 88 | var upperIndex = lowerIndex / 2 89 | if this.compare(lowerIndex, upperIndex) < 0 { 90 | this.swap(lowerIndex, upperIndex) 91 | this.percolateUp(upperIndex) 92 | } 93 | //Else we have fixed the priorityQueue; 94 | } 95 | 96 | func (this *Int64PriorityQueue) percolateDown(upperIndex int) { 97 | var lowerIndex = 2 * upperIndex 98 | if lowerIndex > this.heapSize { 99 | return // If this node has no children we are done. 100 | } 101 | if this.compare(lowerIndex, upperIndex) < 0 { 102 | this.swap(lowerIndex, upperIndex) 103 | this.percolateDown(lowerIndex) 104 | this.percolateDown(upperIndex) 105 | } else if lowerIndex+1 <= this.heapSize && this.compare(lowerIndex+1, upperIndex) < 0 { 106 | this.swap(lowerIndex+1, upperIndex) 107 | this.percolateDown(lowerIndex + 1) 108 | } 109 | //Else we have fixed the priorityQueue; 110 | } 111 | 112 | func (this *Int64PriorityQueue) swap(index1 int, index2 int) { 113 | var temp = this.heap[index1] 114 | this.heap[index1] = this.heap[index2] 115 | this.heap[index2] = temp 116 | } 117 | 118 | func (this *Int64PriorityQueue) compare(index1 int, index2 int) int { 119 | if this.heap[index1].priority > this.heap[index2].priority { 120 | return 1 121 | } else if this.heap[index1].priority < this.heap[index2].priority { 122 | return -1 123 | } 124 | return 0 125 | } 126 | -------------------------------------------------------------------------------- /utils/iterator.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | type IterableSlice struct { 4 | position int 5 | data [][]float64 6 | lshVals []int64 7 | } 8 | 9 | func (this *IterableSlice) Next() (value []float64) { 10 | this.position++ 11 | return this.data[this.position] 12 | } 13 | 14 | func (this *IterableSlice) Size() (count int) { 15 | return len(this.data); 16 | } 17 | 18 | func (this *IterableSlice) PeakLSH() (lshValue int64) { 19 | if this.lshVals == nil { 20 | panic("Cannot call PeakLSH until after StoreLSHValues") 21 | } 22 | return this.lshVals[this.position] 23 | } 24 | 25 | func (this *IterableSlice) StoreLSHValues(lshVals []int64) { 26 | this.lshVals = lshVals 27 | } 28 | 29 | func (this *IterableSlice) Append(data []float64) { 30 | this.data = append(this.data, data) 31 | } 32 | 33 | func (this *IterableSlice) HasNext() (ok bool) { 34 | this.position++ 35 | if this.position >= len(this.data) { 36 | this.position-- 37 | return false 38 | } 39 | this.position-- 40 | return true 41 | } 42 | 43 | func (this *IterableSlice) GetS() [][]float64 { 44 | return this.data 45 | } 46 | 47 | func (this *IterableSlice) Reset() { 48 | this.position = -1 49 | } 50 | 51 | func NewIterator(data [][]float64) *IterableSlice { 52 | return &IterableSlice{-1, data, nil} 53 | } 54 | -------------------------------------------------------------------------------- /utils/plot-tool.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "github.com/gonum/matrix/mat64" 5 | "github.com/gonum/plot" 6 | "github.com/gonum/plot/palette" 7 | "github.com/gonum/plot/plotter" 8 | "github.com/gonum/plot/plotutil" 9 | "github.com/gonum/plot/vg" 10 | "image/color" 11 | "math" 12 | "strconv" 13 | ) 14 | 15 | var ( 16 | grayscale = Grayscale{Min: 0, Max: 1} 17 | ) 18 | 19 | // Linearly maps the colors between light gray and black 20 | type Grayscale struct { 21 | Min float64 22 | Max float64 23 | } 24 | 25 | func (c 
*Grayscale) Color(z float64) color.Color { 26 | val := (1 - (z-c.Min)/(c.Max-c.Min)) 27 | if val < 0 { 28 | val = 0 29 | } else if val > 1 { 30 | val = 1 31 | } 32 | val *= 255 * 0.9 33 | u8v := uint8(val) 34 | return color.RGBA{u8v, u8v, u8v, 255} 35 | } 36 | 37 | // Util for generating plots 38 | func GeneratePlots(x, y [][]float64, title, xLabel, yLabel, fileName string, legendLabel []string) { 39 | outPlotPoints := make([]plotter.XYs, len(x)) 40 | outPlots := make([]*plot.Plot, len(x)) 41 | 42 | for i, _ := range outPlotPoints { 43 | outPlot, err := plot.New() 44 | outPlots[i] = outPlot 45 | outPlots[i].Title.Text = title 46 | outPlots[i].X.Label.Text = xLabel 47 | outPlots[i].Y.Label.Text = yLabel 48 | outPlotPoints[i] = make(plotter.XYs, len(x[0])) 49 | for j, _ := range x[0] { 50 | outPlotPoints[i][j].X = x[i][j] 51 | outPlotPoints[i][j].Y = y[i][j] 52 | } 53 | err = plotutil.AddLines(outPlots[i], 54 | legendLabel[i], outPlotPoints[i]) 55 | if err != nil { 56 | panic(err) 57 | } 58 | 59 | if err = outPlot.Save(6*vg.Inch, 6*vg.Inch, (fileName+strconv.FormatInt(int64(i), 16))+".png"); err != nil { 60 | panic(err) 61 | } 62 | } 63 | } 64 | 65 | type Grid struct { 66 | Matrix *mat64.Dense 67 | } 68 | 69 | func NewGrid(matrix *mat64.Dense) *Grid { 70 | return &Grid{ 71 | Matrix: matrix, 72 | } 73 | } 74 | 75 | // Dims returns the dimensions of the grid. 76 | func (this *Grid) Dims() (c, r int) { 77 | return this.Matrix.Dims() 78 | } 79 | 80 | // Z returns the value of a grid value at (c, r). 81 | // It will panic if c or r are out of bounds for the grid. 82 | func (this *Grid) Z(c, r int) float64 { 83 | return this.Matrix.ColView(c).At(r, 0) 84 | } 85 | 86 | // X returns the coordinate for the column at the index x. 87 | // It will panic if c is out of bounds for the grid. 88 | func (this *Grid) X(c int) float64 { 89 | return this.Matrix.ColView(c).At(c, 0) 90 | } 91 | 92 | // Y returns the coordinate for the row at the index r. 93 | // It will panic if r is out of bounds for the grid. 
94 | func (this *Grid) Y(r int) float64 { 95 | return this.Matrix.RowView(r).At(r, 0) 96 | } 97 | 98 | func MaxFloat(collection []float64) float64 { 99 | max := collection[0] 100 | for _, value := range collection { 101 | if value > max { 102 | max = value 103 | } 104 | } 105 | return max 106 | } 107 | 108 | func HeatMap(image []float64, index int, fileName string) { 109 | dim := int(math.Sqrt(float64(len(image)))) 110 | 111 | m := NewGrid(mat64.NewDense(dim, dim, image)) 112 | 113 | h := plotter.NewHeatMap(m, palette.Heat(10, 1)) 114 | 115 | p, err := plot.New() 116 | if err != nil { 117 | panic(err) 118 | } 119 | p.Title.Text = "Heat map" 120 | p.Y.Max = MaxFloat(image) 121 | p.X.Max = MaxFloat(image) 122 | p.Add(h) 123 | 124 | err = p.Save(6*vg.Inch, 6*vg.Inch, fileName+strconv.FormatInt(int64(index), 16)+".png") 125 | if err != nil { 126 | panic(err) 127 | } 128 | } 129 | 130 | // 784 Bits 131 | func Paint(image []float64, imageId int, fileName string, threshold float64) { 132 | outPlotPoints := make(plotter.XYs, len(image)) 133 | outPlot, err := plot.New() 134 | if err != nil { 135 | panic(err) 136 | } 137 | x := 0 138 | y := 0 139 | for i, bit := range image { 140 | outPlotPoints[i].X = float64(x) 141 | 142 | if bit > threshold { 143 | outPlotPoints[i].Y = float64(y) 144 | } else { 145 | outPlotPoints[i].Y = 0 146 | } 147 | 148 | if i%int(math.Sqrt(float64(len(image)))) == 0 { 149 | x = 0 150 | y++ 151 | } else { 152 | x++ 153 | } 154 | } 155 | outPlot.Add(plotter.NewGrid()) 156 | s, _ := plotter.NewScatter(outPlotPoints) 157 | outPlot.Add(s) 158 | if err = outPlot.Save(6*vg.Inch, 6*vg.Inch, fileName+strconv.FormatInt(int64(imageId), 16)+".png"); err != nil { 159 | panic(err) 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /utils/stattest.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "math/rand" 5 | ) 6 | 7 | type StatTest struct { 8 | sampRatio float64 9 | } 10 | 11 | func NewStatTest(sampRatio float64) *StatTest { 12 | return &StatTest{ 13 | sampRatio: sampRatio, 14 | } 15 | } 16 | 17 | func HashCode(num int64) int64 { 18 | return int64(uint64(num) ^ uint64(num)>>64) 19 | } 20 | 21 | func (this *StatTest) UpdateVarianceSample(row []float64) float64 { 22 | var n float64 = 0 23 | var mean float64 = 0 24 | var M2 float64 = 0 25 | if rand.Float64() > this.sampRatio { 26 | return M2 / (n - 1.0) 27 | } 28 | for _, x := range row { 29 | n++ 30 | delta := x - mean 31 | mean = mean + delta/n 32 | M2 = M2 + delta*(x-mean) 33 | } 34 | if n < 2 { 35 | return 0 36 | } 37 | return M2 / (n - 1.0) 38 | } 39 | 40 | func VarianceSample(data [][]float64, sampRatio float64) float64 { 41 | var n float64 = 0 42 | var mean float64 = 0 43 | var M2 float64 = 0 44 | len := len(data) 45 | for i := 0; i < int(sampRatio)*len; i++ { 46 | row := data[rand.Intn(len)] 47 | for _, x := range row { 48 | n++ 49 | delta := x - mean 50 | mean = mean + delta/n 51 | M2 = M2 + delta*(x-mean) 52 | } 53 | } 54 | if n < 2 { 55 | return 0 56 | } 57 | return M2 / (n - 1.0) 58 | } 59 | 60 | func (this *StatTest) VarianceAll(data [][]float64) float64 { 61 | var n float64 = 0 62 | var mean float64 = 0 63 | var M2 float64 = 0 64 | for _, row := range data { 65 | for _, x := range row { 66 | n++ 67 | delta := x - mean 68 | mean = mean + delta/n 69 | M2 = M2 + delta*(x-mean) 70 | } 71 | } 72 | if n < 2 { 73 | return 0 74 | } 75 | return M2 / (n - 1.0) 76 | } 77 | 78 | func (this *StatTest) 
AverageAll(data [][]float64) float64 { 79 | var n float64 = 0 80 | var mean float64 = 0 81 | for _, row := range data { 82 | for _, x := range row { 83 | n++ 84 | mean += x 85 | } 86 | } 87 | return mean / n 88 | } 89 | 90 | func (this *StatTest) VarianceCol(data [][]float64) []float64 { 91 | leng := len(data) 92 | if leng < 1 { 93 | return nil 94 | } 95 | vars := make([]float64, len(data[0])) 96 | var n float64 = 0 97 | var mean float64 = 0 98 | var M2 float64 = 0 99 | for i := 0; i < leng; i++ { 100 | n = 0 101 | mean = 0 102 | M2 = 0 103 | for _, x := range data[i] { 104 | n++ 105 | delta := x - mean 106 | mean = mean + delta/n 107 | M2 = M2 + delta*(x-mean) 108 | } 109 | if n < 2 { 110 | vars[i] = 0 111 | } else { 112 | vars[i] = M2 / (n - 1.0) 113 | } 114 | } 115 | return vars 116 | } 117 | 118 | func (this *StatTest) AverageCol(data [][]float64) []float64 { 119 | n := len(data) 120 | if n < 1 { 121 | return nil 122 | } 123 | d := len(data[0]) 124 | avgs := make([]float64, d) 125 | for _, tmp := range data { 126 | for j := 0; j < d; j++ { 127 | avgs[j] += (tmp[j] / float64(n)) 128 | } 129 | } 130 | return avgs 131 | } 132 | 133 | func (this *StatTest) Variance(row []float64) float64 { 134 | var n float64 = 0 135 | var mean float64 = 0 136 | var M2 float64 = 0 137 | for _, x := range row { 138 | n++ 139 | delta := x - mean 140 | mean = mean + delta/n 141 | M2 = M2 + delta*(x-mean) 142 | } 143 | if n < 2 { 144 | return 0 145 | } 146 | return M2 / (n - 1.0) 147 | } 148 | -------------------------------------------------------------------------------- /utils/vectors.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | ) 7 | 8 | func Normalize(input []float64) []float64 { 9 | var length float64 = 0 10 | for _, dimension := range input { 11 | length += (dimension * dimension) 12 | } 13 | length = math.Sqrt(length) 14 | result := make([]float64, len(input)) 15 | for i, dimension := range input { 16 | result[i] = dimension / length 17 | } 18 | return result 19 | } 20 | 21 | func Random(d int, r []*rand.Rand) []float64 { 22 | v := make([]float64, d) 23 | for i := 0; i < d; i++ { 24 | v[i] = r[i].NormFloat64() 25 | } 26 | return v 27 | } 28 | 29 | func RandomRotation(d int, r2 []*rand.Rand) [][]float64 { 30 | R := make([][]float64, d) 31 | for i := 0; i < d; i++ { 32 | R[i] = Random(d, r2) 33 | u := R[i] 34 | for j := 0; j < i; j++ { 35 | v := R[j] 36 | vnorm := Norm(v) 37 | if vnorm == 0 { 38 | return RandomRotation(d, r2) 39 | } 40 | vs := make([]float64, len(v)) 41 | copy(vs, v) 42 | Scale(vs, Dot(v, u)/vnorm) 43 | u = Sub(u, vs) 44 | } 45 | u = Scale(u, 1.0/Norm(u)) 46 | } 47 | return R 48 | } 49 | 50 | func Argmaxi(p []float64, vs [][]float64, d int) int64 { 51 | var maxi int64 = 0 52 | var max float64 = 0 53 | var abs float64 54 | for i := 0; i < d; i++ { 55 | dot := Dot(p, vs[i]) 56 | if dot >= 0 { 57 | abs = dot 58 | } else { 59 | abs = -dot 60 | } 61 | if abs < max { 62 | continue 63 | } 64 | max = abs 65 | if dot >= 0 { 66 | maxi = int64(i) 67 | } else { 68 | maxi = int64(i + d) 69 | } 70 | } 71 | return maxi 72 | } 73 | 74 | func Norm(t []float64) float64 { 75 | var n float64 = 0 76 | for i := 0; i < len(t); i++ { 77 | n += t[i] * t[i] 78 | } 79 | return math.Sqrt(n) 80 | } 81 | 82 | func Scale(t []float64, s float64) []float64 { 83 | for i := 0; i < len(t); i++ { 84 | t[i] *= s 85 | } 86 | return t 87 | } 88 | 89 | func Dot(t, u []float64) float64 { 90 | var s float64 = 0 91 | for i := 
0; i < len(t); i++ { 92 | s += t[i] * u[i] 93 | } 94 | return s 95 | } 96 | 97 | func Sub(t, u []float64) []float64 { 98 | for i := 0; i < len(t); i++ { 99 | t[i] -= u[i] 100 | } 101 | return t 102 | } 103 | 104 | func Max(collection []int64) int64 { 105 | max := collection[0] 106 | for _, value := range collection { 107 | if value > max { 108 | max = value 109 | } 110 | } 111 | return max 112 | } 113 | 114 | func Min(collection []int64) int64 { 115 | min := collection[0] 116 | for _, value := range collection { 117 | if value < min { 118 | min = value 119 | } 120 | } 121 | return min 122 | } 123 | 124 | func Distance(x, y []float64) float64 { 125 | if len(x) < 1 { 126 | return 0 127 | } 128 | if len(y) < 1 { 129 | return 0 130 | } 131 | dist := math.Abs((x[0] - y[0]) * (x[0] - y[0])) 132 | for i := 1; i < len(x); i++ { 133 | dist += math.Abs((x[i] - y[i]) * (x[i] - y[i])) 134 | } 135 | return math.Sqrt(dist) 136 | } 137 | 138 | func FindNearestDistance(x []float64, DB [][]float64) (int, float64) { 139 | mindist := Distance(x, DB[0]) 140 | minindex := 0 141 | var tmp float64 142 | for i := 1; i < len(DB); i++ { 143 | tmp = Distance(x, DB[i]) 144 | if tmp <= mindist { 145 | mindist = tmp 146 | minindex = i 147 | } 148 | } 149 | return minindex, Distance(x, DB[minindex]); 150 | } 151 | --------------------------------------------------------------------------------
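A minimal end-to-end usage sketch, assembled only from calls that the tests above already exercise (reader.NewSimpleArray, simple.NewSimple, Run, GetCentroids, and utils.FindNearestDistance in tests/simple_test.go). The data shape and cluster count below are illustrative values in the spirit of those tests, not a recommended configuration; treat this as a sketch rather than official usage documentation.

package main

import (
    "fmt"
    "math/rand"

    "github.com/wilseypa/rphash-golang/reader"
    "github.com/wilseypa/rphash-golang/simple"
    "github.com/wilseypa/rphash-golang/utils"
)

func main() {
    // Illustrative sizes only; the tests use 400-4000 rows with dimensionality 1000.
    numClusters := 5
    numRows := 400
    dimensionality := 100

    // Generate random data, as the benchmarks in tests/simple_test.go do.
    data := make([][]float64, numRows)
    for i := range data {
        row := make([]float64, dimensionality)
        for j := range row {
            row[j] = rand.Float64()
        }
        data[i] = row
    }

    // Batch ("simple") RPHash pipeline: wrap the data, run, read the centroids.
    rphashObject := reader.NewSimpleArray(data, numClusters)
    simpleObject := simple.NewSimple(rphashObject)
    simpleObject.Run()
    centroids := rphashObject.GetCentroids()

    // Mirror the accounting done in the tests: average squared distance to the nearest centroid.
    totalSqDist := 0.0
    for _, vector := range data {
        _, dist := utils.FindNearestDistance(vector, centroids)
        totalSqDist += dist * dist
    }
    fmt.Println("centroids:", len(centroids))
    fmt.Println("average squared distance:", totalSqDist/float64(len(data)))
}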