├── Gophers.008.crop.png ├── GOPHER ├── .travis.yml ├── LICENSE ├── utils.go ├── hashing.go ├── doc.go ├── dimreduction_test.go ├── measures └── pairwise │ └── comparisons.go ├── index_test.go ├── weightings_test.go ├── weightings.go ├── example_test.go ├── vectorisers_test.go ├── README.md ├── dimreduction.go ├── index.go ├── lsh.go ├── lda_test.go ├── randomprojection_test.go ├── vectorisers.go ├── randomprojection.go └── lda.go /Gophers.008.crop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/james-bowman/nlp/HEAD/Gophers.008.crop.png -------------------------------------------------------------------------------- /GOPHER: -------------------------------------------------------------------------------- 1 | The Go gopher was designed by Renee French and is licensed under the Creative Commons Attributions 3.0. 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.13.x 5 | - 1.14.x 6 | - tip 7 | 8 | before_install: 9 | - go get -t -v ./... 10 | 11 | script: 12 | - go test -coverprofile=coverage.txt -covermode=atomic 13 | 14 | after_success: 15 | - bash <(curl -s https://codecov.io/bash) 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 James Bowman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "github.com/james-bowman/sparse" 5 | "gonum.org/v1/gonum/mat" 6 | ) 7 | 8 | // ColDo executes fn for each column j in m. If the matrix implements the mat.ColViewer 9 | // interface then this interface will be used to iterate over the column vectors more 10 | // efficiently. If the matrix implements the sparse.TypeConverter interface then the 11 | // matrix will be converted to a CSC matrix (which implements the mat.ColViewer 12 | // interface) so that it can benefit from the same optimisation. 
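//
// A minimal usage sketch, assuming tdm is a term document matrix (any mat.Matrix)
// built elsewhere with one of the package's Vectorisers:
//
//	ColDo(tdm, func(j int, doc mat.Vector) {
//		fmt.Printf("document %d has %d rows (terms)\n", j, doc.Len())
//	})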
13 | func ColDo(m mat.Matrix, fn func(j int, vec mat.Vector)) { 14 | if v, isOk := m.(mat.Vector); isOk { 15 | fn(0, v) 16 | return 17 | } 18 | 19 | if cv, isOk := m.(mat.ColViewer); isOk { 20 | _, c := m.Dims() 21 | for j := 0; j < c; j++ { 22 | fn(j, cv.ColView(j)) 23 | } 24 | return 25 | } 26 | 27 | if sv, isOk := m.(sparse.TypeConverter); isOk { 28 | csc := sv.ToCSC() 29 | _, c := csc.Dims() 30 | for j := 0; j < c; j++ { 31 | fn(j, csc.ColView(j)) 32 | } 33 | return 34 | } 35 | 36 | r, c := m.Dims() 37 | for j := 0; j < c; j++ { 38 | fn(j, mat.NewVecDense(r, mat.Col(nil, j, m))) 39 | } 40 | } 41 | 42 | // ColNonZeroElemDo executes fn for each non-zero element in column j of matrix m. 43 | // If m implements mat.ColNonZeroDoer then this interface will be used to perform 44 | // the iteration. 45 | func ColNonZeroElemDo(m mat.Matrix, j int, fn func(i, j int, v float64)) { 46 | colNonZeroDoer, isSparse := m.(mat.ColNonZeroDoer) 47 | r, _ := m.Dims() 48 | 49 | if isSparse { 50 | colNonZeroDoer.DoColNonZero(j, fn) 51 | } else { 52 | for i := 0; i < r; i++ { 53 | v := m.At(i, j) 54 | if v != 0 { 55 | fn(i, j, v) 56 | } 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /hashing.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "math/rand" 5 | 6 | "github.com/james-bowman/sparse" 7 | "gonum.org/v1/gonum/mat" 8 | ) 9 | 10 | // SimHash implements the SimHash Locality Sensitive Hashing (LSH) algorithm for 11 | // angular distance using sign random projections based on the work of Moses S. Charikar. 12 | // The distance between the original vectors is preserved through the hashing process 13 | // such that hashed vectors can be compared using Hamming Similarity for a faster, 14 | // more space efficient, approximation of Cosine Similarity for the original vectors. 15 | // 16 | // Charikar, Moses S. "Similarity Estimation Techniques from Rounding Algorithms" 17 | // in Proceedings of the thiry-fourth annual ACM symposium on Theory of computing - 18 | // STOC ’02, 2002, p. 380. 19 | // https://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CharikarEstim.pdf 20 | type SimHash struct { 21 | hyperplanes []*mat.VecDense 22 | } 23 | 24 | // NewSimHash constructs a new SimHash creating a set of locality sensitive 25 | // hash functions which are combined to accept input vectors of length dim 26 | // and produce hashed binary vector fingerprints of length bits. This method 27 | // creates a series of random hyperplanes which are then compared to each 28 | // input vector to produce the output hashed binary vector encoding the input 29 | // vector's location in vector space relative to the hyperplanes. Each bit in 30 | // the output vector corresponds to the sign (1/0 for +/-) of the result of 31 | // the dot product comparison with each random hyperplane. 32 | func NewSimHash(bits int, dim int) *SimHash { 33 | // Generate random hyperplanes 34 | hyperplanes := make([]*mat.VecDense, bits) 35 | 36 | for j := 0; j < bits; j++ { 37 | p := make([]float64, dim) 38 | for i := 0; i < dim; i++ { 39 | p[i] = rand.NormFloat64() 40 | } 41 | hyperplanes[j] = mat.NewVecDense(dim, p) 42 | } 43 | return &SimHash{hyperplanes: hyperplanes} 44 | } 45 | 46 | // Hash accepts a Vector and outputs a BinaryVec (which also implements the 47 | // Gonum Vector interface). 
This method will panic if the input vector is of a 48 | // different length than the dim parameter used when constructing the SimHash. 49 | func (h *SimHash) Hash(v mat.Vector) *sparse.BinaryVec { 50 | bits := len(h.hyperplanes) 51 | dim := h.hyperplanes[0].Len() 52 | if dim != v.Len() { 53 | panic("The supplied vector has a different number of dimensions from the projected hyperplanes") 54 | } 55 | sig := sparse.NewBinaryVec(bits) 56 | for i := 0; i < bits; i++ { 57 | if sparse.Dot(v, h.hyperplanes[i]) >= 0 { 58 | sig.SetBit(i) 59 | } 60 | } 61 | return sig 62 | } 63 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package nlp provides implementations of selected machine learning algorithms for natural language processing of text corpora. The primary focus is the statistical semantics of plain-text documents supporting semantic analysis and retrieval of semantically similar documents. 3 | 4 | The package makes use of the Gonum (https://www.gonum.org/) library for linear algebra and scientific computing with some inspiration taken from Python's scikit-learn (http://scikit-learn.org/stable/) and Gensim (https://radimrehurek.com/gensim/). 5 | 6 | Overview 7 | 8 | The primary intended use case is to support document input as text strings encoded as a matrix of numerical feature vectors called a `term document matrix`. Each column in the matrix corresponds to a document in the corpus and each row corresponds to a unique term occurring in the corpus. The individual elements within the matrix contain the frequency with which each term occurs within each document (referred to as `term frequency`). Whilst textual data from document corpora are the primary intended use case, the algorithms can be used with other types of data from other sources once encoded (vectorised) into a suitable matrix e.g. image data, sound data, users/products, etc. 9 | 10 | These matrices can be processed and manipulated through the application of additional transformations for weighting features, identifying relationships or optimising the data for analysis, information retrieval and/or predictions. 11 | 12 | Typically the algorithms in this package implement one of three primary interfaces: 13 | 14 | Vectoriser - Takes document input as strings and outputs matrices of numerical features e.g. term frequency. 15 | Transformer - Takes matrices of numerical features and applies some logic/transformation to output a new matrix. 16 | Comparer - Functions taking two vectors (columns from a matrix) and outputting a distance/similarity measure. 17 | 18 | One of the implementations of Vectoriser is Pipeline which can be used to wire together pipelines composed of a Vectoriser and one or more Transformers arranged in serial so that the output from each stage forms the input of the next. This can be used to construct a classic LSI (Latent Semantic Indexing) pipeline (vectoriser -> TF.IDF weighting -> Truncated SVD): 19 | 20 | pipeline := nlp.NewPipeline( 21 | nlp.NewCountVectoriser(), 22 | nlp.NewTfidfTransformer(), 23 | nlp.NewTruncatedSVD(100), 24 | ) 25 | 26 | Whilst they take different inputs, both Vectorisers and Transformers have 3 primary methods: 27 | 28 | Fit() - Trains the model based upon the supplied input training data. 29 | Transform() - Transforms the input into the output matrix (requires the model to be already fitted by a previous call to Fit() or FitTransform()).
30 | FitTransform() - Convenience method combining Fit() and Transform() methods to transform input data, fitting the model to the input data in the process. 31 | */ 32 | package nlp 33 | -------------------------------------------------------------------------------- /dimreduction_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "gonum.org/v1/gonum/mat" 8 | ) 9 | 10 | func TestTruncatedSVDFitTransform(t *testing.T) { 11 | var tests = []struct { 12 | m int 13 | n int 14 | input []float64 15 | k int 16 | r int 17 | c int 18 | result []float64 19 | }{ 20 | { 21 | m: 6, n: 4, 22 | input: []float64{ 23 | 1, 3, 5, 2, 24 | 8, 1, 0, 0, 25 | 2, 1, 0, 1, 26 | 0, 0, 0, 0, 27 | 0, 0, 0, 1, 28 | 0, 1, 0, 0, 29 | }, 30 | k: 2, 31 | r: 2, c: 4, 32 | result: []float64{ 33 | -8.090, -2.212, -1.695, -0.955, 34 | 1.888, -2.524, -4.649, -1.930, 35 | }, 36 | }, 37 | } 38 | 39 | for _, test := range tests { 40 | transformer := NewTruncatedSVD(test.k) 41 | input := mat.NewDense(test.m, test.n, test.input) 42 | expResult := mat.NewDense(test.r, test.c, test.result) 43 | 44 | result, err := transformer.FitTransform(input) 45 | 46 | if err != nil { 47 | t.Errorf("Failed Truncated SVD transform caused by %v", err) 48 | } 49 | 50 | if !mat.EqualApprox(expResult, result, 0.01) { 51 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n", 52 | mat.Formatted(expResult), 53 | mat.Formatted(result)) 54 | t.Fail() 55 | } 56 | 57 | result2, err := transformer.Transform(input) 58 | 59 | if err != nil { 60 | t.Errorf("Failed Truncated SVD transform caused by %v", err) 61 | } 62 | 63 | if !mat.EqualApprox(result, result2, 0.001) { 64 | t.Logf("First matrix: \n%v\n but second matrix: \n%v\n", 65 | mat.Formatted(result), 66 | mat.Formatted(result2)) 67 | t.Fail() 68 | } 69 | } 70 | } 71 | 72 | func TestPCAFitTransform(t *testing.T) { 73 | var tests = []struct { 74 | m int 75 | n int 76 | input []float64 77 | k int 78 | r int 79 | c int 80 | result []float64 81 | }{ 82 | { 83 | m: 6, n: 4, 84 | input: []float64{ 85 | 1, 3, 5, 2, 86 | 8, 1, 0, 0, 87 | 2, 1, 0, 1, 88 | 0, 0, 0, 0, 89 | 0, 0, 0, 1, 90 | 0, 1, 0, 0, 91 | }, 92 | k: 2, 93 | r: 2, c: 4, 94 | result: []float64{ 95 | -7.478, -0.128, 1.591, 0.496, 96 | 2.937, 2.581, 4.240, 1.110, 97 | }, 98 | }, 99 | } 100 | 101 | for _, test := range tests { 102 | transformer := NewPCA(test.k) 103 | input := mat.NewDense(test.m, test.n, test.input) 104 | expResult := mat.NewDense(test.r, test.c, test.result) 105 | 106 | result, err := transformer.FitTransform(input) 107 | 108 | if err != nil { 109 | t.Errorf("Failed Truncated SVD transform caused by %v", err) 110 | } 111 | 112 | if !mat.EqualApprox(expResult, result, 0.01) { 113 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n", 114 | mat.Formatted(expResult), 115 | mat.Formatted(result)) 116 | t.Fail() 117 | } 118 | 119 | result2, err := transformer.Transform(input) 120 | 121 | if err != nil { 122 | t.Errorf("Failed Truncated SVD transform caused by %v", err) 123 | } 124 | 125 | if !mat.EqualApprox(result, result2, 0.001) { 126 | t.Logf("First matrix: \n%v\n but second matrix: \n%v\n", 127 | mat.Formatted(result), 128 | mat.Formatted(result2)) 129 | t.Fail() 130 | } 131 | } 132 | } 133 | 134 | func TestTruncatedSVDSaveLoad(t *testing.T) { 135 | var transforms = []struct { 136 | wanted *TruncatedSVD 137 | }{ 138 | { 139 | wanted: &TruncatedSVD{ 140 | Components: mat.NewDense(4, 2, []float64{ 141 | 1, 5, 142 | 3, 2, 143 | 9, 0, 144 
| 8, 4, 145 | }), 146 | K: 2, 147 | }, 148 | }, 149 | } 150 | 151 | for ti, test := range transforms { 152 | t.Logf("**** TestTruncatedSVDSaveLoad - Test Run %d.\n", ti+1) 153 | 154 | buf := new(bytes.Buffer) 155 | if err := test.wanted.Save(buf); err != nil { 156 | t.Errorf("Error encoding: %v\n", err) 157 | continue 158 | } 159 | 160 | var b TruncatedSVD 161 | if err := b.Load(buf); err != nil { 162 | t.Errorf("Error unencoding: %v\n", err) 163 | continue 164 | } 165 | 166 | if !mat.Equal(test.wanted.Components, b.Components) { 167 | t.Logf("Components mismatch: Wanted %v but got %v\n", mat.Formatted(test.wanted.Components), mat.Formatted(b.Components)) 168 | t.Fail() 169 | } 170 | if test.wanted.K != b.K { 171 | t.Logf("K value mismatch: Wanted %d but got %d\n", test.wanted.K, b.K) 172 | t.Fail() 173 | } 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /measures/pairwise/comparisons.go: -------------------------------------------------------------------------------- 1 | package pairwise 2 | 3 | import ( 4 | "math" 5 | 6 | "github.com/james-bowman/sparse" 7 | "gonum.org/v1/gonum/mat" 8 | ) 9 | 10 | // Comparer is a type of function that compares two mat.Vector types and 11 | // returns a value indicating how similar they are. 12 | type Comparer func(a, b mat.Vector) float64 13 | 14 | // CosineSimilarity calculates the cosine of the angles of 2 vectors i.e. how 15 | // similar they are. Possible values range up to 1 (exact match). NaN will be 16 | // returned if either vector is zero length or contains only 0s. 17 | func CosineSimilarity(a, b mat.Vector) float64 { 18 | // Cosine angle between two vectors is equal to their dot product divided by 19 | // the product of their L2 norms 20 | dotProduct := sparse.Dot(a, b) 21 | norma := sparse.Norm(a, 2.0) 22 | normb := sparse.Norm(b, 2.0) 23 | 24 | if norma == 0 || normb == 0 { 25 | return math.NaN() 26 | } 27 | 28 | return (dotProduct / (norma * normb)) 29 | } 30 | 31 | // CosineDistance is the complement of CosineSimilarity in the positive space. 32 | // CosineDistance = 1.0 - CosineSimilariy 33 | // It should be noted that CosineDistance is not strictly a valid distance measure 34 | // as it does not obey triangular inequality. For applications requiring a distance 35 | // measure that conforms with the strict definition then AngularDistance or 36 | // Euclidean distance (with all vectors L2 normalised first) should be used instead. 37 | // Whilst these distance measures may give different values, they will rank the same 38 | // as CosineDistance. 39 | func CosineDistance(a, b mat.Vector) float64 { 40 | return 1.0 - CosineSimilarity(a, b) 41 | } 42 | 43 | // AngularDistance is a distance measure closely related to CosineSimilarity. 44 | // It measures the difference between the angles of 2 vectors by taking 45 | // the inverse cosine (acos) of the CosineSimilarity and dividing by Pi. 46 | // Unlike CosineSimilarity, this distance measure is a valid distance measure 47 | // as it obeys triangular inequality. 48 | // See https://en.wikipedia.org/wiki/Cosine_similarity#Angular_distance_and_similarity 49 | func AngularDistance(a, b mat.Vector) float64 { 50 | cos := CosineSimilarity(a, b) 51 | if cos > 1 { 52 | cos = 1.0 53 | } 54 | theta := math.Acos(cos) 55 | return theta / math.Pi 56 | } 57 | 58 | // AngularSimilarity is the inverse of AngularDistance. 
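// Like AngularDistance, it is bounded between 0 and 1: for example, orthogonal vectors (CosineSimilarity 0) have an AngularSimilarity of 0.5, while identical vectors score 1.0.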
59 | // AngularSimilarity = 1.0 - AngularDistance 60 | func AngularSimilarity(a, b mat.Vector) float64 { 61 | return 1.0 - AngularDistance(a, b) 62 | } 63 | 64 | // HammingDistance is a distance measure sometimes referred to as the 65 | // `Matching Distance` and measures how different the 2 vectors are 66 | // in terms of the number of non-matching elements. This measurement 67 | // is normalised to provide the distance as proportional to the total 68 | // number of elements in the vectors. If a and b are not the same 69 | // shape then the function will panic. 70 | func HammingDistance(a, b mat.Vector) float64 { 71 | ba, aok := a.(*sparse.BinaryVec) 72 | bb, bok := b.(*sparse.BinaryVec) 73 | 74 | if aok && bok { 75 | return float64(ba.DistanceFrom(bb)) / float64(ba.Len()) 76 | } 77 | 78 | var count float64 79 | for i := 0; i < a.Len(); i++ { 80 | if a.AtVec(i) != b.AtVec(i) { 81 | count++ 82 | } 83 | } 84 | return count / float64(a.Len()) 85 | } 86 | 87 | // HammingSimilarity is the inverse of HammingDistance (1-HammingDistance) 88 | // and represents the proportion of elements within the 2 vectors that 89 | // exactly match. 90 | func HammingSimilarity(a, b mat.Vector) float64 { 91 | return 1.0 - HammingDistance(a, b) 92 | } 93 | 94 | // EuclideanDistance calculates the Euclidean distance 95 | // (l2 distance) between vectors a and b or more specifically 96 | // \sqrt{\sum_{i=1}^n (a_i - b_i)^2} 97 | func EuclideanDistance(a, b mat.Vector) float64 { 98 | var v mat.VecDense 99 | v.SubVec(a, b) 100 | return math.Sqrt(mat.Dot(&v, &v)) 101 | } 102 | 103 | // ManhattenDistance calculates the Manhatten distance (l1 distance) otherwise 104 | // known as the taxi cab distance between two vectors a and b. 105 | func ManhattenDistance(a, b mat.Vector) float64 { 106 | var v mat.VecDense 107 | v.SubVec(a, b) 108 | return mat.Norm(&v, 1) 109 | } 110 | 111 | // VectorLenSimilarity calculates the len of ab vectors 112 | func VectorLenSimilarity(a, b mat.Vector) float64 { 113 | dotProduct := sparse.Dot(a, b) 114 | if dotProduct == 0 { 115 | return math.NaN() 116 | } 117 | return math.Sqrt(dotProduct) 118 | } 119 | -------------------------------------------------------------------------------- /index_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "sort" 5 | "testing" 6 | 7 | "github.com/james-bowman/nlp/measures/pairwise" 8 | "github.com/james-bowman/sparse" 9 | "gonum.org/v1/gonum/floats" 10 | "gonum.org/v1/gonum/mat" 11 | ) 12 | 13 | func TestIndexerIndex(t *testing.T) { 14 | m := sparse.Random(sparse.DenseFormat, 100, 10, 1.0) 15 | 16 | tests := []struct { 17 | index Indexer 18 | }{ 19 | {index: NewLinearScanIndex(pairwise.CosineDistance)}, 20 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.CosineDistance)}, 21 | {index: NewLSHIndex(true, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.HammingDistance)}, 22 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewLSHForest(50, 20), pairwise.CosineDistance)}, 23 | } 24 | 25 | for ti, test := range tests { 26 | ColDo(m, func(j int, v mat.Vector) { 27 | test.index.Index(v, j) 28 | }) 29 | 30 | ColDo(m, func(j int, v mat.Vector) { 31 | matches := test.index.Search(v, 1) 32 | 33 | if len(matches) != 1 { 34 | t.Errorf("Test %d: Search expected 1 result but received %d", ti+1, len(matches)) 35 | } 36 | if matches[0].ID != j { 37 | t.Errorf("Test %d: Search expected to find %d but found %d", ti+1, j, matches[0].ID) 38 | } 39 | if 
matches[0].Distance < -0.0000001 || matches[0].Distance > 0.0000001 { 40 | t.Errorf("Test %d: Search match distance expected 0.0 but received %f", ti+1, matches[0].Distance) 41 | } 42 | }) 43 | } 44 | } 45 | 46 | func TestIndexerSearch(t *testing.T) { 47 | numCols := 10 48 | m := sparse.Random(sparse.DenseFormat, 100, numCols, 1.0) 49 | 50 | // build similarity matrix 51 | similarityMatrix := make([]float64, numCols*numCols) 52 | inds := make([][]int, numCols) 53 | ColDo(m, func(j int, v1 mat.Vector) { 54 | ColDo(m, func(i int, v2 mat.Vector) { 55 | similarityMatrix[j*numCols+i] = pairwise.CosineDistance(v1, v2) 56 | }) 57 | inds[j] = make([]int, numCols) 58 | floats.Argsort(similarityMatrix[j*numCols:(j+1)*numCols], inds[j]) 59 | for left, right := 0, len(inds[j])-1; left < right; left, right = left+1, right-1 { 60 | inds[j][left], inds[j][right] = inds[j][right], inds[j][left] 61 | similarityMatrix[j*numCols+left], similarityMatrix[j*numCols+right] = similarityMatrix[j*numCols+right], similarityMatrix[j*numCols+left] 62 | } 63 | }) 64 | 65 | tests := []struct { 66 | k int 67 | index Indexer 68 | }{ 69 | {k: numCols, index: NewLinearScanIndex(pairwise.CosineDistance)}, 70 | {k: numCols, index: NewLSHIndex(false, NewSimHash(700, 100), NewClassicLSH(7, 100), pairwise.CosineDistance)}, 71 | {k: numCols, index: NewLSHIndex(false, NewSimHash(1000, 100), NewLSHForest(50, 20), pairwise.CosineDistance)}, 72 | } 73 | 74 | for ti, test := range tests { 75 | ColDo(m, func(j int, v mat.Vector) { 76 | test.index.Index(v, j) 77 | }) 78 | 79 | ColDo(m, func(j int, v mat.Vector) { 80 | matches := test.index.Search(v, test.k) 81 | 82 | if len(matches) != test.k { 83 | t.Errorf("Test %d: Search expected %d result but received %d", ti+1, test.k, len(matches)) 84 | } 85 | heap := resultHeap{matches: matches} 86 | sort.Sort(heap) 87 | 88 | for i, match := range matches { 89 | if match.ID != inds[j][i] { 90 | t.Errorf("Test %d: For col #%d, Rank #%d - expected %v but found %v", ti+1, j, i, inds[j], matches) 91 | return 92 | } 93 | } 94 | }) 95 | } 96 | } 97 | 98 | func TestIndexerRemove(t *testing.T) { 99 | m := sparse.Random(sparse.DenseFormat, 100, 10, 1.0) 100 | 101 | tests := []struct { 102 | index Indexer 103 | }{ 104 | {index: NewLinearScanIndex(pairwise.CosineDistance)}, 105 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.CosineDistance)}, 106 | {index: NewLSHIndex(true, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.HammingDistance)}, 107 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewLSHForest(50, 20), pairwise.CosineDistance)}, 108 | } 109 | 110 | for ti, test := range tests { 111 | ColDo(m, func(j int, v mat.Vector) { 112 | test.index.Index(v, j) 113 | }) 114 | 115 | ColDo(m, func(j int, v mat.Vector) { 116 | test.index.Remove(j) 117 | matches := test.index.Search(v, 1) 118 | 119 | if len(matches) > 1 { 120 | t.Errorf("Test %d: Search expected less than 1 result but received %d", ti+1, len(matches)) 121 | } 122 | if len(matches) == 1 { 123 | if matches[0].ID == j { 124 | t.Errorf("Test %d: Search expected not to find %d but found %d", ti+1, j, matches[0].ID) 125 | } 126 | } 127 | }) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /weightings_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/james-bowman/sparse" 8 | "gonum.org/v1/gonum/mat" 9 | ) 10 | 11 | func 
TestTfidfTransformerFit(t *testing.T) { 12 | var tests = []struct { 13 | m int 14 | n int 15 | input []float64 16 | dim int 17 | transform []float64 18 | }{ 19 | { 20 | m: 6, n: 4, 21 | input: []float64{ 22 | 1, 3, 5, 2, 23 | 8, 1, 0, 0, 24 | 2, 1, 0, 1, 25 | 0, 0, 0, 0, 26 | 0, 0, 0, 1, 27 | 0, 1, 0, 0, 28 | }, 29 | dim: 6, 30 | transform: []float64{ 31 | 0, 32 | 0.5108256237659907, 33 | 0.22314355131420976, 34 | 1.6094379124341003, 35 | 0.9162907318741551, 36 | 0.9162907318741551, 37 | }, 38 | }, 39 | } 40 | 41 | for _, test := range tests { 42 | transformer := NewTfidfTransformer() 43 | input := mat.NewDense(test.m, test.n, test.input) 44 | 45 | transformer.Fit(input) 46 | 47 | weights := transformer.transform.Diagonal() 48 | for i, v := range weights { 49 | if v != test.transform[i] { 50 | t.Logf("Expected weights: \n%v\n but found: \n%v\n", 51 | test.transform, weights) 52 | t.Fail() 53 | } 54 | } 55 | } 56 | } 57 | 58 | func TestTfidfTransformerTransform(t *testing.T) { 59 | var tests = []struct { 60 | m int 61 | n int 62 | input []float64 63 | tm int 64 | tn int 65 | output []float64 66 | }{ 67 | { 68 | m: 6, n: 4, 69 | input: []float64{ 70 | 1, 3, 5, 2, 71 | 8, 1, 0, 0, 72 | 2, 1, 0, 1, 73 | 0, 0, 0, 0, 74 | 0, 0, 0, 1, 75 | 0, 1, 0, 0, 76 | }, 77 | tm: 6, tn: 4, 78 | output: []float64{ 79 | 0.000, 0.000, 0.000, 0.000, 80 | 4.087, 0.511, 0.000, 0.000, 81 | 0.446, 0.223, 0.000, 0.223, 82 | 0.000, 0.000, 0.000, 0.000, 83 | 0.000, 0.000, 0.000, 0.916, 84 | 0.000, 0.916, 0.000, 0.000, 85 | }, 86 | }, 87 | } 88 | 89 | for _, test := range tests { 90 | transformer := NewTfidfTransformer() 91 | input := mat.NewDense(test.m, test.n, test.input) 92 | output := mat.NewDense(test.tm, test.tn, test.output) 93 | 94 | result, err := transformer.FitTransform(input) 95 | 96 | if err != nil { 97 | t.Errorf("Failed tfidf fit transform caused by %v", err) 98 | } 99 | 100 | if !mat.EqualApprox(output, result, 0.001) { 101 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n", 102 | mat.Formatted(output), 103 | mat.Formatted(result)) 104 | t.Fail() 105 | } 106 | 107 | // test that subsequent transforms produce same result as initial 108 | result2, err := transformer.Transform(input) 109 | 110 | if err != nil { 111 | t.Errorf("Failed tfidf fit transform caused by %v", err) 112 | } 113 | 114 | if !mat.Equal(result, result2) { 115 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n", 116 | mat.Formatted(result), 117 | mat.Formatted(result2)) 118 | t.Fail() 119 | } 120 | } 121 | } 122 | 123 | func TestTfidfTransformerSaveLoad(t *testing.T) { 124 | var transforms = []struct { 125 | wantedTransform *sparse.DIA 126 | }{ 127 | { 128 | wantedTransform: sparse.NewDIA(2, 2, []float64{1, 5}), 129 | }, 130 | } 131 | 132 | for ti, test := range transforms { 133 | t.Logf("**** TestTfidfTransformerSave - Test Run %d.\n", ti+1) 134 | 135 | a := NewTfidfTransformer() 136 | a.transform = test.wantedTransform 137 | 138 | buf := new(bytes.Buffer) 139 | if err := a.Save(buf); err != nil { 140 | t.Errorf("Error encoding: %v\n", err) 141 | continue 142 | } 143 | 144 | b := NewTfidfTransformer() 145 | if err := b.Load(buf); err != nil { 146 | t.Errorf("Error unencoding: %v\n", err) 147 | continue 148 | } 149 | 150 | if !mat.Equal(a.transform, b.transform) { 151 | t.Logf("Wanted %v but got %v\n", mat.Formatted(a.transform), mat.Formatted(b.transform)) 152 | t.Fail() 153 | } 154 | } 155 | } 156 | 157 | func benchmarkTFIDFFitTransform(t Transformer, m, n int, b *testing.B) { 158 | mat := mat.NewDense(m, n, nil) 159 | 160 | for n 
:= 0; n < b.N; n++ { 161 | t.FitTransform(mat) 162 | } 163 | } 164 | 165 | func BenchmarkTFIDFFitTransform20x10(b *testing.B) { 166 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 20, 10, b) 167 | } 168 | func BenchmarkTFIDFFitTransform200x100(b *testing.B) { 169 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 200, 100, b) 170 | } 171 | func BenchmarkTFIDFFitTransform2000x1000(b *testing.B) { 172 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 2000, 1000, b) 173 | } 174 | func BenchmarkTFIDFFitTransform20000x10000(b *testing.B) { 175 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 20000, 10000, b) 176 | } 177 | -------------------------------------------------------------------------------- /weightings.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "io" 5 | "math" 6 | 7 | "github.com/james-bowman/sparse" 8 | "gonum.org/v1/gonum/mat" 9 | ) 10 | 11 | // TfidfTransformer takes a raw term document matrix and weights each raw term frequency 12 | // value depending upon how commonly it occurs across all documents within the corpus. 13 | // For example a very commonly occurring word like `the` is likely to occur in all documents 14 | // and so would be weighted down. 15 | // More precisely, TfidfTransformer applies a tf-idf algorithm to the matrix where each 16 | // term frequency is multiplied by the inverse document frequency. Inverse document 17 | // frequency is calculated as log(n/df) where df is the number of documents in which the 18 | // term occurs and n is the total number of documents within the corpus. We add 1 to both n 19 | // and df before division to prevent division by zero. 20 | type TfidfTransformer struct { 21 | transform *sparse.DIA 22 | } 23 | 24 | // NewTfidfTransformer constructs a new TfidfTransformer. 25 | func NewTfidfTransformer() *TfidfTransformer { 26 | return &TfidfTransformer{} 27 | } 28 | 29 | // Fit takes a training term document matrix, counts term occurrences across all documents 30 | // and constructs an inverse document frequency transform to apply to matrices in subsequent 31 | // calls to Transform(). 32 | func (t *TfidfTransformer) Fit(matrix mat.Matrix) Transformer { 33 | if t, isTypeConv := matrix.(sparse.TypeConverter); isTypeConv { 34 | matrix = t.ToCSR() 35 | } 36 | m, n := matrix.Dims() 37 | 38 | weights := make([]float64, m) 39 | var df int 40 | if csr, ok := matrix.(*sparse.CSR); ok { 41 | for i := 0; i < m; i++ { 42 | weights[i] = math.Log(float64(1+n) / float64(1+csr.RowNNZ(i))) 43 | } 44 | } else { 45 | for i := 0; i < m; i++ { 46 | df = 0 47 | for j := 0; j < n; j++ { 48 | if matrix.At(i, j) != 0 { 49 | df++ 50 | } 51 | } 52 | weights[i] = math.Log(float64(1+n) / float64(1+df)) 53 | } 54 | } 55 | 56 | // build a diagonal matrix from array of term weighting values for subsequent 57 | // multiplication with term document matrics 58 | t.transform = sparse.NewDIA(m, m, weights) 59 | 60 | return t 61 | } 62 | 63 | // Transform applies the inverse document frequency (IDF) transform by multiplying 64 | // each term frequency by its corresponding IDF value. This has the effect of weighting 65 | // each term frequency according to how often it appears across the whole document corpus 66 | // so that naturally frequent occurring words are given less weight than uncommon ones. 67 | // The returned matrix is a sparse matrix type. 
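//
// A minimal usage sketch, assuming counts is a term document matrix of raw term
// frequencies (e.g. the output of a CountVectoriser):
//
//	transformer := NewTfidfTransformer()
//	tfidf, err := transformer.FitTransform(counts)
//	if err != nil {
//		// handle error
//	}
//	// tfidf now holds the idf weighted term frequencies as a sparse matrix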
68 | func (t *TfidfTransformer) Transform(matrix mat.Matrix) (mat.Matrix, error) { 69 | if t, isTypeConv := matrix.(sparse.TypeConverter); isTypeConv { 70 | matrix = t.ToCSR() 71 | } 72 | var product sparse.CSR 73 | 74 | // simply multiply the matrix by our idf transform (the diagonal matrix of term weights) 75 | product.Mul(t.transform, matrix) 76 | 77 | // todo: possibly L2 norm matrix to remove any bias caused by documents of different 78 | // lengths where longer documents naturally have more words and so higher word counts 79 | 80 | return &product, nil 81 | } 82 | 83 | // FitTransform is exactly equivalent to calling Fit() followed by Transform() on the 84 | // same matrix. This is a convenience where separate training data is not being 85 | // used to fit the model i.e. the model is fitted on the fly to the test data. 86 | // The returned matrix is a sparse matrix type. 87 | func (t *TfidfTransformer) FitTransform(matrix mat.Matrix) (mat.Matrix, error) { 88 | if t, isTypeConv := matrix.(sparse.TypeConverter); isTypeConv { 89 | matrix = t.ToCSR() 90 | } 91 | return t.Fit(matrix).Transform(matrix) 92 | } 93 | 94 | // Save binary serialises the model and writes it into w. This is useful for persisting 95 | // a trained model to disk so that it may be loaded (using the Load() method)in another 96 | // context (e.g. production) for reproducible results. 97 | func (t TfidfTransformer) Save(w io.Writer) error { 98 | _, err := t.transform.MarshalBinaryTo(w) 99 | 100 | return err 101 | } 102 | 103 | // Load binary deserialises the previously serialised model into the receiver. This is 104 | // useful for loading a previously trained and saved model from another context 105 | // (e.g. offline training) for use within another context (e.g. production) for 106 | // reproducible results. Load should only be performed with trusted data. 
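//
// A minimal round trip sketch, assuming trained is a fitted TfidfTransformer and
// w and r are an io.Writer and io.Reader backed by the same storage (for example
// a bytes.Buffer or a file on disk):
//
//	if err := trained.Save(w); err != nil {
//		// handle error
//	}
//	restored := NewTfidfTransformer()
//	if err := restored.Load(r); err != nil {
//		// handle error
//	}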
107 | func (t *TfidfTransformer) Load(r io.Reader) error { 108 | var model sparse.DIA 109 | 110 | if _, err := model.UnmarshalBinaryFrom(r); err != nil { 111 | return err 112 | } 113 | t.transform = &model 114 | 115 | return nil 116 | } 117 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package nlp_test 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/james-bowman/nlp" 7 | "github.com/james-bowman/nlp/measures/pairwise" 8 | "gonum.org/v1/gonum/mat" 9 | ) 10 | 11 | func Example() { 12 | testCorpus := []string{ 13 | "The quick brown fox jumped over the lazy dog", 14 | "hey diddle diddle, the cat and the fiddle", 15 | "the cow jumped over the moon", 16 | "the little dog laughed to see such fun", 17 | "and the dish ran away with the spoon", 18 | } 19 | 20 | var stopWords = []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", 
"whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"} 21 | 22 | query := "the brown fox ran around the dog" 23 | 24 | vectoriser := nlp.NewCountVectoriser(stopWords...) 25 | transformer := nlp.NewTfidfTransformer() 26 | 27 | // set k (the number of dimensions following truncation) to 4 28 | reducer := nlp.NewTruncatedSVD(4) 29 | 30 | lsiPipeline := nlp.NewPipeline(vectoriser, transformer, reducer) 31 | 32 | // Transform the corpus into an LSI fitting the model to the documents in the process 33 | lsi, err := lsiPipeline.FitTransform(testCorpus...) 34 | if err != nil { 35 | fmt.Printf("Failed to process documents because %v", err) 36 | return 37 | } 38 | 39 | // run the query through the same pipeline that was fitted to the corpus and 40 | // to project it into the same dimensional space 41 | queryVector, err := lsiPipeline.Transform(query) 42 | if err != nil { 43 | fmt.Printf("Failed to process documents because %v", err) 44 | return 45 | } 46 | 47 | // iterate over document feature vectors (columns) in the LSI matrix and compare 48 | // with the query vector for similarity. Similarity is determined by the difference 49 | // between the angles of the vectors known as the cosine similarity 50 | highestSimilarity := -1.0 51 | var matched int 52 | _, docs := lsi.Dims() 53 | for i := 0; i < docs; i++ { 54 | similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), lsi.(mat.ColViewer).ColView(i)) 55 | if similarity > highestSimilarity { 56 | matched = i 57 | highestSimilarity = similarity 58 | } 59 | } 60 | 61 | fmt.Printf("Matched '%s'", testCorpus[matched]) 62 | // Output: Matched 'The quick brown fox jumped over the lazy dog' 63 | } 64 | -------------------------------------------------------------------------------- /vectorisers_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/james-bowman/sparse" 7 | ) 8 | 9 | var stopWords = []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", 
"meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"} 10 | 11 | var trainSet = []string{ 12 | "The quick brown fox jumped over the. Lazy dog", 13 | "the brown Cat sat on the mat", 14 | "the little dog laughed to see such fun", 15 | "laughing cow", 16 | "the cow ran around the dog", 17 | "spoon dish and plate", 18 | } 19 | 20 | var testSet = []string{ 21 | "hey diddle diddle", 22 | "the cat and the fiddle", 23 | "the cow jumped over the moon", 24 | "the quick brown fox jumped over the. Lazy dog", 25 | "The little dog laughed to see such fun", 26 | "The dish ran away with the spoon", 27 | } 28 | 29 | func TestCountVectoriserFit(t *testing.T) { 30 | var tests = []struct { 31 | train []string 32 | stop []string 33 | vocabSize int 34 | }{ 35 | {trainSet, []string{}, 26}, 36 | {trainSet[0:1], []string{}, 8}, 37 | {trainSet, stopWords, 18}, 38 | } 39 | 40 | for testRun, test := range tests { 41 | t.Logf("**** Test Run %d.\n", testRun+1) 42 | vectoriser := NewCountVectoriser(test.stop...) 43 | 44 | vectoriser.Fit(test.train...) 45 | 46 | if len(vectoriser.Vocabulary) != test.vocabSize { 47 | t.Logf("Expected training dataset %v of size %d but found vocabulary %v of size %d", 48 | test.train, test.vocabSize, vectoriser.Vocabulary, len(vectoriser.Vocabulary)) 49 | t.Fail() 50 | } 51 | } 52 | } 53 | func TestCountVectoriserTransform(t *testing.T) { 54 | var tests = []struct { 55 | train []string 56 | vocabSize int 57 | stop []string 58 | test []string 59 | }{ 60 | {trainSet, 26, []string{}, testSet}, 61 | {trainSet[0:1], 8, []string{}, testSet[0:3]}, 62 | {testSet, 26, []string{}, testSet}, 63 | {testSet, 19, stopWords, testSet}, 64 | } 65 | 66 | for testRun, test := range tests { 67 | t.Logf("**** Test Run %d.\n", testRun+1) 68 | 69 | vectoriser := NewCountVectoriser(test.stop...) 70 | vectoriser.Fit(test.train...) 71 | 72 | vec, err := vectoriser.Transform(test.test...) 
73 | 74 | if err != nil { 75 | t.Errorf("Error fitting and applying vectoriser caused by %v", err) 76 | } 77 | 78 | m, n := vec.Dims() 79 | 80 | if m != test.vocabSize || n != len(test.test) { 81 | t.Logf("Expected matrix %d x %d but found %d x %d", test.vocabSize, len(test.test), m, n) 82 | t.Fail() 83 | } 84 | } 85 | } 86 | 87 | func TestHashingVectoriserTransform(t *testing.T) { 88 | var tests = []struct { 89 | train []string 90 | nnz int 91 | features int 92 | stop []string 93 | test []string 94 | }{ 95 | {trainSet, 33, 260000, []string{}, testSet}, 96 | {trainSet[0:1], 11, 260000, []string{}, testSet[0:3]}, 97 | {testSet, 33, 260001, []string{}, testSet}, 98 | {testSet, 21, 260000, stopWords, testSet}, 99 | } 100 | 101 | for testRun, test := range tests { 102 | t.Logf("**** Test Run %d.\n", testRun+1) 103 | vectoriser := NewHashingVectoriser(test.features, test.stop...) 104 | vectoriser.Fit(test.train...) 105 | 106 | vec, err := vectoriser.Transform(test.test...) 107 | 108 | if err != nil { 109 | t.Errorf("Error fitting and applying vectoriser caused by %v", err) 110 | } 111 | 112 | m, n := vec.Dims() 113 | 114 | if m != test.features || n != len(test.test) || vec.(sparse.Sparser).NNZ() != test.nnz { 115 | t.Logf("Expected matrix %d x %d with NNZ = %d but found %d x %d with NNZ = %d", 116 | test.features, 117 | len(test.test), 118 | test.nnz, 119 | m, n, 120 | vec.(sparse.Sparser).NNZ()) 121 | t.Fail() 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | [![GoDoc](https://godoc.org/github.com/james-bowman/nlp?status.svg)](https://godoc.org/github.com/james-bowman/nlp) 4 | [![Build Status](https://travis-ci.org/james-bowman/nlp.svg?branch=master)](https://travis-ci.org/james-bowman/nlp) 5 | [![Go Report Card](https://goreportcard.com/badge/github.com/james-bowman/nlp)](https://goreportcard.com/report/github.com/james-bowman/nlp) 6 | [![codecov](https://codecov.io/gh/james-bowman/nlp/branch/master/graph/badge.svg)](https://codecov.io/gh/james-bowman/nlp) 7 | [![Mentioned in Awesome Go](https://awesome.re/mentioned-badge-flat.svg)](https://github.com/avelino/awesome-go) 8 | [![Sourcegraph](https://sourcegraph.com/github.com/james-bowman/nlp/-/badge.svg)](https://sourcegraph.com/github.com/james-bowman/nlp?badge) 9 | 10 | 11 | nlp 12 | 13 | Implementations of selected machine learning algorithms for natural language processing in golang. The primary focus for the package is the statistical semantics of plain-text documents supporting semantic analysis and retrieval of semantically similar documents. 14 | 15 | Built upon the [Gonum](https://www.gonum.org/) package for linear algebra and scientific computing with some inspiration taken from Python's [scikit-learn](http://scikit-learn.org/stable/) and [Gensim](https://radimrehurek.com/gensim/). 16 | 17 | Check out [the companion blog post](http://www.jamesbowman.me/post/semantic-analysis-of-webpages-with-machine-learning-in-go/) or [the Go documentation page](https://godoc.org/github.com/james-bowman/nlp) for full usage and examples. 18 | 19 |
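A condensed version of the package's own example (see `example_test.go`), fitting an LSI pipeline to a corpus and then comparing a query against it; `testCorpus`, `stopWords` and `query` are assumed to be defined as in that example and error handling is omitted for brevity:

```go
vectoriser := nlp.NewCountVectoriser(stopWords...)
transformer := nlp.NewTfidfTransformer()
reducer := nlp.NewTruncatedSVD(4)
lsiPipeline := nlp.NewPipeline(vectoriser, transformer, reducer)

// fit the model to the corpus and project the documents into LSI space
lsi, _ := lsiPipeline.FitTransform(testCorpus...)

// project the query into the same space
queryVector, _ := lsiPipeline.Transform(query)

// compare the projected query with the first document column; higher is more similar
similarity := pairwise.CosineSimilarity(
	queryVector.(mat.ColViewer).ColView(0),
	lsi.(mat.ColViewer).ColView(0),
)
```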
20 | 21 | ## Features 22 | 23 | * [LSA (Latent Semantic Analysis aka Latent Semantic Indexing (LSI))][LSA] implementation using truncated [SVD (Singular Value Decomposition)](https://en.wikipedia.org/wiki/Singular-value_decomposition) for dimensionality reduction. 24 | * Fast comparison and retrieval of semantically similar documents using the [SimHash](https://en.wikipedia.org/wiki/SimHash) (random hyperplanes/[sign random projection](https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection)) algorithm with multi-index and Forest schemes for [LSH (Locality Sensitive Hashing)](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) to support fast, approximate cosine similarity/angular distance comparisons and approximate nearest neighbour search using significantly less memory and processing time. 25 | * [Random Indexing (RI)](https://en.wikipedia.org/wiki/Random_indexing) and Reflective Random Indexing (RRI) (which extends RI to support indirect inference) for scalable [Latent Semantic Analysis (LSA)][LSA] over large, web-scale corpora. 26 | * [Latent Dirichlet Allocation (LDA)](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) using a parallelised implementation of the fast [SCVB0 (Stochastic Collapsed Variational Bayesian inference)][SCVB0] algorithm for unsupervised topic extraction. 27 | * [PCA (Principal Component Analysis)](https://en.wikipedia.org/wiki/Principal_component_analysis) 28 | * [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) weighting to account for frequently occurring words 29 | * [Sparse matrix](http://github.com/james-bowman/sparse) implementations used for more efficient memory usage and processing over large document corpora. 30 | * Stop word removal to remove frequently occurring English words e.g. "the", "and" 31 | * [Feature hashing](https://en.wikipedia.org/wiki/Feature_hashing) ('the hashing trick') implementation (using [MurmurHash3](http://github.com/spaolacci/murmur3)) for reduced memory requirements and reduced reliance on training data 32 | * Similarity/distance measures to calculate the similarity/distance between feature vectors. 33 | 34 | ## Planned 35 | 36 | * Expanded persistence support 37 | * Stemming to treat words with a common root as the same e.g. "go" and "going" 38 | * Clustering algorithms e.g. Hierarchical, K-means, etc. 39 | * Classification algorithms e.g. SVM, KNN, random forest, etc. 40 | 41 | ## References 42 | 43 | 1. [Rosario, Barbara. Latent Semantic Indexing: An overview. INFOSYS 240 Spring 2000](http://people.ischool.berkeley.edu/~rosario/projects/LSI.pdf) 44 | 1. [Latent Semantic Analysis, a scholarpedia article on LSA written by Tom Landauer, one of the creators of LSA.](http://www.scholarpedia.org/article/Latent_semantic_analysis) 45 | 1. [Thomo, Alex. Latent Semantic Analysis (Tutorial).](http://webhome.cs.uvic.ca/~thomo/svd.pdf) 46 | 1. [Latent Semantic Indexing. Stanford NLP Course](http://nlp.stanford.edu/IR-book/html/htmledition/latent-semantic-indexing-1.html) 47 | 1. [Charikar, Moses S. "Similarity Estimation Techniques from Rounding Algorithms" in Proceedings of the thirty-fourth annual ACM symposium on Theory of computing - STOC ’02, 2002, p. 380.](https://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CharikarEstim.pdf) 48 | 1. [M. Bawa, T. Condie, and P. Ganesan, “LSH forest: self-tuning indexes for similarity search,” Proc. 14th Int. Conf. World Wide Web - WWW ’05, p. 651, 2005.](http://dl.acm.org/citation.cfm?id=1060745.1060840) 49 | 1. [A. Gionis, P. Indyk, and R.
Motwani, “Similarity Search in High Dimensions via Hashing,” VLDB ’99 Proc. 25th Int. Conf. Very Large Data Bases, vol. 99, no. 1, pp. 518–529, 1999.](http://www.cs.princeton.edu/courses/archive/spring13/cos598C/Gionis.pdf) 50 | 1. [Kanerva, Pentti, Kristoferson, Jan and Holst, Anders (2000). Random Indexing of Text Samples for Latent Semantic Analysis](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.4.6523&rep=rep1&type=pdf) 51 | 1. [Rangan, Venkat. Discovery of Related Terms in a corpus using Reflective Random Indexing](https://www.umiacs.umd.edu/~oard/desi4/papers/rangan.pdf) 52 | 1. [Vasuki, Vidya and Cohen, Trevor. Reflective random indexing for semi-automatic indexing of the biomedical literature](https://ac.els-cdn.com/S1532046410000481/1-s2.0-S1532046410000481-main.pdf?_tid=f31f92e8-028a-11e8-8c31-00000aab0f6c&acdnat=1516965824_e24a804445fff1744281ca6f5898a3a4) 53 | 1. [QasemiZadeh, Behrang and Handschuh, Siegfried. Random Indexing Explained with High Probability](http://pars.ie/publications/papers/pre-prints/random-indexing-dr-explained.pdf) 54 | 1. [Foulds, James; Boyles, Levi; Dubois, Christopher; Smyth, Padhraic; Welling, Max (2013). Stochastic Collapsed Variational Bayesian Inference for Latent Dirichlet Allocation][SCVB0] 55 | 56 | 59 | 60 | [LSA]: https://en.wikipedia.org/wiki/Latent_semantic_analysis 61 | [SCVB0]: https://arxiv.org/pdf/1305.2452 62 | -------------------------------------------------------------------------------- /dimreduction.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "io" 7 | 8 | "github.com/james-bowman/sparse" 9 | "gonum.org/v1/gonum/mat" 10 | "gonum.org/v1/gonum/stat" 11 | ) 12 | 13 | // TruncatedSVD implements the Singular Value Decomposition factorisation of matrices. 14 | // This produces an approximation of the input matrix at a lower rank. This is a core 15 | // component of LSA (Latent Semantic Analysis). 16 | type TruncatedSVD struct { 17 | // Components is the truncated term matrix (matrix U of the Singular Value Decomposition 18 | // (A=USV^T)). The matrix will be of size m, k where m = the number of unique terms 19 | // in the training data and k = the number of elements to truncate to (specified by 20 | // attribute K) or m or n (the number of documents in the training data) whichever of 21 | // the 3 values is smaller. 22 | Components *mat.Dense 23 | 24 | // K is the number of dimensions to which the output, transformed, matrix should be 25 | // truncated. The matrix output by the FitTransform() and Transform() methods will 26 | // be min(m, n, K) rows by n columns, where n is the number of columns in the original, 27 | // input matrix and min(m, n, K) is the lowest value of m, n, K where m is the number of 28 | // rows in the original, input matrix. 29 | K int 30 | } 31 | 32 | // NewTruncatedSVD creates a new TruncatedSVD transformer with K (the truncated 33 | // dimensionality) being set to the specified value k 34 | func NewTruncatedSVD(k int) *TruncatedSVD { 35 | return &TruncatedSVD{K: k} 36 | } 37 | 38 | // Fit performs the SVD factorisation on the input training data matrix, mat and 39 | // stores the output term matrix as a transform to apply to matrices passed to subsequent calls to the Transform() method.
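//
// A minimal usage sketch, assuming tdm is an m x n term document matrix:
//
//	svd := NewTruncatedSVD(100)
//	reduced, err := svd.FitTransform(tdm)
//	if err != nil {
//		// handle error
//	}
//	// reduced has min(m, n, 100) rows and n columns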
40 | func (t *TruncatedSVD) Fit(mat mat.Matrix) Transformer { 41 | if _, err := t.FitTransform(mat); err != nil { 42 | panic("nlp: Failed to fit truncated SVD because " + err.Error()) 43 | } 44 | return t 45 | } 46 | 47 | // Transform applies the transform decomposed from the training data matrix in Fit() 48 | // to the input matrix. The resulting output matrix will be the closest approximation 49 | // to the input matrix at a reduced rank. The returned matrix is a dense matrix type. 50 | func (t *TruncatedSVD) Transform(m mat.Matrix) (mat.Matrix, error) { 51 | var product mat.Dense 52 | 53 | product.Mul(t.Components.T(), m) 54 | 55 | return &product, nil 56 | } 57 | 58 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 59 | // on the same matrix. This is a useful shortcut where separate training data is not being 60 | // used to fit the model i.e. the model is fitted on the fly to the test data. 61 | // The returned matrix is a dense matrix type. 62 | func (t *TruncatedSVD) FitTransform(m mat.Matrix) (mat.Matrix, error) { 63 | var svd mat.SVD 64 | if ok := svd.Factorize(m, mat.SVDThin); !ok { 65 | return nil, fmt.Errorf("Failed SVD Factorisation of working matrix") 66 | } 67 | s, u, v := t.extractSVD(&svd) 68 | 69 | r, c := m.Dims() 70 | min := minimum(t.K, r, c) 71 | 72 | // truncate U and V matrices to k << min(m, n) 73 | uk := u.Slice(0, r, 0, min) 74 | vk := v.Slice(0, c, 0, min) 75 | 76 | t.Components = uk.(*mat.Dense) 77 | 78 | // multiply Sigma by transpose of V. As sigma is a symmetrical (square) diagonal matrix it is 79 | // more efficient to simply multiply each element from the array of diagonal values with each 80 | // element from the matrix V rather than multiplying out the non-zero values from off the diagonal. 81 | var product mat.Dense 82 | product.Apply(func(i, j int, v float64) float64 { 83 | return (v * s[i]) 84 | }, vk.T()) 85 | 86 | return &product, nil 87 | } 88 | 89 | func minimum(k, m, n int) int { 90 | return min(k, min(m, n)) 91 | } 92 | 93 | func min(m, n int) int { 94 | if m < n { 95 | return m 96 | } 97 | return n 98 | } 99 | 100 | func (t *TruncatedSVD) extractSVD(svd *mat.SVD) (s []float64, u, v *mat.Dense) { 101 | var um, vm mat.Dense 102 | svd.UTo(&um) 103 | svd.VTo(&vm) 104 | s = svd.Values(nil) 105 | return s, &um, &vm 106 | } 107 | 108 | // Save binary serialises the model and writes it into w. This is useful for persisting 109 | // a trained model to disk so that it may be loaded (using the Load() method)in another 110 | // context (e.g. production) for reproducible results. 111 | func (t TruncatedSVD) Save(w io.Writer) error { 112 | var buf [8]byte 113 | binary.LittleEndian.PutUint64(buf[:], uint64(t.K)) 114 | if _, err := w.Write(buf[:]); err != nil { 115 | return err 116 | } 117 | 118 | _, err := t.Components.MarshalBinaryTo(w) 119 | 120 | return err 121 | } 122 | 123 | // Load binary deserialises the previously serialised model into the receiver. This is 124 | // useful for loading a previously trained and saved model from another context 125 | // (e.g. offline training) for use within another context (e.g. production) for 126 | // reproducible results. Load should only be performed with trusted data. 
127 | func (t *TruncatedSVD) Load(r io.Reader) error { 128 | var n int 129 | var buf [8]byte 130 | var err error 131 | for n < len(buf) && err == nil { 132 | var nn int 133 | nn, err = r.Read(buf[n:]) 134 | n += nn 135 | } 136 | if err == io.EOF { 137 | return io.ErrUnexpectedEOF 138 | } 139 | if err != nil { 140 | return err 141 | } 142 | k := int(binary.LittleEndian.Uint64(buf[:])) 143 | 144 | var model mat.Dense 145 | if _, err := model.UnmarshalBinaryFrom(r); err != nil { 146 | return err 147 | } 148 | 149 | t.K = k 150 | t.Components = &model 151 | 152 | return nil 153 | } 154 | 155 | // PCA calculates the principal components of a matrix, or the axes of greatest variance, and 156 | // then projects matrices onto those axes. 157 | // See https://en.wikipedia.org/wiki/Principal_component_analysis for further details. 158 | type PCA struct { 159 | // K is the number of components 160 | K int 161 | pc *stat.PC 162 | } 163 | 164 | // NewPCA constructs a new Principal Component Analysis transformer to reduce the dimensionality, 165 | // projecting matrices onto the axes of greatest variance. 166 | func NewPCA(k int) *PCA { 167 | return &PCA{K: k, pc: &stat.PC{}} 168 | } 169 | 170 | // Fit calculates the principal component directions (axes of greatest variance) within the 171 | // training data, which can then be used to project matrices onto those principal components using 172 | // the Transform() method. 173 | func (p *PCA) Fit(m mat.Matrix) Transformer { 174 | if ok := p.pc.PrincipalComponents(m.T(), nil); !ok { 175 | panic("nlp: PCA analysis failed during fitting") 176 | } 177 | 178 | return p 179 | } 180 | 181 | // Transform projects the matrix onto the first K principal components calculated during training 182 | // (the Fit() method). The returned matrix will be of reduced dimensionality compared to the input 183 | // (K x c compared to r x c of the input). 184 | func (p *PCA) Transform(m mat.Matrix) (mat.Matrix, error) { 185 | r, _ := m.Dims() 186 | 187 | //var proj mat.Dense 188 | var proj sparse.CSR 189 | var dst mat.Dense 190 | p.pc.VectorsTo(&dst) 191 | proj.Mul(m.T(), dst.Slice(0, r, 0, p.K)) 192 | 193 | // matrix is r x c (t x d) 194 | // m.T() = c x r (d x t) 195 | // slice c x K 196 | 197 | // (ar x ac) * (br x bc) = ar x bc 198 | // ac == br 199 | return proj.T(), nil 200 | } 201 | 202 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 203 | // on the same matrix. This is a useful shortcut where separate training data is not being 204 | // used to fit the model i.e. the model is fitted on the fly to the test data. 205 | func (p *PCA) FitTransform(m mat.Matrix) (mat.Matrix, error) { 206 | return p.Fit(m).Transform(m) 207 | } 208 | 209 | // ExplainedVariance returns a slice of float64 values representing the variances of the 210 | // principal component scores. 211 | func (p *PCA) ExplainedVariance() []float64 { 212 | return p.pc.VarsTo(nil) 213 | } 214 | -------------------------------------------------------------------------------- /index.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "container/heap" 5 | "sync" 6 | 7 | "github.com/james-bowman/nlp/measures/pairwise" 8 | "github.com/james-bowman/sparse" 9 | "gonum.org/v1/gonum/mat" 10 | ) 11 | 12 | // Match represents a matching item for nearest neighbour similarity searches. 13 | // It contains both the ID of the matching item and the distance from the queried item.
14 | // The distance is represented as a score from 0 (exact match) to 1 (orthogonal) 15 | // depending upon the metric used. 16 | type Match struct { 17 | Distance float64 18 | ID interface{} 19 | } 20 | 21 | // resultHeap is a priority queue (a max-heap keyed on distance, with the most distant match at the 22 | // root) used to compile the top-k matches whilst performing nearest neighbour similarity searches. 23 | type resultHeap struct { 24 | matches []Match 25 | } 26 | 27 | func (r resultHeap) Len() int { return len(r.matches) } 28 | 29 | func (r resultHeap) Less(i, j int) bool { return r.matches[i].Distance > r.matches[j].Distance } 30 | 31 | func (r resultHeap) Swap(i, j int) { r.matches[i], r.matches[j] = r.matches[j], r.matches[i] } 32 | 33 | func (r *resultHeap) Push(x interface{}) { 34 | r.matches = append(r.matches, x.(Match)) 35 | } 36 | 37 | func (r *resultHeap) Pop() interface{} { 38 | old := r.matches 39 | n := len(old) 40 | x := old[n-1] 41 | r.matches = old[0 : n-1] 42 | return x 43 | } 44 | 45 | // Indexer indexes vectors to support Nearest Neighbour (NN) similarity searches across 46 | // the indexed vectors. 47 | type Indexer interface { 48 | Index(v mat.Vector, id interface{}) 49 | Search(q mat.Vector, k int) []Match 50 | Remove(id interface{}) 51 | } 52 | 53 | // LinearScanIndex supports Nearest Neighbour (NN) similarity searches across indexed 54 | // vectors performing queries in O(n) and requiring O(n) storage. As the name implies, 55 | // LinearScanIndex performs a linear scan across all indexed vectors comparing them 56 | // each in turn with the specified query vector using the configured pairwise distance 57 | // metric. LinearScanIndex is accurate and will always return the true top-k nearest 58 | // neighbours as opposed to some other types of index, like LSHIndex, 59 | // which perform Approximate Nearest Neighbour (ANN) searches and trade some recall 60 | // accuracy for performance over large scale datasets. 61 | type LinearScanIndex struct { 62 | lock sync.RWMutex 63 | signatures []mat.Vector 64 | ids []interface{} 65 | distance pairwise.Comparer 66 | } 67 | 68 | // NewLinearScanIndex constructs a new empty LinearScanIndex which will use the specified 69 | // pairwise distance metric to determine nearest neighbours based on similarity. 70 | func NewLinearScanIndex(compareFN pairwise.Comparer) *LinearScanIndex { 71 | return &LinearScanIndex{distance: compareFN} 72 | } 73 | 74 | // Index adds the specified vector v with associated id to the index. 75 | func (b *LinearScanIndex) Index(v mat.Vector, id interface{}) { 76 | b.lock.Lock() 77 | b.signatures = append(b.signatures, v) 78 | b.ids = append(b.ids, id) 79 | b.lock.Unlock() 80 | } 81 | 82 | // Search searches for the top-k nearest neighbours in the index. The method 83 | // returns up to the top-k most similar items in unsorted order. The method may 84 | // return fewer than k items if fewer than k neighbours are found.
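// An illustrative sketch of indexing and querying (distanceFn stands for any pairwise.Comparer
// of your choosing; the vectors and IDs are examples):
//
//	idx := NewLinearScanIndex(distanceFn)
//	idx.Index(docVector, "doc-1")
//	idx.Index(otherVector, "doc-2")
//	matches := idx.Search(queryVector, 10) // up to 10 nearest neighbours, unsorted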
85 | func (b *LinearScanIndex) Search(qv mat.Vector, k int) []Match { 86 | b.lock.RLock() 87 | defer b.lock.RUnlock() 88 | 89 | size := len(b.signatures) 90 | 91 | var point int 92 | var results resultHeap 93 | results.matches = make([]Match, 0, k) 94 | 95 | for point = 0; point < k && point < size; point++ { 96 | mv := b.signatures[point] 97 | match := Match{Distance: b.distance(qv, mv), ID: b.ids[point]} 98 | results.matches = append(results.matches, match) 99 | } 100 | if len(results.matches) < k { 101 | return results.matches 102 | } 103 | heap.Init(&results) 104 | var dist float64 105 | for i := point; i < size; i++ { 106 | mv := b.signatures[i] 107 | dist = b.distance(qv, mv) 108 | if dist <= results.matches[0].Distance { 109 | heap.Pop(&results) 110 | heap.Push(&results, Match{Distance: dist, ID: b.ids[i]}) 111 | } 112 | } 113 | 114 | return results.matches 115 | } 116 | 117 | // Remove removes the vector with the specified id from the index. If no vector 118 | // is found with the specified id the method will simply do nothing. 119 | func (b *LinearScanIndex) Remove(id interface{}) { 120 | b.lock.Lock() 121 | defer b.lock.Unlock() 122 | 123 | for i, v := range b.ids { 124 | if v == id { 125 | copy(b.signatures[i:], b.signatures[i+1:]) 126 | b.signatures[len(b.signatures)-1] = nil 127 | b.signatures = b.signatures[:len(b.signatures)-1] 128 | 129 | copy(b.ids[i:], b.ids[i+1:]) 130 | b.ids[len(b.ids)-1] = nil 131 | b.ids = b.ids[:len(b.ids)-1] 132 | 133 | return 134 | } 135 | } 136 | } 137 | 138 | // Hasher interface represents a Locality Sensitive Hashing algorithm whereby 139 | // the proximity of data points is preserved in the hash space i.e. similar data 140 | // points will be hashed to values close together in the hash space. 141 | type Hasher interface { 142 | // Hash hashes the input vector into a BinaryVector hash representation 143 | Hash(mat.Vector) *sparse.BinaryVec 144 | } 145 | 146 | // LSHScheme interface represents LSH indexing schemes to support Approximate Nearest 147 | // Neighbour (ANN) search. 148 | type LSHScheme interface { 149 | // Put stores the specified LSH signature and associated ID in the LSH index 150 | Put(id interface{}, signature *sparse.BinaryVec) 151 | 152 | // GetCandidates returns the IDs of candidate nearest neighbours. It is up to 153 | // the calling code to further filter these candidates based on distance to arrive 154 | // at the top-k approximate nearest neighbours. The number of candidates returned 155 | // may be smaller or larger than k. 156 | GetCandidates(query *sparse.BinaryVec, k int) []interface{} 157 | 158 | // Remove removes the specified item from the LSH index 159 | Remove(id interface{}) 160 | } 161 | 162 | // LSHIndex is an LSH (Locality Sensitive Hashing) based index supporting Approximate 163 | // Nearest Neighbour (ANN) search in O(log n). The storage required by the index will 164 | // depend upon the underlying LSH scheme used but will typically be higher than O(n). 165 | // In use cases where accurate Nearest Neighbour search is required other types of 166 | // index should be considered like LinearScanIndex. 167 | type LSHIndex struct { 168 | lock sync.RWMutex 169 | isApprox bool 170 | hasher Hasher 171 | scheme LSHScheme 172 | signatures map[interface{}]mat.Vector 173 | distance pairwise.Comparer 174 | } 175 | 176 | // NewLSHIndex creates a new LSHIndex. 
When queried, the initial candidate 177 | // nearest neighbours returned by the underlying LSH indexing algorithm 178 | // are further filtered by comparing distances to the query vector using the supplied 179 | // distance metric. If approx is true, the filtering comparison is performed on the 180 | // hashes and if approx is false, then the comparison is performed on the original 181 | // vectors instead. This will have time and storage implications as comparing the 182 | // original vectors will be more accurate but slower and require the original vectors 183 | // be stored for the comparison. The LSH algorithm and underlying LSH indexing 184 | // algorithm may both be specified as hasher and store parameters respectively. 185 | func NewLSHIndex(approx bool, hasher Hasher, store LSHScheme, distance pairwise.Comparer) *LSHIndex { 186 | index := LSHIndex{ 187 | isApprox: approx, 188 | hasher: hasher, 189 | scheme: store, 190 | signatures: make(map[interface{}]mat.Vector), 191 | distance: distance, 192 | } 193 | 194 | return &index 195 | } 196 | 197 | // Index indexes the supplied vector along with its associated ID. 198 | func (l *LSHIndex) Index(v mat.Vector, id interface{}) { 199 | h := l.hasher.Hash(v) 200 | 201 | l.lock.Lock() 202 | defer l.lock.Unlock() 203 | 204 | l.scheme.Put(id, h) 205 | if l.isApprox { 206 | l.signatures[id] = h 207 | } else { 208 | l.signatures[id] = v 209 | } 210 | } 211 | 212 | // Search searches for the top-k approximate nearest neighbours in the index. The 213 | // method returns up to the top-k most similar items in unsorted order. The method may 214 | // return fewer than k items if less than k neighbours are found. 215 | func (l *LSHIndex) Search(q mat.Vector, k int) []Match { 216 | hv := l.hasher.Hash(q) 217 | 218 | l.lock.RLock() 219 | defer l.lock.RUnlock() 220 | 221 | candidateIDs := l.scheme.GetCandidates(hv, k) 222 | size := len(candidateIDs) 223 | 224 | var qv mat.Vector 225 | if l.isApprox { 226 | qv = hv 227 | } else { 228 | qv = q 229 | } 230 | 231 | var point int 232 | var results resultHeap 233 | results.matches = make([]Match, 0, k) 234 | 235 | for point = 0; point < k && point < size; point++ { 236 | mv := l.signatures[candidateIDs[point]] 237 | match := Match{Distance: l.distance(qv, mv), ID: candidateIDs[point]} 238 | results.matches = append(results.matches, match) 239 | } 240 | if len(results.matches) < k { 241 | return results.matches 242 | } 243 | heap.Init(&results) 244 | var dist float64 245 | for i := point; i < size; i++ { 246 | mv := l.signatures[candidateIDs[i]] 247 | dist = l.distance(qv, mv) 248 | if dist <= results.matches[0].Distance { 249 | heap.Pop(&results) 250 | heap.Push(&results, Match{Distance: dist, ID: candidateIDs[i]}) 251 | } 252 | } 253 | 254 | return results.matches 255 | } 256 | 257 | // Remove removes the vector with the specified id from the index. If no vector 258 | // is found with the specified id the method will simply do nothing. 259 | func (l *LSHIndex) Remove(id interface{}) { 260 | l.lock.Lock() 261 | defer l.lock.Unlock() 262 | 263 | delete(l.signatures, id) 264 | l.scheme.Remove(id) 265 | } 266 | -------------------------------------------------------------------------------- /lsh.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | radix "github.com/armon/go-radix" 8 | "github.com/james-bowman/sparse" 9 | ) 10 | 11 | // lshTableBucket represents a hash table bucket used for ClassicLSH. 
The bucket 12 | // is a slice of IDs relating to items whose hash maps to the bucket. 13 | type lshTableBucket []interface{} 14 | 15 | // lshTable is an hash table used for ClassicLSH. It is simply a map of hashcodes 16 | // to lshTableBuckets 17 | //type lshTable map[uint64]lshTableBucket 18 | type lshTable map[uint64]lshTableBucket 19 | 20 | // remove removes the specified item from the LSH table 21 | func (t lshTable) remove(id interface{}) { 22 | for key, bucketContents := range t { 23 | for j, indexedID := range bucketContents { 24 | if id == indexedID { 25 | bucketContents[j] = bucketContents[len(bucketContents)-1] 26 | t[key] = bucketContents[:len(bucketContents)-1] 27 | if len(t[key]) == 0 { 28 | delete(t, key) 29 | } 30 | return 31 | } 32 | } 33 | } 34 | } 35 | 36 | // ClassicLSH supports finding top-k Approximate Nearest Neighbours (ANN) using Locality 37 | // Sensitive Hashing (LSH). Classic LSH scheme is based on using hash tables to store 38 | // items by their locality sensitive hash code based on the work of A. Gionis et al. 39 | // Items that map to the same bucket (their hash codes collide) are similar. Multiple 40 | // hash tables are used to improve recall where some similar items would otherwise 41 | // hash to separate, neighbouring buckets in only a single table. 42 | // 43 | // A. Gionis, P. Indyk, and R. Motwani, “Similarity Search in High Dimensions via 44 | // Hashing,” VLDB ’99 Proc. 25th Int. Conf. Very Large Data Bases, vol. 99, no. 1, 45 | // pp. 518–529, 1999. 46 | // http://www.cs.princeton.edu/courses/archive/spring13/cos598C/Gionis.pdf%5Cnhttp://portal.acm.org/citation.cfm?id=671516 47 | type ClassicLSH struct { 48 | numHashtables int 49 | numHashfunctions int 50 | reqLen int 51 | hashTables []lshTable 52 | } 53 | 54 | // NewClassicLSH creates a new ClassicLSH with the configured number of hash tables 55 | // and hash functions per table. The length of hash signatures used in this type's 56 | // methods (Put() and GetCandidates()) should be exactly equal to functions * tables. 57 | // The Classic LSH algorithm uses multiple hash tables to improve recall for similar 58 | // items that hash to nearby buckets within a specific hash table. 59 | func NewClassicLSH(functions, tables int) *ClassicLSH { 60 | hashtables := make([]lshTable, tables) 61 | for i := range hashtables { 62 | hashtables[i] = make(map[uint64]lshTableBucket) 63 | } 64 | 65 | return &ClassicLSH{ 66 | reqLen: tables * functions, 67 | numHashtables: tables, 68 | numHashfunctions: functions, 69 | hashTables: hashtables, 70 | } 71 | } 72 | 73 | // Put stores the specified LSH signature and associated ID in the LSH index. 74 | // The method panics if the signature is not the same length as tables * functions. 75 | func (l *ClassicLSH) Put(id interface{}, signature *sparse.BinaryVec) { 76 | keys := l.hashKeysForSignature(signature) 77 | for i := range l.hashTables { 78 | l.hashTables[i][keys[i]] = append(l.hashTables[i][keys[i]], id) 79 | } 80 | } 81 | 82 | // GetCandidates returns the IDs of candidate nearest neighbours. It is up to 83 | // the calling code to further filter these candidates based on distance to arrive 84 | // at the top-k approximate nearest neighbours. The number of candidates returned 85 | // may be smaller or larger than k. The method panics if the signature is not the 86 | // same length as tables * functions. 
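// An illustrative sketch of using ClassicLSH as the scheme behind an LSHIndex (the sizes,
// the identifiers and distanceFn - any pairwise.Comparer - are examples; note that the
// SimHash bit length must equal functions * tables):
//
//	hasher := NewSimHash(1024, vocabSize) // 1024-bit signatures over vocabSize dimensions
//	scheme := NewClassicLSH(64, 16)       // 16 tables of 64 hash functions = 1024 bits
//	index := NewLSHIndex(true, hasher, scheme, distanceFn)
//	index.Index(docVector, "doc-1")
//	neighbours := index.Search(queryVector, 10)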
87 | func (l *ClassicLSH) GetCandidates(query *sparse.BinaryVec, k int) []interface{} { 88 | keys := l.hashKeysForSignature(query) 89 | 90 | seen := make(map[interface{}]struct{}) 91 | for i, table := range l.hashTables { 92 | if bucketEntries, exist := table[keys[i]]; exist { 93 | for _, id := range bucketEntries { 94 | seen[id] = struct{}{} 95 | } 96 | } 97 | } 98 | 99 | // Collect results 100 | ids := make([]interface{}, len(seen)) 101 | var i int 102 | for index := range seen { 103 | ids[i] = index 104 | i++ 105 | } 106 | 107 | return ids 108 | } 109 | 110 | // Remove removes the specified item from the LSH index 111 | func (l *ClassicLSH) Remove(id interface{}) { 112 | for _, table := range l.hashTables { 113 | table.remove(id) 114 | } 115 | } 116 | 117 | // hashKeysForSignature chunks the hash into a number of smaller hash codes (one per 118 | // table) each the length of the configured number of hash functions per table. 119 | // The method panics if the signature is not the same length as tables * functions. 120 | func (l *ClassicLSH) hashKeysForSignature(signature *sparse.BinaryVec) []uint64 { 121 | // TODO: rather than simply chunking up the hash signature into k/l chunks 122 | // possibly select hash functions (digits) uniformly at random (with replacement?) 123 | if signature.Len() != l.reqLen { 124 | panic(fmt.Sprintf("nlp: Specified signature is not the correct length. Needed %d but received %d", l.reqLen, signature.Len())) 125 | } 126 | keys := make([]uint64, l.numHashtables) 127 | for i := range keys { 128 | //keys[i] = signature.SliceToUint64(i*l.numHashfunctions, ((i+1)*l.numHashfunctions)-1) 129 | keys[i] = signature.SliceToUint64(i*l.numHashfunctions, ((i + 1) * l.numHashfunctions)) 130 | } 131 | return keys 132 | } 133 | 134 | // hashKeysForSignature chunks the hash into a number of smaller hash codes (one per 135 | // table) each the length of the configured number of hash functions per table. 136 | // The method panics if the signature is not the same length as tables * functions. 137 | // func (l *ClassicLSH) hashKeysForSignature(signature *sparse.BinaryVec) []string { 138 | // // TODO: rather than simply chunking up the hash signature into k/l chunks 139 | // // possibly select hash functions (digits) uniformly at random (with replacement?) 140 | // if signature.Len() != l.reqLen { 141 | // panic(fmt.Sprintf("nlp: Specified signature is not the correct length. Needed %d but received %d", l.reqLen, signature.Len())) 142 | // } 143 | // keys := make([]string, l.numHashtables) 144 | // key := signature.String() 145 | // for i := range keys { 146 | // keys[i] = key[i*l.numHashfunctions : (i+1)*l.numHashfunctions] 147 | // } 148 | // return keys 149 | // } 150 | 151 | // LSHForest is an implementation of the LSH Forest Locality Sensitive Hashing scheme 152 | // based on the work of M. Bawa et al. 153 | // 154 | // M. Bawa, T. Condie, and P. Ganesan, “LSH forest: self-tuning indexes for 155 | // similarity search,” Proc. 14th Int. Conf. World Wide Web - WWW ’05, p. 651, 2005. 156 | // http://dl.acm.org/citation.cfm?id=1060745.1060840 157 | type LSHForest struct { 158 | trees []*radix.Tree 159 | numHashfunctions int 160 | reqLen int 161 | } 162 | 163 | // NewLSHForest creates a new LSHForest Locality Sensitive Hashing scheme with the 164 | // specified number of hash tables and hash functions per table. 
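// An illustrative sketch (the sizes are examples; as with ClassicLSH, the signatures later passed
// to Put() and GetCandidates() must be exactly functions * tables bits long):
//
//	forest := NewLSHForest(64, 16) // expects 1024-bit signatures
//	index := NewLSHIndex(true, NewSimHash(1024, vocabSize), forest, distanceFn)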
165 | func NewLSHForest(functions int, tables int) *LSHForest { 166 | trees := make([]*radix.Tree, tables) 167 | for i := range trees { 168 | trees[i] = radix.New() 169 | } 170 | return &LSHForest{ 171 | trees: trees, 172 | numHashfunctions: functions, 173 | reqLen: functions * tables, 174 | } 175 | } 176 | 177 | // Put stores the specified LSH signature and associated ID in the LSH index 178 | func (l *LSHForest) Put(id interface{}, signature *sparse.BinaryVec) { 179 | keys := l.hashKeysForSignature(signature) 180 | for i, tree := range l.trees { 181 | //bucket, _ := tree.Get(keys[i]) 182 | bucket, ok := tree.Get(keys[i]) 183 | if !ok { 184 | bucket = make([]interface{}, 0) 185 | } 186 | tree.Insert(keys[i], append(bucket.([]interface{}), id)) 187 | } 188 | } 189 | 190 | // GetCandidates returns the IDs of candidate nearest neighbours. It is up to 191 | // the calling code to further filter these candidates based on distance to arrive 192 | // at the top-k approximate nearest neighbours. The number of candidates returned 193 | // may be smaller or larger than k. 194 | func (l *LSHForest) GetCandidates(query *sparse.BinaryVec, k int) []interface{} { 195 | keys := l.hashKeysForSignature(query) 196 | 197 | m := k 198 | seen := make(map[interface{}]struct{}) 199 | 200 | for i, tree := range l.trees { 201 | if bucketEntries, exist := tree.Get(keys[i]); exist { 202 | for _, id := range bucketEntries.([]interface{}) { 203 | seen[id] = struct{}{} 204 | } 205 | } 206 | } 207 | 208 | // if we have not found enough candidates then walk back up the trees for 209 | // similar items in neighbouring buckets with shared prefixes 210 | x := l.numHashfunctions 211 | for len(seen) < m && x > 0 { 212 | for i, tree := range l.trees { 213 | var k string 214 | if keys[i][x-1] == '1' { 215 | k = "0" 216 | } else { 217 | k = "1" 218 | } 219 | 220 | altKey := strings.Join([]string{keys[i][0 : x-1], k}, "") 221 | tree.WalkPrefix(altKey, func(s string, v interface{}) bool { 222 | for _, id := range v.([]interface{}) { 223 | seen[id] = struct{}{} 224 | } 225 | return false 226 | }) 227 | } 228 | x-- 229 | } 230 | 231 | // Collect results 232 | candidates := make([]interface{}, len(seen)) 233 | var i int 234 | for index := range seen { 235 | candidates[i] = index 236 | i++ 237 | } 238 | 239 | return candidates 240 | } 241 | 242 | // Remove removes the specified item from the LSH index 243 | func (l *LSHForest) Remove(id interface{}) { 244 | for _, tree := range l.trees { 245 | tree.Walk(func(s string, v interface{}) bool { 246 | bucketContents := v.([]interface{}) 247 | for i, indexedID := range bucketContents { 248 | if id == indexedID { 249 | bucketContents[i] = bucketContents[len(bucketContents)-1] 250 | bucketContents = bucketContents[:len(bucketContents)-1] 251 | if len(bucketContents) == 0 { 252 | tree.Delete(s) 253 | } else { 254 | tree.Insert(s, bucketContents) 255 | } 256 | return true 257 | } 258 | } 259 | return false 260 | }) 261 | } 262 | } 263 | 264 | // hashKeysForSignature chunks the hash into a number of smaller hash codes (one per 265 | // table) each the length of the configured number of hash functions per table. 266 | // The method panics if the signature is not the same length as tables * functions. 267 | func (l *LSHForest) hashKeysForSignature(signature *sparse.BinaryVec) []string { 268 | // TODO: rather than simply chunking up the hash signature into k/l chunks 269 | // possibly select hash functions (digits) uniformly at random (with replacement?) 
270 | if signature.Len() != l.reqLen { 271 | panic(fmt.Sprintf("nlp: Specified signature is not the correct length. Needed %d but received %d", l.reqLen, signature.Len())) 272 | } 273 | keys := make([]string, len(l.trees)) 274 | key := signature.String() 275 | for i := range keys { 276 | keys[i] = key[i*l.numHashfunctions : (i+1)*l.numHashfunctions] 277 | } 278 | return keys 279 | } 280 | -------------------------------------------------------------------------------- /lda_test.go: -------------------------------------------------------------------------------- 1 | package nlp_test 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "testing" 7 | 8 | "golang.org/x/exp/rand" 9 | 10 | "github.com/james-bowman/nlp" 11 | "gonum.org/v1/gonum/mat" 12 | ) 13 | 14 | var stopWords = []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", 
"will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"} 15 | 16 | func TestLDAFit(t *testing.T) { 17 | tests := []struct { 18 | topics int 19 | r, c int 20 | data []float64 21 | expectedTopics [][]float64 22 | }{ 23 | { 24 | topics: 3, 25 | r: 9, c: 9, 26 | data: []float64{ 27 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 28 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 29 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 30 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 31 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 32 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 33 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 34 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 35 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 36 | }, 37 | expectedTopics: [][]float64{ 38 | {0.33, 0.33, 0.33, 0, 0, 0, 0, 0, 0}, 39 | {0, 0, 0, 0, 0, 0, 0.33, 0.33, 0.33}, 40 | {0, 0, 0, 0.33, 0.33, 0.33, 0, 0, 0}, 41 | }, 42 | }, 43 | { 44 | topics: 3, 45 | r: 9, c: 9, 46 | data: []float64{ 47 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 48 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 49 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 50 | 0, 0, 0, 3, 5, 1, 0, 0, 0, 51 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 52 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 53 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 54 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 55 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 56 | }, 57 | expectedTopics: [][]float64{ 58 | {0.33, 0.33, 0.33, 0, 0, 0, 0, 0, 0}, 59 | {0, 0, 0, 0, 0, 0, 0.33, 0.33, 0.33}, 60 | {0, 0, 0, 0.428, 0.285, 0.285, 0, 0, 0}, 61 | }, 62 | }, 63 | } 64 | 65 | for ti, test := range tests { 66 | // set Rnd to fixed constant seed for deterministic results 67 | lda := nlp.NewLatentDirichletAllocation(test.topics) 68 | lda.Rnd = rand.New(rand.NewSource(uint64(0))) 69 | 70 | in := mat.NewDense(test.r, test.c, test.data) 71 | lda.Fit(in) 72 | 73 | components := lda.Components() 74 | 75 | for i := 0; i < test.topics; i++ { 76 | var sum float64 77 | for ri, v := range test.expectedTopics[i] { 78 | cv := components.At(i, ri) 79 | sum += cv 80 | if math.Abs(cv-v) > 0.01 { 81 | t.Errorf("Test %d: Topic (%d) over word (%d) distribution incorrect. 
Expected %f but received %f\n", ti, i, ri, v, cv) 82 | } 83 | } 84 | if math.Abs(1-sum) > 0.00000001 { 85 | t.Errorf("Test %d: values in topic (%d) over word distributions should sum to 1 but summed to %f\n", ti, i, sum) 86 | } 87 | } 88 | } 89 | } 90 | 91 | func TestLDAFitTransform(t *testing.T) { 92 | tests := []struct { 93 | topics int 94 | r, c int 95 | data []float64 96 | expectedDocs [][]float64 97 | }{ 98 | { 99 | topics: 3, 100 | r: 9, c: 9, 101 | data: []float64{ 102 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 103 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 104 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 105 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 106 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 107 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 108 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 109 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 110 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 111 | }, 112 | expectedDocs: [][]float64{ 113 | {1, 0, 0}, 114 | {1, 0, 0}, 115 | {1, 0, 0}, 116 | {0, 0, 1}, 117 | {0, 0, 1}, 118 | {0, 0, 1}, 119 | {0, 1, 0}, 120 | {0, 1, 0}, 121 | {0, 1, 0}, 122 | }, 123 | }, 124 | { 125 | topics: 3, 126 | r: 9, c: 9, 127 | data: []float64{ 128 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 129 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 130 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 131 | 0, 0, 0, 3, 5, 1, 0, 0, 0, 132 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 133 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 134 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 135 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 136 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 137 | }, 138 | expectedDocs: [][]float64{ 139 | {1, 0, 0}, 140 | {1, 0, 0}, 141 | {1, 0, 0}, 142 | {0, 0, 1}, 143 | {0, 0, 1}, 144 | {0, 0, 1}, 145 | {0, 1, 0}, 146 | {0, 1, 0}, 147 | {0, 1, 0}, 148 | }, 149 | }, 150 | } 151 | 152 | for ti, test := range tests { 153 | // set Rnd to fixed constant seed for deterministic results 154 | lda := nlp.NewLatentDirichletAllocation(test.topics) 155 | lda.Rnd = rand.New(rand.NewSource(uint64(0))) 156 | 157 | in := mat.NewDense(test.r, test.c, test.data) 158 | theta, err := lda.FitTransform(in) 159 | if err != nil { 160 | t.Error(err) 161 | } 162 | 163 | for j := 0; j < test.c; j++ { 164 | var sum float64 165 | for ri, v := range test.expectedDocs[j] { 166 | cv := theta.At(ri, j) 167 | sum += cv 168 | if math.Abs(cv-v) > 0.01 { 169 | t.Errorf("Test %d: Document (%d) over topic (%d) distribution incorrect. 
Expected %f but received %f\n", ti, j, ri, v, cv) 170 | } 171 | } 172 | if math.Abs(1-sum) > 0.00000001 { 173 | t.Errorf("Test %d: values in document (%d) over topic distributions should sum to 1 but summed to %f\n", ti, j, sum) 174 | } 175 | } 176 | } 177 | } 178 | 179 | func TestLDATransform(t *testing.T) { 180 | tests := []struct { 181 | topics int 182 | r, c int 183 | data []float64 184 | }{ 185 | { 186 | topics: 3, 187 | r: 9, c: 9, 188 | data: []float64{ 189 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 190 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 191 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 192 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 193 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 194 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 195 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 196 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 197 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 198 | }, 199 | }, 200 | { 201 | topics: 3, 202 | r: 9, c: 9, 203 | data: []float64{ 204 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 205 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 206 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 207 | 0, 0, 0, 3, 5, 1, 0, 0, 0, 208 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 209 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 210 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 211 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 212 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 213 | }, 214 | }, 215 | } 216 | 217 | for ti, test := range tests { 218 | // set Rnd to fixed constant seed for deterministic results 219 | lda := nlp.NewLatentDirichletAllocation(test.topics) 220 | lda.Rnd = rand.New(rand.NewSource(uint64(0))) 221 | lda.PerplexityEvaluationFrequency = 2 222 | 223 | in := mat.NewDense(test.r, test.c, test.data) 224 | theta, err := lda.FitTransform(in) 225 | if err != nil { 226 | t.Error(err) 227 | } 228 | 229 | tTheta, err := lda.Transform(in) 230 | 231 | if !mat.EqualApprox(theta, tTheta, 0.035) { 232 | t.Errorf("Test %d: Transformed matrix not equal to FitTransformed\nExpected:\n %v\nbut received:\n %v\n", ti, mat.Formatted(theta), mat.Formatted(tTheta)) 233 | } 234 | } 235 | } 236 | 237 | func ExampleLatentDirichletAllocation() { 238 | corpus := []string{ 239 | "The quick brown fox jumped over the lazy dog", 240 | "The cow jumped over the moon", 241 | "The little dog laughed to see such fun", 242 | } 243 | 244 | // Create a pipeline with a count vectoriser and LDA transformer for 2 topics 245 | vectoriser := nlp.NewCountVectoriser(stopWords...) 246 | lda := nlp.NewLatentDirichletAllocation(2) 247 | pipeline := nlp.NewPipeline(vectoriser, lda) 248 | 249 | docsOverTopics, err := pipeline.FitTransform(corpus...) 
250 | if err != nil { 251 | fmt.Printf("Failed to model topics for documents because %v", err) 252 | return 253 | } 254 | 255 | // Examine Document over topic probability distribution 256 | dr, dc := docsOverTopics.Dims() 257 | for doc := 0; doc < dc; doc++ { 258 | fmt.Printf("\nTopic distribution for document: '%s' -", corpus[doc]) 259 | for topic := 0; topic < dr; topic++ { 260 | if topic > 0 { 261 | fmt.Printf(",") 262 | } 263 | fmt.Printf(" Topic #%d=%f", topic, docsOverTopics.At(topic, doc)) 264 | } 265 | } 266 | 267 | // Examine Topic over word probability distribution 268 | topicsOverWords := lda.Components() 269 | tr, tc := topicsOverWords.Dims() 270 | 271 | vocab := make([]string, len(vectoriser.Vocabulary)) 272 | for k, v := range vectoriser.Vocabulary { 273 | vocab[v] = k 274 | } 275 | for topic := 0; topic < tr; topic++ { 276 | fmt.Printf("\nWord distribution for Topic #%d -", topic) 277 | for word := 0; word < tc; word++ { 278 | if word > 0 { 279 | fmt.Printf(",") 280 | } 281 | fmt.Printf(" '%s'=%f", vocab[word], topicsOverWords.At(topic, word)) 282 | } 283 | } 284 | } 285 | -------------------------------------------------------------------------------- /randomprojection_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | 7 | "github.com/james-bowman/nlp/measures/pairwise" 8 | "github.com/james-bowman/sparse" 9 | "golang.org/x/exp/rand" 10 | "gonum.org/v1/gonum/mat" 11 | ) 12 | 13 | func TestSignRandomProjection(t *testing.T) { 14 | tests := []struct { 15 | rows int 16 | cols int 17 | bits int 18 | }{ 19 | {rows: 100, cols: 1000, bits: 1024}, 20 | {rows: 100, cols: 1000, bits: 256}, 21 | } 22 | 23 | for ti, test := range tests { 24 | // Given an input matrix and a query matching one column 25 | matrix := mat.NewDense(test.rows, test.cols, nil) 26 | for i := 0; i < test.rows; i++ { 27 | for j := 0; j < test.cols; j++ { 28 | matrix.Set(i, j, rand.Float64()) 29 | } 30 | } 31 | 32 | query := matrix.ColView(0) 33 | 34 | // When transformed using sign random projections 35 | transformer := NewSignRandomProjection(test.bits) 36 | reducedDimMatrix, err := transformer.FitTransform(matrix) 37 | if err != nil { 38 | t.Errorf("Failed to transform matrix because %v\n", err) 39 | } 40 | m := reducedDimMatrix.(*sparse.Binary) 41 | 42 | reducedDimQuery, err := transformer.Transform(query) 43 | if err != nil { 44 | t.Errorf("Failed to transform query because %v\n", err) 45 | } 46 | q := reducedDimQuery.(*sparse.Binary).ColView(0) 47 | 48 | var culmDiff float64 49 | for i := 0; i < test.cols; i++ { 50 | angSim := pairwise.AngularSimilarity(query, matrix.ColView(i)) 51 | lshSim := pairwise.HammingSimilarity(q, m.ColView(i)) 52 | 53 | if i == 0 { 54 | if math.Abs(angSim-lshSim) >= 0.0000001 { 55 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim) 56 | } 57 | } 58 | 59 | diff := math.Abs(lshSim-angSim) / angSim 60 | culmDiff += diff 61 | } 62 | avgDiff := culmDiff / float64(test.cols) 63 | 64 | // Then output matrix should be of specified length, 65 | // matching column should still have similarity of ~1.0 and 66 | // avg difference betwen angular and hamming similarities should 67 | // be less than 0.03 68 | r, c := m.Dims() 69 | if r != test.bits || c != test.cols { 70 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.bits, test.cols, r, c) 71 | } 72 | if avgDiff >= 0.03 { 73 | t.Errorf("Test %d: Expected 
difference between vector spaces %f but was %f\n", ti, 0.03, avgDiff) 74 | } 75 | } 76 | } 77 | 78 | func TestRandomProjection(t *testing.T) { 79 | tests := []struct { 80 | k int 81 | rows int 82 | cols int 83 | density float32 84 | }{ 85 | {k: 400, rows: 700, cols: 600, density: 0.02}, 86 | {k: 400, rows: 800, cols: 800, density: 0.02}, 87 | } 88 | 89 | for ti, test := range tests { 90 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR() 91 | query := matrix.ToCSC().ColView(0) 92 | 93 | // When transformed using sign random projections 94 | transformer := NewRandomProjection(test.k, float64(test.density)) 95 | transformer.rnd = rand.New(rand.NewSource(uint64(0))) 96 | reducedDimMatrix, err := transformer.FitTransform(matrix) 97 | if err != nil { 98 | t.Errorf("Failed to transform matrix because %v\n", err) 99 | } 100 | m := reducedDimMatrix.(*sparse.CSR).ToCSC() 101 | 102 | reducedDimQuery, err := transformer.Transform(query) 103 | if err != nil { 104 | t.Errorf("Failed to transform query because %v\n", err) 105 | } 106 | q := reducedDimQuery.(*sparse.CSR).ToCSC().ColView(0) 107 | 108 | var culmDiff float64 109 | ColDo(matrix, func(j int, v mat.Vector) { 110 | angSim := pairwise.CosineSimilarity(query, v) 111 | lshSim := pairwise.CosineSimilarity(q, m.ColView(j)) 112 | 113 | if j == 0 { 114 | if math.Abs(angSim-lshSim) >= 0.0000001 { 115 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim) 116 | } 117 | } 118 | 119 | //diff := math.Abs(lshSim-angSim) / angSim 120 | diff := math.Abs(lshSim - angSim) 121 | culmDiff += diff 122 | }) 123 | t.Logf("CulmDiff = %f\n", culmDiff) 124 | avgDiff := culmDiff / float64(test.cols) 125 | 126 | // Then output matrix should be of specified length, 127 | // matching column should still have similarity of ~1.0 and 128 | // avg difference betwen angular and hamming similarities should 129 | // be less than 0.03 130 | r, c := reducedDimMatrix.Dims() 131 | if r != test.k || c != test.cols { 132 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c) 133 | } 134 | if avgDiff >= 0.05 { 135 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.05, avgDiff) 136 | } 137 | } 138 | } 139 | 140 | func TestRandomIndexingFit(t *testing.T) { 141 | tests := []struct { 142 | k int 143 | rows int 144 | cols int 145 | density float32 146 | }{ 147 | {k: 400, rows: 700, cols: 600, density: 0.02}, 148 | {k: 400, rows: 800, cols: 800, density: 0.02}, 149 | } 150 | 151 | for ti, test := range tests { 152 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR() 153 | query := matrix.ToCSC().ColView(0) 154 | 155 | // When transformed using sign random projections 156 | transformer := NewRandomIndexing(test.k, float64(test.density)) 157 | transformer.rnd = rand.New(rand.NewSource(uint64(0))) 158 | reducedDimMatrix, err := transformer.FitTransform(matrix) 159 | if err != nil { 160 | t.Errorf("Failed to transform matrix because %v\n", err) 161 | } 162 | m := reducedDimMatrix.(sparse.TypeConverter).ToCSC() 163 | 164 | reducedDimQuery, err := transformer.Transform(query) 165 | if err != nil { 166 | t.Errorf("Failed to transform query because %v\n", err) 167 | } 168 | q := reducedDimQuery.(sparse.TypeConverter).ToCSC().ColView(0) 169 | 170 | var culmDiff float64 171 | ColDo(matrix, func(j int, v mat.Vector) { 172 | angSim := 
pairwise.CosineSimilarity(query, v) 173 | lshSim := pairwise.CosineSimilarity(q, m.ColView(j)) 174 | 175 | if j == 0 { 176 | if math.Abs(angSim-lshSim) >= 0.05 { 177 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim) 178 | } 179 | } 180 | 181 | //diff := math.Abs(lshSim-angSim) / angSim 182 | diff := math.Abs(lshSim - angSim) 183 | culmDiff += diff 184 | }) 185 | t.Logf("CulmDiff = %f\n", culmDiff) 186 | avgDiff := culmDiff / float64(test.cols) 187 | 188 | // Then output matrix should be of specified length, 189 | // matching column should still have similarity of ~1.0 and 190 | // avg difference betwen angular and hamming similarities should 191 | // be less than 0.03 192 | r, c := reducedDimMatrix.Dims() 193 | if r != test.k || c != test.cols { 194 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c) 195 | } 196 | if avgDiff >= 0.12 { 197 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.12, avgDiff) 198 | } 199 | } 200 | } 201 | 202 | func TestRandomIndexingPartialFit(t *testing.T) { 203 | tests := []struct { 204 | k int 205 | rows int 206 | cols int 207 | density float32 208 | }{ 209 | {k: 400, rows: 700, cols: 600, density: 0.02}, 210 | {k: 400, rows: 800, cols: 800, density: 0.02}, 211 | } 212 | 213 | for ti, test := range tests { 214 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR() 215 | query := matrix.ToCSC().ColView(0) 216 | 217 | // When transformed using sign random projections 218 | transformer := NewRandomIndexing(test.k, float64(test.density)) 219 | transformer.rnd = rand.New(rand.NewSource(uint64(0))) 220 | 221 | ColDo(matrix, func(j int, v mat.Vector) { 222 | transformer.PartialFit(v) 223 | }) 224 | 225 | reducedDimMatrix, err := transformer.Transform(matrix) 226 | if err != nil { 227 | t.Errorf("Failed to transform matrix because %v\n", err) 228 | } 229 | m := reducedDimMatrix.(sparse.TypeConverter).ToCSC() 230 | 231 | reducedDimQuery, err := transformer.Transform(query) 232 | if err != nil { 233 | t.Errorf("Failed to transform query because %v\n", err) 234 | } 235 | q := reducedDimQuery.(sparse.TypeConverter).ToCSC().ColView(0) 236 | 237 | var culmDiff float64 238 | ColDo(matrix, func(j int, v mat.Vector) { 239 | angSim := pairwise.CosineSimilarity(query, v) 240 | lshSim := pairwise.CosineSimilarity(q, m.ColView(j)) 241 | 242 | if j == 0 { 243 | if math.Abs(angSim-lshSim) >= 0.05 { 244 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim) 245 | } 246 | } 247 | 248 | //diff := math.Abs(lshSim-angSim) / angSim 249 | diff := math.Abs(lshSim - angSim) 250 | culmDiff += diff 251 | }) 252 | t.Logf("CulmDiff = %f\n", culmDiff) 253 | avgDiff := culmDiff / float64(test.cols) 254 | 255 | // Then output matrix should be of specified length, 256 | // matching column should still have similarity of ~1.0 and 257 | // avg difference betwen angular and hamming similarities should 258 | // be less than 0.03 259 | r, c := reducedDimMatrix.Dims() 260 | if r != test.k || c != test.cols { 261 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c) 262 | } 263 | if avgDiff >= 0.12 { 264 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.12, avgDiff) 265 | } 266 | } 267 | } 268 | 269 | func TestReflectiveRandomIndexing(t *testing.T) { 270 | tests := 
[]struct { 271 | k int 272 | rows int 273 | cols int 274 | density float32 275 | }{ 276 | {k: 400, rows: 700, cols: 600, density: 0.02}, 277 | {k: 400, rows: 800, cols: 800, density: 0.02}, 278 | } 279 | 280 | for ti, test := range tests { 281 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR() 282 | query := matrix.ToCSC().ColView(0) 283 | 284 | // When transformed using Reflective Random Indexing 285 | transformer := NewReflectiveRandomIndexing(test.k, TermBasedRRI, 0, float64(test.density)) 286 | transformer.rnd = rand.New(rand.NewSource(uint64(0))) 287 | reducedDimMatrix, err := transformer.FitTransform(matrix) 288 | if err != nil { 289 | t.Errorf("Failed to transform matrix because %v\n", err) 290 | } 291 | m := reducedDimMatrix.(sparse.TypeConverter).ToCSC() 292 | 293 | reducedDimQuery, err := transformer.Transform(query) 294 | if err != nil { 295 | t.Errorf("Failed to transform query because %v\n", err) 296 | } 297 | q := reducedDimQuery.(sparse.TypeConverter).ToCSC().ColView(0) 298 | 299 | var culmDiff float64 300 | ColDo(matrix, func(j int, v mat.Vector) { 301 | origSim := pairwise.CosineSimilarity(query, v) 302 | redSim := pairwise.CosineSimilarity(q, m.ColView(j)) 303 | 304 | if j == 0 { 305 | if math.Abs(origSim-redSim) >= 0.0000001 { 306 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Original) and %.10f (Reduced)\n", ti, origSim, redSim) 307 | } 308 | } 309 | 310 | diff := math.Abs(redSim - origSim) 311 | culmDiff += diff 312 | }) 313 | t.Logf("CulmDiff = %f\n", culmDiff) 314 | avgDiff := culmDiff / float64(test.cols) 315 | 316 | // Then output matrix should be of specified length, 317 | // matching column should still have similarity of ~1.0 and 318 | // avg difference betwen angular and hamming similarities should 319 | // be less than 0.03 320 | r, c := reducedDimMatrix.Dims() 321 | if r != test.k || c != test.cols { 322 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c) 323 | } 324 | if avgDiff >= 0.12 { 325 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.12, avgDiff) 326 | } 327 | } 328 | } 329 | -------------------------------------------------------------------------------- /vectorisers.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | 7 | "github.com/james-bowman/sparse" 8 | "github.com/spaolacci/murmur3" 9 | "gonum.org/v1/gonum/mat" 10 | ) 11 | 12 | // Vectoriser provides a common interface for vectorisers that take a variable 13 | // set of string arguments and produce a numerical matrix of features. 14 | type Vectoriser interface { 15 | Fit(...string) Vectoriser 16 | Transform(...string) (mat.Matrix, error) 17 | FitTransform(...string) (mat.Matrix, error) 18 | } 19 | 20 | // OnlineVectoriser is an extension to the Vectoriser interface that supports 21 | // online (streaming/mini-batch) training as opposed to just batch. 22 | type OnlineVectoriser interface { 23 | Vectoriser 24 | PartialFit(...string) OnlineVectoriser 25 | } 26 | 27 | // Transformer provides a common interface for transformer steps. 
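// An illustrative sketch of chaining a Vectoriser with a Transformer by hand (the documents
// and the choice of 100 dimensions are examples):
//
//	vectoriser := NewCountVectoriser()
//	tdmat, err := vectoriser.FitTransform(docs...)
//	if err != nil {
//		// handle error
//	}
//	reduced, err := NewTruncatedSVD(100).FitTransform(tdmat)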
28 | type Transformer interface { 29 | Fit(mat.Matrix) Transformer 30 | Transform(mat mat.Matrix) (mat.Matrix, error) 31 | FitTransform(mat mat.Matrix) (mat.Matrix, error) 32 | } 33 | 34 | // OnlineTransformer is an extension to the Transformer interface that 35 | // supports online (streaming/mini-batch) training as opposed to just batch. 36 | type OnlineTransformer interface { 37 | Transformer 38 | PartialFit(mat.Matrix) OnlineTransformer 39 | } 40 | 41 | // Tokeniser is the interface for tokenisers, allowing substitution of different 42 | // tokenisation strategies, e.g. RegExp, and also supporting different 43 | // token types such as n-grams and different languages. 44 | type Tokeniser interface { 45 | // ForEachIn iterates over each token within text and invokes function 46 | // f with the token as parameter 47 | ForEachIn(text string, f func(token string)) 48 | 49 | // Tokenise returns a slice of all the tokens contained in string 50 | // text 51 | Tokenise(text string) []string 52 | } 53 | 54 | // RegExpTokeniser implements the Tokeniser interface using a basic RegExp 55 | // pattern as a unigram word tokeniser supporting optional stop word 56 | // removal. 57 | type RegExpTokeniser struct { 58 | RegExp *regexp.Regexp 59 | StopWords map[string]bool 60 | } 61 | 62 | // NewTokeniser returns a new, default Tokeniser implementation. 63 | // stopWords is a potentially empty string slice 64 | // that contains the words that should be removed from the corpus. 65 | // The default RegExpTokeniser extracts runs of Unicode letters as tokens, splitting on whitespace and punctuation. 66 | func NewTokeniser(stopWords ...string) Tokeniser { 67 | var stop map[string]bool 68 | 69 | stop = make(map[string]bool) 70 | for _, word := range stopWords { 71 | stop[word] = true 72 | } 73 | return &RegExpTokeniser{ 74 | RegExp: regexp.MustCompile("[\\p{L}]+"), 75 | StopWords: stop, 76 | } 77 | } 78 | 79 | // ForEachIn iterates over each token within text and invokes function 80 | // f with the token as parameter. If StopWords is not nil then any 81 | // tokens from text present in StopWords will be ignored. 82 | func (t *RegExpTokeniser) ForEachIn(text string, f func(token string)) { 83 | tokens := t.tokenise(text) 84 | for _, token := range tokens { 85 | if t.StopWords != nil { 86 | if t.StopWords[token] { 87 | continue 88 | } 89 | } 90 | f(token) 91 | } 92 | } 93 | 94 | // Tokenise returns a slice of all the tokens contained in string 95 | // text. If StopWords is not nil then any tokens from text present in 96 | // StopWords will be removed from the slice. 97 | func (t *RegExpTokeniser) Tokenise(text string) []string { 98 | words := t.tokenise(text) 99 | 100 | // filter out stop words 101 | if t.StopWords != nil { 102 | b := words[:0] 103 | for _, w := range words { 104 | if !t.StopWords[w] { 105 | b = append(b, w) 106 | } 107 | } 108 | return b 109 | } 110 | 111 | return words 112 | } 113 | 114 | // tokenise returns a slice of all the tokens contained in string 115 | // text. 116 | func (t *RegExpTokeniser) tokenise(text string) []string { 117 | // convert content to lower case 118 | c := strings.ToLower(text) 119 | 120 | // match whole words, removing any punctuation/whitespace 121 | words := t.RegExp.FindAllString(c, -1) 122 | 123 | return words 124 | } 125 | 126 | // CountVectoriser can be used to encode one or more text documents into a term document 127 | // matrix where each column represents a document within the corpus and each row represents 128 | // a term present in the training data set.
Each element represents the frequency with which the 129 | // corresponding term appears in the corresponding document, e.g. tf(t, d) = 5 would mean 130 | // that term t (perhaps the word "dog") appears 5 times in the document d. 131 | type CountVectoriser struct { 132 | // Vocabulary is a map of words to indices that point to the row number representing 133 | // that word in the term document matrix output from the Transform() and FitTransform() 134 | // methods. The Vocabulary map is populated by the Fit() or FitTransform() methods 135 | // based upon the words occurring in the datasets supplied to those methods. Within 136 | // Transform(), any words found in the test data set that were not present in the 137 | // training data set supplied to Fit() will not have an entry in the Vocabulary 138 | // and will be ignored. 139 | Vocabulary map[string]int 140 | 141 | // Tokeniser is used to tokenise input text into features. 142 | Tokeniser Tokeniser 143 | } 144 | 145 | // NewCountVectoriser creates a new CountVectoriser. 146 | // stopWords is a potentially empty slice of words to be removed from the corpus. 147 | func NewCountVectoriser(stopWords ...string) *CountVectoriser { 148 | return &CountVectoriser{ 149 | Vocabulary: make(map[string]int), 150 | Tokeniser: NewTokeniser(stopWords...), 151 | } 152 | } 153 | 154 | // Fit processes the supplied training data (a variable number of strings representing 155 | // documents). Each word appearing inside the training data will be added to the 156 | // Vocabulary. The Fit() method is intended to be called once to train the model 157 | // in a batch context. Calling the Fit() method a second time will have the effect of 158 | // re-training the model from scratch (discarding the previously learnt vocabulary). 159 | func (v *CountVectoriser) Fit(train ...string) Vectoriser { 160 | i := 0 161 | if len(v.Vocabulary) != 0 { 162 | v.Vocabulary = make(map[string]int) 163 | } 164 | v.fitVocab(i, train...) 165 | 166 | return v 167 | } 168 | 169 | // fitVocab learns the vocabulary contained within the supplied training documents 170 | func (v *CountVectoriser) fitVocab(start int, train ...string) { 171 | i := start 172 | for _, doc := range train { 173 | v.Tokeniser.ForEachIn(doc, func(word string) { 174 | _, exists := v.Vocabulary[word] 175 | if !exists { 176 | v.Vocabulary[word] = i 177 | i++ 178 | } 179 | }) 180 | } 181 | } 182 | 183 | // Transform transforms the supplied documents into a term document matrix where each 184 | // column is a feature vector representing one of the supplied documents. Each element 185 | // represents the frequency with which the associated term for that row occurred within 186 | // that document. The returned matrix is a sparse matrix type. 187 | func (v *CountVectoriser) Transform(docs ...string) (mat.Matrix, error) { 188 | mat := sparse.NewDOK(len(v.Vocabulary), len(docs)) 189 | 190 | for d, doc := range docs { 191 | v.Tokeniser.ForEachIn(doc, func(word string) { 192 | i, exists := v.Vocabulary[word] 193 | 194 | if exists { 195 | mat.Set(i, d, mat.At(i, d)+1) 196 | } 197 | }) 198 | } 199 | return mat, nil 200 | } 201 | 202 | // FitTransform is exactly equivalent to calling Fit() followed by Transform() on the 203 | // same documents. This is a convenience where separate training data is not being 204 | // used to fit the model i.e. the model is fitted on the fly to the test data. 205 | // The returned matrix is a sparse matrix type.
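// An illustrative sketch (the stop words and documents are examples):
//
//	v := NewCountVectoriser("the", "a")
//	tdmat, err := v.FitTransform("the quick brown fox", "a lazy dog")
//	// tdmat has len(v.Vocabulary) rows and 2 columns (one per document)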
206 | func (v *CountVectoriser) FitTransform(docs ...string) (mat.Matrix, error) { 207 | return v.Fit(docs...).Transform(docs...) 208 | } 209 | 210 | // HashingVectoriser can be used to encode one or more text documents into a term document 211 | // matrix where each column represents a document within the corpus and each row represents 212 | // a term. Each element represents the frequency with which the corresponding term appears in the 213 | // corresponding document, e.g. tf(t, d) = 5 would mean that term t (perhaps the word "dog") 214 | // appears 5 times in the document d. 215 | type HashingVectoriser struct { 216 | NumFeatures int 217 | Tokeniser Tokeniser 218 | } 219 | 220 | // NewHashingVectoriser creates a new HashingVectoriser. If stopWords is not an empty slice then 221 | // the supplied stop words will be removed from the documents. numFeatures specifies the number of features 222 | // that should be present in produced vectors. Each word in a document is hashed and 223 | // the mod of the hash and numFeatures gives the row in the matrix corresponding to that 224 | // word. 225 | func NewHashingVectoriser(numFeatures int, stopWords ...string) *HashingVectoriser { 226 | return &HashingVectoriser{ 227 | NumFeatures: numFeatures, 228 | Tokeniser: NewTokeniser(stopWords...), 229 | } 230 | } 231 | 232 | // Fit does nothing for a HashingVectoriser. As the HashingVectoriser vectorises features 233 | // based on their hash, it does not require a pre-determined vocabulary to map features to their 234 | // correct row in the vector. It is effectively stateless and does not require fitting to 235 | // training data. The method is included for compatibility with other vectorisers. 236 | func (v *HashingVectoriser) Fit(train ...string) Vectoriser { 237 | // The hashing vectoriser is stateless and does not require pre-training so this 238 | // method does nothing. 239 | return v 240 | } 241 | 242 | // PartialFit does nothing for a HashingVectoriser. As the HashingVectoriser vectorises 243 | // features based on their hash, it does not require a pre-learnt vocabulary to map 244 | // features to the correct row in the feature vector. This method is included 245 | // for compatibility with other vectorisers. 246 | func (v *HashingVectoriser) PartialFit(train ...string) Vectoriser { 247 | // The hashing vectoriser is stateless and does not require training so this method 248 | // does nothing. 249 | return v 250 | } 251 | 252 | // Transform transforms the supplied documents into a term document matrix where each 253 | // column is a feature vector representing one of the supplied documents. Each element 254 | // represents the frequency with which the associated term for that row occurred within 255 | // that document. The returned matrix is a sparse matrix type. 256 | func (v *HashingVectoriser) Transform(docs ...string) (mat.Matrix, error) { 257 | mat := sparse.NewDOK(v.NumFeatures, len(docs)) 258 | 259 | for d, doc := range docs { 260 | v.Tokeniser.ForEachIn(doc, func(word string) { 261 | h := murmur3.Sum32([]byte(word)) 262 | i := int(h) % v.NumFeatures 263 | 264 | mat.Set(i, d, mat.At(i, d)+1) 265 | }) 266 | } 267 | return mat, nil 268 | } 269 | 270 | // FitTransform for a HashingVectoriser is exactly equivalent to calling 271 | // Transform() with the same documents. For most vectorisers, Fit() must be called 272 | // prior to Transform() and so this method is a convenience where separate 273 | // training data is not used to fit the model.
For a HashingVectoriser, fitting is 274 | // not required and so this method is exactly equivalent to Transform(). As with 275 | // Fit(), this method is included with the HashingVectoriser for compatibility 276 | // with other vectorisers. The returned matrix is a sparse matrix type. 277 | func (v *HashingVectoriser) FitTransform(docs ...string) (mat.Matrix, error) { 278 | return v.Transform(docs...) 279 | } 280 | 281 | // Pipeline is a mechanism for composing processing pipelines out of vectorisers 282 | // transformation steps. For example to compose a classic LSA/LSI pipeline 283 | // (vectorisation -> TFIDF transformation -> Truncated SVD) one could use a 284 | // Pipeline as follows: 285 | // lsaPipeline := NewPipeline(NewCountVectoriser(false), NewTfidfTransformer(), NewTruncatedSVD(100)) 286 | // 287 | type Pipeline struct { 288 | Vectoriser Vectoriser 289 | Transformers []Transformer 290 | } 291 | 292 | // NewPipeline constructs a new processing pipline with the supplied Vectoriser 293 | // and one or more transformers 294 | func NewPipeline(vectoriser Vectoriser, transformers ...Transformer) *Pipeline { 295 | pipeline := Pipeline{ 296 | Vectoriser: vectoriser, 297 | Transformers: transformers, 298 | } 299 | 300 | return &pipeline 301 | } 302 | 303 | // Fit fits the model(s) to the supplied training data 304 | func (p *Pipeline) Fit(docs ...string) Vectoriser { 305 | if _, err := p.FitTransform(docs...); err != nil { 306 | panic("nlp: Failed to Fit pipeline because " + err.Error()) 307 | } 308 | 309 | return p 310 | } 311 | 312 | // Transform transforms the supplied documents into a matrix representation 313 | // of numerical feature vectors using a model(s) previously fitted to supplied 314 | // training data. 315 | func (p *Pipeline) Transform(docs ...string) (mat.Matrix, error) { 316 | matrix, err := p.Vectoriser.Transform(docs...) 317 | if err != nil { 318 | return matrix, err 319 | } 320 | for _, t := range p.Transformers { 321 | matrix, err = t.Transform(matrix) 322 | if err != nil { 323 | return matrix, err 324 | } 325 | } 326 | return matrix, nil 327 | } 328 | 329 | // FitTransform transforms the supplied documents into a matrix representation 330 | // of numerical feature vectors fitting the model to the supplied data in the 331 | // process. 332 | func (p *Pipeline) FitTransform(docs ...string) (mat.Matrix, error) { 333 | matrix, err := p.Vectoriser.FitTransform(docs...) 334 | if err != nil { 335 | return matrix, err 336 | } 337 | for _, t := range p.Transformers { 338 | matrix, err = t.FitTransform(matrix) 339 | if err != nil { 340 | return matrix, err 341 | } 342 | } 343 | return matrix, nil 344 | } 345 | -------------------------------------------------------------------------------- /randomprojection.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "math" 5 | "time" 6 | 7 | "golang.org/x/exp/rand" 8 | 9 | "github.com/james-bowman/sparse" 10 | "gonum.org/v1/gonum/mat" 11 | "gonum.org/v1/gonum/stat/distuv" 12 | "gonum.org/v1/gonum/stat/sampleuv" 13 | ) 14 | 15 | // SignRandomProjection represents a transform of a matrix into a lower 16 | // dimensional space. Sign Random Projection is a method of Locality 17 | // Sensitive Hashing (LSH) sometimes referred to as the random hyperplane method. 
18 | // A set of random hyperplanes are created in the original dimensional 19 | // space and then input matrices are expressed relative to the random 20 | // hyperplanes as follows: 21 | // For each column vector in the input matrix, construct a corresponding output 22 | // bit vector with each bit (i) calculated as follows: 23 | // if dot(vector, hyperplane[i]) > 0 24 | // bit[i] = 1 25 | // else 26 | // bit[i] = 0 27 | // Whilst similar to other methods of random projection this method is unique in that 28 | // it uses only a single bit in the output matrix to represent the sign of the result 29 | // of the comparison (Dot product) with each hyperplane so encodes vector 30 | // representations with very low memory and processor requirements whilst preserving 31 | // relative distance between vectors from the original space. 32 | // Hamming similarity (and distance) between the transformed vectors in the 33 | // subspace can approximate Angular similarity (and distance) (which is strongly 34 | // related to Cosine similarity) of the associated vectors from the original space. 35 | type SignRandomProjection struct { 36 | // Bits represents the number of bits the output vectors should 37 | // be in length and hence the number of random hyperplanes needed 38 | // for the transformation 39 | Bits int 40 | 41 | // simhash is the simhash LSH (Locality Sensitive Hashing) algorithm 42 | // used to perform the sign random projection 43 | simHash *SimHash 44 | } 45 | 46 | // NewSignRandomProjection constructs a new SignRandomProjection transformer 47 | // to reduce the dimensionality. The transformer uses a number of random hyperplanes 48 | // represented by `bits` and is the dimensionality of the output, transformed 49 | // matrices. 50 | func NewSignRandomProjection(bits int) *SignRandomProjection { 51 | return &SignRandomProjection{Bits: bits} 52 | } 53 | 54 | // Fit creates the random hyperplanes from the input training data matrix, mat and 55 | // stores the hyperplanes as a transform to apply to matrices. 56 | func (s *SignRandomProjection) Fit(m mat.Matrix) Transformer { 57 | rows, _ := m.Dims() 58 | s.simHash = NewSimHash(s.Bits, rows) 59 | return s 60 | } 61 | 62 | // Transform applies the transform decomposed from the training data matrix in Fit() 63 | // to the input matrix. The columns in the resulting output matrix will be a low 64 | // dimensional binary representation of the columns within the original 65 | // i.e. a hash or fingerprint that can be quickly and efficiently compared with other 66 | // similar vectors. Hamming similarity in the new dimensional space can be 67 | // used to approximate Cosine similarity between the vectors of the original space. 68 | // The returned matrix is a Binary matrix or BinaryVec type depending 69 | // upon whether m is Matrix or Vector. 70 | func (s *SignRandomProjection) Transform(m mat.Matrix) (mat.Matrix, error) { 71 | _, cols := m.Dims() 72 | 73 | sigs := make([]sparse.BinaryVec, cols) 74 | ColDo(m, func(j int, v mat.Vector) { 75 | sigs[j] = *s.simHash.Hash(v) 76 | }) 77 | return sparse.NewBinary(s.Bits, cols, sigs), nil 78 | } 79 | 80 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 81 | // on the same matrix. This is a useful shortcut where separate training data is not being 82 | // used to fit the model i.e. the model is fitted on the fly to the test data. 83 | // The returned matrix is a Binary matrix or BinaryVec type depending upon 84 | // whether m is Matrix or Vector. 
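//
// An illustrative sketch, assuming tdm is a term document matrix produced
// elsewhere (e.g. by a vectoriser):
//
// 	srp := NewSignRandomProjection(256)
// 	signatures, err := srp.FitTransform(tdm)
// 	if err != nil {
// 		// handle error
// 	}
// 	// signatures has 256 rows (one per bit) and one column per column of
// 	// tdm; its columns can be compared using Hamming similarity to
// 	// approximate the Cosine similarity of the original columns.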
85 | func (s *SignRandomProjection) FitTransform(m mat.Matrix) (mat.Matrix, error) { 86 | return s.Fit(m).Transform(m) 87 | } 88 | 89 | // RandomProjection is a method of dimensionality reduction based upon 90 | // the Johnson–Lindenstrauss lemma stating that a small set of points 91 | // in a high-dimensional space can be embedded into a space of much 92 | // lower dimension in such a way that distances between the points 93 | // are nearly preserved. 94 | // 95 | // The technique projects the original 96 | // matrix orthogonally onto a random subspace, transforming the 97 | // elements of the original matrix into a lower dimensional representation. 98 | // Computing orthogonal matrices is expensive and so this technique 99 | // uses specially generated random matrices (hence the name) following 100 | // the principle that in high dimensional spaces, there are lots of 101 | // nearly orthogonal matrices. 102 | type RandomProjection struct { 103 | K int 104 | Density float64 105 | rnd *rand.Rand 106 | projections mat.Matrix 107 | } 108 | 109 | // NewRandomProjection creates and returns a new RandomProjection 110 | // transformer. The RandomProjection will use a specially generated 111 | // random matrix of the specified density and dimensionality k to 112 | // perform the transform to k dimensional space. 113 | func NewRandomProjection(k int, density float64) *RandomProjection { 114 | r := RandomProjection{ 115 | K: k, 116 | Density: density, 117 | } 118 | 119 | return &r 120 | } 121 | 122 | // Fit creates the random (almost) orthogonal matrix used to project 123 | // input matrices into the new reduced dimensional subspace. 124 | func (r *RandomProjection) Fit(m mat.Matrix) Transformer { 125 | rows, _ := m.Dims() 126 | r.projections = CreateRandomProjectionTransform(r.K, rows, r.Density, r.rnd) 127 | return r 128 | } 129 | 130 | // Transform applies the transformation, projecting the input matrix 131 | // into the reduced dimensional subspace. The transformed matrix 132 | // will be a sparse CSR format matrix of shape k x c. 133 | func (r *RandomProjection) Transform(m mat.Matrix) (mat.Matrix, error) { 134 | var product sparse.CSR 135 | 136 | // projections will be dimensions k x r (k x t) 137 | // m will be dimensions r x c (t x d) 138 | // product will be of reduced dimensions k x c (k x d) 139 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv { 140 | m = t.ToCSR() 141 | } 142 | 143 | product.Mul(r.projections, m) 144 | 145 | return &product, nil 146 | } 147 | 148 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 149 | // on the same matrix. This is a useful shortcut where separate training data is not being 150 | // used to fit the model i.e. the model is fitted on the fly to the test data. 151 | // The returned matrix is a sparse CSR format matrix of shape k x c. 152 | func (r *RandomProjection) FitTransform(m mat.Matrix) (mat.Matrix, error) { 153 | return r.Fit(m).Transform(m) 154 | } 155 | 156 | // RRIBasis represents the initial basis for the index/elemental vectors 157 | // used for Random Reflective Indexing 158 | type RRIBasis int 159 | 160 | const ( 161 | // DocBasedRRI represents columns (documents/contexts in a term-document 162 | // matrix) forming the initial basis for index/elemental vectors in Random Indexing 163 | DocBasedRRI RRIBasis = iota 164 | 165 | // TermBasedRRI indicates rows (terms in a term-document matrix) 166 | // form the initial basis for index/elemental vectors in Reflective Random Indexing. 
167 | TermBasedRRI 168 | ) 169 | 170 | // RandomIndexing is a method of dimensionality reduction used for Latent Semantic 171 | // Analysis in a similar way to TruncatedSVD and PCA. Random 172 | // Indexing is designed to solve limitations of very high dimensional 173 | // vector space model implementations for modelling term co-occurance 174 | // in language processing such as SVD typically used for LSA/LSI (Latent 175 | // Semantic Analysis/Latent Semantic Indexing). In implementation 176 | // it bears some similarity to other random projection techniques 177 | // such as those implemented in RandomProjection and SignRandomProjection 178 | // within this package. 179 | // The RandomIndexing type can also be used to perform Reflective 180 | // Random Indexing which extends the Random Indexing model with additional 181 | // training cycles to better support indirect inferrence i.e. find synonyms 182 | // where the words do not appear together in documents. 183 | type RandomIndexing struct { 184 | // K specifies the number of dimensions for the semantic space 185 | K int 186 | 187 | // Density specifies the proportion of non-zero elements in the 188 | // elemental vectors 189 | Density float64 190 | 191 | // Type specifies the initial basis for the elemental vectors 192 | // i.e. whether they initially represent the rows or columns 193 | // This is only relevent for Reflective Random Indexing 194 | Type RRIBasis 195 | 196 | // Reflections specifies the number of reflective training cycles 197 | // to run during fitting for RRI (Reflective Random Indexing). For 198 | // Randome Indexing (non-reflective) this is 0. 199 | Reflections int 200 | 201 | rnd *rand.Rand 202 | 203 | // components is a k x t matrix where `t` is the number of terms 204 | // (rows) in the training data matrix. The columns in this matrix 205 | // contain the `context` vectors for RI where each column represents 206 | // a semantic representation of a term based upon the contexts 207 | // in which it has appeared within the training data. 208 | components mat.Matrix 209 | } 210 | 211 | // NewRandomIndexing returns a new RandomIndexing transformer 212 | // configured to transform term document matrices into k dimensional 213 | // space. The density parameter specifies the density of the index/elemental 214 | // vectors used to project the input matrix into lower dimensional 215 | // space i.e. the proportion of elements that are non-zero. 216 | func NewRandomIndexing(k int, density float64) *RandomIndexing { 217 | return &RandomIndexing{ 218 | K: k, 219 | Density: density, 220 | } 221 | } 222 | 223 | // NewReflectiveRandomIndexing returns a new RandomIndexing type 224 | // configured for Reflective Random Indexing. Reflective Random 225 | // Indexing applies additional (reflective) training cycles ontop 226 | // of Random Indexing to capture indirect inferences (synonyms). 227 | // i.e. similarity between terms that do not directly co-occur 228 | // within the same context/document. 229 | // basis specifies the basis for the reflective random indexing i.e. 230 | // whether the initial, random index/elemental vectors should represent 231 | // documents (columns) or terms (rows). 232 | // reflections is the number of additional training cycles to apply 233 | // to build the elemental vectors. 234 | // Specifying basis == DocBasedRRI and reflections == 0 is equivalent 235 | // to conventional Random Indexing. 
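//
// For illustration (the parameter values are arbitrary examples), a term
// based RRI transformer producing 300 dimensional context vectors with 2
// reflective training cycles and 1% non-zero elements in the elemental
// vectors could be constructed as:
//
// 	rri := NewReflectiveRandomIndexing(300, TermBasedRRI, 2, 0.01)
//
// whereas NewRandomIndexing(300, 0.01) configures conventional
// (non-reflective) Random Indexing with the same dimensionality and density.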
236 | func NewReflectiveRandomIndexing(k int, basis RRIBasis, reflections int, density float64) *RandomIndexing { 237 | return &RandomIndexing{ 238 | K: k, 239 | Type: basis, 240 | Reflections: reflections, 241 | Density: density, 242 | } 243 | } 244 | 245 | // PartialFit extends the model to take account of the specified matrix m. The 246 | // context vectors are learnt and stored to be used for furture transformations 247 | // and analysis. PartialFit performs Random Indexing even if the Transformer is 248 | // configured for Reflective Random Indexing so if RRI is required please train 249 | // using the Fit() method as a batch operation. Unlike the Fit() method, the 250 | // PartialFit() method is designed to be called multiple times to support online 251 | // and mini-batch learning whereas the Fit() method is only intended to be called 252 | // once for batch learning. 253 | func (r *RandomIndexing) PartialFit(m mat.Matrix) OnlineTransformer { 254 | rows, cols := m.Dims() 255 | 256 | if r.components == nil || r.components.(*sparse.CSR).IsZero() { 257 | r.components = sparse.NewCSR(r.K, rows, make([]int, r.K+1), []int{}, []float64{}) 258 | } 259 | current := r.components 260 | 261 | // Create transform in transpose to get better randomised sparsity patterns 262 | // when partial fitting with small mini-batches e.g. single column/streaming 263 | idxVecs := CreateRandomProjectionTransform(cols, r.K, r.Density, r.rnd).T() 264 | ctxVecs := r.contextualise(m.T(), idxVecs) 265 | 266 | current.(*sparse.CSR).Add(current, ctxVecs) 267 | r.components = current 268 | 269 | return r 270 | } 271 | 272 | // Components returns a t x k matrix where `t` is the number of terms 273 | // (rows) in the training data matrix. The rows in this matrix 274 | // are the `context` vectors for RI each one representing 275 | // a semantic representation of a term based upon the contexts 276 | // in which it has appeared within the training data. 277 | func (r *RandomIndexing) Components() mat.Matrix { 278 | return r.components.T() 279 | } 280 | 281 | // SetComponents sets a t x k matrix where `t` is the number of terms 282 | // (rows) in the training data matrix. 283 | func (r *RandomIndexing) SetComponents(m mat.Matrix) { 284 | r.components = m 285 | } 286 | 287 | // Fit trains the model, creating random index/elemental vectors to 288 | // be used to construct the new projected feature vectors ('context' 289 | // vectors) in the reduced semantic dimensional space. If configured for 290 | // Reflective Random Indexing then Fit may actually run multiple 291 | // training cycles as specified during construction. The Fit method 292 | // trains the model in batch mode so is intended to be called once, for 293 | // online/streaming or mini-batch training please consider the 294 | // PartialFit method instead. 295 | func (r *RandomIndexing) Fit(m mat.Matrix) Transformer { 296 | rows, cols := m.Dims() 297 | var idxVecs mat.Matrix 298 | 299 | if r.Type == TermBasedRRI { 300 | idxVecs = CreateRandomProjectionTransform(r.K, rows, r.Density, r.rnd) 301 | } else { 302 | idxVecs = CreateRandomProjectionTransform(r.K, cols, r.Density, r.rnd) 303 | idxVecs = r.contextualise(m.T(), idxVecs) 304 | } 305 | 306 | for i := 0; i < r.Reflections; i++ { 307 | idxVecs = r.contextualise(m, idxVecs) 308 | idxVecs = r.contextualise(m.T(), idxVecs) 309 | } 310 | 311 | r.components = idxVecs 312 | return r 313 | } 314 | 315 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 316 | // on the same matrix. 
This is a useful shortcut where separate training data is not being 317 | // used to fit the model i.e. the model is fitted on the fly to the test data. 318 | // The returned matrix is a sparse CSR format matrix of shape k x c. 319 | func (r *RandomIndexing) FitTransform(m mat.Matrix) (mat.Matrix, error) { 320 | return r.Fit(m).Transform(m) 321 | } 322 | 323 | // Transform applies the transform, projecting matrix m into the 324 | // lower dimensional semantic space. The output matrix will be of 325 | // shape k x c and will be a sparse CSR format matrix. The transformation 326 | // for each document vector is simply the accumulation of all trained context 327 | // vectors relating to terms appearing in the document. These are weighted by 328 | // the frequency the term appears in the document. 329 | func (r *RandomIndexing) Transform(m mat.Matrix) (mat.Matrix, error) { 330 | return r.contextualise(m, r.components), nil 331 | } 332 | 333 | // contextualise accumulates the vectors vectors for each column in matrix m weighting 334 | // each row vector in vectors by its corresponding value in column of the matrix 335 | func (r *RandomIndexing) contextualise(m mat.Matrix, vectors mat.Matrix) mat.Matrix { 336 | var product sparse.CSR 337 | 338 | product.Mul(vectors, m) 339 | 340 | return &product 341 | } 342 | 343 | // CreateRandomProjectionTransform returns a new random matrix for 344 | // Random Projections of shape newDims x origDims. The matrix will 345 | // be randomly populated using probability distributions where density 346 | // is used as the probability that each element will be populated. 347 | // Populated values will be randomly selected from [-1, 1] scaled 348 | // according to the density and dimensions of the matrix. If rnd is 349 | // nil then a new random number generator will be created and used. 350 | func CreateRandomProjectionTransform(newDims, origDims int, density float64, rnd *rand.Rand) mat.Matrix { 351 | if rnd == nil { 352 | rnd = rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) 353 | } 354 | // TODO Possibly return a mat.Dense instead of sparse.CSR if 355 | // density == 1 356 | 357 | var ptr int 358 | var ind []int 359 | indptr := make([]int, newDims+1) 360 | 361 | for i := 0; i < newDims; i++ { 362 | nnz := binomial(origDims, density, rnd) 363 | if nnz > 0 { 364 | idx := make([]int, nnz) 365 | sampleuv.WithoutReplacement(idx, origDims, rnd) 366 | //sort.Ints(idx) 367 | ind = append(ind, idx...) 
368 | ptr += nnz 369 | } 370 | indptr[i+1] = ptr 371 | } 372 | 373 | vals := make([]float64, len(ind)) 374 | values(vals, newDims, density, rnd) 375 | 376 | return sparse.NewCSR(newDims, origDims, indptr, ind, vals) 377 | } 378 | 379 | func binomial(n int, p float64, rnd *rand.Rand) int { 380 | dist := distuv.Bernoulli{ 381 | P: p, 382 | // Should this be Source (Gonum code and docs seem out of sync) 383 | Src: rnd, 384 | } 385 | 386 | var x int 387 | for i := 0; i < n; i++ { 388 | x += int(dist.Rand()) 389 | } 390 | return x 391 | } 392 | 393 | func values(idx []float64, dims int, density float64, rnd *rand.Rand) { 394 | dist := distuv.Bernoulli{ 395 | P: 0.5, 396 | // Should this be Source (Gonum code and docs seem out of sync) 397 | Src: rnd, 398 | } 399 | 400 | factor := math.Sqrt(1.0/density) / math.Sqrt(float64(dims)) 401 | for i := range idx { 402 | idx[i] = factor * (dist.Rand()*2 - 1) 403 | } 404 | } 405 | -------------------------------------------------------------------------------- /lda.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "math" 5 | "runtime" 6 | "sync" 7 | "time" 8 | 9 | "github.com/james-bowman/sparse" 10 | "golang.org/x/exp/rand" 11 | "gonum.org/v1/gonum/mat" 12 | ) 13 | 14 | // LearningSchedule is used to calculate the learning rate for each iteration using a natural 15 | // gradient descent algorithm. 16 | type LearningSchedule struct { 17 | // S is the scale of the step size for the learning rate. 18 | S float64 19 | 20 | // Tau is the learning offset. The learning offset downweights the 21 | // learning rate from early iterations. 22 | Tau float64 23 | 24 | // Kappa controls the learning decay. This is the amount the learning rate 25 | // reduces each iteration. This is typically a value between 0.5 and 1.0. 26 | Kappa float64 27 | } 28 | 29 | // Calc returns the learning rate for the specified iteration 30 | func (l LearningSchedule) Calc(iteration float64) float64 { 31 | return l.S / math.Pow(l.Tau+iteration, l.Kappa) 32 | } 33 | 34 | type ldaMiniBatch struct { 35 | start, end int 36 | nPhiHat []float64 37 | nZHat []float64 38 | gamma []float64 39 | } 40 | 41 | func newLdaMiniBatch(topics int, words int) *ldaMiniBatch { 42 | l := ldaMiniBatch{ 43 | nPhiHat: make([]float64, topics*words), 44 | nZHat: make([]float64, topics), 45 | gamma: make([]float64, topics), 46 | } 47 | return &l 48 | } 49 | 50 | func (l *ldaMiniBatch) reset() { 51 | for i := range l.nPhiHat { 52 | l.nPhiHat[i] = 0 53 | } 54 | for i := range l.nZHat { 55 | l.nZHat[i] = 0 56 | } 57 | // assume gamma does not need to be zeroed between mini batches 58 | } 59 | 60 | // LatentDirichletAllocation (LDA) for fast unsupervised topic extraction. LDA processes 61 | // documents and learns their latent topic model estimating the posterior document over topic 62 | // probability distribution (the probabilities of each document being allocated to each 63 | // topic) and the posterior topic over word probability distribution. 64 | // 65 | // This transformer uses a parallel implemention of the 66 | // SCVB0 (Stochastic Collapsed Variational Bayes) Algorithm (https://arxiv.org/pdf/1305.2452.pdf) 67 | // by Jimmy Foulds with optional `clumping` optimisations. 68 | type LatentDirichletAllocation struct { 69 | // Iterations is the maximum number of training iterations 70 | Iterations int 71 | 72 | // PerplexityTolerance is the tolerance of perplexity below which the Fit method will stop iterating 73 | // and complete. 
If the evaluated perplexity is is below the tolerance, fitting will terminate successfully 74 | // without necessarily completing all of the configured number of training iterations. 75 | PerplexityTolerance float64 76 | 77 | // PerplexityEvaluationFrquency is the frequency with which to test Perplexity against PerplexityTolerance inside 78 | // Fit. A value <= 0 will not evaluate Perplexity at all and simply iterate for `Iterations` iterations. 79 | PerplexityEvaluationFrequency int 80 | 81 | // BatchSize is the size of mini batches used during training 82 | BatchSize int 83 | 84 | // K is the number of topics 85 | K int 86 | 87 | // NumBurnInPasses is the number of `burn-in` passes across the documents in the 88 | // training data to learn the document statistics before we start collecting topic statistics. 89 | BurnInPasses int 90 | 91 | // TransformationPasses is the number of passes to transform new documents given a previously 92 | // fitted topic model 93 | TransformationPasses int 94 | 95 | // MeanChangeTolerance is the tolerance of change to Theta between burn in passes. 96 | // If the level of change between passes is below the tolerance, the burn in will complete 97 | // without necessarily completing the configured number of passes. 98 | MeanChangeTolerance float64 99 | 100 | // ChangeEvaluationFrequency is the frequency with which to test Perplexity against 101 | // MeanChangeTolerance during burn-in and transformation. A value <= 0 will not evaluate 102 | // the mean change at all and simply iterate for `BurnInPasses` iterations. 103 | ChangeEvaluationFrequency int 104 | 105 | // Alpha is the prior of theta (the documents over topics distribution) 106 | Alpha float64 107 | 108 | // Eta is the prior of phi (the topics over words distribution) 109 | Eta float64 110 | 111 | // RhoPhi is the learning rate for phi (the topics over words distribution) 112 | RhoPhi LearningSchedule 113 | 114 | // RhoTheta is the learning rate for theta (the documents over topics distribution) 115 | RhoTheta LearningSchedule 116 | 117 | rhoPhiT float64 118 | rhoThetaT float64 119 | 120 | wordsInCorpus float64 121 | w, d int 122 | 123 | // Rnd is the random number generator used to generate the initial distributions 124 | // for nTheta (the document over topic distribution), nPhi (the topic over word 125 | // distribution) and nZ (the topic assignments). 126 | Rnd *rand.Rand 127 | 128 | // mutexes for updating global topic statistics 129 | phiMutex sync.Mutex 130 | zMutex sync.Mutex 131 | 132 | // Processes is the degree of parallelisation, or more specifically, the number of 133 | // concurrent go routines to use during fitting. 134 | Processes int 135 | 136 | // nPhi is the topics over words distribution 137 | nPhi []float64 138 | 139 | // nZ is the topic assignments 140 | nZ []float64 141 | } 142 | 143 | // NewLatentDirichletAllocation returns a new LatentDirichletAllocation type initialised 144 | // with default values for k topics. 145 | func NewLatentDirichletAllocation(k int) *LatentDirichletAllocation { 146 | // TODO: 147 | // - Add FitPartial (and FitPartialTransform?) 
methods 148 | // - refactor word counting 149 | // - rename and check rhoTheta_t and rhoPhi_t 150 | // - Check visibilitiy of member variables 151 | // - Try parallelising: 152 | // - minibatches 153 | // - individual docs within minibatches 154 | // - M step 155 | // - other areas 156 | // - investigate whetehr can combine/consolidate fitMiniBatch and burnIn 157 | // - Check whether nPhi could be sparse 158 | // - Add persistence methods 159 | 160 | l := LatentDirichletAllocation{ 161 | Iterations: 1000, 162 | PerplexityTolerance: 1e-2, 163 | PerplexityEvaluationFrequency: 30, 164 | BatchSize: 100, 165 | K: k, 166 | BurnInPasses: 1, 167 | TransformationPasses: 500, 168 | MeanChangeTolerance: 1e-5, 169 | ChangeEvaluationFrequency: 30, 170 | Alpha: 0.1, 171 | Eta: 0.01, 172 | RhoPhi: LearningSchedule{ 173 | S: 10, 174 | Tau: 1000, 175 | Kappa: 0.9, 176 | }, 177 | RhoTheta: LearningSchedule{ 178 | S: 1, 179 | Tau: 10, 180 | Kappa: 0.9, 181 | }, 182 | rhoPhiT: 1, 183 | rhoThetaT: 1, 184 | Rnd: rand.New(rand.NewSource(uint64(time.Now().UnixNano()))), 185 | Processes: runtime.GOMAXPROCS(0), 186 | } 187 | 188 | return &l 189 | } 190 | 191 | // init initialises model for fitting allocating memory for distributions and 192 | // randomising initial values. 193 | func (l *LatentDirichletAllocation) init(m mat.Matrix) { 194 | r, c := m.Dims() 195 | l.w, l.d = r, c 196 | l.nPhi = make([]float64, l.K*r) 197 | l.nZ = make([]float64, l.K) 198 | var v float64 199 | for i := 0; i < r; i++ { 200 | for k := 0; k < l.K; k++ { 201 | v = float64((l.Rnd.Int() % (r * l.K))) / float64(r*l.K) 202 | l.nPhi[i*l.K+k] = v 203 | l.nZ[k] += v 204 | } 205 | } 206 | } 207 | 208 | // Fit fits the model to the specified matrix m. The latent topics, and probability 209 | // distribution of topics over words, are learnt and stored to be used for furture transformations 210 | // and analysis. 211 | func (l *LatentDirichletAllocation) Fit(m mat.Matrix) Transformer { 212 | l.FitTransform(m) 213 | return l 214 | } 215 | 216 | // burnInDoc calculates document statistics as part of fitting and transforming new 217 | // documents 218 | func (l *LatentDirichletAllocation) burnInDoc(j int, iterations int, m mat.Matrix, wc float64, gamma *[]float64, nTheta []float64) { 219 | var rhoTheta float64 220 | var sum, prevSum float64 221 | var thetaInd int 222 | 223 | for counter := 1; counter <= iterations; counter++ { 224 | if l.ChangeEvaluationFrequency > 0 && counter%l.ChangeEvaluationFrequency == 0 && 1 < iterations { 225 | // take a copy of current column j 226 | prevSum = 0 227 | for k := 0; k < l.K; k++ { 228 | prevSum += nTheta[j*l.K+k] 229 | } 230 | } 231 | rhoTheta = l.RhoTheta.Calc(l.rhoThetaT + float64(counter)) 232 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 233 | var gammaSum float64 234 | for k := 0; k < l.K; k++ { 235 | // Eqn. 5. 236 | (*gamma)[k] = ((l.nPhi[i*l.K+k] + l.Eta) * (nTheta[j*l.K+k] + l.Alpha) / (l.nZ[k] + l.Eta*float64(l.w))) 237 | gammaSum += (*gamma)[k] 238 | } 239 | 240 | for k := 0; k < l.K; k++ { 241 | (*gamma)[k] /= gammaSum 242 | } 243 | 244 | for k := 0; k < l.K; k++ { 245 | // Eqn. 9. 
246 | thetaInd = j*l.K + k 247 | nTheta[thetaInd] = ((math.Pow((1.0-rhoTheta), v) * nTheta[thetaInd]) + 248 | ((1 - math.Pow((1.0-rhoTheta), v)) * wc * (*gamma)[k])) 249 | } 250 | }) 251 | if l.ChangeEvaluationFrequency > 0 && counter%l.ChangeEvaluationFrequency == 0 && counter < iterations { 252 | sum = 0 253 | for k := 0; k < l.K; k++ { 254 | sum += nTheta[j*l.K+k] 255 | } 256 | if math.Abs(sum-prevSum)/float64(l.K) < l.MeanChangeTolerance { 257 | break 258 | } 259 | } 260 | } 261 | } 262 | 263 | // fitMiniBatch fits a proportion of the matrix as specified by miniBatch. The 264 | // algorithm is stochastic and so estimates across the minibatch and then applies those 265 | // estimates to the global statistics. 266 | func (l *LatentDirichletAllocation) fitMiniBatch(miniBatch *ldaMiniBatch, wc []float64, nTheta []float64, m mat.Matrix) { 267 | var rhoTheta float64 268 | batchSize := miniBatch.end - miniBatch.start 269 | var phiInd, thetaInd int 270 | 271 | for j := miniBatch.start; j < miniBatch.end; j++ { 272 | l.burnInDoc(j, l.BurnInPasses, m, wc[j], &miniBatch.gamma, nTheta) 273 | 274 | rhoTheta = l.RhoTheta.Calc(l.rhoThetaT + float64(l.BurnInPasses)) 275 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 276 | var gammaSum float64 277 | for k := 0; k < l.K; k++ { 278 | // Eqn. 5. 279 | miniBatch.gamma[k] = ((l.nPhi[i*l.K+k] + l.Eta) * (nTheta[j*l.K+k] + l.Alpha) / (l.nZ[k] + l.Eta*float64(l.w))) 280 | gammaSum += miniBatch.gamma[k] 281 | } 282 | for k := 0; k < l.K; k++ { 283 | miniBatch.gamma[k] /= gammaSum 284 | } 285 | 286 | for k := 0; k < l.K; k++ { 287 | // Eqn. 9. 288 | thetaInd = j*l.K + k 289 | nTheta[thetaInd] = ((math.Pow((1.0-rhoTheta), v) * nTheta[thetaInd]) + 290 | ((1 - math.Pow((1.0-rhoTheta), v)) * wc[j] * miniBatch.gamma[k])) 291 | 292 | // calculate sufficient stats 293 | nv := l.wordsInCorpus * miniBatch.gamma[k] / float64(batchSize) 294 | miniBatch.nPhiHat[i*l.K+k] += nv 295 | miniBatch.nZHat[k] += nv 296 | } 297 | }) 298 | } 299 | rhoPhi := l.RhoPhi.Calc(l.rhoPhiT) 300 | l.rhoPhiT++ 301 | 302 | // Eqn. 7. 303 | l.phiMutex.Lock() 304 | for w := 0; w < l.w; w++ { 305 | for k := 0; k < l.K; k++ { 306 | phiInd = w*l.K + k 307 | l.nPhi[phiInd] = ((1.0 - rhoPhi) * l.nPhi[phiInd]) + (rhoPhi * miniBatch.nPhiHat[phiInd]) 308 | } 309 | } 310 | l.phiMutex.Unlock() 311 | 312 | // Eqn. 8. 313 | l.zMutex.Lock() 314 | for k := 0; k < l.K; k++ { 315 | l.nZ[k] = ((1.0 - rhoPhi) * l.nZ[k]) + (rhoPhi * miniBatch.nZHat[k]) 316 | } 317 | l.zMutex.Unlock() 318 | } 319 | 320 | // normaliseTheta normalises theta to derive the posterior probability estimates for 321 | // documents over topics. All values for each document are divided by the sum of all 322 | // values for the document. 323 | func (l *LatentDirichletAllocation) normaliseTheta(theta []float64, result []float64) []float64 { 324 | //adjustment := l.Alpha 325 | adjustment := 0.0 326 | c := len(theta) / l.K 327 | if result == nil { 328 | result = make([]float64, l.K*c) 329 | } 330 | for j := 0; j < c; j++ { 331 | var sum float64 332 | for k := 0; k < l.K; k++ { 333 | sum += theta[j*l.K+k] + adjustment 334 | } 335 | for k := 0; k < l.K; k++ { 336 | result[j*l.K+k] = (theta[j*l.K+k] + adjustment) / sum 337 | } 338 | } 339 | return result 340 | } 341 | 342 | // normalisePhi normalises phi to derive the posterior probability estimates for 343 | // topics over words. All values for each topic are divided by the sum of all values 344 | // for the topic. 
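//
// As an illustration with made up numbers: if topic k has un-normalised
// weights of 2, 3 and 5 across a three word vocabulary, the sum for that
// topic is 10 and the normalised probabilities become 0.2, 0.3 and 0.5,
// which sum to 1.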
345 | func (l *LatentDirichletAllocation) normalisePhi(phi []float64, result []float64) []float64 { 346 | //adjustment := l.Eta 347 | adjustment := 0.0 348 | if result == nil { 349 | result = make([]float64, l.K*l.w) 350 | } 351 | sum := make([]float64, l.K) 352 | for i := 0; i < l.w; i++ { 353 | for k := 0; k < l.K; k++ { 354 | sum[k] += phi[i*l.K+k] + adjustment 355 | } 356 | } 357 | for i := 0; i < l.w; i++ { 358 | for k := 0; k < l.K; k++ { 359 | result[i*l.K+k] = (phi[i*l.K+k] + adjustment) / sum[k] 360 | } 361 | } 362 | return result 363 | } 364 | 365 | // Perplexity calculates the perplexity of the matrix m against the trained model. 366 | // m is first transformed into corresponding posterior estimates for document over topic 367 | // distributions and then used to calculate the perplexity. 368 | func (l *LatentDirichletAllocation) Perplexity(m mat.Matrix) float64 { 369 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv { 370 | m = t.ToCSC() 371 | } 372 | var wordCount float64 373 | r, c := m.Dims() 374 | 375 | if s, isSparse := m.(sparse.Sparser); isSparse { 376 | s.DoNonZero(func(i, j int, v float64) { 377 | wordCount += v 378 | }) 379 | } else { 380 | for i := 0; i < r; i++ { 381 | for j := 0; j < c; j++ { 382 | wordCount += m.At(i, j) 383 | } 384 | } 385 | } 386 | 387 | theta := l.unNormalisedTransform(m) 388 | return l.perplexity(m, wordCount, l.normaliseTheta(theta, theta), l.normalisePhi(l.nPhi, nil)) 389 | } 390 | 391 | // perplexity returns the perplexity of the matrix against the model. 392 | func (l *LatentDirichletAllocation) perplexity(m mat.Matrix, sum float64, nTheta []float64, nPhi []float64) float64 { 393 | _, c := m.Dims() 394 | var perplexity float64 395 | var ttlLogWordProb float64 396 | 397 | for j := 0; j < c; j++ { 398 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 399 | var dot float64 400 | for k := 0; k < l.K; k++ { 401 | dot += nPhi[i*l.K+k] * nTheta[j*l.K+k] 402 | } 403 | ttlLogWordProb += math.Log2(dot) * v 404 | }) 405 | } 406 | perplexity = math.Exp2(-ttlLogWordProb / sum) 407 | return perplexity 408 | } 409 | 410 | // Components returns the topic over words probability distribution. The returned 411 | // matrix is of dimensions K x W where w was the number of rows in the training matrix 412 | // and each column represents a unique words in the vocabulary and K is the number of 413 | // topics. 414 | func (l *LatentDirichletAllocation) Components() mat.Matrix { 415 | return mat.DenseCopyOf(mat.NewDense(l.w, l.K, l.normalisePhi(l.nPhi, nil)).T()) 416 | } 417 | 418 | // unNormalisedTransform performs an unNormalisedTransform - the output 419 | // needs to be normalised using normaliseTheta before use. 420 | func (l *LatentDirichletAllocation) unNormalisedTransform(m mat.Matrix) []float64 { 421 | _, c := m.Dims() 422 | theta := make([]float64, l.K*c) 423 | for i := range theta { 424 | //data[i] = rnd.Float64() + 0.5 425 | theta[i] = float64((l.Rnd.Int() % (c * l.K))) / float64(c*l.K) 426 | } 427 | gamma := make([]float64, l.K) 428 | 429 | for j := 0; j < c; j++ { 430 | var wc float64 431 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 432 | wc += v 433 | }) 434 | l.burnInDoc(j, l.TransformationPasses, m, wc, &gamma, theta) 435 | } 436 | return theta 437 | } 438 | 439 | // Transform transforms the input matrix into a matrix representing the distribution 440 | // of the documents over topics. 
441 | // THe returned matrix contains the document over topic distributions where each element 442 | // is the probability of the corresponding document being related to the corresponding 443 | // topic. The returned matrix is a Dense matrix of shape K x C where K is the number 444 | // of topics and C is the number of columns in the input matrix (representing the 445 | // documents). 446 | func (l *LatentDirichletAllocation) Transform(m mat.Matrix) (mat.Matrix, error) { 447 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv { 448 | m = t.ToCSC() 449 | } 450 | _, c := m.Dims() 451 | theta := l.unNormalisedTransform(m) 452 | return mat.DenseCopyOf(mat.NewDense(c, l.K, l.normaliseTheta(theta, theta)).T()), nil 453 | } 454 | 455 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 456 | // on the same matrix. This is a useful shortcut where separate training data is not being 457 | // used to fit the model i.e. the model is fitted on the fly to the test data. 458 | // THe returned matrix contains the document over topic distributions where each element 459 | // is the probability of the corresponding document being related to the corresponding 460 | // topic. The returned matrix is a Dense matrix of shape K x C where K is the number 461 | // of topics and C is the number of columns in the input matrix (representing the 462 | // documents). 463 | func (l *LatentDirichletAllocation) FitTransform(m mat.Matrix) (mat.Matrix, error) { 464 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv { 465 | m = t.ToCSC() 466 | } 467 | 468 | l.init(m) 469 | 470 | _, c := m.Dims() 471 | 472 | nTheta := make([]float64, l.K*c) 473 | for i := 0; i < l.K*c; i++ { 474 | nTheta[i] = float64((l.Rnd.Int() % (c * l.K))) / float64(c*l.K) 475 | } 476 | wc := make([]float64, c) 477 | for j := 0; j < c; j++ { 478 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 479 | wc[j] += v 480 | }) 481 | l.wordsInCorpus += wc[j] 482 | } 483 | 484 | var phiProb []float64 485 | var thetaProb []float64 486 | 487 | numMiniBatches := int(math.Ceil(float64(c) / float64(l.BatchSize))) 488 | processes := l.Processes 489 | if numMiniBatches < l.Processes { 490 | processes = numMiniBatches 491 | } 492 | miniBatches := make([]*ldaMiniBatch, processes) 493 | for i := range miniBatches { 494 | miniBatches[i] = newLdaMiniBatch(l.K, l.w) 495 | } 496 | 497 | l.rhoPhiT = 1 498 | var perplexity float64 499 | var prevPerplexity float64 500 | 501 | for it := 0; it < l.Iterations; it++ { 502 | l.rhoThetaT++ 503 | 504 | mb := make(chan int) 505 | var wg sync.WaitGroup 506 | 507 | for process := 0; process < processes; process++ { 508 | wg.Add(1) 509 | go func(miniBatch *ldaMiniBatch) { 510 | defer wg.Done() 511 | for j := range mb { 512 | miniBatch.reset() 513 | miniBatch.start = j * l.BatchSize 514 | if j < numMiniBatches-1 { 515 | miniBatch.end = miniBatch.start + l.BatchSize 516 | } else { 517 | miniBatch.end = c 518 | } 519 | l.fitMiniBatch(miniBatch, wc, nTheta, m) 520 | } 521 | }(miniBatches[process]) 522 | } 523 | 524 | for j := 0; j < numMiniBatches; j++ { 525 | mb <- j 526 | } 527 | close(mb) 528 | wg.Wait() 529 | 530 | if l.PerplexityEvaluationFrequency > 0 && (it+1)%l.PerplexityEvaluationFrequency == 0 { 531 | phiProb = l.normalisePhi(l.nPhi, phiProb) 532 | thetaProb = l.normaliseTheta(nTheta, thetaProb) 533 | perplexity = l.perplexity(m, l.wordsInCorpus, thetaProb, phiProb) 534 | 535 | if prevPerplexity != 0 && math.Abs(prevPerplexity-perplexity) < l.PerplexityTolerance { 536 | break 537 | } 
538 | prevPerplexity = perplexity 539 | } 540 | } 541 | return mat.DenseCopyOf(mat.NewDense(c, l.K, l.normaliseTheta(nTheta, thetaProb)).T()), nil 542 | } 543 | --------------------------------------------------------------------------------