├── Gophers.008.crop.png ├── GOPHER ├── .travis.yml ├── LICENSE ├── utils.go ├── hashing.go ├── doc.go ├── dimreduction_test.go ├── measures └── pairwise │ └── comparisons.go ├── index_test.go ├── weightings_test.go ├── weightings.go ├── example_test.go ├── vectorisers_test.go ├── README.md ├── dimreduction.go ├── index.go ├── lsh.go ├── lda_test.go ├── randomprojection_test.go ├── vectorisers.go ├── randomprojection.go └── lda.go /Gophers.008.crop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/james-bowman/nlp/HEAD/Gophers.008.crop.png -------------------------------------------------------------------------------- /GOPHER: -------------------------------------------------------------------------------- 1 | The Go gopher was designed by Renee French and is licensed under the Creative Commons Attributions 3.0. 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.13.x 5 | - 1.14.x 6 | - tip 7 | 8 | before_install: 9 | - go get -t -v ./... 10 | 11 | script: 12 | - go test -coverprofile=coverage.txt -covermode=atomic 13 | 14 | after_success: 15 | - bash <(curl -s https://codecov.io/bash) 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 James Bowman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "github.com/james-bowman/sparse" 5 | "gonum.org/v1/gonum/mat" 6 | ) 7 | 8 | // ColDo executes fn for each column j in m. If the matrix implements the mat.ColViewer 9 | // interface then this interface will be used to iterate over the column vectors more 10 | // efficiently. If the matrix implements the sparse.TypeConverter interface then the 11 | // matrix will be converted to a CSC matrix (which implements the mat.ColViewer 12 | // interface) so that it can benefit from the same optimisation. 
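//
// A minimal usage sketch, assuming tdm is a term document matrix (any mat.Matrix)
// built elsewhere with one of the package's Vectorisers:
//
//	ColDo(tdm, func(j int, doc mat.Vector) {
//		fmt.Printf("document %d has %d rows (terms)\n", j, doc.Len())
//	})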
13 | func ColDo(m mat.Matrix, fn func(j int, vec mat.Vector)) { 14 | if v, isOk := m.(mat.Vector); isOk { 15 | fn(0, v) 16 | return 17 | } 18 | 19 | if cv, isOk := m.(mat.ColViewer); isOk { 20 | _, c := m.Dims() 21 | for j := 0; j < c; j++ { 22 | fn(j, cv.ColView(j)) 23 | } 24 | return 25 | } 26 | 27 | if sv, isOk := m.(sparse.TypeConverter); isOk { 28 | csc := sv.ToCSC() 29 | _, c := csc.Dims() 30 | for j := 0; j < c; j++ { 31 | fn(j, csc.ColView(j)) 32 | } 33 | return 34 | } 35 | 36 | r, c := m.Dims() 37 | for j := 0; j < c; j++ { 38 | fn(j, mat.NewVecDense(r, mat.Col(nil, j, m))) 39 | } 40 | } 41 | 42 | // ColNonZeroElemDo executes fn for each non-zero element in column j of matrix m. 43 | // If m implements mat.ColNonZeroDoer then this interface will be used to perform 44 | // the iteration. 45 | func ColNonZeroElemDo(m mat.Matrix, j int, fn func(i, j int, v float64)) { 46 | colNonZeroDoer, isSparse := m.(mat.ColNonZeroDoer) 47 | r, _ := m.Dims() 48 | 49 | if isSparse { 50 | colNonZeroDoer.DoColNonZero(j, fn) 51 | } else { 52 | for i := 0; i < r; i++ { 53 | v := m.At(i, j) 54 | if v != 0 { 55 | fn(i, j, v) 56 | } 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /hashing.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "math/rand" 5 | 6 | "github.com/james-bowman/sparse" 7 | "gonum.org/v1/gonum/mat" 8 | ) 9 | 10 | // SimHash implements the SimHash Locality Sensitive Hashing (LSH) algorithm for 11 | // angular distance using sign random projections based on the work of Moses S. Charikar. 12 | // The distance between the original vectors is preserved through the hashing process 13 | // such that hashed vectors can be compared using Hamming Similarity for a faster, 14 | // more space efficient, approximation of Cosine Similarity for the original vectors. 15 | // 16 | // Charikar, Moses S. "Similarity Estimation Techniques from Rounding Algorithms" 17 | // in Proceedings of the thiry-fourth annual ACM symposium on Theory of computing - 18 | // STOC ’02, 2002, p. 380. 19 | // https://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CharikarEstim.pdf 20 | type SimHash struct { 21 | hyperplanes []*mat.VecDense 22 | } 23 | 24 | // NewSimHash constructs a new SimHash creating a set of locality sensitive 25 | // hash functions which are combined to accept input vectors of length dim 26 | // and produce hashed binary vector fingerprints of length bits. This method 27 | // creates a series of random hyperplanes which are then compared to each 28 | // input vector to produce the output hashed binary vector encoding the input 29 | // vector's location in vector space relative to the hyperplanes. Each bit in 30 | // the output vector corresponds to the sign (1/0 for +/-) of the result of 31 | // the dot product comparison with each random hyperplane. 32 | func NewSimHash(bits int, dim int) *SimHash { 33 | // Generate random hyperplanes 34 | hyperplanes := make([]*mat.VecDense, bits) 35 | 36 | for j := 0; j < bits; j++ { 37 | p := make([]float64, dim) 38 | for i := 0; i < dim; i++ { 39 | p[i] = rand.NormFloat64() 40 | } 41 | hyperplanes[j] = mat.NewVecDense(dim, p) 42 | } 43 | return &SimHash{hyperplanes: hyperplanes} 44 | } 45 | 46 | // Hash accepts a Vector and outputs a BinaryVec (which also implements the 47 | // Gonum Vector interface). 
This method will panic if the input vector is of a 48 | // different length than the dim parameter used when constructing the SimHash. 49 | func (h *SimHash) Hash(v mat.Vector) *sparse.BinaryVec { 50 | bits := len(h.hyperplanes) 51 | dim := h.hyperplanes[0].Len() 52 | if dim != v.Len() { 53 | panic("The supplied vector has a different number of dimensions from the projected hyperplanes") 54 | } 55 | sig := sparse.NewBinaryVec(bits) 56 | for i := 0; i < bits; i++ { 57 | if sparse.Dot(v, h.hyperplanes[i]) >= 0 { 58 | sig.SetBit(i) 59 | } 60 | } 61 | return sig 62 | } 63 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package nlp provides implementations of selected machine learning algorithms for natural language processing of text corpora. The primary focus is the statistical semantics of plain-text documents supporting semantic analysis and retrieval of semantically similar documents. 3 | 4 | The package makes use of the Gonum (https://www.gonum.org/) library for linear algebra and scientific computing with some inspiration taken from Python's scikit-learn (http://scikit-learn.org/stable/) and Gensim (https://radimrehurek.com/gensim/). 5 | 6 | Overview 7 | 8 | The primary intended use case is to support document input as text strings encoded as a matrix of numerical feature vectors called a `term document matrix`. Each column in the matrix corresponds to a document in the corpus and each row corresponds to a unique term occurring in the corpus. The individual elements within the matrix contain the frequency with which each term occurs within each document (referred to as `term frequency`). Whilst textual data from document corpora are the primary intended use case, the algorithms can be used with other types of data from other sources once encoded (vectorised) into a suitable matrix e.g. image data, sound data, users/products, etc. 9 | 10 | These matrices can be processed and manipulated through the application of additional transformations for weighting features, identifying relationships or optimising the data for analysis, information retrieval and/or predictions. 11 | 12 | Typically the algorithms in this package implement one of three primary interfaces: 13 | 14 | Vectoriser - Takes document input as strings and outputs matrices of numerical features e.g. term frequency. 15 | Transformer - Takes matrices of numerical features and applies some logic/transformation to output a new matrix. 16 | Comparer - Functions taking two vectors (columns from a matrix) and outputting a distance/similarity measure. 17 | 18 | One of the implementations of Vectoriser is Pipeline which can be used to wire together pipelines composed of a Vectoriser and one or more Transformers arranged in serial so that the output from each stage forms the input of the next. This can be used to construct a classic LSI (Latent Semantic Indexing) pipeline (vectoriser -> TF.IDF weighting -> Truncated SVD): 19 | 20 | pipeline := nlp.NewPipeline( 21 | nlp.NewCountVectoriser(), 22 | nlp.NewTfidfTransformer(), 23 | nlp.NewTruncatedSVD(100), 24 | ) 25 | 26 | Whilst they take different inputs, both Vectorisers and Transformers have 3 primary methods: 27 | 28 | Fit() - Trains the model based upon the supplied input training data. 29 | Transform() - Transforms the input into the output matrix (requires the model to be already fitted by a previous call to Fit() or FitTransform()).
30 | FitTransform() - Convenience method combining Fit() and Transform() methods to transform input data, fitting the model to the input data in the process. 31 | */ 32 | package nlp 33 | -------------------------------------------------------------------------------- /dimreduction_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "gonum.org/v1/gonum/mat" 8 | ) 9 | 10 | func TestTruncatedSVDFitTransform(t *testing.T) { 11 | var tests = []struct { 12 | m int 13 | n int 14 | input []float64 15 | k int 16 | r int 17 | c int 18 | result []float64 19 | }{ 20 | { 21 | m: 6, n: 4, 22 | input: []float64{ 23 | 1, 3, 5, 2, 24 | 8, 1, 0, 0, 25 | 2, 1, 0, 1, 26 | 0, 0, 0, 0, 27 | 0, 0, 0, 1, 28 | 0, 1, 0, 0, 29 | }, 30 | k: 2, 31 | r: 2, c: 4, 32 | result: []float64{ 33 | -8.090, -2.212, -1.695, -0.955, 34 | 1.888, -2.524, -4.649, -1.930, 35 | }, 36 | }, 37 | } 38 | 39 | for _, test := range tests { 40 | transformer := NewTruncatedSVD(test.k) 41 | input := mat.NewDense(test.m, test.n, test.input) 42 | expResult := mat.NewDense(test.r, test.c, test.result) 43 | 44 | result, err := transformer.FitTransform(input) 45 | 46 | if err != nil { 47 | t.Errorf("Failed Truncated SVD transform caused by %v", err) 48 | } 49 | 50 | if !mat.EqualApprox(expResult, result, 0.01) { 51 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n", 52 | mat.Formatted(expResult), 53 | mat.Formatted(result)) 54 | t.Fail() 55 | } 56 | 57 | result2, err := transformer.Transform(input) 58 | 59 | if err != nil { 60 | t.Errorf("Failed Truncated SVD transform caused by %v", err) 61 | } 62 | 63 | if !mat.EqualApprox(result, result2, 0.001) { 64 | t.Logf("First matrix: \n%v\n but second matrix: \n%v\n", 65 | mat.Formatted(result), 66 | mat.Formatted(result2)) 67 | t.Fail() 68 | } 69 | } 70 | } 71 | 72 | func TestPCAFitTransform(t *testing.T) { 73 | var tests = []struct { 74 | m int 75 | n int 76 | input []float64 77 | k int 78 | r int 79 | c int 80 | result []float64 81 | }{ 82 | { 83 | m: 6, n: 4, 84 | input: []float64{ 85 | 1, 3, 5, 2, 86 | 8, 1, 0, 0, 87 | 2, 1, 0, 1, 88 | 0, 0, 0, 0, 89 | 0, 0, 0, 1, 90 | 0, 1, 0, 0, 91 | }, 92 | k: 2, 93 | r: 2, c: 4, 94 | result: []float64{ 95 | -7.478, -0.128, 1.591, 0.496, 96 | 2.937, 2.581, 4.240, 1.110, 97 | }, 98 | }, 99 | } 100 | 101 | for _, test := range tests { 102 | transformer := NewPCA(test.k) 103 | input := mat.NewDense(test.m, test.n, test.input) 104 | expResult := mat.NewDense(test.r, test.c, test.result) 105 | 106 | result, err := transformer.FitTransform(input) 107 | 108 | if err != nil { 109 | t.Errorf("Failed Truncated SVD transform caused by %v", err) 110 | } 111 | 112 | if !mat.EqualApprox(expResult, result, 0.01) { 113 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n", 114 | mat.Formatted(expResult), 115 | mat.Formatted(result)) 116 | t.Fail() 117 | } 118 | 119 | result2, err := transformer.Transform(input) 120 | 121 | if err != nil { 122 | t.Errorf("Failed Truncated SVD transform caused by %v", err) 123 | } 124 | 125 | if !mat.EqualApprox(result, result2, 0.001) { 126 | t.Logf("First matrix: \n%v\n but second matrix: \n%v\n", 127 | mat.Formatted(result), 128 | mat.Formatted(result2)) 129 | t.Fail() 130 | } 131 | } 132 | } 133 | 134 | func TestTruncatedSVDSaveLoad(t *testing.T) { 135 | var transforms = []struct { 136 | wanted *TruncatedSVD 137 | }{ 138 | { 139 | wanted: &TruncatedSVD{ 140 | Components: mat.NewDense(4, 2, []float64{ 141 | 1, 5, 142 | 3, 2, 143 | 9, 0, 144 
| 8, 4, 145 | }), 146 | K: 2, 147 | }, 148 | }, 149 | } 150 | 151 | for ti, test := range transforms { 152 | t.Logf("**** TestTruncatedSVDSaveLoad - Test Run %d.\n", ti+1) 153 | 154 | buf := new(bytes.Buffer) 155 | if err := test.wanted.Save(buf); err != nil { 156 | t.Errorf("Error encoding: %v\n", err) 157 | continue 158 | } 159 | 160 | var b TruncatedSVD 161 | if err := b.Load(buf); err != nil { 162 | t.Errorf("Error unencoding: %v\n", err) 163 | continue 164 | } 165 | 166 | if !mat.Equal(test.wanted.Components, b.Components) { 167 | t.Logf("Components mismatch: Wanted %v but got %v\n", mat.Formatted(test.wanted.Components), mat.Formatted(b.Components)) 168 | t.Fail() 169 | } 170 | if test.wanted.K != b.K { 171 | t.Logf("K value mismatch: Wanted %d but got %d\n", test.wanted.K, b.K) 172 | t.Fail() 173 | } 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /measures/pairwise/comparisons.go: -------------------------------------------------------------------------------- 1 | package pairwise 2 | 3 | import ( 4 | "math" 5 | 6 | "github.com/james-bowman/sparse" 7 | "gonum.org/v1/gonum/mat" 8 | ) 9 | 10 | // Comparer is a type of function that compares two mat.Vector types and 11 | // returns a value indicating how similar they are. 12 | type Comparer func(a, b mat.Vector) float64 13 | 14 | // CosineSimilarity calculates the cosine of the angles of 2 vectors i.e. how 15 | // similar they are. Possible values range up to 1 (exact match). NaN will be 16 | // returned if either vector is zero length or contains only 0s. 17 | func CosineSimilarity(a, b mat.Vector) float64 { 18 | // Cosine angle between two vectors is equal to their dot product divided by 19 | // the product of their L2 norms 20 | dotProduct := sparse.Dot(a, b) 21 | norma := sparse.Norm(a, 2.0) 22 | normb := sparse.Norm(b, 2.0) 23 | 24 | if norma == 0 || normb == 0 { 25 | return math.NaN() 26 | } 27 | 28 | return (dotProduct / (norma * normb)) 29 | } 30 | 31 | // CosineDistance is the complement of CosineSimilarity in the positive space. 32 | // CosineDistance = 1.0 - CosineSimilariy 33 | // It should be noted that CosineDistance is not strictly a valid distance measure 34 | // as it does not obey triangular inequality. For applications requiring a distance 35 | // measure that conforms with the strict definition then AngularDistance or 36 | // Euclidean distance (with all vectors L2 normalised first) should be used instead. 37 | // Whilst these distance measures may give different values, they will rank the same 38 | // as CosineDistance. 39 | func CosineDistance(a, b mat.Vector) float64 { 40 | return 1.0 - CosineSimilarity(a, b) 41 | } 42 | 43 | // AngularDistance is a distance measure closely related to CosineSimilarity. 44 | // It measures the difference between the angles of 2 vectors by taking 45 | // the inverse cosine (acos) of the CosineSimilarity and dividing by Pi. 46 | // Unlike CosineSimilarity, this distance measure is a valid distance measure 47 | // as it obeys triangular inequality. 48 | // See https://en.wikipedia.org/wiki/Cosine_similarity#Angular_distance_and_similarity 49 | func AngularDistance(a, b mat.Vector) float64 { 50 | cos := CosineSimilarity(a, b) 51 | if cos > 1 { 52 | cos = 1.0 53 | } 54 | theta := math.Acos(cos) 55 | return theta / math.Pi 56 | } 57 | 58 | // AngularSimilarity is the inverse of AngularDistance. 
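// Like AngularDistance, it is bounded between 0 and 1: for example, orthogonal vectors (CosineSimilarity 0) have an AngularSimilarity of 0.5, while identical vectors score 1.0.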
59 | // AngularSimilarity = 1.0 - AngularDistance 60 | func AngularSimilarity(a, b mat.Vector) float64 { 61 | return 1.0 - AngularDistance(a, b) 62 | } 63 | 64 | // HammingDistance is a distance measure sometimes referred to as the 65 | // `Matching Distance` and measures how different the 2 vectors are 66 | // in terms of the number of non-matching elements. This measurement 67 | // is normalised to provide the distance as proportional to the total 68 | // number of elements in the vectors. If a and b are not the same 69 | // shape then the function will panic. 70 | func HammingDistance(a, b mat.Vector) float64 { 71 | ba, aok := a.(*sparse.BinaryVec) 72 | bb, bok := b.(*sparse.BinaryVec) 73 | 74 | if aok && bok { 75 | return float64(ba.DistanceFrom(bb)) / float64(ba.Len()) 76 | } 77 | 78 | var count float64 79 | for i := 0; i < a.Len(); i++ { 80 | if a.AtVec(i) != b.AtVec(i) { 81 | count++ 82 | } 83 | } 84 | return count / float64(a.Len()) 85 | } 86 | 87 | // HammingSimilarity is the inverse of HammingDistance (1-HammingDistance) 88 | // and represents the proportion of elements within the 2 vectors that 89 | // exactly match. 90 | func HammingSimilarity(a, b mat.Vector) float64 { 91 | return 1.0 - HammingDistance(a, b) 92 | } 93 | 94 | // EuclideanDistance calculates the Euclidean distance 95 | // (l2 distance) between vectors a and b or more specifically 96 | // \sqrt{\sum_{i=1}^n (a_i - b_i)^2} 97 | func EuclideanDistance(a, b mat.Vector) float64 { 98 | var v mat.VecDense 99 | v.SubVec(a, b) 100 | return math.Sqrt(mat.Dot(&v, &v)) 101 | } 102 | 103 | // ManhattenDistance calculates the Manhatten distance (l1 distance) otherwise 104 | // known as the taxi cab distance between two vectors a and b. 105 | func ManhattenDistance(a, b mat.Vector) float64 { 106 | var v mat.VecDense 107 | v.SubVec(a, b) 108 | return mat.Norm(&v, 1) 109 | } 110 | 111 | // VectorLenSimilarity calculates the len of ab vectors 112 | func VectorLenSimilarity(a, b mat.Vector) float64 { 113 | dotProduct := sparse.Dot(a, b) 114 | if dotProduct == 0 { 115 | return math.NaN() 116 | } 117 | return math.Sqrt(dotProduct) 118 | } 119 | -------------------------------------------------------------------------------- /index_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "sort" 5 | "testing" 6 | 7 | "github.com/james-bowman/nlp/measures/pairwise" 8 | "github.com/james-bowman/sparse" 9 | "gonum.org/v1/gonum/floats" 10 | "gonum.org/v1/gonum/mat" 11 | ) 12 | 13 | func TestIndexerIndex(t *testing.T) { 14 | m := sparse.Random(sparse.DenseFormat, 100, 10, 1.0) 15 | 16 | tests := []struct { 17 | index Indexer 18 | }{ 19 | {index: NewLinearScanIndex(pairwise.CosineDistance)}, 20 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.CosineDistance)}, 21 | {index: NewLSHIndex(true, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.HammingDistance)}, 22 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewLSHForest(50, 20), pairwise.CosineDistance)}, 23 | } 24 | 25 | for ti, test := range tests { 26 | ColDo(m, func(j int, v mat.Vector) { 27 | test.index.Index(v, j) 28 | }) 29 | 30 | ColDo(m, func(j int, v mat.Vector) { 31 | matches := test.index.Search(v, 1) 32 | 33 | if len(matches) != 1 { 34 | t.Errorf("Test %d: Search expected 1 result but received %d", ti+1, len(matches)) 35 | } 36 | if matches[0].ID != j { 37 | t.Errorf("Test %d: Search expected to find %d but found %d", ti+1, j, matches[0].ID) 38 | } 39 | if 
matches[0].Distance < -0.0000001 || matches[0].Distance > 0.0000001 { 40 | t.Errorf("Test %d: Search match distance expected 0.0 but received %f", ti+1, matches[0].Distance) 41 | } 42 | }) 43 | } 44 | } 45 | 46 | func TestIndexerSearch(t *testing.T) { 47 | numCols := 10 48 | m := sparse.Random(sparse.DenseFormat, 100, numCols, 1.0) 49 | 50 | // build similarity matrix 51 | similarityMatrix := make([]float64, numCols*numCols) 52 | inds := make([][]int, numCols) 53 | ColDo(m, func(j int, v1 mat.Vector) { 54 | ColDo(m, func(i int, v2 mat.Vector) { 55 | similarityMatrix[j*numCols+i] = pairwise.CosineDistance(v1, v2) 56 | }) 57 | inds[j] = make([]int, numCols) 58 | floats.Argsort(similarityMatrix[j*numCols:(j+1)*numCols], inds[j]) 59 | for left, right := 0, len(inds[j])-1; left < right; left, right = left+1, right-1 { 60 | inds[j][left], inds[j][right] = inds[j][right], inds[j][left] 61 | similarityMatrix[j*numCols+left], similarityMatrix[j*numCols+right] = similarityMatrix[j*numCols+right], similarityMatrix[j*numCols+left] 62 | } 63 | }) 64 | 65 | tests := []struct { 66 | k int 67 | index Indexer 68 | }{ 69 | {k: numCols, index: NewLinearScanIndex(pairwise.CosineDistance)}, 70 | {k: numCols, index: NewLSHIndex(false, NewSimHash(700, 100), NewClassicLSH(7, 100), pairwise.CosineDistance)}, 71 | {k: numCols, index: NewLSHIndex(false, NewSimHash(1000, 100), NewLSHForest(50, 20), pairwise.CosineDistance)}, 72 | } 73 | 74 | for ti, test := range tests { 75 | ColDo(m, func(j int, v mat.Vector) { 76 | test.index.Index(v, j) 77 | }) 78 | 79 | ColDo(m, func(j int, v mat.Vector) { 80 | matches := test.index.Search(v, test.k) 81 | 82 | if len(matches) != test.k { 83 | t.Errorf("Test %d: Search expected %d result but received %d", ti+1, test.k, len(matches)) 84 | } 85 | heap := resultHeap{matches: matches} 86 | sort.Sort(heap) 87 | 88 | for i, match := range matches { 89 | if match.ID != inds[j][i] { 90 | t.Errorf("Test %d: For col #%d, Rank #%d - expected %v but found %v", ti+1, j, i, inds[j], matches) 91 | return 92 | } 93 | } 94 | }) 95 | } 96 | } 97 | 98 | func TestIndexerRemove(t *testing.T) { 99 | m := sparse.Random(sparse.DenseFormat, 100, 10, 1.0) 100 | 101 | tests := []struct { 102 | index Indexer 103 | }{ 104 | {index: NewLinearScanIndex(pairwise.CosineDistance)}, 105 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.CosineDistance)}, 106 | {index: NewLSHIndex(true, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.HammingDistance)}, 107 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewLSHForest(50, 20), pairwise.CosineDistance)}, 108 | } 109 | 110 | for ti, test := range tests { 111 | ColDo(m, func(j int, v mat.Vector) { 112 | test.index.Index(v, j) 113 | }) 114 | 115 | ColDo(m, func(j int, v mat.Vector) { 116 | test.index.Remove(j) 117 | matches := test.index.Search(v, 1) 118 | 119 | if len(matches) > 1 { 120 | t.Errorf("Test %d: Search expected less than 1 result but received %d", ti+1, len(matches)) 121 | } 122 | if len(matches) == 1 { 123 | if matches[0].ID == j { 124 | t.Errorf("Test %d: Search expected not to find %d but found %d", ti+1, j, matches[0].ID) 125 | } 126 | } 127 | }) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /weightings_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/james-bowman/sparse" 8 | "gonum.org/v1/gonum/mat" 9 | ) 10 | 11 | func 
TestTfidfTransformerFit(t *testing.T) { 12 | var tests = []struct { 13 | m int 14 | n int 15 | input []float64 16 | dim int 17 | transform []float64 18 | }{ 19 | { 20 | m: 6, n: 4, 21 | input: []float64{ 22 | 1, 3, 5, 2, 23 | 8, 1, 0, 0, 24 | 2, 1, 0, 1, 25 | 0, 0, 0, 0, 26 | 0, 0, 0, 1, 27 | 0, 1, 0, 0, 28 | }, 29 | dim: 6, 30 | transform: []float64{ 31 | 0, 32 | 0.5108256237659907, 33 | 0.22314355131420976, 34 | 1.6094379124341003, 35 | 0.9162907318741551, 36 | 0.9162907318741551, 37 | }, 38 | }, 39 | } 40 | 41 | for _, test := range tests { 42 | transformer := NewTfidfTransformer() 43 | input := mat.NewDense(test.m, test.n, test.input) 44 | 45 | transformer.Fit(input) 46 | 47 | weights := transformer.transform.Diagonal() 48 | for i, v := range weights { 49 | if v != test.transform[i] { 50 | t.Logf("Expected weights: \n%v\n but found: \n%v\n", 51 | test.transform, weights) 52 | t.Fail() 53 | } 54 | } 55 | } 56 | } 57 | 58 | func TestTfidfTransformerTransform(t *testing.T) { 59 | var tests = []struct { 60 | m int 61 | n int 62 | input []float64 63 | tm int 64 | tn int 65 | output []float64 66 | }{ 67 | { 68 | m: 6, n: 4, 69 | input: []float64{ 70 | 1, 3, 5, 2, 71 | 8, 1, 0, 0, 72 | 2, 1, 0, 1, 73 | 0, 0, 0, 0, 74 | 0, 0, 0, 1, 75 | 0, 1, 0, 0, 76 | }, 77 | tm: 6, tn: 4, 78 | output: []float64{ 79 | 0.000, 0.000, 0.000, 0.000, 80 | 4.087, 0.511, 0.000, 0.000, 81 | 0.446, 0.223, 0.000, 0.223, 82 | 0.000, 0.000, 0.000, 0.000, 83 | 0.000, 0.000, 0.000, 0.916, 84 | 0.000, 0.916, 0.000, 0.000, 85 | }, 86 | }, 87 | } 88 | 89 | for _, test := range tests { 90 | transformer := NewTfidfTransformer() 91 | input := mat.NewDense(test.m, test.n, test.input) 92 | output := mat.NewDense(test.tm, test.tn, test.output) 93 | 94 | result, err := transformer.FitTransform(input) 95 | 96 | if err != nil { 97 | t.Errorf("Failed tfidf fit transform caused by %v", err) 98 | } 99 | 100 | if !mat.EqualApprox(output, result, 0.001) { 101 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n", 102 | mat.Formatted(output), 103 | mat.Formatted(result)) 104 | t.Fail() 105 | } 106 | 107 | // test that subsequent transforms produce same result as initial 108 | result2, err := transformer.Transform(input) 109 | 110 | if err != nil { 111 | t.Errorf("Failed tfidf fit transform caused by %v", err) 112 | } 113 | 114 | if !mat.Equal(result, result2) { 115 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n", 116 | mat.Formatted(result), 117 | mat.Formatted(result2)) 118 | t.Fail() 119 | } 120 | } 121 | } 122 | 123 | func TestTfidfTransformerSaveLoad(t *testing.T) { 124 | var transforms = []struct { 125 | wantedTransform *sparse.DIA 126 | }{ 127 | { 128 | wantedTransform: sparse.NewDIA(2, 2, []float64{1, 5}), 129 | }, 130 | } 131 | 132 | for ti, test := range transforms { 133 | t.Logf("**** TestTfidfTransformerSave - Test Run %d.\n", ti+1) 134 | 135 | a := NewTfidfTransformer() 136 | a.transform = test.wantedTransform 137 | 138 | buf := new(bytes.Buffer) 139 | if err := a.Save(buf); err != nil { 140 | t.Errorf("Error encoding: %v\n", err) 141 | continue 142 | } 143 | 144 | b := NewTfidfTransformer() 145 | if err := b.Load(buf); err != nil { 146 | t.Errorf("Error unencoding: %v\n", err) 147 | continue 148 | } 149 | 150 | if !mat.Equal(a.transform, b.transform) { 151 | t.Logf("Wanted %v but got %v\n", mat.Formatted(a.transform), mat.Formatted(b.transform)) 152 | t.Fail() 153 | } 154 | } 155 | } 156 | 157 | func benchmarkTFIDFFitTransform(t Transformer, m, n int, b *testing.B) { 158 | mat := mat.NewDense(m, n, nil) 159 | 160 | for n 
:= 0; n < b.N; n++ { 161 | t.FitTransform(mat) 162 | } 163 | } 164 | 165 | func BenchmarkTFIDFFitTransform20x10(b *testing.B) { 166 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 20, 10, b) 167 | } 168 | func BenchmarkTFIDFFitTransform200x100(b *testing.B) { 169 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 200, 100, b) 170 | } 171 | func BenchmarkTFIDFFitTransform2000x1000(b *testing.B) { 172 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 2000, 1000, b) 173 | } 174 | func BenchmarkTFIDFFitTransform20000x10000(b *testing.B) { 175 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 20000, 10000, b) 176 | } 177 | -------------------------------------------------------------------------------- /weightings.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "io" 5 | "math" 6 | 7 | "github.com/james-bowman/sparse" 8 | "gonum.org/v1/gonum/mat" 9 | ) 10 | 11 | // TfidfTransformer takes a raw term document matrix and weights each raw term frequency 12 | // value depending upon how commonly it occurs across all documents within the corpus. 13 | // For example a very commonly occurring word like `the` is likely to occur in all documents 14 | // and so would be weighted down. 15 | // More precisely, TfidfTransformer applies a tf-idf algorithm to the matrix where each 16 | // term frequency is multiplied by the inverse document frequency. Inverse document 17 | // frequency is calculated as log(n/df) where df is the number of documents in which the 18 | // term occurs and n is the total number of documents within the corpus. We add 1 to both n 19 | // and df before division to prevent division by zero. 20 | type TfidfTransformer struct { 21 | transform *sparse.DIA 22 | } 23 | 24 | // NewTfidfTransformer constructs a new TfidfTransformer. 25 | func NewTfidfTransformer() *TfidfTransformer { 26 | return &TfidfTransformer{} 27 | } 28 | 29 | // Fit takes a training term document matrix, counts term occurrences across all documents 30 | // and constructs an inverse document frequency transform to apply to matrices in subsequent 31 | // calls to Transform(). 32 | func (t *TfidfTransformer) Fit(matrix mat.Matrix) Transformer { 33 | if t, isTypeConv := matrix.(sparse.TypeConverter); isTypeConv { 34 | matrix = t.ToCSR() 35 | } 36 | m, n := matrix.Dims() 37 | 38 | weights := make([]float64, m) 39 | var df int 40 | if csr, ok := matrix.(*sparse.CSR); ok { 41 | for i := 0; i < m; i++ { 42 | weights[i] = math.Log(float64(1+n) / float64(1+csr.RowNNZ(i))) 43 | } 44 | } else { 45 | for i := 0; i < m; i++ { 46 | df = 0 47 | for j := 0; j < n; j++ { 48 | if matrix.At(i, j) != 0 { 49 | df++ 50 | } 51 | } 52 | weights[i] = math.Log(float64(1+n) / float64(1+df)) 53 | } 54 | } 55 | 56 | // build a diagonal matrix from array of term weighting values for subsequent 57 | // multiplication with term document matrics 58 | t.transform = sparse.NewDIA(m, m, weights) 59 | 60 | return t 61 | } 62 | 63 | // Transform applies the inverse document frequency (IDF) transform by multiplying 64 | // each term frequency by its corresponding IDF value. This has the effect of weighting 65 | // each term frequency according to how often it appears across the whole document corpus 66 | // so that naturally frequent occurring words are given less weight than uncommon ones. 67 | // The returned matrix is a sparse matrix type. 
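//
// A minimal usage sketch, assuming counts is a term document matrix of raw term
// frequencies (e.g. the output of a CountVectoriser):
//
//	transformer := NewTfidfTransformer()
//	tfidf, err := transformer.FitTransform(counts)
//	if err != nil {
//		// handle error
//	}
//	// tfidf now holds the idf weighted term frequencies as a sparse matrix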
68 | func (t *TfidfTransformer) Transform(matrix mat.Matrix) (mat.Matrix, error) { 69 | if t, isTypeConv := matrix.(sparse.TypeConverter); isTypeConv { 70 | matrix = t.ToCSR() 71 | } 72 | var product sparse.CSR 73 | 74 | // simply multiply the matrix by our idf transform (the diagonal matrix of term weights) 75 | product.Mul(t.transform, matrix) 76 | 77 | // todo: possibly L2 norm matrix to remove any bias caused by documents of different 78 | // lengths where longer documents naturally have more words and so higher word counts 79 | 80 | return &product, nil 81 | } 82 | 83 | // FitTransform is exactly equivalent to calling Fit() followed by Transform() on the 84 | // same matrix. This is a convenience where separate training data is not being 85 | // used to fit the model i.e. the model is fitted on the fly to the test data. 86 | // The returned matrix is a sparse matrix type. 87 | func (t *TfidfTransformer) FitTransform(matrix mat.Matrix) (mat.Matrix, error) { 88 | if t, isTypeConv := matrix.(sparse.TypeConverter); isTypeConv { 89 | matrix = t.ToCSR() 90 | } 91 | return t.Fit(matrix).Transform(matrix) 92 | } 93 | 94 | // Save binary serialises the model and writes it into w. This is useful for persisting 95 | // a trained model to disk so that it may be loaded (using the Load() method)in another 96 | // context (e.g. production) for reproducible results. 97 | func (t TfidfTransformer) Save(w io.Writer) error { 98 | _, err := t.transform.MarshalBinaryTo(w) 99 | 100 | return err 101 | } 102 | 103 | // Load binary deserialises the previously serialised model into the receiver. This is 104 | // useful for loading a previously trained and saved model from another context 105 | // (e.g. offline training) for use within another context (e.g. production) for 106 | // reproducible results. Load should only be performed with trusted data. 
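//
// A minimal round trip sketch, assuming trained is a fitted TfidfTransformer and
// w and r are an io.Writer and io.Reader backed by the same storage (for example
// a bytes.Buffer or a file on disk):
//
//	if err := trained.Save(w); err != nil {
//		// handle error
//	}
//	restored := NewTfidfTransformer()
//	if err := restored.Load(r); err != nil {
//		// handle error
//	}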
107 | func (t *TfidfTransformer) Load(r io.Reader) error { 108 | var model sparse.DIA 109 | 110 | if _, err := model.UnmarshalBinaryFrom(r); err != nil { 111 | return err 112 | } 113 | t.transform = &model 114 | 115 | return nil 116 | } 117 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package nlp_test 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/james-bowman/nlp" 7 | "github.com/james-bowman/nlp/measures/pairwise" 8 | "gonum.org/v1/gonum/mat" 9 | ) 10 | 11 | func Example() { 12 | testCorpus := []string{ 13 | "The quick brown fox jumped over the lazy dog", 14 | "hey diddle diddle, the cat and the fiddle", 15 | "the cow jumped over the moon", 16 | "the little dog laughed to see such fun", 17 | "and the dish ran away with the spoon", 18 | } 19 | 20 | var stopWords = []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", 
"whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"} 21 | 22 | query := "the brown fox ran around the dog" 23 | 24 | vectoriser := nlp.NewCountVectoriser(stopWords...) 25 | transformer := nlp.NewTfidfTransformer() 26 | 27 | // set k (the number of dimensions following truncation) to 4 28 | reducer := nlp.NewTruncatedSVD(4) 29 | 30 | lsiPipeline := nlp.NewPipeline(vectoriser, transformer, reducer) 31 | 32 | // Transform the corpus into an LSI fitting the model to the documents in the process 33 | lsi, err := lsiPipeline.FitTransform(testCorpus...) 34 | if err != nil { 35 | fmt.Printf("Failed to process documents because %v", err) 36 | return 37 | } 38 | 39 | // run the query through the same pipeline that was fitted to the corpus and 40 | // to project it into the same dimensional space 41 | queryVector, err := lsiPipeline.Transform(query) 42 | if err != nil { 43 | fmt.Printf("Failed to process documents because %v", err) 44 | return 45 | } 46 | 47 | // iterate over document feature vectors (columns) in the LSI matrix and compare 48 | // with the query vector for similarity. Similarity is determined by the difference 49 | // between the angles of the vectors known as the cosine similarity 50 | highestSimilarity := -1.0 51 | var matched int 52 | _, docs := lsi.Dims() 53 | for i := 0; i < docs; i++ { 54 | similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), lsi.(mat.ColViewer).ColView(i)) 55 | if similarity > highestSimilarity { 56 | matched = i 57 | highestSimilarity = similarity 58 | } 59 | } 60 | 61 | fmt.Printf("Matched '%s'", testCorpus[matched]) 62 | // Output: Matched 'The quick brown fox jumped over the lazy dog' 63 | } 64 | -------------------------------------------------------------------------------- /vectorisers_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/james-bowman/sparse" 7 | ) 8 | 9 | var stopWords = []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", 
"meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"} 10 | 11 | var trainSet = []string{ 12 | "The quick brown fox jumped over the. Lazy dog", 13 | "the brown Cat sat on the mat", 14 | "the little dog laughed to see such fun", 15 | "laughing cow", 16 | "the cow ran around the dog", 17 | "spoon dish and plate", 18 | } 19 | 20 | var testSet = []string{ 21 | "hey diddle diddle", 22 | "the cat and the fiddle", 23 | "the cow jumped over the moon", 24 | "the quick brown fox jumped over the. Lazy dog", 25 | "The little dog laughed to see such fun", 26 | "The dish ran away with the spoon", 27 | } 28 | 29 | func TestCountVectoriserFit(t *testing.T) { 30 | var tests = []struct { 31 | train []string 32 | stop []string 33 | vocabSize int 34 | }{ 35 | {trainSet, []string{}, 26}, 36 | {trainSet[0:1], []string{}, 8}, 37 | {trainSet, stopWords, 18}, 38 | } 39 | 40 | for testRun, test := range tests { 41 | t.Logf("**** Test Run %d.\n", testRun+1) 42 | vectoriser := NewCountVectoriser(test.stop...) 43 | 44 | vectoriser.Fit(test.train...) 45 | 46 | if len(vectoriser.Vocabulary) != test.vocabSize { 47 | t.Logf("Expected training dataset %v of size %d but found vocabulary %v of size %d", 48 | test.train, test.vocabSize, vectoriser.Vocabulary, len(vectoriser.Vocabulary)) 49 | t.Fail() 50 | } 51 | } 52 | } 53 | func TestCountVectoriserTransform(t *testing.T) { 54 | var tests = []struct { 55 | train []string 56 | vocabSize int 57 | stop []string 58 | test []string 59 | }{ 60 | {trainSet, 26, []string{}, testSet}, 61 | {trainSet[0:1], 8, []string{}, testSet[0:3]}, 62 | {testSet, 26, []string{}, testSet}, 63 | {testSet, 19, stopWords, testSet}, 64 | } 65 | 66 | for testRun, test := range tests { 67 | t.Logf("**** Test Run %d.\n", testRun+1) 68 | 69 | vectoriser := NewCountVectoriser(test.stop...) 70 | vectoriser.Fit(test.train...) 71 | 72 | vec, err := vectoriser.Transform(test.test...) 
73 | 74 | if err != nil { 75 | t.Errorf("Error fitting and applying vectoriser caused by %v", err) 76 | } 77 | 78 | m, n := vec.Dims() 79 | 80 | if m != test.vocabSize || n != len(test.test) { 81 | t.Logf("Expected matrix %d x %d but found %d x %d", test.vocabSize, len(test.test), m, n) 82 | t.Fail() 83 | } 84 | } 85 | } 86 | 87 | func TestHashingVectoriserTransform(t *testing.T) { 88 | var tests = []struct { 89 | train []string 90 | nnz int 91 | features int 92 | stop []string 93 | test []string 94 | }{ 95 | {trainSet, 33, 260000, []string{}, testSet}, 96 | {trainSet[0:1], 11, 260000, []string{}, testSet[0:3]}, 97 | {testSet, 33, 260001, []string{}, testSet}, 98 | {testSet, 21, 260000, stopWords, testSet}, 99 | } 100 | 101 | for testRun, test := range tests { 102 | t.Logf("**** Test Run %d.\n", testRun+1) 103 | vectoriser := NewHashingVectoriser(test.features, test.stop...) 104 | vectoriser.Fit(test.train...) 105 | 106 | vec, err := vectoriser.Transform(test.test...) 107 | 108 | if err != nil { 109 | t.Errorf("Error fitting and applying vectoriser caused by %v", err) 110 | } 111 | 112 | m, n := vec.Dims() 113 | 114 | if m != test.features || n != len(test.test) || vec.(sparse.Sparser).NNZ() != test.nnz { 115 | t.Logf("Expected matrix %d x %d with NNZ = %d but found %d x %d with NNZ = %d", 116 | test.features, 117 | len(test.test), 118 | test.nnz, 119 | m, n, 120 | vec.(sparse.Sparser).NNZ()) 121 | t.Fail() 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | [![GoDoc](https://godoc.org/github.com/james-bowman/nlp?status.svg)](https://godoc.org/github.com/james-bowman/nlp) 4 | [![Build Status](https://travis-ci.org/james-bowman/nlp.svg?branch=master)](https://travis-ci.org/james-bowman/nlp) 5 | [![Go Report Card](https://goreportcard.com/badge/github.com/james-bowman/nlp)](https://goreportcard.com/report/github.com/james-bowman/nlp) 6 | [![codecov](https://codecov.io/gh/james-bowman/nlp/branch/master/graph/badge.svg)](https://codecov.io/gh/james-bowman/nlp) 7 | [![Mentioned in Awesome Go](https://awesome.re/mentioned-badge-flat.svg)](https://github.com/avelino/awesome-go) 8 | [![Sourcegraph](https://sourcegraph.com/github.com/james-bowman/nlp/-/badge.svg)](https://sourcegraph.com/github.com/james-bowman/nlp?badge) 9 | 10 | 11 | nlp 12 | 13 | Implementations of selected machine learning algorithms for natural language processing in golang. The primary focus for the package is the statistical semantics of plain-text documents supporting semantic analysis and retrieval of semantically similar documents. 14 | 15 | Built upon the [Gonum](https://www.gonum.org/) package for linear algebra and scientific computing with some inspiration taken from Python's [scikit-learn](http://scikit-learn.org/stable/) and [Gensim](https://radimrehurek.com/gensim/). 16 | 17 | Check out [the companion blog post](http://www.jamesbowman.me/post/semantic-analysis-of-webpages-with-machine-learning-in-go/) or [the Go documentation page](https://godoc.org/github.com/james-bowman/nlp) for full usage and examples. 18 | 19 |
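A condensed version of the package's own example (see `example_test.go`), fitting an LSI pipeline to a corpus and then comparing a query against it; `testCorpus`, `stopWords` and `query` are assumed to be defined as in that example and error handling is omitted for brevity:

```go
vectoriser := nlp.NewCountVectoriser(stopWords...)
transformer := nlp.NewTfidfTransformer()
reducer := nlp.NewTruncatedSVD(4)
lsiPipeline := nlp.NewPipeline(vectoriser, transformer, reducer)

// fit the model to the corpus and project the documents into LSI space
lsi, _ := lsiPipeline.FitTransform(testCorpus...)

// project the query into the same space
queryVector, _ := lsiPipeline.Transform(query)

// compare the projected query with the first document column; higher is more similar
similarity := pairwise.CosineSimilarity(
	queryVector.(mat.ColViewer).ColView(0),
	lsi.(mat.ColViewer).ColView(0),
)
```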
20 | 21 | ## Features 22 | 23 | * [LSA (Latent Semantic Analysis aka Latent Semantic Indexing (LSI))][LSA] implementation using truncated [SVD (Singular Value Decomposition)](https://en.wikipedia.org/wiki/Singular-value_decomposition) for dimensionality reduction. 24 | * Fast comparison and retrieval of semantically similar documents using the [SimHash](https://en.wikipedia.org/wiki/SimHash) (random hyperplanes/[sign random projection](https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection)) algorithm with multi-index and Forest schemes for [LSH (Locality Sensitive Hashing)](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) to support fast, approximate cosine similarity/angular distance comparisons and approximate nearest neighbour search using significantly less memory and processing time. 25 | * [Random Indexing (RI)](https://en.wikipedia.org/wiki/Random_indexing) and Reflective Random Indexing (RRI) (which extends RI to support indirect inference) for scalable [Latent Semantic Analysis (LSA)][LSA] over large, web-scale corpora. 26 | * [Latent Dirichlet Allocation (LDA)](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) using a parallelised implementation of the fast [SCVB0 (Stochastic Collapsed Variational Bayesian inference)][SCVB0] algorithm for unsupervised topic extraction. 27 | * [PCA (Principal Component Analysis)](https://en.wikipedia.org/wiki/Principal_component_analysis) 28 | * [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) weighting to account for frequently occurring words 29 | * [Sparse matrix](http://github.com/james-bowman/sparse) implementations used for more efficient memory usage and processing over large document corpora. 30 | * Stop word removal to remove frequently occurring English words e.g. "the", "and" 31 | * [Feature hashing](https://en.wikipedia.org/wiki/Feature_hashing) ('the hashing trick') implementation (using [MurmurHash3](http://github.com/spaolacci/murmur3)) for reduced memory requirements and reduced reliance on training data 32 | * Similarity/distance measures to calculate the similarity/distance between feature vectors. 33 | 34 | ## Planned 35 | 36 | * Expanded persistence support 37 | * Stemming to treat words with a common root as the same e.g. "go" and "going" 38 | * Clustering algorithms e.g. Hierarchical, K-means, etc. 39 | * Classification algorithms e.g. SVM, KNN, random forest, etc. 40 | 41 | ## References 42 | 43 | 1. [Rosario, Barbara. Latent Semantic Indexing: An overview. INFOSYS 240 Spring 2000](http://people.ischool.berkeley.edu/~rosario/projects/LSI.pdf) 44 | 1. [Latent Semantic Analysis, a scholarpedia article on LSA written by Tom Landauer, one of the creators of LSA.](http://www.scholarpedia.org/article/Latent_semantic_analysis) 45 | 1. [Thomo, Alex. Latent Semantic Analysis (Tutorial).](http://webhome.cs.uvic.ca/~thomo/svd.pdf) 46 | 1. [Latent Semantic Indexing. Stanford NLP Course](http://nlp.stanford.edu/IR-book/html/htmledition/latent-semantic-indexing-1.html) 47 | 1. [Charikar, Moses S. "Similarity Estimation Techniques from Rounding Algorithms" in Proceedings of the thirty-fourth annual ACM symposium on Theory of computing - STOC ’02, 2002, p. 380.](https://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CharikarEstim.pdf) 48 | 1. [M. Bawa, T. Condie, and P. Ganesan, “LSH forest: self-tuning indexes for similarity search,” Proc. 14th Int. Conf. World Wide Web - WWW ’05, p. 651, 2005.](http://dl.acm.org/citation.cfm?id=1060745.1060840) 49 | 1. [A. Gionis, P. Indyk, and R.
Motwani, “Similarity Search in High Dimensions via Hashing,” VLDB ’99 Proc. 25th Int. Conf. Very Large Data Bases, vol. 99, no. 1, pp. 518–529, 1999.](http://www.cs.princeton.edu/courses/archive/spring13/cos598C/Gionis.pdf) 50 | 1. [Kanerva, Pentti, Kristoferson, Jan and Holst, Anders (2000). Random Indexing of Text Samples for Latent Semantic Analysis](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.4.6523&rep=rep1&type=pdf) 51 | 1. [Rangan, Venkat. Discovery of Related Terms in a corpus using Reflective Random Indexing](https://www.umiacs.umd.edu/~oard/desi4/papers/rangan.pdf) 52 | 1. [Vasuki, Vidya and Cohen, Trevor. Reflective random indexing for semi-automatic indexing of the biomedical literature](https://ac.els-cdn.com/S1532046410000481/1-s2.0-S1532046410000481-main.pdf?_tid=f31f92e8-028a-11e8-8c31-00000aab0f6c&acdnat=1516965824_e24a804445fff1744281ca6f5898a3a4) 53 | 1. [QasemiZadeh, Behrang and Handschuh, Siegfried. Random Indexing Explained with High Probability](http://pars.ie/publications/papers/pre-prints/random-indexing-dr-explained.pdf) 54 | 1. [Foulds, James; Boyles, Levi; Dubois, Christopher; Smyth, Padhraic; Welling, Max (2013). Stochastic Collapsed Variational Bayesian Inference for Latent Dirichlet Allocation][SCVB0] 55 | 56 | 59 | 60 | [LSA]: https://en.wikipedia.org/wiki/Latent_semantic_analysis 61 | [SCVB0]: https://arxiv.org/pdf/1305.2452 62 | -------------------------------------------------------------------------------- /dimreduction.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "io" 7 | 8 | "github.com/james-bowman/sparse" 9 | "gonum.org/v1/gonum/mat" 10 | "gonum.org/v1/gonum/stat" 11 | ) 12 | 13 | // TruncatedSVD implements the Singular Value Decomposition factorisation of matrices. 14 | // This produces an approximation of the input matrix at a lower rank. This is a core 15 | // component of LSA (Latent Semantic Analysis). 16 | type TruncatedSVD struct { 17 | // Components is the truncated term matrix (matrix U of the Singular Value Decomposition 18 | // (A=USV^T)). The matrix will be of size m, k where m = the number of unique terms 19 | // in the training data and k = the number of elements to truncate to (specified by 20 | // attribute K) or m or n (the number of documents in the training data) whichever of 21 | // the 3 values is smaller. 22 | Components *mat.Dense 23 | 24 | // K is the number of dimensions to which the output, transformed, matrix should be 25 | // truncated. The matrix output by the FitTransform() and Transform() methods will 26 | // be min(m, n, K) rows by n columns, where n is the number of columns in the original, 27 | // input matrix and min(m, n, K) is the lowest value of m, n, K where m is the number of 28 | // rows in the original, input matrix. 29 | K int 30 | } 31 | 32 | // NewTruncatedSVD creates a new TruncatedSVD transformer with K (the truncated 33 | // dimensionality) being set to the specified value k 34 | func NewTruncatedSVD(k int) *TruncatedSVD { 35 | return &TruncatedSVD{K: k} 36 | } 37 | 38 | // Fit performs the SVD factorisation on the input training data matrix, mat and 39 | // stores the output term matrix as a transform to apply to matrices passed to subsequent calls to the Transform() method.
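//
// A minimal usage sketch, assuming tdm is an m x n term document matrix:
//
//	svd := NewTruncatedSVD(100)
//	reduced, err := svd.FitTransform(tdm)
//	if err != nil {
//		// handle error
//	}
//	// reduced has min(m, n, 100) rows and n columns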
40 | func (t *TruncatedSVD) Fit(mat mat.Matrix) Transformer { 41 | if _, err := t.FitTransform(mat); err != nil { 42 | panic("nlp: Failed to fit truncated SVD because " + err.Error()) 43 | } 44 | return t 45 | } 46 | 47 | // Transform applies the transform decomposed from the training data matrix in Fit() 48 | // to the input matrix. The resulting output matrix will be the closest approximation 49 | // to the input matrix at a reduced rank. The returned matrix is a dense matrix type. 50 | func (t *TruncatedSVD) Transform(m mat.Matrix) (mat.Matrix, error) { 51 | var product mat.Dense 52 | 53 | product.Mul(t.Components.T(), m) 54 | 55 | return &product, nil 56 | } 57 | 58 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 59 | // on the same matrix. This is a useful shortcut where separate training data is not being 60 | // used to fit the model i.e. the model is fitted on the fly to the test data. 61 | // The returned matrix is a dense matrix type. 62 | func (t *TruncatedSVD) FitTransform(m mat.Matrix) (mat.Matrix, error) { 63 | var svd mat.SVD 64 | if ok := svd.Factorize(m, mat.SVDThin); !ok { 65 | return nil, fmt.Errorf("Failed SVD Factorisation of working matrix") 66 | } 67 | s, u, v := t.extractSVD(&svd) 68 | 69 | r, c := m.Dims() 70 | min := minimum(t.K, r, c) 71 | 72 | // truncate U and V matrices to k << min(m, n) 73 | uk := u.Slice(0, r, 0, min) 74 | vk := v.Slice(0, c, 0, min) 75 | 76 | t.Components = uk.(*mat.Dense) 77 | 78 | // multiply Sigma by transpose of V. As sigma is a symmetrical (square) diagonal matrix it is 79 | // more efficient to simply multiply each element from the array of diagonal values with each 80 | // element from the matrix V rather than multiplying out the non-zero values from off the diagonal. 81 | var product mat.Dense 82 | product.Apply(func(i, j int, v float64) float64 { 83 | return (v * s[i]) 84 | }, vk.T()) 85 | 86 | return &product, nil 87 | } 88 | 89 | func minimum(k, m, n int) int { 90 | return min(k, min(m, n)) 91 | } 92 | 93 | func min(m, n int) int { 94 | if m < n { 95 | return m 96 | } 97 | return n 98 | } 99 | 100 | func (t *TruncatedSVD) extractSVD(svd *mat.SVD) (s []float64, u, v *mat.Dense) { 101 | var um, vm mat.Dense 102 | svd.UTo(&um) 103 | svd.VTo(&vm) 104 | s = svd.Values(nil) 105 | return s, &um, &vm 106 | } 107 | 108 | // Save binary serialises the model and writes it into w. This is useful for persisting 109 | // a trained model to disk so that it may be loaded (using the Load() method)in another 110 | // context (e.g. production) for reproducible results. 111 | func (t TruncatedSVD) Save(w io.Writer) error { 112 | var buf [8]byte 113 | binary.LittleEndian.PutUint64(buf[:], uint64(t.K)) 114 | if _, err := w.Write(buf[:]); err != nil { 115 | return err 116 | } 117 | 118 | _, err := t.Components.MarshalBinaryTo(w) 119 | 120 | return err 121 | } 122 | 123 | // Load binary deserialises the previously serialised model into the receiver. This is 124 | // useful for loading a previously trained and saved model from another context 125 | // (e.g. offline training) for use within another context (e.g. production) for 126 | // reproducible results. Load should only be performed with trusted data. 
127 | func (t *TruncatedSVD) Load(r io.Reader) error { 128 | var n int 129 | var buf [8]byte 130 | var err error 131 | for n < len(buf) && err == nil { 132 | var nn int 133 | nn, err = r.Read(buf[n:]) 134 | n += nn 135 | } 136 | if err == io.EOF { 137 | return io.ErrUnexpectedEOF 138 | } 139 | if err != nil { 140 | return err 141 | } 142 | k := int(binary.LittleEndian.Uint64(buf[:])) 143 | 144 | var model mat.Dense 145 | if _, err := model.UnmarshalBinaryFrom(r); err != nil { 146 | return err 147 | } 148 | 149 | t.K = k 150 | t.Components = &model 151 | 152 | return nil 153 | } 154 | 155 | // PCA calculates the principal components of a matrix, or the axes of greatest variance, and 156 | // then projects matrices onto those axes. 157 | // See https://en.wikipedia.org/wiki/Principal_component_analysis for further details. 158 | type PCA struct { 159 | // K is the number of components 160 | K int 161 | pc *stat.PC 162 | } 163 | 164 | // NewPCA constructs a new Principal Component Analysis transformer to reduce the dimensionality, 165 | // projecting matrices onto the axes of greatest variance. 166 | func NewPCA(k int) *PCA { 167 | return &PCA{K: k, pc: &stat.PC{}} 168 | } 169 | 170 | // Fit calculates the principal component directions (axes of greatest variance) within the 171 | // training data, which can then be used to project matrices onto those principal components using 172 | // the Transform() method. 173 | func (p *PCA) Fit(m mat.Matrix) Transformer { 174 | if ok := p.pc.PrincipalComponents(m.T(), nil); !ok { 175 | panic("nlp: PCA analysis failed during fitting") 176 | } 177 | 178 | return p 179 | } 180 | 181 | // Transform projects the matrix onto the first K principal components calculated during training 182 | // (the Fit() method). The returned matrix will be of reduced dimensionality compared to the input 183 | // (K x c compared to r x c of the input). 184 | func (p *PCA) Transform(m mat.Matrix) (mat.Matrix, error) { 185 | r, _ := m.Dims() 186 | 187 | //var proj mat.Dense 188 | var proj sparse.CSR 189 | var dst mat.Dense 190 | p.pc.VectorsTo(&dst) 191 | proj.Mul(m.T(), dst.Slice(0, r, 0, p.K)) 192 | 193 | // matrix is r x c (t x d) 194 | // m.T() = c x r (d x t) 195 | // slice c x K 196 | 197 | // (ar x ac) * (br x bc) = ar x bc 198 | // ac == br 199 | return proj.T(), nil 200 | } 201 | 202 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 203 | // on the same matrix. This is a useful shortcut where separate training data is not being 204 | // used to fit the model i.e. the model is fitted on the fly to the test data. 205 | func (p *PCA) FitTransform(m mat.Matrix) (mat.Matrix, error) { 206 | return p.Fit(m).Transform(m) 207 | } 208 | 209 | // ExplainedVariance returns a slice of float64 values representing the variances of the 210 | // principal component scores. 211 | func (p *PCA) ExplainedVariance() []float64 { 212 | return p.pc.VarsTo(nil) 213 | } 214 | -------------------------------------------------------------------------------- /index.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "container/heap" 5 | "sync" 6 | 7 | "github.com/james-bowman/nlp/measures/pairwise" 8 | "github.com/james-bowman/sparse" 9 | "gonum.org/v1/gonum/mat" 10 | ) 11 | 12 | // Match represents a matching item for nearest neighbour similarity searches. 13 | // It contains both the ID of the matching item and the distance from the queried item.
14 | // The distance is represented as a score from 0 (exact match) to 1 (orthogonal) 15 | // depending upon the metric used. 16 | type Match struct { 17 | Distance float64 18 | ID interface{} 19 | } 20 | 21 | // resultHeap is a priority queue (a max-heap keyed on distance, with the most distant match at the 22 | // root) used to compile the top-k matches whilst performing nearest neighbour similarity searches. 23 | type resultHeap struct { 24 | matches []Match 25 | } 26 | 27 | func (r resultHeap) Len() int { return len(r.matches) } 28 | 29 | func (r resultHeap) Less(i, j int) bool { return r.matches[i].Distance > r.matches[j].Distance } 30 | 31 | func (r resultHeap) Swap(i, j int) { r.matches[i], r.matches[j] = r.matches[j], r.matches[i] } 32 | 33 | func (r *resultHeap) Push(x interface{}) { 34 | r.matches = append(r.matches, x.(Match)) 35 | } 36 | 37 | func (r *resultHeap) Pop() interface{} { 38 | old := r.matches 39 | n := len(old) 40 | x := old[n-1] 41 | r.matches = old[0 : n-1] 42 | return x 43 | } 44 | 45 | // Indexer indexes vectors to support Nearest Neighbour (NN) similarity searches across 46 | // the indexed vectors. 47 | type Indexer interface { 48 | Index(v mat.Vector, id interface{}) 49 | Search(q mat.Vector, k int) []Match 50 | Remove(id interface{}) 51 | } 52 | 53 | // LinearScanIndex supports Nearest Neighbour (NN) similarity searches across indexed 54 | // vectors performing queries in O(n) and requiring O(n) storage. As the name implies, 55 | // LinearScanIndex performs a linear scan across all indexed vectors comparing them 56 | // each in turn with the specified query vector using the configured pairwise distance 57 | // metric. LinearScanIndex is accurate and will always return the true top-k nearest 58 | // neighbours as opposed to some other types of index, like LSHIndex, 59 | // which perform Approximate Nearest Neighbour (ANN) searches and trade some recall 60 | // accuracy for performance over large scale datasets. 61 | type LinearScanIndex struct { 62 | lock sync.RWMutex 63 | signatures []mat.Vector 64 | ids []interface{} 65 | distance pairwise.Comparer 66 | } 67 | 68 | // NewLinearScanIndex constructs a new empty LinearScanIndex which will use the specified 69 | // pairwise distance metric to determine nearest neighbours based on similarity. 70 | func NewLinearScanIndex(compareFN pairwise.Comparer) *LinearScanIndex { 71 | return &LinearScanIndex{distance: compareFN} 72 | } 73 | 74 | // Index adds the specified vector v with associated id to the index. 75 | func (b *LinearScanIndex) Index(v mat.Vector, id interface{}) { 76 | b.lock.Lock() 77 | b.signatures = append(b.signatures, v) 78 | b.ids = append(b.ids, id) 79 | b.lock.Unlock() 80 | } 81 | 82 | // Search searches for the top-k nearest neighbours in the index. The method 83 | // returns up to the top-k most similar items in unsorted order. The method may 84 | // return fewer than k items if fewer than k neighbours are found.
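// An illustrative sketch of indexing and querying (distanceFn stands for any pairwise.Comparer
// of your choosing; the vectors and IDs are examples):
//
//	idx := NewLinearScanIndex(distanceFn)
//	idx.Index(docVector, "doc-1")
//	idx.Index(otherVector, "doc-2")
//	matches := idx.Search(queryVector, 10) // up to 10 nearest neighbours, unsorted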
85 | func (b *LinearScanIndex) Search(qv mat.Vector, k int) []Match { 86 | b.lock.RLock() 87 | defer b.lock.RUnlock() 88 | 89 | size := len(b.signatures) 90 | 91 | var point int 92 | var results resultHeap 93 | results.matches = make([]Match, 0, k) 94 | 95 | for point = 0; point < k && point < size; point++ { 96 | mv := b.signatures[point] 97 | match := Match{Distance: b.distance(qv, mv), ID: b.ids[point]} 98 | results.matches = append(results.matches, match) 99 | } 100 | if len(results.matches) < k { 101 | return results.matches 102 | } 103 | heap.Init(&results) 104 | var dist float64 105 | for i := point; i < size; i++ { 106 | mv := b.signatures[i] 107 | dist = b.distance(qv, mv) 108 | if dist <= results.matches[0].Distance { 109 | heap.Pop(&results) 110 | heap.Push(&results, Match{Distance: dist, ID: b.ids[i]}) 111 | } 112 | } 113 | 114 | return results.matches 115 | } 116 | 117 | // Remove removes the vector with the specified id from the index. If no vector 118 | // is found with the specified id the method will simply do nothing. 119 | func (b *LinearScanIndex) Remove(id interface{}) { 120 | b.lock.Lock() 121 | defer b.lock.Unlock() 122 | 123 | for i, v := range b.ids { 124 | if v == id { 125 | copy(b.signatures[i:], b.signatures[i+1:]) 126 | b.signatures[len(b.signatures)-1] = nil 127 | b.signatures = b.signatures[:len(b.signatures)-1] 128 | 129 | copy(b.ids[i:], b.ids[i+1:]) 130 | b.ids[len(b.ids)-1] = nil 131 | b.ids = b.ids[:len(b.ids)-1] 132 | 133 | return 134 | } 135 | } 136 | } 137 | 138 | // Hasher interface represents a Locality Sensitive Hashing algorithm whereby 139 | // the proximity of data points is preserved in the hash space i.e. similar data 140 | // points will be hashed to values close together in the hash space. 141 | type Hasher interface { 142 | // Hash hashes the input vector into a BinaryVector hash representation 143 | Hash(mat.Vector) *sparse.BinaryVec 144 | } 145 | 146 | // LSHScheme interface represents LSH indexing schemes to support Approximate Nearest 147 | // Neighbour (ANN) search. 148 | type LSHScheme interface { 149 | // Put stores the specified LSH signature and associated ID in the LSH index 150 | Put(id interface{}, signature *sparse.BinaryVec) 151 | 152 | // GetCandidates returns the IDs of candidate nearest neighbours. It is up to 153 | // the calling code to further filter these candidates based on distance to arrive 154 | // at the top-k approximate nearest neighbours. The number of candidates returned 155 | // may be smaller or larger than k. 156 | GetCandidates(query *sparse.BinaryVec, k int) []interface{} 157 | 158 | // Remove removes the specified item from the LSH index 159 | Remove(id interface{}) 160 | } 161 | 162 | // LSHIndex is an LSH (Locality Sensitive Hashing) based index supporting Approximate 163 | // Nearest Neighbour (ANN) search in O(log n). The storage required by the index will 164 | // depend upon the underlying LSH scheme used but will typically be higher than O(n). 165 | // In use cases where accurate Nearest Neighbour search is required other types of 166 | // index should be considered like LinearScanIndex. 167 | type LSHIndex struct { 168 | lock sync.RWMutex 169 | isApprox bool 170 | hasher Hasher 171 | scheme LSHScheme 172 | signatures map[interface{}]mat.Vector 173 | distance pairwise.Comparer 174 | } 175 | 176 | // NewLSHIndex creates a new LSHIndex. 
When queried, the initial candidate 177 | // nearest neighbours returned by the underlying LSH indexing algorithm 178 | // are further filtered by comparing distances to the query vector using the supplied 179 | // distance metric. If approx is true, the filtering comparison is performed on the 180 | // hashes and if approx is false, then the comparison is performed on the original 181 | // vectors instead. This will have time and storage implications as comparing the 182 | // original vectors will be more accurate but slower and require the original vectors 183 | // be stored for the comparison. The LSH algorithm and underlying LSH indexing 184 | // algorithm may both be specified as hasher and store parameters respectively. 185 | func NewLSHIndex(approx bool, hasher Hasher, store LSHScheme, distance pairwise.Comparer) *LSHIndex { 186 | index := LSHIndex{ 187 | isApprox: approx, 188 | hasher: hasher, 189 | scheme: store, 190 | signatures: make(map[interface{}]mat.Vector), 191 | distance: distance, 192 | } 193 | 194 | return &index 195 | } 196 | 197 | // Index indexes the supplied vector along with its associated ID. 198 | func (l *LSHIndex) Index(v mat.Vector, id interface{}) { 199 | h := l.hasher.Hash(v) 200 | 201 | l.lock.Lock() 202 | defer l.lock.Unlock() 203 | 204 | l.scheme.Put(id, h) 205 | if l.isApprox { 206 | l.signatures[id] = h 207 | } else { 208 | l.signatures[id] = v 209 | } 210 | } 211 | 212 | // Search searches for the top-k approximate nearest neighbours in the index. The 213 | // method returns up to the top-k most similar items in unsorted order. The method may 214 | // return fewer than k items if less than k neighbours are found. 215 | func (l *LSHIndex) Search(q mat.Vector, k int) []Match { 216 | hv := l.hasher.Hash(q) 217 | 218 | l.lock.RLock() 219 | defer l.lock.RUnlock() 220 | 221 | candidateIDs := l.scheme.GetCandidates(hv, k) 222 | size := len(candidateIDs) 223 | 224 | var qv mat.Vector 225 | if l.isApprox { 226 | qv = hv 227 | } else { 228 | qv = q 229 | } 230 | 231 | var point int 232 | var results resultHeap 233 | results.matches = make([]Match, 0, k) 234 | 235 | for point = 0; point < k && point < size; point++ { 236 | mv := l.signatures[candidateIDs[point]] 237 | match := Match{Distance: l.distance(qv, mv), ID: candidateIDs[point]} 238 | results.matches = append(results.matches, match) 239 | } 240 | if len(results.matches) < k { 241 | return results.matches 242 | } 243 | heap.Init(&results) 244 | var dist float64 245 | for i := point; i < size; i++ { 246 | mv := l.signatures[candidateIDs[i]] 247 | dist = l.distance(qv, mv) 248 | if dist <= results.matches[0].Distance { 249 | heap.Pop(&results) 250 | heap.Push(&results, Match{Distance: dist, ID: candidateIDs[i]}) 251 | } 252 | } 253 | 254 | return results.matches 255 | } 256 | 257 | // Remove removes the vector with the specified id from the index. If no vector 258 | // is found with the specified id the method will simply do nothing. 259 | func (l *LSHIndex) Remove(id interface{}) { 260 | l.lock.Lock() 261 | defer l.lock.Unlock() 262 | 263 | delete(l.signatures, id) 264 | l.scheme.Remove(id) 265 | } 266 | -------------------------------------------------------------------------------- /lsh.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | radix "github.com/armon/go-radix" 8 | "github.com/james-bowman/sparse" 9 | ) 10 | 11 | // lshTableBucket represents a hash table bucket used for ClassicLSH. 
The bucket 12 | // is a slice of IDs relating to items whose hash maps to the bucket. 13 | type lshTableBucket []interface{} 14 | 15 | // lshTable is an hash table used for ClassicLSH. It is simply a map of hashcodes 16 | // to lshTableBuckets 17 | //type lshTable map[uint64]lshTableBucket 18 | type lshTable map[uint64]lshTableBucket 19 | 20 | // remove removes the specified item from the LSH table 21 | func (t lshTable) remove(id interface{}) { 22 | for key, bucketContents := range t { 23 | for j, indexedID := range bucketContents { 24 | if id == indexedID { 25 | bucketContents[j] = bucketContents[len(bucketContents)-1] 26 | t[key] = bucketContents[:len(bucketContents)-1] 27 | if len(t[key]) == 0 { 28 | delete(t, key) 29 | } 30 | return 31 | } 32 | } 33 | } 34 | } 35 | 36 | // ClassicLSH supports finding top-k Approximate Nearest Neighbours (ANN) using Locality 37 | // Sensitive Hashing (LSH). Classic LSH scheme is based on using hash tables to store 38 | // items by their locality sensitive hash code based on the work of A. Gionis et al. 39 | // Items that map to the same bucket (their hash codes collide) are similar. Multiple 40 | // hash tables are used to improve recall where some similar items would otherwise 41 | // hash to separate, neighbouring buckets in only a single table. 42 | // 43 | // A. Gionis, P. Indyk, and R. Motwani, “Similarity Search in High Dimensions via 44 | // Hashing,” VLDB ’99 Proc. 25th Int. Conf. Very Large Data Bases, vol. 99, no. 1, 45 | // pp. 518–529, 1999. 46 | // http://www.cs.princeton.edu/courses/archive/spring13/cos598C/Gionis.pdf%5Cnhttp://portal.acm.org/citation.cfm?id=671516 47 | type ClassicLSH struct { 48 | numHashtables int 49 | numHashfunctions int 50 | reqLen int 51 | hashTables []lshTable 52 | } 53 | 54 | // NewClassicLSH creates a new ClassicLSH with the configured number of hash tables 55 | // and hash functions per table. The length of hash signatures used in this type's 56 | // methods (Put() and GetCandidates()) should be exactly equal to functions * tables. 57 | // The Classic LSH algorithm uses multiple hash tables to improve recall for similar 58 | // items that hash to nearby buckets within a specific hash table. 59 | func NewClassicLSH(functions, tables int) *ClassicLSH { 60 | hashtables := make([]lshTable, tables) 61 | for i := range hashtables { 62 | hashtables[i] = make(map[uint64]lshTableBucket) 63 | } 64 | 65 | return &ClassicLSH{ 66 | reqLen: tables * functions, 67 | numHashtables: tables, 68 | numHashfunctions: functions, 69 | hashTables: hashtables, 70 | } 71 | } 72 | 73 | // Put stores the specified LSH signature and associated ID in the LSH index. 74 | // The method panics if the signature is not the same length as tables * functions. 75 | func (l *ClassicLSH) Put(id interface{}, signature *sparse.BinaryVec) { 76 | keys := l.hashKeysForSignature(signature) 77 | for i := range l.hashTables { 78 | l.hashTables[i][keys[i]] = append(l.hashTables[i][keys[i]], id) 79 | } 80 | } 81 | 82 | // GetCandidates returns the IDs of candidate nearest neighbours. It is up to 83 | // the calling code to further filter these candidates based on distance to arrive 84 | // at the top-k approximate nearest neighbours. The number of candidates returned 85 | // may be smaller or larger than k. The method panics if the signature is not the 86 | // same length as tables * functions. 
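// An illustrative sketch of using ClassicLSH as the scheme behind an LSHIndex (the sizes,
// the identifiers and distanceFn - any pairwise.Comparer - are examples; note that the
// SimHash bit length must equal functions * tables):
//
//	hasher := NewSimHash(1024, vocabSize) // 1024-bit signatures over vocabSize dimensions
//	scheme := NewClassicLSH(64, 16)       // 16 tables of 64 hash functions = 1024 bits
//	index := NewLSHIndex(true, hasher, scheme, distanceFn)
//	index.Index(docVector, "doc-1")
//	neighbours := index.Search(queryVector, 10)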
87 | func (l *ClassicLSH) GetCandidates(query *sparse.BinaryVec, k int) []interface{} { 88 | keys := l.hashKeysForSignature(query) 89 | 90 | seen := make(map[interface{}]struct{}) 91 | for i, table := range l.hashTables { 92 | if bucketEntries, exist := table[keys[i]]; exist { 93 | for _, id := range bucketEntries { 94 | seen[id] = struct{}{} 95 | } 96 | } 97 | } 98 | 99 | // Collect results 100 | ids := make([]interface{}, len(seen)) 101 | var i int 102 | for index := range seen { 103 | ids[i] = index 104 | i++ 105 | } 106 | 107 | return ids 108 | } 109 | 110 | // Remove removes the specified item from the LSH index 111 | func (l *ClassicLSH) Remove(id interface{}) { 112 | for _, table := range l.hashTables { 113 | table.remove(id) 114 | } 115 | } 116 | 117 | // hashKeysForSignature chunks the hash into a number of smaller hash codes (one per 118 | // table) each the length of the configured number of hash functions per table. 119 | // The method panics if the signature is not the same length as tables * functions. 120 | func (l *ClassicLSH) hashKeysForSignature(signature *sparse.BinaryVec) []uint64 { 121 | // TODO: rather than simply chunking up the hash signature into k/l chunks 122 | // possibly select hash functions (digits) uniformly at random (with replacement?) 123 | if signature.Len() != l.reqLen { 124 | panic(fmt.Sprintf("nlp: Specified signature is not the correct length. Needed %d but received %d", l.reqLen, signature.Len())) 125 | } 126 | keys := make([]uint64, l.numHashtables) 127 | for i := range keys { 128 | //keys[i] = signature.SliceToUint64(i*l.numHashfunctions, ((i+1)*l.numHashfunctions)-1) 129 | keys[i] = signature.SliceToUint64(i*l.numHashfunctions, ((i + 1) * l.numHashfunctions)) 130 | } 131 | return keys 132 | } 133 | 134 | // hashKeysForSignature chunks the hash into a number of smaller hash codes (one per 135 | // table) each the length of the configured number of hash functions per table. 136 | // The method panics if the signature is not the same length as tables * functions. 137 | // func (l *ClassicLSH) hashKeysForSignature(signature *sparse.BinaryVec) []string { 138 | // // TODO: rather than simply chunking up the hash signature into k/l chunks 139 | // // possibly select hash functions (digits) uniformly at random (with replacement?) 140 | // if signature.Len() != l.reqLen { 141 | // panic(fmt.Sprintf("nlp: Specified signature is not the correct length. Needed %d but received %d", l.reqLen, signature.Len())) 142 | // } 143 | // keys := make([]string, l.numHashtables) 144 | // key := signature.String() 145 | // for i := range keys { 146 | // keys[i] = key[i*l.numHashfunctions : (i+1)*l.numHashfunctions] 147 | // } 148 | // return keys 149 | // } 150 | 151 | // LSHForest is an implementation of the LSH Forest Locality Sensitive Hashing scheme 152 | // based on the work of M. Bawa et al. 153 | // 154 | // M. Bawa, T. Condie, and P. Ganesan, “LSH forest: self-tuning indexes for 155 | // similarity search,” Proc. 14th Int. Conf. World Wide Web - WWW ’05, p. 651, 2005. 156 | // http://dl.acm.org/citation.cfm?id=1060745.1060840 157 | type LSHForest struct { 158 | trees []*radix.Tree 159 | numHashfunctions int 160 | reqLen int 161 | } 162 | 163 | // NewLSHForest creates a new LSHForest Locality Sensitive Hashing scheme with the 164 | // specified number of hash tables and hash functions per table. 
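// An illustrative sketch (the sizes are examples; as with ClassicLSH, the signatures later passed
// to Put() and GetCandidates() must be exactly functions * tables bits long):
//
//	forest := NewLSHForest(64, 16) // expects 1024-bit signatures
//	index := NewLSHIndex(true, NewSimHash(1024, vocabSize), forest, distanceFn)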
165 | func NewLSHForest(functions int, tables int) *LSHForest { 166 | trees := make([]*radix.Tree, tables) 167 | for i := range trees { 168 | trees[i] = radix.New() 169 | } 170 | return &LSHForest{ 171 | trees: trees, 172 | numHashfunctions: functions, 173 | reqLen: functions * tables, 174 | } 175 | } 176 | 177 | // Put stores the specified LSH signature and associated ID in the LSH index 178 | func (l *LSHForest) Put(id interface{}, signature *sparse.BinaryVec) { 179 | keys := l.hashKeysForSignature(signature) 180 | for i, tree := range l.trees { 181 | //bucket, _ := tree.Get(keys[i]) 182 | bucket, ok := tree.Get(keys[i]) 183 | if !ok { 184 | bucket = make([]interface{}, 0) 185 | } 186 | tree.Insert(keys[i], append(bucket.([]interface{}), id)) 187 | } 188 | } 189 | 190 | // GetCandidates returns the IDs of candidate nearest neighbours. It is up to 191 | // the calling code to further filter these candidates based on distance to arrive 192 | // at the top-k approximate nearest neighbours. The number of candidates returned 193 | // may be smaller or larger than k. 194 | func (l *LSHForest) GetCandidates(query *sparse.BinaryVec, k int) []interface{} { 195 | keys := l.hashKeysForSignature(query) 196 | 197 | m := k 198 | seen := make(map[interface{}]struct{}) 199 | 200 | for i, tree := range l.trees { 201 | if bucketEntries, exist := tree.Get(keys[i]); exist { 202 | for _, id := range bucketEntries.([]interface{}) { 203 | seen[id] = struct{}{} 204 | } 205 | } 206 | } 207 | 208 | // if we have not found enough candidates then walk back up the trees for 209 | // similar items in neighbouring buckets with shared prefixes 210 | x := l.numHashfunctions 211 | for len(seen) < m && x > 0 { 212 | for i, tree := range l.trees { 213 | var k string 214 | if keys[i][x-1] == '1' { 215 | k = "0" 216 | } else { 217 | k = "1" 218 | } 219 | 220 | altKey := strings.Join([]string{keys[i][0 : x-1], k}, "") 221 | tree.WalkPrefix(altKey, func(s string, v interface{}) bool { 222 | for _, id := range v.([]interface{}) { 223 | seen[id] = struct{}{} 224 | } 225 | return false 226 | }) 227 | } 228 | x-- 229 | } 230 | 231 | // Collect results 232 | candidates := make([]interface{}, len(seen)) 233 | var i int 234 | for index := range seen { 235 | candidates[i] = index 236 | i++ 237 | } 238 | 239 | return candidates 240 | } 241 | 242 | // Remove removes the specified item from the LSH index 243 | func (l *LSHForest) Remove(id interface{}) { 244 | for _, tree := range l.trees { 245 | tree.Walk(func(s string, v interface{}) bool { 246 | bucketContents := v.([]interface{}) 247 | for i, indexedID := range bucketContents { 248 | if id == indexedID { 249 | bucketContents[i] = bucketContents[len(bucketContents)-1] 250 | bucketContents = bucketContents[:len(bucketContents)-1] 251 | if len(bucketContents) == 0 { 252 | tree.Delete(s) 253 | } else { 254 | tree.Insert(s, bucketContents) 255 | } 256 | return true 257 | } 258 | } 259 | return false 260 | }) 261 | } 262 | } 263 | 264 | // hashKeysForSignature chunks the hash into a number of smaller hash codes (one per 265 | // table) each the length of the configured number of hash functions per table. 266 | // The method panics if the signature is not the same length as tables * functions. 267 | func (l *LSHForest) hashKeysForSignature(signature *sparse.BinaryVec) []string { 268 | // TODO: rather than simply chunking up the hash signature into k/l chunks 269 | // possibly select hash functions (digits) uniformly at random (with replacement?) 
270 | if signature.Len() != l.reqLen { 271 | panic(fmt.Sprintf("nlp: Specified signature is not the correct length. Needed %d but received %d", l.reqLen, signature.Len())) 272 | } 273 | keys := make([]string, len(l.trees)) 274 | key := signature.String() 275 | for i := range keys { 276 | keys[i] = key[i*l.numHashfunctions : (i+1)*l.numHashfunctions] 277 | } 278 | return keys 279 | } 280 | -------------------------------------------------------------------------------- /lda_test.go: -------------------------------------------------------------------------------- 1 | package nlp_test 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "testing" 7 | 8 | "golang.org/x/exp/rand" 9 | 10 | "github.com/james-bowman/nlp" 11 | "gonum.org/v1/gonum/mat" 12 | ) 13 | 14 | var stopWords = []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", 
"will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"} 15 | 16 | func TestLDAFit(t *testing.T) { 17 | tests := []struct { 18 | topics int 19 | r, c int 20 | data []float64 21 | expectedTopics [][]float64 22 | }{ 23 | { 24 | topics: 3, 25 | r: 9, c: 9, 26 | data: []float64{ 27 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 28 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 29 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 30 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 31 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 32 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 33 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 34 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 35 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 36 | }, 37 | expectedTopics: [][]float64{ 38 | {0.33, 0.33, 0.33, 0, 0, 0, 0, 0, 0}, 39 | {0, 0, 0, 0, 0, 0, 0.33, 0.33, 0.33}, 40 | {0, 0, 0, 0.33, 0.33, 0.33, 0, 0, 0}, 41 | }, 42 | }, 43 | { 44 | topics: 3, 45 | r: 9, c: 9, 46 | data: []float64{ 47 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 48 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 49 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 50 | 0, 0, 0, 3, 5, 1, 0, 0, 0, 51 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 52 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 53 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 54 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 55 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 56 | }, 57 | expectedTopics: [][]float64{ 58 | {0.33, 0.33, 0.33, 0, 0, 0, 0, 0, 0}, 59 | {0, 0, 0, 0, 0, 0, 0.33, 0.33, 0.33}, 60 | {0, 0, 0, 0.428, 0.285, 0.285, 0, 0, 0}, 61 | }, 62 | }, 63 | } 64 | 65 | for ti, test := range tests { 66 | // set Rnd to fixed constant seed for deterministic results 67 | lda := nlp.NewLatentDirichletAllocation(test.topics) 68 | lda.Rnd = rand.New(rand.NewSource(uint64(0))) 69 | 70 | in := mat.NewDense(test.r, test.c, test.data) 71 | lda.Fit(in) 72 | 73 | components := lda.Components() 74 | 75 | for i := 0; i < test.topics; i++ { 76 | var sum float64 77 | for ri, v := range test.expectedTopics[i] { 78 | cv := components.At(i, ri) 79 | sum += cv 80 | if math.Abs(cv-v) > 0.01 { 81 | t.Errorf("Test %d: Topic (%d) over word (%d) distribution incorrect. 
Expected %f but received %f\n", ti, i, ri, v, cv) 82 | } 83 | } 84 | if math.Abs(1-sum) > 0.00000001 { 85 | t.Errorf("Test %d: values in topic (%d) over word distributions should sum to 1 but summed to %f\n", ti, i, sum) 86 | } 87 | } 88 | } 89 | } 90 | 91 | func TestLDAFitTransform(t *testing.T) { 92 | tests := []struct { 93 | topics int 94 | r, c int 95 | data []float64 96 | expectedDocs [][]float64 97 | }{ 98 | { 99 | topics: 3, 100 | r: 9, c: 9, 101 | data: []float64{ 102 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 103 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 104 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 105 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 106 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 107 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 108 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 109 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 110 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 111 | }, 112 | expectedDocs: [][]float64{ 113 | {1, 0, 0}, 114 | {1, 0, 0}, 115 | {1, 0, 0}, 116 | {0, 0, 1}, 117 | {0, 0, 1}, 118 | {0, 0, 1}, 119 | {0, 1, 0}, 120 | {0, 1, 0}, 121 | {0, 1, 0}, 122 | }, 123 | }, 124 | { 125 | topics: 3, 126 | r: 9, c: 9, 127 | data: []float64{ 128 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 129 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 130 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 131 | 0, 0, 0, 3, 5, 1, 0, 0, 0, 132 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 133 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 134 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 135 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 136 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 137 | }, 138 | expectedDocs: [][]float64{ 139 | {1, 0, 0}, 140 | {1, 0, 0}, 141 | {1, 0, 0}, 142 | {0, 0, 1}, 143 | {0, 0, 1}, 144 | {0, 0, 1}, 145 | {0, 1, 0}, 146 | {0, 1, 0}, 147 | {0, 1, 0}, 148 | }, 149 | }, 150 | } 151 | 152 | for ti, test := range tests { 153 | // set Rnd to fixed constant seed for deterministic results 154 | lda := nlp.NewLatentDirichletAllocation(test.topics) 155 | lda.Rnd = rand.New(rand.NewSource(uint64(0))) 156 | 157 | in := mat.NewDense(test.r, test.c, test.data) 158 | theta, err := lda.FitTransform(in) 159 | if err != nil { 160 | t.Error(err) 161 | } 162 | 163 | for j := 0; j < test.c; j++ { 164 | var sum float64 165 | for ri, v := range test.expectedDocs[j] { 166 | cv := theta.At(ri, j) 167 | sum += cv 168 | if math.Abs(cv-v) > 0.01 { 169 | t.Errorf("Test %d: Document (%d) over topic (%d) distribution incorrect. 
Expected %f but received %f\n", ti, j, ri, v, cv) 170 | } 171 | } 172 | if math.Abs(1-sum) > 0.00000001 { 173 | t.Errorf("Test %d: values in document (%d) over topic distributions should sum to 1 but summed to %f\n", ti, j, sum) 174 | } 175 | } 176 | } 177 | } 178 | 179 | func TestLDATransform(t *testing.T) { 180 | tests := []struct { 181 | topics int 182 | r, c int 183 | data []float64 184 | }{ 185 | { 186 | topics: 3, 187 | r: 9, c: 9, 188 | data: []float64{ 189 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 190 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 191 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 192 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 193 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 194 | 0, 0, 0, 3, 3, 3, 0, 0, 0, 195 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 196 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 197 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 198 | }, 199 | }, 200 | { 201 | topics: 3, 202 | r: 9, c: 9, 203 | data: []float64{ 204 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 205 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 206 | 3, 3, 3, 0, 0, 0, 0, 0, 0, 207 | 0, 0, 0, 3, 5, 1, 0, 0, 0, 208 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 209 | 0, 0, 0, 3, 5, 0, 0, 0, 0, 210 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 211 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 212 | 0, 0, 0, 0, 0, 0, 4, 4, 4, 213 | }, 214 | }, 215 | } 216 | 217 | for ti, test := range tests { 218 | // set Rnd to fixed constant seed for deterministic results 219 | lda := nlp.NewLatentDirichletAllocation(test.topics) 220 | lda.Rnd = rand.New(rand.NewSource(uint64(0))) 221 | lda.PerplexityEvaluationFrequency = 2 222 | 223 | in := mat.NewDense(test.r, test.c, test.data) 224 | theta, err := lda.FitTransform(in) 225 | if err != nil { 226 | t.Error(err) 227 | } 228 | 229 | tTheta, err := lda.Transform(in) 230 | 231 | if !mat.EqualApprox(theta, tTheta, 0.035) { 232 | t.Errorf("Test %d: Transformed matrix not equal to FitTransformed\nExpected:\n %v\nbut received:\n %v\n", ti, mat.Formatted(theta), mat.Formatted(tTheta)) 233 | } 234 | } 235 | } 236 | 237 | func ExampleLatentDirichletAllocation() { 238 | corpus := []string{ 239 | "The quick brown fox jumped over the lazy dog", 240 | "The cow jumped over the moon", 241 | "The little dog laughed to see such fun", 242 | } 243 | 244 | // Create a pipeline with a count vectoriser and LDA transformer for 2 topics 245 | vectoriser := nlp.NewCountVectoriser(stopWords...) 246 | lda := nlp.NewLatentDirichletAllocation(2) 247 | pipeline := nlp.NewPipeline(vectoriser, lda) 248 | 249 | docsOverTopics, err := pipeline.FitTransform(corpus...) 
250 | if err != nil { 251 | fmt.Printf("Failed to model topics for documents because %v", err) 252 | return 253 | } 254 | 255 | // Examine Document over topic probability distribution 256 | dr, dc := docsOverTopics.Dims() 257 | for doc := 0; doc < dc; doc++ { 258 | fmt.Printf("\nTopic distribution for document: '%s' -", corpus[doc]) 259 | for topic := 0; topic < dr; topic++ { 260 | if topic > 0 { 261 | fmt.Printf(",") 262 | } 263 | fmt.Printf(" Topic #%d=%f", topic, docsOverTopics.At(topic, doc)) 264 | } 265 | } 266 | 267 | // Examine Topic over word probability distribution 268 | topicsOverWords := lda.Components() 269 | tr, tc := topicsOverWords.Dims() 270 | 271 | vocab := make([]string, len(vectoriser.Vocabulary)) 272 | for k, v := range vectoriser.Vocabulary { 273 | vocab[v] = k 274 | } 275 | for topic := 0; topic < tr; topic++ { 276 | fmt.Printf("\nWord distribution for Topic #%d -", topic) 277 | for word := 0; word < tc; word++ { 278 | if word > 0 { 279 | fmt.Printf(",") 280 | } 281 | fmt.Printf(" '%s'=%f", vocab[word], topicsOverWords.At(topic, word)) 282 | } 283 | } 284 | } 285 | -------------------------------------------------------------------------------- /randomprojection_test.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | 7 | "github.com/james-bowman/nlp/measures/pairwise" 8 | "github.com/james-bowman/sparse" 9 | "golang.org/x/exp/rand" 10 | "gonum.org/v1/gonum/mat" 11 | ) 12 | 13 | func TestSignRandomProjection(t *testing.T) { 14 | tests := []struct { 15 | rows int 16 | cols int 17 | bits int 18 | }{ 19 | {rows: 100, cols: 1000, bits: 1024}, 20 | {rows: 100, cols: 1000, bits: 256}, 21 | } 22 | 23 | for ti, test := range tests { 24 | // Given an input matrix and a query matching one column 25 | matrix := mat.NewDense(test.rows, test.cols, nil) 26 | for i := 0; i < test.rows; i++ { 27 | for j := 0; j < test.cols; j++ { 28 | matrix.Set(i, j, rand.Float64()) 29 | } 30 | } 31 | 32 | query := matrix.ColView(0) 33 | 34 | // When transformed using sign random projections 35 | transformer := NewSignRandomProjection(test.bits) 36 | reducedDimMatrix, err := transformer.FitTransform(matrix) 37 | if err != nil { 38 | t.Errorf("Failed to transform matrix because %v\n", err) 39 | } 40 | m := reducedDimMatrix.(*sparse.Binary) 41 | 42 | reducedDimQuery, err := transformer.Transform(query) 43 | if err != nil { 44 | t.Errorf("Failed to transform query because %v\n", err) 45 | } 46 | q := reducedDimQuery.(*sparse.Binary).ColView(0) 47 | 48 | var culmDiff float64 49 | for i := 0; i < test.cols; i++ { 50 | angSim := pairwise.AngularSimilarity(query, matrix.ColView(i)) 51 | lshSim := pairwise.HammingSimilarity(q, m.ColView(i)) 52 | 53 | if i == 0 { 54 | if math.Abs(angSim-lshSim) >= 0.0000001 { 55 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim) 56 | } 57 | } 58 | 59 | diff := math.Abs(lshSim-angSim) / angSim 60 | culmDiff += diff 61 | } 62 | avgDiff := culmDiff / float64(test.cols) 63 | 64 | // Then output matrix should be of specified length, 65 | // matching column should still have similarity of ~1.0 and 66 | // avg difference betwen angular and hamming similarities should 67 | // be less than 0.03 68 | r, c := m.Dims() 69 | if r != test.bits || c != test.cols { 70 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.bits, test.cols, r, c) 71 | } 72 | if avgDiff >= 0.03 { 73 | t.Errorf("Test %d: Expected 
difference between vector spaces %f but was %f\n", ti, 0.03, avgDiff) 74 | } 75 | } 76 | } 77 | 78 | func TestRandomProjection(t *testing.T) { 79 | tests := []struct { 80 | k int 81 | rows int 82 | cols int 83 | density float32 84 | }{ 85 | {k: 400, rows: 700, cols: 600, density: 0.02}, 86 | {k: 400, rows: 800, cols: 800, density: 0.02}, 87 | } 88 | 89 | for ti, test := range tests { 90 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR() 91 | query := matrix.ToCSC().ColView(0) 92 | 93 | // When transformed using sign random projections 94 | transformer := NewRandomProjection(test.k, float64(test.density)) 95 | transformer.rnd = rand.New(rand.NewSource(uint64(0))) 96 | reducedDimMatrix, err := transformer.FitTransform(matrix) 97 | if err != nil { 98 | t.Errorf("Failed to transform matrix because %v\n", err) 99 | } 100 | m := reducedDimMatrix.(*sparse.CSR).ToCSC() 101 | 102 | reducedDimQuery, err := transformer.Transform(query) 103 | if err != nil { 104 | t.Errorf("Failed to transform query because %v\n", err) 105 | } 106 | q := reducedDimQuery.(*sparse.CSR).ToCSC().ColView(0) 107 | 108 | var culmDiff float64 109 | ColDo(matrix, func(j int, v mat.Vector) { 110 | angSim := pairwise.CosineSimilarity(query, v) 111 | lshSim := pairwise.CosineSimilarity(q, m.ColView(j)) 112 | 113 | if j == 0 { 114 | if math.Abs(angSim-lshSim) >= 0.0000001 { 115 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim) 116 | } 117 | } 118 | 119 | //diff := math.Abs(lshSim-angSim) / angSim 120 | diff := math.Abs(lshSim - angSim) 121 | culmDiff += diff 122 | }) 123 | t.Logf("CulmDiff = %f\n", culmDiff) 124 | avgDiff := culmDiff / float64(test.cols) 125 | 126 | // Then output matrix should be of specified length, 127 | // matching column should still have similarity of ~1.0 and 128 | // avg difference betwen angular and hamming similarities should 129 | // be less than 0.03 130 | r, c := reducedDimMatrix.Dims() 131 | if r != test.k || c != test.cols { 132 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c) 133 | } 134 | if avgDiff >= 0.05 { 135 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.05, avgDiff) 136 | } 137 | } 138 | } 139 | 140 | func TestRandomIndexingFit(t *testing.T) { 141 | tests := []struct { 142 | k int 143 | rows int 144 | cols int 145 | density float32 146 | }{ 147 | {k: 400, rows: 700, cols: 600, density: 0.02}, 148 | {k: 400, rows: 800, cols: 800, density: 0.02}, 149 | } 150 | 151 | for ti, test := range tests { 152 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR() 153 | query := matrix.ToCSC().ColView(0) 154 | 155 | // When transformed using sign random projections 156 | transformer := NewRandomIndexing(test.k, float64(test.density)) 157 | transformer.rnd = rand.New(rand.NewSource(uint64(0))) 158 | reducedDimMatrix, err := transformer.FitTransform(matrix) 159 | if err != nil { 160 | t.Errorf("Failed to transform matrix because %v\n", err) 161 | } 162 | m := reducedDimMatrix.(sparse.TypeConverter).ToCSC() 163 | 164 | reducedDimQuery, err := transformer.Transform(query) 165 | if err != nil { 166 | t.Errorf("Failed to transform query because %v\n", err) 167 | } 168 | q := reducedDimQuery.(sparse.TypeConverter).ToCSC().ColView(0) 169 | 170 | var culmDiff float64 171 | ColDo(matrix, func(j int, v mat.Vector) { 172 | angSim := 
pairwise.CosineSimilarity(query, v) 173 | lshSim := pairwise.CosineSimilarity(q, m.ColView(j)) 174 | 175 | if j == 0 { 176 | if math.Abs(angSim-lshSim) >= 0.05 { 177 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim) 178 | } 179 | } 180 | 181 | //diff := math.Abs(lshSim-angSim) / angSim 182 | diff := math.Abs(lshSim - angSim) 183 | culmDiff += diff 184 | }) 185 | t.Logf("CulmDiff = %f\n", culmDiff) 186 | avgDiff := culmDiff / float64(test.cols) 187 | 188 | // Then output matrix should be of specified length, 189 | // matching column should still have similarity of ~1.0 and 190 | // avg difference betwen angular and hamming similarities should 191 | // be less than 0.03 192 | r, c := reducedDimMatrix.Dims() 193 | if r != test.k || c != test.cols { 194 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c) 195 | } 196 | if avgDiff >= 0.12 { 197 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.12, avgDiff) 198 | } 199 | } 200 | } 201 | 202 | func TestRandomIndexingPartialFit(t *testing.T) { 203 | tests := []struct { 204 | k int 205 | rows int 206 | cols int 207 | density float32 208 | }{ 209 | {k: 400, rows: 700, cols: 600, density: 0.02}, 210 | {k: 400, rows: 800, cols: 800, density: 0.02}, 211 | } 212 | 213 | for ti, test := range tests { 214 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR() 215 | query := matrix.ToCSC().ColView(0) 216 | 217 | // When transformed using sign random projections 218 | transformer := NewRandomIndexing(test.k, float64(test.density)) 219 | transformer.rnd = rand.New(rand.NewSource(uint64(0))) 220 | 221 | ColDo(matrix, func(j int, v mat.Vector) { 222 | transformer.PartialFit(v) 223 | }) 224 | 225 | reducedDimMatrix, err := transformer.Transform(matrix) 226 | if err != nil { 227 | t.Errorf("Failed to transform matrix because %v\n", err) 228 | } 229 | m := reducedDimMatrix.(sparse.TypeConverter).ToCSC() 230 | 231 | reducedDimQuery, err := transformer.Transform(query) 232 | if err != nil { 233 | t.Errorf("Failed to transform query because %v\n", err) 234 | } 235 | q := reducedDimQuery.(sparse.TypeConverter).ToCSC().ColView(0) 236 | 237 | var culmDiff float64 238 | ColDo(matrix, func(j int, v mat.Vector) { 239 | angSim := pairwise.CosineSimilarity(query, v) 240 | lshSim := pairwise.CosineSimilarity(q, m.ColView(j)) 241 | 242 | if j == 0 { 243 | if math.Abs(angSim-lshSim) >= 0.05 { 244 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim) 245 | } 246 | } 247 | 248 | //diff := math.Abs(lshSim-angSim) / angSim 249 | diff := math.Abs(lshSim - angSim) 250 | culmDiff += diff 251 | }) 252 | t.Logf("CulmDiff = %f\n", culmDiff) 253 | avgDiff := culmDiff / float64(test.cols) 254 | 255 | // Then output matrix should be of specified length, 256 | // matching column should still have similarity of ~1.0 and 257 | // avg difference betwen angular and hamming similarities should 258 | // be less than 0.03 259 | r, c := reducedDimMatrix.Dims() 260 | if r != test.k || c != test.cols { 261 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c) 262 | } 263 | if avgDiff >= 0.12 { 264 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.12, avgDiff) 265 | } 266 | } 267 | } 268 | 269 | func TestReflectiveRandomIndexing(t *testing.T) { 270 | tests := 
[]struct { 271 | k int 272 | rows int 273 | cols int 274 | density float32 275 | }{ 276 | {k: 400, rows: 700, cols: 600, density: 0.02}, 277 | {k: 400, rows: 800, cols: 800, density: 0.02}, 278 | } 279 | 280 | for ti, test := range tests { 281 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR() 282 | query := matrix.ToCSC().ColView(0) 283 | 284 | // When transformed using Reflective Random Indexing 285 | transformer := NewReflectiveRandomIndexing(test.k, TermBasedRRI, 0, float64(test.density)) 286 | transformer.rnd = rand.New(rand.NewSource(uint64(0))) 287 | reducedDimMatrix, err := transformer.FitTransform(matrix) 288 | if err != nil { 289 | t.Errorf("Failed to transform matrix because %v\n", err) 290 | } 291 | m := reducedDimMatrix.(sparse.TypeConverter).ToCSC() 292 | 293 | reducedDimQuery, err := transformer.Transform(query) 294 | if err != nil { 295 | t.Errorf("Failed to transform query because %v\n", err) 296 | } 297 | q := reducedDimQuery.(sparse.TypeConverter).ToCSC().ColView(0) 298 | 299 | var culmDiff float64 300 | ColDo(matrix, func(j int, v mat.Vector) { 301 | origSim := pairwise.CosineSimilarity(query, v) 302 | redSim := pairwise.CosineSimilarity(q, m.ColView(j)) 303 | 304 | if j == 0 { 305 | if math.Abs(origSim-redSim) >= 0.0000001 { 306 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Original) and %.10f (Reduced)\n", ti, origSim, redSim) 307 | } 308 | } 309 | 310 | diff := math.Abs(redSim - origSim) 311 | culmDiff += diff 312 | }) 313 | t.Logf("CulmDiff = %f\n", culmDiff) 314 | avgDiff := culmDiff / float64(test.cols) 315 | 316 | // Then output matrix should be of specified length, 317 | // matching column should still have similarity of ~1.0 and 318 | // avg difference betwen angular and hamming similarities should 319 | // be less than 0.03 320 | r, c := reducedDimMatrix.Dims() 321 | if r != test.k || c != test.cols { 322 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c) 323 | } 324 | if avgDiff >= 0.12 { 325 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.12, avgDiff) 326 | } 327 | } 328 | } 329 | -------------------------------------------------------------------------------- /vectorisers.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | 7 | "github.com/james-bowman/sparse" 8 | "github.com/spaolacci/murmur3" 9 | "gonum.org/v1/gonum/mat" 10 | ) 11 | 12 | // Vectoriser provides a common interface for vectorisers that take a variable 13 | // set of string arguments and produce a numerical matrix of features. 14 | type Vectoriser interface { 15 | Fit(...string) Vectoriser 16 | Transform(...string) (mat.Matrix, error) 17 | FitTransform(...string) (mat.Matrix, error) 18 | } 19 | 20 | // OnlineVectoriser is an extension to the Vectoriser interface that supports 21 | // online (streaming/mini-batch) training as opposed to just batch. 22 | type OnlineVectoriser interface { 23 | Vectoriser 24 | PartialFit(...string) OnlineVectoriser 25 | } 26 | 27 | // Transformer provides a common interface for transformer steps. 
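// An illustrative sketch of chaining a Vectoriser with a Transformer by hand (the documents
// and the choice of 100 dimensions are examples):
//
//	vectoriser := NewCountVectoriser()
//	tdmat, err := vectoriser.FitTransform(docs...)
//	if err != nil {
//		// handle error
//	}
//	reduced, err := NewTruncatedSVD(100).FitTransform(tdmat)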
28 | type Transformer interface { 29 | Fit(mat.Matrix) Transformer 30 | Transform(mat mat.Matrix) (mat.Matrix, error) 31 | FitTransform(mat mat.Matrix) (mat.Matrix, error) 32 | } 33 | 34 | // OnlineTransformer is an extension to the Transformer interface that 35 | // supports online (streaming/mini-batch) training as opposed to just batch. 36 | type OnlineTransformer interface { 37 | Transformer 38 | PartialFit(mat.Matrix) OnlineTransformer 39 | } 40 | 41 | // Tokeniser is the interface for tokenisers, allowing substitution of different 42 | // tokenisation strategies, e.g. RegExp, and also supporting different 43 | // token types such as n-grams and different languages. 44 | type Tokeniser interface { 45 | // ForEachIn iterates over each token within text and invokes function 46 | // f with the token as parameter 47 | ForEachIn(text string, f func(token string)) 48 | 49 | // Tokenise returns a slice of all the tokens contained in string 50 | // text 51 | Tokenise(text string) []string 52 | } 53 | 54 | // RegExpTokeniser implements the Tokeniser interface using a basic RegExp 55 | // pattern as a unigram word tokeniser supporting optional stop word 56 | // removal. 57 | type RegExpTokeniser struct { 58 | RegExp *regexp.Regexp 59 | StopWords map[string]bool 60 | } 61 | 62 | // NewTokeniser returns a new, default Tokeniser implementation. 63 | // stopWords is a potentially empty string slice 64 | // that contains the words that should be removed from the corpus. 65 | // The default RegExpTokeniser extracts runs of Unicode letters as tokens, splitting on whitespace and punctuation. 66 | func NewTokeniser(stopWords ...string) Tokeniser { 67 | var stop map[string]bool 68 | 69 | stop = make(map[string]bool) 70 | for _, word := range stopWords { 71 | stop[word] = true 72 | } 73 | return &RegExpTokeniser{ 74 | RegExp: regexp.MustCompile("[\\p{L}]+"), 75 | StopWords: stop, 76 | } 77 | } 78 | 79 | // ForEachIn iterates over each token within text and invokes function 80 | // f with the token as parameter. If StopWords is not nil then any 81 | // tokens from text present in StopWords will be ignored. 82 | func (t *RegExpTokeniser) ForEachIn(text string, f func(token string)) { 83 | tokens := t.tokenise(text) 84 | for _, token := range tokens { 85 | if t.StopWords != nil { 86 | if t.StopWords[token] { 87 | continue 88 | } 89 | } 90 | f(token) 91 | } 92 | } 93 | 94 | // Tokenise returns a slice of all the tokens contained in string 95 | // text. If StopWords is not nil then any tokens from text present in 96 | // StopWords will be removed from the slice. 97 | func (t *RegExpTokeniser) Tokenise(text string) []string { 98 | words := t.tokenise(text) 99 | 100 | // filter out stop words 101 | if t.StopWords != nil { 102 | b := words[:0] 103 | for _, w := range words { 104 | if !t.StopWords[w] { 105 | b = append(b, w) 106 | } 107 | } 108 | return b 109 | } 110 | 111 | return words 112 | } 113 | 114 | // tokenise returns a slice of all the tokens contained in string 115 | // text. 116 | func (t *RegExpTokeniser) tokenise(text string) []string { 117 | // convert content to lower case 118 | c := strings.ToLower(text) 119 | 120 | // match whole words, removing any punctuation/whitespace 121 | words := t.RegExp.FindAllString(c, -1) 122 | 123 | return words 124 | } 125 | 126 | // CountVectoriser can be used to encode one or more text documents into a term document 127 | // matrix where each column represents a document within the corpus and each row represents 128 | // a term present in the training data set.
Each element represents the frequency with which the 129 | // corresponding term appears in the corresponding document, e.g. tf(t, d) = 5 would mean 130 | // that term t (perhaps the word "dog") appears 5 times in the document d. 131 | type CountVectoriser struct { 132 | // Vocabulary is a map of words to indices that point to the row number representing 133 | // that word in the term document matrix output from the Transform() and FitTransform() 134 | // methods. The Vocabulary map is populated by the Fit() or FitTransform() methods 135 | // based upon the words occurring in the datasets supplied to those methods. Within 136 | // Transform(), any words found in the test data set that were not present in the 137 | // training data set supplied to Fit() will not have an entry in the Vocabulary 138 | // and will be ignored. 139 | Vocabulary map[string]int 140 | 141 | // Tokeniser is used to tokenise input text into features. 142 | Tokeniser Tokeniser 143 | } 144 | 145 | // NewCountVectoriser creates a new CountVectoriser. 146 | // stopWords is a potentially empty slice of words to be removed from the corpus. 147 | func NewCountVectoriser(stopWords ...string) *CountVectoriser { 148 | return &CountVectoriser{ 149 | Vocabulary: make(map[string]int), 150 | Tokeniser: NewTokeniser(stopWords...), 151 | } 152 | } 153 | 154 | // Fit processes the supplied training data (a variable number of strings representing 155 | // documents). Each word appearing inside the training data will be added to the 156 | // Vocabulary. The Fit() method is intended to be called once to train the model 157 | // in a batch context. Calling the Fit() method a second time will have the effect of 158 | // re-training the model from scratch (discarding the previously learnt vocabulary). 159 | func (v *CountVectoriser) Fit(train ...string) Vectoriser { 160 | i := 0 161 | if len(v.Vocabulary) != 0 { 162 | v.Vocabulary = make(map[string]int) 163 | } 164 | v.fitVocab(i, train...) 165 | 166 | return v 167 | } 168 | 169 | // fitVocab learns the vocabulary contained within the supplied training documents 170 | func (v *CountVectoriser) fitVocab(start int, train ...string) { 171 | i := start 172 | for _, doc := range train { 173 | v.Tokeniser.ForEachIn(doc, func(word string) { 174 | _, exists := v.Vocabulary[word] 175 | if !exists { 176 | v.Vocabulary[word] = i 177 | i++ 178 | } 179 | }) 180 | } 181 | } 182 | 183 | // Transform transforms the supplied documents into a term document matrix where each 184 | // column is a feature vector representing one of the supplied documents. Each element 185 | // represents the frequency with which the associated term for that row occurred within 186 | // that document. The returned matrix is a sparse matrix type. 187 | func (v *CountVectoriser) Transform(docs ...string) (mat.Matrix, error) { 188 | mat := sparse.NewDOK(len(v.Vocabulary), len(docs)) 189 | 190 | for d, doc := range docs { 191 | v.Tokeniser.ForEachIn(doc, func(word string) { 192 | i, exists := v.Vocabulary[word] 193 | 194 | if exists { 195 | mat.Set(i, d, mat.At(i, d)+1) 196 | } 197 | }) 198 | } 199 | return mat, nil 200 | } 201 | 202 | // FitTransform is exactly equivalent to calling Fit() followed by Transform() on the 203 | // same documents. This is a convenience where separate training data is not being 204 | // used to fit the model i.e. the model is fitted on the fly to the test data. 205 | // The returned matrix is a sparse matrix type.
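// An illustrative sketch (the stop words and documents are examples):
//
//	v := NewCountVectoriser("the", "a")
//	tdmat, err := v.FitTransform("the quick brown fox", "a lazy dog")
//	// tdmat has len(v.Vocabulary) rows and 2 columns (one per document)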
206 | func (v *CountVectoriser) FitTransform(docs ...string) (mat.Matrix, error) { 207 | return v.Fit(docs...).Transform(docs...) 208 | } 209 | 210 | // HashingVectoriser can be used to encode one or more text documents into a term document 211 | // matrix where each column represents a document within the corpus and each row represents 212 | // a term. Each element represents the frequency with which the corresponding term appears in the 213 | // corresponding document, e.g. tf(t, d) = 5 would mean that term t (perhaps the word "dog") 214 | // appears 5 times in the document d. 215 | type HashingVectoriser struct { 216 | NumFeatures int 217 | Tokeniser Tokeniser 218 | } 219 | 220 | // NewHashingVectoriser creates a new HashingVectoriser. If stopWords is not an empty slice then 221 | // the supplied stop words will be removed from the documents. numFeatures specifies the number of features 222 | // that should be present in produced vectors. Each word in a document is hashed and 223 | // the mod of the hash and numFeatures gives the row in the matrix corresponding to that 224 | // word. 225 | func NewHashingVectoriser(numFeatures int, stopWords ...string) *HashingVectoriser { 226 | return &HashingVectoriser{ 227 | NumFeatures: numFeatures, 228 | Tokeniser: NewTokeniser(stopWords...), 229 | } 230 | } 231 | 232 | // Fit does nothing for a HashingVectoriser. As the HashingVectoriser vectorises features 233 | // based on their hash, it does not require a pre-determined vocabulary to map features to their 234 | // correct row in the vector. It is effectively stateless and does not require fitting to 235 | // training data. The method is included for compatibility with other vectorisers. 236 | func (v *HashingVectoriser) Fit(train ...string) Vectoriser { 237 | // The hashing vectoriser is stateless and does not require pre-training so this 238 | // method does nothing. 239 | return v 240 | } 241 | 242 | // PartialFit does nothing for a HashingVectoriser. As the HashingVectoriser vectorises 243 | // features based on their hash, it does not require a pre-learnt vocabulary to map 244 | // features to the correct row in the feature vector. This method is included 245 | // for compatibility with other vectorisers. 246 | func (v *HashingVectoriser) PartialFit(train ...string) Vectoriser { 247 | // The hashing vectoriser is stateless and does not require training so this method 248 | // does nothing. 249 | return v 250 | } 251 | 252 | // Transform transforms the supplied documents into a term document matrix where each 253 | // column is a feature vector representing one of the supplied documents. Each element 254 | // represents the frequency with which the associated term for that row occurred within 255 | // that document. The returned matrix is a sparse matrix type. 256 | func (v *HashingVectoriser) Transform(docs ...string) (mat.Matrix, error) { 257 | mat := sparse.NewDOK(v.NumFeatures, len(docs)) 258 | 259 | for d, doc := range docs { 260 | v.Tokeniser.ForEachIn(doc, func(word string) { 261 | h := murmur3.Sum32([]byte(word)) 262 | i := int(h) % v.NumFeatures 263 | 264 | mat.Set(i, d, mat.At(i, d)+1) 265 | }) 266 | } 267 | return mat, nil 268 | } 269 | 270 | // FitTransform for a HashingVectoriser is exactly equivalent to calling 271 | // Transform() with the same documents. For most vectorisers, Fit() must be called 272 | // prior to Transform() and so this method is a convenience where separate 273 | // training data is not used to fit the model.
For a HashingVectoriser, fitting is 274 | // not required and so this method is exactly equivalent to Transform(). As with 275 | // Fit(), this method is included with the HashingVectoriser for compatibility 276 | // with other vectorisers. The returned matrix is a sparse matrix type. 277 | func (v *HashingVectoriser) FitTransform(docs ...string) (mat.Matrix, error) { 278 | return v.Transform(docs...) 279 | } 280 | 281 | // Pipeline is a mechanism for composing processing pipelines out of vectorisers 282 | // transformation steps. For example to compose a classic LSA/LSI pipeline 283 | // (vectorisation -> TFIDF transformation -> Truncated SVD) one could use a 284 | // Pipeline as follows: 285 | // lsaPipeline := NewPipeline(NewCountVectoriser(false), NewTfidfTransformer(), NewTruncatedSVD(100)) 286 | // 287 | type Pipeline struct { 288 | Vectoriser Vectoriser 289 | Transformers []Transformer 290 | } 291 | 292 | // NewPipeline constructs a new processing pipline with the supplied Vectoriser 293 | // and one or more transformers 294 | func NewPipeline(vectoriser Vectoriser, transformers ...Transformer) *Pipeline { 295 | pipeline := Pipeline{ 296 | Vectoriser: vectoriser, 297 | Transformers: transformers, 298 | } 299 | 300 | return &pipeline 301 | } 302 | 303 | // Fit fits the model(s) to the supplied training data 304 | func (p *Pipeline) Fit(docs ...string) Vectoriser { 305 | if _, err := p.FitTransform(docs...); err != nil { 306 | panic("nlp: Failed to Fit pipeline because " + err.Error()) 307 | } 308 | 309 | return p 310 | } 311 | 312 | // Transform transforms the supplied documents into a matrix representation 313 | // of numerical feature vectors using a model(s) previously fitted to supplied 314 | // training data. 315 | func (p *Pipeline) Transform(docs ...string) (mat.Matrix, error) { 316 | matrix, err := p.Vectoriser.Transform(docs...) 317 | if err != nil { 318 | return matrix, err 319 | } 320 | for _, t := range p.Transformers { 321 | matrix, err = t.Transform(matrix) 322 | if err != nil { 323 | return matrix, err 324 | } 325 | } 326 | return matrix, nil 327 | } 328 | 329 | // FitTransform transforms the supplied documents into a matrix representation 330 | // of numerical feature vectors fitting the model to the supplied data in the 331 | // process. 332 | func (p *Pipeline) FitTransform(docs ...string) (mat.Matrix, error) { 333 | matrix, err := p.Vectoriser.FitTransform(docs...) 334 | if err != nil { 335 | return matrix, err 336 | } 337 | for _, t := range p.Transformers { 338 | matrix, err = t.FitTransform(matrix) 339 | if err != nil { 340 | return matrix, err 341 | } 342 | } 343 | return matrix, nil 344 | } 345 | -------------------------------------------------------------------------------- /randomprojection.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "math" 5 | "time" 6 | 7 | "golang.org/x/exp/rand" 8 | 9 | "github.com/james-bowman/sparse" 10 | "gonum.org/v1/gonum/mat" 11 | "gonum.org/v1/gonum/stat/distuv" 12 | "gonum.org/v1/gonum/stat/sampleuv" 13 | ) 14 | 15 | // SignRandomProjection represents a transform of a matrix into a lower 16 | // dimensional space. Sign Random Projection is a method of Locality 17 | // Sensitive Hashing (LSH) sometimes referred to as the random hyperplane method. 
18 | // A set of random hyperplanes are created in the original dimensional 19 | // space and then input matrices are expressed relative to the random 20 | // hyperplanes as follows: 21 | // For each column vector in the input matrix, construct a corresponding output 22 | // bit vector with each bit (i) calculated as follows: 23 | // if dot(vector, hyperplane[i]) > 0 24 | // bit[i] = 1 25 | // else 26 | // bit[i] = 0 27 | // Whilst similar to other methods of random projection this method is unique in that 28 | // it uses only a single bit in the output matrix to represent the sign of the result 29 | // of the comparison (Dot product) with each hyperplane so encodes vector 30 | // representations with very low memory and processor requirements whilst preserving 31 | // relative distance between vectors from the original space. 32 | // Hamming similarity (and distance) between the transformed vectors in the 33 | // subspace can approximate Angular similarity (and distance) (which is strongly 34 | // related to Cosine similarity) of the associated vectors from the original space. 35 | type SignRandomProjection struct { 36 | // Bits represents the number of bits the output vectors should 37 | // be in length and hence the number of random hyperplanes needed 38 | // for the transformation 39 | Bits int 40 | 41 | // simhash is the simhash LSH (Locality Sensitive Hashing) algorithm 42 | // used to perform the sign random projection 43 | simHash *SimHash 44 | } 45 | 46 | // NewSignRandomProjection constructs a new SignRandomProjection transformer 47 | // to reduce the dimensionality. The transformer uses a number of random hyperplanes 48 | // represented by `bits` and is the dimensionality of the output, transformed 49 | // matrices. 50 | func NewSignRandomProjection(bits int) *SignRandomProjection { 51 | return &SignRandomProjection{Bits: bits} 52 | } 53 | 54 | // Fit creates the random hyperplanes from the input training data matrix, mat and 55 | // stores the hyperplanes as a transform to apply to matrices. 56 | func (s *SignRandomProjection) Fit(m mat.Matrix) Transformer { 57 | rows, _ := m.Dims() 58 | s.simHash = NewSimHash(s.Bits, rows) 59 | return s 60 | } 61 | 62 | // Transform applies the transform decomposed from the training data matrix in Fit() 63 | // to the input matrix. The columns in the resulting output matrix will be a low 64 | // dimensional binary representation of the columns within the original 65 | // i.e. a hash or fingerprint that can be quickly and efficiently compared with other 66 | // similar vectors. Hamming similarity in the new dimensional space can be 67 | // used to approximate Cosine similarity between the vectors of the original space. 68 | // The returned matrix is a Binary matrix or BinaryVec type depending 69 | // upon whether m is Matrix or Vector. 70 | func (s *SignRandomProjection) Transform(m mat.Matrix) (mat.Matrix, error) { 71 | _, cols := m.Dims() 72 | 73 | sigs := make([]sparse.BinaryVec, cols) 74 | ColDo(m, func(j int, v mat.Vector) { 75 | sigs[j] = *s.simHash.Hash(v) 76 | }) 77 | return sparse.NewBinary(s.Bits, cols, sigs), nil 78 | } 79 | 80 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 81 | // on the same matrix. This is a useful shortcut where separate training data is not being 82 | // used to fit the model i.e. the model is fitted on the fly to the test data. 83 | // The returned matrix is a Binary matrix or BinaryVec type depending upon 84 | // whether m is Matrix or Vector. 
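//
// An illustrative sketch, assuming tdm is a term document matrix produced
// elsewhere (e.g. by a vectoriser):
//
// 	srp := NewSignRandomProjection(256)
// 	signatures, err := srp.FitTransform(tdm)
// 	if err != nil {
// 		// handle error
// 	}
// 	// signatures has 256 rows (one per bit) and one column per column of
// 	// tdm; its columns can be compared using Hamming similarity to
// 	// approximate the Cosine similarity of the original columns.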
85 | func (s *SignRandomProjection) FitTransform(m mat.Matrix) (mat.Matrix, error) { 86 | return s.Fit(m).Transform(m) 87 | } 88 | 89 | // RandomProjection is a method of dimensionality reduction based upon 90 | // the Johnson–Lindenstrauss lemma stating that a small set of points 91 | // in a high-dimensional space can be embedded into a space of much 92 | // lower dimension in such a way that distances between the points 93 | // are nearly preserved. 94 | // 95 | // The technique projects the original 96 | // matrix orthogonally onto a random subspace, transforming the 97 | // elements of the original matrix into a lower dimensional representation. 98 | // Computing orthogonal matrices is expensive and so this technique 99 | // uses specially generated random matrices (hence the name) following 100 | // the principle that in high dimensional spaces, there are lots of 101 | // nearly orthogonal matrices. 102 | type RandomProjection struct { 103 | K int 104 | Density float64 105 | rnd *rand.Rand 106 | projections mat.Matrix 107 | } 108 | 109 | // NewRandomProjection creates and returns a new RandomProjection 110 | // transformer. The RandomProjection will use a specially generated 111 | // random matrix of the specified density and dimensionality k to 112 | // perform the transform to k dimensional space. 113 | func NewRandomProjection(k int, density float64) *RandomProjection { 114 | r := RandomProjection{ 115 | K: k, 116 | Density: density, 117 | } 118 | 119 | return &r 120 | } 121 | 122 | // Fit creates the random (almost) orthogonal matrix used to project 123 | // input matrices into the new reduced dimensional subspace. 124 | func (r *RandomProjection) Fit(m mat.Matrix) Transformer { 125 | rows, _ := m.Dims() 126 | r.projections = CreateRandomProjectionTransform(r.K, rows, r.Density, r.rnd) 127 | return r 128 | } 129 | 130 | // Transform applies the transformation, projecting the input matrix 131 | // into the reduced dimensional subspace. The transformed matrix 132 | // will be a sparse CSR format matrix of shape k x c. 133 | func (r *RandomProjection) Transform(m mat.Matrix) (mat.Matrix, error) { 134 | var product sparse.CSR 135 | 136 | // projections will be dimensions k x r (k x t) 137 | // m will be dimensions r x c (t x d) 138 | // product will be of reduced dimensions k x c (k x d) 139 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv { 140 | m = t.ToCSR() 141 | } 142 | 143 | product.Mul(r.projections, m) 144 | 145 | return &product, nil 146 | } 147 | 148 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 149 | // on the same matrix. This is a useful shortcut where separate training data is not being 150 | // used to fit the model i.e. the model is fitted on the fly to the test data. 151 | // The returned matrix is a sparse CSR format matrix of shape k x c. 152 | func (r *RandomProjection) FitTransform(m mat.Matrix) (mat.Matrix, error) { 153 | return r.Fit(m).Transform(m) 154 | } 155 | 156 | // RRIBasis represents the initial basis for the index/elemental vectors 157 | // used for Random Reflective Indexing 158 | type RRIBasis int 159 | 160 | const ( 161 | // DocBasedRRI represents columns (documents/contexts in a term-document 162 | // matrix) forming the initial basis for index/elemental vectors in Random Indexing 163 | DocBasedRRI RRIBasis = iota 164 | 165 | // TermBasedRRI indicates rows (terms in a term-document matrix) 166 | // form the initial basis for index/elemental vectors in Reflective Random Indexing. 
167 | TermBasedRRI 168 | ) 169 | 170 | // RandomIndexing is a method of dimensionality reduction used for Latent Semantic 171 | // Analysis in a similar way to TruncatedSVD and PCA. Random 172 | // Indexing is designed to solve limitations of very high dimensional 173 | // vector space model implementations for modelling term co-occurance 174 | // in language processing such as SVD typically used for LSA/LSI (Latent 175 | // Semantic Analysis/Latent Semantic Indexing). In implementation 176 | // it bears some similarity to other random projection techniques 177 | // such as those implemented in RandomProjection and SignRandomProjection 178 | // within this package. 179 | // The RandomIndexing type can also be used to perform Reflective 180 | // Random Indexing which extends the Random Indexing model with additional 181 | // training cycles to better support indirect inferrence i.e. find synonyms 182 | // where the words do not appear together in documents. 183 | type RandomIndexing struct { 184 | // K specifies the number of dimensions for the semantic space 185 | K int 186 | 187 | // Density specifies the proportion of non-zero elements in the 188 | // elemental vectors 189 | Density float64 190 | 191 | // Type specifies the initial basis for the elemental vectors 192 | // i.e. whether they initially represent the rows or columns 193 | // This is only relevent for Reflective Random Indexing 194 | Type RRIBasis 195 | 196 | // Reflections specifies the number of reflective training cycles 197 | // to run during fitting for RRI (Reflective Random Indexing). For 198 | // Randome Indexing (non-reflective) this is 0. 199 | Reflections int 200 | 201 | rnd *rand.Rand 202 | 203 | // components is a k x t matrix where `t` is the number of terms 204 | // (rows) in the training data matrix. The columns in this matrix 205 | // contain the `context` vectors for RI where each column represents 206 | // a semantic representation of a term based upon the contexts 207 | // in which it has appeared within the training data. 208 | components mat.Matrix 209 | } 210 | 211 | // NewRandomIndexing returns a new RandomIndexing transformer 212 | // configured to transform term document matrices into k dimensional 213 | // space. The density parameter specifies the density of the index/elemental 214 | // vectors used to project the input matrix into lower dimensional 215 | // space i.e. the proportion of elements that are non-zero. 216 | func NewRandomIndexing(k int, density float64) *RandomIndexing { 217 | return &RandomIndexing{ 218 | K: k, 219 | Density: density, 220 | } 221 | } 222 | 223 | // NewReflectiveRandomIndexing returns a new RandomIndexing type 224 | // configured for Reflective Random Indexing. Reflective Random 225 | // Indexing applies additional (reflective) training cycles ontop 226 | // of Random Indexing to capture indirect inferences (synonyms). 227 | // i.e. similarity between terms that do not directly co-occur 228 | // within the same context/document. 229 | // basis specifies the basis for the reflective random indexing i.e. 230 | // whether the initial, random index/elemental vectors should represent 231 | // documents (columns) or terms (rows). 232 | // reflections is the number of additional training cycles to apply 233 | // to build the elemental vectors. 234 | // Specifying basis == DocBasedRRI and reflections == 0 is equivalent 235 | // to conventional Random Indexing. 
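//
// For illustration (the parameter values are arbitrary examples), a term
// based RRI transformer producing 300 dimensional context vectors with 2
// reflective training cycles and 1% non-zero elements in the elemental
// vectors could be constructed as:
//
// 	rri := NewReflectiveRandomIndexing(300, TermBasedRRI, 2, 0.01)
//
// whereas NewRandomIndexing(300, 0.01) configures conventional
// (non-reflective) Random Indexing with the same dimensionality and density.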
236 | func NewReflectiveRandomIndexing(k int, basis RRIBasis, reflections int, density float64) *RandomIndexing { 237 | return &RandomIndexing{ 238 | K: k, 239 | Type: basis, 240 | Reflections: reflections, 241 | Density: density, 242 | } 243 | } 244 | 245 | // PartialFit extends the model to take account of the specified matrix m. The 246 | // context vectors are learnt and stored to be used for furture transformations 247 | // and analysis. PartialFit performs Random Indexing even if the Transformer is 248 | // configured for Reflective Random Indexing so if RRI is required please train 249 | // using the Fit() method as a batch operation. Unlike the Fit() method, the 250 | // PartialFit() method is designed to be called multiple times to support online 251 | // and mini-batch learning whereas the Fit() method is only intended to be called 252 | // once for batch learning. 253 | func (r *RandomIndexing) PartialFit(m mat.Matrix) OnlineTransformer { 254 | rows, cols := m.Dims() 255 | 256 | if r.components == nil || r.components.(*sparse.CSR).IsZero() { 257 | r.components = sparse.NewCSR(r.K, rows, make([]int, r.K+1), []int{}, []float64{}) 258 | } 259 | current := r.components 260 | 261 | // Create transform in transpose to get better randomised sparsity patterns 262 | // when partial fitting with small mini-batches e.g. single column/streaming 263 | idxVecs := CreateRandomProjectionTransform(cols, r.K, r.Density, r.rnd).T() 264 | ctxVecs := r.contextualise(m.T(), idxVecs) 265 | 266 | current.(*sparse.CSR).Add(current, ctxVecs) 267 | r.components = current 268 | 269 | return r 270 | } 271 | 272 | // Components returns a t x k matrix where `t` is the number of terms 273 | // (rows) in the training data matrix. The rows in this matrix 274 | // are the `context` vectors for RI each one representing 275 | // a semantic representation of a term based upon the contexts 276 | // in which it has appeared within the training data. 277 | func (r *RandomIndexing) Components() mat.Matrix { 278 | return r.components.T() 279 | } 280 | 281 | // SetComponents sets a t x k matrix where `t` is the number of terms 282 | // (rows) in the training data matrix. 283 | func (r *RandomIndexing) SetComponents(m mat.Matrix) { 284 | r.components = m 285 | } 286 | 287 | // Fit trains the model, creating random index/elemental vectors to 288 | // be used to construct the new projected feature vectors ('context' 289 | // vectors) in the reduced semantic dimensional space. If configured for 290 | // Reflective Random Indexing then Fit may actually run multiple 291 | // training cycles as specified during construction. The Fit method 292 | // trains the model in batch mode so is intended to be called once, for 293 | // online/streaming or mini-batch training please consider the 294 | // PartialFit method instead. 295 | func (r *RandomIndexing) Fit(m mat.Matrix) Transformer { 296 | rows, cols := m.Dims() 297 | var idxVecs mat.Matrix 298 | 299 | if r.Type == TermBasedRRI { 300 | idxVecs = CreateRandomProjectionTransform(r.K, rows, r.Density, r.rnd) 301 | } else { 302 | idxVecs = CreateRandomProjectionTransform(r.K, cols, r.Density, r.rnd) 303 | idxVecs = r.contextualise(m.T(), idxVecs) 304 | } 305 | 306 | for i := 0; i < r.Reflections; i++ { 307 | idxVecs = r.contextualise(m, idxVecs) 308 | idxVecs = r.contextualise(m.T(), idxVecs) 309 | } 310 | 311 | r.components = idxVecs 312 | return r 313 | } 314 | 315 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 316 | // on the same matrix. 
This is a useful shortcut where separate training data is not being 317 | // used to fit the model i.e. the model is fitted on the fly to the test data. 318 | // The returned matrix is a sparse CSR format matrix of shape k x c. 319 | func (r *RandomIndexing) FitTransform(m mat.Matrix) (mat.Matrix, error) { 320 | return r.Fit(m).Transform(m) 321 | } 322 | 323 | // Transform applies the transform, projecting matrix m into the 324 | // lower dimensional semantic space. The output matrix will be of 325 | // shape k x c and will be a sparse CSR format matrix. The transformation 326 | // for each document vector is simply the accumulation of all trained context 327 | // vectors relating to terms appearing in the document. These are weighted by 328 | // the frequency the term appears in the document. 329 | func (r *RandomIndexing) Transform(m mat.Matrix) (mat.Matrix, error) { 330 | return r.contextualise(m, r.components), nil 331 | } 332 | 333 | // contextualise accumulates the vectors vectors for each column in matrix m weighting 334 | // each row vector in vectors by its corresponding value in column of the matrix 335 | func (r *RandomIndexing) contextualise(m mat.Matrix, vectors mat.Matrix) mat.Matrix { 336 | var product sparse.CSR 337 | 338 | product.Mul(vectors, m) 339 | 340 | return &product 341 | } 342 | 343 | // CreateRandomProjectionTransform returns a new random matrix for 344 | // Random Projections of shape newDims x origDims. The matrix will 345 | // be randomly populated using probability distributions where density 346 | // is used as the probability that each element will be populated. 347 | // Populated values will be randomly selected from [-1, 1] scaled 348 | // according to the density and dimensions of the matrix. If rnd is 349 | // nil then a new random number generator will be created and used. 350 | func CreateRandomProjectionTransform(newDims, origDims int, density float64, rnd *rand.Rand) mat.Matrix { 351 | if rnd == nil { 352 | rnd = rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) 353 | } 354 | // TODO Possibly return a mat.Dense instead of sparse.CSR if 355 | // density == 1 356 | 357 | var ptr int 358 | var ind []int 359 | indptr := make([]int, newDims+1) 360 | 361 | for i := 0; i < newDims; i++ { 362 | nnz := binomial(origDims, density, rnd) 363 | if nnz > 0 { 364 | idx := make([]int, nnz) 365 | sampleuv.WithoutReplacement(idx, origDims, rnd) 366 | //sort.Ints(idx) 367 | ind = append(ind, idx...) 
368 | ptr += nnz 369 | } 370 | indptr[i+1] = ptr 371 | } 372 | 373 | vals := make([]float64, len(ind)) 374 | values(vals, newDims, density, rnd) 375 | 376 | return sparse.NewCSR(newDims, origDims, indptr, ind, vals) 377 | } 378 | 379 | func binomial(n int, p float64, rnd *rand.Rand) int { 380 | dist := distuv.Bernoulli{ 381 | P: p, 382 | // Should this be Source (Gonum code and docs seem out of sync) 383 | Src: rnd, 384 | } 385 | 386 | var x int 387 | for i := 0; i < n; i++ { 388 | x += int(dist.Rand()) 389 | } 390 | return x 391 | } 392 | 393 | func values(idx []float64, dims int, density float64, rnd *rand.Rand) { 394 | dist := distuv.Bernoulli{ 395 | P: 0.5, 396 | // Should this be Source (Gonum code and docs seem out of sync) 397 | Src: rnd, 398 | } 399 | 400 | factor := math.Sqrt(1.0/density) / math.Sqrt(float64(dims)) 401 | for i := range idx { 402 | idx[i] = factor * (dist.Rand()*2 - 1) 403 | } 404 | } 405 | -------------------------------------------------------------------------------- /lda.go: -------------------------------------------------------------------------------- 1 | package nlp 2 | 3 | import ( 4 | "math" 5 | "runtime" 6 | "sync" 7 | "time" 8 | 9 | "github.com/james-bowman/sparse" 10 | "golang.org/x/exp/rand" 11 | "gonum.org/v1/gonum/mat" 12 | ) 13 | 14 | // LearningSchedule is used to calculate the learning rate for each iteration using a natural 15 | // gradient descent algorithm. 16 | type LearningSchedule struct { 17 | // S is the scale of the step size for the learning rate. 18 | S float64 19 | 20 | // Tau is the learning offset. The learning offset downweights the 21 | // learning rate from early iterations. 22 | Tau float64 23 | 24 | // Kappa controls the learning decay. This is the amount the learning rate 25 | // reduces each iteration. This is typically a value between 0.5 and 1.0. 26 | Kappa float64 27 | } 28 | 29 | // Calc returns the learning rate for the specified iteration 30 | func (l LearningSchedule) Calc(iteration float64) float64 { 31 | return l.S / math.Pow(l.Tau+iteration, l.Kappa) 32 | } 33 | 34 | type ldaMiniBatch struct { 35 | start, end int 36 | nPhiHat []float64 37 | nZHat []float64 38 | gamma []float64 39 | } 40 | 41 | func newLdaMiniBatch(topics int, words int) *ldaMiniBatch { 42 | l := ldaMiniBatch{ 43 | nPhiHat: make([]float64, topics*words), 44 | nZHat: make([]float64, topics), 45 | gamma: make([]float64, topics), 46 | } 47 | return &l 48 | } 49 | 50 | func (l *ldaMiniBatch) reset() { 51 | for i := range l.nPhiHat { 52 | l.nPhiHat[i] = 0 53 | } 54 | for i := range l.nZHat { 55 | l.nZHat[i] = 0 56 | } 57 | // assume gamma does not need to be zeroed between mini batches 58 | } 59 | 60 | // LatentDirichletAllocation (LDA) for fast unsupervised topic extraction. LDA processes 61 | // documents and learns their latent topic model estimating the posterior document over topic 62 | // probability distribution (the probabilities of each document being allocated to each 63 | // topic) and the posterior topic over word probability distribution. 64 | // 65 | // This transformer uses a parallel implemention of the 66 | // SCVB0 (Stochastic Collapsed Variational Bayes) Algorithm (https://arxiv.org/pdf/1305.2452.pdf) 67 | // by Jimmy Foulds with optional `clumping` optimisations. 68 | type LatentDirichletAllocation struct { 69 | // Iterations is the maximum number of training iterations 70 | Iterations int 71 | 72 | // PerplexityTolerance is the tolerance of perplexity below which the Fit method will stop iterating 73 | // and complete. 
If the evaluated perplexity is is below the tolerance, fitting will terminate successfully 74 | // without necessarily completing all of the configured number of training iterations. 75 | PerplexityTolerance float64 76 | 77 | // PerplexityEvaluationFrquency is the frequency with which to test Perplexity against PerplexityTolerance inside 78 | // Fit. A value <= 0 will not evaluate Perplexity at all and simply iterate for `Iterations` iterations. 79 | PerplexityEvaluationFrequency int 80 | 81 | // BatchSize is the size of mini batches used during training 82 | BatchSize int 83 | 84 | // K is the number of topics 85 | K int 86 | 87 | // NumBurnInPasses is the number of `burn-in` passes across the documents in the 88 | // training data to learn the document statistics before we start collecting topic statistics. 89 | BurnInPasses int 90 | 91 | // TransformationPasses is the number of passes to transform new documents given a previously 92 | // fitted topic model 93 | TransformationPasses int 94 | 95 | // MeanChangeTolerance is the tolerance of change to Theta between burn in passes. 96 | // If the level of change between passes is below the tolerance, the burn in will complete 97 | // without necessarily completing the configured number of passes. 98 | MeanChangeTolerance float64 99 | 100 | // ChangeEvaluationFrequency is the frequency with which to test Perplexity against 101 | // MeanChangeTolerance during burn-in and transformation. A value <= 0 will not evaluate 102 | // the mean change at all and simply iterate for `BurnInPasses` iterations. 103 | ChangeEvaluationFrequency int 104 | 105 | // Alpha is the prior of theta (the documents over topics distribution) 106 | Alpha float64 107 | 108 | // Eta is the prior of phi (the topics over words distribution) 109 | Eta float64 110 | 111 | // RhoPhi is the learning rate for phi (the topics over words distribution) 112 | RhoPhi LearningSchedule 113 | 114 | // RhoTheta is the learning rate for theta (the documents over topics distribution) 115 | RhoTheta LearningSchedule 116 | 117 | rhoPhiT float64 118 | rhoThetaT float64 119 | 120 | wordsInCorpus float64 121 | w, d int 122 | 123 | // Rnd is the random number generator used to generate the initial distributions 124 | // for nTheta (the document over topic distribution), nPhi (the topic over word 125 | // distribution) and nZ (the topic assignments). 126 | Rnd *rand.Rand 127 | 128 | // mutexes for updating global topic statistics 129 | phiMutex sync.Mutex 130 | zMutex sync.Mutex 131 | 132 | // Processes is the degree of parallelisation, or more specifically, the number of 133 | // concurrent go routines to use during fitting. 134 | Processes int 135 | 136 | // nPhi is the topics over words distribution 137 | nPhi []float64 138 | 139 | // nZ is the topic assignments 140 | nZ []float64 141 | } 142 | 143 | // NewLatentDirichletAllocation returns a new LatentDirichletAllocation type initialised 144 | // with default values for k topics. 145 | func NewLatentDirichletAllocation(k int) *LatentDirichletAllocation { 146 | // TODO: 147 | // - Add FitPartial (and FitPartialTransform?) 
methods 148 | // - refactor word counting 149 | // - rename and check rhoTheta_t and rhoPhi_t 150 | // - Check visibilitiy of member variables 151 | // - Try parallelising: 152 | // - minibatches 153 | // - individual docs within minibatches 154 | // - M step 155 | // - other areas 156 | // - investigate whetehr can combine/consolidate fitMiniBatch and burnIn 157 | // - Check whether nPhi could be sparse 158 | // - Add persistence methods 159 | 160 | l := LatentDirichletAllocation{ 161 | Iterations: 1000, 162 | PerplexityTolerance: 1e-2, 163 | PerplexityEvaluationFrequency: 30, 164 | BatchSize: 100, 165 | K: k, 166 | BurnInPasses: 1, 167 | TransformationPasses: 500, 168 | MeanChangeTolerance: 1e-5, 169 | ChangeEvaluationFrequency: 30, 170 | Alpha: 0.1, 171 | Eta: 0.01, 172 | RhoPhi: LearningSchedule{ 173 | S: 10, 174 | Tau: 1000, 175 | Kappa: 0.9, 176 | }, 177 | RhoTheta: LearningSchedule{ 178 | S: 1, 179 | Tau: 10, 180 | Kappa: 0.9, 181 | }, 182 | rhoPhiT: 1, 183 | rhoThetaT: 1, 184 | Rnd: rand.New(rand.NewSource(uint64(time.Now().UnixNano()))), 185 | Processes: runtime.GOMAXPROCS(0), 186 | } 187 | 188 | return &l 189 | } 190 | 191 | // init initialises model for fitting allocating memory for distributions and 192 | // randomising initial values. 193 | func (l *LatentDirichletAllocation) init(m mat.Matrix) { 194 | r, c := m.Dims() 195 | l.w, l.d = r, c 196 | l.nPhi = make([]float64, l.K*r) 197 | l.nZ = make([]float64, l.K) 198 | var v float64 199 | for i := 0; i < r; i++ { 200 | for k := 0; k < l.K; k++ { 201 | v = float64((l.Rnd.Int() % (r * l.K))) / float64(r*l.K) 202 | l.nPhi[i*l.K+k] = v 203 | l.nZ[k] += v 204 | } 205 | } 206 | } 207 | 208 | // Fit fits the model to the specified matrix m. The latent topics, and probability 209 | // distribution of topics over words, are learnt and stored to be used for furture transformations 210 | // and analysis. 211 | func (l *LatentDirichletAllocation) Fit(m mat.Matrix) Transformer { 212 | l.FitTransform(m) 213 | return l 214 | } 215 | 216 | // burnInDoc calculates document statistics as part of fitting and transforming new 217 | // documents 218 | func (l *LatentDirichletAllocation) burnInDoc(j int, iterations int, m mat.Matrix, wc float64, gamma *[]float64, nTheta []float64) { 219 | var rhoTheta float64 220 | var sum, prevSum float64 221 | var thetaInd int 222 | 223 | for counter := 1; counter <= iterations; counter++ { 224 | if l.ChangeEvaluationFrequency > 0 && counter%l.ChangeEvaluationFrequency == 0 && 1 < iterations { 225 | // take a copy of current column j 226 | prevSum = 0 227 | for k := 0; k < l.K; k++ { 228 | prevSum += nTheta[j*l.K+k] 229 | } 230 | } 231 | rhoTheta = l.RhoTheta.Calc(l.rhoThetaT + float64(counter)) 232 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 233 | var gammaSum float64 234 | for k := 0; k < l.K; k++ { 235 | // Eqn. 5. 236 | (*gamma)[k] = ((l.nPhi[i*l.K+k] + l.Eta) * (nTheta[j*l.K+k] + l.Alpha) / (l.nZ[k] + l.Eta*float64(l.w))) 237 | gammaSum += (*gamma)[k] 238 | } 239 | 240 | for k := 0; k < l.K; k++ { 241 | (*gamma)[k] /= gammaSum 242 | } 243 | 244 | for k := 0; k < l.K; k++ { 245 | // Eqn. 9. 
246 | thetaInd = j*l.K + k 247 | nTheta[thetaInd] = ((math.Pow((1.0-rhoTheta), v) * nTheta[thetaInd]) + 248 | ((1 - math.Pow((1.0-rhoTheta), v)) * wc * (*gamma)[k])) 249 | } 250 | }) 251 | if l.ChangeEvaluationFrequency > 0 && counter%l.ChangeEvaluationFrequency == 0 && counter < iterations { 252 | sum = 0 253 | for k := 0; k < l.K; k++ { 254 | sum += nTheta[j*l.K+k] 255 | } 256 | if math.Abs(sum-prevSum)/float64(l.K) < l.MeanChangeTolerance { 257 | break 258 | } 259 | } 260 | } 261 | } 262 | 263 | // fitMiniBatch fits a proportion of the matrix as specified by miniBatch. The 264 | // algorithm is stochastic and so estimates across the minibatch and then applies those 265 | // estimates to the global statistics. 266 | func (l *LatentDirichletAllocation) fitMiniBatch(miniBatch *ldaMiniBatch, wc []float64, nTheta []float64, m mat.Matrix) { 267 | var rhoTheta float64 268 | batchSize := miniBatch.end - miniBatch.start 269 | var phiInd, thetaInd int 270 | 271 | for j := miniBatch.start; j < miniBatch.end; j++ { 272 | l.burnInDoc(j, l.BurnInPasses, m, wc[j], &miniBatch.gamma, nTheta) 273 | 274 | rhoTheta = l.RhoTheta.Calc(l.rhoThetaT + float64(l.BurnInPasses)) 275 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 276 | var gammaSum float64 277 | for k := 0; k < l.K; k++ { 278 | // Eqn. 5. 279 | miniBatch.gamma[k] = ((l.nPhi[i*l.K+k] + l.Eta) * (nTheta[j*l.K+k] + l.Alpha) / (l.nZ[k] + l.Eta*float64(l.w))) 280 | gammaSum += miniBatch.gamma[k] 281 | } 282 | for k := 0; k < l.K; k++ { 283 | miniBatch.gamma[k] /= gammaSum 284 | } 285 | 286 | for k := 0; k < l.K; k++ { 287 | // Eqn. 9. 288 | thetaInd = j*l.K + k 289 | nTheta[thetaInd] = ((math.Pow((1.0-rhoTheta), v) * nTheta[thetaInd]) + 290 | ((1 - math.Pow((1.0-rhoTheta), v)) * wc[j] * miniBatch.gamma[k])) 291 | 292 | // calculate sufficient stats 293 | nv := l.wordsInCorpus * miniBatch.gamma[k] / float64(batchSize) 294 | miniBatch.nPhiHat[i*l.K+k] += nv 295 | miniBatch.nZHat[k] += nv 296 | } 297 | }) 298 | } 299 | rhoPhi := l.RhoPhi.Calc(l.rhoPhiT) 300 | l.rhoPhiT++ 301 | 302 | // Eqn. 7. 303 | l.phiMutex.Lock() 304 | for w := 0; w < l.w; w++ { 305 | for k := 0; k < l.K; k++ { 306 | phiInd = w*l.K + k 307 | l.nPhi[phiInd] = ((1.0 - rhoPhi) * l.nPhi[phiInd]) + (rhoPhi * miniBatch.nPhiHat[phiInd]) 308 | } 309 | } 310 | l.phiMutex.Unlock() 311 | 312 | // Eqn. 8. 313 | l.zMutex.Lock() 314 | for k := 0; k < l.K; k++ { 315 | l.nZ[k] = ((1.0 - rhoPhi) * l.nZ[k]) + (rhoPhi * miniBatch.nZHat[k]) 316 | } 317 | l.zMutex.Unlock() 318 | } 319 | 320 | // normaliseTheta normalises theta to derive the posterior probability estimates for 321 | // documents over topics. All values for each document are divided by the sum of all 322 | // values for the document. 323 | func (l *LatentDirichletAllocation) normaliseTheta(theta []float64, result []float64) []float64 { 324 | //adjustment := l.Alpha 325 | adjustment := 0.0 326 | c := len(theta) / l.K 327 | if result == nil { 328 | result = make([]float64, l.K*c) 329 | } 330 | for j := 0; j < c; j++ { 331 | var sum float64 332 | for k := 0; k < l.K; k++ { 333 | sum += theta[j*l.K+k] + adjustment 334 | } 335 | for k := 0; k < l.K; k++ { 336 | result[j*l.K+k] = (theta[j*l.K+k] + adjustment) / sum 337 | } 338 | } 339 | return result 340 | } 341 | 342 | // normalisePhi normalises phi to derive the posterior probability estimates for 343 | // topics over words. All values for each topic are divided by the sum of all values 344 | // for the topic. 
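//
// As an illustration with made up numbers: if topic k has un-normalised
// weights of 2, 3 and 5 across a three word vocabulary, the sum for that
// topic is 10 and the normalised probabilities become 0.2, 0.3 and 0.5,
// which sum to 1.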
345 | func (l *LatentDirichletAllocation) normalisePhi(phi []float64, result []float64) []float64 { 346 | //adjustment := l.Eta 347 | adjustment := 0.0 348 | if result == nil { 349 | result = make([]float64, l.K*l.w) 350 | } 351 | sum := make([]float64, l.K) 352 | for i := 0; i < l.w; i++ { 353 | for k := 0; k < l.K; k++ { 354 | sum[k] += phi[i*l.K+k] + adjustment 355 | } 356 | } 357 | for i := 0; i < l.w; i++ { 358 | for k := 0; k < l.K; k++ { 359 | result[i*l.K+k] = (phi[i*l.K+k] + adjustment) / sum[k] 360 | } 361 | } 362 | return result 363 | } 364 | 365 | // Perplexity calculates the perplexity of the matrix m against the trained model. 366 | // m is first transformed into corresponding posterior estimates for document over topic 367 | // distributions and then used to calculate the perplexity. 368 | func (l *LatentDirichletAllocation) Perplexity(m mat.Matrix) float64 { 369 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv { 370 | m = t.ToCSC() 371 | } 372 | var wordCount float64 373 | r, c := m.Dims() 374 | 375 | if s, isSparse := m.(sparse.Sparser); isSparse { 376 | s.DoNonZero(func(i, j int, v float64) { 377 | wordCount += v 378 | }) 379 | } else { 380 | for i := 0; i < r; i++ { 381 | for j := 0; j < c; j++ { 382 | wordCount += m.At(i, j) 383 | } 384 | } 385 | } 386 | 387 | theta := l.unNormalisedTransform(m) 388 | return l.perplexity(m, wordCount, l.normaliseTheta(theta, theta), l.normalisePhi(l.nPhi, nil)) 389 | } 390 | 391 | // perplexity returns the perplexity of the matrix against the model. 392 | func (l *LatentDirichletAllocation) perplexity(m mat.Matrix, sum float64, nTheta []float64, nPhi []float64) float64 { 393 | _, c := m.Dims() 394 | var perplexity float64 395 | var ttlLogWordProb float64 396 | 397 | for j := 0; j < c; j++ { 398 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 399 | var dot float64 400 | for k := 0; k < l.K; k++ { 401 | dot += nPhi[i*l.K+k] * nTheta[j*l.K+k] 402 | } 403 | ttlLogWordProb += math.Log2(dot) * v 404 | }) 405 | } 406 | perplexity = math.Exp2(-ttlLogWordProb / sum) 407 | return perplexity 408 | } 409 | 410 | // Components returns the topic over words probability distribution. The returned 411 | // matrix is of dimensions K x W where w was the number of rows in the training matrix 412 | // and each column represents a unique words in the vocabulary and K is the number of 413 | // topics. 414 | func (l *LatentDirichletAllocation) Components() mat.Matrix { 415 | return mat.DenseCopyOf(mat.NewDense(l.w, l.K, l.normalisePhi(l.nPhi, nil)).T()) 416 | } 417 | 418 | // unNormalisedTransform performs an unNormalisedTransform - the output 419 | // needs to be normalised using normaliseTheta before use. 420 | func (l *LatentDirichletAllocation) unNormalisedTransform(m mat.Matrix) []float64 { 421 | _, c := m.Dims() 422 | theta := make([]float64, l.K*c) 423 | for i := range theta { 424 | //data[i] = rnd.Float64() + 0.5 425 | theta[i] = float64((l.Rnd.Int() % (c * l.K))) / float64(c*l.K) 426 | } 427 | gamma := make([]float64, l.K) 428 | 429 | for j := 0; j < c; j++ { 430 | var wc float64 431 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 432 | wc += v 433 | }) 434 | l.burnInDoc(j, l.TransformationPasses, m, wc, &gamma, theta) 435 | } 436 | return theta 437 | } 438 | 439 | // Transform transforms the input matrix into a matrix representing the distribution 440 | // of the documents over topics. 
441 | // THe returned matrix contains the document over topic distributions where each element 442 | // is the probability of the corresponding document being related to the corresponding 443 | // topic. The returned matrix is a Dense matrix of shape K x C where K is the number 444 | // of topics and C is the number of columns in the input matrix (representing the 445 | // documents). 446 | func (l *LatentDirichletAllocation) Transform(m mat.Matrix) (mat.Matrix, error) { 447 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv { 448 | m = t.ToCSC() 449 | } 450 | _, c := m.Dims() 451 | theta := l.unNormalisedTransform(m) 452 | return mat.DenseCopyOf(mat.NewDense(c, l.K, l.normaliseTheta(theta, theta)).T()), nil 453 | } 454 | 455 | // FitTransform is approximately equivalent to calling Fit() followed by Transform() 456 | // on the same matrix. This is a useful shortcut where separate training data is not being 457 | // used to fit the model i.e. the model is fitted on the fly to the test data. 458 | // THe returned matrix contains the document over topic distributions where each element 459 | // is the probability of the corresponding document being related to the corresponding 460 | // topic. The returned matrix is a Dense matrix of shape K x C where K is the number 461 | // of topics and C is the number of columns in the input matrix (representing the 462 | // documents). 463 | func (l *LatentDirichletAllocation) FitTransform(m mat.Matrix) (mat.Matrix, error) { 464 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv { 465 | m = t.ToCSC() 466 | } 467 | 468 | l.init(m) 469 | 470 | _, c := m.Dims() 471 | 472 | nTheta := make([]float64, l.K*c) 473 | for i := 0; i < l.K*c; i++ { 474 | nTheta[i] = float64((l.Rnd.Int() % (c * l.K))) / float64(c*l.K) 475 | } 476 | wc := make([]float64, c) 477 | for j := 0; j < c; j++ { 478 | ColNonZeroElemDo(m, j, func(i, j int, v float64) { 479 | wc[j] += v 480 | }) 481 | l.wordsInCorpus += wc[j] 482 | } 483 | 484 | var phiProb []float64 485 | var thetaProb []float64 486 | 487 | numMiniBatches := int(math.Ceil(float64(c) / float64(l.BatchSize))) 488 | processes := l.Processes 489 | if numMiniBatches < l.Processes { 490 | processes = numMiniBatches 491 | } 492 | miniBatches := make([]*ldaMiniBatch, processes) 493 | for i := range miniBatches { 494 | miniBatches[i] = newLdaMiniBatch(l.K, l.w) 495 | } 496 | 497 | l.rhoPhiT = 1 498 | var perplexity float64 499 | var prevPerplexity float64 500 | 501 | for it := 0; it < l.Iterations; it++ { 502 | l.rhoThetaT++ 503 | 504 | mb := make(chan int) 505 | var wg sync.WaitGroup 506 | 507 | for process := 0; process < processes; process++ { 508 | wg.Add(1) 509 | go func(miniBatch *ldaMiniBatch) { 510 | defer wg.Done() 511 | for j := range mb { 512 | miniBatch.reset() 513 | miniBatch.start = j * l.BatchSize 514 | if j < numMiniBatches-1 { 515 | miniBatch.end = miniBatch.start + l.BatchSize 516 | } else { 517 | miniBatch.end = c 518 | } 519 | l.fitMiniBatch(miniBatch, wc, nTheta, m) 520 | } 521 | }(miniBatches[process]) 522 | } 523 | 524 | for j := 0; j < numMiniBatches; j++ { 525 | mb <- j 526 | } 527 | close(mb) 528 | wg.Wait() 529 | 530 | if l.PerplexityEvaluationFrequency > 0 && (it+1)%l.PerplexityEvaluationFrequency == 0 { 531 | phiProb = l.normalisePhi(l.nPhi, phiProb) 532 | thetaProb = l.normaliseTheta(nTheta, thetaProb) 533 | perplexity = l.perplexity(m, l.wordsInCorpus, thetaProb, phiProb) 534 | 535 | if prevPerplexity != 0 && math.Abs(prevPerplexity-perplexity) < l.PerplexityTolerance { 536 | break 537 | } 
538 | prevPerplexity = perplexity 539 | } 540 | } 541 | return mat.DenseCopyOf(mat.NewDense(c, l.K, l.normaliseTheta(nTheta, thetaProb)).T()), nil 542 | } 543 | --------------------------------------------------------------------------------