├── .gitignore
├── LICENSE
├── README.md
├── project.clj
├── resources
├── core-site.xml
└── mapred-site.xml
├── src
└── lsa4solr
│ ├── cluster.clj
│ ├── clustering_protocol.clj
│ ├── core.clj
│ ├── dendrogram.clj
│ ├── hadoop_utils.clj
│ ├── hierarchical_clustering.clj
│ ├── lucene_utils.clj
│ └── mahout_matrix.clj
└── test
└── lsa4solr
└── core_test.clj
/.gitignore:
--------------------------------------------------------------------------------
1 | *.jar
2 | classes/*
3 | lib/*
4 | *~
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Licensed under the Apache License, Version 2.0 (the "License");
2 | you may not use this file except in compliance with the License.
3 | You may obtain a copy of the License at
4 |
5 | http://www.apache.org/licenses/LICENSE-2.0
6 |
7 | Unless required by applicable law or agreed to in writing, software
8 | distributed under the License is distributed on an "AS IS" BASIS,
9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | See the License for the specific language governing permissions and
11 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | lsa4solr
2 | ========
3 |
4 | A clustering engine for Solr based on Latent Semantic Analysis. The engine
5 | constructs a term frequency matrix which it stores in memory. When requests for
6 | clustering documents are made, the term document matrix is constructed for the
7 | documents in the query result and decomposed using Singular Value Decomposition.
8 | The document vectors are then reconstructed based on a reduced rank parameter to
9 | get rid of noise. These reconstructed document vectors are clustered by comparing
10 | the cosine similarity distance of each individual document to the first n principal
11 | components.
12 |
13 | Decomposition is performed using the DistributedLanczosSolver from Apache Mahout on
14 | a Hadoop cluster. After decomposition of the term-document matrix, the reduced rank
15 | document vectors are clustered using k-means clustering also from Apache Mahout or
16 | a local hierarchical clustering method. The number of clusters must be supplied if
17 | using the kmeans algorithm. A dendrogram is produced as output from the hierarchical
18 | clustering. The dendrogram is suitable as input to the [JavaScript InfoVis Toolkit](http://thejit.org/).
19 |
20 | Development goals include determining the optimal number of clusters, optimizing
21 | the reduced rank, etc.
22 |
23 | Building
24 | --------
25 |
26 | lsa4solr depends on the 3.1 development version of Solr and the
27 | 1.2 development version of Clojure. In order to build lsa4solr,
28 | you will need to build the appropriate versions of Solr and Clojure,
29 | generate the maven artifacts, and install them in your local
30 | maven repository. Then
31 |
32 | lein deps
33 | lein jar
34 |
35 | Installing
36 | ----------
37 |
38 | Due to some Clojure classloader requirements, you will need to install the
39 | lsa4solr jar and its dependencies into the Solr webapp/WEB-INF/lib directory
40 | rather than using the solrconfig.xml file to configure the path to the
41 | lsa4solr dependencies. The dependencies that need to be in the System
42 | classloader include:
43 |
44 | arpack-combo-0.1.jar
45 | clojure-1.2.0.jar
46 | clojure-contrib-1.2.0-master-20100122.191106-1.jar
47 | apache-solr-clustering-3.1-dev.jar
48 | parallelcolt-0.7.2.jar
49 | lsa4solr.jar
50 | netlib-java-0.9.1.jar
51 | hadoop-core-0.20.2.jar
52 | mahout-collections-0.4-SNAPSHOT.jar
53 | mahout-core-0.4-SNAPSHOT.jar
54 | mahout-math-0.4-SNAPSHOT.jar
55 | commons-cli-2.0-mahout.jar
56 | uncommons-maths-1.2.jar
57 |
58 | Configuring Solr
59 | ----------------
60 |
61 | Add the following to your solrconfig.xml
62 |
63 |
67 |
68 | lsa4solr.cluster.LSAClusteringEngine
69 | lsa4solr
70 | Summary
71 | Summary
72 |
73 |
74 |
77 |
78 | true
79 | lsa4solr
80 | true
81 |
82 |
83 | lsa4solr
84 |
85 |
86 |
87 | Configure the narrative-field parameter to be the text field of the
88 | schema you are working with and the id-field parameter to be the unique
89 | field that will be returned.
90 |
91 | You will need to tweak the Solr filters on the narrative field in order
92 | to get the best results. I have been using the following set of filters
93 | to get decent results:
94 |
95 |
96 |
97 |
98 |
99 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 | Hadoop Setup
113 | -----------------
114 |
115 | In order to use lsa4solr with Hadoop, make sure that the mahout-math-0.4.jar is
116 | in the Hadoop lib directory. This is a dependency of the mahout-core-0.4.jar which
117 | contains the distributed job. Put the core-site.xml and mapred-site.xml files from
118 | the resources directory into Solr's webapp/WEB-INF/classes directory and configure
119 | them to point to your Hadoop setup.
120 |
121 |
122 | Using
123 | -----
124 |
125 | Start Solr with the -Dsolr.clustering.enabled=true option. Once the server
126 | has started, cluster your documents using a URL like
127 |
128 | http://localhost:8983/solr/lsa4solr?nclusters=2&q=Summary:.*&rows=100&k=10&algorithm=kmeans
129 |
130 | where
131 |
132 | algorithm - the algorithm to use for clustering (hierarchical or kmeans)
133 | k - the rank of the reduced SVD matrix
134 | nclusters - the number of clusters to group the documents into (kmeans only)
135 | q - the standard Solr query parameter
136 | rows - the standard Solr rows parameter
137 |
138 | The cluster information will be at the bottom of the response.
139 |
140 | Testing
141 | -------
142 |
143 | On the Downloads page, there is a Usenet dataset which can be found [here](http://people.csail.mit.edu/jrennie/20Newsgroups/)
144 | Import some documents from two or more of the newsgroups into your Solr instance and access the lsa4solr URL.
145 |
146 | You can also use the cluster algorithm directly from the REPL
147 |
148 | lein swank
149 |
150 | user> (in-ns 'lsa4solr.cluster)
151 | #
152 | lsa4solr.cluster> (def reader (org.apache.lucene.index.IndexReader/open (org.apache.lucene.store.FSDirectory/open (new java.io.File "/path/to/solr/data/index"))))
153 | #'lsa4solr.cluster/reader
154 | lsa4solr.cluster> (def initial-terms (init-term-freq-doc reader "Summary"))
155 | #'lsa4solr.cluster/initial-terms
156 | lsa4solr.cluster> (def searcher (new org.apache.lucene.search.IndexSearcher reader))
157 | #'lsa4solr.cluster/searcher
158 | lsa4solr.cluster> (def queryparser
159 | (new org.apache.lucene.queryParser.QueryParser
160 | (org.apache.lucene.util.Version/LUCENE_30)
161 | "Summary"
162 | (new org.apache.lucene.analysis.SimpleAnalyzer)))
163 | #'lsa4solr.cluster/queryparser
164 | lsa4solr.cluster> (def result (. searcher search (. queryparser parse "Summary:br*") (. reader maxDoc)))
165 | #'lsa4solr.cluster/result
166 | lsa4solr.cluster> (def docids (map #(. %1 doc) (. result scoreDocs)))
167 | #'lsa4solr.cluster/docids
168 | lsa4solr.cluster> (def docslice (new org.apache.solr.search.DocSlice 0 (count docids) (int-array docids) (float-array (repeat (count docids) 1)) (count docids) 1))
169 | #'lsa4solr.cluster/docslice
170 | lsa4solr.cluster> (def kmeans-clst (cluster-kmeans-docs reader initial-terms docids 50 2 "Summary" "id"))
171 | lsa4solr.cluster> (def hierarchical-clst (cluster-hierarchical-docs reader initial-terms docids 50 "Summary" "id"))
172 |
--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
;; Leiningen project definition for lsa4solr.
(defproject lsa4solr "1.0.0-SNAPSHOT"
  :description "Clustering component for Solr based on Latent Semantic Analysis"
  ;; AOT-compile every namespace (the gen-class engine must exist as bytecode).
  :namespaces :all
  :repositories {"apache" "https://repository.apache.org/"}
  :dependencies [[org.clojure/clojure "1.2.0-master-SNAPSHOT"]
                 [org.clojure/clojure-contrib "1.2.0-master-SNAPSHOT"]
                 ;; Lucene is supplied by Solr; exclude Mahout's copies to avoid clashes.
                 [org.apache.mahout/mahout-core "0.4-SNAPSHOT"
                  :exclusions [org.apache.lucene/lucene-core
                               org.apache.lucene/lucene-analyzers]]
                 [org.apache.mahout/mahout-math "0.4-SNAPSHOT"]
                 [org.slf4j/slf4j-log4j12 "1.5.11"]]
  :dev-dependencies [[leiningen/lein-swank "1.1.0"]
                     [org.apache.solr/solr-core "3.1-SNAPSHOT" :exclusions [org.apache.lucene/lucene-snowball]]
                     [org.apache.solr/solr-clustering "3.1-SNAPSHOT" :exclusions [org.carrot2/carrot2-mini]]])
--------------------------------------------------------------------------------
/resources/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | fs.default.name
10 | ${fs.default.name}
11 |
12 |
13 |
14 | hadoop.tmp.dir
15 | ${hadoop.tmp.dir}
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/resources/mapred-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | mapred.job.tracker
10 | ${mapred.job.tracker}
11 |
12 |
13 | mapred.map.tasks
14 | 32
15 |
16 |
17 | mapred.reduce.tasks
18 | 4
19 |
20 |
21 | mapred.local.dir
22 | ${mapred.local.dir}
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/lsa4solr/cluster.clj:
--------------------------------------------------------------------------------
1 | (ns lsa4solr.cluster
2 | (:use [clojure.contrib.seq-utils :only [indexed]]
3 | [lsa4solr core clustering-protocol]))
4 |
;; Generates lsa4solr.cluster.LSAClusteringEngine, a Solr SearchClusteringEngine
;; whose per-instance configuration lives in the ref created by
;; -initialize-state (exposed as the `state` field). The protected superclass
;; `init` is re-exposed as `superinit` so -init below can delegate to it.
(gen-class
 :name lsa4solr.cluster/LSAClusteringEngine
 :extends org.apache.solr.handler.clustering.SearchClusteringEngine
 :exposes-methods {init superinit}
 :init initialize-state
 :state state)
11 |
(defn -initialize-state
  "gen-class constructor hook: no superclass constructor arguments, and an
  empty ref as the mutable engine state."
  []
  (let [state (ref {})]
    [[] state]))
14 |
(defn init-term-freq-doc
  "Walks the reader's TermEnum and builds a map from term keyword to
  {:df document-frequency, :idf inverse-document-frequency, :idx 1-based
  column index} for every term of the given field. Enumeration stops at the
  first term whose field differs from `field` (Lucene's TermEnum is sorted
  by field, so this captures a contiguous run -- assumes the enum starts at
  or before this field's first term; confirm for multi-field indexes)."
  [reader field]
  (let [terms (. reader terms)
        numdocs (.maxDoc reader)
        ;; Side-effecting counter: each call increments the ref and returns
        ;; the new value, producing 1-based column indexes.
        counter (let [count (ref 0)] #(dosync (alter count inc)))]
    (apply merge
           (take-while
            ;; stop at the first nil produced below (field changed or enum exhausted)
            #(= (nil? %1) false)
            (repeatedly
             (fn []
               (if (and (. terms next) (= field (.field (. terms term))))
                 (let [text (. (. terms term) text)
                       df (. terms docFreq)]
                   {(keyword text)
                    {
                     :df df
                     :idf (java.lang.Math/log (/ numdocs df))
                     :idx (counter)
                     }
                    })
                 nil)))))))
35 |
(defn -init
  "Engine initialization hook. Delegates to the Solr superclass init, then
  caches the index reader, the configured field names, and the full
  term->{:df :idf :idx} dictionary in this instance's state ref.
  Returns the engine name (presumably the superclass contract -- confirm
  against SearchClusteringEngine.init)."
  [this
   config
   solr-core]
  (let [super-result (.superinit this config solr-core)
        ;; getSearcher(true,true,nil) returns a reference holder; .get unwraps it
        reader (.getReader (.get (.getSearcher solr-core true true nil)))
        narrative-field (.get config "narrative-field")
        id-field (.get config "id-field")
        name (.get config "name")]
    (dosync
     (alter (.state this) assoc
            :reader reader
            :name name
            :narrative-field narrative-field
            :id-field id-field
            :terms (init-term-freq-doc reader narrative-field))
     ;; dosync returns its last expression: the engine name
     name)))
52 |
(defn cluster-dispatch
  "Runs the selected clustering algorithm over the documents in doc-list.
  The 7-argument arity performs k-means into num-clusters groups; the
  6-argument arity performs hierarchical clustering. Returns only the
  :clusters entry of the underlying result map."
  ;; K-Means
  ([reader field id-field terms doc-list k num-clusters]
     (let [docs (iterator-seq (.iterator doc-list))
           result (cluster-kmeans-docs reader terms docs k num-clusters field id-field)]
       (:clusters result)))
  ;; Hierarchical
  ([reader field id-field terms doc-list k]
     (let [docs (iterator-seq (.iterator doc-list))
           result (cluster-hierarchical-docs reader terms docs k field id-field)]
       (:clusters result))))
65 |
66 |
(defn -cluster
  "Solr clustering entry point. Reads the `algorithm` request parameter
  (\"hierarchical\" or \"kmeans\") plus `k` (reduced SVD rank) and, for
  kmeans, `nclusters`, then delegates to cluster-dispatch. Returns nil for
  an unrecognized algorithm (preserving the original cond fall-through).
  The state ref is dereferenced exactly once so all fields come from a
  single consistent snapshot (the original deref'd it four times per
  branch)."
  [this
   query
   doc-list
   solr-request]
  (let [params (.getParams solr-request)
        state @(.state this)
        algorithm (.get params "algorithm")
        ;; Built lazily so a missing "k" parameter still only throws when an
        ;; actual clustering branch is taken, as before.
        base-args (fn []
                    [(:reader state)
                     (:narrative-field state)
                     (:id-field state)
                     (:terms state)
                     doc-list
                     (Integer. (.get params "k"))])]
    (cond
     (= algorithm "hierarchical") (apply cluster-dispatch (base-args))
     (= algorithm "kmeans") (apply cluster-dispatch
                                   (conj (base-args)
                                         (Integer. (.get params "nclusters")))))))
86 |
87 |
--------------------------------------------------------------------------------
/src/lsa4solr/clustering_protocol.clj:
--------------------------------------------------------------------------------
1 | (ns lsa4solr.clustering-protocol
2 | (:use [lsa4solr core hadoop-utils lucene-utils mahout-matrix hierarchical-clustering dendrogram])
3 | (:require [clojure [zip :as z]])
4 | (:require [clojure.contrib
5 | [seq-utils :as seq-utils]
6 | [zip-filter :as zf]])
7 | (:import (org.apache.hadoop.conf Configuration)
8 | (org.apache.hadoop.fs FileSystem Path)
9 | (org.apache.hadoop.io Text SequenceFile$Reader)
10 | (org.apache.hadoop.fs.permission FsPermission)
11 | (org.apache.mahout.clustering.kmeans RandomSeedGenerator KMeansDriver)))
12 |
(defn kmeans-cluster
  "Runs Mahout's distributed k-means over the reduced-rank document vectors.
  Projects the documents via V*S, writes the projected matrix into a fresh
  nanoTime-named HDFS directory, seeds num-clusters random centroids, and
  runs KMeansDriver with cosine distance. Returns a flat seq of cluster
  labels (strings), one per point, read back from the clusterout/points
  output files.
  NOTE(review): tkey/tval are shared mutable Writables reused across the
  lazy valseq -- correctness relies on .toString being taken before the
  next .next call. The HDFS working directories are never cleaned up here;
  confirm lifecycle is handled elsewhere."
  [num-clusters max-iterations V S]
  (let [hadoop-conf (Configuration.)
        fs (FileSystem/get hadoop-conf)
        base-path (Path. (str "/lsa4solr/kmeans-clustering/" (java.lang.System/nanoTime)))
        mkdirs-result (FileSystem/mkdirs fs
                                         base-path
                                         (FsPermission/getDefault))
        ;; project documents into the reduced LSA space
        reduced-fm (mmult V S)
        reduced-m-path (str (.toString base-path) "/reducedm")
        writer (write-matrix hadoop-conf reduced-fm reduced-m-path)
        initial-centroids (RandomSeedGenerator/buildRandom reduced-m-path
                                                           (str (.toString base-path) "/centroids")
                                                           num-clusters)
        cluster-output-path (str (.toString base-path) "/clusterout")
        job (KMeansDriver/runJob
             reduced-m-path
             (.toString initial-centroids)
             cluster-output-path
             "org.apache.mahout.common.distance.CosineDistanceMeasure"
             0.00000001
             max-iterations
             num-clusters)
        tkey (Text.)
        tval (Text.)
        ;; read every part file of clustered points and collect the values
        groups (clojure.contrib.seq-utils/flatten
                (map (fn [file-status] (let [path (.getPath file-status)
                                             seq-reader (SequenceFile$Reader. fs path hadoop-conf)
                                             valseq (take-while (fn [v] (.next seq-reader tkey tval))
                                                                (repeat [tkey tval]))]
                                         (map #(.toString (second %)) valseq)))
                     (.globStatus fs (Path. (str cluster-output-path "/points/part*")))))]
    groups))
46 |
(defn emit-leaf-node-fn
  "Returns a function rendering a dendrogram leaf node as a JIT-style map.
  The node's :id indexes into doc-seq; the document's id-field value fills
  both the \"name\" and \"id\" entries. The document id is looked up once
  and reused (previously it triggered two Lucene document fetches per
  leaf)."
  [reader doc-seq id-field]
  (fn [node]
    (let [docid (get-docid reader id-field (nth doc-seq (:id node)))]
      (hash-map "name" docid
                "id" docid
                "data" {}
                "children" []))))
54 |
(defn emit-branch-node-fn
  "Returns a stateful emitter for dendrogram branch nodes. Each emitter
  carries its own id sequence starting at 1; a node's :count metadata
  supplies both its display name and its data payload."
  []
  (let [next-id (ref 0)]
    (fn [node children-arr]
      (let [cnt (:count (meta node))]
        {"name" cnt
         "id" (dosync (alter next-id inc))
         "data" {"count" cnt}
         "children" children-arr}))))
63 |
(defn hierarchical-clustering
  "Agglomeratively clusters the rows of mat and renders the final
  dendrogram as a nested map suitable for JSON output."
  [reader id-field doc-seq mat]
  (let [final-state (last (hclust mat))
        dend (first final-state)]
    (dendrogram-to-map dend
                       (emit-branch-node-fn)
                       (emit-leaf-node-fn reader doc-seq id-field))))
68 |
(defn get-mapper-common
  "Builds a Lucene TermVectorMapper that, for each term in the visited
  document's term vector, calls (update-ref vec-ref column-index tf*idf).
  The column index comes from the term's :idx entry (1-based, hence the -1).
  NOTE(review): a term absent from `terms` yields a nil term-entry and
  would NPE in the arithmetic -- assumes the dictionary from
  init-term-freq-doc covers every term the mapper will see; confirm."
  [terms vec-ref ndocs update-ref]
  (proxy [org.apache.lucene.index.TermVectorMapper]
      []
    (map [term frequency offsets positions]
      (let [term-entry ((keyword term) terms)]
        (dosync
         (update-ref vec-ref (- (:idx term-entry) 1) (* frequency (:idf term-entry))))))
    ;; offsets/positions are not needed, so no expectations are recorded
    (setExpectations [field numTerms storeOffsets storePositions]
      nil)))
78 |
79 |
(defn get-mapper
  "TermVectorMapper whose update strategy writes each weight directly into
  the Mahout vector held by the ref."
  [terms vec-ref ndocs]
  (letfn [(write-weight [v-ref idx weight]
            (set-value @v-ref idx weight))]
    (get-mapper-common terms vec-ref ndocs write-weight)))
85 |
(defn init-frequency-vector
  "Allocates an n-dimensional sparse frequency vector wrapped in a ref."
  [n]
  (let [v (create-vector n)]
    (ref v)))
89 |
(defn get-frequency-matrix
  "Extracts a tf-idf frequency vector for every hit and assembles them into
  a Hadoop-backed distributed row matrix (one row per document)."
  [reader field terms hits]
  (let [vectors (extract-frequency-vectors
                 reader
                 init-frequency-vector
                 (fn [terms vec-ref ndocs]
                   (get-mapper terms vec-ref ndocs))
                 field
                 terms
                 hits)]
    (distributed-matrix vectors)))
102 |
(defn decompose-term-doc-matrix
  "Builds the document-term matrix for doc-seq, transposes it to term-doc
  orientation, and factors it with a rank-k distributed SVD. Returns the
  factors as the list (U S V)."
  [reader narrative-field terms doc-seq k]
  (let [doc-term (get-frequency-matrix reader
                                       narrative-field
                                       terms
                                       doc-seq)
        term-doc (transpose doc-term)
        {:keys [U S V]} (decompose-svd term-doc k)]
    (list U S V)))
114 |
(defn cluster-kmeans-docs
  "SVD-reduces the term-document matrix for doc-seq, k-means clusters the
  documents into num-clusters groups, and returns a map of
  {:clusters label->list-of-doc-ids, :U :S :V} with the SVD factors.
  NOTE(review): k (the SVD rank) is also passed to kmeans-cluster in the
  max-iterations position -- confirm that reuse is intentional."
  [reader
   terms
   doc-seq
   k
   num-clusters
   narrative-field
   id-field]
  (let [[U S V] (decompose-term-doc-matrix reader narrative-field terms doc-seq k)
        groups (kmeans-cluster num-clusters k V S)
        ;; pair each document position with its cluster label, then group
        ;; document ids by label
        clusters (apply merge-with #(into %1 %2)
                        (map #(hash-map (keyword (second %))
                                        (list (get-docid reader id-field (nth doc-seq (first %1)))))
                             (seq-utils/indexed groups)))]
    {:clusters clusters
     :U U
     :S S
     :V V}))
133 |
(defn cluster-hierarchical-docs
  "SVD-reduces the term-document matrix for doc-seq and hierarchically
  clusters the documents in the reduced space (rows of (S * V')'),
  returning {:clusters dendrogram-map, :U :S :V}."
  [reader
   terms
   doc-seq
   k
   narrative-field
   id-field]
  (let [[U S V] (decompose-term-doc-matrix reader narrative-field terms doc-seq k)
        ;; reduced-space document vectors, one row per document
        SVt (transpose (mmult S (transpose V)))
        clusters (hierarchical-clustering reader id-field doc-seq SVt)]
    {:clusters clusters
     :U U
     :S S
     :V V}))
--------------------------------------------------------------------------------
/src/lsa4solr/core.clj:
--------------------------------------------------------------------------------
1 | (ns lsa4solr.core)
2 |
(defn to-array-of
  "Copies coll into a new typed Java array whose component type is `class`.
  Elements are placed in encounter order; an empty coll yields an empty
  array. Uses doseq for the side-effecting fill instead of forcing a lazy
  map with dorun."
  [class coll]
  (let [array (make-array class (count coll))]
    (doseq [[index item] (map-indexed vector coll)]
      (aset array index item))
    array))
9 |
--------------------------------------------------------------------------------
/src/lsa4solr/dendrogram.clj:
--------------------------------------------------------------------------------
1 | (ns lsa4solr.dendrogram
2 | (:require [clojure [zip :as z]])
3 | (:require [clojure.contrib
4 | [combinatorics :as combine]
5 | [zip-filter :as zf]
6 | [seq-utils :as seq-utils]]))
7 |
(defn move-right
  "Returns the location n sibling steps to the right of loc."
  [loc n]
  (loop [cur loc
         steps n]
    (if (pos? steps)
      (recur (z/right cur) (dec steps))
      cur)))
12 |
(defn remove-nodes
  "Removes multiple children of loc (selected by 0-based sibling index) in a
  single pass. Indexes are sorted, then each is shifted down by the number
  of earlier siblings already removed, so every removal targets the correct
  node in the shrinking child list. Returns a fresh zipper over the
  modified tree."
  [loc & n]
  (let [sorted-indexes (sort n)
        increments (range 0 (count sorted-indexes))
        ;; after i earlier removals, the i-th sorted index must shift left by i
        incremental-indexes (map #(- %1 %2) sorted-indexes increments)]
    (reduce #(z/seq-zip (z/root (z/remove (move-right (z/down %1) %2))))
            loc
            incremental-indexes)))
22 |
(defn merge-nodes
  "Merges two children of root (picked by the index pair node-indexes) by
  calling (new-node n1 n2) to build the replacement, which is inserted as
  the first child. Every surviving child is re-wrapped in a singleton list
  (metadata preserved on both wrapper and inner node) so all existing
  clusters move one level down and the tree remains a valid dendrogram."
  [root node-indexes new-node]
  (let [n1 (z/node (move-right (z/down root) (first node-indexes)))
        n2 (z/node (move-right (z/down root) (second node-indexes)))
        ;; remove the two merged nodes, then push each survivor down a level
        new-tree (z/seq-zip
                  (map #(with-meta
                          (list (with-meta (z/node %) (meta (z/node %))))
                          (meta (z/node %)))
                       (zf/children (apply remove-nodes root node-indexes))))]
    (z/seq-zip (z/root (z/insert-child
                        new-tree
                        (new-node n1 n2))))))
37 |
(defn bfs-depth-seq
  "Walks a tree breadth-first and returns a lazy sequence of the nodes at
  exactly the given height (root is height 0). branch?/children follow the
  clojure.core/tree-seq convention; non-branch nodes shallower than the
  target height are discarded. (The docstring was previously placed after
  the parameter vector, making it a discarded expression rather than
  attached var metadata.)"
  [branch? children root height]
  (let [walk (fn walk [queue]
               (when-let [node (peek queue)]
                 (lazy-seq
                  (cond
                   ;; not deep enough yet: replace the node with its children
                   (< (:depth node) height) (walk
                                             (into (pop queue)
                                                   (when (branch? (:node node))
                                                     (map #(hash-map :node % :depth (inc (:depth node)))
                                                          (children (:node node))))))
                   ;; at the target depth: emit and keep walking
                   :default (cons (:node node)
                                  (walk (pop queue)))))))]
    (walk (conj clojure.lang.PersistentQueue/EMPTY (hash-map :node root :depth 0)))))
52 |
53 |
(defn dendrogram-to-map
  "Recursively converts a dendrogram zipper location into nested maps.
  Branch locations are rendered by emit-branch-node (given the node and its
  converted children); leaves by emit-leaf-node."
  [node emit-branch-node emit-leaf-node]
  (if (z/branch? node)
    (emit-branch-node (z/node node)
                      (map (fn [child]
                             (dendrogram-to-map child emit-branch-node emit-leaf-node))
                           (zf/children node)))
    (emit-leaf-node (z/node node))))
60 |
(defn cut
  "Cuts the dendrogram at `depth` and, for each subtree rooted there,
  returns the flattened :id values of its leaf descendants."
  [root depth]
  (for [subtree (bfs-depth-seq z/branch? zf/children root depth)]
    (map #(:id (z/node %))
         (remove z/branch? (zf/descendants subtree)))))
69 |
(defn dendrogram
  "Wraps a sequence of leaf elements in a zipper, forming the initial flat
  dendrogram."
  [elements]
  (z/seq-zip elements))
--------------------------------------------------------------------------------
/src/lsa4solr/hadoop_utils.clj:
--------------------------------------------------------------------------------
1 | (ns lsa4solr.hadoop-utils
2 | (:import (org.apache.mahout.math VectorWritable)
3 | (org.apache.hadoop.io IntWritable)
4 | (org.apache.hadoop.fs FileSystem Path)
5 | (org.apache.hadoop.io SequenceFile$Writer)))
6 |
(defn write-vectors
  "Appends every row of matrix m to the given SequenceFile writer as an
  (IntWritable row-index, VectorWritable row) pair. doall forces the lazy
  pairing so the appends actually execute; the realized seq of .append
  results is returned.
  NOTE(review): (.vector %2) extracts the row from whatever m's iterator
  yields -- presumably a Mahout MatrixSlice; confirm against the Matrix
  iterator contract."
  [writer
   m]
  (doall (map #(.append writer %1 (VectorWritable. (.vector %2)))
              (map #(IntWritable. %)
                   (range 0 (.numRows m)))
              (iterator-seq (.iterator m)))))
13 |
(defn write-matrix
  "Writes matrix m to path-string as a Hadoop SequenceFile of
  IntWritable -> VectorWritable rows, then closes the writer. Returns the
  (already-closed) SequenceFile$Writer, per doto semantics."
  [hadoop-conf m path-string]
  (let [fs (FileSystem/get hadoop-conf)
        path (Path. path-string)]
    (doto (SequenceFile$Writer. fs
                                hadoop-conf
                                path
                                IntWritable
                                VectorWritable)
      (write-vectors m)
      (.close))))
--------------------------------------------------------------------------------
/src/lsa4solr/hierarchical_clustering.clj:
--------------------------------------------------------------------------------
1 | (ns lsa4solr.hierarchical-clustering
2 | (:use [lsa4solr mahout-matrix dendrogram])
3 | (:require [clojure [zip :as z]])
4 | (:require [clojure.contrib
5 | [combinatorics :as combine]
6 | [zip-filter :as zf]
7 | [seq-utils :as seq-utils]])
8 | (:import (org.apache.mahout.math SparseMatrix
9 | RandomAccessSparseVector
10 | VectorWritable
11 | Matrix
12 | DenseMatrix)
13 | (org.apache.mahout.math.hadoop DistributedRowMatrix)))
14 |
(defn get-count
  "Number of leaves under this cluster node, read from its metadata."
  [cluster]
  (-> cluster meta :count))
18 |
(defn get-centroid
  "Centroid vector of this cluster node, read from its metadata."
  [cluster]
  (-> cluster meta :centroid))
22 |
(defn merge-centroids
  "Combines two cluster centroids, weighting each by the reciprocal of its
  own leaf count: c1/n1 + c2/n2.
  NOTE(review): this is not the centroid of the merged cluster (that would
  be (n1*c1 + n2*c2)/(n1+n2)); confirm whether the reciprocal weighting is
  intentional."
  [c1 c2]
  (add (mult (get-centroid c1) (double (/ 1 (get-count c1))))
       (mult (get-centroid c2) (double (/ 1 (get-count c2))))))
27 |
(defn get-vecs
  "Rows of mat selected by the given indexes (lazy)."
  [mat idxs]
  (for [i idxs]
    (.getRow mat i)))
31 |
(defn average-dispersion
  "Mean distance (under dist) from centroid to each row of mat named in
  group."
  [mat group centroid dist]
  (let [distances (map (partial dist centroid) (get-vecs mat group))]
    (/ (reduce + distances)
       (count group))))
36 |
(defn average-intercluster-dispersion
  "Mean pairwise distance (under dist) between the centroids of the given
  clusters of row indexes."
  [mat clusters dist]
  (let [centroids (for [cluster clusters]
                    (apply centroid (get-vecs mat cluster)))
        pairs (combine/combinations centroids 2)
        total (reduce + (map (fn [[a b]] (dist a b)) pairs))]
    (/ total (count pairs))))
43 |
(defn hclust
  "Hierarchical clustering of the rows of mat. Returns a dendrogram
  and a merge sequence. The dendrogram is a tree with doc ids as
  leaf nodes and meta data in the branch nodes indicating the number
  of children and the centroid of the branch.
  Each state in the returned lazy seq is (dendrogram merge-sequence);
  callers take (last (hclust mat)) for the final tree.
  NOTE(review): the initial un-merged state counts toward the
  (- numRows 1) states taken, so only numRows-2 merges are realized --
  confirm the intended off-by-one. Also, memoizing euclidean-distance
  keys the cache on Mahout vector pairs; its effectiveness depends on
  their equals/hashCode, and the cache lives for the whole call."
  [mat]
  (let [dend (dendrogram (map #(with-meta {:id %}
                                 (hash-map
                                  :centroid (.getRow mat %)
                                  :count 1))
                              (range 0 (.numRows mat))))
        get-distance (memoize euclidean-distance)]
    (take
     (- (.numRows mat) 1)
     (iterate (fn [[dend merge-sequence]]
                (let [clusters (z/children dend)
                      ;; distance between every pair of current top-level clusters
                      dists (map #(list % (get-distance (get-centroid (nth clusters (first %)))
                                                        (get-centroid (nth clusters (second %)))))
                                 (combine/combinations (range 0 (count clusters)) 2))
                      ;; pair of indexes with the smallest centroid distance
                      closest-pair (first (reduce #(if (< (second %1) (second %2)) %1 %2)
                                                  (first dists)
                                                  (rest dists)))]
                  (list (merge-nodes dend
                                     closest-pair
                                     (fn [n1 n2]
                                       (with-meta (list (with-meta n1 (meta n1)) (with-meta n2 (meta n2)))
                                         (hash-map :count (apply + (map get-count [n1 n2]))
                                                   :centroid (merge-centroids n1 n2)))))
                        (conj merge-sequence closest-pair))))
              (list dend '())))))
--------------------------------------------------------------------------------
/src/lsa4solr/lucene_utils.clj:
--------------------------------------------------------------------------------
1 | (ns lsa4solr.lucene-utils)
2 |
(defn get-docid
  "Fetches Lucene document `id` from the reader and returns the string
  value of its id-field."
  [reader id-field id]
  (let [doc (.document reader id)
        field (.getField doc id-field)]
    (.stringValue field)))
5 |
(defn extract-frequency-vectors
  "For each Lucene doc id in hits, feeds the document's term-frequency
  vector for `field` through a freshly built mapper and yields the
  dereferenced accumulator vector (lazily, one per hit)."
  [reader init-frequency-vector get-mapper field terms hits]
  (for [doc-id hits]
    (let [acc (init-frequency-vector (count terms))
          mapper (get-mapper terms acc (count hits))]
      (.getTermFreqVector reader (int doc-id) field mapper)
      @acc)))
13 |
14 |
--------------------------------------------------------------------------------
/src/lsa4solr/mahout_matrix.clj:
--------------------------------------------------------------------------------
1 | (ns lsa4solr.mahout-matrix
2 | (:require [clojure.contrib.generic [math-functions :as math]])
3 | (:import (org.apache.mahout.math SparseMatrix RandomAccessSparseVector VectorWritable Matrix DenseMatrix)
4 | (org.apache.mahout.math.hadoop DistributedRowMatrix)
5 | (org.apache.mahout.math.hadoop.decomposer DistributedLanczosSolver)
6 | (org.apache.mahout.math.function UnaryFunction TimesFunction)
7 | (org.apache.mahout.common.distance EuclideanDistanceMeasure)
8 | (org.apache.hadoop.fs Path FileSystem)
9 | (org.apache.hadoop.fs.permission FsPermission)
10 | (org.apache.hadoop.conf Configuration)
11 | (org.apache.hadoop.mapred JobConf)
12 | (org.apache.hadoop.io IntWritable SequenceFile$Writer)))
13 |
(defn create-vector
  "Builds a Mahout RandomAccessSparseVector. Given a collection, the vector
  has one slot per element and is populated in order; given an integer n,
  an empty n-dimensional vector is returned. Any other input now fails
  fast with IllegalArgumentException instead of silently returning nil
  (the original cond had no :else branch). The redundant no-op doto on the
  integer branch is also gone."
  [data]
  (cond
   (coll? data) (let [v (RandomAccessSparseVector. (count data))]
                  (doseq [[i x] (map-indexed vector data)]
                    (.setQuick v i x))
                  v)
   (integer? data) (RandomAccessSparseVector. data)
   :else (throw (IllegalArgumentException.
                 (str "create-vector expects a collection or an integer, got: "
                      (class data))))))
23 |
(defn print-vector
  "Despite the name, does not print: returns a lazy seq of the element
  values obtained by iterating all cells of Mahout vector v."
  [v]
  (map #(.get %) (iterator-seq (.iterateAll v))))
27 |
;; Thin wrappers over Mahout vector/matrix arithmetic interop.

(defn minus
  "Element-wise difference v1 - v2 (Mahout `minus`)."
  [v1 v2]
  (.minus v1 v2))

(defn mult
  "Multiplies v1 by scalar s (Mahout `times`)."
  [v1 s]
  (.times v1 s))

(defn divide
  "Divides v1 by scalar s (Mahout `divide`)."
  [v1 s]
  (.divide v1 s))

(defn add
  "Element-wise sum v1 + v2 (Mahout `plus`)."
  [v1 v2]
  (.plus v1 v2))
43 |
(defn centroid
  "Mean of the given vectors: their sum divided by their count.
  (Behaviorally identical to `mean` defined later in this file; kept as a
  separate name for existing callers.)"
  [& vecs]
  (divide (reduce add vecs) (count vecs)))
47 |
;; Shared Mahout distance-measure instance used by euclidean-distance.
(def euclidean-distance-measure (EuclideanDistanceMeasure.))

(defn euclidean-distance
  "Euclidean distance between vectors v1 and v2."
  [v1 v2]
  (.distance euclidean-distance-measure v1 v2))
53 |
(defn mean
  "Arithmetic mean of the given vectors."
  [& vecs]
  (let [total (reduce add vecs)
        n (count vecs)]
    (divide total n)))
58 |
(defn variance
  "Population variance of the vectors: the mean squared euclidean distance
  from their sample mean."
  [& vecs]
  (let [sample-mean (apply mean vecs)
        squared-dists (for [v vecs]
                        (math/pow (euclidean-distance v sample-mean) 2))]
    (/ (reduce + squared-dists)
       (count vecs))))
64 |
(defn set-value
  "Destructively sets vector[index] = value via Mahout's setQuick."
  ([#^RandomAccessSparseVector vector index value] (.setQuick vector index value)))
67 |
(defn distributed-matrix
  "Materializes a seq of Mahout vectors as a Hadoop-backed
  DistributedRowMatrix. Writes the rows (IntWritable index ->
  VectorWritable row) into a fresh SequenceFile under a nanoTime-named
  HDFS directory, then configures the matrix over that file.
  NOTE(review): the HDFS directory is never cleaned up here; confirm its
  lifecycle is managed elsewhere."
  [vec-iterator]
  (let [hadoop-conf (Configuration.)
        fs (FileSystem/get hadoop-conf)
        base-path (Path. (str "/lsa4solr/matrix/" (java.lang.System/nanoTime)))
        mkdirs-result (FileSystem/mkdirs fs
                                         base-path
                                         (FsPermission/getDefault))
        m-path (str (.toString base-path) "/m")
        tmp-path (str (.toString base-path) "/tmp")
        ;; count/first force the seq: vec-iterator must be finite and re-walkable
        nrows (count vec-iterator)
        ncols (.size (first vec-iterator))
        ;; bound only for its side effects: append every row, then close
        writer (doto (SequenceFile$Writer. fs
                                           hadoop-conf
                                           (Path. m-path)
                                           IntWritable
                                           VectorWritable)
                 ((fn [wrt]
                    (doall
                     (map #(.append wrt
                                    (IntWritable. %1)
                                    (VectorWritable. %2))
                          (range 0 nrows)
                          vec-iterator))))
                 (.close))]
    (doto
        (DistributedRowMatrix. m-path
                               tmp-path
                               nrows
                               ncols)
      (.configure (JobConf. hadoop-conf)))))
99 |
(defn local-matrix
  "Builds an in-memory DenseMatrix from a seq of row collections (all rows
  assumed the same length as the first)."
  [data]
  (let [m (DenseMatrix. (count data) (count (first data)))]
    (doseq [[i row] (map-indexed vector data)]
      (.assignRow m i (create-vector row)))
    m))
106 |
;; Matrix multiplication, dispatched on the concrete type of the left operand.
(defmulti mmult (fn [A & B] (type A)))

;; Distributed case: a DistributedRowMatrix multiplies only by a vector, so
;; the product is assembled column-by-column (in parallel via pmap) into a
;; local dense matrix.
(defmethod mmult DistributedRowMatrix [A B]
  (let [num-rows (.numRows A)
        ;; NOTE(review): wrapping (.size B) in int-array looks redundant if
        ;; Mahout's size() already returns an int[] -- confirm.
        num-cols (second (int-array (.size B)))]
    (doto (DenseMatrix. num-rows num-cols)
      ((fn [m] (doall (pmap #(.assignColumn m % (.times A (.getColumn B %)))
                            (range 0 num-cols))))))))
;; Local case: delegate to Mahout's in-memory times.
(defmethod mmult :default [A B]
  (.times A B))
117 |
(defn diag
  "Square sparse matrix with vals on the diagonal and zeros elsewhere."
  [vals]
  (let [n (count vals)
        m (SparseMatrix. (int-array [n n]))]
    (doseq [[i v] (map-indexed vector vals)]
      (.setQuick m i i v))
    m))
125 |
(defn invert-diagonal
  "Replaces every entry x of mat with 1/x, mapping 0 to 0 -- so a diagonal
  matrix of singular values becomes its pseudo-inverse. Delegates to
  Mahout's assign with a proxied UnaryFunction; presumably assign mutates
  mat in place and returns it -- confirm against the Mahout Matrix API."
  [mat]
  (.assign mat
           (proxy [UnaryFunction]
               []
             (apply [arg1] (if (= arg1 0) 0 (/ 1 arg1))))))
132 |
133 |
(defn transpose
  "Transpose of mat via the underlying Mahout implementation."
  [mat]
  (.transpose mat))
137 |
(defn normalize-matrix-columns
  "Returns a new dense matrix whose columns are the normalized columns of
  mat; columns are computed in parallel."
  [mat]
  (let [rows (.numRows mat)
        cols (.numCols mat)
        result (DenseMatrix. rows cols)]
    (doall
     (pmap (fn [j]
             (.assignColumn result j (.normalize (.getColumn mat j))))
           (range 0 cols)))
    result))
145 |
(defn decompose-svd
  "Rank-k truncated SVD of mat via Mahout's DistributedLanczosSolver.
  Solves for k+2 eigenpairs (presumably the extra two give Lanczos slack;
  only k survive the viewPart below -- confirm), builds V from the first k
  normalized eigenvectors, S from the k largest eigenvalues (the reverse
  suggests the solver fills `eigenvalues` in ascending order -- confirm),
  and U = mat * V. Returns {:U U :S S :V V}."
  [mat k]
  (let [eigenvalues (new java.util.ArrayList)
        eigenvectors (DenseMatrix. (+ k 2) (.numCols mat))
        ;; bound for its side effect: .solve fills eigenvectors/eigenvalues
        decomposer (doto (DistributedLanczosSolver.)
                     (.solve mat (+ k 2) eigenvectors eigenvalues false))
        ;; first k columns of the transposed eigenvector matrix, normalized
        V (normalize-matrix-columns (.viewPart (.transpose eigenvectors)
                                               (int-array [0 0])
                                               (int-array [(.numCols mat) k])))
        U (mmult mat V)
        S (diag (take k (reverse eigenvalues)))]
    {:U U
     :S S
     :V V}))
160 |
161 |
162 |
163 |
--------------------------------------------------------------------------------
/test/lsa4solr/core_test.clj:
--------------------------------------------------------------------------------
1 | (ns lsa4solr.core-test
2 | (:use [lsa4solr.core] :reload-all)
3 | (:use [clojure.test]))
4 |
;; Real coverage for lsa4solr.core/to-array-of, replacing the generated
;; placeholder test that unconditionally failed with (is false).
(deftest to-array-of-test
  (testing "builds a typed array preserving element order"
    (let [arr (to-array-of String ["a" "b" "c"])]
      (is (= String (.getComponentType (class arr))))
      (is (= ["a" "b" "c"] (vec arr)))))
  (testing "empty collection yields an empty array"
    (is (zero? (alength (to-array-of String []))))))
7 |
--------------------------------------------------------------------------------