├── .gitignore ├── LICENSE ├── README.md ├── project.clj ├── resources ├── core-site.xml └── mapred-site.xml ├── src └── lsa4solr │ ├── cluster.clj │ ├── clustering_protocol.clj │ ├── core.clj │ ├── dendrogram.clj │ ├── hadoop_utils.clj │ ├── hierarchical_clustering.clj │ ├── lucene_utils.clj │ └── mahout_matrix.clj └── test └── lsa4solr └── core_test.clj /.gitignore: -------------------------------------------------------------------------------- 1 | *.jar 2 | classes/* 3 | lib/* 4 | *~ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Licensed under the Apache License, Version 2.0 (the "License"); 2 | you may not use this file except in compliance with the License. 3 | You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | lsa4solr 2 | ======== 3 | 4 | A clustering engine for Solr based on Latent Semantic Analysis. The engine 5 | constructs a term frequency matrix which it stores in memory. When requests for 6 | clustering documents are made, the term document matrix is constructed for the 7 | documents in the query result and decomposed using Singular Value Decomposition. 8 | The document vectors are then reconstructed based on a reduced rank parameter to 9 | get rid of noise. 
These reconstructed document vectors are clustered by comparing 10 | the cosine distance of each individual document to the first n principal 11 | components. 12 | 13 | Decomposition is performed using the DistributedLanczosSolver from Apache Mahout on 14 | a Hadoop cluster. After decomposition of the term-document matrix, the reduced rank 15 | document vectors are clustered using k-means clustering, also from Apache Mahout, or 16 | a local hierarchical clustering method. The number of clusters must be supplied if 17 | using the kmeans algorithm. A dendrogram is produced as output from the hierarchical 18 | clustering. The dendrogram is suitable as input to the [JavaScript InfoVis Toolkit](http://thejit.org/). 19 | 20 | Development goals include determining the optimal number of clusters, optimizing 21 | the reduced rank, etc. 22 | 23 | Building 24 | -------- 25 | 26 | lsa4solr depends on the 3.1 development version of Solr and the 27 | 1.2 development version of Clojure. In order to build lsa4solr, 28 | you will need to build the appropriate versions of Solr and Clojure, 29 | generate the maven artifacts, and install them in your local 30 | maven repository. Then run: 31 | 32 | lein deps 33 | lein jar 34 | 35 | Installing 36 | ---------- 37 | 38 | Due to Clojure classloader requirements, you will need to install the 39 | lsa4solr jar and its dependencies into the Solr webapp/WEB-INF/lib directory 40 | rather than using the solrconfig.xml file to configure the path to the 41 | lsa4solr dependencies. 
The dependencies that need to be in the system 42 | classloader include: 43 | 44 | arpack-combo-0.1.jar 45 | clojure-1.2.0.jar 46 | clojure-contrib-1.2.0-master-20100122.191106-1.jar 47 | apache-solr-clustering-3.1-dev.jar 48 | parallelcolt-0.7.2.jar 49 | lsa4solr.jar 50 | netlib-java-0.9.1.jar 51 | hadoop-core-0.20.2.jar 52 | mahout-collections-0.4-SNAPSHOT.jar 53 | mahout-core-0.4-SNAPSHOT.jar 54 | mahout-math-0.4-SNAPSHOT.jar 55 | commons-cli-2.0-mahout.jar 56 | uncommons-maths-1.2.jar 57 | 58 | Configuring Solr 59 | ---------------- 60 | 61 | Add the following to your solrconfig.xml: 62 | 63 | <searchComponent name="lsa4solr" 64 | enable="${solr.clustering.enabled:false}" 65 | class="org.apache.solr.handler.clustering.ClusteringComponent"> 66 | 67 | <lst name="engine"> 68 | <str name="classname">lsa4solr.cluster.LSAClusteringEngine</str> 69 | <str name="name">lsa4solr</str> 70 | <str name="narrative-field">Summary</str> 71 | <str name="id-field">Summary</str> 72 | </lst> 73 | </searchComponent> 74 | 75 | <requestHandler name="/lsa4solr" enable="${solr.clustering.enabled:false}" 76 | class="solr.SearchHandler"> 77 | <lst name="defaults"> 78 | <bool name="clustering">true</bool> 79 | <str name="clustering.engine">lsa4solr</str> 80 | <bool name="clustering.results">true</bool> 81 | </lst> 82 | <arr name="last-components"> 83 | <str>lsa4solr</str> 84 | </arr> 85 | </requestHandler> 86 | 87 | Configure the narrative-field parameter to be the text field of the 88 | schema you are working with and the id-field parameter to be the unique 89 | field that will be returned. 90 | 91 | You will need to tweak the Solr filters on the narrative field in order 92 | to get the best results. I have been using the following set of filters 93 | to get decent results: 94 | 95 | 96 | 97 | 98 | 99 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | Hadoop Setup 113 | ------------ 114 | 115 | In order to use lsa4solr with Hadoop, make sure that the mahout-math-0.4.jar is 116 | in the Hadoop lib directory. This is a dependency of the mahout-core-0.4.jar which 117 | contains the distributed job. Put the core-site.xml and mapred-site.xml files from 118 | the resources directory into Solr's webapp/WEB-INF/classes directory and configure 119 | them to point to your Hadoop setup. 120 | 121 | 122 | Using 123 | ----- 124 | 125 | Start Solr with the -Dsolr.clustering.enabled=true option. 
Once the server 126 | has started, cluster your documents using a URL like 127 | 128 | http://localhost:8983/solr/lsa4solr?nclusters=2&q=Summary:.*&rows=100&k=10&algorithm=kmeans 129 | 130 | where 131 | 132 | algorithm - the algorithm to use for clustering (hierarchical or kmeans) 133 | k - the rank of the reduced SVD matrix 134 | nclusters - the number of clusters to group the documents into (kmeans only) 135 | q - the standard Solr query parameter 136 | rows - the standard Solr rows parameter 137 | 138 | The cluster information will be at the bottom of the response. 139 | 140 | Testing 141 | ------- 142 | 143 | There is a Usenet (20 Newsgroups) dataset on the Downloads page, which can also be found [here](http://people.csail.mit.edu/jrennie/20Newsgroups/). 144 | Import some documents from two or more of the newsgroups into your Solr instance and access the lsa4solr URL. 145 | 146 | You can also use the clustering algorithm directly from the REPL: 147 | 148 | lein swank 149 | 150 | user> (in-ns 'lsa4solr.cluster) 151 | #<Namespace lsa4solr.cluster> 152 | lsa4solr.cluster> (def reader (org.apache.lucene.index.IndexReader/open (org.apache.lucene.store.FSDirectory/open (new java.io.File "/path/to/solr/data/index")))) 153 | #'lsa4solr.cluster/reader 154 | lsa4solr.cluster> (def initial-terms (init-term-freq-doc reader "Summary")) 155 | #'lsa4solr.cluster/initial-terms 156 | lsa4solr.cluster> (def searcher (new org.apache.lucene.search.IndexSearcher reader)) 157 | #'lsa4solr.cluster/searcher 158 | lsa4solr.cluster> (def queryparser 159 | (new org.apache.lucene.queryParser.QueryParser 160 | (org.apache.lucene.util.Version/LUCENE_30) 161 | "Summary" 162 | (new org.apache.lucene.analysis.SimpleAnalyzer))) 163 | #'lsa4solr.cluster/queryparser 164 | lsa4solr.cluster> (def result (. searcher search (. queryparser parse "Summary:br*") (. reader maxDoc))) 165 | #'lsa4solr.cluster/result 166 | lsa4solr.cluster> (def docids (map #(. %1 doc) (. 
result scoreDocs))) 167 | #'lsa4solr.cluster/docids 168 | lsa4solr.cluster> (def docslice (new org.apache.solr.search.DocSlice 0 (count docids) (int-array docids) (float-array (repeat (count docids) 1)) (count docids) 1)) 169 | #'lsa4solr.cluster/docslice 170 | lsa4solr.cluster> (def kmeans-clst (cluster-kmeans-docs reader initial-terms docids 50 2 "Summary" "id")) 171 | lsa4solr.cluster> (def hierarchical-clst (cluster-hierarchical-docs reader initial-terms docids 50 "Summary" "id")) 172 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject lsa4solr "1.0.0-SNAPSHOT" 2 | :description "Clustering component for Solr based on Latent Semantic Analysis" 3 | :namespaces :all 4 | :repositories {"apache" "https://repository.apache.org/"} 5 | :dependencies [[org.clojure/clojure "1.2.0-master-SNAPSHOT"] 6 | [org.clojure/clojure-contrib "1.2.0-master-SNAPSHOT"] 7 | [org.apache.mahout/mahout-core "0.4-SNAPSHOT" 8 | :exclusions [org.apache.lucene/lucene-core 9 | org.apache.lucene/lucene-analyzers]] 10 | [org.apache.mahout/mahout-math "0.4-SNAPSHOT"] 11 | [org.slf4j/slf4j-log4j12 "1.5.11"]] 12 | :dev-dependencies [[leiningen/lein-swank "1.1.0"] 13 | [org.apache.solr/solr-core "3.1-SNAPSHOT" :exclusions [org.apache.lucene/lucene-snowball]] 14 | [org.apache.solr/solr-clustering "3.1-SNAPSHOT" :exclusions [org.carrot2/carrot2-mini]]]) 15 | -------------------------------------------------------------------------------- /resources/core-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> 3 | 4 | <!-- Site-specific property overrides. --> 5 | 6 | <configuration> 7 | 8 | <property> 9 | <name>fs.default.name</name> 10 | <value>${fs.default.name}</value> 11 | </property> 12 | 13 | <property> 14 | <name>hadoop.tmp.dir</name> 15 | <value>${hadoop.tmp.dir}</value> 16 | </property> 17 | 18 | </configuration> 19 | 20 | -------------------------------------------------------------------------------- /resources/mapred-site.xml: 
-------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> 3 | 4 | <!-- Site-specific property overrides. --> 5 | 6 | <configuration> 7 | 8 | <property> 9 | <name>mapred.job.tracker</name> 10 | <value>${mapred.job.tracker}</value> 11 | </property> 12 | <property> 13 | <name>mapred.map.tasks</name> 14 | <value>32</value> 15 | </property> 16 | <property> 17 | <name>mapred.reduce.tasks</name> 18 | <value>4</value> 19 | </property> 20 | <property> 21 | <name>mapred.local.dir</name> 22 | <value>${mapred.local.dir}</value> 23 | </property> 24 | 25 | </configuration> 26 | 27 | -------------------------------------------------------------------------------- /src/lsa4solr/cluster.clj: -------------------------------------------------------------------------------- 1 | (ns lsa4solr.cluster 2 | (:use [clojure.contrib.seq-utils :only [indexed]] 3 | [lsa4solr core clustering-protocol])) 4 | 5 | (gen-class 6 | :name lsa4solr.cluster/LSAClusteringEngine 7 | :extends org.apache.solr.handler.clustering.SearchClusteringEngine 8 | :exposes-methods {init superinit} 9 | :init initialize-state 10 | :state state) 11 | 12 | (defn -initialize-state [] 13 | [[] (ref {})]) 14 | 15 | (defn init-term-freq-doc [reader field] 16 | (let [terms (. reader terms) 17 | numdocs (.maxDoc reader) 18 | counter (let [count (ref 0)] #(dosync (alter count inc)))] 19 | (apply merge 20 | (take-while 21 | #(= (nil? %1) false) 22 | (repeatedly 23 | (fn [] 24 | (if (and (. terms next) (= field (.field (. terms term)))) 25 | (let [text (. (. terms term) text) 26 | df (. 
terms docFreq)] 27 | {(keyword text) 28 | { 29 | :df df 30 | :idf (java.lang.Math/log (/ numdocs df)) 31 | :idx (counter) 32 | } 33 | }) 34 | nil))))))) 35 | 36 | (defn -init [this 37 | config 38 | solr-core] 39 | (let [super-result (.superinit this config solr-core) 40 | reader (.getReader (.get (.getSearcher solr-core true true nil))) 41 | narrative-field (.get config "narrative-field") 42 | id-field (.get config "id-field") 43 | name (.get config "name")] 44 | (dosync 45 | (alter (.state this) assoc 46 | :reader reader 47 | :name name 48 | :narrative-field narrative-field 49 | :id-field id-field 50 | :terms (init-term-freq-doc reader narrative-field)) 51 | name))) 52 | 53 | (defn cluster-dispatch 54 | ;; K-Means 55 | ([reader field id-field terms doc-list k num-clusters] 56 | (let [doc-seq (iterator-seq (.iterator doc-list)) 57 | clusters (cluster-kmeans-docs reader terms doc-seq k num-clusters field id-field)] 58 | (:clusters clusters))) 59 | 60 | ;; Hierarchical 61 | ([reader field id-field terms doc-list k] 62 | (let [doc-seq (iterator-seq (.iterator doc-list)) 63 | clusters (cluster-hierarchical-docs reader terms doc-seq k field id-field)] 64 | (:clusters clusters)))) 65 | 66 | 67 | (defn -cluster [this 68 | query 69 | doc-list 70 | solr-request] 71 | (let [algorithm (.get (.getParams solr-request) "algorithm")] 72 | (cond 73 | (= algorithm "hierarchical") (cluster-dispatch (:reader @(.state this)) 74 | (:narrative-field @(.state this)) 75 | (:id-field @(.state this)) 76 | (:terms @(.state this)) 77 | doc-list 78 | (Integer. (.get (.getParams solr-request) "k"))) 79 | (= algorithm "kmeans") (cluster-dispatch (:reader @(.state this)) 80 | (:narrative-field @(.state this)) 81 | (:id-field @(.state this)) 82 | (:terms @(.state this)) 83 | doc-list 84 | (Integer. (.get (.getParams solr-request) "k")) 85 | (Integer. 
(.get (.getParams solr-request) "nclusters")))))) 86 | 87 | -------------------------------------------------------------------------------- /src/lsa4solr/clustering_protocol.clj: -------------------------------------------------------------------------------- 1 | (ns lsa4solr.clustering-protocol 2 | (:use [lsa4solr core hadoop-utils lucene-utils mahout-matrix hierarchical-clustering dendrogram]) 3 | (:require [clojure [zip :as z]]) 4 | (:require [clojure.contrib 5 | [seq-utils :as seq-utils] 6 | [zip-filter :as zf]]) 7 | (:import (org.apache.hadoop.conf Configuration) 8 | (org.apache.hadoop.fs FileSystem Path) 9 | (org.apache.hadoop.io Text SequenceFile$Reader) 10 | (org.apache.hadoop.fs.permission FsPermission) 11 | (org.apache.mahout.clustering.kmeans RandomSeedGenerator KMeansDriver))) 12 | 13 | (defn kmeans-cluster 14 | [num-clusters max-iterations V S] 15 | (let [hadoop-conf (Configuration.) 16 | fs (FileSystem/get hadoop-conf) 17 | base-path (Path. (str "/lsa4solr/kmeans-clustering/" (java.lang.System/nanoTime))) 18 | mkdirs-result (FileSystem/mkdirs fs 19 | base-path 20 | (FsPermission/getDefault)) 21 | reduced-fm (mmult V S) 22 | reduced-m-path (str (.toString base-path) "/reducedm") 23 | writer (write-matrix hadoop-conf reduced-fm reduced-m-path) 24 | initial-centroids (RandomSeedGenerator/buildRandom reduced-m-path 25 | (str (.toString base-path) "/centroids") 26 | num-clusters) 27 | cluster-output-path (str (.toString base-path) "/clusterout") 28 | job (KMeansDriver/runJob 29 | reduced-m-path 30 | (.toString initial-centroids) 31 | cluster-output-path 32 | "org.apache.mahout.common.distance.CosineDistanceMeasure" 33 | 0.00000001 34 | max-iterations 35 | num-clusters) 36 | tkey (Text.) 37 | tval (Text.) 38 | groups (clojure.contrib.seq-utils/flatten 39 | (map (fn [file-status] (let [path (.getPath file-status) 40 | seq-reader (SequenceFile$Reader. 
fs path hadoop-conf) 41 | valseq (take-while (fn [v] (.next seq-reader tkey tval)) 42 | (repeat [tkey tval]))] 43 | (map #(.toString (second %)) valseq))) 44 | (.globStatus fs (Path. (str cluster-output-path "/points/part*")))))] 45 | groups)) 46 | 47 | (defn emit-leaf-node-fn 48 | [reader doc-seq id-field] 49 | (fn [node] 50 | (hash-map "name" (get-docid reader id-field (nth doc-seq (:id node))) 51 | "id" (get-docid reader id-field (nth doc-seq (:id node))) 52 | "data" {} 53 | "children" []))) 54 | 55 | (defn emit-branch-node-fn 56 | [] 57 | (let [id (ref 0)] 58 | (fn [node children-arr] 59 | (hash-map "name" (:count (meta node)) 60 | "id" (dosync (alter id inc)) 61 | "data" (hash-map "count" (:count (meta node))) 62 | "children" children-arr)))) 63 | 64 | (defn hierarchical-clustering 65 | [reader id-field doc-seq mat] 66 | (let [[dend merge-sequence] (last (hclust mat))] 67 | (dendrogram-to-map dend (emit-branch-node-fn) (emit-leaf-node-fn reader doc-seq id-field)))) 68 | 69 | (defn get-mapper-common [terms vec-ref ndocs update-ref] 70 | (proxy [org.apache.lucene.index.TermVectorMapper] 71 | [] 72 | (map [term frequency offsets positions] 73 | (let [term-entry ((keyword term) terms)] 74 | (dosync 75 | (update-ref vec-ref (- (:idx term-entry) 1) (* frequency (:idf term-entry)))))) 76 | (setExpectations [field numTerms storeOffsets storePositions] 77 | nil))) 78 | 79 | 80 | (defn get-mapper 81 | [terms vec-ref ndocs] 82 | (get-mapper-common terms vec-ref ndocs 83 | (fn [vec-ref idx weight] 84 | (set-value @vec-ref idx weight)))) 85 | 86 | (defn init-frequency-vector 87 | [n] 88 | (ref (create-vector n))) 89 | 90 | (defn get-frequency-matrix 91 | [reader field terms hits] 92 | (distributed-matrix (extract-frequency-vectors 93 | reader 94 | (fn [n] (init-frequency-vector n)) 95 | (fn [terms vec-ref ndocs] 96 | (get-mapper terms 97 | vec-ref 98 | ndocs)) 99 | field 100 | terms 101 | hits))) 102 | 103 | (defn decompose-term-doc-matrix 104 | [reader narrative-field 
terms doc-seq k] 105 | (let [fm (transpose (get-frequency-matrix reader 106 | narrative-field 107 | terms 108 | doc-seq)) 109 | svd-factorization (decompose-svd fm k) 110 | U (:U svd-factorization) 111 | S (:S svd-factorization) 112 | V (:V svd-factorization)] 113 | (list U S V))) 114 | 115 | (defn cluster-kmeans-docs 116 | [reader 117 | terms 118 | doc-seq 119 | k 120 | num-clusters 121 | narrative-field 122 | id-field] 123 | (let [[U S V] (decompose-term-doc-matrix reader narrative-field terms doc-seq k) 124 | groups (kmeans-cluster num-clusters k V S) 125 | clusters (apply merge-with #(into %1 %2) 126 | (map #(hash-map (keyword (second %)) 127 | (list (get-docid reader id-field (nth doc-seq (first %1))))) 128 | (seq-utils/indexed groups)))] 129 | {:clusters clusters 130 | :U U 131 | :S S 132 | :V V})) 133 | 134 | (defn cluster-hierarchical-docs 135 | [reader 136 | terms 137 | doc-seq 138 | k 139 | narrative-field 140 | id-field] 141 | (let [[U S V] (decompose-term-doc-matrix reader narrative-field terms doc-seq k) 142 | SVt (transpose (mmult S (transpose V))) 143 | clusters (hierarchical-clustering reader id-field doc-seq SVt)] 144 | {:clusters clusters 145 | :U U 146 | :S S 147 | :V V})) -------------------------------------------------------------------------------- /src/lsa4solr/core.clj: -------------------------------------------------------------------------------- 1 | (ns lsa4solr.core) 2 | 3 | (defn to-array-of [class coll] 4 | (let [array (make-array class (count coll))] 5 | (dorun (map (fn [item index] (aset array index item)) 6 | coll 7 | (iterate inc 0))) 8 | array)) 9 | -------------------------------------------------------------------------------- /src/lsa4solr/dendrogram.clj: -------------------------------------------------------------------------------- 1 | (ns lsa4solr.dendrogram 2 | (:require [clojure [zip :as z]]) 3 | (:require [clojure.contrib 4 | [combinatorics :as combine] 5 | [zip-filter :as zf] 6 | [seq-utils :as seq-utils]])) 7 | 8 | 
(defn move-right 9 | "Moves right n steps from location" 10 | [loc n] 11 | (nth (iterate #(z/right %) loc) n)) 12 | 13 | (defn remove-nodes 14 | "Removes multiple nodes in a single pass" 15 | [loc & n] 16 | (let [sorted-indexes (sort n) 17 | increments (range 0 (count sorted-indexes)) 18 | incremental-indexes (map #(- %1 %2) sorted-indexes increments)] 19 | (reduce #(z/seq-zip (z/root (z/remove (move-right (z/down %1) %2)))) 20 | loc 21 | incremental-indexes))) 22 | 23 | (defn merge-nodes 24 | "Merges two nodes by calling new-node to create the new node. 25 | Pulls all other nodes up one level to maintain dendrogram." 26 | [root node-indexes new-node] 27 | (let [n1 (z/node (move-right (z/down root) (first node-indexes))) 28 | n2 (z/node (move-right (z/down root) (second node-indexes))) 29 | new-tree (z/seq-zip 30 | (map #(with-meta 31 | (list (with-meta (z/node %) (meta (z/node %)))) 32 | (meta (z/node %))) 33 | (zf/children (apply remove-nodes root node-indexes))))] 34 | (z/seq-zip (z/root (z/insert-child 35 | new-tree 36 | (new-node n1 n2)))))) 37 | 38 | (defn bfs-depth-seq 39 | "Walks a tree to a certain depth and returns a lazy sequence of all nodes at the specified depth" [branch? children root height] 40 | (let [walk (fn walk [queue] 41 | (when-let [node (peek queue)] 42 | (lazy-seq 43 | (cond 44 | (< (:depth node) height) (walk 45 | (into (pop queue) 46 | (when (branch? (:node node)) 47 | (map #(hash-map :node % :depth (inc (:depth node))) 48 | (children (:node node)))))) 49 | :default (cons (:node node) 50 | (walk (pop queue)))))))] 51 | (walk (conj clojure.lang.PersistentQueue/EMPTY (hash-map :node root :depth 0))))) 52 | 53 | 54 | (defn dendrogram-to-map 55 | [node emit-branch-node emit-leaf-node] 56 | (cond (z/branch? node) (emit-branch-node (z/node node) 57 | (map #(dendrogram-to-map % emit-branch-node emit-leaf-node) 58 | (zf/children node))) 59 | :default (emit-leaf-node (z/node node)))) 60 | 61 | (defn cut 62 | "Cuts dendrogram at depth. 
Returns flattened descendants of groups at depth." 63 | [root depth] 64 | (map (fn [node] 65 | (map #(:id (z/node %)) 66 | (filter #(not (z/branch? %)) 67 | (zf/descendants node)))) 68 | (bfs-depth-seq z/branch? zf/children root depth))) 69 | 70 | (defn dendrogram 71 | "Constructs a new dendrogram from a sequence of elements" 72 | [els] 73 | (z/seq-zip els)) -------------------------------------------------------------------------------- /src/lsa4solr/hadoop_utils.clj: -------------------------------------------------------------------------------- 1 | (ns lsa4solr.hadoop-utils 2 | (:import (org.apache.mahout.math VectorWritable) 3 | (org.apache.hadoop.io IntWritable) 4 | (org.apache.hadoop.fs FileSystem Path) 5 | (org.apache.hadoop.io SequenceFile$Writer))) 6 | 7 | (defn write-vectors [writer 8 | m] 9 | (doall (map #(.append writer %1 (VectorWritable. (.vector %2))) 10 | (map #(IntWritable. %) 11 | (range 0 (.numRows m))) 12 | (iterator-seq (.iterator m))))) 13 | 14 | (defn write-matrix [hadoop-conf m path-string] 15 | (let [fs (FileSystem/get hadoop-conf) 16 | path (Path. path-string)] 17 | (doto (SequenceFile$Writer. 
fs 18 | hadoop-conf 19 | path 20 | IntWritable 21 | VectorWritable) 22 | (write-vectors m) 23 | (.close)))) -------------------------------------------------------------------------------- /src/lsa4solr/hierarchical_clustering.clj: -------------------------------------------------------------------------------- 1 | (ns lsa4solr.hierarchical-clustering 2 | (:use [lsa4solr mahout-matrix dendrogram]) 3 | (:require [clojure [zip :as z]]) 4 | (:require [clojure.contrib 5 | [combinatorics :as combine] 6 | [zip-filter :as zf] 7 | [seq-utils :as seq-utils]]) 8 | (:import (org.apache.mahout.math SparseMatrix 9 | RandomAccessSparseVector 10 | VectorWritable 11 | Matrix 12 | DenseMatrix) 13 | (org.apache.mahout.math.hadoop DistributedRowMatrix))) 14 | 15 | (defn get-count 16 | [cluster] 17 | (:count (meta cluster))) 18 | 19 | (defn get-centroid 20 | [cluster] 21 | (:centroid (meta cluster))) 22 | 23 | (defn merge-centroids 24 | [c1 c2] 25 | (add (mult (get-centroid c1) (double (/ 1 (get-count c1)))) 26 | (mult (get-centroid c2) (double (/ 1 (get-count c2)))))) 27 | 28 | (defn get-vecs 29 | [mat idxs] 30 | (map #(.getRow mat %) idxs)) 31 | 32 | (defn average-dispersion 33 | [mat group centroid dist] 34 | (/ (reduce + (map #(dist centroid %) (get-vecs mat group))) 35 | (count group))) 36 | 37 | (defn average-intercluster-dispersion 38 | [mat clusters dist] 39 | (let [centroids (map #(apply centroid (get-vecs mat %)) clusters) 40 | combos (combine/combinations centroids 2)] 41 | (/ (reduce + (map #(apply dist %) combos)) 42 | (count combos)))) 43 | 44 | (defn hclust 45 | "Hierarchical clustering of the rows of mat. Returns a dendrogram 46 | and a merge sequence. The dendrogram is a tree with doc ids as 47 | leaf nodes and meta data in the branch nodes indicating the number 48 | of children and the centroid of the branch." 
49 | [mat] 50 | (let [dend (dendrogram (map #(with-meta {:id %} 51 | (hash-map 52 | :centroid (.getRow mat %) 53 | :count 1)) 54 | (range 0 (.numRows mat)))) 55 | get-distance (memoize euclidean-distance)] 56 | (take 57 | (- (.numRows mat) 1) 58 | (iterate (fn [[dend merge-sequence]] 59 | (let [clusters (z/children dend) 60 | dists (map #(list % (get-distance (get-centroid (nth clusters (first %))) 61 | (get-centroid (nth clusters (second %))))) 62 | (combine/combinations (range 0 (count clusters)) 2)) 63 | closest-pair (first (reduce #(if (< (second %1) (second %2)) %1 %2) 64 | (first dists) 65 | (rest dists)))] 66 | (list (merge-nodes dend 67 | closest-pair 68 | (fn [n1 n2] 69 | (with-meta (list (with-meta n1 (meta n1)) (with-meta n2 (meta n2))) 70 | (hash-map :count (apply + (map get-count [n1 n2])) 71 | :centroid (merge-centroids n1 n2))))) 72 | (conj merge-sequence closest-pair)))) 73 | (list dend '()))))) -------------------------------------------------------------------------------- /src/lsa4solr/lucene_utils.clj: -------------------------------------------------------------------------------- 1 | (ns lsa4solr.lucene-utils) 2 | 3 | (defn get-docid [reader id-field id] 4 | (.stringValue (.getField (.document reader id) id-field))) 5 | 6 | (defn extract-frequency-vectors 7 | [reader init-frequency-vector get-mapper field terms hits] 8 | (map #(let [m (init-frequency-vector (count terms)) 9 | mapper (get-mapper terms m (count hits))] 10 | (do (. 
reader getTermFreqVector (int %1) field mapper) 11 | @m)) 12 | hits)) 13 | 14 | -------------------------------------------------------------------------------- /src/lsa4solr/mahout_matrix.clj: -------------------------------------------------------------------------------- 1 | (ns lsa4solr.mahout-matrix 2 | (:require [clojure.contrib.generic [math-functions :as math]]) 3 | (:import (org.apache.mahout.math SparseMatrix RandomAccessSparseVector VectorWritable Matrix DenseMatrix) 4 | (org.apache.mahout.math.hadoop DistributedRowMatrix) 5 | (org.apache.mahout.math.hadoop.decomposer DistributedLanczosSolver) 6 | (org.apache.mahout.math.function UnaryFunction TimesFunction) 7 | (org.apache.mahout.common.distance EuclideanDistanceMeasure) 8 | (org.apache.hadoop.fs Path FileSystem) 9 | (org.apache.hadoop.fs.permission FsPermission) 10 | (org.apache.hadoop.conf Configuration) 11 | (org.apache.hadoop.mapred JobConf) 12 | (org.apache.hadoop.io IntWritable SequenceFile$Writer))) 13 | 14 | (defn create-vector 15 | [data] 16 | (cond 17 | (coll? data) (doto (RandomAccessSparseVector. (count data)) 18 | ((fn [vec] (doall 19 | (map #(.setQuick vec %1 %2) 20 | (range 0 (count data)) 21 | data))))) 22 | (integer? data) (doto (RandomAccessSparseVector. 
data)))) 23 | 24 | (defn print-vector 25 | [v] 26 | (map #(.get %) (iterator-seq (.iterateAll v)))) 27 | 28 | (defn minus 29 | [v1 v2] 30 | (.minus v1 v2)) 31 | 32 | (defn mult 33 | [v1 s] 34 | (.times v1 s)) 35 | 36 | (defn divide 37 | [v1 s] 38 | (.divide v1 s)) 39 | 40 | (defn add 41 | [v1 v2] 42 | (.plus v1 v2)) 43 | 44 | (defn centroid 45 | [& vecs] 46 | (divide (reduce add vecs) (count vecs))) 47 | 48 | (def euclidean-distance-measure (EuclideanDistanceMeasure.)) 49 | 50 | (defn euclidean-distance 51 | [v1 v2] 52 | (.distance euclidean-distance-measure v1 v2)) 53 | 54 | (defn mean 55 | [& vecs] 56 | (divide (reduce add vecs) 57 | (count vecs))) 58 | 59 | (defn variance 60 | [& vecs] 61 | (let [sample-mean (apply mean vecs)] 62 | (/ (reduce + (map #(math/pow (euclidean-distance % sample-mean) 2) vecs)) 63 | (count vecs)))) 64 | 65 | (defn set-value 66 | ([#^RandomAccessSparseVector vector index value] (.setQuick vector index value))) 67 | 68 | (defn distributed-matrix 69 | [vec-iterator] 70 | (let [hadoop-conf (Configuration.) 71 | fs (FileSystem/get hadoop-conf) 72 | base-path (Path. (str "/lsa4solr/matrix/" (java.lang.System/nanoTime))) 73 | mkdirs-result (FileSystem/mkdirs fs 74 | base-path 75 | (FsPermission/getDefault)) 76 | m-path (str (.toString base-path) "/m") 77 | tmp-path (str (.toString base-path) "/tmp") 78 | nrows (count vec-iterator) 79 | ncols (.size (first vec-iterator)) 80 | writer (doto (SequenceFile$Writer. fs 81 | hadoop-conf 82 | (Path. m-path) 83 | IntWritable 84 | VectorWritable) 85 | ((fn [wrt] 86 | (doall 87 | (map #(.append wrt 88 | (IntWritable. %1) 89 | (VectorWritable. %2)) 90 | (range 0 nrows) 91 | vec-iterator)))) 92 | (.close))] 93 | (doto 94 | (DistributedRowMatrix. m-path 95 | tmp-path 96 | nrows 97 | ncols) 98 | (.configure (JobConf. hadoop-conf))))) 99 | 100 | (defn local-matrix 101 | [data] 102 | (doto (DenseMatrix. 
(count data) (count (first data))) 103 | ((fn [m] (doall 104 | (map (fn [row] (.assignRow m row (create-vector (nth data row)))) 105 | (range 0 (count data)))))))) 106 | 107 | (defmulti mmult (fn [A & B] (type A))) 108 | 109 | (defmethod mmult DistributedRowMatrix [A B] 110 | (let [num-rows (.numRows A) 111 | num-cols (second (int-array (.size B)))] 112 | (doto (DenseMatrix. num-rows num-cols) 113 | ((fn [m] (doall (pmap #(.assignColumn m % (.times A (.getColumn B %))) 114 | (range 0 num-cols)))))))) 115 | (defmethod mmult :default [A B] 116 | (.times A B)) 117 | 118 | (defn diag 119 | [vals] 120 | (doto (SparseMatrix. (int-array [(count vals) (count vals)])) 121 | ((fn [m] (doall (map #(.setQuick m %1 %2 %3) 122 | (range 0 (count vals)) 123 | (range 0 (count vals)) 124 | vals)))))) 125 | 126 | (defn invert-diagonal 127 | [mat] 128 | (.assign mat 129 | (proxy [UnaryFunction] 130 | [] 131 | (apply [arg1] (if (= arg1 0) 0 (/ 1 arg1)))))) 132 | 133 | 134 | (defn transpose 135 | [mat] 136 | (.transpose mat)) 137 | 138 | (defn normalize-matrix-columns 139 | [mat] 140 | (let [num-rows (.numRows mat) 141 | num-cols (.numCols mat)] 142 | (doto (DenseMatrix. num-rows num-cols) 143 | ((fn [m] (doall (pmap #(.assignColumn m % (.normalize (.getColumn mat %))) 144 | (range 0 num-cols)))))))) 145 | 146 | (defn decompose-svd 147 | [mat k] 148 | (let [eigenvalues (new java.util.ArrayList) 149 | eigenvectors (DenseMatrix. (+ k 2) (.numCols mat)) 150 | decomposer (doto (DistributedLanczosSolver.) 
151 | (.solve mat (+ k 2) eigenvectors eigenvalues false)) 152 | V (normalize-matrix-columns (.viewPart (.transpose eigenvectors) 153 | (int-array [0 0]) 154 | (int-array [(.numCols mat) k]))) 155 | U (mmult mat V) 156 | S (diag (take k (reverse eigenvalues)))] 157 | {:U U 158 | :S S 159 | :V V})) 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /test/lsa4solr/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns lsa4solr.core-test 2 | (:use [lsa4solr.core] :reload-all) 3 | (:use [clojure.test])) 4 | 5 | (deftest replace-me ;; FIXME: write 6 | (is false)) 7 | --------------------------------------------------------------------------------
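The clustering pipeline described in the README (weighted term-document matrix, truncated SVD, cosine-distance k-means) can be sketched independently of the Solr/Mahout/Hadoop stack. The sketch below is purely illustrative: it uses NumPy in place of lsa4solr's Mahout-backed matrices, the function names `truncated_svd_docs` and `cosine_kmeans` are invented for this example, and a deterministic farthest-first seeding stands in for Mahout's RandomSeedGenerator.

```python
# Illustrative only: NumPy stand-in for the lsa4solr pipeline
# (term-document matrix -> truncated SVD -> cosine k-means).
import numpy as np

def truncated_svd_docs(td, k):
    """Rank-k document vectors from a terms x docs matrix `td`.

    Returns a docs x k matrix, the counterpart of
    (transpose (mmult S (transpose V))) in cluster-hierarchical-docs."""
    U, s, Vt = np.linalg.svd(td, full_matrices=False)
    return (np.diag(s[:k]) @ Vt[:k, :]).T

def cosine_kmeans(X, n_clusters, iters=20):
    """k-means under cosine distance (cf. the CosineDistanceMeasure
    passed to KMeansDriver), seeded deterministically."""
    Xn = X / np.linalg.norm(X, axis=1, keepdims=True)
    centers = [Xn[0]]
    for _ in range(1, n_clusters):
        # Seed the next center with the point least similar to any chosen one.
        best = np.max(np.stack([Xn @ c for c in centers]), axis=0)
        centers.append(Xn[int(np.argmin(best))])
    centers = np.array(centers)
    for _ in range(iters):
        labels = (Xn @ centers.T).argmax(axis=1)  # most similar centroid
        for c in range(n_clusters):
            members = Xn[labels == c]
            if len(members):
                m = members.mean(axis=0)
                n = np.linalg.norm(m)
                if n:
                    centers[c] = m / n
    return labels
```

Clustering the reduced document vectors rather than the raw term counts is what discards the noise dimensions controlled by the README's `k` (reduced rank) parameter.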