├── .env ├── .gitignore ├── .travis.yml ├── README.md ├── project.clj ├── resources └── log4j.properties ├── src └── tf_idf │ └── core.clj └── test └── tf_idf └── core_test.clj /.env: -------------------------------------------------------------------------------- 1 | SPARK_LOCAL_IP=127.0.0.1 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .lein-repl-history 3 | .nrepl-port 4 | sparkling_getting_started.iml 5 | access.log 6 | .lein-failures 7 | *.iml 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: clojure 2 | lein: lein2 3 | jdk: 4 | - openjdk7 5 | - oraclejdk8 6 | script: "lein do clean, test" 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Getting started project for [Sparkling](https://gorillalabs.github.io/sparkling/) 2 | 3 | This is a companion repo to the sparkling [guide](https://gorillalabs.github.io/sparkling/articles/tfidf_guide.html). Please see this guide for further information. 
-------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject gorillalabs/sparkling-getting-started "2.0.0-SNAPSHOT" 2 | :description "A Sample project demonstrating the use of Sparkling (https://gorillalabs.github.io/sparkling/), as shown in tutorial https://gorillalabs.github.io/sparkling/articles/tfidf_guide.html" 3 | :url "https://gorillalabs.github.io/sparkling/articles/tfidf_guide.html" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[org.clojure/clojure "1.8.0"] 7 | [gorillalabs/sparkling "2.0.0"]] 8 | 9 | :aot [#".*" sparkling.serialization sparkling.destructuring] 10 | :main tf-idf.core 11 | :profiles {:provided {:dependencies [[org.apache.spark/spark-core_2.10 "2.1.0"] 12 | [org.apache.spark/spark-sql_2.10 "2.1.0"]]} 13 | :dev {:plugins [[lein-dotenv "RELEASE"]]}}) 14 | 15 | 16 | ;; run example with 17 | ;; lein run 18 | 19 | ;; test with 20 | ;; lein test 21 | -------------------------------------------------------------------------------- /resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # log4j config for clojure development 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Console appender 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %5p %c{2}:%L - %m%n 8 | 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | 13 | # uncomment for tracing of fns 14 | # log4j.logger.sparkling=trace 15 | # log4j.logger.serializable.fn=trace 16 | # log4j.logger.sparkling=DEBUG 17 | 
-------------------------------------------------------------------------------- /src/tf_idf/core.clj: -------------------------------------------------------------------------------- 1 | (ns tf-idf.core 2 | (:require [clojure.string :as string] 3 | [sparkling.conf :as conf] 4 | [sparkling.core :as spark] 5 | [sparkling.serialization] 6 | [sparkling.destructuring :as s-de] [clojure.pprint]) 7 | (:gen-class)) 8 | 9 | 10 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 11 | ;; Basic term handling functions 12 | 13 | (def stopwords "English stopwords removed from every document before term counting (see remove-stopwords)." #{"a" "all" "and" "any" "are" "is" "in" "of" "on" 14 | "or" "our" "so" "this" "the" "that" "to" "we"}) 15 | 16 | (defn terms "Lower-cases content and splits it on single spaces into a seq of terms (splits on the space character only, not general whitespace)." [content] 17 | (map string/lower-case (string/split content #" "))) 18 | 19 | (def remove-stopwords "Removes all members of the stopwords set from a seq of terms." (partial remove (partial contains? stopwords))) 20 | 21 | 22 | (remove-stopwords (terms "A quick brown fox jumps")) 23 | 24 | 25 | 26 | 27 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 28 | ;; tf / idf / tf*idf functions 29 | 30 | (defn idf "Inverse document frequency: ln(doc-count / (1 + doc-count-for-term)). The 1.0 offset smooths the ratio so a term occurring in every document scores below zero rather than exactly zero." [doc-count doc-count-for-term] 31 | (Math/log (/ doc-count (+ 1.0 doc-count-for-term)))) 32 | 33 | 34 | (System/getenv "SPARK_LOCAL_IP") 35 | 36 | 37 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 38 | ;; Basic Spark management 39 | 40 | (defn make-spark-context "Builds a SparkContext on master local[*] with app name tfidf." [] 41 | (let [c (-> (conf/spark-conf) 42 | (conf/master "local[*]") 43 | (conf/app-name "tfidf"))] 44 | (spark/spark-context c))) 45 | 46 | 47 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 48 | ;; Basic data model generation functions 49 | (defn term-count-from-doc 50 | "Returns a stopword filtered seq of tuples of doc-id,[term term-count doc-terms-count]" 51 | [doc-id content] 52 | (let [terms (remove-stopwords 53 | (terms content)) 54 | doc-terms-count (count terms) 55 | term-count (frequencies terms)] 56 | (map (fn [term] (spark/tuple [doc-id term] [(term-count term) doc-terms-count])) 57 | (distinct terms)))) 58 | 59 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 60 | ;; Spark Transformations / Actions 61 | 62 | 
(defn document-count "Action: returns the number of documents in the RDD." [documents] 63 | (spark/count documents)) 64 | 65 | ; (term-count-from-doc "doc1" "A quick brown fox") 66 | 67 | 68 | (defn term-count-by-doc-term "Flat-maps (doc-id, content) pairs to ([doc-id term], [term-count doc-terms-count]) tuples via term-count-from-doc; the result is cached because tf-idf reuses it." [documents] 69 | (->> 70 | documents 71 | (spark/flat-map-to-pair 72 | (s-de/key-value-fn term-count-from-doc)) 73 | spark/cache)) 74 | 75 | (defn document-count-by-term "Counts, per term, how many documents contain it: emits (term, 1) for each doc/term pair and sums by key." [document-term-count] 76 | (->> document-term-count 77 | (spark/map-to-pair (s-de/key-value-fn 78 | (fn [[_ term] [_ _]] (spark/tuple term 1)))) 79 | (spark/reduce-by-key +))) 80 | 81 | (defn idf-by-term "Maps per-term document counts to (term, idf) for a corpus of doc-count documents." [doc-count doc-count-for-term-rdd] 82 | (spark/map-values (partial idf doc-count) doc-count-for-term-rdd)) 83 | 84 | (defn tf-by-doc-term "Re-keys the RDD to (term, [doc tf]) where tf = term-count / doc-terms-count (kept as a ratio)." [document-term-count] 85 | (spark/map-to-pair (s-de/key-value-fn 86 | (fn [[doc term] [term-count doc-terms-count]] 87 | (spark/tuple term [doc (/ term-count doc-terms-count)]))) 88 | document-term-count)) 89 | 90 | 91 | (defn tf-idf-by-doc-term "Joins tf and idf by term, yielding ([doc term], tf*idf). NOTE(review): doc-count is accepted but unused here; kept for interface compatibility." [doc-count document-term-count term-idf] 92 | (->> (spark/join (tf-by-doc-term document-term-count) term-idf) 93 | (spark/map-to-pair (s-de/key-val-val-fn 94 | (fn [term [doc tf] idf] 95 | (spark/tuple [doc term] (* tf idf))))) 96 | )) 97 | 98 | 99 | (defn tf-idf "Computes the tf-idf RDD ([doc term], tf*idf) for a corpus of (doc-id, content) pairs." [corpus] 100 | (let [doc-count (document-count corpus) 101 | document-term-count (term-count-by-doc-term corpus) 102 | term-idf (idf-by-term doc-count (document-count-by-term document-term-count))] 103 | (tf-idf-by-doc-term doc-count document-term-count term-idf))) 104 | 105 | (def tuple-swap "Fn swapping key and value of a scala.Tuple2." (memfn ^scala.Tuple2 swap)) 106 | 107 | (def swap-key-value "Pair-RDD transformation swapping key and value in every tuple." (partial spark/map-to-pair tuple-swap)) 108 | 109 | (defn sort-by-value "Sorts a pair-RDD by value, descending: swap, sort by key, swap back." [rdd] 110 | (->> rdd 111 | swap-key-value 112 | (spark/sort-by-key compare false) 113 | swap-key-value 114 | )) 115 | 116 | 117 | 118 | (defn -main "Runs the tf-idf example over four hard-coded documents, printing the RDD debug string and the collected result." [& args] 119 | (let [sc (make-spark-context) 120 | documents [(spark/tuple :doc1 "Four score and seven years ago our fathers brought forth on this continent a new nation") 121 | (spark/tuple :doc2 "conceived in Liberty and dedicated to 
the proposition that all men are created equal") 122 | (spark/tuple :doc3 "Now we are engaged in a great civil war testing whether that nation or any nation so") 123 | (spark/tuple :doc4 "conceived and so dedicated can long endure We are met on a great battlefield of that war")] 124 | corpus (spark/parallelize-pairs sc documents) 125 | tf-idf (tf-idf corpus)] 126 | (println (.toDebugString tf-idf)) 127 | (clojure.pprint/pprint (spark/collect tf-idf)) 128 | #_(clojure.pprint/pprint (spark/take 10 (sort-by-value tf-idf))) 129 | )) 130 | -------------------------------------------------------------------------------- /test/tf_idf/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns tf-idf.core-test 2 | (:import [org.apache.spark.serializer KryoSerializer] 3 | [org.apache.spark.serializer KryoSerializerInstance]) 4 | (:require [clojure.test :refer :all] 5 | [tf-idf.core :refer :all] 6 | [sparkling.core :as spark] 7 | [sparkling.conf :as conf] 8 | [sparkling.destructuring :as s-de] 9 | [sparkling.serialization :as required-to-have-serializer-class-ready])) 10 | 11 | 12 | (deftest sparkling-serialization 13 | (testing "registrator" 14 | (is (instance? KryoSerializerInstance 15 | (.newInstance (KryoSerializer. 
(-> (conf/spark-conf) 16 | (conf/master "local") 17 | ))))))) 18 | 19 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 20 | ;; Helper functions 21 | 22 | (def tuple2vec "Converts a Spark key/value tuple into a [key value] vector." (s-de/key-value-fn vector)) 23 | 24 | (defn first2vec "Returns the first tuple of rdd as a [key value] vector." [rdd] 25 | (tuple2vec (spark/first rdd))) 26 | 27 | (defn all2vec "Collects rdd and returns all its tuples as [key value] vectors." [rdd] 28 | (map tuple2vec (spark/collect rdd))) 29 | 30 | (defn round 31 | "Round d to the given precision (number of digits after the decimal point, not significant digits)" 32 | [precision d] 33 | (let [factor (Math/pow 10 precision)] 34 | (/ (Math/round (* d factor)) factor))) 35 | 36 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 37 | ;; Tests 38 | 39 | (deftest domain-functions-test 40 | (testing "tf functions" 41 | (is (= ["quick" "brown" "fox" "jumps"] 42 | (remove-stopwords (terms "A quick brown fox jumps")) 43 | ))) 44 | 45 | (testing "idf functions" 46 | (is (= -0.8109302162163288 (idf 4 8))) 47 | (is (= -0.2231435513142097 (idf 4 4))) 48 | (is (= 1.3862943611198906 (idf 4 0)))) 49 | 50 | (testing "domain model manipulation functions" 51 | (is (= [(spark/tuple ["doc1" "four"] [1 (int 3)]) 52 | (spark/tuple ["doc1" "score"] [1 (int 3)]) 53 | (spark/tuple ["doc1" "seven"] [1 (int 3)])] 54 | (into [] (term-count-from-doc "doc1" "Four score and seven")))))) 55 | 56 | 57 | 58 | 59 | (defn make-test-context "Builds the Spark conf for a single-threaded local test context." [] 60 | (-> (conf/spark-conf) 61 | (conf/master "local") 62 | (conf/app-name "tfidf-test"))) 63 | 64 | (def documents-fixture "Shared corpus of four (doc-id, content) tuples used by the RDD tests below." 65 | [(spark/tuple "doc1" "Four score and seven years ago our fathers brought forth on this continent a new nation") 66 | (spark/tuple "doc2" "conceived in Liberty and dedicated to the proposition that all men are created equal") 67 | (spark/tuple "doc3" "Now we are engaged in a great civil war testing whether that nation or any nation so") 68 | (spark/tuple "doc4" "conceived and so dedicated can long endure We are met on a great battlefield of that war")]) 69 | 70 | 71 | 72 | (deftest spark-functions-test 73 | (spark/with-context c (make-test-context) 74 | (testing "count the number of 
documents" 75 | (is (= 4 76 | (document-count 77 | (spark/parallelize-pairs c documents-fixture)))) 78 | (is (= 0 79 | (document-count 80 | (spark/parallelize-pairs c [])))) 81 | ) 82 | 83 | (testing "term-count-by-doc-term" 84 | (is (= [["doc1" "four"] [1 11]] 85 | (first2vec 86 | (term-count-by-doc-term 87 | (spark/parallelize-pairs c documents-fixture)))))) 88 | 89 | (testing "document-count-by-term" 90 | (is (= #{["four" 2] ["eggs" 1]} 91 | (into #{} (all2vec 92 | (document-count-by-term 93 | (spark/parallelize-pairs c [#sparkling/tuple [["doc1" "four"] [1 2]] 94 | #sparkling/tuple [["doc1" "eggs"] [1 2]] 95 | #sparkling/tuple [["doc2" "four"] [1 1]] 96 | ]))))))) 97 | 98 | (testing "tf-by-doc-term" 99 | (is (= #{["four" ["doc1" 1/2]] 100 | ["four" ["doc2" 1]] 101 | ["eggs" ["doc1" 1/2]]} 102 | (into #{} (all2vec 103 | (tf-by-doc-term (spark/parallelize-pairs c [#sparkling/tuple [["doc1" "four"] [1 2]] 104 | #sparkling/tuple [["doc1" "eggs"] [1 2]] 105 | #sparkling/tuple [["doc2" "four"] [1 1]] 106 | ]))))))) 107 | 108 | (testing "idf" 109 | (is (= #{["four" (round 4 (Math/log (/ 2 3)))] ["eggs" (round 4 (Math/log (/ 2 2)))]} 110 | (into #{} 111 | (map (fn [[term idf]] [term (round 4 idf)]) 112 | (all2vec 113 | (idf-by-term 2 114 | (spark/parallelize-pairs c [#sparkling/tuple ["four" 2] 115 | #sparkling/tuple ["eggs" 1]] 116 | )))))))) 117 | 118 | )) 119 | 120 | --------------------------------------------------------------------------------