├── .gitignore ├── test └── stemmers │ └── test │ ├── porter.clj │ ├── soundex.clj │ └── core.clj ├── project.clj ├── README.md ├── src └── stemmers │ ├── soundex.clj │ ├── core.clj │ └── porter.clj └── docs └── uberdoc.html /.gitignore: -------------------------------------------------------------------------------- 1 | pom.xml 2 | *jar 3 | lib 4 | classes 5 | -------------------------------------------------------------------------------- /test/stemmers/test/porter.clj: -------------------------------------------------------------------------------- 1 | (ns stemmers.test.porter 2 | (:use 3 | clojure.test 4 | stemmers.porter 5 | stemmers.test.porter-test-cases)) 6 | 7 | (deftest stems 8 | (doseq [[input output] test-cases] 9 | (is (= (stem input) output) (str "for " input)))) 10 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject stemmers "0.2.3" 2 | :description "A collection of stemmers for use in search engines and the like." 3 | :url "http://github.com/mattdw/stemmers" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[org.clojure/clojure "1.11.1"]] 7 | :dev-dependencies [[marginalia "0.3.2"] 8 | [swank-clojure "1.3.0-SNAPSHOT"]]) 9 | -------------------------------------------------------------------------------- /test/stemmers/test/soundex.clj: -------------------------------------------------------------------------------- 1 | (ns stemmers.test.soundex 2 | (:use clojure.test 3 | stemmers.soundex)) 4 | 5 | (deftest rupert-robert-rubin 6 | (is (= (stem "robert") "R163")) 7 | (is (= (stem "rupert") "R163")) 8 | (is (= (stem "rubin") "R150"))) 9 | 10 | (deftest ashcraft-ashcroft 11 | (is (= (stem "ashcraft") (stem "ashcroft") "A261"))) 12 | 13 | (deftest jumanji-fruittrees 14 | (is (= (stem "jumanji") "J552")) 15 | (is (= (stem "fruittress") "F636"))) 16 | -------------------------------------------------------------------------------- /test/stemmers/test/core.clj: -------------------------------------------------------------------------------- 1 | (ns stemmers.test.core 2 | (:use [stemmers.core] :reload) 3 | (:use [clojure.test])) 4 | 5 | (deftest basic-tokenise 6 | (is (= (tokenise "nothing ever happens") 7 | ["nothing" "ever" "happens"]))) 8 | 9 | (deftest default-stem 10 | (is (= (stems "the hungry dog hungrily jumped over the angry moon with a hunger, jumping with anger.") 11 | ["hungri" "dog" "hungrili" "jump" "over" "angri" "moon" "with" "hunger" "jump" "with" "anger"]))) 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # stemmers 2 | 3 | A collection of stemmers in Clojure, ready to plug into your own code. 4 | 5 | 6 | 7 | ## Usage 8 | 9 | All stemmers export a `stem` top-level function, which takes a single 10 | word and returns its stem. Additionally, `stemmers.core` provides 11 | basic plumbing with the `stems` function: 12 | 13 | (require '(stemmers core soundex porter)) 14 | ;; defaults to porter stemmer 15 | (stemmers.core/stems "a phrase for stemming") 16 | => ("phrase" "for" "stem") 17 | ;; other stemmers are easily used 18 | (stemmers.core/stems "a phrase for stemming" stemmers.soundex/stem) 19 | => ("P620" "F600" "S355") 20 | 21 | ## License 22 | 23 | Copyright (C) 2010 Matt Wilson 24 | 25 | Distributed under the Eclipse Public License, the same as Clojure. 26 | -------------------------------------------------------------------------------- /src/stemmers/soundex.clj: -------------------------------------------------------------------------------- 1 | ;; An implementation of the American Soundex algorithm, as described 2 | ;; at 3 | 4 | (ns stemmers.soundex 5 | (:require [clojure.string :as str])) 6 | 7 | ;; Consonant codings. `h`, `w` and the vowels are special cases, as we'll see later. 8 | (def char-map 9 | {\b 1, \f 1, \p 1, \v 1 10 | \c 2, \g 2, \j 2, \k 2, \q 2, \s 2, \x 2, \z 2 11 | \d 3, \t 3 12 | \l 4 13 | \m 5, \n 5 14 | \r 6}) 15 | 16 | (defn stem 17 | "Return the soundex code for a single word." 18 | [word] 19 | (let [index (first word)] 20 | (loop [w (rest word) 21 | code-seq [] 22 | collapse-next? false] 23 | (if (or (empty? w) 24 | (>= (count code-seq) 3)) 25 | (.toUpperCase (apply str index 26 | (take 3 (concat code-seq (repeat 0))))) 27 | (cond 28 | ;; if we have a `w` or `h`, we ignore it and allow collapsing 29 | (#{\w \h} (first w)) (recur (rest w) code-seq true) 30 | 31 | ;; if it's a vowel, we ignore it, but don't collapse any 32 | ;; following duplicate 33 | (#{\a \e \i \o \u} (first w)) (recur (rest w) code-seq false) 34 | 35 | ;; otherwise, we conj the next code on, respecting `collapse-next?` 36 | :else (let [next-code (char-map (first w))] 37 | (if (and collapse-next? (= next-code (last code-seq))) 38 | (recur (rest w) code-seq true) 39 | (recur (rest w) (conj code-seq next-code) true)))))))) 40 | -------------------------------------------------------------------------------- /src/stemmers/core.clj: -------------------------------------------------------------------------------- 1 | ;; ## Stemmers 2 | 3 | ;; A top-level interface to the stemmers, handling tokenising of 4 | ;; sentences and phrases, as well as removing extra-short and -long 5 | ;; words, and excluding common noisy words (see `*excluded-words*`). 6 | 7 | (ns stemmers.core 8 | (:require [clojure.string :as str] 9 | [stemmers.porter])) 10 | 11 | ;; For later (internal) use. 12 | (def default-stemmer ^{:private true} stemmers.porter/stem) 13 | 14 | ;; Set of specific words to exclude from stemming. 15 | (def ^:dynamic *excluded-words* 16 | #{"the" "and" "was" "are" "not" "you" "were" "that" "this" "did" 17 | "etc" "there" "they" "our" "their"}) 18 | 19 | ;; Ignore words shorter than this. 20 | (def ^:dynamic *min-word-length* 3) 21 | 22 | ;; Ignore words longer than this. 23 | (def ^:dynamic *max-word-length* 30) 24 | 25 | (defn excluded-word? 26 | "`true` if `word` matches our exclusion criteria." 27 | [word] 28 | (or (not (<= *min-word-length* (count word) *max-word-length*)) 29 | (*excluded-words* word))) 30 | 31 | (defn remove-excluded-words 32 | "Remove short and blacklisted words (see `*excluded-words*`)." 33 | [word-seq] 34 | (filter (complement excluded-word?) 35 | word-seq)) 36 | 37 | (defn expand-hyphenated-words 38 | "Expand any hyphenated words in seq into `[hyphenated-word hyphenated word]`." 39 | [word-seq] 40 | (mapcat (fn [^String w] 41 | (if (.contains w "-") (conj (seq (.split w "-")) w) [w])) 42 | word-seq)) 43 | 44 | ;; ## Top-level interface 45 | 46 | (defn tokenise 47 | "Tokenise a phrase, respecting exclusion criteria. e.g.: 48 | 49 | => (tokenise \"searching a set of words\") 50 | (\"searching\" \"set\" \"words\")" 51 | [^String txt] 52 | (-> (str/replace txt #"[^-\d\w]+" " ") 53 | (.toLowerCase) 54 | (str/split #"\s+") 55 | expand-hyphenated-words 56 | remove-excluded-words)) 57 | 58 | (defn stems 59 | "Stem all words in a phrase or sentence, with reference to `*max-` and `*min-word-length*` and `*excluded-words*`." 60 | ([phrase] (stems phrase default-stemmer)) 61 | ([phrase stemmer-func] 62 | (map stemmer-func (tokenise phrase)))) 63 | -------------------------------------------------------------------------------- /src/stemmers/porter.clj: -------------------------------------------------------------------------------- 1 | ;; An implementation of the Porter Stemmer algorithm, detailed at 2 | ;; 3 | 4 | (ns stemmers.porter 5 | (:require [clojure.string :as str])) 6 | 7 | (def stem 8 | ^String 9 | (let [c "[^aeiou]" 10 | v "[aeiouy]" 11 | cs (str c "[^aeiouy]*") 12 | vs (str v "[aeiou]*") 13 | mgr0 (re-pattern (str "^(" cs ")?" vs cs)) 14 | meq1 (re-pattern (str "^(" cs ")?" vs cs "(" vs ")?$")) 15 | mgr1 (re-pattern (str "^(" cs ")?" vs cs vs cs)) 16 | s_v (re-pattern (str "^(" cs ")?" vs)) 17 | 18 | step1a-re1 #"^(.+?)(ss|i)es$" 19 | step1a-re2 #"^(.+?)([^s])s$" 20 | 21 | step1a (fn [w] 22 | (cond 23 | (re-find step1a-re1 w) (str/replace w step1a-re1 "$1$2") 24 | (re-find step1a-re2 w) (str/replace w step1a-re2 "$1$2") 25 | :else w)) 26 | 27 | step1b-re1 #"^(.+?)eed$" 28 | step1b-re2 #"^(.+?)(ed|ing)$" 29 | step1b-stem1 #"(at|bl|iz)$" 30 | step1b-stem2 #"([^aeiouylsz])\1$" 31 | step1b-stem3 (re-pattern (str "^" cs v "[^aeiouwxy]$")) 32 | 33 | step1b (fn [w] 34 | (let [groups1 (re-find step1b-re1 w) 35 | groups2 (re-find step1b-re2 w)] 36 | (cond 37 | groups1 (if (re-find mgr0 (groups1 1)) 38 | (apply str (butlast w)) 39 | w) 40 | groups2 (let [stem (groups2 1)] 41 | (if (re-find s_v stem) 42 | (cond 43 | (re-find step1b-stem1 stem) (str stem "e") 44 | (re-find step1b-stem2 stem) (apply str (butlast stem)) 45 | (re-find step1b-stem3 stem) (str stem "e") 46 | :else stem) 47 | w)) 48 | :else w))) 49 | 50 | step1c-re1 #"^(.+?)y$" 51 | 52 | step1c (fn [w] 53 | (if-let [[_ stem & _] (re-find step1c-re1 w)] 54 | (if (re-find s_v stem) (str stem "i") w) 55 | w)) 56 | 57 | apply-suffix-map (fn [suffix-map ^String w] 58 | (if-let [[stem suffix] 59 | (first (for [key (keys suffix-map) 60 | :when (.endsWith w key)] 61 | [(.substring w 0 (- (count w) (count key))) key]))] 62 | (if (re-find mgr0 stem) 63 | (str stem (suffix-map suffix)) 64 | w) 65 | w)) 66 | 67 | step2-map {"ational" "ate" 68 | "tional" "tion" 69 | "enci" "ence" 70 | "anci" "ance" 71 | "izer" "ize" 72 | "bli" "ble" 73 | "alli" "al" 74 | "entli" "ent" 75 | "eli" "e" 76 | "ousli" "ous" 77 | "ization" "ize" 78 | "ation" "ate" 79 | "ator" "ate" 80 | "alism" "al" 81 | "iveness" "ive" 82 | "fulness" "ful" 83 | "ousness" "ous" 84 | "aliti" "al" 85 | "iviti" "ive" 86 | "biliti" "ble" 87 | "logi" "log"} 88 | 89 | step2 (partial apply-suffix-map step2-map) 90 | 91 | step3-map {"icate" "ic" 92 | "ative" "" 93 | "alize" "al" 94 | "iciti" "ic" 95 | "ical" "ic" 96 | "ful" "" 97 | "ness" ""} 98 | 99 | step3 (partial apply-suffix-map step3-map) 100 | 101 | step4-suffixes1 ["al" "ance" "ence" "er" "ic" "able" "ible" "ant" "ement" 102 | "ment" "ent" "ou" "ism" "ate" "iti" "ous" "ive" "ize"] 103 | step4-re1 #"^(.+?)(s|t)(ion)$" 104 | 105 | step4 (fn [^String w] 106 | (if-let [stem (first (for [suffix step4-suffixes1 107 | :when (.endsWith w suffix)] 108 | (.substring w 0 (- (count w) (count suffix)))))] 109 | (if (re-find mgr1 stem) stem w) 110 | (if-let [groups (re-find step4-re1 w)] 111 | (let [stem (str (groups 1) (groups 2))] 112 | (if (re-find mgr1 stem) stem w)) 113 | w))) 114 | 115 | step5-re1 #"^(.+?)e$" 116 | step5-re2 (re-pattern (str "^" cs v "[^aeiouwxy]$")) 117 | 118 | step5 (fn [w] 119 | (if-let [[_ stem & _] (re-find step5-re1 w)] 120 | (if (or (re-find mgr1 stem) 121 | (and (re-find meq1 stem) (not (re-find step5-re2 stem)))) 122 | stem 123 | w) 124 | w)) 125 | 126 | step6-re #"ll$" 127 | 128 | step6 (fn [w] 129 | (if (and (re-find step6-re w) (re-find mgr1 w)) 130 | (apply str (butlast w)) 131 | w)) 132 | 133 | step-y1 (fn [^String w] 134 | (let [firstch (.substring w 0 1) 135 | firstch-y? (= firstch "y")] 136 | [firstch-y? (if firstch-y? 137 | (str "Y" (.substring w 1)) 138 | w)])) 139 | step-y2 (fn [firstch-y? ^String w] 140 | (if firstch-y? 141 | (str (.toLowerCase ^String (.substring w 0 1)) 142 | (.substring w 1)) 143 | w))] 144 | 145 | (fn [word] 146 | (if (< (count word) 3) 147 | word 148 | (let [[starts-with-y? w] (step-y1 word)] 149 | (->> w 150 | step1a step1b step1c step2 step3 step4 step5 step6 151 | (step-y2 starts-with-y?))))))) 152 | -------------------------------------------------------------------------------- /docs/uberdoc.html: -------------------------------------------------------------------------------- 1 | 2 | stemmers -- Marginalia

stemmers

0.2.0


A collection of stemmers for use in search engines and the like.

2529 |

dependencies

org.clojure/clojure
1.2.0
org.clojure/clojure-contrib
1.2.0

dev dependencies

marginalia
0.3.2
swank-clojure
1.3.0-SNAPSHOT



(this space intentionally left blank)
 

Stemmers

2530 |

A top-level interface to the stemmers, handling tokenising of 2531 | sentences and phrases, as well as removing extra-short and -long 2532 | words, and excluding common noisy words (see *excluded-words*).

2533 |
2534 | 
2535 | (ns stemmers.core
2536 |   (:require [clojure.string :as str]
2537 |             [stemmers.porter]))
2538 |

For later (internal) use.

2539 |
(def default-stemmer ^{:private true} stemmers.porter/stem)
2540 |

Set of specific words to exclude from stemming.

2541 |
(def *excluded-words*
2542 |   #{"the" "and" "was" "are" "not" "you" "were" "that" "this" "did"
2543 |     "etc" "there" "they" "our" "their"})
2544 |

Ignore words shorter than this.

2545 |
(def *min-word-length* 3)
2546 |

Ignore words longer than this.

2547 |
(def *max-word-length* 30)

true if word matches our exclusion criteria.

2548 |
2549 | (defn excluded-word?
2550 |   [word]
2551 |   (or (not (<= *min-word-length* (count word) *max-word-length*))
2552 |       (*excluded-words* word)))

Remove short and blacklisted words (see *excluded-words*).

2553 |
2554 | (defn remove-excluded-words
2555 |   [word-seq]
2556 |   (filter (complement excluded-word?)
2557 |           word-seq))

Expand any hyphenated words in seq into [hyphenated-word hyphenated word].

2558 |
2559 | (defn expand-hyphenated-words
2560 |   [word-seq]
2561 |   (mapcat (fn [^String w]
2562 |             (if (.contains w "-") (conj (seq (.split w "-")) w) [w]))
2563 |           word-seq))

Top-level interface

2564 |

Tokenise a phrase, respecting exclusion criteria. e.g.:

2565 | 2566 |
=> (tokenise "searching a set of words")
2567 | ("searching" "set" "words")
2568 | 
2569 |
2570 | (defn tokenise
2571 |   [^String txt]
2572 |   (-> (str/replace txt #"[^-\d\w]+" " ")
2573 |       (.toLowerCase)
2574 |       (str/split #"\s+")
2575 |       expand-hyphenated-words
2576 |       remove-excluded-words))

Stem all words in a phrase or sentence, with reference to *max- and *min-word-length* and *excluded-words*.

2577 |
2578 | (defn stems
2579 |   ([phrase] (stems phrase default-stemmer))
2580 |   ([phrase stemmer-func]
2581 |      (map stemmer-func (tokenise phrase))))
 

An implementation of the Porter Stemmer algorithm, detailed at 2582 | http://tartarus.org/~martin/PorterStemmer/

2583 |
2584 | (ns stemmers.porter
2585 |   (:require [clojure.string :as str]))
2586 |
2587 | (def stem
2588 |   ^String
2589 |   (let [c "[^aeiou]"
2590 |         v "[aeiouy]"
2591 |         cs (str c "[^aeiouy]*")
2592 |         vs (str v "[aeiou]*")
2593 |         mgr0 (re-pattern (str "^(" cs ")?" vs cs))
2594 |         meq1 (re-pattern (str "^(" cs ")?" vs cs "(" vs ")?$"))
2595 |         mgr1 (re-pattern (str "^(" cs ")?" vs cs vs cs))
2596 |         s_v  (re-pattern (str "^(" cs ")?" vs))
2597 | 
2598 |         step1a-re1 #"^(.+?)(ss|i)es$"
2599 |         step1a-re2 #"^(.+?)([^s])s$"
2600 |         
2601 |         step1a (fn [w]
2602 |                  (cond
2603 |                   (re-find step1a-re1 w) (str/replace w step1a-re1 "$1$2")
2604 |                   (re-find step1a-re2 w) (str/replace w step1a-re2 "$1$2")
2605 |                   :else w))
2606 | 
2607 |         step1b-re1 #"^(.+?)eed$"
2608 |         step1b-re2 #"^(.+?)(ed|ing)$"
2609 |         step1b-stem1 #"(at|bl|iz)$"
2610 |         step1b-stem2 #"([^aeiouylsz])\1$"
2611 |         step1b-stem3 (re-pattern (str "^" cs v "[^aeiouwxy]$"))
2612 |         
2613 |         step1b (fn [w]
2614 |                  (let [groups1 (re-find step1b-re1 w)
2615 |                        groups2 (re-find step1b-re2 w)]
2616 |                    (cond
2617 |                     groups1 (if (re-find mgr0 (groups1 1))
2618 |                               (apply str (butlast w))
2619 |                               w)
2620 |                     groups2 (let [stem (groups2 1)]
2621 |                               (if (re-find s_v stem)
2622 |                                 (cond
2623 |                                  (re-find step1b-stem1 stem) (str stem "e")
2624 |                                  (re-find step1b-stem2 stem) (apply str (butlast stem))
2625 |                                  (re-find step1b-stem3 stem) (str stem "e")
2626 |                                  :else stem)
2627 |                                 w))
2628 |                     :else w)))
2629 | 
2630 |         step1c-re1 #"^(.+?)y$"
2631 |         
2632 |         step1c (fn [w]
2633 |                  (if-let [[_ stem & _] (re-find step1c-re1 w)]
2634 |                    (if (re-find s_v stem) (str stem "i") w)
2635 |                    w))
2636 | 
2637 |         apply-suffix-map (fn [suffix-map ^String w]
2638 |                            (if-let [[stem suffix]
2639 |                                     (first (for [key (keys suffix-map)
2640 |                                                  :when (.endsWith w key)]
2641 |                                              [(.substring w 0 (- (count w) (count key))) key]))]
2642 |                              (if (re-find mgr0 stem)
2643 |                                (str stem (suffix-map suffix))
2644 |                                w)
2645 |                              w))
2646 | 
2647 |         step2-map {"ational" "ate"
2648 |                    "tional"  "tion"
2649 |                    "enci"    "ence"
2650 |                    "anci"    "ance"
2651 |                    "izer"    "ize"
2652 |                    "bli"     "ble"
2653 |                    "alli"    "al"
2654 |                    "entli"   "ent"
2655 |                    "eli"     "e"
2656 |                    "ousli"   "ous"
2657 |                    "ization" "ize"
2658 |                    "ation"   "ate"
2659 |                    "ator"    "ate"
2660 |                    "alism"   "al"
2661 |                    "iveness" "ive"
2662 |                    "fulness" "ful"
2663 |                    "ousness" "ous"
2664 |                    "aliti"   "al"
2665 |                    "iviti"   "ive"
2666 |                    "biliti"  "ble"
2667 |                    "logi"    "log"}
2668 |         
2669 |         step2 (partial apply-suffix-map step2-map)
2670 | 
2671 |         step3-map {"icate" "ic"
2672 |                    "ative" ""
2673 |                    "alize" "al"
2674 |                    "iciti" "ic"
2675 |                    "ical"  "ic"
2676 |                    "ful"   ""
2677 |                    "ness"  ""}
2678 |         
2679 |         step3 (partial apply-suffix-map step3-map)
2680 | 
2681 |         step4-suffixes1 ["al" "ance" "ence" "er" "ic" "able" "ible" "ant" "ement"
2682 |                          "ment" "ent" "ou" "ism" "ate" "iti" "ous" "ive" "ize"]
2683 |         step4-re1 #"^(.+?)(s|t)(ion)$"
2684 |         
2685 |         step4 (fn [^String w]
2686 |                 (if-let [stem (first (for [suffix step4-suffixes1
2687 |                                            :when (.endsWith w suffix)]
2688 |                                        (.substring w 0 (- (count w) (count suffix)))))]
2689 |                   (if (re-find mgr1 stem) stem w)
2690 |                   (if-let [groups (re-find step4-re1 w)]
2691 |                     (let [stem (str (groups 1) (groups 2))]
2692 |                       (if (re-find mgr1 stem) stem w))
2693 |                     w)))
2694 | 
2695 |         step5-re1 #"^(.+?)e$"
2696 |         step5-re2 (re-pattern (str "^" cs v "[^aeiouwxy]$"))
2697 |         
2698 |         step5 (fn [w]
2699 |                 (if-let [[_ stem & _] (re-find step5-re1 w)]
2700 |                   (if (or (re-find mgr1 stem)
2701 |                           (and (re-find meq1 stem) (not (re-find step5-re2 stem))))
2702 |                     stem
2703 |                     w)
2704 |                   w))
2705 | 
2706 |         step6-re #"ll$"
2707 |         
2708 |         step6 (fn [w]
2709 |                 (if (and (re-find step6-re w) (re-find mgr1 w))
2710 |                   (apply str (butlast w))
2711 |                   w))
2712 | 
2713 |         step-y1 (fn [^String w]
2714 |                   (let [firstch (.substring w 0 1)
2715 |                         firstch-y? (= firstch "y")]
2716 |                     [firstch-y? (if firstch-y?
2717 |                                   (str "Y" (.substring w 1))
2718 |                                   w)]))
2719 |         step-y2 (fn [firstch-y? ^String w]
2720 |                   (if firstch-y?
2721 |                     (str (.toLowerCase ^String (.substring w 0 1))
2722 |                          (.substring w 1))
2723 |                     w))]
2724 | 
2725 |     (fn [word]
2726 |       (if (< (count word) 3)
2727 |         word
2728 |         (let [[starts-with-y? w] (step-y1 word)]
2729 |           (->> w
2730 |                step1a step1b step1c step2 step3 step4 step5 step6
2731 |                (step-y2 starts-with-y?)))))))
 

An implementation of the American Soundex algorithm, as described 2732 | at http://en.wikipedia.org/wiki/Soundex

2733 |
2734 | (ns stemmers.soundex
2735 |   (:require [clojure.string :as str]))
2736 |

Consonant codings. h, w and the vowels are special cases, as we'll see later.

2737 |
(def char-map
2738 |   {\b 1, \f 1, \p 1, \v 1
2739 |    \c 2, \g 2, \j 2, \k 2, \q 2, \s 2, \x 2, \z 2
2740 |    \d 3, \t 3
2741 |    \l 4
2742 |    \m 5, \n 5
2743 |    \r 6})

Return the soundex code for a single word.

2744 |
2745 | (defn stem
2746 |   [word]
2747 |   (let [index (first word)]
2748 |     (loop [w (rest word)
2749 |            code-seq []
2750 |            collapse-next? false]
2751 |       (if (or (empty? w)
2752 |               (>= (count code-seq) 3))
2753 |         (.toUpperCase (apply str index
2754 |                              (take 3 (concat code-seq (repeat 0)))))

if we have a w or h, we ignore it and allow collapsing

2755 |
        (cond
2756 |          (#{\w \h} (first w))          (recur (rest w) code-seq true)

if it's a vowel, we ignore it, but don't collapse any 2757 | following duplicate

2758 |
         
2759 |          (#{\a \e \i \o \u} (first w)) (recur (rest w) code-seq false)

otherwise, we conj the next code on, respecting collapse-next?

2760 |
         
2761 |          :else (let [next-code (char-map (first w))]
2762 |                  (if (and collapse-next? (= next-code (last code-seq)))
2763 |                    (recur (rest w) code-seq true)
2764 |                    (recur (rest w) (conj code-seq next-code) true))))))))
 
--------------------------------------------------------------------------------