├── .gitignore
├── LICENSE
├── README.md
├── newword-find
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── scala
│       │       └── cn
│       │           └── spark
│       │               └── nlp
│       │                   └── newwordfind
│       │                       ├── core
│       │                       │   ├── NewWordFind.scala
│       │                       │   ├── NewWordFindConfig.scala
│       │                       │   └── PrefixPartitioner.scala
│       │                       ├── newwordfind.scala
│       │                       ├── trie
│       │                       │   ├── Node.scala
│       │                       │   ├── NodeProcessor.scala
│       │                       │   ├── Trie.scala
│       │                       │   └── TrieErgodicProcessor.scala
│       │                       └── utils
│       │                           ├── NGram.scala
│       │                           └── WordCountUtils.scala
│       └── test
│           └── scala
│               └── cn
│                   └── spark
│                       └── nlp
│                           └── newwordfind
│                               ├── Test.scala
│                               ├── TestNewWordFind.scala
│                               └── TestTrie.scala
└── pom.xml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.iml
*.class
*.o
*.a
*.so
*.exe
*.log
*.prefs
*.classpath
*.metadata
*.project
*.bak
*.springBeans
.settings
build
debug
target

# Numerous always-ignore extensions
*.diff
*.err
*.orig
*.rej
*.swo
*.swp
*.zip
*.vi
~*
*.sass-cache
*.ruby-version

# OS or Editor folders
*.DS_Store
*._*
Thumbs.db
*.cache
*.tmproj
*.esproj
nbproject
*.sublime-project
*.sublime-workspace

# Komodo
*.komodoproject
*.komodotools

# Folders to ignore
.hg
.svn
.CVS
.idea

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution.
      You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE.
      You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# sparkNLP
Spark-based NLP applications

1. New word discovery

A word segmenter that relies on no dictionary at all.

Key building blocks: left/right neighbor information entropy, pointwise mutual information (PMI), and n-grams.

==> Handling of overlapping substrings will be considered in a follow-up.

Usage:

    import cn.spark.nlp.newwordfind._

    def main(args: Array[String]): Unit = {
      val conf = new SparkConf()
        .setAppName("new-words-find")
        .setMaster("local[3]")

      val pattern = "[\u4E00-\u9FA5]+".r
      val stopwords = "[你|我|他|她|它]+"

      val minLen = 2
      val maxLen = 6
      val minCount = 20
      val minInfoEnergy = 2.0
      val minPmi = 20.0
      val numPartitions = 6

      val newWordFindConfig = NewWordFindConfig(minLen, maxLen,
        minCount, minInfoEnergy, minPmi, numPartitions)

      val sc = new SparkContext(conf)

      val lines = sc.textFile("/Users/songyaheng/Downloads/西游记.txt")
        .flatMap(pattern.findAllIn(_).toSeq)
        .flatMap(_.split(stopwords))
        .newWord(sc, newWordFindConfig)
        // wepf: w = the word (_1); e = its information entropy, the smaller of
        // the left- and right-neighbor entropies (_2); p = pointwise mutual
        // information (_3); f = word frequency (_4)
        .map(wepf => (wepf._2 * wepf._3 * wepf._4, wepf._1))
        .sortByKey(false, 1)
        .foreach(println)

      sc.stop()
    }

The output looks like this:

    (18168.514250954064,袈裟)
    (17713.06665111492,芭蕉)
    (16798.82717604416,吩咐)
    (16142.526118040218,葫芦)
    (11702.377091478338,乾坤)
    (11628.283999511725,哪吒)
    (11457.444521822847,猢狲)
    (10285.647417662267,琉璃)
    (7756.876983908059,荆棘)
    (7340.1686818898015,包袱)
    (7250.92797731874,校尉)
    (6876.8133704999855,钵盂)
    (6141.238795255687,揭谛)
    (6097.826306360437,惫懒)
    (4567.736774433127,苍蝇)
    (4398.391082713808,弼马温)
    (4208.842378551715,抖擞)
    (3865.6764241340757,孽畜)
    (3806.4273334808236,驿丞)
    (3369.570454781614,夯货)
    (3209.808428480634,悚惧)
    (3104.343061153103,祭赛)
    (3051.358367810302,武艺)
    (2996.755537579268,丑陋)
    (2821.9446891721645,怠慢)
    (2789.486615314228,蟠桃)
    (2706.7702230076206,逍遥)
    (2661.6587009929654,伺候)
    (2428.8996887030903,输赢)
    (2318.4894066182214,纷纷)
    (2314.1992786437513,奶奶)
    (2309.8451555988668,妈妈)
    (2021.7681532826207,尘埃)
    (1840.4474331072206,森森)
    (1768.7517043782416,伽蓝)
    (1673.1962179067696,悄悄)
    (1453.9759495186454,踪迹)
    (1338.3997377699582,杨柳)

--------------------------------------------------------------------------------
/newword-find/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.spark.nlp</groupId>
    <artifactId>newword-find</artifactId>
    <version>1.0.0-SNAPSHOT</version>

    <properties>
        <spark.version>2.2.0</spark.version>
        <scala.version>2.11.8</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.scala-lang</groupId>
                    <artifactId>scala-reflect</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.7.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.20.1</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
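Before reading the implementation, it helps to see the two statistics the pipeline is built on: the left/right neighbor entropy (how freely a candidate word combines with its surroundings) and a PMI-style cohesion score. The sketch below is illustrative only and not part of the library; the object name `StatsSketch` and all the counts are toy values, while the entropy formula mirrors `WordCountUtils.energyCount` and the cohesion score mirrors `NewWordFind.rddPmi`.

    import scala.math.log

    object StatsSketch {
      // Shannon entropy (in bits) of a neighbor-count distribution,
      // as in WordCountUtils.energyCount.
      def entropy(neighborCounts: Map[String, Int]): Double = {
        val total = neighborCounts.values.sum.toDouble
        neighborCounts.values.map { c =>
          val p = c / total
          -p * log(p) / log(2)
        }.sum
      }

      def main(args: Array[String]): Unit = {
        // Toy neighbor counts for a single candidate word, invented for illustration.
        val left  = Map("件" -> 6, "领" -> 3, "的" -> 3)
        val right = Map("来" -> 5, "去" -> 4, "与" -> 3)
        // The library keeps the smaller of the two entropies: a word is only
        // "free" if it is unpredictable on both sides.
        val infoEnergy = math.min(entropy(left), entropy(right))

        // Cohesion score as used in NewWordFind.rddPmi:
        // count(ab) * N / (count(a) * count(b)), minimized over all split points
        // (N is the corpus-size term; the library uses the distinct-character count).
        val n = 100000.0              // toy corpus size
        val cAB = 12.0                // toy count of the whole candidate
        val cA = 15.0; val cB = 14.0  // toy counts of the two halves
        val pmi = cAB * n / (cA * cB)

        println(s"min neighbor entropy = $infoEnergy, pmi = $pmi")
      }
    }
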
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/core/NewWordFind.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.core

import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie, TrieErgodicProcessor}
import cn.spark.nlp.newwordfind.utils.{NGram, WordCountUtils}
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer

/**
 * Core pipeline of the new-word finder: n-gram counting, neighbor-entropy
 * filtering and PMI scoring.
 *
 * @author songyaheng on 2017/12/7
 * @version 1.0
 */
object NewWordFind {

  def newWordRDD(sc: SparkContext, rdd: RDD[_], newWordFindConfig: NewWordFindConfig): RDD[(String, Double, Double, Int)] = {
    val lineCount = rddLineCount(rdd)
    val trieRdd = trieRDD(rdd, newWordFindConfig)
    // Merge the per-partition tries into one global count trie: keep the
    // smaller entropy and add the frequencies when a word occurs in both.
    val trieCount = trieRdd.reduce(_.join(new NodeProcessor[(Double, Int), Trie[(Double, Int)], String] {
      override def process(e: String)(t: (Double, Int), trie: Trie[(Double, Int)]) = {
        trie.value(e) match {
          case Some(tt) => trie.updateValue(e, (math.min(t._1, tt._1), t._2 + tt._2))
          case None => trie.insert(e, t)
        }
      }
    })(_))
    val btrieCount = sc.broadcast(trieCount)
    trieRdd.mapPartitions(r => rddPmi(r, newWordFindConfig, lineCount, btrieCount))
  }

  /** Number of distinct characters in the corpus; used as the corpus-size term of the PMI score. */
  def rddLineCount(rdd: RDD[_]): Double = {
    rdd.flatMap(line => line.toString.split(""))
      .map(w => (w, 1))
      .reduceByKey(_ + _)
      .count().toDouble
  }

  /** One trie per partition, holding (min neighbor entropy, frequency) for every surviving n-gram. */
  def trieRDD(rdd: RDD[_], newWordFindConfig: NewWordFindConfig): RDD[Trie[(Double, Int)]] = {
    rdd.flatMap(l => NGram.nGram(l.toString, 1, newWordFindConfig.maxlen))
      .reduceByKey(WordCountUtils.count)
      .filter(wmf => wmf._2._3 >= newWordFindConfig.minCount)
      .map(WordCountUtils.energyCount)
      .partitionBy(new PrefixPartitioner(newWordFindConfig.numPartitions))
      .mapPartitions(WordCountUtils.trieRDD)
      .cache()
  }

  /** Score each candidate with the PMI minimized over all binary splits, then apply the thresholds. */
  def rddPmi(iterator: Iterator[Trie[(Double, Int)]], newWordFindConfig: NewWordFindConfig, lineCount: Double, btrieCount: Broadcast[Trie[(Double, Int)]]): Iterator[(String, Double, Double, Int)] = {
    val trie = iterator.next()
    val array = new ArrayBuffer[(String, Double, Double, Int)]()
    trie.ergodic(new TrieErgodicProcessor[String, (Double, Int)] {
      override def process(t: String, e: (Double, Int)): Unit = {
        if (t.length >= newWordFindConfig.minlen) {
          val pmi = (0 to t.length - 2).map(i => {
            val a = t.substring(0, i + 1)
            val b = t.substring(i + 1)
            val av = btrieCount.value.value(a) match {
              case Some(v) => v._2.toDouble
              case _ => newWordFindConfig.minCount.toDouble
            }
            val bv = btrieCount.value.value(b) match {
              case Some(v) => v._2.toDouble
              case _ => newWordFindConfig.minCount.toDouble
            }
            e._2 * lineCount * 1.0 / (av * bv)
          }).min
          if (pmi >= newWordFindConfig.minPmi && e._1 >= newWordFindConfig.minInfoEnergy) {
            array += ((t, e._1, pmi, e._2))
          }
        }
      }
    })
    array.toIterator
  }

}
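The `reduce` step in `newWordRDD` merges the per-partition tries into one global trie using `Trie.join`. The standalone sketch below replays exactly that merge rule on two hand-built tries; the object name `JoinSketch` and the sample words and values are invented, while `Trie` and `NodeProcessor` are the classes defined in this repository.

    import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie}

    object JoinSketch {
      def main(args: Array[String]): Unit = {
        // Two "per-partition" tries holding (min neighbor entropy, frequency)
        // per word; the values here are toy numbers.
        val a = new Trie[(Double, Int)]
        a.insert("蟠桃", (1.2, 30))
        val b = new Trie[(Double, Int)]
        b.insert("蟠桃", (0.9, 12))
        b.insert("琉璃", (1.5, 20))

        // Same merge rule as the reduce step in NewWordFind.newWordRDD:
        // keep the smaller entropy, add the frequencies.
        a.join(new NodeProcessor[(Double, Int), Trie[(Double, Int)], String] {
          override def process(e: String)(t: (Double, Int), trie: Trie[(Double, Int)]) =
            trie.value(e) match {
              case Some(tt) => trie.updateValue(e, (math.min(t._1, tt._1), t._2 + tt._2))
              case None => trie.insert(e, t)
            }
        })(b)

        println(a.value("蟠桃")) // Some((0.9,42))
        println(a.value("琉璃")) // Some((1.5,20))
      }
    }
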
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/core/NewWordFindConfig.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.core

/**
 *
 * @author songyaheng on 2017/12/7
 * @version 1.0
 */
case class NewWordFindConfig(
  minlen: Int,
  maxlen: Int,
  minCount: Int,
  minInfoEnergy: Double,
  minPmi: Double,
  numPartitions: Int
)

/** Default configuration values. */
object NewWordFindConfig {
  val minlen: Int = 2
  val maxlen: Int = 4
  val minCount: Int = 5
  val minInfoEnergy: Double = 1.0
  val minPmi: Double = 1.0
  val numPartitions: Int = 6
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/core/PrefixPartitioner.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.core

import org.apache.spark.Partitioner

/**
 * Partitions words by the hash of their first character, so every n-gram
 * sharing a prefix lands in the same partition (and hence the same trie).
 *
 * @author songyaheng on 2017/12/8
 * @version 1.0
 */
class PrefixPartitioner(numPartition: Int) extends Partitioner {
  override def numPartitions = numPartition

  override def getPartition(key: Any): Int = {
    val prefix = key.toString.substring(0, 1)
    val code = prefix.hashCode % numPartition
    if (code < 0) {
      code + numPartition
    } else {
      code
    }
  }

  override def equals(other: Any): Boolean = other match {
    case prefixPartitioner: PrefixPartitioner =>
      prefixPartitioner.numPartitions == numPartitions
    case _ =>
      false
  }

  override def hashCode(): Int = numPartitions

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/newwordfind.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp

import cn.spark.nlp.newwordfind.core.{NewWordFind, NewWordFindConfig}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

/**
 * Enriches RDD[String] with a `newWord` operation via an implicit conversion.
 *
 * @author songyaheng on 2017/12/7
 * @version 1.0
 */
package object newwordfind {

  implicit def sparkRDDFunctions(rdd: RDD[String]): SparkRDDFunctions[String] = new SparkRDDFunctions[String](rdd)

  class SparkRDDFunctions[T: ClassTag](rdd: RDD[T]) extends Serializable {
    def newWord(sc: SparkContext): RDD[(String, Double, Double, Int)] =
      NewWordFind.newWordRDD(sc, rdd, NewWordFindConfig(NewWordFindConfig.minlen,
        NewWordFindConfig.maxlen,
        NewWordFindConfig.minCount,
        NewWordFindConfig.minInfoEnergy,
        NewWordFindConfig.minPmi,
        NewWordFindConfig.numPartitions))

    def newWord(sc: SparkContext, newWordFindConfig: NewWordFindConfig): RDD[(String, Double, Double, Int)] =
      NewWordFind.newWordRDD(sc, rdd, newWordFindConfig)

  }

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/Node.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

import scala.collection.immutable.TreeMap

/**
 * A single trie node: one character, its children, and the value attached
 * to the word ending here (if any).
 *
 * @author songyaheng on 2017/11/27
 * @version 1.0
 */
class Node[T](char: Char) extends Serializable {
  var content: Char = char
  var isEnd: Boolean = false
  var childMap: Map[Char, Node[T]] = TreeMap[Char, Node[T]]()
  var t: T = _
  var depth: Int = 0
  var count: Int = 0

  def nextNode(char: Char): Option[Node[T]] = childMap.get(char)
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/NodeProcessor.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

/**
 * Callback deciding how to merge a new value into a trie when the word
 * being inserted already exists.
 *
 * @author songyaheng on 2017/11/28
 * @version 1.0
 */
trait NodeProcessor[T, R, E] extends Serializable {
  def process(e: E)(t: T, trie: Trie[T]): R
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/Trie.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

import scala.collection.mutable.ArrayBuffer

/**
 *
 * @author songyaheng on 2017/11/28
 * @version 1.0
 */
class Trie[T] extends Serializable {

  val root: Node[T] = new Node[T](' ')

  def insert(word: String, t: T): Trie[T] = {
    insert(word, t, null)
  }

  /**
   * Insert a word with its value; if the word already exists, the optional
   * processor decides how to merge the old and new values.
   * @param word
   * @param t
   * @param processor
   */
  def insert(word: String, t: T, processor: NodeProcessor[T, Trie[T], String]): Trie[T] = {
    this.synchronized {
      if (word.isEmpty) return this
      value(word) match {
        case Some(_) =>
          if (processor == null) {
            this
          } else {
            processor.process(word)(t, this)
          }
        case None =>
          var currentNode: Node[T] = root
          var deep: Int = 0
          word.trim.foreach(c => {
            deep = deep + 1
            currentNode.nextNode(c) match {
              case Some(nd) =>
                currentNode = nd
              case None =>
                currentNode.childMap += (c -> new Node[T](c))
                currentNode.count = currentNode.childMap.size
                currentNode.nextNode(c) match {
                  case Some(nd) =>
                    currentNode = nd
                    currentNode.depth = deep
                  case None =>
                    return this
                }
            }
          })
          currentNode.t = t
          currentNode.isEnd = true
          this
      }
    }
  }

  /**
   * Check whether the word is present in the trie.
   * @param word
   * @return
   */
  def exist(word: String): Boolean = {
    var currentNode = root
    word.trim.toCharArray.foreach(c => {
      currentNode.nextNode(c) match {
        case Some(nd) => currentNode = nd
        case None => return false
      }
    })
    currentNode.isEnd
  }

  /**
   * Get the value stored for the given word, if any.
   * @param word
   * @return
   */
  def value(word: String): Option[T] = {
    if (word.isEmpty) return None
    var currentNode = root
    word.toCharArray.foreach(c => {
      currentNode.nextNode(c) match {
        case Some(nd) => currentNode = nd
        case None => return None
      }
    })
    if (currentNode.isEnd) {
      Some(currentNode.t)
    } else {
      None
    }
  }

  def updateValue(word: String, t: T): Trie[T] = {
    if (word.isEmpty) return this
    this.synchronized {
      var currentNode = root
      word.toCharArray.foreach(c => {
        currentNode.nextNode(c) match {
          case Some(nd) =>
            currentNode = nd
          case None => return this
        }
      })
      if (currentNode.isEnd) currentNode.t = t
      this
    }
  }

  /**
   * Collect all words stored under the given prefix.
   * @param prefix
   * @return
   */
  def allWords(prefix: String): ArrayBuffer[String] = {
    val rs: ArrayBuffer[String] = ArrayBuffer[String]()
    if (prefix.isEmpty) {
      return rs
    }
    var node: Node[T] = root
    prefix.trim.toCharArray.foreach(c => {
      node.nextNode(c) match {
        case Some(nd) => node = nd
        case None => return rs
      }
    })
    fullWords(node, prefix, rs)
    rs
  }

  /**
   * Recursively collect complete words at and below a node.
   * @param node
   * @param prefix
   * @param arrayBuffer
   */
  private def fullWords(node: Node[T], prefix: String, arrayBuffer: ArrayBuffer[String]): Unit = {
    node.childMap.values.foreach(nd => {
      fullWords(nd, prefix + nd.content, arrayBuffer)
    })
    if (node.isEnd) arrayBuffer += prefix
  }

  /**
   * Merge another trie into this one; the processor decides how to combine
   * values for words present in both tries.
   * @param trie
   * @return
   */
  def join(processor: NodeProcessor[T, Trie[T], String])(trie: Trie[T]): Trie[T] = {
    this.synchronized {
      val node = trie.root
      if (node.count != 0) {
        interact(node, "", processor)
      }
      this
    }
  }

  private def interact(node: Node[T], prefix: String, processor: NodeProcessor[T, Trie[T], String]): Unit = {
    node.childMap.values.foreach(nd => interact(nd, prefix + nd.content, processor))
    if (node.isEnd) {
      this.value(prefix) match {
        case Some(_) => processor.process(prefix)(node.t, this)
        case None => this.insert(prefix, node.t)
      }
    }
  }

  /** Traverse every complete word in the trie, applying the processor. */
  def ergodic(trieErgodicProcessor: TrieErgodicProcessor[String, T]): Unit =
    this.synchronized {
      val node = this.root
      if (node.count != 0) {
        innerErgodic(node, "", trieErgodicProcessor)
      }
    }

  private def innerErgodic(node: Node[T], prefix: String, trieErgodicProcessor: TrieErgodicProcessor[String, T]): Unit = {
    node.childMap.values.foreach(nd => innerErgodic(nd, prefix + nd.content, trieErgodicProcessor))
    if (node.isEnd) {
      trieErgodicProcessor.process(prefix, node.t)
    }
  }

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/TrieErgodicProcessor.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

/**
 * Callback applied to every (word, value) pair during a trie traversal.
 *
 * @author songyaheng on 2017/12/8
 * @version 1.0
 */
trait TrieErgodicProcessor[T, E] extends Serializable {
  def process(t: T, e: E): Unit
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/utils/NGram.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.utils

/**
 * n-gram slicing utilities.
 *
 * @author songyaheng on 2017/11/27
 * @version 1.0
 */
object NGram extends Serializable {
  /**
   * Slice one sentence into n-grams of the given length, recording each
   * gram's immediate left and right neighbor characters; "$" marks the
   * sentence boundary.
   * @param s
   * @return
   */
  def newWordGram(s: String, len: Int): List[(String, (Map[String, Int], Map[String, Int], Int))] = {
    val sen = "$" + s + "$"
    (1 to s.length - len + 1).map(i => {
      val w = sen.substring(i, i + len)
      val lw = sen.substring(i - 1, i)
      val rw = sen.substring(i + len, i + len + 1)
      (w, (Map(lw -> 1), Map(rw -> 1), 1))
    }).toList
  }

  def nGram(s: String, minlen: Int, maxlen: Int): List[(String, (Map[String, Int], Map[String, Int], Int))] = {
    (minlen to maxlen).flatMap(i => newWordGram(s, i)).toList
  }

  def nGramByLen(s: String, len: Int, f: Double): List[(String, Double)] =
    (0 to s.length - len).map(i => {
      (s.substring(i, i + len), f)
    }).toList

  def nGramByWord(s: String, minlen: Int, maxlen: Int, f: Double): List[(String, Double)] =
    (minlen to maxlen).flatMap(i => nGramByLen(s, i, f)).toList

}
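To make the neighbor bookkeeping concrete, here is what `newWordGram` emits for a four-character string. The object name `NGramSketch` is invented for this sketch; the expected output in the comments follows directly from the code above.

    import cn.spark.nlp.newwordfind.utils.NGram

    object NGramSketch {
      def main(args: Array[String]): Unit = {
        // "$" is the sentence-boundary sentinel added inside newWordGram.
        NGram.newWordGram("中华人民", 2).foreach(println)
        // (中华,(Map($ -> 1),Map(人 -> 1),1))
        // (华人,(Map(中 -> 1),Map(民 -> 1),1))
        // (人民,(Map(华 -> 1),Map($ -> 1),1))
      }
    }
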
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/utils/WordCountUtils.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.utils

import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie}

/**
 *
 * @author songyaheng on 2017/12/4
 * @version 1.0
 */
object WordCountUtils extends Serializable {

  /**
   * Merge two partial observations of a word: left-neighbor counts,
   * right-neighbor counts, and frequency.
   * @param a
   * @param b
   * @return
   */
  def count(a: (Map[String, Int], Map[String, Int], Int), b: (Map[String, Int], Map[String, Int], Int)): (Map[String, Int], Map[String, Int], Int) = {
    var la = a._1
    b._1.foreach(kv => {
      if (la.contains(kv._1)) {
        val v = la(kv._1) + kv._2
        la += (kv._1 -> v)
      } else {
        la += kv
      }
    })
    var lr = a._2
    b._2.foreach(kv => {
      if (lr.contains(kv._1)) {
        val v = lr(kv._1) + kv._2
        lr += (kv._1 -> v)
      } else {
        lr += kv
      }
    })
    val wc = a._3 + b._3
    (la, lr, wc)
  }

  /**
   * Compute the left and right neighbor information entropy of a word and
   * keep the smaller of the two.
   * @param v
   * @return
   */
  def energyCount(v: (String, (Map[String, Int], Map[String, Int], Int))): (String, (Double, Int)) = {
    val lcount = v._2._1.values.sum
    val rcount = v._2._2.values.sum
    val le = v._2._1.values.map(c => {
      val p = c * 1.0 / lcount
      -1 * p * Math.log(p) / Math.log(2)
    }).sum
    val re = v._2._2.values.map(c => {
      val p = c * 1.0 / rcount
      -1 * p * Math.log(p) / Math.log(2)
    }).sum
    val e = Math.min(le, re)
    (v._1, (e, v._2._3))
  }

  /** Build one trie from a partition's words, merging duplicates on insert. */
  def trieRDD(words: Iterator[(String, (Double, Int))]): Iterator[Trie[(Double, Int)]] = {
    val trie = new Trie[(Double, Int)]
    while (words.hasNext) {
      val w = words.next()
      trie.insert(w._1, w._2, new NodeProcessor[(Double, Int), Trie[(Double, Int)], String] {
        override def process(e: String)(t: (Double, Int), trie: Trie[(Double, Int)]) = {
          trie.value(e) match {
            case Some(ef) => trie.updateValue(e, (Math.min(ef._1, t._1), ef._2 + t._2))
            case None => trie.insert(e, t)
          }
        }
      })
    }
    List(trie).iterator
  }

  def permutations(list: List[Int]): Set[List[Int]] = {
    list match {
      case Nil => Set(Nil)
      case head :: tail =>
        for (p0 <- permutations(tail); i <- 0 to p0.length; (xs, ys) = p0.splitAt(i)) yield xs ::: List(head) ::: ys
    }
  }

}
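A small worked example of the two utilities above; the object name `EntropySketch` and the sample neighbor maps are invented. Merging two partial observations with `count` sums the neighbor maps and frequencies; `energyCount` then scores the merged record. With two equally likely neighbors on each side, both entropies are exactly one bit, and the minimum is kept.

    import cn.spark.nlp.newwordfind.utils.WordCountUtils

    object EntropySketch {
      def main(args: Array[String]): Unit = {
        // Two partial observations of "人民", invented for illustration.
        val merged = WordCountUtils.count(
          (Map("华" -> 1), Map("共" -> 1), 1),
          (Map("为" -> 1), Map("起" -> 1), 1))
        println(merged) // (Map(华 -> 1, 为 -> 1),Map(共 -> 1, 起 -> 1),2)

        // Two equally likely neighbors per side -> 1 bit of entropy left and
        // right; energyCount keeps the minimum of the two.
        println(WordCountUtils.energyCount(("人民", merged))) // (人民,(1.0,2))
      }
    }
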
--------------------------------------------------------------------------------
/newword-find/src/test/scala/cn/spark/nlp/newwordfind/Test.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind

import cn.spark.nlp.newwordfind.utils.NGram

/**
 *
 * @author songyaheng on 2017/12/4
 * @version 1.0
 */
object Test {
  def main(args: Array[String]): Unit = {
    val s = "中华人民中华"
    NGram.nGramByWord(s, 1, 4, 1).foreach(println)
  }
}
--------------------------------------------------------------------------------
/newword-find/src/test/scala/cn/spark/nlp/newwordfind/TestNewWordFind.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind

import cn.spark.nlp.newwordfind.core.NewWordFindConfig
import org.apache.spark.{SparkConf, SparkContext}
import cn.spark.nlp.newwordfind._

/**
 *
 * @author songyaheng on 2017/12/9
 * @version 1.0
 */
object TestNewWordFind {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("new-words-find")
      .setMaster("local[3]")

    val pattern = "[\u4E00-\u9FA5]+".r
    val stopwords = "[你|我|他|她|它]+"

    val minLen = 2
    val maxLen = 6
    val minCount = 20
    val minInfoEnergy = 2.0
    val minPmi = 20.0
    val numPartitions = 6

    val newWordFindConfig = NewWordFindConfig(minLen, maxLen,
      minCount, minInfoEnergy, minPmi, numPartitions)

    val sc = new SparkContext(conf)

    val lines = sc.textFile("/Users/songyaheng/Downloads/西游记.txt")
      .flatMap(pattern.findAllIn(_).toSeq)
      .flatMap(_.split(stopwords))
      .newWord(sc, newWordFindConfig)
      // wepf: w = the word (_1); e = its information entropy, the smaller of
      // the left- and right-neighbor entropies (_2); p = pointwise mutual
      // information (_3); f = word frequency (_4)
      .map(wepf => (wepf._2 * wepf._3 * wepf._4, wepf._1))
      .sortByKey(false, 1)
      .foreach(println)

    sc.stop()
  }
}
--------------------------------------------------------------------------------
/newword-find/src/test/scala/cn/spark/nlp/newwordfind/TestTrie.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind

import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie}

/**
 *
 * @author songyaheng on 2017/11/30
 * @version 1.0
 */
object TestTrie {
  def main(args: Array[String]): Unit = {
    val trie: Trie[Double] = new Trie[Double]()
    trie.insert("中国", 1.0)
    trie.insert("中国人", 2.0)
    trie.insert("中华人民", 3.0)
    trie.updateValue("中国人", 9.0)

    trie.insert("中国", 1.0, new NodeProcessor[Double, Trie[Double], String] {
      override def process(e: String)(t: Double, trie: Trie[Double]) = {
        trie.value(e) match {
          case Some(tt) => trie.updateValue(e, tt + t)
          case None => trie.insert(e, t)
        }
      }
    })

    println(trie.exist("中国"))
    println(trie.value("中国"))
    println(trie.allWords("中国"))

    val trie2 = new Trie[Double]()
    trie2.insert("中华小当家", 0.5)
    trie2.insert("中华人民共和国", 0.6)
    trie2.insert("中国", 1.0)
    trie.join(new NodeProcessor[Double, Trie[Double], String] {
      override def process(e: String)(t: Double, trie: Trie[Double]) = {
        trie.value(e) match {
          case Some(tt) => trie.updateValue(e, tt + t)
          case None => trie.insert(e, t)
        }
      }
    })(trie2)

    println(trie.value("中国"))

    println(trie.allWords("中华"))

  }
}
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.spark.nlp</groupId>
    <artifactId>sparkNLP</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <packaging>pom</packaging>

    <properties>
        <spark.version>2.2.0</spark.version>
        <scala.version>2.11.8</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.7.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.20.1</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <modules>
        <module>newword-find</module>
    </modules>
</project>
--------------------------------------------------------------------------------