├── .gitignore
├── LICENSE
├── README.md
├── newword-find
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── scala
│       │       └── cn
│       │           └── spark
│       │               └── nlp
│       │                   └── newwordfind
│       │                       ├── core
│       │                       │   ├── NewWordFind.scala
│       │                       │   ├── NewWordFindConfig.scala
│       │                       │   └── PrefixPartitioner.scala
│       │                       ├── newwordfind.scala
│       │                       ├── trie
│       │                       │   ├── Node.scala
│       │                       │   ├── NodeProcessor.scala
│       │                       │   ├── Trie.scala
│       │                       │   └── TrieErgodicProcessor.scala
│       │                       └── utils
│       │                           ├── NGram.scala
│       │                           └── WordCountUtils.scala
│       └── test
│           └── scala
│               └── cn
│                   └── spark
│                       └── nlp
│                           └── newwordfind
│                               ├── Test.scala
│                               ├── TestNewWordFind.scala
│                               └── TestTrie.scala
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
*.iml
*.class
*.o
*.a
*.so
*.exe
*.log
*.prefs
*.classpath
*.metadata
*.project
*.bak
*.springBeans
.settings
build
debug
target

# Numerous always-ignore extensions
*.diff
*.err
*.orig
*.rej
*.swo
*.swp
*.zip
*.vi
~*
*.sass-cache
*.ruby-version

# OS or Editor folders
*.DS_Store
*._*
Thumbs.db
*.cache
*.tmproj
*.esproj
nbproject
*.sublime-project
*.sublime-workspace

# Komodo
*.komodoproject
*.komodotools

# Folders to ignore
.hg
.svn
.CVS
.idea
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# sparkNLP

Spark-based NLP applications

1. New word discovery

A word segmenter that relies on no dictionary at all.

Core techniques: left/right neighbor information entropy, pointwise mutual information (PMI), and n-grams.

==> Handling of overlapping substrings is planned as a follow-up.
12 | 调用方式:
13 |
14 | import cn.spark.nlp.newwordfind._
15 |
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf()
18 | .setAppName("new-words-find")
19 | .setMaster("local[3]")
20 |
21 | val pattern = "[\u4E00-\u9FA5]+".r
22 | val stopwords = "[你|我|他|她|它]+"
23 |
24 | val minLen = 2
25 | val maxLen = 6
26 | val minCount = 20
27 | val minInfoEnergy = 2.0
28 | val minPim = 20.0
29 | val numPartition = 6
30 |
31 | val newWordFindConfig = NewWordFindConfig(minLen, maxLen,
32 | minCount, minInfoEnergy, minPim, numPartition)
33 |
34 | val sc = new SparkContext(conf)
35 |
36 | val lines =sc.textFile("/Users/songyaheng/Downloads/西游记.txt")
37 | .flatMap(pattern.findAllIn(_).toSeq)
38 | .flatMap(_.split(stopwords))
39 | .newWord(sc, newWordFindConfig)
40 | //wepf: w:代表词(_1) e: 代表该词的信息熵(左邻右邻信息熵中最小的)(_2) p: 代表点间互信息(_3) f: 代表词频(_4)
41 | .map(wepf => (wepf._2 * wepf._3 * wepf._4, wepf._1))
42 | .sortByKey(false, 1)
43 | .foreach(println)
44 |
45 | sc.stop()
46 | }
47 |
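The sort key simply multiplies entropy, PMI, and frequency, so a string rises to the top of the output below only when it scores well on all three signals.
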
Sample results:

    (18168.514250954064,袈裟)
    (17713.06665111492,芭蕉)
    (16798.82717604416,吩咐)
    (16142.526118040218,葫芦)
    (11702.377091478338,乾坤)
    (11628.283999511725,哪吒)
    (11457.444521822847,猢狲)
    (10285.647417662267,琉璃)
    (7756.876983908059,荆棘)
    (7340.1686818898015,包袱)
    (7250.92797731874,校尉)
    (6876.8133704999855,钵盂)
    (6141.238795255687,揭谛)
    (6097.826306360437,惫懒)
    (4567.736774433127,苍蝇)
    (4398.391082713808,弼马温)
    (4208.842378551715,抖擞)
    (3865.6764241340757,孽畜)
    (3806.4273334808236,驿丞)
    (3369.570454781614,夯货)
    (3209.808428480634,悚惧)
    (3104.343061153103,祭赛)
    (3051.358367810302,武艺)
    (2996.755537579268,丑陋)
    (2821.9446891721645,怠慢)
    (2789.486615314228,蟠桃)
    (2706.7702230076206,逍遥)
    (2661.6587009929654,伺候)
    (2428.8996887030903,输赢)
    (2318.4894066182214,纷纷)
    (2314.1992786437513,奶奶)
    (2309.8451555988668,妈妈)
    (2021.7681532826207,尘埃)
    (1840.4474331072206,森森)
    (1768.7517043782416,伽蓝)
    (1673.1962179067696,悄悄)
    (1453.9759495186454,踪迹)
    (1338.3997377699582,杨柳)
--------------------------------------------------------------------------------
/newword-find/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.spark.nlp</groupId>
    <artifactId>newword-find</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <properties>
        <spark.version>2.2.0</spark.version>
        <scala.version>2.11.8</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.scala-lang</groupId>
                    <artifactId>scala-reflect</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.7.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.20.1</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/core/NewWordFind.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.core

import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie, TrieErgodicProcessor}
import cn.spark.nlp.newwordfind.utils.{NGram, WordCountUtils}
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer

/**
 * Core pipeline: builds one trie of n-gram statistics per partition, merges
 * them into a global trie for substring lookups, and scores every candidate.
 *
 * @author songyaheng on 2017/12/7
 * @version 1.0
 */
object NewWordFind {

  def newWordRDD(sc: SparkContext, rdd: RDD[_], newWordFindConfig: NewWordFindConfig): RDD[(String, Double, Double, Int)] = {
    val lineCount = rddLineCount(rdd)
    val trieRdd = trieRDD(rdd, newWordFindConfig)
    // Merge the per-partition tries into one global trie: on a collision,
    // keep the smaller entropy and add up the frequencies.
    val trieCount = trieRdd.reduce(_.join(new NodeProcessor[(Double, Int), Trie[(Double, Int)], String] {
      override def process(e: String)(t: (Double, Int), trie: Trie[(Double, Int)]) = {
        trie.value(e) match {
          case Some(tt) => trie.updateValue(e, (math.min(t._1, tt._1), t._2 + tt._2))
          case None => trie.insert(e, t)
        }
      }
    })(_))
    // Broadcast the merged trie so every partition can look up the counts of
    // arbitrary substrings when computing PMI.
    val btrieCount = sc.broadcast(trieCount)
    trieRdd.mapPartitions(r => rddPmi(r, newWordFindConfig, lineCount, btrieCount))
  }

  /** Number of distinct characters in the corpus; used as the normalizer N in the PMI-style score. */
  def rddLineCount(rdd: RDD[_]): Double = {
    rdd.flatMap(line => line.toString.split(""))
      .map(w => (w, 1))
      .reduceByKey(_ + _)
      .count().toDouble
  }

  /** One trie per partition, holding (entropy, frequency) for every surviving n-gram. */
  def trieRDD(rdd: RDD[_], newWordFindConfig: NewWordFindConfig): RDD[Trie[(Double, Int)]] = {
    rdd.flatMap(l => NGram.nGram(l.toString, 1, newWordFindConfig.maxlen))
      .reduceByKey(WordCountUtils.count)
      .filter(wmf => wmf._2._3 >= newWordFindConfig.minCount)
      .map(WordCountUtils.energyCount)
      .partitionBy(new PrefixPartitioner(newWordFindConfig.numPartitions))
      .mapPartitions(WordCountUtils.trieRDD)
      .cache()
  }

  def rddPmi(iterator: Iterator[Trie[(Double, Int)]], newWordFindConfig: NewWordFindConfig, lineCount: Double, btrieCount: Broadcast[Trie[(Double, Int)]]): Iterator[(String, Double, Double, Int)] = {
    // trieRDD emits exactly one trie per partition; guard against an empty partition.
    if (!iterator.hasNext) return Iterator.empty
    val trie = iterator.next()
    val array = new ArrayBuffer[(String, Double, Double, Int)]()
    trie.ergodic(new TrieErgodicProcessor[String, (Double, Int)] {
      override def process(t: String, e: (Double, Int)): Unit = {
        if (t.length >= newWordFindConfig.minlen) {
          // Cohesion: count(t) * N / (count(a) * count(b)), minimized over
          // every split of t into a prefix a and a suffix b; parts missing
          // from the global trie fall back to minCount.
          val pmi = (0 to t.length - 2).map(i => {
            val a = t.substring(0, i + 1)
            val b = t.substring(i + 1)
            val av = btrieCount.value.value(a) match {
              case Some(v) => v._2.toDouble
              case _ => newWordFindConfig.minCount.toDouble
            }
            val bv = btrieCount.value.value(b) match {
              case Some(v) => v._2.toDouble
              case _ => newWordFindConfig.minCount.toDouble
            }
            e._2 * lineCount * 1.0 / (av * bv)
          }).min
          if (pmi >= newWordFindConfig.minPmi && e._1 >= newWordFindConfig.minInfoEnergy) {
            array += ((t, e._1, pmi, e._2))
          }
        }
      }
    })
    array.toIterator
  }
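
  // Split example (illustrative): for t = "中国人" the loop in rddPmi above
  // evaluates the pairs ("中", "国人") and ("中国", "人") and keeps the
  // smallest ratio, so a candidate scores high only when every way of
  // cutting it apart is much rarer than the whole string.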

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/core/NewWordFindConfig.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.core

/**
 * Thresholds for the new-word finder.
 *
 * @param minlen        minimum candidate length, in characters
 * @param maxlen        maximum candidate length
 * @param minCount      minimum frequency for a candidate to be kept
 * @param minInfoEnergy minimum neighbor information entropy
 * @param minPmi        minimum pointwise mutual information
 * @param numPartitions number of partitions for the prefix partitioner
 *
 * @author songyaheng on 2017/12/7
 * @version 1.0
 */
case class NewWordFindConfig(
  minlen: Int,
  maxlen: Int,
  minCount: Int,
  minInfoEnergy: Double,
  minPmi: Double,
  numPartitions: Int
)

/** Default values, used by the no-argument newWord call. */
object NewWordFindConfig {
  val minlen: Int = 2
  val maxlen: Int = 4
  val minCount: Int = 5
  val minInfoEnergy: Double = 1.0
  val minPmi: Double = 1.0
  val numPartitions: Int = 6
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/core/PrefixPartitioner.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.core

import org.apache.spark.Partitioner

/**
 * Partitions keys by the hash of their first character, so all n-grams that
 * share a first character land in the same partition and therefore in the
 * same per-partition trie.
 *
 * @author songyaheng on 2017/12/8
 * @version 1.0
 */
class PrefixPartitioner(numPartition: Int) extends Partitioner {
  override def numPartitions = numPartition

  override def getPartition(key: Any): Int = {
    val prefix = key.toString.substring(0, 1)
    // hashCode may be negative; shift such values into [0, numPartition).
    val code = prefix.hashCode % numPartition
    if (code < 0) {
      code + numPartition
    } else {
      code
    }
  }
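
  // For example, "中国" and "中华人民" share the first character "中" and are
  // routed to the same partition, which is what lets each partition build a
  // self-contained prefix trie.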

  override def equals(other: Any): Boolean = other match {
    case prefixPartitioner: PrefixPartitioner =>
      prefixPartitioner.numPartitions == numPartitions
    case _ =>
      false
  }

  override def hashCode(): Int = numPartitions

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/newwordfind.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp

import cn.spark.nlp.newwordfind.core.{NewWordFind, NewWordFindConfig}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

/**
 * Package object that enriches RDD[String] with a newWord method.
 *
 * @author songyaheng on 2017/12/7
 * @version 1.0
 */
package object newwordfind {

  implicit def sparkRDDFunctions(rdd: RDD[String]): SparkRDDFunctions[String] = new SparkRDDFunctions[String](rdd)
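
  // With the implicit conversion above in scope, any RDD[String] gains
  // newWord, e.g. (the path here is a placeholder):
  //   sc.textFile("corpus.txt").newWord(sc, config)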

  class SparkRDDFunctions[T: ClassTag](rdd: RDD[T]) extends Serializable {
    /** Runs the finder with the defaults from the NewWordFindConfig companion object. */
    def newWord(sc: SparkContext): RDD[(String, Double, Double, Int)] =
      NewWordFind.newWordRDD(sc, rdd, NewWordFindConfig(NewWordFindConfig.minlen,
        NewWordFindConfig.maxlen,
        NewWordFindConfig.minCount,
        NewWordFindConfig.minInfoEnergy,
        NewWordFindConfig.minPmi,
        NewWordFindConfig.numPartitions))

    /** Runs the finder with an explicit configuration. */
    def newWord(sc: SparkContext, newWordFindConfig: NewWordFindConfig): RDD[(String, Double, Double, Int)] =
      NewWordFind.newWordRDD(sc, rdd, newWordFindConfig)

  }

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/Node.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

import scala.collection.immutable.TreeMap

/**
 * A single trie node: one character, its children, and the payload t carried
 * by nodes that terminate a word.
 *
 * @author songyaheng on 2017/11/27
 * @version 1.0
 */
class Node[T](char: Char) extends Serializable {
  var content: Char = char
  var isEnd: Boolean = false
  var childMap: Map[Char, Node[T]] = TreeMap[Char, Node[T]]()
  var t: T = _
  var depth: Int = 0
  var count: Int = 0

  /** The child node for the given character, if any. */
  def nextNode(char: Char): Option[Node[T]] = {
    if (childMap.nonEmpty) {
      childMap.get(char)
    } else {
      None
    }
  }
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/NodeProcessor.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

/**
 * Callback invoked when an insert or join hits a word that already exists in
 * the trie; implementations decide how to combine the old and new payloads.
 *
 * @author songyaheng on 2017/11/28
 * @version 1.0
 */
trait NodeProcessor[T, R, E] extends Serializable {
  def process(e: E)(t: T, trie: Trie[T]): R
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/Trie.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

import scala.collection.mutable.ArrayBuffer

/**
 * A serializable prefix trie whose terminal nodes carry a payload of type T.
 *
 * @author songyaheng on 2017/11/28
 * @version 1.0
 */
class Trie[T] extends Serializable {

  val root: Node[T] = new Node[T](' ')

  def insert(word: String, t: T): Trie[T] = {
    insert(word, t, null)
  }

  /**
   * Insert a word with its payload. If the word already exists, the processor
   * (when given) decides how to combine the old and new payloads.
   */
  def insert(word: String, t: T, processor: NodeProcessor[T, Trie[T], String]): Trie[T] = {
    this.synchronized {
      if (word.isEmpty) return this
      value(word) match {
        case Some(tt) =>
          if (processor == null) {
            this
          } else {
            processor.process(word)(t, this)
          }
        case None =>
          var currentNode: Node[T] = root
          var deep: Int = 0
          word.trim.foreach(c => {
            deep = deep + 1
            currentNode.nextNode(c) match {
              case Some(nd) =>
                currentNode = nd
              case None =>
                currentNode.childMap += (c -> new Node[T](c))
                currentNode.count = currentNode.childMap.size
                currentNode.nextNode(c) match {
                  case Some(nd) =>
                    currentNode = nd
                    currentNode.depth = deep
                  case None =>
                    return this
                }
            }
          })
          currentNode.t = t
          currentNode.isEnd = true
          this
      }
    }
  }

  /**
   * Whether the word is stored in the trie as a complete word.
   */
  def exist(word: String): Boolean = {
    var currentNode = root
    word.trim.toCharArray.foreach(c => {
      currentNode.nextNode(c) match {
        case Some(nd) => currentNode = nd
        case None => return false
      }
    })
    currentNode.isEnd
  }

  /**
   * The payload stored for the given word, if it is a complete word.
   */
  def value(word: String): Option[T] = {
    if (word.isEmpty) return None
    var currentNode = root
    word.toCharArray.foreach(c => {
      currentNode.nextNode(c) match {
        case Some(nd) => currentNode = nd
        case None => return None
      }
    })
    if (currentNode.isEnd) {
      Some(currentNode.t)
    } else {
      None
    }
  }

  /** Replace the payload of an existing word; a no-op if the word is absent. */
  def updateValue(word: String, t: T): Trie[T] = {
    if (word.isEmpty) return this
    this.synchronized {
      var currentNode = root
      word.toCharArray.foreach(c => {
        currentNode.nextNode(c) match {
          case Some(nd) =>
            currentNode = nd
          case None => return this
        }
      })
      if (currentNode.isEnd) currentNode.t = t
      this
    }
  }

  /**
   * All complete words that start with the given prefix.
   */
  def allWords(prefix: String): ArrayBuffer[String] = {
    val rs: ArrayBuffer[String] = ArrayBuffer[String]()
    if (prefix.isEmpty) {
      return rs
    }
    var node: Node[T] = root
    prefix.trim.toCharArray.foreach(c => {
      if (node.count == 0) {
        return rs
      } else {
        node.nextNode(c) match {
          case Some(nd) => node = nd
          case None => return rs
        }
      }
    })
    if (node.count != 0) {
      fullWords(node, prefix, rs)
    }
    rs
  }

  /**
   * Recursively collect the complete words below a node.
   */
  private def fullWords(node: Node[T], prefix: String, arrayBuffer: ArrayBuffer[String]): Unit = {
    node.childMap.values.foreach(nd => {
      fullWords(nd, prefix + nd.content, arrayBuffer)
    })
    if (node.isEnd) arrayBuffer += prefix
  }

  /**
   * Join another trie into this one; for words present in both, the processor
   * decides how to combine the payloads.
   */
  def join(processor: NodeProcessor[T, Trie[T], String])(trie: Trie[T]): Trie[T] = {
    this.synchronized {
      val node = trie.root
      if (node.count != 0) {
        interact(node, "", processor)
      }
      this
    }
  }

  private def interact(node: Node[T], prefix: String, processor: NodeProcessor[T, Trie[T], String]): Unit = {
    node.childMap.values.foreach(nd => interact(nd, prefix + nd.content, processor))
    if (node.isEnd) {
      this.value(prefix) match {
        case Some(t) => processor.process(prefix)(node.t, this)
        case None => this.insert(prefix, node.t)
      }
    }
  }

  /** Visit every complete word in the trie, handing (word, payload) to the processor. */
  def ergodic(trieErgodicProcessor: TrieErgodicProcessor[String, T]): Unit =
    this.synchronized {
      val node = this.root
      if (node.count != 0) {
        innerErgodic(node, "", trieErgodicProcessor)
      }
    }

  private def innerErgodic(node: Node[T], prefix: String, trieErgodicProcessor: TrieErgodicProcessor[String, T]): Unit = {
    node.childMap.values.foreach(nd => innerErgodic(nd, prefix + nd.content, trieErgodicProcessor))
    if (node.isEnd) {
      trieErgodicProcessor.process(prefix, node.t)
    }
  }

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/TrieErgodicProcessor.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

/**
 * Callback applied to every complete (word, payload) pair during a full
 * traversal of a trie.
 *
 * @author songyaheng on 2017/12/8
 * @version 1.0
 */
trait TrieErgodicProcessor[T, E] extends Serializable {
  def process(t: T, e: E): Unit
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/utils/NGram.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.utils

/**
 * N-gram segmentation helpers.
 *
 * @author songyaheng on 2017/11/27
 * @version 1.0
 */
object NGram extends Serializable {
  /**
   * Cut one sentence into n-grams of a fixed length, recording for each gram
   * the character immediately to its left and right ("$" marks the sentence
   * boundary).
   */
  def newWordGram(s: String, len: Int): List[(String, (Map[String, Int], Map[String, Int], Int))] = {
    val sen = "$" + s + "$"
    (1 to s.length - len + 1).map(i => {
      val w = sen.substring(i, i + len)
      val lw = sen.substring(i - 1, i)
      val rw = sen.substring(i + len).substring(0, 1)
      (w, (Map(lw -> 1), Map(rw -> 1), 1))
    }).toList
  }
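
  // For example (illustrative input): newWordGram("abc", 2) yields
  //   ("ab", (Map("$" -> 1), Map("c" -> 1), 1))
  //   ("bc", (Map("a" -> 1), Map("$" -> 1), 1))
  // where "$" is the sentence boundary added above.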

  def nGram(s: String, minlen: Int, maxlen: Int): List[(String, (Map[String, Int], Map[String, Int], Int))] = {
    (minlen to maxlen).flatMap(i => newWordGram(s, i)).toList
  }

  def nGramByLen(s: String, len: Int, f: Double): List[(String, Double)] =
    (0 to s.length - len).map(i => {
      (s.substring(i, i + len), f)
    }).toList

  def nGramByWord(s: String, minlen: Int, maxlen: Int, f: Double): List[(String, Double)] =
    (minlen to maxlen).flatMap(i => nGramByLen(s, i, f)).toList

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/utils/WordCountUtils.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.utils

import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie}

/**
 * Aggregation helpers for neighbor counts, entropy, and per-partition tries.
 *
 * @author songyaheng on 2017/12/4
 * @version 1.0
 */
object WordCountUtils extends Serializable {

  /**
   * Merge the left-neighbor map, right-neighbor map, and frequency of two
   * occurrences of the same word.
   */
  def count(a: (Map[String, Int], Map[String, Int], Int), b: (Map[String, Int], Map[String, Int], Int)): (Map[String, Int], Map[String, Int], Int) = {
    var la = a._1
    b._1.foreach(kv => {
      if (la.contains(kv._1)) {
        val v = la(kv._1) + kv._2
        la += (kv._1 -> v)
      } else {
        la += kv
      }
    })
    var lr = a._2
    b._2.foreach(kv => {
      if (lr.contains(kv._1)) {
        val v = lr(kv._1) + kv._2
        lr += (kv._1 -> v)
      } else {
        lr += kv
      }
    })
    val wc = a._3 + b._3
    (la, lr, wc)
  }

  /**
   * Neighbor information entropy: H = -sum(p * log2 p) over each neighbor
   * distribution; keeps the smaller of the left and right entropies together
   * with the word frequency.
   */
  def energyCount(v: (String, (Map[String, Int], Map[String, Int], Int))): (String, (Double, Int)) = {
    val lcount = v._2._1.values.sum
    val rcount = v._2._2.values.sum
    val le = v._2._1.values.map(c => {
      val p = c * 1.0 / lcount
      -1 * p * Math.log(p) / Math.log(2)
    }).sum
    val re = v._2._2.values.map(c => {
      val p = c * 1.0 / rcount
      -1 * p * Math.log(p) / Math.log(2)
    }).sum
    val e = Math.min(le, re)
    (v._1, (e, v._2._3))
  }
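
  // Worked example (illustrative): for ("ab", (Map("x" -> 1, "y" -> 1), Map("z" -> 2), 2))
  // the left entropy is -2 * (0.5 * log2 0.5) = 1.0 and the right entropy is 0.0
  // (a single neighbor), so energyCount returns ("ab", (0.0, 2)).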

  /** Build one trie per partition from (word, (entropy, frequency)) pairs. */
  def trieRDD(words: Iterator[(String, (Double, Int))]): Iterator[Trie[(Double, Int)]] = {
    val trie = new Trie[(Double, Int)]
    while (words.hasNext) {
      val w = words.next()
      trie.insert(w._1, w._2, new NodeProcessor[(Double, Int), Trie[(Double, Int)], String] {
        override def process(e: String)(t: (Double, Int), trie: Trie[(Double, Int)]) = {
          trie.value(e) match {
            case Some(ef) => trie.updateValue(e, (Math.min(ef._1, t._1), ef._2 + t._2))
            case None => trie.insert(e, t)
          }
        }
      })
    }
    List(trie).iterator
  }

  def permutations(list: List[Int]): Set[List[Int]] = {
    list match {
      case Nil => Set(Nil)
      case head :: tail =>
        for (p0 <- permutations(tail); i <- 0 to p0.length; (xs, ys) = p0.splitAt(i)) yield xs ::: List(head) ::: ys
    }
  }

}
--------------------------------------------------------------------------------
/newword-find/src/test/scala/cn/spark/nlp/newwordfind/Test.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind

import cn.spark.nlp.newwordfind.utils.NGram

/**
 * Ad-hoc check of the n-gram cutter.
 *
 * @author songyaheng on 2017/12/4
 * @version 1.0
 */
object Test {
  def main(args: Array[String]): Unit = {
    val s = "中华人民中华"
    NGram.nGramByWord(s, 1, 4, 1).foreach(println)
  }
}
--------------------------------------------------------------------------------
/newword-find/src/test/scala/cn/spark/nlp/newwordfind/TestNewWordFind.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind

import cn.spark.nlp.newwordfind.core.NewWordFindConfig
import org.apache.spark.{SparkConf, SparkContext}

/**
 * End-to-end run of the new-word finder over a local text file.
 *
 * @author songyaheng on 2017/12/9
 * @version 1.0
 */
object TestNewWordFind {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("new-words-find")
      .setMaster("local[3]")

    // Keep only runs of CJK characters, then split on common pronouns,
    // used here as stop words.
    val pattern = "[\u4E00-\u9FA5]+".r
    val stopwords = "[你|我|他|她|它]+"

    val minLen = 2
    val maxLen = 6
    val minCount = 20
    val minInfoEnergy = 2.0
    val minPmi = 20.0
    val numPartition = 6

    val newWordFindConfig = NewWordFindConfig(minLen, maxLen,
      minCount, minInfoEnergy, minPmi, numPartition)

    val sc = new SparkContext(conf)

    sc.textFile("/Users/songyaheng/Downloads/西游记.txt")
      .flatMap(pattern.findAllIn(_).toSeq)
      .flatMap(_.split(stopwords))
      .newWord(sc, newWordFindConfig)
      // wepf: w = the word (_1), e = its entropy, the smaller of the left- and
      // right-neighbor entropies (_2), p = pointwise mutual information (_3),
      // f = word frequency (_4)
      .map(wepf => (wepf._2 * wepf._3 * wepf._4, wepf._1))
      .sortByKey(false, 1)
      .foreach(println)

    sc.stop()
  }
}
--------------------------------------------------------------------------------
/newword-find/src/test/scala/cn/spark/nlp/newwordfind/TestTrie.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind

import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie}

/**
 * Exercises insert, update, lookup, prefix listing, and join on the trie.
 *
 * @author songyaheng on 2017/11/30
 * @version 1.0
 */
object TestTrie {
  def main(args: Array[String]): Unit = {
    val trie: Trie[Double] = new Trie[Double]()
    trie.insert("中国", 1.0)
    trie.insert("中国人", 2.0)
    trie.insert("中华人民", 3.0)
    trie.updateValue("中国人", 9.0)

    // Re-inserting an existing word delegates to the processor, which here
    // adds the old and new payloads together.
    trie.insert("中国", 1.0, new NodeProcessor[Double, Trie[Double], String] {
      override def process(e: String)(t: Double, trie: Trie[Double]) = {
        trie.value(e) match {
          case Some(tt) => trie.updateValue(e, tt + t)
          case None => trie.insert(e, t)
        }
      }
    })

    println(trie.exist("中国"))   // true
    println(trie.value("中国"))   // Some(2.0): 1.0 + 1.0 merged above
    println(trie.allWords("中国"))

    val trie2 = new Trie[Double]()
    trie2.insert("中华小当家", 0.5)
    trie2.insert("中华人民共和国", 0.6)
    trie2.insert("中国", 1.0)
    trie.join(new NodeProcessor[Double, Trie[Double], String] {
      override def process(e: String)(t: Double, trie: Trie[Double]) = {
        trie.value(e) match {
          case Some(tt) => trie.updateValue(e, tt + t)
          case None => trie.insert(e, t)
        }
      }
    })(trie2)

    println(trie.value("中国"))   // Some(3.0): trie2's 1.0 added by the join

    println(trie.allWords("中华"))
  }
}
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.spark.nlp</groupId>
    <artifactId>sparkNLP</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <packaging>pom</packaging>

    <properties>
        <spark.version>2.2.0</spark.version>
        <scala.version>2.11.8</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.7.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.20.1</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <modules>
        <module>newword-find</module>
    </modules>
</project>
--------------------------------------------------------------------------------