├── .gitignore
├── LICENSE
├── README.md
├── newword-find
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── scala
│       │       └── cn
│       │           └── spark
│       │               └── nlp
│       │                   └── newwordfind
│       │                       ├── core
│       │                       │   ├── NewWordFind.scala
│       │                       │   ├── NewWordFindConfig.scala
│       │                       │   └── PrefixPartitioner.scala
│       │                       ├── newwordfind.scala
│       │                       ├── trie
│       │                       │   ├── Node.scala
│       │                       │   ├── NodeProcessor.scala
│       │                       │   ├── Trie.scala
│       │                       │   └── TrieErgodicProcessor.scala
│       │                       └── utils
│       │                           ├── NGram.scala
│       │                           └── WordCountUtils.scala
│       └── test
│           └── scala
│               └── cn
│                   └── spark
│                       └── nlp
│                           └── newwordfind
│                               ├── Test.scala
│                               ├── TestNewWordFind.scala
│                               └── TestTrie.scala
└── pom.xml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.iml
*.class
*.o
*.a
*.so
*.exe
*.log
*.prefs
*.classpath
*.metadata
*.project
*.bak
*.springBeans
.settings
build
debug
target

# Numerous always-ignore extensions
*.diff
*.err
*.orig
*.rej
*.swo
*.swp
*.zip
*.vi
~*
*.sass-cache
*.ruby-version

# OS or Editor folders
*.DS_Store
*._*
Thumbs.db
*.cache
*.tmproj
*.esproj
nbproject
*.sublime-project
*.sublime-workspace

# Komodo
*.komodoproject
*.komodotools

# Folders to ignore
.hg
.svn
.CVS
.idea

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution.
      You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE.
      You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# sparkNLP
Spark-based NLP applications

1. New word discovery

A word segmenter that relies on no dictionary at all.

Key building blocks: left/right neighbor information entropy, pointwise mutual information (PMI), and n-grams.

==> Handling of overlapping substrings will be considered in a follow-up.

Usage:

    import cn.spark.nlp.newwordfind._

    def main(args: Array[String]): Unit = {
      val conf = new SparkConf()
        .setAppName("new-words-find")
        .setMaster("local[3]")

      val pattern = "[\u4E00-\u9FA5]+".r
      val stopwords = "[你|我|他|她|它]+"

      val minLen = 2
      val maxLen = 6
      val minCount = 20
      val minInfoEnergy = 2.0
      val minPmi = 20.0
      val numPartitions = 6

      val newWordFindConfig = NewWordFindConfig(minLen, maxLen,
        minCount, minInfoEnergy, minPmi, numPartitions)

      val sc = new SparkContext(conf)

      val lines = sc.textFile("/Users/songyaheng/Downloads/西游记.txt")
        .flatMap(pattern.findAllIn(_).toSeq)
        .flatMap(_.split(stopwords))
        .newWord(sc, newWordFindConfig)
        // wepf: w = the word (_1); e = its information entropy, the smaller of
        // the left- and right-neighbor entropies (_2); p = pointwise mutual
        // information (_3); f = word frequency (_4)
        .map(wepf => (wepf._2 * wepf._3 * wepf._4, wepf._1))
        .sortByKey(false, 1)
        .foreach(println)

      sc.stop()
    }

The output looks like this:

    (18168.514250954064,袈裟)
    (17713.06665111492,芭蕉)
    (16798.82717604416,吩咐)
    (16142.526118040218,葫芦)
    (11702.377091478338,乾坤)
    (11628.283999511725,哪吒)
    (11457.444521822847,猢狲)
    (10285.647417662267,琉璃)
    (7756.876983908059,荆棘)
    (7340.1686818898015,包袱)
    (7250.92797731874,校尉)
    (6876.8133704999855,钵盂)
    (6141.238795255687,揭谛)
    (6097.826306360437,惫懒)
    (4567.736774433127,苍蝇)
    (4398.391082713808,弼马温)
    (4208.842378551715,抖擞)
    (3865.6764241340757,孽畜)
    (3806.4273334808236,驿丞)
    (3369.570454781614,夯货)
    (3209.808428480634,悚惧)
    (3104.343061153103,祭赛)
    (3051.358367810302,武艺)
    (2996.755537579268,丑陋)
    (2821.9446891721645,怠慢)
    (2789.486615314228,蟠桃)
    (2706.7702230076206,逍遥)
    (2661.6587009929654,伺候)
    (2428.8996887030903,输赢)
    (2318.4894066182214,纷纷)
    (2314.1992786437513,奶奶)
    (2309.8451555988668,妈妈)
    (2021.7681532826207,尘埃)
    (1840.4474331072206,森森)
    (1768.7517043782416,伽蓝)
    (1673.1962179067696,悄悄)
    (1453.9759495186454,踪迹)
    (1338.3997377699582,杨柳)

--------------------------------------------------------------------------------
/newword-find/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.spark.nlp</groupId>
    <artifactId>newword-find</artifactId>
    <version>1.0.0-SNAPSHOT</version>

    <properties>
        <spark.version>2.2.0</spark.version>
        <scala.version>2.11.8</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.scala-lang</groupId>
                    <artifactId>scala-reflect</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.7.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.20.1</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
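Before reading the implementation, it helps to see the two statistics the pipeline is built on: the left/right neighbor entropy (how freely a candidate word combines with its surroundings) and a PMI-style cohesion score. The sketch below is illustrative only and not part of the library; the object name `StatsSketch` and all the counts are toy values, while the entropy formula mirrors `WordCountUtils.energyCount` and the cohesion score mirrors `NewWordFind.rddPmi`.

    import scala.math.log

    object StatsSketch {
      // Shannon entropy (in bits) of a neighbor-count distribution,
      // as in WordCountUtils.energyCount.
      def entropy(neighborCounts: Map[String, Int]): Double = {
        val total = neighborCounts.values.sum.toDouble
        neighborCounts.values.map { c =>
          val p = c / total
          -p * log(p) / log(2)
        }.sum
      }

      def main(args: Array[String]): Unit = {
        // Toy neighbor counts for a single candidate word, invented for illustration.
        val left  = Map("件" -> 6, "领" -> 3, "的" -> 3)
        val right = Map("来" -> 5, "去" -> 4, "与" -> 3)
        // The library keeps the smaller of the two entropies: a word is only
        // "free" if it is unpredictable on both sides.
        val infoEnergy = math.min(entropy(left), entropy(right))

        // Cohesion score as used in NewWordFind.rddPmi:
        // count(ab) * N / (count(a) * count(b)), minimized over all split points
        // (N is the corpus-size term; the library uses the distinct-character count).
        val n = 100000.0              // toy corpus size
        val cAB = 12.0                // toy count of the whole candidate
        val cA = 15.0; val cB = 14.0  // toy counts of the two halves
        val pmi = cAB * n / (cA * cB)

        println(s"min neighbor entropy = $infoEnergy, pmi = $pmi")
      }
    }
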
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/core/NewWordFind.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.core

import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie, TrieErgodicProcessor}
import cn.spark.nlp.newwordfind.utils.{NGram, WordCountUtils}
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer

/**
 * Core pipeline of the new-word finder: n-gram counting, neighbor-entropy
 * filtering and PMI scoring.
 *
 * @author songyaheng on 2017/12/7
 * @version 1.0
 */
object NewWordFind {

  def newWordRDD(sc: SparkContext, rdd: RDD[_], newWordFindConfig: NewWordFindConfig): RDD[(String, Double, Double, Int)] = {
    val lineCount = rddLineCount(rdd)
    val trieRdd = trieRDD(rdd, newWordFindConfig)
    // Merge the per-partition tries into one global count trie: keep the
    // smaller entropy and add the frequencies when a word occurs in both.
    val trieCount = trieRdd.reduce(_.join(new NodeProcessor[(Double, Int), Trie[(Double, Int)], String] {
      override def process(e: String)(t: (Double, Int), trie: Trie[(Double, Int)]) = {
        trie.value(e) match {
          case Some(tt) => trie.updateValue(e, (math.min(t._1, tt._1), t._2 + tt._2))
          case None => trie.insert(e, t)
        }
      }
    })(_))
    val btrieCount = sc.broadcast(trieCount)
    trieRdd.mapPartitions(r => rddPmi(r, newWordFindConfig, lineCount, btrieCount))
  }

  /** Number of distinct characters in the corpus; used as the corpus-size term of the PMI score. */
  def rddLineCount(rdd: RDD[_]): Double = {
    rdd.flatMap(line => line.toString.split(""))
      .map(w => (w, 1))
      .reduceByKey(_ + _)
      .count().toDouble
  }

  /** One trie per partition, holding (min neighbor entropy, frequency) for every surviving n-gram. */
  def trieRDD(rdd: RDD[_], newWordFindConfig: NewWordFindConfig): RDD[Trie[(Double, Int)]] = {
    rdd.flatMap(l => NGram.nGram(l.toString, 1, newWordFindConfig.maxlen))
      .reduceByKey(WordCountUtils.count)
      .filter(wmf => wmf._2._3 >= newWordFindConfig.minCount)
      .map(WordCountUtils.energyCount)
      .partitionBy(new PrefixPartitioner(newWordFindConfig.numPartitions))
      .mapPartitions(WordCountUtils.trieRDD)
      .cache()
  }

  /** Score each candidate with the PMI minimized over all binary splits, then apply the thresholds. */
  def rddPmi(iterator: Iterator[Trie[(Double, Int)]], newWordFindConfig: NewWordFindConfig, lineCount: Double, btrieCount: Broadcast[Trie[(Double, Int)]]): Iterator[(String, Double, Double, Int)] = {
    val trie = iterator.next()
    val array = new ArrayBuffer[(String, Double, Double, Int)]()
    trie.ergodic(new TrieErgodicProcessor[String, (Double, Int)] {
      override def process(t: String, e: (Double, Int)): Unit = {
        if (t.length >= newWordFindConfig.minlen) {
          val pmi = (0 to t.length - 2).map(i => {
            val a = t.substring(0, i + 1)
            val b = t.substring(i + 1)
            val av = btrieCount.value.value(a) match {
              case Some(v) => v._2.toDouble
              case _ => newWordFindConfig.minCount.toDouble
            }
            val bv = btrieCount.value.value(b) match {
              case Some(v) => v._2.toDouble
              case _ => newWordFindConfig.minCount.toDouble
            }
            e._2 * lineCount * 1.0 / (av * bv)
          }).min
          if (pmi >= newWordFindConfig.minPmi && e._1 >= newWordFindConfig.minInfoEnergy) {
            array += ((t, e._1, pmi, e._2))
          }
        }
      }
    })
    array.toIterator
  }

}
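The `reduce` step in `newWordRDD` merges the per-partition tries into one global trie using `Trie.join`. The standalone sketch below replays exactly that merge rule on two hand-built tries; the object name `JoinSketch` and the sample words and values are invented, while `Trie` and `NodeProcessor` are the classes defined in this repository.

    import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie}

    object JoinSketch {
      def main(args: Array[String]): Unit = {
        // Two "per-partition" tries holding (min neighbor entropy, frequency)
        // per word; the values here are toy numbers.
        val a = new Trie[(Double, Int)]
        a.insert("蟠桃", (1.2, 30))
        val b = new Trie[(Double, Int)]
        b.insert("蟠桃", (0.9, 12))
        b.insert("琉璃", (1.5, 20))

        // Same merge rule as the reduce step in NewWordFind.newWordRDD:
        // keep the smaller entropy, add the frequencies.
        a.join(new NodeProcessor[(Double, Int), Trie[(Double, Int)], String] {
          override def process(e: String)(t: (Double, Int), trie: Trie[(Double, Int)]) =
            trie.value(e) match {
              case Some(tt) => trie.updateValue(e, (math.min(t._1, tt._1), t._2 + tt._2))
              case None => trie.insert(e, t)
            }
        })(b)

        println(a.value("蟠桃")) // Some((0.9,42))
        println(a.value("琉璃")) // Some((1.5,20))
      }
    }
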
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/core/NewWordFindConfig.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.core

/**
 *
 * @author songyaheng on 2017/12/7
 * @version 1.0
 */
case class NewWordFindConfig(
  minlen: Int,
  maxlen: Int,
  minCount: Int,
  minInfoEnergy: Double,
  minPmi: Double,
  numPartitions: Int
)

/** Default configuration values. */
object NewWordFindConfig {
  val minlen: Int = 2
  val maxlen: Int = 4
  val minCount: Int = 5
  val minInfoEnergy: Double = 1.0
  val minPmi: Double = 1.0
  val numPartitions: Int = 6
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/core/PrefixPartitioner.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.core

import org.apache.spark.Partitioner

/**
 * Partitions words by the hash of their first character, so every n-gram
 * sharing a prefix lands in the same partition (and hence the same trie).
 *
 * @author songyaheng on 2017/12/8
 * @version 1.0
 */
class PrefixPartitioner(numPartition: Int) extends Partitioner {
  override def numPartitions = numPartition

  override def getPartition(key: Any): Int = {
    val prefix = key.toString.substring(0, 1)
    val code = prefix.hashCode % numPartition
    if (code < 0) {
      code + numPartition
    } else {
      code
    }
  }

  override def equals(other: Any): Boolean = other match {
    case prefixPartitioner: PrefixPartitioner =>
      prefixPartitioner.numPartitions == numPartitions
    case _ =>
      false
  }

  override def hashCode(): Int = numPartitions

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/newwordfind.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp

import cn.spark.nlp.newwordfind.core.{NewWordFind, NewWordFindConfig}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

/**
 * Enriches RDD[String] with a `newWord` operation via an implicit conversion.
 *
 * @author songyaheng on 2017/12/7
 * @version 1.0
 */
package object newwordfind {

  implicit def sparkRDDFunctions(rdd: RDD[String]): SparkRDDFunctions[String] = new SparkRDDFunctions[String](rdd)

  class SparkRDDFunctions[T: ClassTag](rdd: RDD[T]) extends Serializable {
    def newWord(sc: SparkContext): RDD[(String, Double, Double, Int)] =
      NewWordFind.newWordRDD(sc, rdd, NewWordFindConfig(NewWordFindConfig.minlen,
        NewWordFindConfig.maxlen,
        NewWordFindConfig.minCount,
        NewWordFindConfig.minInfoEnergy,
        NewWordFindConfig.minPmi,
        NewWordFindConfig.numPartitions))

    def newWord(sc: SparkContext, newWordFindConfig: NewWordFindConfig): RDD[(String, Double, Double, Int)] =
      NewWordFind.newWordRDD(sc, rdd, newWordFindConfig)

  }

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/Node.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

import scala.collection.immutable.TreeMap

/**
 * A single trie node: one character, its children, and the value attached
 * to the word ending here (if any).
 *
 * @author songyaheng on 2017/11/27
 * @version 1.0
 */
class Node[T](char: Char) extends Serializable {
  var content: Char = char
  var isEnd: Boolean = false
  var childMap: Map[Char, Node[T]] = TreeMap[Char, Node[T]]()
  var t: T = _
  var depth: Int = 0
  var count: Int = 0

  def nextNode(char: Char): Option[Node[T]] = childMap.get(char)
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/NodeProcessor.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

/**
 * Callback deciding how to merge a new value into a trie when the word
 * being inserted already exists.
 *
 * @author songyaheng on 2017/11/28
 * @version 1.0
 */
trait NodeProcessor[T, R, E] extends Serializable {
  def process(e: E)(t: T, trie: Trie[T]): R
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/Trie.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

import scala.collection.mutable.ArrayBuffer

/**
 *
 * @author songyaheng on 2017/11/28
 * @version 1.0
 */
class Trie[T] extends Serializable {

  val root: Node[T] = new Node[T](' ')

  def insert(word: String, t: T): Trie[T] = {
    insert(word, t, null)
  }

  /**
   * Insert a word with its value; if the word already exists, the optional
   * processor decides how to merge the old and new values.
   * @param word
   * @param t
   * @param processor
   */
  def insert(word: String, t: T, processor: NodeProcessor[T, Trie[T], String]): Trie[T] = {
    this.synchronized {
      if (word.isEmpty) return this
      value(word) match {
        case Some(_) =>
          if (processor == null) {
            this
          } else {
            processor.process(word)(t, this)
          }
        case None =>
          var currentNode: Node[T] = root
          var deep: Int = 0
          word.trim.foreach(c => {
            deep = deep + 1
            currentNode.nextNode(c) match {
              case Some(nd) =>
                currentNode = nd
              case None =>
                currentNode.childMap += (c -> new Node[T](c))
                currentNode.count = currentNode.childMap.size
                currentNode.nextNode(c) match {
                  case Some(nd) =>
                    currentNode = nd
                    currentNode.depth = deep
                  case None =>
                    return this
                }
            }
          })
          currentNode.t = t
          currentNode.isEnd = true
          this
      }
    }
  }

  /**
   * Check whether the word is present in the trie.
   * @param word
   * @return
   */
  def exist(word: String): Boolean = {
    var currentNode = root
    word.trim.toCharArray.foreach(c => {
      currentNode.nextNode(c) match {
        case Some(nd) => currentNode = nd
        case None => return false
      }
    })
    currentNode.isEnd
  }

  /**
   * Get the value stored for the given word, if any.
   * @param word
   * @return
   */
  def value(word: String): Option[T] = {
    if (word.isEmpty) return None
    var currentNode = root
    word.toCharArray.foreach(c => {
      currentNode.nextNode(c) match {
        case Some(nd) => currentNode = nd
        case None => return None
      }
    })
    if (currentNode.isEnd) {
      Some(currentNode.t)
    } else {
      None
    }
  }

  def updateValue(word: String, t: T): Trie[T] = {
    if (word.isEmpty) return this
    this.synchronized {
      var currentNode = root
      word.toCharArray.foreach(c => {
        currentNode.nextNode(c) match {
          case Some(nd) =>
            currentNode = nd
          case None => return this
        }
      })
      if (currentNode.isEnd) currentNode.t = t
      this
    }
  }

  /**
   * Collect all words stored under the given prefix.
   * @param prefix
   * @return
   */
  def allWords(prefix: String): ArrayBuffer[String] = {
    val rs: ArrayBuffer[String] = ArrayBuffer[String]()
    if (prefix.isEmpty) {
      return rs
    }
    var node: Node[T] = root
    prefix.trim.toCharArray.foreach(c => {
      node.nextNode(c) match {
        case Some(nd) => node = nd
        case None => return rs
      }
    })
    fullWords(node, prefix, rs)
    rs
  }

  /**
   * Recursively collect complete words at and below a node.
   * @param node
   * @param prefix
   * @param arrayBuffer
   */
  private def fullWords(node: Node[T], prefix: String, arrayBuffer: ArrayBuffer[String]): Unit = {
    node.childMap.values.foreach(nd => {
      fullWords(nd, prefix + nd.content, arrayBuffer)
    })
    if (node.isEnd) arrayBuffer += prefix
  }

  /**
   * Merge another trie into this one; the processor decides how to combine
   * values for words present in both tries.
   * @param trie
   * @return
   */
  def join(processor: NodeProcessor[T, Trie[T], String])(trie: Trie[T]): Trie[T] = {
    this.synchronized {
      val node = trie.root
      if (node.count != 0) {
        interact(node, "", processor)
      }
      this
    }
  }

  private def interact(node: Node[T], prefix: String, processor: NodeProcessor[T, Trie[T], String]): Unit = {
    node.childMap.values.foreach(nd => interact(nd, prefix + nd.content, processor))
    if (node.isEnd) {
      this.value(prefix) match {
        case Some(_) => processor.process(prefix)(node.t, this)
        case None => this.insert(prefix, node.t)
      }
    }
  }

  /** Traverse every complete word in the trie, applying the processor. */
  def ergodic(trieErgodicProcessor: TrieErgodicProcessor[String, T]): Unit =
    this.synchronized {
      val node = this.root
      if (node.count != 0) {
        innerErgodic(node, "", trieErgodicProcessor)
      }
    }

  private def innerErgodic(node: Node[T], prefix: String, trieErgodicProcessor: TrieErgodicProcessor[String, T]): Unit = {
    node.childMap.values.foreach(nd => innerErgodic(nd, prefix + nd.content, trieErgodicProcessor))
    if (node.isEnd) {
      trieErgodicProcessor.process(prefix, node.t)
    }
  }

}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/trie/TrieErgodicProcessor.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.trie

/**
 * Callback applied to every (word, value) pair during a trie traversal.
 *
 * @author songyaheng on 2017/12/8
 * @version 1.0
 */
trait TrieErgodicProcessor[T, E] extends Serializable {
  def process(t: T, e: E): Unit
}
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/utils/NGram.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.utils

/**
 * n-gram slicing utilities.
 *
 * @author songyaheng on 2017/11/27
 * @version 1.0
 */
object NGram extends Serializable {
  /**
   * Slice one sentence into n-grams of the given length, recording each
   * gram's immediate left and right neighbor characters; "$" marks the
   * sentence boundary.
   * @param s
   * @return
   */
  def newWordGram(s: String, len: Int): List[(String, (Map[String, Int], Map[String, Int], Int))] = {
    val sen = "$" + s + "$"
    (1 to s.length - len + 1).map(i => {
      val w = sen.substring(i, i + len)
      val lw = sen.substring(i - 1, i)
      val rw = sen.substring(i + len, i + len + 1)
      (w, (Map(lw -> 1), Map(rw -> 1), 1))
    }).toList
  }

  def nGram(s: String, minlen: Int, maxlen: Int): List[(String, (Map[String, Int], Map[String, Int], Int))] = {
    (minlen to maxlen).flatMap(i => newWordGram(s, i)).toList
  }

  def nGramByLen(s: String, len: Int, f: Double): List[(String, Double)] =
    (0 to s.length - len).map(i => {
      (s.substring(i, i + len), f)
    }).toList

  def nGramByWord(s: String, minlen: Int, maxlen: Int, f: Double): List[(String, Double)] =
    (minlen to maxlen).flatMap(i => nGramByLen(s, i, f)).toList

}
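To make the neighbor bookkeeping concrete, here is what `newWordGram` emits for a four-character string. The object name `NGramSketch` is invented for this sketch; the expected output in the comments follows directly from the code above.

    import cn.spark.nlp.newwordfind.utils.NGram

    object NGramSketch {
      def main(args: Array[String]): Unit = {
        // "$" is the sentence-boundary sentinel added inside newWordGram.
        NGram.newWordGram("中华人民", 2).foreach(println)
        // (中华,(Map($ -> 1),Map(人 -> 1),1))
        // (华人,(Map(中 -> 1),Map(民 -> 1),1))
        // (人民,(Map(华 -> 1),Map($ -> 1),1))
      }
    }
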
--------------------------------------------------------------------------------
/newword-find/src/main/scala/cn/spark/nlp/newwordfind/utils/WordCountUtils.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind.utils

import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie}

/**
 *
 * @author songyaheng on 2017/12/4
 * @version 1.0
 */
object WordCountUtils extends Serializable {

  /**
   * Merge two partial observations of a word: left-neighbor counts,
   * right-neighbor counts, and frequency.
   * @param a
   * @param b
   * @return
   */
  def count(a: (Map[String, Int], Map[String, Int], Int), b: (Map[String, Int], Map[String, Int], Int)): (Map[String, Int], Map[String, Int], Int) = {
    var la = a._1
    b._1.foreach(kv => {
      if (la.contains(kv._1)) {
        val v = la(kv._1) + kv._2
        la += (kv._1 -> v)
      } else {
        la += kv
      }
    })
    var lr = a._2
    b._2.foreach(kv => {
      if (lr.contains(kv._1)) {
        val v = lr(kv._1) + kv._2
        lr += (kv._1 -> v)
      } else {
        lr += kv
      }
    })
    val wc = a._3 + b._3
    (la, lr, wc)
  }

  /**
   * Compute the left and right neighbor information entropy of a word and
   * keep the smaller of the two.
   * @param v
   * @return
   */
  def energyCount(v: (String, (Map[String, Int], Map[String, Int], Int))): (String, (Double, Int)) = {
    val lcount = v._2._1.values.sum
    val rcount = v._2._2.values.sum
    val le = v._2._1.values.map(c => {
      val p = c * 1.0 / lcount
      -1 * p * Math.log(p) / Math.log(2)
    }).sum
    val re = v._2._2.values.map(c => {
      val p = c * 1.0 / rcount
      -1 * p * Math.log(p) / Math.log(2)
    }).sum
    val e = Math.min(le, re)
    (v._1, (e, v._2._3))
  }

  /** Build one trie from a partition's words, merging duplicates on insert. */
  def trieRDD(words: Iterator[(String, (Double, Int))]): Iterator[Trie[(Double, Int)]] = {
    val trie = new Trie[(Double, Int)]
    while (words.hasNext) {
      val w = words.next()
      trie.insert(w._1, w._2, new NodeProcessor[(Double, Int), Trie[(Double, Int)], String] {
        override def process(e: String)(t: (Double, Int), trie: Trie[(Double, Int)]) = {
          trie.value(e) match {
            case Some(ef) => trie.updateValue(e, (Math.min(ef._1, t._1), ef._2 + t._2))
            case None => trie.insert(e, t)
          }
        }
      })
    }
    List(trie).iterator
  }

  def permutations(list: List[Int]): Set[List[Int]] = {
    list match {
      case Nil => Set(Nil)
      case head :: tail =>
        for (p0 <- permutations(tail); i <- 0 to p0.length; (xs, ys) = p0.splitAt(i)) yield xs ::: List(head) ::: ys
    }
  }

}
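A small worked example of the two utilities above; the object name `EntropySketch` and the sample neighbor maps are invented. Merging two partial observations with `count` sums the neighbor maps and frequencies; `energyCount` then scores the merged record. With two equally likely neighbors on each side, both entropies are exactly one bit, and the minimum is kept.

    import cn.spark.nlp.newwordfind.utils.WordCountUtils

    object EntropySketch {
      def main(args: Array[String]): Unit = {
        // Two partial observations of "人民", invented for illustration.
        val merged = WordCountUtils.count(
          (Map("华" -> 1), Map("共" -> 1), 1),
          (Map("为" -> 1), Map("起" -> 1), 1))
        println(merged) // (Map(华 -> 1, 为 -> 1),Map(共 -> 1, 起 -> 1),2)

        // Two equally likely neighbors per side -> 1 bit of entropy left and
        // right; energyCount keeps the minimum of the two.
        println(WordCountUtils.energyCount(("人民", merged))) // (人民,(1.0,2))
      }
    }
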
--------------------------------------------------------------------------------
/newword-find/src/test/scala/cn/spark/nlp/newwordfind/Test.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind

import cn.spark.nlp.newwordfind.utils.NGram

/**
 *
 * @author songyaheng on 2017/12/4
 * @version 1.0
 */
object Test {
  def main(args: Array[String]): Unit = {
    val s = "中华人民中华"
    NGram.nGramByWord(s, 1, 4, 1).foreach(println)
  }
}
--------------------------------------------------------------------------------
/newword-find/src/test/scala/cn/spark/nlp/newwordfind/TestNewWordFind.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind

import cn.spark.nlp.newwordfind.core.NewWordFindConfig
import org.apache.spark.{SparkConf, SparkContext}
import cn.spark.nlp.newwordfind._

/**
 *
 * @author songyaheng on 2017/12/9
 * @version 1.0
 */
object TestNewWordFind {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("new-words-find")
      .setMaster("local[3]")

    val pattern = "[\u4E00-\u9FA5]+".r
    val stopwords = "[你|我|他|她|它]+"

    val minLen = 2
    val maxLen = 6
    val minCount = 20
    val minInfoEnergy = 2.0
    val minPmi = 20.0
    val numPartitions = 6

    val newWordFindConfig = NewWordFindConfig(minLen, maxLen,
      minCount, minInfoEnergy, minPmi, numPartitions)

    val sc = new SparkContext(conf)

    val lines = sc.textFile("/Users/songyaheng/Downloads/西游记.txt")
      .flatMap(pattern.findAllIn(_).toSeq)
      .flatMap(_.split(stopwords))
      .newWord(sc, newWordFindConfig)
      // wepf: w = the word (_1); e = its information entropy, the smaller of
      // the left- and right-neighbor entropies (_2); p = pointwise mutual
      // information (_3); f = word frequency (_4)
      .map(wepf => (wepf._2 * wepf._3 * wepf._4, wepf._1))
      .sortByKey(false, 1)
      .foreach(println)

    sc.stop()
  }
}
--------------------------------------------------------------------------------
/newword-find/src/test/scala/cn/spark/nlp/newwordfind/TestTrie.scala:
--------------------------------------------------------------------------------
package cn.spark.nlp.newwordfind

import cn.spark.nlp.newwordfind.trie.{NodeProcessor, Trie}

/**
 *
 * @author songyaheng on 2017/11/30
 * @version 1.0
 */
object TestTrie {
  def main(args: Array[String]): Unit = {
    val trie: Trie[Double] = new Trie[Double]()
    trie.insert("中国", 1.0)
    trie.insert("中国人", 2.0)
    trie.insert("中华人民", 3.0)
    trie.updateValue("中国人", 9.0)

    trie.insert("中国", 1.0, new NodeProcessor[Double, Trie[Double], String] {
      override def process(e: String)(t: Double, trie: Trie[Double]) = {
        trie.value(e) match {
          case Some(tt) => trie.updateValue(e, tt + t)
          case None => trie.insert(e, t)
        }
      }
    })

    println(trie.exist("中国"))
    println(trie.value("中国"))
    println(trie.allWords("中国"))

    val trie2 = new Trie[Double]()
    trie2.insert("中华小当家", 0.5)
    trie2.insert("中华人民共和国", 0.6)
    trie2.insert("中国", 1.0)
    trie.join(new NodeProcessor[Double, Trie[Double], String] {
      override def process(e: String)(t: Double, trie: Trie[Double]) = {
        trie.value(e) match {
          case Some(tt) => trie.updateValue(e, tt + t)
          case None => trie.insert(e, t)
        }
      }
    })(trie2)

    println(trie.value("中国"))

    println(trie.allWords("中华"))

  }
}
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.spark.nlp</groupId>
    <artifactId>sparkNLP</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <packaging>pom</packaging>

    <properties>
        <spark.version>2.2.0</spark.version>
        <scala.version>2.11.8</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.7.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.20.1</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <modules>
        <module>newword-find</module>
    </modules>
</project>
--------------------------------------------------------------------------------