├── .gitignore ├── LICENSE ├── README.md ├── pom.xml └── src ├── main └── java │ └── com │ └── guokr │ └── nlp │ ├── GkNlpCli.java │ └── __PKG__.java └── test └── java └── com └── guokr └── nlp └── test └── BasicTests.java /.gitignore: -------------------------------------------------------------------------------- 1 | *jar 2 | /target/ 3 | .lein-deps-sum 4 | .lein-failures 5 | 6 | dependency-reduced-pom.xml 7 | 8 | # OS generated files # 9 | .DS_Store 10 | .DS_Store? 11 | ._* 12 | .Spotlight-V100 13 | .Trashes 14 | Icon? 15 | ehthumbs.db 16 | Thumbs.db 17 | 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | stan-cn-nlp 2 | ============ 3 | 4 | An API wrapper based on Stanford NLP packages for Simplified Chinese users 5 | 6 | * * * * * * * 7 | 8 | Copyright (C) 2012 Guokr.com 9 | 10 | This program is free software; you can redistribute it and/or 11 | modify it under the terms of the GNU General Public License 12 | as published by the Free Software Foundation; either version 2 13 | of the License, or (at your option) any later version. 14 | 15 | This program is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with this program; if not, write to the Free Software 22 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 23 | 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | stan-cn-nlp 2 | ============ 3 | 4 | An API wrapper based on Stanford NLP packages for the convenience of Chinese 5 | users. 
This package is based on the stan-cn-* family: 6 | 7 | * stan-cn-com: Common code base 8 | * stan-cn-seg: Chinese segmentation and related data model 9 | * stan-cn-ner: Named entity recognition and related data model 10 | * stan-cn-tag: POS tagging and related data model 11 | 12 | This package bundles segmentation, NER and POS tagging together, so if you only need one of 13 | them, you can use stan-cn-seg, stan-cn-ner or stan-cn-tag separately. 14 | 15 | Purpose of the packages 16 | ------------------------ 17 | 18 | The original Stanford CoreNLP packages in Maven Central ship with default language settings 19 | for English only. If you are dealing with Simplified Chinese, you 20 | still need to download the Chinese models and fix some configuration files. 21 | 22 | The burden is not large, but if you deploy these packages to a server 23 | cluster, it is repeated on every node. 24 | 25 | Whether you run a single node or a server farm, it is more convenient 26 | to have packages that default to the Chinese language 27 | models. That is what we provide. 28 | 29 | Comments, reviews, bug reports and patches are welcome. 30 | 31 | Current version 32 | ---------------- 33 | 34 | The current release is 0.0.4, based on Stanford CoreNLP 3.2.0 with minor fixes. 35 | 36 | Add it as a dependency with one of the following: 37 | 38 | * maven: 39 | ```xml 40 | <dependency> 41 | <groupId>com.guokr</groupId> 42 | <artifactId>stan-cn-nlp</artifactId> 43 | <version>0.0.4</version> 44 | </dependency> 45 | ``` 46 | * leiningen: 47 | ```clojure 48 | [com.guokr/stan-cn-nlp "0.0.4"] 49 | ``` 50 | * sbt: 51 | ```scala 52 | libraryDependencies += "com.guokr" % "stan-cn-nlp" % "0.0.4" 53 | ``` 54 | 55 | Simplified API 56 | --------------- 57 | 58 | We use a very simple API to reduce complexity. 
59 | 60 | ```java 61 | new SegWrapper(settings).segment(text); 62 | new NerWrapper(settings).recognize(text); 63 | new TagWrapper(settings).tag(text); 64 | ``` 65 | 66 | Or, if you want to use the default language models, just use: 67 | 68 | ```java 69 | __PKG__.INSTANCE.segment(text); 70 | __PKG__.INSTANCE.recognize(text); 71 | __PKG__.INSTANCE.tag(text); 72 | ``` 73 | 74 | The command line tool 75 | ---------------------- 76 | 77 | Follow the steps below to try it out: 78 | 79 | * git clone git://github.com/guokr/stan-cn-nlp.git 80 | * cd stan-cn-nlp 81 | * mvn package 82 | * java -Xms1g -Xmx2g -jar target/stan-cn-nlp-0.0.5-SNAPSHOT-standalone.jar seg "大江东去浪淘尽" 83 | * java -Xms1g -Xmx2g -jar target/stan-cn-nlp-0.0.5-SNAPSHOT-standalone.jar ner "大江东去浪淘尽" 84 | * java -Xms1g -Xmx2g -jar target/stan-cn-nlp-0.0.5-SNAPSHOT-standalone.jar tag "大江东去浪淘尽" 85 | 86 | Preparation for release 87 | ------------------------ 88 | 89 | Before releasing this package to Maven Central, run the commands below: 90 | 91 | * mvn clean source:jar javadoc:jar package 92 | * export MAVEN_OPTS=-Xmx2048m 93 | * mvn release:clean 94 | * mvn release:prepare 95 | * mvn release:perform 96 | 97 | Authors 98 | -------- 99 | 100 | * Mingli Yuan ( https://github.com/mountain ) 101 | * Rui Wang ( https://github.com/isnowfy ) 102 | * Wanjian Wu ( https://github.com/jseagull ) 103 | 104 | License 105 | -------- 106 | 107 | GPLv2, the same license as the Stanford CoreNLP package 108 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.guokr 5 | stan-cn-nlp 6 | 0.0.5-SNAPSHOT 7 | jar 8 | stan-cn-nlp 9 | An API wrapper based on Stanford Core NLP package for Simplified Chinese users 10 | https://github.com/guokr/stan-cn-nlp 11 | 12 | 13 | GNU General Public License Version 2 14 | http://www.gnu.org/licenses/gpl-2.0.txt 15 | 16 | 17 | 18 | 
https://github.com/guokr/stan-cn-nlp 19 | scm:git:git://github.com/guokr/stan-cn-nlp.git 20 | scm:git:ssh://git@github.com/guokr/stan-cn-nlp.git 21 | 22 | 23 | 24 | mingli.yuan 25 | Mingli Yuan 26 | mingli.yuan@gmail.com 27 | 28 | 29 | isnowify 30 | Rui Wang 31 | isnowfy@gmail.com 32 | 33 | 34 | 35 | org.sonatype.oss 36 | oss-parent 37 | 7 38 | 39 | 40 | 1.6 41 | 1.5 42 | UTF-8 43 | UTF-8 44 | 45 | 46 | 47 | com.guokr 48 | stan-cn-com 49 | 0.0.5-SNAPSHOT 50 | 51 | 52 | com.guokr 53 | stan-cn-seg 54 | 0.0.5-SNAPSHOT 55 | 56 | 57 | com.guokr 58 | stan-cn-ner 59 | 0.0.5-SNAPSHOT 60 | 61 | 62 | com.guokr 63 | stan-cn-tag 64 | 0.0.5-SNAPSHOT 65 | 66 | 67 | junit 68 | junit 69 | 4.7 70 | test 71 | 72 | 73 | 74 | src/main/java 75 | src/test/java 76 | 77 | 78 | src/main/resources 79 | 80 | 81 | 82 | 83 | src/main/resources 84 | 85 | 86 | target 87 | target/classes 88 | 89 | 90 | org.apache.maven.plugins 91 | maven-compiler-plugin 92 | 3.0 93 | 94 | 1.5 95 | 1.5 96 | 97 | 98 | 99 | org.codehaus.mojo 100 | build-helper-maven-plugin 101 | 1.7 102 | 103 | 104 | add-source 105 | generate-sources 106 | 107 | add-source 108 | 109 | 110 | 111 | src/main/java 112 | 113 | 114 | 115 | 116 | add-test-source 117 | generate-test-sources 118 | 119 | add-test-source 120 | 121 | 122 | 123 | src/test/java 124 | 125 | 126 | 127 | 128 | 129 | 130 | org.apache.maven.plugins 131 | maven-surefire-plugin 132 | 2.12.4 133 | 134 | 135 | org.apache.maven.surefire 136 | surefire-junit47 137 | 2.12.4 138 | 139 | 140 | 141 | 142 | **/*Tests.java 143 | 144 | once 145 | -Xms2g -Xmx2g 146 | 147 | 148 | 149 | org.apache.maven.plugins 150 | maven-shade-plugin 151 | 2.0 152 | 153 | 154 | package 155 | 156 | shade 157 | 158 | 159 | false 160 | true 161 | stan-cn-nlp-${project.version}-standalone 162 | 163 | 164 | 165 | com.guokr.nlp.GkNlpCli 166 | 1 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- 
/src/main/java/com/guokr/nlp/GkNlpCli.java: -------------------------------------------------------------------------------- 1 | package com.guokr.nlp; 2 | 3 | import java.lang.reflect.Method; 4 | import java.io.PrintStream; 5 | 6 | public class GkNlpCli { 7 | 8 | public static void main(String[] args) { 9 | __PKG__ pkg = __PKG__.INSTANCE; 10 | if(args.length < 2) { 11 | usage(); 12 | } else { 13 | String subcmd = args[0]; 14 | String text = args[1]; 15 | if(subcmd.equals("seg")) { 16 | try { 17 | String segText = pkg.segment(text); 18 | System.out.println(segText); 19 | } catch (Exception e) { 20 | System.out.println(e); 21 | e.printStackTrace(System.err); 22 | } 23 | } else if(subcmd.equals("ner")) { 24 | try { 25 | String nerText = pkg.recognize(text); 26 | System.out.println(nerText); 27 | } catch (Exception e) { 28 | System.out.println(e); 29 | e.printStackTrace(System.err); 30 | } 31 | } else if(subcmd.equals("tag")) { 32 | try { 33 | String tagText = pkg.tag(text); 34 | System.out.println(tagText); 35 | } catch (Exception e) { 36 | System.out.println(e); 37 | e.printStackTrace(System.err); 38 | } 39 | } 40 | } 41 | 42 | } 43 | 44 | private static void usage() { 45 | PrintStream out = System.out; 46 | out.println("gknlp [command] [text]"); 47 | out.println("\tcommands:"); 48 | out.println("\t\tseg: segment the text into words"); 49 | out.println("\t\tner: recognize the named entity in the text"); 50 | out.println("\t\ttag: tag the text into words with syntax information"); 51 | out.println(""); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/guokr/nlp/__PKG__.java: -------------------------------------------------------------------------------- 1 | package com.guokr.nlp; 2 | 3 | public enum __PKG__ { 4 | 5 | INSTANCE; 6 | 7 | public String segment(String text) { 8 | return com.guokr.nlp.seg.__SEG__.INSTANCE.segment(text); 9 | } 10 | 11 | public String recognize(String text) { 12 | 
return com.guokr.nlp.ner.__NER__.INSTANCE.recognize(text); 13 | } 14 | 15 | public String tag(String text) { 16 | return com.guokr.nlp.tag.__TAG__.INSTANCE.tag(text); 17 | } 18 | 19 | } 20 | 21 | -------------------------------------------------------------------------------- /src/test/java/com/guokr/nlp/test/BasicTests.java: -------------------------------------------------------------------------------- 1 | package com.guokr.nlp.test; 2 | 3 | import java.lang.reflect.Method; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | import org.junit.Test; 8 | import org.junit.runner.RunWith; 9 | import org.junit.runners.JUnit4; 10 | 11 | import com.guokr.nlp.__PKG__; 12 | 13 | @RunWith(JUnit4.class) 14 | public class BasicTests { 15 | 16 | @Test 17 | public void testSeg() throws Exception { 18 | String segText = __PKG__.INSTANCE.segment("这是个测试"); 19 | assertEquals("这 是 个 测试", segText); 20 | } 21 | 22 | @Test 23 | public void testNer() throws Exception { 24 | String nerText = __PKG__.INSTANCE.recognize("这是个测试"); 25 | assertEquals("这/O 是/O 个/O 测试/O", nerText); 26 | } 27 | 28 | @Test 29 | public void testTag() throws Exception { 30 | String tagText = __PKG__.INSTANCE.tag("这是个测试"); 31 | assertEquals("这#PN 是#VC 个#M 测试#NN", tagText); 32 | } 33 | 34 | } 35 | --------------------------------------------------------------------------------