├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
└── src
├── main
└── java
│ └── com
│ └── guokr
│ └── nlp
│ ├── GkNlpCli.java
│ └── __PKG__.java
└── test
└── java
└── com
└── guokr
└── nlp
└── test
└── BasicTests.java
/.gitignore:
--------------------------------------------------------------------------------
1 | *jar
2 | /target/
3 | .lein-deps-sum
4 | .lein-failures
5 |
6 | dependency-reduced-pom.xml
7 |
8 | # OS generated files #
9 | .DS_Store
10 | .DS_Store?
11 | ._*
12 | .Spotlight-V100
13 | .Trashes
14 | Icon?
15 | ehthumbs.db
16 | Thumbs.db
17 |
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | stan-cn-nlp
2 | ============
3 |
4 | An API wrapper based on Stanford NLP packages for Simplified Chinese users
5 |
6 | * * * * * * *
7 |
8 | Copyright (C) 2012 Guokr.com
9 |
10 | This program is free software; you can redistribute it and/or
11 | modify it under the terms of the GNU General Public License
12 | as published by the Free Software Foundation; either version 2
13 | of the License, or (at your option) any later version.
14 |
15 | This program is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | GNU General Public License for more details.
19 |
20 | You should have received a copy of the GNU General Public License
21 | along with this program; if not, write to the Free Software
22 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 |
24 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | stan-cn-nlp
2 | ============
3 |
4 | An API wrapper based on the Stanford NLP packages for the convenience of
5 | Chinese users. This package is based on the stan-cn-* family:
6 |
7 | * stan-cn-com: Common code base
8 | * stan-cn-seg: Chinese segmentation and related data model
9 | * stan-cn-ner: Named entity recognition and related data model
10 | * stan-cn-tag: POS tagging and related data model
11 |
12 | This package bundles segmentation, NER and POS tagging together. If you only
13 | need one of them, you can use stan-cn-seg, stan-cn-ner or stan-cn-tag separately.
14 |
15 | Purpose of the packages
16 | ------------------------
17 |
18 | The original Stanford CoreNLP packages in Maven Central ship with default
19 | language settings for English only. If you are dealing with Simplified Chinese,
20 | you still need to download the Chinese models and fix some configuration files.
21 |
22 | The burden is not too much, but if you deploy these packages to a server
23 | cluster, this burden might be amplified.
24 |
25 | Whether you run a single node or a server farm, it is convenient to have
26 | packages that come with default settings for the Chinese language models.
27 | That is what we provide.
28 |
29 | Comments, reviews, bug reports and patches are welcome.
30 |
31 | Current version
32 | ----------------
33 |
34 | The current version is 0.0.4, based on Stanford CoreNLP 3.2.0 with minor fixes.
35 |
36 | Add the dependency as shown below:
37 |
38 | * maven:
39 | ```xml
40 | <dependency>
41 |   <groupId>com.guokr</groupId>
42 |   <artifactId>stan-cn-nlp</artifactId>
43 |   <version>0.0.4</version>
44 | </dependency>
45 | ```
46 | * leiningen:
47 | ```clojure
48 | [com.guokr/stan-cn-nlp "0.0.4"]
49 | ```
50 | * sbt:
51 | ```scala
52 | libraryDependencies += "com.guokr" % "stan-cn-nlp" % "0.0.4"
53 | ```
54 |
55 | Simplified API
56 | ---------------
57 |
58 | We use a very simple API to reduce the complexity.
59 |
60 | ```java
61 | new SegWrapper(settings).segment(text);
62 | new NerWrapper(settings).recognize(text);
63 | new TagWrapper(settings).tag(text);
64 | ```
65 |
66 | Or if you want to use the default language models, just use
67 |
68 | ```java
69 | __PKG__.INSTANCE.segment(text);
70 | __PKG__.INSTANCE.recognize(text);
71 | __PKG__.INSTANCE.tag(text);
72 | ```
73 |
74 | The command line tool
75 | ----------------------
76 |
77 | Follow the steps below to try it out:
78 |
79 | * git clone git://github.com/guokr/stan-cn-nlp.git
80 | * cd stan-cn-nlp
81 | * mvn package
82 | * java -Xms1g -Xmx2g -jar target/stan-cn-nlp-0.0.5-SNAPSHOT-standalone.jar seg "大江东去浪淘尽"
83 | * java -Xms1g -Xmx2g -jar target/stan-cn-nlp-0.0.5-SNAPSHOT-standalone.jar ner "大江东去浪淘尽"
84 | * java -Xms1g -Xmx2g -jar target/stan-cn-nlp-0.0.5-SNAPSHOT-standalone.jar tag "大江东去浪淘尽"
85 |
86 | Preparation for release
87 | ------------------------
88 |
89 | Before releasing this package to Maven Central, run the commands below:
90 |
91 | * mvn clean source:jar javadoc:jar package
92 | * export MAVEN_OPTS=-Xmx2048m
93 | * mvn release:clean
94 | * mvn release:prepare
95 | * mvn release:perform
96 |
97 | Authors
98 | --------
99 |
100 | * Mingli Yuan ( https://github.com/mountain )
101 | * Rui Wang ( https://github.com/isnowfy )
102 | * Wanjian Wu ( https://github.com/jseagull )
103 |
104 | License
105 | --------
106 |
107 | GPLv2, the same license as the Stanford CoreNLP package.
108 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.guokr
5 | stan-cn-nlp
6 | 0.0.5-SNAPSHOT
7 | jar
8 | stan-cn-nlp
9 | An API wrapper based on Stanford Core NLP package for Simplified Chinese users
10 | https://github.com/guokr/stan-cn-nlp
11 |
12 |
13 | GNU General Public License Version 2
14 | http://www.gnu.org/licenses/gpl-2.0.txt
15 |
16 |
17 |
18 | https://github.com/guokr/stan-cn-nlp
19 | scm:git:git://github.com/guokr/stan-cn-nlp.git
20 | scm:git:ssh://git@github.com/guokr/stan-cn-nlp.git
21 |
22 |
23 |
24 | mingli.yuan
25 | Mingli Yuan
26 | mingli.yuan@gmail.com
27 |
28 |
29 | isnowify
30 | Rui Wang
31 | isnowfy@gmail.com
32 |
33 |
34 |
35 | org.sonatype.oss
36 | oss-parent
37 | 7
38 |
39 |
40 | 1.6
41 | 1.5
42 | UTF-8
43 | UTF-8
44 |
45 |
46 |
47 | com.guokr
48 | stan-cn-com
49 | 0.0.5-SNAPSHOT
50 |
51 |
52 | com.guokr
53 | stan-cn-seg
54 | 0.0.5-SNAPSHOT
55 |
56 |
57 | com.guokr
58 | stan-cn-ner
59 | 0.0.5-SNAPSHOT
60 |
61 |
62 | com.guokr
63 | stan-cn-tag
64 | 0.0.5-SNAPSHOT
65 |
66 |
67 | junit
68 | junit
69 | 4.7
70 | test
71 |
72 |
73 |
74 | src/main/java
75 | src/test/java
76 |
77 |
78 | src/main/resources
79 |
80 |
81 |
82 |
83 | src/main/resources
84 |
85 |
86 | target
87 | target/classes
88 |
89 |
90 | org.apache.maven.plugins
91 | maven-compiler-plugin
92 | 3.0
93 |
94 | 1.5
95 | 1.5
96 |
97 |
98 |
99 | org.codehaus.mojo
100 | build-helper-maven-plugin
101 | 1.7
102 |
103 |
104 | add-source
105 | generate-sources
106 |
107 | add-source
108 |
109 |
110 |
111 | src/main/java
112 |
113 |
114 |
115 |
116 | add-test-source
117 | generate-test-sources
118 |
119 | add-test-source
120 |
121 |
122 |
123 | src/test/java
124 |
125 |
126 |
127 |
128 |
129 |
130 | org.apache.maven.plugins
131 | maven-surefire-plugin
132 | 2.12.4
133 |
134 |
135 | org.apache.maven.surefire
136 | surefire-junit47
137 | 2.12.4
138 |
139 |
140 |
141 |
142 | **/*Tests.java
143 |
144 | once
145 | -Xms2g -Xmx2g
146 |
147 |
148 |
149 | org.apache.maven.plugins
150 | maven-shade-plugin
151 | 2.0
152 |
153 |
154 | package
155 |
156 | shade
157 |
158 |
159 | false
160 | true
161 | stan-cn-nlp-${project.version}-standalone
162 |
163 |
164 |
165 | com.guokr.nlp.GkNlpCli
166 | 1
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
--------------------------------------------------------------------------------
/src/main/java/com/guokr/nlp/GkNlpCli.java:
--------------------------------------------------------------------------------
1 | package com.guokr.nlp;
2 |
3 | import java.lang.reflect.Method;
4 | import java.io.PrintStream;
5 |
6 | public class GkNlpCli {
7 |
8 | public static void main(String[] args) {
9 | __PKG__ pkg = __PKG__.INSTANCE;
10 | if(args.length < 2) {
11 | usage();
12 | } else {
13 | String subcmd = args[0];
14 | String text = args[1];
15 | if(subcmd.equals("seg")) {
16 | try {
17 | String segText = pkg.segment(text);
18 | System.out.println(segText);
19 | } catch (Exception e) {
20 | System.out.println(e);
21 | e.printStackTrace(System.err);
22 | }
23 | } else if(subcmd.equals("ner")) {
24 | try {
25 | String nerText = pkg.recognize(text);
26 | System.out.println(nerText);
27 | } catch (Exception e) {
28 | System.out.println(e);
29 | e.printStackTrace(System.err);
30 | }
31 | } else if(subcmd.equals("tag")) {
32 | try {
33 | String tagText = pkg.tag(text);
34 | System.out.println(tagText);
35 | } catch (Exception e) {
36 | System.out.println(e);
37 | e.printStackTrace(System.err);
38 | }
39 | }
40 | }
41 |
42 | }
43 |
44 | private static void usage() {
45 | PrintStream out = System.out;
46 | out.println("gknlp [command] [text]");
47 | out.println("\tcommands:");
48 | out.println("\t\tseg: segment the text into words");
49 | out.println("\t\tner: recognize the named entity in the text");
50 | out.println("\t\ttag: tag the text into words with syntax information");
51 | out.println("");
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/com/guokr/nlp/__PKG__.java:
--------------------------------------------------------------------------------
1 | package com.guokr.nlp;
2 |
3 | public enum __PKG__ {
4 |
5 | INSTANCE;
6 |
7 | public String segment(String text) {
8 | return com.guokr.nlp.seg.__SEG__.INSTANCE.segment(text);
9 | }
10 |
11 | public String recognize(String text) {
12 | return com.guokr.nlp.ner.__NER__.INSTANCE.recognize(text);
13 | }
14 |
15 | public String tag(String text) {
16 | return com.guokr.nlp.tag.__TAG__.INSTANCE.tag(text);
17 | }
18 |
19 | }
20 |
21 |
--------------------------------------------------------------------------------
/src/test/java/com/guokr/nlp/test/BasicTests.java:
--------------------------------------------------------------------------------
1 | package com.guokr.nlp.test;
2 |
3 | import java.lang.reflect.Method;
4 |
5 | import static org.junit.Assert.assertEquals;
6 |
7 | import org.junit.Test;
8 | import org.junit.runner.RunWith;
9 | import org.junit.runners.JUnit4;
10 |
11 | import com.guokr.nlp.__PKG__;
12 |
13 | @RunWith(JUnit4.class)
14 | public class BasicTests {
15 |
16 | @Test
17 | public void testSeg() throws Exception {
18 | String segText = __PKG__.INSTANCE.segment("这是个测试");
19 | assertEquals("这 是 个 测试", segText);
20 | }
21 |
22 | @Test
23 | public void testNer() throws Exception {
24 | String nerText = __PKG__.INSTANCE.recognize("这是个测试");
25 | assertEquals("这/O 是/O 个/O 测试/O", nerText);
26 | }
27 |
28 | @Test
29 | public void testTag() throws Exception {
30 | String tagText = __PKG__.INSTANCE.tag("这是个测试");
31 | assertEquals("这#PN 是#VC 个#M 测试#NN", tagText);
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------