├── MainPartExtractor.iml
├── README.md
├── pom.xml
└── src
└── main
├── java
└── com
│ └── hankcs
│ └── nlp
│ └── lex
│ ├── MainPart.java
│ └── MainPartExtractor.java
└── resources
├── logback.xml
└── models
├── chineseFactored.ser
└── chinesePCFG.ser
/MainPartExtractor.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | MainPartExtractor
2 | =================
3 |
4 | 主谓宾提取器的Java实现
5 |
6 | 提取中文句子主干
7 | --
8 |
9 | - 调用方法
10 |
11 | ```java
12 | public static void main(String[] args)
13 | {
14 | String[] testCaseArray = {
15 | "我一直很喜欢你",
16 | "你被我喜欢",
17 | "美丽又善良的你被卑微的我深深的喜欢着……",
18 | "只有自信的程序员才能把握未来",
19 | "主干识别可以提高检索系统的智能",
20 | "这个项目的作者是hankcs",
21 | "hankcs是一个无门无派的浪人",
22 | "搜索hankcs可以找到我的博客",
23 | "静安区体育局2013年部门决算情况说明",
24 | "这类算法在有限的一段时间内终止",
25 | };
26 | for (String testCase : testCaseArray)
27 | {
28 | MainPart mp = MainPartExtractor.getMainPart(testCase);
29 | System.out.printf("%s\t%s\n", testCase, mp);
30 | }
31 | }
32 | ```
33 | - 编译说明
34 |
35 | 请使用Maven编译,会自动下载依赖项。
36 | - 算法详解
37 |
38 | 利用依存关系可以提取句子的主要成分(也就是小学和公务员考试中出现的“提取主干”),可以实现语义上的智能理解。
39 | 详见[《提取中文句子主谓宾的Java实现》][1]
40 |
41 | [1]: http://www.hankcs.com/nlp/chinese-sentences-svo-java-extraction.html
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.hankcs
6 | MainPartExtractor
7 | 1.0
8 | war
9 |
10 | MainPartExtractor
11 | http://www.hankcs.com/
12 |
13 | 主谓宾提取工具
14 |
15 |
16 | 码农场
17 | http://www.hankcs.com/
18 |
19 |
20 |
21 | GNU GENERAL PUBLIC LICENSE, Version 3
22 | http://www.gnu.org/licenses/gpl.html
23 |
24 |
25 | 2014
26 |
27 |
28 | hankcs
29 | me@hankcs.com
30 | http://www.hankcs.com
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 | org.apache.maven.plugins
39 | maven-compiler-plugin
40 | ${maven-compiler-plugin.version}
41 |
42 | ${project.build.sourceEncoding}
43 | ${java.version}
44 | ${java.version}
45 | true
46 | true
47 | true
48 |
49 |
50 |
51 |
52 | org.apache.maven.plugins
53 | maven-jar-plugin
54 | ${maven-jar-plugin.version}
55 |
56 |
57 |
58 | org.apache.maven.plugins
59 | maven-resources-plugin
60 | ${maven-resources-plugin.version}
61 |
62 | ${project.build.sourceEncoding}
63 |
64 |
65 |
66 |
67 |
68 | 1.7
69 | UTF-8
70 |
71 | 3.0
72 | 2.4
73 | 2.14
74 | 2.6
75 | 2.2.1
76 | 6.1.26
77 | 3.5
78 |
79 | 6.0
80 | 3.8.1
81 | 1.7.2
82 | 1.6.4
83 | 0.9.28
84 | 3.1
85 | 1.4.1
86 | chargebee-1.0
87 | 1.9.19
88 | 3.3.1
89 | 1.6.4
90 | 1.9.13
91 |
92 | 5.1.18
93 |
94 |
95 |
96 | mvn-repo
97 | http://maven.ansj.org/
98 |
99 |
100 |
101 |
102 | javax
103 | javaee-web-api
104 | ${javaee-web-api.version}
105 | provided
106 |
107 |
108 | junit
109 | junit
110 | ${junit.version}
111 | test
112 |
113 |
114 |
115 |
116 | org.slf4j
117 | slf4j-api
118 | ${slf4j-api.version}
119 |
120 |
121 |
122 | ch.qos.logback
123 | logback-classic
124 | ${logback-classic.version}
125 |
126 |
127 | commons-logging
128 | commons-logging
129 |
130 |
131 |
132 |
133 | com.hankcs
134 | hanlp
135 | portable-1.2.4
136 |
137 |
138 | edu.stanford.nlp
139 | stanford-parser
140 | ${stanford-parser.version}
141 |
142 |
143 |
144 | org.slf4j
145 | jcl-over-slf4j
146 | ${jcl-over-slf4j.version}
147 |
148 |
149 |
150 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/nlp/lex/MainPart.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.nlp.lex;
2 |
3 | import edu.stanford.nlp.trees.TreeGraphNode;
4 |
5 | /**
6 | * 对主谓宾结果的封装
7 | * @author hankcs
8 | */
9 | public class MainPart
10 | {
11 | /**
12 | * 主语
13 | */
14 | public TreeGraphNode subject;
15 | /**
16 | * 谓语
17 | */
18 | public TreeGraphNode predicate;
19 | /**
20 | * 宾语
21 | */
22 | public TreeGraphNode object;
23 |
24 | /**
25 | * 结果
26 | */
27 | public String result;
28 |
29 | public MainPart(TreeGraphNode subject, TreeGraphNode predicate, TreeGraphNode object)
30 | {
31 | this.subject = subject;
32 | this.predicate = predicate;
33 | this.object = object;
34 | }
35 |
36 | public MainPart(TreeGraphNode predicate)
37 | {
38 | this(null, predicate, null);
39 | }
40 |
41 | public MainPart()
42 | {
43 | result = "";
44 | }
45 |
46 | /**
47 | * 结果填充完成
48 | */
49 | public void done()
50 | {
51 | result = predicate.toString("value");
52 | if (subject != null)
53 | {
54 | result = subject.toString("value") + result;
55 | }
56 | if (object != null)
57 | {
58 | result = result + object.toString("value");
59 | }
60 | }
61 |
62 | public boolean isDone()
63 | {
64 | return result != null;
65 | }
66 |
67 | @Override
68 | public String toString()
69 | {
70 | if (result != null) return result;
71 | return "MainPart{" +
72 | "主语='" + subject + '\'' +
73 | ", 谓语='" + predicate + '\'' +
74 | ", 宾语='" + object + '\'' +
75 | '}';
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/nlp/lex/MainPartExtractor.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.nlp.lex;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import com.hankcs.hanlp.seg.common.Term;
5 | import edu.stanford.nlp.ling.Word;
6 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
7 | import edu.stanford.nlp.trees.*;
8 | import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack;
9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 |
12 | import java.util.*;
13 |
14 | /**
15 | * 提取主谓宾
16 | *
17 | * @author hankcs
18 | */
19 | public class MainPartExtractor
20 | {
21 |
22 | private static final Logger LOG = LoggerFactory.getLogger(MainPartExtractor.class);
23 | private static LexicalizedParser lp;
24 | private static GrammaticalStructureFactory gsf;
25 | static
26 | {
27 | //模型
28 | String models = "models/chineseFactored.ser";
29 | LOG.info("载入文法模型:" + models);
30 | lp = LexicalizedParser.loadModel(models);
31 | //汉语
32 | TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
33 | gsf = tlp.grammaticalStructureFactory();
34 | }
35 |
36 | /**
37 | * 获取句子的主谓宾
38 | *
39 | * @param sentence 问题
40 | * @return 问题结构
41 | */
42 | public static MainPart getMainPart(String sentence)
43 | {
44 | // 去掉不可见字符
45 | sentence = sentence.replace("\\s+", "");
46 | // 分词,用空格隔开
47 | List wordList = seg(sentence);
48 | return getMainPart(wordList);
49 | }
50 |
51 | /**
52 | * 获取句子的主谓宾
53 | *
54 | * @param words HashWord列表
55 | * @return 问题结构
56 | */
57 | public static MainPart getMainPart(List words)
58 | {
59 | MainPart mainPart = new MainPart();
60 | if (words == null || words.size() == 0) return mainPart;
61 | Tree tree = lp.apply(words);
62 | LOG.info("句法树:{}", tree.pennString());
63 | // 根据整个句子的语法类型来采用不同的策略提取主干
64 | switch (tree.firstChild().label().toString())
65 | {
66 | case "NP":
67 | // 名词短语,认为只有主语,将所有短NP拼起来作为主语即可
68 | mainPart = getNPPhraseMainPart(tree);
69 | break;
70 | default:
71 | GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
72 | Collection tdls = gs.typedDependenciesCCprocessed(true);
73 | LOG.info("依存关系:{}", tdls);
74 | TreeGraphNode rootNode = getRootNode(tdls);
75 | if (rootNode == null)
76 | {
77 | return getNPPhraseMainPart(tree);
78 | }
79 | LOG.info("中心词语:", rootNode);
80 | mainPart = new MainPart(rootNode);
81 | for (TypedDependency td : tdls)
82 | {
83 | // 依存关系的出发节点,依存关系,以及结束节点
84 | TreeGraphNode gov = td.gov();
85 | GrammaticalRelation reln = td.reln();
86 | String shortName = reln.getShortName();
87 | TreeGraphNode dep = td.dep();
88 | if (gov == rootNode)
89 | {
90 | switch (shortName)
91 | {
92 | case "nsubjpass":
93 | case "dobj":
94 | case "attr":
95 | mainPart.object = dep;
96 | break;
97 | case "nsubj":
98 | case "top":
99 | mainPart.subject = dep;
100 | break;
101 | }
102 | }
103 | if (mainPart.object != null && mainPart.subject != null)
104 | {
105 | break;
106 | }
107 | }
108 | // 尝试合并主语和谓语中的名词性短语
109 | combineNN(tdls, mainPart.subject);
110 | combineNN(tdls, mainPart.object);
111 | if (!mainPart.isDone()) mainPart.done();
112 | }
113 |
114 | return mainPart;
115 | }
116 |
117 | private static MainPart getNPPhraseMainPart(Tree tree)
118 | {
119 | MainPart mainPart = new MainPart();
120 | StringBuilder sbResult = new StringBuilder();
121 | List phraseList = getPhraseList("NP", tree);
122 | for (String phrase : phraseList)
123 | {
124 | sbResult.append(phrase);
125 | }
126 | mainPart.result = sbResult.toString();
127 | return mainPart;
128 | }
129 |
130 | /**
131 | * 从句子中提取最小粒度的短语
132 | * @param type
133 | * @param sentence
134 | * @return
135 | */
136 | public static List getPhraseList(String type, String sentence)
137 | {
138 | return getPhraseList(type, lp.apply(seg(sentence)));
139 | }
140 |
141 | private static List getPhraseList(String type, Tree tree)
142 | {
143 | List phraseList = new LinkedList();
144 | for (Tree subtree : tree)
145 | {
146 | if(subtree.isPrePreTerminal() && subtree.label().value().equals(type))
147 | {
148 | StringBuilder sbResult = new StringBuilder();
149 | for (Tree leaf : subtree.getLeaves())
150 | {
151 | sbResult.append(leaf.value());
152 | }
153 | phraseList.add(sbResult.toString());
154 | }
155 | }
156 | return phraseList;
157 | }
158 |
159 | /**
160 | * 合并名词性短语为一个节点
161 | * @param tdls 依存关系集合
162 | * @param target 目标节点
163 | */
164 | private static void combineNN(Collection tdls, TreeGraphNode target)
165 | {
166 | if (target == null) return;
167 | for (TypedDependency td : tdls)
168 | {
169 | // 依存关系的出发节点,依存关系,以及结束节点
170 | TreeGraphNode gov = td.gov();
171 | GrammaticalRelation reln = td.reln();
172 | String shortName = reln.getShortName();
173 | TreeGraphNode dep = td.dep();
174 | if (gov == target)
175 | {
176 | switch (shortName)
177 | {
178 | case "nn":
179 | target.setValue(dep.toString("value") + target.value());
180 | return;
181 | }
182 | }
183 | }
184 | }
185 |
186 | private static TreeGraphNode getRootNode(Collection tdls)
187 | {
188 | for (TypedDependency td : tdls)
189 | {
190 | if (td.reln() == GrammaticalRelation.ROOT)
191 | {
192 | return td.dep();
193 | }
194 | }
195 |
196 | return null;
197 | }
198 |
199 | /**
200 | * 分词
201 | *
202 | * @param sentence 句子
203 | * @return 分词结果
204 | */
205 | private static List seg(String sentence)
206 | {
207 | //分词
208 | LOG.info("正在对短句进行分词:" + sentence);
209 | List wordList = new LinkedList<>();
210 | List terms = HanLP.segment(sentence);
211 | StringBuffer sbLogInfo = new StringBuffer();
212 | for (Term term : terms)
213 | {
214 | Word word = new Word(term.word);
215 | wordList.add(word);
216 | sbLogInfo.append(word);
217 | sbLogInfo.append(' ');
218 | }
219 | LOG.info("分词结果为:" + sbLogInfo);
220 | return wordList;
221 | }
222 |
223 | public static MainPart getMainPart(String sentence, String delimiter)
224 | {
225 | List wordList = new LinkedList<>();
226 | for (String word : sentence.split(delimiter))
227 | {
228 | wordList.add(new Word(word));
229 | }
230 | return getMainPart(wordList);
231 | }
232 |
233 | /**
234 | * 调用演示
235 | * @param args
236 | */
237 | public static void main(String[] args)
238 | {
239 | String[] testCaseArray = {
240 | "我一直很喜欢你",
241 | "你被我喜欢",
242 | "美丽又善良的你被卑微的我深深的喜欢着……",
243 | "只有自信的程序员才能把握未来",
244 | "主干识别可以提高检索系统的智能",
245 | "这个项目的作者是hankcs",
246 | "hankcs是一个无门无派的浪人",
247 | "搜索hankcs可以找到我的博客",
248 | "静安区体育局2013年部门决算情况说明",
249 | "这类算法在有限的一段时间内终止",
250 | };
251 | for (String testCase : testCaseArray)
252 | {
253 | MainPart mp = MainPartExtractor.getMainPart(testCase);
254 | System.out.printf("%s\t%s\n", testCase, mp);
255 | }
256 | }
257 | }
--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | %m%n
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/main/resources/models/chineseFactored.ser:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hankcs/MainPartExtractor/d00f014294242e3b0a70719c761979fddc87f59e/src/main/resources/models/chineseFactored.ser
--------------------------------------------------------------------------------
/src/main/resources/models/chinesePCFG.ser:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hankcs/MainPartExtractor/d00f014294242e3b0a70719c761979fddc87f59e/src/main/resources/models/chinesePCFG.ser
--------------------------------------------------------------------------------