├── .gitignore ├── md ├── components │ ├── sentiment_analysis.md │ ├── coreference_resolution.md │ ├── semantic_role_labeling.md │ ├── named_entity_recognition.md │ ├── part_of_speech_tagging.md │ └── dependency_parsing.md ├── quickstart │ ├── milestones.md │ ├── install.md │ ├── decode.md │ ├── release.md │ └── train.md ├── tutorial │ ├── depstate.md │ ├── tensorflow.md │ ├── maven.md │ ├── processing_state.md │ ├── nlp_component.md │ ├── create_nlp_component.md │ └── tree_based_nlp_component.md └── supplements │ ├── data-format.md │ └── english-lexica-models.md ├── .settings ├── org.eclipse.wst.validation.prefs ├── org.eclipse.m2e.core.prefs ├── org.eclipse.core.resources.prefs ├── org.eclipse.wst.common.project.facet.core.xml ├── org.eclipse.wst.common.component └── org.eclipse.jdt.core.prefs ├── src ├── test │ ├── resources │ │ ├── dat │ │ │ ├── nlp4j.txt │ │ │ ├── nlp4j.txt.nlp │ │ │ ├── sample-dev.tsv │ │ │ └── sample-trn.tsv │ │ ├── log4j.properties │ │ └── decoder-test-config.xml │ └── java │ │ └── edu │ │ └── emory │ │ └── mathcs │ │ └── nlp │ │ └── decode │ │ └── AbstractNLPDecoderTest.java └── main │ ├── resources │ └── edu │ │ └── emory │ │ └── mathcs │ │ └── nlp │ │ └── configuration │ │ ├── log4j.properties │ │ ├── config-decode-pos.xml │ │ ├── config-decode-en.xml │ │ ├── config-train-sample.xml │ │ ├── config-train-sample-optimized.xml │ │ ├── config-train-doc.xml │ │ ├── config-train-ner.xml │ │ ├── config-train-pos.xml │ │ └── config-train-dep.xml │ └── java │ └── edu │ └── emory │ └── mathcs │ └── nlp │ ├── bin │ ├── Version.java │ ├── NLPDemo.java │ ├── NLPDecode.java │ └── DEPEvaluate.java │ ├── decode │ ├── NLPDecoder.java │ ├── NLPUtils.java │ ├── DecodeConfig.java │ └── AbstractNLPDecoder.java │ ├── component │ └── morph │ │ └── MorphologicalAnalyzer.java │ ├── zzz │ ├── RadiologyDecode.java │ ├── TokenizeIt.java │ ├── CSVRadiology.java │ ├── CSVSentiment.java │ └── NEGazetteerCreate.java │ └── network │ ├── NLPSocketClient.java │ └── NLPSocketServer.java ├── LICENSE.txt ├── README.md └── pom.xml /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | target 4 | -------------------------------------------------------------------------------- /md/components/sentiment_analysis.md: -------------------------------------------------------------------------------- 1 | # Sentiment Analysis 2 | -------------------------------------------------------------------------------- /md/components/coreference_resolution.md: -------------------------------------------------------------------------------- 1 | # Coreference Resolution 2 | -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.validation.prefs: -------------------------------------------------------------------------------- 1 | disabled=06target 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /src/test/resources/dat/nlp4j.txt: -------------------------------------------------------------------------------- 1 | The NLP4J project provides a NLP toolkit for JVM languages. 
This project is under the Apache 2 license and is currently developed by the NLP Research Group at Emory University. -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/main/resources=UTF-8 4 | encoding//src/test/java=UTF-8 5 | encoding//src/test/resources=UTF-8 6 | encoding/=UTF-8 7 | -------------------------------------------------------------------------------- /md/quickstart/milestones.md: -------------------------------------------------------------------------------- 1 | # Milestones 2 | 3 | ## Version 1.1.0 (03/31/2016) 4 | 5 | * Word2Vec and Struct2Vec. 6 | * Semantic role labeling. 7 | * Sentiment analysis. 8 | 9 | ## Version 1.2.0 (04/30/2016) 10 | 11 | * Coreference resolution. 12 | -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.common.project.facet.core.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.common.component: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=INFO, A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.conversionPattern=%m 10 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=INFO, A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 
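# The conversion pattern below emits only the raw message (%m), with no timestamp, level, or trailing newline.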
8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.conversionPattern=%m 10 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 4 | org.eclipse.jdt.core.compiler.compliance=1.8 5 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 6 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 7 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 8 | org.eclipse.jdt.core.compiler.source=1.8 9 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2015, Emory University 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-pos.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | edu/emory/mathcs/nlp/lexica/en-ambiguity-classes-simplified-lowercase.xz 8 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 9 | 10 | 11 | 12 | edu/emory/mathcs/nlp/models/en-pos.xz 13 | 14 | 15 | -------------------------------------------------------------------------------- /md/components/semantic_role_labeling.md: -------------------------------------------------------------------------------- 1 | # Semantic Role Labeling 2 | 3 | Our semantic role labeler uses a higher-order argument pruning algorithm that significantly improves recall from the first-order argument pruning algorithm, yet keeps a similar labeling complexity in practice. Our labeler takes about 0.45 milliseconds for labeling all arguments of each predicate on an Intel Xeon 2.57GHz machine and shows state-of-the-art accuracy compared to other dependency-based labeling approaches. 4 | 5 | * [Transition-based Semantic Role Labeling Using Predicate Argument Clustering](http://aclweb.org/anthology/W11-0906), Jinho D. Choi, Martha Palmer, In Proceedings of the ACL Workshop on Relational Models of Semantics (RELMS'11), 37–45, 2011. 
6 | -------------------------------------------------------------------------------- /src/test/resources/decoder-test-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | edu/emory/mathcs/nlp/lexica/en-ambiguity-classes-simplified-lowercase.xz 8 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 9 | edu/emory/mathcs/nlp/lexica/en-word-embeddings-undigitalized.xz 10 | 11 | 12 | 13 | edu/emory/mathcs/nlp/models/en-pos.xz 14 | 15 | 16 | -------------------------------------------------------------------------------- /md/tutorial/depstate.md: -------------------------------------------------------------------------------- 1 | # DEPState 2 | 3 | [`DEPState`](../../java/edu/emory/mathcs/nlp/component/dep/DEPState.java) implements the parsing algorithm and holds the parsing states processed by the algorithm. 4 | 5 | ## Initialization 6 | 7 | ```java 8 | public DEPStateArcEager(N[] nodes) 9 | { 10 | super(nodes); 11 | 12 | stack = new IntArrayList(); 13 | input = 0; 14 | } 15 | ``` 16 | 17 | ## Save Oracle 18 | 19 | The oracle is saved as a list of [`DEPArc`](../../java/edu/emory/mathcs/nlp/component/dep/DEPArc.java). `oracle[0]` is preserved for the artificial root node and the rest holds the gold-standard head information for each node. 20 | 21 | 22 | ```java 23 | @Override 24 | public void saveOracle() 25 | { 26 | oracle = Arrays.stream(nodes).map(n -> n.clearDependencies()).toArray(DEPArc[]::new); 27 | } 28 | ``` 29 | 30 | # Arc-Eager 31 | 32 | implements the arc-eager algorithm ([Nivre 2008](http://www.mitpressjournals.org/doi/pdf/10.1162/coli.07-056-R1-07-027), Section 4.2), that is the most widely used projective parsing algorithm. -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | edu/emory/mathcs/nlp/lexica/en-ambiguity-classes-simplified-lowercase.xz 8 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 9 | edu/emory/mathcs/nlp/lexica/en-word-embeddings-undigitalized.xz 10 | edu/emory/mathcs/nlp/lexica/en-named-entity-gazetteers-simplified.xz 11 | 12 | 13 | 14 | edu/emory/mathcs/nlp/models/en-pos.xz 15 | edu/emory/mathcs/nlp/models/en-ner.xz 16 | edu/emory/mathcs/nlp/models/en-dep.xz 17 | 18 | 19 | -------------------------------------------------------------------------------- /md/tutorial/tensorflow.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | * Install [python 3.x](https://www.python.org). 
4 | * Install [pip](https://pip.pypa.io): 5 | 6 | ```bash 7 | # https://bootstrap.pypa.io/get-pip.py 8 | python3 get-pip.py 9 | ``` 10 | 11 | * Install [virtualenv](https://pypi.python.org/pypi/virtualenv): 12 | 13 | ```bash 14 | sudo pip3 install --upgrade virtualenv 15 | ``` 16 | 17 | 18 | * Create a virtual environment in the directory, `vnlp`: 19 | 20 | ```bash 21 | virtualenv --system-site-packages vnlp 22 | ``` 23 | 24 | * Activate the `vnlp` environment: 25 | 26 | ```bash 27 | source vnlp/bin/activate 28 | ``` 29 | 30 | * Install [tensorflow](https://www.tensorflow.org) under `vnlp`: 31 | 32 | ```bash 33 | # linux 34 | pip3 install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.8.0-cp27-none-linux_x86_64.whl 35 | # mac 36 | pip3 install --upgrade https://storage.googleapis.com/tensorflow/mac/tensorflow-0.8.0-py3-none-any.whl 37 | ``` 38 | 39 | * Install [scipy](pip3 install scipy) and [scikit-learn](http://scikit-learn.org) under `vnlp`: 40 | 41 | ``` 42 | pip3 install -U scipy 43 | pip3 install -U scikit-learn 44 | ``` -------------------------------------------------------------------------------- /src/test/resources/dat/nlp4j.txt.nlp: -------------------------------------------------------------------------------- 1 | 1 The the DT _ 3 det _ O 2 | 2 NLP4J nlp0j NNP pos2=NN 3 compound _ U-ORG 3 | 3 project project NN _ 4 nsubj _ O 4 | 4 provides provide VBZ _ 0 root _ O 5 | 5 a a DT _ 7 det _ O 6 | 6 NLP nlp NN pos2=NNP 7 compound _ O 7 | 7 toolkit toolkit NN _ 4 dobj _ O 8 | 8 for for IN _ 7 prep _ O 9 | 9 JVM jvm NN pos2=NNP 10 compound _ U-ORG 10 | 10 languages language NNS _ 8 pobj _ O 11 | 11 . . . _ 4 punct _ O 12 | 13 | 1 This this DT _ 2 det _ O 14 | 2 project project NN _ 3 nsubj _ O 15 | 3 is be VBZ _ 0 root _ O 16 | 4 under under IN _ 3 prep _ O 17 | 5 the the DT _ 8 det _ O 18 | 6 Apache apache NNP pos2=NN 8 nmod _ O 19 | 7 2 0 CD pos2=NNP 6 nmod _ O 20 | 8 license license NN pos2=NNS 4 pobj _ O 21 | 9 and and CC _ 3 cc _ O 22 | 10 is be VBZ _ 12 auxpass _ O 23 | 11 currently currently RB _ 12 advmod _ O 24 | 12 developed develop VBN _ 3 conj _ O 25 | 13 by by IN _ 12 agent _ O 26 | 14 the the DT _ 17 det _ B-ORG 27 | 15 NLP nlp NNP _ 17 compound _ I-ORG 28 | 16 Research research NNP _ 17 compound _ I-ORG 29 | 17 Group group NNP _ 13 pobj _ L-ORG 30 | 18 at at IN _ 17 prep _ O 31 | 19 Emory emory NNP _ 20 compound _ B-ORG 32 | 20 University university NNP _ 18 pobj _ L-ORG 33 | 21 . . . _ 3 punct _ O 34 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/bin/Version.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.bin; 17 | 18 | /** 19 | * @author Jinho D. 
Choi ({@code jinho.choi@emory.edu}) 20 | */ 21 | public class Version 22 | { 23 | public static void main(String[] args) 24 | { 25 | System.out.println("===================================="); 26 | System.out.println("NLP4J Version 1.1.2"); 27 | System.out.println("Webpage: http://nlp.mathcs.emory.edu"); 28 | System.out.println("Contact: choi@mathcs.emory.edu"); 29 | System.out.println("===================================="); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/decode/NLPDecoder.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.decode; 17 | 18 | import java.io.InputStream; 19 | 20 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 21 | 22 | /** 23 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 24 | */ 25 | public class NLPDecoder extends AbstractNLPDecoder 26 | { 27 | public NLPDecoder() {super();} 28 | 29 | public NLPDecoder(DecodeConfig config) 30 | { 31 | super(config); 32 | } 33 | 34 | public NLPDecoder(InputStream configuration) 35 | { 36 | super(new DecodeConfig(configuration)); 37 | } 38 | 39 | @Override 40 | public NLPNode create() 41 | { 42 | return new NLPNode(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /md/tutorial/maven.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Run a Java class using Maven 4 | 5 | * Specify the [JVM options](http://www.oracle.com/technetwork/articles/java/vmoptions-jsp-140102.html) in Maven. If you are using [Bash](https://www.gnu.org/software/bash/), export `MAVEN_OPTS`: 6 | 7 | ``` 8 | export MAVEN_OPTS='-Xmx8g -XX:+UseConcMarkSweepGC -XX:MaxPermSize=128m' 9 | ``` 10 | 11 | * Compile the Java project using Maven by running the following command from the top directory, where the [`pom.xml`](../../pom.xml) is located. The `target/classes` directory should be created after running this command if it does not already exist. 12 | 13 | ``` 14 | mvn compile 15 | ``` 16 | 17 | * Copy [`log4j.properties`](../../src/main/resources/configuration/log4j.properties) to `target/classes` if it is not already specified in your path. 18 | 19 | * Run an executable Java class using `mvn exec:java`. For instance, the following command executes [`POSTrain`](../../src/main/java/edu/emory/mathcs/nlp/bin/POSTrain.java) (see [part-of-speech tagging](../component/part_of_speech_tagging.md#training) for more details about the command). Note that the base filenames are used in this example, but use the filenames with their absolute paths if they are not getting recognized. 
20 | 21 | ``` 22 | mvn exec:java -Dexec.mainClass="edu.emory.mathcs.nlp.bin.POSTrain" -Dexec.args="-c config_train_pos.xml -t wsj_0001.dep -d wsj_0001.dep" 23 | ``` 24 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-sample.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 0.00001 13 | adagrad-mini-batch 14 | 0.02 15 | 0 16 | 17 | 1 18 | 5 19 | 0 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-sample-optimized.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 0.00001 13 | adagrad-mini-batch 14 | 0.02 15 | 0 16 | 17 | 1 18 | 3 19 | 0 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-doc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | /Users/jdchoi/Documents/EmoryNLP/nlp4j-english/src/main/resources/edu/emory/mathcs/nlp/lexica/en-word-embeddings-undigitalized.xz 14 | 15 | 16 | r3 17 | 18 | 19 | adagrad 20 | 0.01 21 | 0.001 22 | 0 23 | 20 24 | 0 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP4J 2 | 3 | The NLP4J project provides a NLP toolkit for JVM languages. This project is under the [Apache 2](http://www.apache.org/licenses/LICENSE-2.0) license and is currently developed by the [NLP Research Group](http://nlp.mathcs.emory.edu) at [Emory University](http://emory.edu). Please join our [forum](https://groups.google.com/forum/#!forum/emorynlp) to get notifications about new releases and give your feedback about this project. 4 | 5 | * Latest release: [1.1.2](http://search.maven.org/#search%7Cga%7C1%7Cnlp4j) (06/29/2016). 6 | * [Release notes](md/quickstart/release.md). 7 | * [Google groups](https://groups.google.com/forum/#!forum/emorynlp). 8 | 9 | ## Quick Start 10 | 11 | * [How to install](md/quickstart/install.md). 12 | * [How to train](md/quickstart/train.md). 13 | * [How to decode](md/quickstart/decode.md). 14 | * [NLP4J demo](http://nlp.mathcs.emory.edu:8080/nlp4j). 15 | 16 | ## Components 17 | 18 | * [Tokenization](https://github.com/emorynlp/tokenization). 19 | * [Morphological analysis](https://github.com/emorynlp/morphological_analysis). 20 | * [Part-of-speech tagging](md/components/part_of_speech_tagging.md). 21 | * [Named entity recognition](md/components/named_entity_recognition.md). 22 | * [Dependency parsing](md/components/dependency_parsing.md). 23 | * Semantic role labeling (coming soon). 24 | * Sentiment analysis (coming soon). 25 | * Word2Vec & Struct2Vec (coming soon). 26 | 27 | ## Supplements 28 | 29 | * [English lexica and models](md/supplements/english-lexica-models.md) (hosted in [bitbucket](https://bitbucket.org/emorynlp/nlp4j-english)). 30 | * [Data format](md/supplements/data-format.md). 
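## Quick Example

A minimal decoding sketch adapted from [`NLPDemo`](src/main/java/edu/emory/mathcs/nlp/bin/NLPDemo.java) in this repository; the class name `QuickExample` is illustrative, and the configuration path assumes the English lexica and models listed in `config-decode-en.xml`.

```java
import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.common.util.Joiner;
import edu.emory.mathcs.nlp.component.template.node.NLPNode;
import edu.emory.mathcs.nlp.decode.NLPDecoder;

public class QuickExample
{
    public static void main(String[] args)
    {
        // Decoding configuration listing the lexica and models to load.
        final String configFile = "src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml";

        // With config-decode-en.xml, the decoder tokenizes the input and runs
        // part-of-speech tagging, named entity recognition, and dependency parsing.
        NLPDecoder nlp4j = new NLPDecoder(IOUtils.createFileInputStream(configFile));

        // Decode one raw sentence; index 0 holds the artificial root node, so printing starts at 1.
        NLPNode[] nodes = nlp4j.decode("John bought a car for Mary.");
        System.out.println(Joiner.join(nodes, "\n", 1));
    }
}
```

The decoded output (see `src/test/resources/dat/nlp4j.txt.nlp`) follows the [data format](md/supplements/data-format.md) described under Supplements: one token per line with its lemma, part-of-speech tag, dependency head and label, semantic heads, and named entity tag.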
31 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/bin/NLPDemo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.bin; 17 | 18 | import edu.emory.mathcs.nlp.common.util.IOUtils; 19 | import edu.emory.mathcs.nlp.common.util.Joiner; 20 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 21 | import edu.emory.mathcs.nlp.decode.AbstractNLPDecoder; 22 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 23 | 24 | /** 25 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 26 | */ 27 | public class NLPDemo 28 | { 29 | static public void main(String[] args) throws Exception 30 | { 31 | final String configFile = "src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml"; 32 | final String inputFile = "src/test/resources/dat/nlp4j.txt"; 33 | 34 | NLPDecoder nlp4j = new NLPDecoder(IOUtils.createFileInputStream(configFile)); 35 | NLPNode[] nodes; 36 | 37 | String sentence = "John bought a car for Mary."; 38 | nodes = nlp4j.decode(sentence); 39 | System.out.println(Joiner.join(nodes, "\n", 1)+"\n"); 40 | nlp4j.decode(IOUtils.createFileInputStream(inputFile), System.out, AbstractNLPDecoder.FORMAT_RAW); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/component/morph/MorphologicalAnalyzer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.component.morph; 17 | 18 | import java.util.List; 19 | 20 | import edu.emory.mathcs.nlp.common.util.Language; 21 | import edu.emory.mathcs.nlp.component.morph.english.EnglishMorphAnalyzer; 22 | import edu.emory.mathcs.nlp.component.template.NLPComponent; 23 | import edu.emory.mathcs.nlp.component.template.node.AbstractNLPNode; 24 | 25 | /** 26 | * @author Jinho D. 
Choi ({@code jinho.choi@emory.edu}) 27 | */ 28 | public class MorphologicalAnalyzer> implements NLPComponent 29 | { 30 | private MorphAnalyzer analyzer; 31 | 32 | public MorphologicalAnalyzer(Language language) 33 | { 34 | analyzer = new EnglishMorphAnalyzer(); 35 | } 36 | 37 | @Override 38 | public void process(N[] nodes) 39 | { 40 | N node; 41 | 42 | for (int i=1; i document) 51 | { 52 | for (N[] nodes : document) 53 | process(nodes); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/zzz/RadiologyDecode.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.zzz; 17 | 18 | import edu.emory.mathcs.nlp.common.util.FileUtils; 19 | import edu.emory.mathcs.nlp.common.util.IOUtils; 20 | import edu.emory.mathcs.nlp.decode.AbstractNLPDecoder; 21 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 22 | 23 | /** 24 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 25 | */ 26 | public class RadiologyDecode 27 | { 28 | static public void main(String[] args) throws Exception 29 | { 30 | final String configFile = "/Users/jdchoi/Documents/EmoryNLP/nlp4j/src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-deident.xml"; 31 | final String inputDir = "/Users/jdchoi/Desktop/radiology/Q2"; 32 | final String inputExt = "txt"; 33 | final String outputExt = "tsv"; 34 | final String outputFormat = AbstractNLPDecoder.FORMAT_LINE; 35 | 36 | NLPDecoder nlp4j = new NLPDecoder(IOUtils.createFileInputStream(configFile)); 37 | 38 | for (String inputFile : FileUtils.getFileList(inputDir, inputExt)) 39 | { 40 | System.out.println(inputFile); 41 | String outputFile = inputFile+"."+outputExt; 42 | nlp4j.decode(IOUtils.createFileInputStream(inputFile), IOUtils.createFileOutputStream(outputFile), outputFormat); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/test/java/edu/emory/mathcs/nlp/decode/AbstractNLPDecoderTest.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** This data and information is proprietary to, and a valuable trade secret 3 | ** of, Basis Technology Corp. It is given in confidence by Basis Technology 4 | ** and may only be used as permitted under the license agreement under which 5 | ** it has been distributed, and in no other way. 6 | ** 7 | ** Copyright (c) 2015 Basis Technology Corporation All rights reserved. 8 | ** 9 | ** The technical data and information provided herein are provided with 10 | ** `limited rights', and the computer software provided herein is provided 11 | ** with `restricted rights' as those terms are defined in DAR and ASPR 12 | ** 7-104.9(a). 
13 | ******************************************************************************/ 14 | 15 | package edu.emory.mathcs.nlp.decode; 16 | 17 | import java.io.InputStream; 18 | import java.net.URL; 19 | 20 | import org.junit.Test; 21 | 22 | import com.google.common.io.Resources; 23 | 24 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 25 | import edu.emory.mathcs.nlp.component.template.reader.TSVReader; 26 | 27 | /** 28 | * 29 | */ 30 | public class AbstractNLPDecoderTest { 31 | 32 | @Test 33 | public void createTsv() throws Exception { 34 | URL configUrl = Resources.getResource("decoder-test-config.xml"); 35 | DecodeConfig config; 36 | try (InputStream configStream = Resources.asByteSource(configUrl).openStream()) { 37 | config = new DecodeConfig(configStream); 38 | } 39 | 40 | NLPDecoder decoder = new NLPDecoder(config); 41 | TSVReader reader = decoder.createTSVReader(); 42 | URL tsvUrl = Resources.getResource("dat/sample-dev.tsv"); 43 | try (InputStream tsvStream = Resources.asByteSource(tsvUrl).openStream()) { 44 | reader.open(tsvStream); 45 | reader.readDocument(); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /md/components/named_entity_recognition.md: -------------------------------------------------------------------------------- 1 | # Named Entity Recognition 2 | 3 | Our named entity recognizer uses both sparse and dense features extracted from named entity gazetteers, word clusters, and word embeddings. It processes over 47K tokens per second on an Intel Xeon 2.30GHz machine and shows the state-of-the-art accuracy (91.0% on the CoNLL'03 corpus). 4 | 5 | * [Dynamic Feature Induction: The Last Gist to the State-of-the-Art](http://naacl.org/naacl-hlt-2016/), Jinho D. Choi, Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (NAACL'16), San Diego, CA, 2016. 6 | * [Intrinsic and Extrinsic Evaluations of Word Embeddings](http://www.aaai.org/Conferences/AAAI/2016/aaai16accepted-papers.pdf), Michael Zhai, Johnny Tan, Jinho D. Choi, Proceedings of the AAAI 2015 Student Program, Phoenix, AZ, 2015. 7 | 8 | ## English Tags 9 | 10 | | Tag | Description | Version | 11 | | -------------- | ----------- | ------- | 12 | | `PERSON` | People, including fictional | 1.0.0 | | `NORP` | Nationalities or religious or political groups | 1.0.0 | | `FAC` | Buildings, airports, highways, bridges, etc. | 1.0.0 | | `ORG` | Companies, agencies, institutions, etc. | 1.0.0 | | `GPE` | Countries, cities, states | 1.0.0 | | `LOC` | Non-GPE locations, mountain ranges, bodies of water | 1.0.0 | | `PRODUCT` | Vehicles, weapons, foods, etc. (not services) | 1.0.0 | | `EVENT` | Named hurricanes, battles, wars, sports events, etc. | 1.0.0 | | `WORK OF ART` | Titles of books, songs, etc. 
| 1.0.0 | | `LAW` | Named documents made into laws | 1.0.0 | | `LANGUAGE` | Any named language | 1.0.0 | 13 | | `DATE` | Absolute or relative dates or periods | 1.0.0 | 14 | | `TIME` | Times smaller than a day | 1.0.0 | 15 | | `PERCENT` | Percentage (including "%") | 1.0.0 | 16 | | `MONEY` | Monetary values, including unit | 1.0.0 | 17 | | `QUANTITY` | Measurements, as of weight or distance | 1.0.0 | 18 | | `ORDINAL` | Ordinals (e.g., "first", "1st") | 1.0.0 | 19 | | `CARDINAL` | Numerals that do not fall under another type | 1.0.0 | 20 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/network/NLPSocketClient.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.network; 17 | 18 | import java.io.BufferedInputStream; 19 | import java.io.BufferedOutputStream; 20 | import java.io.DataInputStream; 21 | import java.io.DataOutputStream; 22 | import java.io.InputStream; 23 | import java.io.OutputStream; 24 | import java.net.Socket; 25 | 26 | /** 27 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 28 | */ 29 | public class NLPSocketClient 30 | { 31 | private final String SERVER_ADDRESS; 32 | private final int SERVER_PORT; 33 | 34 | public NLPSocketClient(String address, int port) 35 | { 36 | SERVER_ADDRESS = address; 37 | SERVER_PORT = port; 38 | } 39 | 40 | public String decode(String text, String format) 41 | { 42 | StringBuilder build = new StringBuilder(); 43 | 44 | try 45 | { 46 | String data = format+":"+text+NLPSocketServer.END; 47 | Socket socket = new Socket(SERVER_ADDRESS, SERVER_PORT); 48 | InputStream in = new DataInputStream(new BufferedInputStream(socket.getInputStream())); 49 | OutputStream out = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream())); 50 | 51 | out.write(data.getBytes()); 52 | out.flush(); 53 | 54 | byte[] buffer = new byte[2048]; 55 | int i; 56 | 57 | while ((i = in.read(buffer, 0, buffer.length)) >= 0) 58 | { 59 | build.append(new String(buffer, 0, i)); 60 | if (build.toString().endsWith(NLPSocketServer.END)) break; 61 | } 62 | 63 | socket.close(); 64 | } 65 | catch (Exception e) {e.printStackTrace();} 66 | 67 | return build.toString(); 68 | } 69 | 70 | static public void main(String[] args) 71 | { 72 | NLPSocketClient client = new NLPSocketClient("127.0.0.1", 8000); 73 | System.out.println(client.decode("UN peacekeepers abuse children", "raw")); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /md/supplements/data-format.md: -------------------------------------------------------------------------------- 1 | # Data Format 2 | 3 | ## Raw Format 4 | 5 | The `raw` format accepts texts in any format. 6 | 7 | ``` 8 | I'd like to meet Dr. Choi. He's a professor at Emory University. 
9 | ``` 10 | 11 | ## Sentence Format 12 | 13 | The `sen` format expects a sentence per line. 14 | 15 | ``` 16 | I'd like to meet Dr. Choi. 17 | He's a professor at Emory University. 18 | ``` 19 | 20 | ## Tab Separated Values Format 21 | 22 | The `tsv` format expects columns delimited by `\t` and sentences separated by `\n`. 23 | 24 | ```tsv 25 | 1 I I PRP _ 3 nsubj 3:A0;5:A0 O 26 | 2 'd would MD _ 3 aux 3:AM-MOD O 27 | 3 like like VB _ 0 root _ O 28 | 4 to to TO _ 5 aux _ O 29 | 5 meet meet VB _ 3 xcomp 3:A1 O 30 | 6 Dr. dr. NNP _ 7 compound _ O 31 | 7 Choi choi NNP _ 5 dobj 5:A1 U-PERSON 32 | 8 . . . _ 3 punct _ O 33 | 34 | 1 He he PRP _ 2 nsubj 2:A1 O 35 | 2 's 's VBZ _ 0 root _ O 36 | 3 a a DT _ 4 det _ O 37 | 4 professor professor NN _ 2 attr 2:A2 O 38 | 5 at at IN _ 4 prep _ O 39 | 6 Emory emory NNP _ 7 compound _ B-ORG 40 | 7 University university NNP _ 5 pobj _ L-ORG 41 | 8 . . . _ 2 punct _ O 42 | ``` 43 | 44 | The column fields are specified in the [configuration files](../../src/main/resources/configuration/) as follows: 45 | 46 | ```xml 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | ``` 60 | 61 | * `form`: word form. 62 | * `lemma`: lemma. 63 | * `pos`: part-of-speech tag. 64 | * `feats`: extra features; features are delimited by `|`, and keys and values are delimited by `=` (e.g., `k1=v1|k2=v2`). 65 | * `dhead`: dependency head token ID. 66 | * `deprel`: dependency label. 67 | * `sheads`: semantic heads; head IDs and labels are delimited by `:`. 68 | * `nament`: named entity tags in the BILOU notaiton. -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/decode/NLPUtils.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.decode; 17 | 18 | import java.io.InputStream; 19 | import java.io.ObjectInputStream; 20 | 21 | import edu.emory.mathcs.nlp.common.util.IOUtils; 22 | import edu.emory.mathcs.nlp.common.util.Joiner; 23 | import edu.emory.mathcs.nlp.common.util.Language; 24 | import edu.emory.mathcs.nlp.component.template.NLPComponent; 25 | import edu.emory.mathcs.nlp.component.template.OnlineComponent; 26 | import edu.emory.mathcs.nlp.component.template.feature.Field; 27 | import edu.emory.mathcs.nlp.component.template.node.AbstractNLPNode; 28 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 29 | import edu.emory.mathcs.nlp.component.template.state.NLPState; 30 | import edu.emory.mathcs.nlp.component.template.util.NLPFlag; 31 | import edu.emory.mathcs.nlp.tokenization.EnglishTokenizer; 32 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 33 | 34 | /** 35 | * @author Jinho D. 
Choi ({@code jinho.choi@emory.edu}) 36 | */ 37 | public class NLPUtils 38 | { 39 | static public String FEAT_POS_2ND = "pos2"; 40 | static public String FEAT_PREDICATE = "pred"; 41 | 42 | static public String toStringLine(NLPNode[] nodes, String delim, Field field) 43 | { 44 | return Joiner.join(nodes, delim, 1, nodes.length, n -> n.getValue(field)); 45 | } 46 | 47 | static public Tokenizer createTokenizer(Language language) 48 | { 49 | return new EnglishTokenizer(); 50 | } 51 | 52 | @SuppressWarnings("unchecked") 53 | static public ,S extends NLPState>NLPComponent getComponent(InputStream in) 54 | { 55 | ObjectInputStream oin = IOUtils.createObjectXZBufferedInputStream(in); 56 | OnlineComponent component = null; 57 | 58 | try 59 | { 60 | component = (OnlineComponent)oin.readObject(); 61 | component.setFlag(NLPFlag.DECODE); 62 | oin.close(); 63 | } 64 | catch (Exception e) {e.printStackTrace();} 65 | 66 | return component; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /md/tutorial/processing_state.md: -------------------------------------------------------------------------------- 1 | # Processing State 2 | 3 | ## NLPState 4 | 5 | A state object is created for every input (e.g., a sentence), providing information about the current state of the component (e.g., a pointer to the input token). Creating a separate state object for each input enables the component to be thread-safe. All state objects extend [`NLPState`](../../java/edu/emory/mathcs/nlp/component/util/state/NLPState.java), which takes two genetic types `N` and `L` representing the types of the input node (e.g., [`NLPNode`](../../java/edu/emory/mathcs/nlp/component/util/NLPNode.java)) and the label (e.g., `String`), respectively. 6 | 7 | ```java 8 | public abstract class NLPState 9 | ``` 10 | 11 | This class contains several abstract methods: 12 | 13 | ```java 14 | /** Clears and saves the gold-standard labels in the input nodes if available. */ 15 | public abstract void clearGoldLabels(); 16 | 17 | /** Moves onto the next state */ 18 | public abstract void next(); 19 | 20 | /** @return true if no more state can be processed; otherwise, false. */ 21 | public abstract boolean isTerminate(); 22 | 23 | /** @return the gold standard label for the current state. */ 24 | public abstract L getGoldLabel(); 25 | 26 | /** Assigns the specific label to the current state. */ 27 | public abstract void setLabel(L label); 28 | 29 | /** Evaluates all predictions given the current input. */ 30 | public abstract void evaluate(Eval eval); 31 | ``` 32 | 33 | See [NLP component](nlp_component.md) for more details about how these abstract methods are used. 34 | 35 | ## L2RState 36 | 37 | [`N2RState`](../../java/edu/emory/mathcs/nlp/component/util/state/N2RState.java) defines the left-to-right tagging strategy commonly used in NLP (e.g., part-of-speech tagging, named entity recognition). It extends [`NLPState`](#nlpstate) and takes a genetic type `N` representing the type of the input node (e.g., [`POSNode`](../../java/edu/emory/mathcs/nlp/component/pos/POSNode.java)). 38 | 39 | ```java 40 | public abstract class L2RState extends NLPState 41 | ``` 42 | 43 | This state keeps track of the pointer to the processing node, starting at `0`. It then moves onto the next state by incrementing the pointer to the next node. Finally, it terminates if there is no more node to process. 
44 | 45 | ```java 46 | protected int index = 0; 47 | 48 | @Override 49 | public void next() 50 | { 51 | index++; 52 | } 53 | 54 | @Override 55 | public boolean isTerminate() 56 | { 57 | return index >= nodes.length; 58 | } 59 | ``` 60 | See [`POSState`](../../java/edu/emory/mathcs/nlp/component/pos/POSState.java) for the example of a subclass inheriting this class. -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/bin/NLPDecode.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.bin; 17 | 18 | import java.util.Collections; 19 | import java.util.List; 20 | 21 | import org.kohsuke.args4j.Option; 22 | 23 | import edu.emory.mathcs.nlp.common.util.BinUtils; 24 | import edu.emory.mathcs.nlp.common.util.FileUtils; 25 | import edu.emory.mathcs.nlp.common.util.IOUtils; 26 | import edu.emory.mathcs.nlp.decode.AbstractNLPDecoder; 27 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 28 | 29 | /** 30 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 31 | */ 32 | public class NLPDecode 33 | { 34 | @Option(name="-c", usage="confinguration filename (required)", required=true, metaVar="") 35 | public String configuration_file; 36 | @Option(name="-i", usage="input path (required)", required=true, metaVar="") 37 | public String input_path; 38 | @Option(name="-ie", usage="input file extension (default: *)", required=false, metaVar="") 39 | public String input_ext = "*"; 40 | @Option(name="-oe", usage="output file extension (default: nlp)", required=false, metaVar="") 41 | public String output_ext = "nlp"; 42 | @Option(name="-format", usage="format of the input data (raw|line|tsv; default: raw)", required=false, metaVar="") 43 | private String format = AbstractNLPDecoder.FORMAT_RAW; 44 | @Option(name="-threads", usage="number of threads (default: 2)", required=false, metaVar="") 45 | protected int threads = 2; 46 | private NLPDecoder decoder; 47 | 48 | // ======================================== CONSTRUCTORS ======================================== 49 | 50 | public NLPDecode(String[] args) 51 | { 52 | BinUtils.initArgs(args, this); 53 | List filelist = FileUtils.getFileList(input_path, input_ext, false); 54 | Collections.sort(filelist); 55 | 56 | decoder = new NLPDecoder(IOUtils.createFileInputStream(configuration_file)); 57 | decoder.decode(filelist, output_ext, format, threads); 58 | } 59 | 60 | static public void main(String[] args) 61 | { 62 | new NLPDecode(args); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/decode/DecodeConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * 
you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.decode; 17 | 18 | import java.io.InputStream; 19 | 20 | import org.w3c.dom.Element; 21 | 22 | import edu.emory.mathcs.nlp.common.util.XMLUtils; 23 | import edu.emory.mathcs.nlp.component.template.config.NLPConfig; 24 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 25 | 26 | /** 27 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 28 | */ 29 | public class DecodeConfig extends NLPConfig 30 | { 31 | private String part_of_speech_tagging; 32 | private String named_entity_recognition; 33 | private String dependency_parsing; 34 | private String semantic_role_labeling; 35 | 36 | public DecodeConfig() {} 37 | 38 | public DecodeConfig(InputStream in) 39 | { 40 | super(in); 41 | initComponents(); 42 | } 43 | 44 | public void initComponents() 45 | { 46 | Element eModels = XMLUtils.getFirstElementByTagName(xml, "models"); 47 | 48 | setPartOfSpeechTagging (XMLUtils.getTextContentFromFirstElementByTagName(eModels, "pos")); 49 | setNamedEntityRecognition(XMLUtils.getTextContentFromFirstElementByTagName(eModels, "ner")); 50 | setDependencyParsing (XMLUtils.getTextContentFromFirstElementByTagName(eModels, "dep")); 51 | setSemanticRoleLabeling (XMLUtils.getTextContentFromFirstElementByTagName(eModels, "srl")); 52 | } 53 | 54 | public String getPartOfSpeechTagging() 55 | { 56 | return part_of_speech_tagging; 57 | } 58 | 59 | public String getNamedEntityRecognition() 60 | { 61 | return named_entity_recognition; 62 | } 63 | 64 | public String getDependencyParsing() 65 | { 66 | return dependency_parsing; 67 | } 68 | 69 | public String getSemanticRoleLabeling() 70 | { 71 | return semantic_role_labeling; 72 | } 73 | 74 | public void setPartOfSpeechTagging(String filename) 75 | { 76 | part_of_speech_tagging = filename; 77 | } 78 | 79 | public void setNamedEntityRecognition(String filename) 80 | { 81 | named_entity_recognition = filename; 82 | } 83 | 84 | public void setDependencyParsing(String filename) 85 | { 86 | dependency_parsing = filename; 87 | } 88 | 89 | public void setSemanticRoleLabeling(String filename) 90 | { 91 | semantic_role_labeling = filename; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /md/quickstart/install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | ## With Maven 4 | 5 | * Make sure [Java 8](http://www.oracle.com/technetwork/java/javase/) and [Maven 3](https://maven.apache.org) are installed on your machine. 6 | * Add the following dependency to `pom.xml`: 7 | 8 | ```xml 9 | 10 | edu.emory.mathcs.nlp 11 | nlp4j 12 | RELEASE 13 | 14 | ``` 15 | 16 | * For English models, add the following dependency to `pom.xml`. 
17 | 18 | ```xml 19 | 20 | edu.emory.mathcs.nlp 21 | nlp4j-english 22 | RELEASE 23 | 24 | ``` 25 | 26 | * Install the maven project: 27 | 28 | ```bash 29 | mvn clean install 30 | ``` 31 | 32 | * Run the following command: 33 | 34 | ```bash 35 | mvn exec:java -Dexec.mainClass="edu.emory.mathcs.nlp.bin.Version" 36 | ``` 37 | 38 | If you see the following message, it is properly installed. 39 | 40 | ``` 41 | [INFO] Scanning for projects... 42 | [INFO] 43 | [INFO] ------------------------------------------------------------------------ 44 | [INFO] Building nlp4j x.x.x 45 | [INFO] ------------------------------------------------------------------------ 46 | [INFO] 47 | [INFO] --- exec-maven-plugin:1.4.0:java (default-cli) @ nlp4j --- 48 | ==================================== 49 | Emory NLP Version x.x.x 50 | Webpage: http://nlp.mathcs.emory.edu 51 | Contact: jinho.choi@emory.edu 52 | ==================================== 53 | [INFO] ------------------------------------------------------------------------ 54 | [INFO] BUILD SUCCESS 55 | [INFO] ------------------------------------------------------------------------ 56 | [INFO] Total time: 0.739s 57 | [INFO] Finished at: Tue Nov 24 20:08:59 EST 2015 58 | [INFO] Final Memory: 11M/247M 59 | [INFO] ------------------------------------------------------------------------ 60 | ``` 61 | 62 | ## Without Maven 63 | 64 | * Make sure [Java 8](http://www.oracle.com/technetwork/java/javase/) is installed on your machine. 65 | * Download [`nlp4j.jar`](http://nlp.mathcs.emory.edu/nlp4j/nlp4j-1.1.2.jar), and add it to your classpath. If you are using [bash](https://www.gnu.org/software/bash/), export `CLASSPATH` as follows: 66 | 67 | ```bash 68 | export CLASSPATH=nlp4j-x.x.x.jar:. 69 | ``` 70 | 71 | * For English models, download [`nlp4j-english.jar`](http://search.maven.org/remotecontent?filepath=edu/emory/mathcs/nlp/nlp4j-english/1.1.2/nlp4j-english-1.1.2.jar), and add it to your classpath: 72 | 73 | ```bash 74 | export CLASSPATH=nlp4j-x.x.x.jar:nlp4j-english-x.x.x.jar:. 75 | ``` 76 | 77 | * Run the following command: 78 | 79 | ```bash 80 | java edu.emory.mathcs.nlp.bin.Version 81 | ``` 82 | 83 | If you see the following message, it is properly installed. 84 | 85 | ``` 86 | ==================================== 87 | NLP4J Version x.x.x 88 | Webpage: http://nlp.mathcs.emory.edu 89 | Contact: jinho.choi@emory.edu 90 | ==================================== 91 | ``` -------------------------------------------------------------------------------- /md/components/part_of_speech_tagging.md: -------------------------------------------------------------------------------- 1 | # Part-of-Speech Tagging 2 | 3 | Our part-of-speech tagger uses the generalized model from dynamic model selection and utilizes ambiguity classes trained on a large corpus. It processes over 82K tokens per second on an Intel Xeon 2.30GHz machine and shows the state-of-the-art accuracy (97.64% on the WSJ corpus). 4 | 5 | * [Dynamic Feature Induction: The Last Gist to the State-of-the-Art](http://naacl.org/naacl-hlt-2016/), Jinho D. Choi, Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (NAACL'16), San Diego, CA, 2016. 6 | * [Intrinsic and Extrinsic Evaluations of Word Embeddings](http://www.aaai.org/Conferences/AAAI/2016/aaai16accepted-papers.pdf), Michael Zhai, Johnny Tan, Jinho D. Choi, Proceedings of the AAAI 2015 Student Program, Phoenix, AZ, 2015. 
7 | * [Fast and Robust Part-of-Speech Tagging Using Dynamic Model Selection](http://aclweb.org/anthology-new/P/P12/P12-2071.pdf), Jinho D. Choi, Martha Palmer, Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL'12), 363-367, Jeju, Korea, 2012. 8 | 9 | ## English Tags 10 | 11 | | Tag | Description | Version | 12 | | ------- | ----------- | ------- | 13 | | `$` | Dollar | 1.0.0 | 14 | | `:` | Colon | 1.0.0 | 15 | | `,` | Comma | 1.0.0 | 16 | | `.` | Period | 1.0.0 | 17 | | ```` | Left quote | 1.0.0 | 18 | | `''` | Right quote | 1.0.0 | 19 | | `-LRB-` | Left bracket | 1.0.0 | 20 | | `-RRB-` | Right bracket | 1.0.0 | 21 | | `ADD` | Email | 1.0.0 | 22 | | `AFX` | Affix | 1.0.0 | 23 | | `CC` | Coordinating conjunction | 1.0.0 | 24 | | `CD` | Cardinal number | 1.0.0 | 25 | | `DT` | Determiner | 1.0.0 | 26 | | `EX` | Existential `there` | 1.0.0 | 27 | | `FW` | Foreign word | 1.0.0 | 28 | | `GW` | Go with | 1.0.0 | 29 | | `HYPH` | Hyphen | 1.0.0 | 30 | | `IN` | Preposition or subordinating conjunction | 1.0.0 | 31 | | `JJ` | Adjective | 1.0.0 | 32 | | `JJR` | Adjective, comparative | 1.0.0 | 33 | | `JJS` | Adjective, superlative | 1.0.0 | 34 | | `LS` | List item marker | 1.0.0 | 35 | | `MD` | Modal | 1.0.0 | 36 | | `NFP` | Superfluous punctuation | 1.0.0 | 37 | | `NN` | Noun, singular or mass | 1.0.0 | 38 | | `NNS` | Noun, plural | 1.0.0 | 39 | | `NNP` | Proper noun, singular | 1.0.0 | 40 | | `NNPS` | Proper noun, plural | 1.0.0 | 41 | | `PDT` | Predeterminer | 1.0.0 | 42 | | `POS` | Possessive ending | 1.0.0 | 43 | | `PRP` | Personal pronoun | 1.0.0 | 44 | | `PRP$` | Possessive pronoun | 1.0.0 | 45 | | `RB` | Adverb | 1.0.0 | 46 | | `RBR` | Adverb, comparative | 1.0.0 | 47 | | `RBS` | Adverb, superlative | 1.0.0 | 48 | | `RP` | Particle | 1.0.0 | 49 | | `SYM` | Symbol | 1.0.0 | 50 | | `TO` | To | 1.0.0 | 51 | | `UH` | Interjection | 1.0.0 | 52 | | `VB` | Verb, base form | 1.0.0 | 53 | | `VBD` | Verb, past tense | 1.0.0 | 54 | | `VBG` | Verb, gerund or present participle | 1.0.0 | 55 | | `VBN` | Verb, past participle | 1.0.0 | 56 | | `VBP` | Verb, non-3rd person singular present | 1.0.0 | 57 | | `VBZ` | Verb, 3rd person singular present | 1.0.0 | 58 | | `WDT` | Wh-determiner | 1.0.0 | 59 | | `WP` | Wh-pronoun | 1.0.0 | 60 | | `WP$` | Wh-pronoun, possessive | 1.0.0 | 61 | | `WRB` | Wh-adverb | 1.0.0 | 62 | | `XX` | Unknown | 1.0.0 | 63 | -------------------------------------------------------------------------------- /md/tutorial/nlp_component.md: -------------------------------------------------------------------------------- 1 | # NLP Component 2 | 3 | All components extend [`NLPComponent`](../../java/edu/emory/mathcs/nlp/component/util/NLPComponent.java), providing general methods for supervised NLP. This class takes three genetic types `N`, `L`, and `S` representing the types of the input nodes, the label, and the [processing state](processing_state.md), respectively. 4 | 5 | ```java 6 | public abstract class NLPComponent> implements Serializable 7 | ``` 8 | 9 | This class contains several abstract methods: 10 | 11 | ```java 12 | /** @return the processing state for the input nodes. */ 13 | protected abstract S createState(N[] nodes); 14 | 15 | /** @return the gold-standard label for training; otherwise, the predicted label. */ 16 | protected abstract L getLabel(S state, StringVector vector); 17 | 18 | /** Adds a training instance (label, x) to the statistical model. 
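 * Called only during training and bootstrapping (see the process method below).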
*/ 19 | protected abstract void addInstance(L label, StringVector vector); 20 | 21 | /** @return the vector consisting of all features extracted from the state. */ 22 | protected abstract StringVector extractFeatures(S state); 23 | ``` 24 | 25 | These abstract methods are used in the `process` method providing a genetic way for processing the NLP component. 26 | 27 | ```java 28 | public void process(N[] nodes) 29 | { 30 | S state = createState(nodes); 31 | if (!isDecode()) state.clearGoldLabels(); 32 | 33 | while (!state.isTerminate()) 34 | { 35 | StringVector vector = extractFeatures(state); 36 | if (isTrainOrBootstrap()) addInstance(state.getGoldLabel(), vector); 37 | L label = getLabel(state, vector); 38 | state.setLabel(label); 39 | state.next(); 40 | } 41 | 42 | if (isEvaluate()) state.evaluate(eval); 43 | } 44 | ``` 45 | 46 | * The `process` method takes an array of nodes with the genetic type `N`. 47 | 48 | ```java 49 | public void process(N[] nodes) 50 | ``` 51 | 52 | * It begins by creating a [processing state](processing_state.md). 53 | 54 | ```java 55 | S state = createState(nodes); 56 | ``` 57 | 58 | * It is important to clear out and save existing gold-standard labels before training; accidental usage of these labels can lead to inflated evaluation scores. 59 | 60 | ```java 61 | if (!isDecode()) state.clearGoldLabels(); 62 | ``` 63 | 64 | * The method iterates through every state as defined in the [processing state](processing_state.md). 65 | 66 | ```java 67 | while (!state.isTerminate()) 68 | { 69 | ... 70 | state.next(); 71 | } 72 | ``` 73 | 74 | * For each state, it creates a vector consisting of features extracted from the current state. 75 | 76 | ```java 77 | StringVector vector = extractFeatures(state); 78 | ``` 79 | 80 | * During training and bootstrapping, it adds the training instance to the statistical model. 81 | 82 | ```java 83 | if (isTrainOrBootstrap()) addInstance(state.getGoldLabel(), vector); 84 | ``` 85 | 86 | * Given the feature vector, it predicts the label of the current state either from the oracle or the statistical model. 87 | 88 | ```java 89 | L label = getLabel(state, vector); 90 | ``` 91 | 92 | * Finally, it assigns the label to the current state. 93 | 94 | ```java 95 | state.setLabel(label); 96 | ``` 97 | 98 | * During evaluation, the accuracy counts are updated to the evaluator. 99 | 100 | ```java 101 | if (isEvaluate()) state.evaluate(eval); 102 | ``` 103 | 104 | -------------------------------------------------------------------------------- /md/tutorial/create_nlp_component.md: -------------------------------------------------------------------------------- 1 | # Online Component 2 | 3 | All components extend [`NLPComponent`](../../java/edu/emory/mathcs/nlp/component/util/NLPComponent.java), providing general methods for supervised NLP. This class takes three genetic types `N`, `L`, and `S` representing the types of the input nodes, the label, and the [processing state](processing_state.md), respectively. 4 | 5 | ```java 6 | public abstract class NLPComponent> implements Serializable 7 | ``` 8 | 9 | This class contains several abstract methods: 10 | 11 | ```java 12 | /** @return the processing state for the input nodes. */ 13 | protected abstract S createState(N[] nodes); 14 | 15 | /** @return the gold-standard label for training; otherwise, the predicted label. */ 16 | protected abstract L getLabel(S state, StringVector vector); 17 | 18 | /** Adds a training instance (label, x) to the statistical model. 
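 * Called only during training and bootstrapping (see the process method below).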
*/ 19 | protected abstract void addInstance(L label, StringVector vector); 20 | 21 | /** @return the vector consisting of all features extracted from the state. */ 22 | protected abstract StringVector extractFeatures(S state); 23 | ``` 24 | 25 | These abstract methods are used in the `process` method providing a genetic way for processing the NLP component. 26 | 27 | ```java 28 | public void process(N[] nodes) 29 | { 30 | S state = createState(nodes); 31 | if (!isDecode()) state.clearGoldLabels(); 32 | 33 | while (!state.isTerminate()) 34 | { 35 | StringVector vector = extractFeatures(state); 36 | if (isTrainOrBootstrap()) addInstance(state.getGoldLabel(), vector); 37 | L label = getLabel(state, vector); 38 | state.setLabel(label); 39 | state.next(); 40 | } 41 | 42 | if (isEvaluate()) state.evaluate(eval); 43 | } 44 | ``` 45 | 46 | * The `process` method takes an array of nodes with the genetic type `N`. 47 | 48 | ```java 49 | public void process(N[] nodes) 50 | ``` 51 | 52 | * It begins by creating a [processing state](processing_state.md). 53 | 54 | ```java 55 | S state = createState(nodes); 56 | ``` 57 | 58 | * It is important to clear out and save existing gold-standard labels before training; accidental usage of these labels can lead to inflated evaluation scores. 59 | 60 | ```java 61 | if (!isDecode()) state.clearGoldLabels(); 62 | ``` 63 | 64 | * The method iterates through every state as defined in the [processing state](processing_state.md). 65 | 66 | ```java 67 | while (!state.isTerminate()) 68 | { 69 | ... 70 | state.next(); 71 | } 72 | ``` 73 | 74 | * For each state, it creates a vector consisting of features extracted from the current state. 75 | 76 | ```java 77 | StringVector vector = extractFeatures(state); 78 | ``` 79 | 80 | * During training and bootstrapping, it adds the training instance to the statistical model. 81 | 82 | ```java 83 | if (isTrainOrBootstrap()) addInstance(state.getGoldLabel(), vector); 84 | ``` 85 | 86 | * Given the feature vector, it predicts the label of the current state either from the oracle or the statistical model. 87 | 88 | ```java 89 | L label = getLabel(state, vector); 90 | ``` 91 | 92 | * Finally, it assigns the label to the current state. 93 | 94 | ```java 95 | state.setLabel(label); 96 | ``` 97 | 98 | * During evaluation, the accuracy counts are updated to the evaluator. 99 | 100 | ```java 101 | if (isEvaluate()) state.evaluate(eval); 102 | ``` 103 | 104 | -------------------------------------------------------------------------------- /md/components/dependency_parsing.md: -------------------------------------------------------------------------------- 1 | # Dependency Parsing 2 | 3 | Our dependency parser uses a transition-based, non-projective parsing algorithm showing a linear-time speed for both projective and non-projective parsing. It processes over 14K tokens per second on an Intel Xeon 2.30GHz machine, and shows the near state-of-the-art accuracy for greedy parsing (92.26% on the WSJ corpus). 4 | 5 | * [It Depends: Dependency Parser Comparison Using A Web-based Evaluation Tool](http://www.aclweb.org/anthology/P15-1038.pdf), Jinho D. Choi, Amanda Stent, Joel Tetreault, Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics (ACL'15), 387–396, Beijing, China, 2015. 6 | * [Transition-based Dependency Parsing with Selectional Branching](http://aclweb.org/anthology/P/P13/P13-1104.pdf), Jinho D. 
Choi, Andrew McCallum, Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (ACL'13), 1052-1062, Sofia, Bulgaria, 2013. 7 | * [Getting the Most out of Transition-based Dependency Parsing](http://aclweb.org/anthology-new/P/P11/P11-2121.pdf), Jinho D. Choi, Martha Palmer, Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL'11), 687-692, Portland, Oregon, 2011. 8 | 9 | ## English Labels 10 | 11 | | Label | Description | Version | 12 | | ----------- | ----------- | ------- | 13 | | `acl` | Clausal modifier of noun | 1.0.0 | 14 | | `acomp` | Adjectival complement | 1.0.0 | 15 | | `advcl` | Adverbial clause modifier | 1.0.0 | 16 | | `advmod` | Adverbial modifier | 1.0.0 | 17 | | `agent` | Agent (passive) | 1.0.0 | 18 | | `appos` | Appositional modifier | 1.0.0 | 19 | | `attr` | Attribute | 1.0.0 | 20 | | `aux` | Auxiliary verb | 1.0.0 | 21 | | `auxpass` | Auxiliary verb (passive) | 1.0.0 | 22 | | `case` | Case marker | 1.0.0 | 23 | | `cc` | Coordinating conjunction | 1.0.0 | 24 | | `ccomp` | Clausal complement | 1.0.0 | 25 | | `compound` | Compound word | 1.0.0 | 26 | | `conj` | Conjunct | 1.0.0 | 27 | | `csubj` | Clausal subject | 1.0.0 | 28 | | `csubjpass` | Clausal subject (passive) | 1.0.0 | 29 | | `dative` | Dative | 1.0.0 | 30 | | `dep` | Unclassified dependent | 1.0.0 | 31 | | `det` | Determiner | 1.0.0 | 32 | | `discourse` | Discourse element | 1.0.0 | 33 | | `dobj` | Direct Object | 1.0.0 | 34 | | `expl` | Expletive | 1.0.0 | 35 | | `mark` | Marker | 1.0.0 | 36 | | `meta` | Meta data | 1.0.0 | 37 | | `neg` | Negation modifier | 1.0.0 | 38 | | `nmod` | Modifier of nominal | 1.0.0 | 39 | | `npadvmod` | Noun phrase as adverbial modifier | 1.0.0 | 40 | | `nsubj` | Nominal subject | 1.0.0 | 41 | | `nsubjpass` | Nominal subject (passive) | 1.0.0 | 42 | | `oprd` | Object predicate | 1.0.0 | 43 | | `parataxis` | Parataxis | 1.0.0 | 44 | | `pcomp` | Preposition complement | 1.0.0 | 45 | | `pobj` | Preposition object | 1.0.0 | 46 | | `poss` | Possession modifier | 1.0.0 | 47 | | `preconj` | Precorrelative conjunction | 1.0.0 | 48 | | `predet` | Predeterminer | 1.0.0 | 49 | | `prep` | Prepositional modifier | 1.0.0 | 50 | | `prt` | Verb particle | 1.0.0 | 51 | | `punct` | Punctuation | 1.0.0 | 52 | | `qmod` | Modifier of quantifier | 1.0.0 | 53 | | `relcl` | Relative clause modifier | 1.0.0 | 54 | | `root` | Root | 1.0.0 | 55 | | `vocative` | Vocative modifier | 1.0.0 | 56 | | `xcomp` | Open clausal complement | 1.0.0 | -------------------------------------------------------------------------------- /md/supplements/english-lexica-models.md: -------------------------------------------------------------------------------- 1 | # English 2 | 3 | ## Lexica 4 | 5 | All lexica can be found [here](https://bitbucket.org/emorynlp/nlp4j-english/src): 6 | 7 | * `en-ambiguity-classes-simplified.xz`
: ambiguity classes for part-of-speech tagging with simplified word forms. 8 | * `en-ambiguity-classes-simplified-lowercase.xz`
: ambiguity classes for part-of-speech tagging with simplified lowercase word forms. 9 | * `en-brown-clusters-simplified-lowercase.xz`
: Brown clusters with simplified lowercase word forms. 10 | * `en-named-entity-gazetteers-simplified.xz`
: gazetteers for named entity recognition with simplified word forms. 11 | * `en-named-entity-gazetteers-simplified-lowercase.xz`
: gazetteers for named entity recognition with simplified lowercase word forms. 12 | * `en-stop-words-simplified-lowercase.xz`
: stop words with simplified lowercase word forms. 13 | * `en-word-embeddings-undigitalized.xz`
: word embeddings with undigitalized word forms. 14 | 15 | ## Models 16 | 17 | All models can be found [here](https://bitbucket.org/emorynlp/nlp4j-english/src): 18 | 19 | * `en-pos.xz`: part-of-speech tagging. 20 | * `en-ner.xz`: named entity recognition. 21 | * `en-dep.xz`: dependency parsing. 22 | 23 | Models are trained on the following corpora. 24 | 25 | | [OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) | Sentences | Tokens | Names | 26 | | -------------------------- | -----: | ------: | -----: | 27 | | Broadcasting conversations | 10,822 | 171,101 | 9,771 | 28 | | Broadcasting news | 10,344 | 206,029 | 19,670 | 29 | | News magazines | 6,672 | 163,627 | 10,736 | 30 | | Newswires | 34,438 | 875,800 | 77,496 | 31 | | Religious texts | 21,418 | 296,432 | 0 | 32 | | Telephone conversations | 8,963 | 85,444 | 2,021 | 33 | | Web texts | 12,448 | 284,951 | 8,170 | 34 | 35 | |    [English Web Treebank](https://catalog.ldc.upenn.edu/LDC2012T13)    | Sentences | Tokens | 36 | | --------- | ----: | -----: | 37 | | Answers | 2,699 | 43,916 | 38 | | Email | 2,983 | 44,168 | 39 | | Newsgroup | 1,996 | 37,816 | 40 | | Reviews | 2,915 | 44,337 | 41 | | Weblog | 1,753 | 38,770 | 42 | 43 | |           [QuestionBank](http://www.computing.dcu.ie/~jjudge/qtreebank/)           | Sentences | Tokens | 44 | | --------- | ----: | -----: | 45 | | Questions | 3,198 | 29,704 | 46 | 47 | |                [MiPACQ](http://clear.colorado.edu/compsem/index.php?page=endendsystems&sub=mipacq)                | Sentences | Tokens | 48 | | ------------------- | --------------: | -----------: | 49 | | Clinical questions | 1,600 | 30,138 | 50 | | Medpedia articles | 2,796 | 49,922 | 51 | | Clinical notes | 8,383 | 113,164 | 52 | | Pathological notes | 1,205 | 21,353 | 53 | 54 | | [SHARP](http://informatics.mayo.edu/sharp/index.php/Main_Page) | Sentences | Tokens | 55 | | -------------------------------------- | -----: | ------: | 56 | | Seattle group health notes   | 7,204 | 94,450 | 57 | | Clinical notes | 6,807 | 93,914 | 58 | | Stratified | 4,320 | 43,536 | 59 | | Stratified SGH | 13,662 | 139,403 | 60 | 61 | | [THYME](http://clear.colorado.edu/compsem/index.php?page=endendsystems&sub=temporal) | Sentences | Tokens | 62 | | ----------------------------- | -----: | ------: | 63 | | Clinical / pathological notes | 26,661 | 387,943 | 64 | | Brain cancer | 18,722 | 225,899 | 65 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/network/NLPSocketServer.java: -------------------------------------------------------------------------------- 1 | /** 2 | // * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package edu.emory.mathcs.nlp.network; 17 | 18 | import java.io.BufferedInputStream; 19 | import java.io.BufferedOutputStream; 20 | import java.io.DataInputStream; 21 | import java.io.DataOutputStream; 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | import java.io.OutputStream; 25 | import java.net.ServerSocket; 26 | import java.net.Socket; 27 | import java.util.concurrent.ExecutorService; 28 | import java.util.concurrent.Executors; 29 | 30 | import edu.emory.mathcs.nlp.common.util.IOUtils; 31 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 32 | 33 | 34 | /** 35 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 36 | */ 37 | public class NLPSocketServer 38 | { 39 | static public final String END = "!E@N#D$"; 40 | private NLPDecoder decoder; 41 | 42 | @SuppressWarnings("resource") 43 | public NLPSocketServer(InputStream configuration, int port, int threads) throws Exception 44 | { 45 | ExecutorService executor = Executors.newFixedThreadPool(threads); 46 | ServerSocket server = new ServerSocket(port); 47 | Socket client; 48 | 49 | decoder = new NLPDecoder(configuration); 50 | System.out.println("Listening..."); 51 | 52 | while (true) 53 | { 54 | client = server.accept(); 55 | executor.submit(new NLPTask(client)); 56 | } 57 | 58 | // executor.shutdown(); 59 | // server.close(); 60 | } 61 | 62 | class NLPTask implements Runnable 63 | { 64 | OutputStream out; 65 | InputStream in; 66 | Socket client; 67 | 68 | public NLPTask(Socket client) 69 | { 70 | try 71 | { 72 | in = new DataInputStream (new BufferedInputStream (client.getInputStream())); 73 | out = new DataOutputStream(new BufferedOutputStream(client.getOutputStream())); 74 | this.client = client; 75 | // System.out.println(client.getInetAddress().toString()); 76 | } 77 | catch (IOException e) {e.printStackTrace();} 78 | } 79 | 80 | @Override 81 | public void run() 82 | { 83 | StringBuilder build = new StringBuilder(); 84 | byte[] buffer = new byte[2048]; 85 | String s, format; 86 | int i, idx; 87 | 88 | try 89 | { 90 | while ((i = in.read(buffer, 0, buffer.length)) >= 0) 91 | { 92 | build.append(new String(buffer, 0, i)); 93 | 94 | if (build.toString().endsWith(END)) 95 | { 96 | idx = build.indexOf(":"); 97 | format = build.substring(0, idx); 98 | s = build.substring(idx+1, build.length()-END.length()); 99 | out.write(decoder.decodeByteArray(s, format)); 100 | out.close(); 101 | in.close(); 102 | break; 103 | } 104 | } 105 | } 106 | catch (IOException e) {e.printStackTrace();} 107 | } 108 | } 109 | 110 | static public void main(String[] args) throws Exception 111 | { 112 | final String configFile = args[0]; 113 | final int port = Integer.parseInt(args[1]); 114 | final int threads = Integer.parseInt(args[2]); 115 | new NLPSocketServer(IOUtils.createFileInputStream(configFile), port, threads); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-ner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 13 | edu/emory/mathcs/nlp/lexica/en-word-embeddings-undigitalized.xz 14 | edu/emory/mathcs/nlp/lexica/en-named-entity-gazetteers-simplified.xz 15 | 16 | 17 | 18 | 0.00001 19 | adagrad-mini-batch 20 | 0.02 21 | 0 22 | 23 | 5 24 | 20 25 | 0 26 | 27 | 28 | 29 | 86.98 30 | 0.01 31 | 2 32 | 0.05 33 | 0.005 34 | 35 | 36 | 37 | 38 | 39 | 40 | 
41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-pos.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | edu/emory/mathcs/nlp/lexica/en-ambiguity-classes-simplified-lowercase.xz 10 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 11 | 12 | 13 | 14 | 0.00001 15 | adagrad-mini-batch 16 | 0.02 17 | 2 18 | 19 | 5 20 | 40 21 | 0 22 | 23 | 24 | 25 | 97.48 26 | 0.01 27 | 2 28 | 0.05 29 | 0.005 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /md/quickstart/decode.md: -------------------------------------------------------------------------------- 1 | # Decode 2 | 3 | ## Command-Line 4 | 5 | The following command runs the NLP pipeline for tokenization, part-of-speech tagging, morphological analysis, named entity recognition, dependency parsing, and semantic role labeling: 6 | 7 | ```bash 8 | java edu.emory.mathcs.nlp.bin.NLPDecode -c -i [-ie -oe -format -threads ] 9 | 10 | -c : configuration filename (required) 11 | -i : input path (required) 12 | -ie : input file extension (default: *) 13 | -oe : output file extension (default: nlp) 14 | -format : format of the input data (raw|line|tsv; default: raw) 15 | -threads : number of threads (default: 2) 16 | ``` 17 | 18 | * `-c` specifies the configuration file (see [configuration](#configuration)). 19 | * `-i` specifies the input path pointing to either a file or a directory. When the path points to a file, only the specific file is processed. When the path points to a directory, all files with the file extension `-ie` under the specific directory are processed. 20 | * `-ie` specifies the input file extension. The default value `*` implies files with any extension. This option is used only when the input path `-i` points to a directory. 21 | * `-oe` specifies the output file extension appended to each input filename. The corresponding output file, consisting of the NLP output, will be generated. 22 | * `-format` specifies the format of the input file: `raw`, `line`, or `tsv` (see [data format](../supplements/data-format.md)). 23 | * `-threads` specifies the number of threads to be used. When multi-threads are used, each file is assigned to an individual thread. 24 | 25 | ## Example 26 | 27 | The following command takes [`nlp4j.txt`](../../src/test/resources/dat/nlp4j.txt) and generates [`nlp4j.txt.nlp`](../../src/test/resources/dat/nlp4j.txt.nlp) using [`config-decode-en.xml`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml). 
28 | 29 | ```bash 30 | $ java -Xmx4g -XX:+UseConcMarkSweepGC edu.emory.mathcs.nlp.bin.NLPDecode -c config-decode-general.xml -i emorynlp.txt 31 | 32 | Loading ambiguity classes 33 | Loading word clusters 34 | Loading word embeddings 35 | Loading named entity gazetteers 36 | Loading tokenizer 37 | Loading part-of-speech tagger 38 | Loading morphological analyzer 39 | Loading named entity recognizer 40 | Loading dependency parser 41 | 42 | nlp4j.txt 43 | ``` 44 | 45 | * Use the [`-XX:+UseConcMarkSweepGC`](http://www.oracle.com/technetwork/java/tuning-139912.html) option for JVM, which reduces the memory usage into a half. 46 | * Use [`log4j.properties`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/log4j.properties) for the [log4j](http://logging.apache.org/log4j/) configuration. 47 | * The output file is generated in the `tsv` format (see [data format](../supplements/data-format.md#tab-separated-values-format)). 48 | 49 | ## Configuration 50 | 51 | Sample configuration files for decoding can be found here: [`config-decode-*`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/). 52 | 53 | ```xml 54 | 55 | 56 | 57 | 58 | 59 | 60 | en-ambiguity-classes-simplified-lowercase.xz 61 | en-brown-clusters-simplified-lowercase.xz 62 | en-named-entity-gazetteers-simplified.xz 63 | en-word-embeddings-undigitalized.xz 64 | 65 | 66 | 67 | en-pos.xz 68 | en-ner.xz 69 | en-dep.xz 70 | 71 | 72 | ``` 73 | 74 | * ``: see [`configuration#tsv`](train.md#configuration). This does not need to be specified when `raw` or `sen` is used. 75 | * ``: see [`configuration#lexica`](train.md#configuration). 76 | * `` specifies the statistical model for each component (e.g., [english models](../supplements/english-lexica-models.md#models); see [`NLPMode`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/NLPMode.java)). -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/zzz/TokenizeIt.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package edu.emory.mathcs.nlp.zzz; 17 | 18 | import java.io.BufferedReader; 19 | import java.io.InputStream; 20 | import java.io.OutputStream; 21 | import java.io.PrintStream; 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | 25 | import edu.emory.mathcs.nlp.common.util.FileUtils; 26 | import edu.emory.mathcs.nlp.common.util.IOUtils; 27 | import edu.emory.mathcs.nlp.common.util.Splitter; 28 | import edu.emory.mathcs.nlp.component.template.node.FeatMap; 29 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 30 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 31 | import edu.emory.mathcs.nlp.tokenization.EnglishTokenizer; 32 | import edu.emory.mathcs.nlp.tokenization.Token; 33 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 34 | 35 | /** 36 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 37 | */ 38 | public class TokenizeIt 39 | { 40 | NLPDecoder decoder; 41 | Tokenizer tokenizer; 42 | 43 | public TokenizeIt(String configFile) 44 | { 45 | decoder = new NLPDecoder(IOUtils.createFileInputStream(configFile)); 46 | tokenizer = new EnglishTokenizer(); 47 | } 48 | 49 | public void convert(String inputDir, String outputDir) 50 | { 51 | for (String inputFile : FileUtils.getFileList(inputDir, "tsv")) 52 | { 53 | String outputFile = outputDir+"/"+FileUtils.getBaseName(inputFile); 54 | System.out.println(FileUtils.getBaseName(inputFile)); 55 | 56 | try 57 | { 58 | convert(IOUtils.createFileInputStream(inputFile), IOUtils.createFileOutputStream(outputFile)); 59 | } 60 | catch (Exception e) {e.printStackTrace();} 61 | } 62 | } 63 | 64 | public void convert(InputStream in, OutputStream out) throws Exception 65 | { 66 | BufferedReader reader = IOUtils.createBufferedReader(in); 67 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 68 | List list = new ArrayList<>(); 69 | NLPNode[] nodes; 70 | String line; 71 | String[] t; 72 | 73 | while ((line = reader.readLine()) != null && !(line = line.trim()).isEmpty()) 74 | { 75 | t = Splitter.splitTabs(line); 76 | list.add(new ItToken(t[0], t[1])); 77 | } 78 | 79 | for (List tokens : tokenizer.segmentize(list)) 80 | { 81 | nodes = decoder.toNodeArray(tokens, token -> create(token)); 82 | decoder.decode(nodes); 83 | check(nodes); 84 | fout.println(decoder.toString(nodes)+"\n"); 85 | } 86 | 87 | reader.close(); 88 | fout.close(); 89 | } 90 | 91 | public void check(NLPNode[] nodes) 92 | { 93 | for (int i=1; i> P_BEFORE, P_AFTER; 45 | Pattern NEW_LINE = Pattern.compile("\n"); 46 | 47 | public void categorize(String inputFile) throws Exception 48 | { 49 | CSVParser parser = new CSVParser(IOUtils.createBufferedReader(inputFile), CSVFormat.DEFAULT); 50 | List records = parser.getRecords(); 51 | StringJoiner join; 52 | CSVRecord record; 53 | 54 | for (int i=0; i<=500; i++) 55 | { 56 | if (i == 0) continue; 57 | record = records.get(i); 58 | join = new StringJoiner(" "); 59 | 60 | for (int j=2; j<7; j++) 61 | join.add(record.get(j)); 62 | 63 | System.out.println(join.toString()); 64 | } 65 | 66 | parser.close(); 67 | } 68 | 69 | public void tokenize(String inputFile, int outputStart) throws Exception 70 | { 71 | CSVParser parser = new CSVParser(IOUtils.createBufferedReader(inputFile), CSVFormat.DEFAULT); 72 | String inputPath = FileUtils.getPath(inputFile)+"/"; 73 | List records = parser.getRecords(); 74 | Tokenizer tokenizer = new EnglishTokenizer(); 75 | 76 | P_BEFORE = new ArrayList<>(); 77 | P_AFTER = new ArrayList<>(); 78 | for (String s : BEFORE) P_BEFORE.add(new Pair<>(Pattern.compile(s), "\n"+s)); 79 | 
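        // strings in AFTER get the newline appended instead; print() later splits on these
        // newlines so each matched string ends up on its own output line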
for (String s : AFTER ) P_AFTER .add(new Pair<>(Pattern.compile(s), s+"\n")); 80 | 81 | for (int i=0; i tokens : tokenizer.segmentize(records.get(i).get(0))) 86 | print(fout, tokens); 87 | 88 | fout.close(); 89 | } 90 | 91 | parser.close(); 92 | } 93 | 94 | String getOuputFilename(String inputPath, int index) 95 | { 96 | StringBuilder build = new StringBuilder(); 97 | 98 | build.append(inputPath); 99 | if (index < 1000) build.append(0); 100 | if (index < 100) build.append(0); 101 | if (index < 10) build.append(0); 102 | build.append(index); 103 | build.append(".txt"); 104 | 105 | return build.toString(); 106 | } 107 | 108 | void print(PrintStream fout, List tokens) 109 | { 110 | String s = Joiner.join(tokens, " "); 111 | 112 | for (Pair p : P_BEFORE) 113 | { 114 | Matcher m = p.o1.matcher(s); 115 | if (m.find()) s = m.replaceAll(p.o2); 116 | } 117 | 118 | for (Pair p : P_AFTER) 119 | { 120 | Matcher m = p.o1.matcher(s); 121 | if (m.find()) s = m.replaceAll(p.o2); 122 | } 123 | 124 | for (String t : NEW_LINE.split(s)) 125 | { 126 | t = t.trim(); 127 | if (!t.isEmpty()) fout.println(t.trim()); 128 | } 129 | } 130 | 131 | static public void main(String[] args) 132 | { 133 | // String inputFile = "/Users/jdchoi/Emory/radiology/tools/500/500-original.csv"; 134 | // String inputFile = "/Users/jdchoi/Emory/radiology/dat/radiology_report_151112_lemmon.csv"; 135 | 136 | String inputFile = "/Users/jdchoi/Emory/radiology/de-identification/1986/Remaining_1986Reports_FULL.csv"; 137 | 138 | try 139 | { 140 | CSVRadiology cvs = new CSVRadiology(); 141 | cvs.tokenize(inputFile, 500); 142 | // cvs.categorize(inputFile); 143 | } 144 | catch (Exception e) {e.printStackTrace();} 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /md/quickstart/release.md: -------------------------------------------------------------------------------- 1 | # Release Notes 2 | 3 | ## Version 1.1.2 (06/29/2016) 4 | 5 | * Bugfixes: [tokenization-issue-7](https://github.com/emorynlp/nlp4j-tokenization/issues/7) 6 | * Features: [tokenization-issue-6](https://github.com/emorynlp/nlp4j-tokenization/issues/6) 7 | * The tokenizer does not tokenize left/right brackets where the content inside is a single character or all numbers (e.g., `(a)`,`[12]`). 8 | 9 | ## Version 1.1.1 (04/29/2016) 10 | 11 | * Bugfixes: [core-pull-7](https://github.com/emorynlp/nlp4j-core/pull/7). 12 | * Features: [issue-3](https://github.com/emorynlp/nlp4j/issues/3/), [issue-6](https://github.com/emorynlp/nlp4j/issues/6). 13 | * [NLPNode](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/NLPNode.java) comes with several useful constructors. 14 | * The `segmentize` method in [Tokenizer](https://github.com/emorynlp/nlp4j-tokenization/blob/master/src/main/java/edu/emory/mathcs/nlp/tokenization/Tokenizer.java) takes the generic type of [Token](https://github.com/emorynlp/nlp4j-tokenization/blob/master/src/main/java/edu/emory/mathcs/nlp/tokenization/Token.java). 15 | 16 | ## Version 1.1.0 (04/20/2016) 17 | 18 | * All the statistical models are about twice smaller than the previous ones without compromising accuracy. The whole pipeline can be run in 4GB of RAM now. 19 | * [Training](train.md) automatically saves the best model in a single pass (no need to run training twice any more to save the best model). 
20 | * The [nlp4j-common](https://github.com/emorynlp/nlp4j-common) project is separated out from the [nlp4j-core](https://github.com/emorynlp/nlp4j-core) project. 21 | * [GlobalLexica](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/GlobalLexica.java) is no longer static, so it does not get conflicted by another process. 22 | * [NLPNode](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/NLPNode.java) extends [AbstractNLPNode](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/AbstractNLPNode.java), which allows to create your own custom node. Generics are added all over for this change (e.g., [NLPState](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/state/NLPState.java), [OnlineComponent](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/OnlineComponent.java)). 23 | * The part-of-speech tagger gives the 2nd-best predictions when the best predictions have low confidence (`pos2` in the extra feats). 24 | * Thanks to [Anatoly Vostryakov](https://github.com/avostryakov): [`adjective.exc`](https://github.com/emorynlp/nlp4j-morphology/blob/master/src/main/resources/edu/emory/mathcs/nlp/component/morph/english/adjective.exc) and [`adverb.base`](https://github.com/emorynlp/nlp4j-morphology/blob/master/src/main/resources/edu/emory/mathcs/nlp/component/morph/english/adverb.base) are cleaned up. 25 | * Thanks to [spraynasal](https://github.com/spraynasal): some bugs in tokenization are fixed [5](https://github.com/emorynlp/nlp4j-tokenization/pull/5). 26 | 27 | ## Version 1.0.0 (02/24/2016) 28 | 29 | * NLP4J is the successor of the widely used toolkit, [ClearNLP](https://github.com/clir/clearnlp), developed by the [NLP Research Group](http://nlp.mathcs.emory.edu) at Emory University. Please visit our [Github page](https://github.com/emorynlp/nlp4j) for more details about this project. 30 | * This version supports tokenization, part-of-speech tagging, morphological analysis, named entity recognition, and dependency parsing. The next release (March, 2016) will include supports for semantic role labeling and sentiment analysis, and the following release (April, 2016) will include supports for coreference resolution. 31 | * NLP4J makes it easy to train your own model. Please see [how to train](train.md) for more details about the training process. 32 | * Calling the decoding API is easier than ever. See [NLPDemo](../../src/main/java/edu/emory/mathcs/nlp/bin/NLPDemo.java) for more details. 33 | * The biggest difference between NLP4J and ClearNLP is in machine learning. NLP4J is capable of updating existing models with new training data, which is useful for domain adaptation. We also started implementing a deep learning package although we realized that the GPU support for Java is pretty limited and without a good GPU support, deep learning would make everything much slower. Please let us know if you'd like to contribute for this project. 34 | * One could consider the NLP4J project is a more stabilized version of ClearNLP. I have been using this package for the NLP course I teach, and my students (including undergrads) were able to develop new NLP components without much effort using the built-in APIs in NLP4J. We are preparing a tutorial for developing NLP components using NLP4J. 
35 | * We do not expect our tools would work perfectly out of box. We now have a good team working on this project. Please let us know if you'd like to collaborate so we can make this project more robust for you. 36 | * Please visit our [online demo](http://nlp.mathcs.emory.edu:8080/nlp4j). It parses 10K tokens with a couple of seconds and visualizes the dependency trees. 37 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/zzz/CSVSentiment.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.zzz; 17 | 18 | import java.io.PrintStream; 19 | import java.util.List; 20 | 21 | import org.apache.commons.csv.CSVFormat; 22 | import org.apache.commons.csv.CSVParser; 23 | import org.apache.commons.csv.CSVRecord; 24 | 25 | import edu.emory.mathcs.nlp.common.util.FileUtils; 26 | import edu.emory.mathcs.nlp.common.util.IOUtils; 27 | import edu.emory.mathcs.nlp.common.util.Joiner; 28 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 29 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 30 | import edu.emory.mathcs.nlp.tokenization.EnglishTokenizer; 31 | import edu.emory.mathcs.nlp.tokenization.Token; 32 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 33 | 34 | /** 35 | * @author Jinho D. 
Choi ({@code jinho.choi@emory.edu}) 36 | */ 37 | public class CSVSentiment 38 | { 39 | Tokenizer tokenizer; 40 | NLPDecoder decode; 41 | 42 | public CSVSentiment() 43 | { 44 | tokenizer = new EnglishTokenizer(); 45 | } 46 | 47 | public CSVSentiment(String configurationFile) 48 | { 49 | decode = new NLPDecoder(IOUtils.createFileInputStream(configurationFile)); 50 | } 51 | 52 | public void categorize(String inputFile) throws Exception 53 | { 54 | CSVParser parser = new CSVParser(IOUtils.createBufferedReader(inputFile), CSVFormat.DEFAULT); 55 | List records = parser.getRecords(); 56 | List document; 57 | String outputDir; 58 | PrintStream fout; 59 | CSVRecord record; 60 | 61 | System.out.println(inputFile); 62 | 63 | for (int i=0; i records = parser.getRecords(); 84 | List tokens; 85 | CSVRecord record; 86 | int label; 87 | 88 | System.out.println(inputFile); 89 | 90 | for (int i=0; i records = parser.getRecords(); 108 | CSVRecord record; 109 | 110 | System.out.println(inputFile); 111 | 112 | for (int i=0; i inputFiles = FileUtils.getFileList("/Users/jdchoi/Documents/Data/semeval-sentiment/csv", "csv"); 137 | 138 | try 139 | { 140 | // String configurationFile = "/Users/jdchoi/Documents/EmoryNLP/nlp4j/src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml"; 141 | // CSVSentiment cvs = new CSVSentiment(configurationFile); 142 | // for (String inputFile : inputFiles) cvs.categorize(inputFile); 143 | 144 | // CSVSentiment cvs = new CSVSentiment(); 145 | // for (String inputFile : inputFiles) cvs.toTSV(inputFile); 146 | 147 | CSVSentiment cvs = new CSVSentiment(); 148 | for (String inputFile : inputFiles) cvs.toTXT(inputFile); 149 | } 150 | catch (Exception e) {e.printStackTrace();} 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/bin/DEPEvaluate.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.bin; 17 | 18 | import edu.emory.mathcs.nlp.common.util.BinUtils; 19 | import edu.emory.mathcs.nlp.common.util.FileUtils; 20 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 21 | import edu.emory.mathcs.nlp.component.template.reader.TSVReader; 22 | import edu.emory.mathcs.nlp.decode.DecodeConfig; 23 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 24 | import org.kohsuke.args4j.Option; 25 | 26 | import java.io.InputStream; 27 | import java.nio.file.Files; 28 | import java.nio.file.Paths; 29 | import java.nio.file.StandardOpenOption; 30 | import java.util.Collections; 31 | import java.util.List; 32 | 33 | /** 34 | * A command-line program that does LAS/UAS evaluation for dependency parsing. 35 | * By default, it allows NLP4J to predict the part of speech tags. Optionally, 36 | * it will use POS tags from the input TSV. 
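 * UAS counts a token as correct when its predicted head matches the gold head; LAS additionally
 * requires the predicted dependency label to match. When POS tags are predicted (the default),
 * POS accuracy is reported as well.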
37 | */ 38 | public class DEPEvaluate { 39 | 40 | @Option(name="-c", usage="configuration filename (required)", required=true, metaVar="") 41 | public String configuration_file; 42 | @Option(name="-i", usage="input path (required)", required=true, metaVar="") 43 | public String input_path; 44 | @Option(name="-goldPos", usage = "use gold pos tags") 45 | public boolean useGoldPos; 46 | 47 | private DEPEvaluate(String[] args) throws Exception { 48 | BinUtils.initArgs(args, this); 49 | List filelist = FileUtils.getFileList(input_path, "*", false); 50 | Collections.sort(filelist); 51 | 52 | DecodeConfig decodeConfig; 53 | try (InputStream config = Files.newInputStream(Paths.get(configuration_file), StandardOpenOption.READ)) { 54 | decodeConfig = new DecodeConfig(config); 55 | } 56 | 57 | NLPDecoder decoder = new NLPDecoder(decodeConfig); 58 | 59 | List sentences; 60 | try (InputStream is = Files.newInputStream(Paths.get(filelist.get(0)), StandardOpenOption.READ)) { 61 | TSVReader reader = new TSVReader(decodeConfig.getReaderFieldMap()) 62 | { 63 | @Override 64 | protected NLPNode create() {return new NLPNode();} 65 | }; 66 | 67 | reader.open(is); 68 | sentences = reader.readDocument(); 69 | } 70 | 71 | int uas = 0; 72 | int las = 0; 73 | int pos = 0; 74 | int total = 0; 75 | 76 | for (NLPNode[] sentence : sentences) { 77 | int[] goldHeads = new int[sentence.length]; 78 | String[] goldLabels = new String[sentence.length]; 79 | String[] goldPos = new String[sentence.length]; 80 | for (int x = 1; x < sentence.length; x++) { 81 | // capture gold and erase it so we recreate it in the decode. 82 | goldHeads[x] = sentence[x].getDependencyHead().getID(); 83 | sentence[x].setDependencyHead(null); 84 | goldLabels[x] = sentence[x].getDependencyLabel(); 85 | sentence[x].setDependencyLabel(null); 86 | // also forget the POS tag 87 | if (!useGoldPos) { 88 | goldPos[x] = sentence[x].getPartOfSpeechTag(); 89 | sentence[x].setPartOfSpeechTag(null); 90 | } 91 | } 92 | decoder.decode(sentence); 93 | for (int x = 1; x < sentence.length; x++) { 94 | total++; 95 | if (!useGoldPos) { 96 | if (goldPos[x].equals(sentence[x].getPartOfSpeechTag())) { 97 | pos++; 98 | } 99 | } 100 | 101 | if (goldHeads[x] == sentence[x].getDependencyHead().getID()) { 102 | uas++; 103 | if (goldLabels[x].equals(sentence[x].getDependencyLabel())) { 104 | las++; 105 | } 106 | } 107 | } 108 | } 109 | 110 | double uscore = ((double)uas)/total; 111 | double lscore = ((double)las)/total; 112 | if (!useGoldPos) { 113 | double posscore = ((double)pos)/total; 114 | System.out.format("UAS %.02f LAS %.02f POS %.02f total tokens %d%n", uscore, lscore, posscore, total); 115 | } else { 116 | System.out.format("UAS %.02f LAS %.02f total tokens %d%n", uscore, lscore, total); 117 | } 118 | } 119 | 120 | public static void main(String args[]) throws Exception { 121 | new DEPEvaluate(args); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | edu.emory.mathcs.nlp 5 | nlp4j 6 | 1.1.3-SNAPSHOT 7 | jar 8 | 9 | http://nlp.mathcs.emory.edu 10 | 11 | 12 | https://github.com/emorynlp/nlp4j 13 | scm:git:git://github.com/emorynlp/nlp4j.git 14 | scm:git:git@github.com:emorynlp/nlp4j.git 15 | HEAD 16 | 17 | 18 | 19 | org.sonatype.oss 20 | oss-parent 21 | 9 22 | 23 | 24 | 25 | 26 | The Apache Software License, Version 2.0 27 | http://www.apache.org/licenses/LICENSE-2.0.txt 28 | 29 | 30 | 31 | 32 | 33 | 
jinho.choi 34 | Jinho D. Choi 35 | {id}@emory.edu 36 | 37 | 38 | 39 | 40 | UTF-8 41 | 1.8 42 | 1.8 43 | 44 | 45 | 46 | 47 | 48 | src/main/resources 49 | 50 | **/* 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | org.eclipse.m2e 59 | lifecycle-mapping 60 | 1.0.0 61 | 62 | 63 | 64 | 65 | 66 | org.apache.maven.plugins 67 | maven-enforcer-plugin 68 | [1.0.0,) 69 | 70 | enforce 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | org.apache.maven.plugins 87 | maven-release-plugin 88 | 89 | true 90 | false 91 | release 92 | deploy 93 | 94 | 95 | 96 | org.apache.maven.plugins 97 | maven-compiler-plugin 98 | 3.5.1 99 | 100 | ${maven.compiler.source} 101 | ${maven.compiler.target} 102 | 103 | 104 | 105 | org.apache.maven.plugins 106 | maven-javadoc-plugin 107 | 2.10.3 108 | 109 | 110 | attach-javadocs 111 | 112 | jar 113 | 114 | 115 | -Xdoclint:none 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | org.slf4j 126 | slf4j-log4j12 127 | 1.7.21 128 | 129 | 130 | edu.emory.mathcs.nlp 131 | nlp4j-common 132 | 1.1.2 133 | 134 | 135 | edu.emory.mathcs.nlp 136 | nlp4j-core 137 | 1.1.2 138 | 139 | 140 | edu.emory.mathcs.nlp 141 | nlp4j-tokenization 142 | 1.1.2 143 | 144 | 145 | edu.emory.mathcs.nlp 146 | nlp4j-morphology 147 | 1.1.2 148 | 149 | 150 | junit 151 | junit 152 | 4.12 153 | 154 | 155 | com.google.guava 156 | guava 157 | 18.0 158 | test 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /src/test/resources/dat/sample-trn.tsv: -------------------------------------------------------------------------------- 1 | 1 The the DT _ 3 det _ B-ORG 2 | 2 Education education NNP _ 3 compound _ I-ORG 3 | 3 Directorate directorate NNP _ 8 nsubj 8:A0;9:A0 I-ORG 4 | 4 for for IN _ 3 prep _ I-ORG 5 | 5 Holy holy NNP _ 6 compound _ I-ORG 6 | 6 Mecca mecca NNP _ 4 pobj _ L-ORG 7 | 7 has have VBZ _ 8 aux _ O 8 | 8 finished finish VBN pb=finish.01 0 root _ O 9 | 9 preparing prepare VBG pb=prepare.01 8 xcomp 8:A1 O 10 | 10 a a DT _ 13 det _ O 11 | 11 new new JJ _ 13 amod _ O 12 | 12 computer computer NN _ 13 compound _ O 13 | 13 program program NN _ 9 dobj 15:A0;9:A1 O 14 | 14 to to TO _ 15 aux _ O 15 | 15 monitor monitor VB pb=monitor.01 13 relcl _ O 16 | 16 disadvantaged disadvantaged JJ _ 19 amod _ O 17 | 17 and and CC _ 16 cc _ O 18 | 18 deprived deprived JJ _ 16 conj _ O 19 | 19 students student NNS _ 15 dobj 15:A1 O 20 | 20 in in IN sem=LOC 19 prep _ O 21 | 21 schools school NNS _ 20 pobj 22:A1 O 22 | 22 supervised supervise VBN pb=supervise.01 21 acl _ O 23 | 23 by by IN _ 22 agent 22:A0 O 24 | 24 the the DT _ 25 det _ O 25 | 25 directorate directorate NN _ 23 pobj _ O 26 | 26 , , , _ 21 punct _ O 27 | 27 of of IN _ 32 prep _ O 28 | 28 which which WDT _ 27 pobj _ O 29 | 29 there there EX _ 30 expl _ O 30 | 30 are be VBP pb=be.02 21 relcl _ O 31 | 31 over over IN syn=PRD 30 prep 30:A1 B-QUANTITY 32 | 32 500 0 CD _ 31 pobj _ L-QUANTITY 33 | 33 . . . 
_ 8 punct _ O 34 | 35 | 1 Bakr bakr NNP _ 3 compound _ B-PERSON 36 | 2 Ibrahim ibrahim NNP _ 3 compound _ I-PERSON 37 | 3 Basfar basfar NNP _ 11 nsubj 11:A0 L-PERSON 38 | 4 , , , _ 3 punct _ O 39 | 5 Director director NNP _ 7 compound _ O 40 | 6 - - HYPH _ 7 punct _ O 41 | 7 General general NNP _ 3 appos _ O 42 | 8 of of IN _ 7 prep _ O 43 | 9 Education education NNP _ 8 pobj _ O 44 | 10 , , , _ 11 punct _ O 45 | 11 said say VBD pb=say.01 0 root _ O 46 | 12 the the DT _ 13 det _ O 47 | 13 program program NN _ 15 nsubjpass 15:A1 O 48 | 14 was be VBD _ 15 auxpass _ O 49 | 15 aimed aim VBN pb=aim.02 11 ccomp 11:A1 O 50 | 16 at at IN syn=CLR 15 prep 15:A2 O 51 | 17 identifying identify VBG pb=identify.01|syn=NOM 16 pcomp _ O 52 | 18 students student NNS _ 17 dobj 17:A1 O 53 | 19 in in IN _ 18 prep _ O 54 | 20 need need NN _ 19 pobj _ O 55 | 21 in in IN sem=LOC 18 prep _ O 56 | 22 schools school NNS _ 21 pobj 23:A1 O 57 | 23 run run VBN pb=run.01 22 acl _ O 58 | 24 by by IN _ 23 agent 23:A0 O 59 | 25 the the DT _ 26 det _ O 60 | 26 directorate directorate NN _ 24 pobj _ O 61 | 27 . . . _ 11 punct _ O 62 | 63 | 1 The the DT _ 2 det _ _ 64 | 2 program program NN _ 4 nsubj 4:A1 _ 65 | 3 will will MD _ 4 aux 4:AM-MOD _ 66 | 4 be be VB pb=be.01 0 root _ _ 67 | 5 an an DT _ 7 det _ _ 68 | 6 important important JJ _ 7 amod _ _ 69 | 7 resource resource NN syn=PRD 4 attr 4:A2 _ 70 | 8 for for IN _ 7 prep _ _ 71 | 9 all all DT _ 10 det _ _ 72 | 10 associations association NNS _ 8 pobj 15:A0;17:A0 _ 73 | 11 and and CC _ 10 cc _ _ 74 | 12 charitable charitable JJ _ 13 amod _ _ 75 | 13 organizations organization NNS _ 10 conj _ _ 76 | 14 who who WP _ 15 nsubj 15:R-A0;17:R-A0 _ 77 | 15 wish wish VBP pb=wish.01 10 relcl _ _ 78 | 16 to to TO _ 17 aux _ _ 79 | 17 identify identify VB pb=identify.01 15 xcomp 15:A1 _ 80 | 18 poor poor JJ _ 19 amod _ _ 81 | 19 students student NNS _ 17 dobj 17:A1 _ 82 | 20 in in IN _ 19 prep _ _ 83 | 21 need need NN _ 20 pobj _ _ 84 | 22 of of IN _ 21 prep _ _ 85 | 23 support support NN _ 22 pobj _ _ 86 | 24 . . . _ 4 punct _ _ 87 | 88 | 1 He he PRP _ 2 nsubj 2:A0 _ 89 | 2 said say VBD pb=say.01 0 root _ _ 90 | 3 that that IN _ 7 mark _ _ 91 | 4 school school NN _ 5 compound _ _ 92 | 5 principals principal NNS _ 7 nsubj 7:A0 _ 93 | 6 would would MD _ 7 aux 7:AM-MOD _ 94 | 7 supervise supervise VB pb=supervise.01 2 ccomp 2:A1 _ 95 | 8 its its PRP$ _ 9 poss _ _ 96 | 9 implementation implementation NN _ 7 dobj 7:A1 _ 97 | 10 to to TO _ 11 aux _ _ 98 | 11 assure assure VB pb=assure.01|sem=PRP 7 advcl 7:AM-PRP _ 99 | 12 the the DT _ 13 det _ _ 100 | 13 accuracy accuracy NN _ 11 dobj 11:A2 _ 101 | 14 and and CC _ 13 cc _ _ 102 | 15 correctness correctness NN _ 13 conj _ _ 103 | 16 of of IN _ 13 prep _ _ 104 | 17 data datum NNS _ 16 pobj _ _ 105 | 18 and and CC _ 11 cc _ _ 106 | 19 to to TO _ 20 aux _ _ 107 | 20 register register VB pb=register.02 11 conj _ _ 108 | 21 disadvantaged disadvantaged JJ _ 22 amod _ _ 109 | 22 students student NNS _ 20 dobj 20:A1 _ 110 | 23 and and CC _ 20 cc _ _ 111 | 24 attach attach VB pb=attach.01 20 conj _ _ 112 | 25 a a DT _ 26 det _ _ 113 | 26 copy copy NN _ 24 dobj 24:A1 _ 114 | 27 of of IN _ 26 prep _ _ 115 | 28 their their PRP$ _ 31 poss _ _ 116 | 29 family family NN _ 31 compound _ _ 117 | 30 identity identity NN _ 31 compound _ _ 118 | 31 card card NN _ 27 pobj _ _ 119 | 32 . . . 
_ 2 punct _ _ 120 | 121 | 1 The the DT _ 4 det _ O 122 | 2 Director director NNP _ 4 compound _ O 123 | 3 - - HYPH _ 4 punct _ O 124 | 4 General general NNP _ 10 nsubj 10:A0;13:A0 O 125 | 5 of of IN _ 4 prep _ O 126 | 6 Education education NNP _ 5 pobj _ O 127 | 7 for for IN _ 6 prep _ O 128 | 8 Holy holy NNP _ 9 compound _ B-GPE 129 | 9 Mecca mecca NNP _ 7 pobj _ L-GPE 130 | 10 went go VBD pb=go.06 0 root _ O 131 | 11 on on RP _ 10 prt 10:A2 O 132 | 12 to to TO _ 13 aux _ O 133 | 13 say say VB pb=say.01 10 xcomp 10:A1 O 134 | 14 that that IN _ 19 mark _ O 135 | 15 the the DT _ 16 det _ O 136 | 16 program program NN _ 19 nsubjpass 19:A1 O 137 | 17 would would MD _ 19 aux 19:AM-MOD O 138 | 18 be be VB _ 19 auxpass _ O 139 | 19 restricted restrict VBN pb=restrict.01 13 ccomp 13:A1 O 140 | 20 to to IN syn=CLR 19 prep 19:A2 O 141 | 21 students student NNS _ 20 pobj 27:A0 O 142 | 22 in in IN _ 21 prep _ O 143 | 23 need need NN _ 22 pobj _ O 144 | 24 of of IN _ 23 prep _ O 145 | 25 zakat zakat FW _ 24 pobj _ O 146 | 26 who who WP _ 27 nsubj 27:R-A0 O 147 | 27 receive receive VBP pb=receive.01 21 relcl _ O 148 | 28 no no DT _ 29 det _ O 149 | 29 assistance assistance NN _ 27 dobj 27:A1 O 150 | 30 from from IN syn=CLR 27 prep 27:A2 O 151 | 31 the the DT _ 32 det _ O 152 | 32 school school NN _ 30 pobj _ O 153 | 33 and and CC _ 21 cc _ O 154 | 34 those those DT _ 21 conj 37:A1 O 155 | 35 whose whose WP$ _ 36 poss _ O 156 | 36 fathers father NNS _ 37 nsubj 37:R-A1 O 157 | 37 are be VBP pb=be.01 34 relcl _ O 158 | 38 disabled disabled JJ syn=PRD 37 acomp 37:A2 O 159 | 39 , , , _ 38 punct _ O 160 | 40 in in IN _ 38 conj _ O 161 | 41 prison prison NN _ 40 pobj _ O 162 | 42 or or CC _ 40 cc _ O 163 | 43 absent absent JJ _ 40 conj _ O 164 | 44 and and CC _ 38 cc _ O 165 | 45 with with IN _ 38 conj _ O 166 | 46 no no DT _ 49 det _ O 167 | 47 other other JJ _ 49 amod _ O 168 | 48 financial financial JJ _ 49 amod _ O 169 | 49 support support NN _ 45 pobj _ O 170 | 50 . . . _ 10 punct _ O 171 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/zzz/NEGazetteerCreate.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package edu.emory.mathcs.nlp.zzz; 17 | 18 | import java.io.BufferedReader; 19 | import java.io.InputStream; 20 | import java.io.OutputStream; 21 | import java.io.PrintStream; 22 | import java.util.ArrayList; 23 | import java.util.Collections; 24 | import java.util.HashSet; 25 | import java.util.Iterator; 26 | import java.util.List; 27 | import java.util.Set; 28 | import java.util.stream.Collectors; 29 | 30 | import edu.emory.mathcs.nlp.common.constant.StringConst; 31 | import edu.emory.mathcs.nlp.common.util.CharUtils; 32 | import edu.emory.mathcs.nlp.common.util.IOUtils; 33 | import edu.emory.mathcs.nlp.common.util.Joiner; 34 | import edu.emory.mathcs.nlp.common.util.StringUtils; 35 | import edu.emory.mathcs.nlp.tokenization.EnglishTokenizer; 36 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 37 | 38 | /** 39 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 40 | */ 41 | public class NEGazetteerCreate 42 | { 43 | Tokenizer tokenizer; 44 | 45 | public NEGazetteerCreate() 46 | { 47 | tokenizer = new EnglishTokenizer(); 48 | } 49 | 50 | public Set union(InputStream wiki, InputStream redirect, boolean skipColon, boolean single) throws Exception 51 | { 52 | Set set = new HashSet<>(); 53 | read(wiki, set, false, skipColon, single); 54 | read(redirect, set, true, skipColon, single); 55 | return set; 56 | } 57 | 58 | public void read(InputStream in, Set set, boolean redirect, boolean skipColon, boolean single) throws Exception 59 | { 60 | BufferedReader reader = IOUtils.createBufferedReader(in); 61 | List tokens; 62 | String line; 63 | 64 | while ((line = reader.readLine()) != null) 65 | { 66 | line = line.trim(); 67 | if (skipColon && (line.contains(":") || line.contains(" of "))) continue; 68 | if (redirect) line = splitRedirect(line); 69 | tokens = tokenizer.tokenize(line).stream().map(n -> n.getWordForm()).collect(Collectors.toList()); 70 | concatPeriod(tokens); 71 | trimTokens(tokens, single); 72 | if (!tokens.isEmpty()) set.add(Joiner.join(tokens, " ")); 73 | } 74 | 75 | System.out.println(set.size()); 76 | reader.close(); 77 | } 78 | 79 | private void concatPeriod(List tokens) 80 | { 81 | if (tokens.size() == 2 && tokens.get(1).equals(".") && tokens.get(0).contains(".")) 82 | { 83 | tokens.set(0, tokens.get(0)+"."); 84 | tokens.remove(1); 85 | } 86 | } 87 | 88 | public String splitRedirect(String s) 89 | { 90 | if (s.contains(StringConst.SPACE)) return s; 91 | StringBuilder build = new StringBuilder(); 92 | char[] cs = s.toCharArray(); 93 | int i, len = cs.length; 94 | 95 | for (i=0; i tokens, boolean single) 107 | { 108 | Iterator it = tokens.iterator(); 109 | int i, bIdx = -1; 110 | String s; 111 | 112 | for (i=0; i= 0) 117 | { 118 | tokens.subList(bIdx, i+1).clear(); 119 | break; 120 | } 121 | } 122 | 123 | while (it.hasNext()) 124 | { 125 | s = it.next(); 126 | 127 | if (StringUtils.containsPunctuationOnly(s)) 128 | it.remove(); 129 | else 130 | break; 131 | } 132 | 133 | for (i=tokens.size()-1; i>=0; i--) 134 | { 135 | if (StringUtils.containsPunctuationOnly(tokens.get(i))) 136 | tokens.remove(i); 137 | else 138 | break; 139 | } 140 | 141 | if (tokens.size() == 1 && ((single && !tokens.get(0).contains(".")) || StringUtils.containsDigitOnly(tokens.get(0)))) 142 | tokens.clear(); 143 | 144 | // if (tokens.size() == 1) System.out.println(tokens.get(0)); 145 | } 146 | 147 | public void print(OutputStream out, Set set) 148 | { 149 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 150 | List list = new ArrayList<>(set); 151 | Collections.sort(list); 152 
| 153 | for (String s : list) 154 | fout.println(s); 155 | 156 | fout.close(); 157 | } 158 | 159 | static public void main(String[] args) throws Exception 160 | { 161 | final String DIR = args[0]; 162 | 163 | NEGazetteerCreate dict = new NEGazetteerCreate(); 164 | Set set; 165 | String path; 166 | 167 | path = DIR+"/WikiArtWork"; 168 | System.out.println(path); 169 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, true); 170 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 171 | 172 | path = DIR+"/WikiFilms"; 173 | System.out.println(path); 174 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, true); 175 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 176 | 177 | path = DIR+"/WikiSongs"; 178 | System.out.println(path); 179 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, true); 180 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 181 | 182 | path = DIR+"/WikiManMadeObjectNames"; 183 | System.out.println(path); 184 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, false); 185 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 186 | 187 | path = DIR+"/WikiCompetitionsBattlesEvents"; 188 | System.out.println(path); 189 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, false); 190 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 191 | 192 | path = DIR+"/WikiLocations"; 193 | System.out.println(path); 194 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), true, false); 195 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 196 | 197 | path = DIR+"/WikiOrganizations"; 198 | System.out.println(path); 199 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), true, false); 200 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 201 | 202 | path = DIR+"/WikiPeople"; 203 | System.out.println(path); 204 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), true, false); 205 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 206 | 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-dep.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 14 | 15 | 16 | 17 | 0.00001 18 | adagrad-mini-batch 19 | 0.02 20 | 2 21 | 22 | 5 23 | 20 24 | 0 25 | 26 | 27 | 28 | 88.91 29 | 0.01 30 | 2 31 | 0.04 32 | 0.005 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 
| 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /md/tutorial/tree_based_nlp_component.md: -------------------------------------------------------------------------------- 1 | # Pleonastic It 2 | 3 | In this tutorial, we will create an NLP component that traverses every node in a dependency tree, and classifies it into a specific type of [pleonastic-it](https://github.com/emorynlp/pleonastic-it). Let us begin by cloning the [nlp4j-core](https://github.com/emorynlp/nlp4j-core) repository (if you haven't already). 4 | 5 | ```bash 6 | git clone https://github.com/emorynlp/nlp4j-core.git 7 | ``` 8 | 9 | ## Package 10 | 11 | Create a package [`pleonastic`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/pleonastic) under [`edu.emory.mathcs.nlp.component`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/). 12 | 13 | 14 | ## State 15 | 16 | Create a class [`PleonasticState`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/pleonastic/PleonasticState.java) extending [`NLPState`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/template/state/NLPState.java). Add the following member instances to the class (you will see the use of these instances later). 17 | 18 | ```java 19 | public class PleonasticState extends NLPState 20 | { 21 | static public final String KEY = "it"; 22 | private String[] oracle; 23 | private int input; 24 | } 25 | ``` 26 | 27 | Define a constructor that takes an array of [`NLPNode`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/NLPNode.java). `nodes[0]` is the artificial root, `nodes[1]` represents the first token in a sentence, and so on. `input` indicates the index of the node to be processed; it is initialized to `0`, pointing to the root node. `shift` finds the next node whose lemma is `it`. 28 | 29 | ```java 30 | public PleonasticState(NLPNode[] nodes) 31 | { 32 | super(nodes); 33 | input = 0; 34 | shift(); 35 | } 36 | 37 | private void shift() 38 | { 39 | for (input++; input n.removeFeat(FEAT_KEY)).toArray(String[]::new); 55 | return Arrays.stream(oracle).filter(o -> o != null).findFirst().isPresent(); 56 | } 57 | 58 | @Override 59 | public String getOracle() 60 | { 61 | return oracle[input]; 62 | } 63 | ``` 64 | 65 | Second, override `next`, which takes system or oracle predictions of the current state, applies the top prediction to the current state, and moves onto the next state. Then, override `isTerminal`, which returns `true` if no more state is available; in other words, no more input node is left to be processed. 66 | 67 | 68 | ```java 69 | /** 70 | * @param map retrieves the string label from its index. 71 | * @param yhat index of the top predicated label. 72 | * @param scores scores of all labels. 
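 * The chosen label is written into the node's feat map under the component's feature key,
 * and shift() then advances to the next node whose lemma is "it".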
73 | */ 74 | @Override 75 | public void next(LabelMap map, int yhat, float[] scores) 76 | { 77 | String label = map.getLabel(yhat); 78 | nodes[input].putFeat(FEAT_KEY, label); 79 | shift(); 80 | } 81 | 82 | @Override 83 | public boolean isTerminate() 84 | { 85 | return input >= nodes.length; 86 | } 87 | ``` 88 | 89 | Third, override `getNode`, which takes a [`FeatureItem`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/feature/FeatureItem.java) and returns the node indicated by the feature template. Given the input node `nodes[input]`, the feature template specifies the window size and the dependency relation of the node to extract features from. For instance, if `window` is `-1` and the dependency relation is `lmd`, it returns `nodes[input-1].getLeftMostDependent()` if it exists; otherwise, it returns `null`. 90 | 91 | ```java 92 | @Override 93 | public NLPNode getNode(FeatureItem item) 94 | { 95 | NLPNode node = getNode(input, item.window); 96 | return getRelativeNode(item, node); 97 | } 98 | ``` 99 | 100 | Finally, we override `evaluate`, which takes an [`Eval`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/eval/Eval.java) and evaluates the predictions made for this tree. Here, we use the built-in evaluator [`AccuracyEval`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/eval/AccuracyEval.java), which measures accuracy by counting the correct predictions. 101 | 102 | ```java 103 | @Override 104 | public void evaluate(Eval eval) 105 | { 106 | int correct = 0, total = 0; 107 | 108 | for (int i=1; i<nodes.length; i++) 109 | { 110 | String o = oracle[i]; 111 | 112 | if (o != null) 113 | { 114 | if (o.equals(nodes[i].getFeat(FEAT_KEY))) correct++; 115 | total++; 116 | } 117 | } 118 | 119 | ((AccuracyEval)eval).add(correct, total); 120 | } 121 | ``` 122 | 123 | ## Classifier 124 | 125 | Create a class [`PleonasticClassifier`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/pleonastic/PleonasticClassifier.java) extending `OnlineComponent` with `PleonasticState`. 126 | Define the default constructor as well as a constructor that takes an `InputStream` of the configuration file. 127 | 128 | ```java 129 | public class PleonasticClassifier extends OnlineComponent<PleonasticState> 130 | { 131 | private static final long serialVersionUID = 3585863417135590906L; 132 | 133 | public PleonasticClassifier() {} 134 | 135 | public PleonasticClassifier(InputStream configuration) 136 | { 137 | super(configuration); 138 | } 139 | } 140 | ``` 141 | 142 | Override `initState` using [`PleonasticState`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/pleonastic/PleonasticState.java). 143 | 144 | ```java 145 | @Override 146 | protected PleonasticState initState(NLPNode[] nodes) 147 | { 148 | return new PleonasticState(nodes); 149 | } 150 | ``` 151 | 152 | Override `createEvaluator` using [`AccuracyEval`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/eval/AccuracyEval.java). 153 | 154 | ```java 155 | @Override 156 | public Eval createEvaluator() 157 | { 158 | return new AccuracyEval(); 159 | } 160 | ``` 161 | 162 | Override `postProcess` with an empty definition. 163 | 164 | ```java 165 | @Override 166 | protected void postProcess(PleonasticState state) {} 167 | ``` 168 | 169 | ## NLPMode 170 | 171 | Add the mode `pleonastic` to [`NLPMode`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/NLPMode.java). 172 | 173 | ```java 174 | public enum NLPMode 175 | { 176 | pos, // part-of-speech tagging 177 | ner, // named entity recognition 178 | dep, // dependency parsing 179 | srl, // semantic role labeling 180 | sentiment, // sentiment analysis 181 | pleonastic; // pleonastic-it classification 182 | } 183 | ``` 184 | 185 | ## Trainer 186 | 187 | Add the `pleonastic` case to the `createComponent` method in [`OnlineTrainer`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/train/OnlineTrainer.java), as shown below.
188 | 189 | ```java 190 | protected OnlineComponent createComponent(NLPMode mode, InputStream config) 191 | { 192 | switch (mode) 193 | { 194 | case pos: return (OnlineComponent)new POSTagger(config); 195 | case ner: return (OnlineComponent)new NERTagger(config); 196 | case dep: return (OnlineComponent)new DEPParser(config); 197 | case srl: return (OnlineComponent)new SRLParser(config); 198 | case sentiment : return (OnlineComponent)new SentimentAnalyzer(config); 199 | case pleonastic: return (OnlineComponent)new PleonasticClassifier(config); 200 | default : throw new IllegalArgumentException("Unsupported mode: "+mode); 201 | } 202 | } 203 | ``` -------------------------------------------------------------------------------- /md/quickstart/train.md: -------------------------------------------------------------------------------- 1 | # Train 2 | 3 | ## Command 4 | 5 | The following command trains an NLP component: 6 | 7 | ``` 8 | java edu.emory.mathcs.nlp.bin.NLPTrain -mode <mode> -c <configuration file> -t <training path> -d <development path> [-m <model file> -p <previous model file> -te <training file extension> -de <development file extension> -cv <cross-validation folds>] 9 | 10 | -c : configuration file (required) 11 | -m : output model file (optional) 12 | -p : previously trained model file (optional) 13 | -t : training path (required) 14 | -d : development path (optional) 15 | -te : training file extension (default: *) 16 | -de : development file extension (default: *) 17 | -cv : # of cross-validation folds (default: 0) 18 | -mode : component mode (required: pos|ner|dep|srl|sent) 19 | ``` 20 | 21 | * `-c` specifies the configuration file (see [configuration](#configuration)). 22 | * `-m` specifies the output model file (saved in the [xz](http://tukaani.org) format). The model is not saved unless this option is set. 23 | * `-p` specifies a previously trained model file. If this option is set, a new model is trained on top of the previous model. 24 | * `-t|d` specifies the training or development path, pointing to either a file or a directory. When the path points to a file, only that file is used for training. When the path points to a directory, all files under that directory whose extension matches `-te|de` are used. It is possible to train a model without a development set by leaving out the `-d` option (see the example below). 25 | * `-te|de` specifies the training or development file extension. The default value `*` matches files with any extension. This option is used only when the training or development path `-t|d` points to a directory. 26 | * `-cv` specifies the number of cross-validation folds. If this number is greater than `1`, cross-validation is performed on the training data. 27 | * `-mode` specifies the NLP component to be trained (see [`NLPMode`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/NLPMode.java)). 28 | 29 | ## Example 30 | 31 | The following command takes [`sample-trn.tsv`](../../src/test/resources/dat/sample-trn.tsv) and [`sample-dev.tsv`](../../src/test/resources/dat/sample-dev.tsv), trains a dependency parsing model using [`config-train-sample.xml`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-sample.xml), and saves the best model to `sample-dep.xz`.
32 | 33 | ``` 34 | $ java -Xmx1g -XX:+UseConcMarkSweepGC edu.emory.mathcs.nlp.bin.NLPTrain -mode dep -c config-train-sample.xml -t sample-trn.tsv -d sample-dev.tsv -m sample-dep.xz 35 | 36 | AdaGrad Mini-batch 37 | - Max epoch: 5 38 | - Mini-batch: 1 39 | - Learning rate: 0.02 40 | - LOLS: fixed = 0, decaying rate = 0.95 41 | - RDA: 1.0E-5 42 | Training: 0 43 | 0: 1: LAS = 22.22, UAS = 26.98, L = 34, SF = 1300, NZW = 1867, N/S = 15750 44 | 0: 2: LAS = 34.92, UAS = 39.68, L = 34, SF = 1410, NZW = 4578, N/S = 18000 45 | 0: 3: LAS = 38.89, UAS = 44.44, L = 34, SF = 1454, NZW = 6191, N/S = 21000 46 | 0: 4: LAS = 37.30, UAS = 41.27, L = 34, SF = 1550, NZW = 7751, N/S = 42000 47 | 0: 5: LAS = 37.30, UAS = 41.27, L = 34, SF = 1583, NZW = 8997, N/S = 63000 48 | 0: Best: 38.89, epoch = 3 49 | Saving the model 50 | ``` 51 | 52 | * Use the [`-XX:+UseConcMarkSweepGC`](http://www.oracle.com/technetwork/java/tuning-139912.html) option for the JVM, which roughly halves the memory usage. 53 | * Use [`log4j.properties`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/log4j.properties) for the [log4j](http://logging.apache.org/log4j/) configuration. 54 | * Once the training is done, `sample-dep.xz` should be created, which can then be specified in the configuration file for dependency parsing (see [how to decode](decode.md)). 55 | * `L`: number of labels. 56 | * `SF`: number of sparse features. 57 | * `NZW`: number of non-zero weights. 58 | * `N/S`: number of nodes processed per second. 59 | 60 | ## Configuration 61 | 62 | Sample configuration files for training can be found here: [`config-train-*`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/). 63 | 64 | ```xml 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | en-ambiguity-classes-simplified-lowercase.xz 79 | en-brown-clusters-simplified-lowercase.xz 80 | en-word-embeddings-undigitalized.xz 81 | en-named-entity-gazetteers-simplified.xz 82 | 83 | 84 | 85 | adagrad-mini-batch 86 | 0.00001 87 | 0.02 88 | 2 89 | 90 | 40 91 | 5 92 | 0 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | ``` 104 | 105 | * `<tsv>` specifies the configuration for [`TSVReader`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/TSVReader.java). `index` specifies the index of the field, starting at 0. `field` specifies the name of the field (e.g., [`sample-trn.tsv`](../../src/test/resources/dat/sample-trn.tsv)): 106 | * `form`: word form. 107 | * `lemma`: lemma. 108 | * `pos`: part-of-speech tag. 109 | * `feats`: extra features. 110 | * `dhead`: dependency head ID. 111 | * `deprel`: dependency label. 112 | * `sheads`: semantic heads. 113 | * `nament`: named entity tag. 114 | 115 | * `<lexica>` specifies the lexica used globally across multiple components (e.g., [English lexica](../supplements/english-lexica-models.md#lexica)). `field` specifies the type of word forms used to generate these lexica (see [`NLPNode::getValue`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/NLPNode.java#L205)). 116 | * `ambiguity_classes`: ambiguity classes for part-of-speech tagging. 117 | * `word_clusters`: word clusters (e.g., Brown clusters). 118 | * `word_embeddings`: word embeddings (e.g., [word2vec](http://word2vec.googlecode.com)). 119 | * `named_entity_gazetteers`: gazetteers for named entity recognition. 120 | 121 | * `<optimizer>` specifies the optimizer used to train the statistical model.
122 | * `algorithm`: perceptron, softmax, adagrad, adagrad-mini-batch, adadelta-mini-batch, adagrad-regression. 123 | * `l1_regularization`: the [RDA](http://www.jmlr.org/papers/volume11/xiao10a/xiao10a.pdf) regularization parameter used for `adagrad-*`. 124 | * `learning_rate`: the learning rate. 125 | * `feature_cutoff`: features whose frequency is less than or equal to this cutoff are discarded from training. 126 | * `lols`: [locally optimal learning to search](http://jmlr.org/proceedings/papers/v37/changb15.pdf).
- `fixed`: use only gold labels for the specified number of epochs.
- `decaying`: decay the use of gold labels by the specified rate after every epoch. 127 | * `max_epochs`: the maximum number of epochs to be used for training. 128 | * `batch_size`: the number of sentences per mini-batch for the `*-mini-batch` algorithms. 129 | * `bias`: the bias value. 130 | 131 | * `<feature_template>` specifies the features used during training. 132 | 133 | ```xml 134 | 135 | ``` 136 | 137 | * `f#`: `#` must start with 0. When multiple features are joined, they must be numbered in consecutive order. 138 | * `source`: see [`Source.java`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/feature/Source.java). 139 | * `window`: the context window with respect to the source. 140 | * `relation`: see [`Relation.java`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/feature/Relation.java). 141 | * `field`: see [`Field.java`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/feature/Field.java). 142 | * `value`: specifies the extra value of the field. -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/decode/AbstractNLPDecoder.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.decode; 17 | 18 | import java.io.BufferedReader; 19 | import java.io.ByteArrayInputStream; 20 | import java.io.ByteArrayOutputStream; 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | import java.io.OutputStream; 24 | import java.io.PrintStream; 25 | import java.lang.reflect.Array; 26 | import java.util.ArrayList; 27 | import java.util.List; 28 | import java.util.concurrent.ExecutorService; 29 | import java.util.concurrent.Executors; 30 | import java.util.function.Function; 31 | 32 | import edu.emory.mathcs.nlp.common.constant.StringConst; 33 | import edu.emory.mathcs.nlp.common.util.BinUtils; 34 | import edu.emory.mathcs.nlp.common.util.FileUtils; 35 | import edu.emory.mathcs.nlp.common.util.IOUtils; 36 | import edu.emory.mathcs.nlp.common.util.Joiner; 37 | import edu.emory.mathcs.nlp.common.util.Language; 38 | import edu.emory.mathcs.nlp.component.morph.MorphologicalAnalyzer; 39 | import edu.emory.mathcs.nlp.component.template.NLPComponent; 40 | import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexica; 41 | import edu.emory.mathcs.nlp.component.template.node.AbstractNLPNode; 42 | import edu.emory.mathcs.nlp.component.template.reader.TSVReader; 43 | import edu.emory.mathcs.nlp.tokenization.Token; 44 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 45 | 46 | /** 47 | * @author Jinho D.
Choi ({@code jinho.choi@emory.edu}) 48 | */ 49 | public abstract class AbstractNLPDecoder> 50 | { 51 | static final public String FORMAT_RAW = "raw"; 52 | static final public String FORMAT_LINE = "line"; 53 | static final public String FORMAT_TSV = "tsv"; 54 | 55 | volatile private List> components; 56 | volatile private Tokenizer tokenizer; 57 | private DecodeConfig decode_config; 58 | 59 | // ======================================== CONSTRUCTORS ======================================== 60 | 61 | public AbstractNLPDecoder() {} 62 | 63 | public AbstractNLPDecoder(DecodeConfig config) 64 | { 65 | init(config); 66 | } 67 | 68 | public AbstractNLPDecoder(InputStream configuration) 69 | { 70 | init(new DecodeConfig(configuration)); 71 | } 72 | 73 | public void init(DecodeConfig config) 74 | { 75 | List> components = new ArrayList<>(); 76 | Language language = config.getLanguage(); 77 | decode_config = config; 78 | 79 | components.add(new GlobalLexica<>(decode_config.getDocumentElement())); 80 | 81 | BinUtils.LOG.info("Loading tokenizer\n"); 82 | setTokenizer(NLPUtils.createTokenizer(language)); 83 | 84 | if (decode_config.getPartOfSpeechTagging() != null) 85 | { 86 | BinUtils.LOG.info("Loading part-of-speech tagger\n"); 87 | components.add(NLPUtils.getComponent(IOUtils.getInputStream(decode_config.getPartOfSpeechTagging()))); 88 | 89 | BinUtils.LOG.info("Loading morphological analyzer\n"); 90 | components.add(new MorphologicalAnalyzer<>(language)); 91 | } 92 | 93 | if (decode_config.getNamedEntityRecognition() != null) 94 | { 95 | BinUtils.LOG.info("Loading named entity recognizer\n"); 96 | components.add(NLPUtils.getComponent(IOUtils.getInputStream(decode_config.getNamedEntityRecognition()))); 97 | } 98 | 99 | if (decode_config.getDependencyParsing() != null) 100 | { 101 | BinUtils.LOG.info("Loading dependency parser\n"); 102 | components.add(NLPUtils.getComponent(IOUtils.getInputStream(decode_config.getDependencyParsing()))); 103 | } 104 | 105 | // if (decode_config.getSemanticRoleLabeling() != null) 106 | // { 107 | // BinUtils.LOG.info("Loading semantic role labeler\n"); 108 | // add(compoinent, , ); 109 | // components.add(NLPUtils.getComponent(IOUtils.getInputStream(decode_config.getSemanticRoleLabeling()))); 110 | // } 111 | 112 | setComponents(components); 113 | BinUtils.LOG.info("\n"); 114 | } 115 | 116 | // ======================================== GETTERS/SETTERS ======================================== 117 | 118 | public Tokenizer getTokenizer() 119 | { 120 | return tokenizer; 121 | } 122 | 123 | public List> getComponents() 124 | { 125 | return components; 126 | } 127 | 128 | public void setTokenizer(Tokenizer tokenizer) 129 | { 130 | this.tokenizer = tokenizer; 131 | } 132 | 133 | public void setComponents(List> components) 134 | { 135 | this.components = components; 136 | } 137 | 138 | // ======================================== DECODE ======================================== 139 | 140 | public void decode(List inputFiles, String outputExt, String format, int threads) 141 | { 142 | ExecutorService executor = Executors.newFixedThreadPool(threads); 143 | String outputFile; 144 | 145 | for (String inputFile : inputFiles) 146 | { 147 | outputFile = inputFile + StringConst.PERIOD + outputExt; 148 | executor.submit(new NLPTask(inputFile, outputFile, format)); 149 | } 150 | 151 | executor.shutdown(); 152 | } 153 | 154 | public String decode(String s, String format) 155 | { 156 | return new String(decodeByteArray(s, format)); 157 | } 158 | 159 | public byte[] decodeByteArray(String s, 
String format) 160 | { 161 | InputStream bin = new ByteArrayInputStream(s.getBytes()); 162 | ByteArrayOutputStream bout = new ByteArrayOutputStream(); 163 | 164 | decode(bin, bout, format); 165 | 166 | try 167 | { 168 | bin .close(); 169 | bout.close(); 170 | } 171 | catch (IOException e) {e.printStackTrace();} 172 | 173 | return bout.toByteArray(); 174 | } 175 | 176 | public void decode(InputStream in, OutputStream out, String format) 177 | { 178 | try 179 | { 180 | switch (format) 181 | { 182 | case FORMAT_RAW : decodeRaw (in, out); break; 183 | case FORMAT_LINE: decodeLine(in, out); break; 184 | case FORMAT_TSV : decodeTSV (createTSVReader(), in, out); break; 185 | } 186 | } 187 | catch (Exception e) {e.printStackTrace();} 188 | } 189 | 190 | public List decodeDocument(String s) throws IOException 191 | { 192 | return decodeDocument(new ByteArrayInputStream(s.getBytes())); 193 | } 194 | 195 | public List decodeDocument(InputStream in) throws IOException 196 | { 197 | List document = new ArrayList<>(); 198 | N[] nodes; 199 | 200 | for (List tokens : tokenizer.segmentize(in)) 201 | { 202 | nodes = toNodeArray(tokens); 203 | decode(nodes); 204 | document.add(nodes); 205 | } 206 | 207 | in.close(); 208 | return document; 209 | } 210 | 211 | public void decodeRaw(String s, OutputStream out) throws IOException 212 | { 213 | decodeRaw(new ByteArrayInputStream(s.getBytes()), out); 214 | } 215 | 216 | public void decodeRaw(InputStream in, OutputStream out) throws IOException 217 | { 218 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 219 | N[] nodes; 220 | 221 | for (List tokens : tokenizer.segmentize(in)) 222 | { 223 | nodes = toNodeArray(tokens); 224 | decode(nodes); 225 | fout.println(toString(nodes)+"\n"); 226 | } 227 | 228 | in.close(); 229 | fout.close(); 230 | } 231 | 232 | public void decodeLine(InputStream in, OutputStream out) throws IOException 233 | { 234 | BufferedReader reader = IOUtils.createBufferedReader(in); 235 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 236 | N[] nodes; 237 | String line; 238 | 239 | while ((line = reader.readLine()) != null) 240 | { 241 | nodes = decode(line); 242 | fout.println(toString(nodes)+"\n"); 243 | } 244 | 245 | reader.close(); 246 | fout.close(); 247 | } 248 | 249 | public void decodeTSV(TSVReader reader, InputStream in, OutputStream out) throws IOException 250 | { 251 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 252 | N[] nodes; 253 | 254 | reader.open(in); 255 | 256 | while ((nodes = reader.next()) != null) 257 | { 258 | decode(nodes); 259 | fout.println(toString(nodes)+"\n"); 260 | } 261 | 262 | reader.close(); 263 | fout.close(); 264 | } 265 | 266 | public N[] decode(String sentence) 267 | { 268 | List tokens = tokenizer.tokenize(sentence); 269 | return decode(toNodeArray(tokens)); 270 | } 271 | 272 | public N[] decode(N[] nodes) 273 | { 274 | for (NLPComponent component : components) 275 | component.process(nodes); 276 | 277 | return nodes; 278 | } 279 | 280 | public N[] toNodeArray(List tokens) 281 | { 282 | return toNodeArray(tokens, t -> create(t)); 283 | } 284 | 285 | @SuppressWarnings("unchecked") 286 | public N[] toNodeArray(List tokens, Function f) 287 | { 288 | N node = create(); node.toRoot(); 289 | N[] nodes = (N[])Array.newInstance(node.getClass(), tokens.size() + 1); 290 | nodes[0] = node; // root 291 | 292 | for (int i=0,j=1; i createTSVReader() 313 | { 314 | return new TSVReader(decode_config.getReaderFieldMap()) 315 | { 316 | @Override 317 | protected N create() {return 
AbstractNLPDecoder.this.create();} 318 | }; 319 | } 320 | 321 | public String toString(N[] nodes) 322 | { 323 | return Joiner.join(nodes, "\n", 1); 324 | } 325 | 326 | class NLPTask implements Runnable 327 | { 328 | private String input_file; 329 | private String output_file; 330 | private String format; 331 | 332 | public NLPTask(String inputFile, String outputFile, String format) 333 | { 334 | this.input_file = inputFile; 335 | this.output_file = outputFile; 336 | this.format = format; 337 | } 338 | 339 | @Override 340 | public void run() 341 | { 342 | BinUtils.LOG.info(FileUtils.getBaseName(input_file)+"\n"); 343 | InputStream in = IOUtils.createFileInputStream (input_file); 344 | OutputStream out = IOUtils.createFileOutputStream(output_file); 345 | decode(in, out, format); 346 | } 347 | } 348 | } 349 | --------------------------------------------------------------------------------
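The decoding API defined in `AbstractNLPDecoder` can be exercised with only a few calls. The following is a minimal usage sketch, not taken from this repository: it assumes that `NLPDecoder` (in `edu.emory.mathcs.nlp.decode`) extends `AbstractNLPDecoder` and inherits its `InputStream` constructor and `decode(String, String)` method unchanged, and that a decode configuration such as `config-decode-en.xml` is available in the working directory. The wrapper class `DecodeSketch` is hypothetical and exists only for illustration.

```java
import java.io.InputStream;

import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.decode.NLPDecoder;

public class DecodeSketch
{
    static public void main(String[] args)
    {
        // Load a decode configuration; the filename here is an assumption for illustration.
        InputStream config = IOUtils.createFileInputStream("config-decode-en.xml");

        // The constructor loads the tokenizer and every component enabled in the configuration.
        NLPDecoder decoder = new NLPDecoder(config);

        // FORMAT_RAW treats the input as raw text: it is segmented, tokenized, and run through
        // all configured components, and the annotated nodes are returned one per line.
        String output = decoder.decode("The NLP4J project provides an NLP toolkit for JVM languages.", NLPDecoder.FORMAT_RAW);
        System.out.println(output);
    }
}
```

The same decoder instance can be reused across inputs; `decode(InputStream, OutputStream, String)` and `decodeDocument(String)` shown above cover file-level and in-memory processing, respectively.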