├── .gitignore ├── md ├── components │ ├── sentiment_analysis.md │ ├── coreference_resolution.md │ ├── semantic_role_labeling.md │ ├── named_entity_recognition.md │ ├── part_of_speech_tagging.md │ └── dependency_parsing.md ├── quickstart │ ├── milestones.md │ ├── install.md │ ├── decode.md │ ├── release.md │ └── train.md ├── tutorial │ ├── depstate.md │ ├── tensorflow.md │ ├── maven.md │ ├── processing_state.md │ ├── nlp_component.md │ ├── create_nlp_component.md │ └── tree_based_nlp_component.md └── supplements │ ├── data-format.md │ └── english-lexica-models.md ├── .settings ├── org.eclipse.wst.validation.prefs ├── org.eclipse.m2e.core.prefs ├── org.eclipse.core.resources.prefs ├── org.eclipse.wst.common.project.facet.core.xml ├── org.eclipse.wst.common.component └── org.eclipse.jdt.core.prefs ├── src ├── test │ ├── resources │ │ ├── dat │ │ │ ├── nlp4j.txt │ │ │ ├── nlp4j.txt.nlp │ │ │ ├── sample-dev.tsv │ │ │ └── sample-trn.tsv │ │ ├── log4j.properties │ │ └── decoder-test-config.xml │ └── java │ │ └── edu │ │ └── emory │ │ └── mathcs │ │ └── nlp │ │ └── decode │ │ └── AbstractNLPDecoderTest.java └── main │ ├── resources │ └── edu │ │ └── emory │ │ └── mathcs │ │ └── nlp │ │ └── configuration │ │ ├── log4j.properties │ │ ├── config-decode-pos.xml │ │ ├── config-decode-en.xml │ │ ├── config-train-sample.xml │ │ ├── config-train-sample-optimized.xml │ │ ├── config-train-doc.xml │ │ ├── config-train-ner.xml │ │ ├── config-train-pos.xml │ │ └── config-train-dep.xml │ └── java │ └── edu │ └── emory │ └── mathcs │ └── nlp │ ├── bin │ ├── Version.java │ ├── NLPDemo.java │ ├── NLPDecode.java │ └── DEPEvaluate.java │ ├── decode │ ├── NLPDecoder.java │ ├── NLPUtils.java │ ├── DecodeConfig.java │ └── AbstractNLPDecoder.java │ ├── component │ └── morph │ │ └── MorphologicalAnalyzer.java │ ├── zzz │ ├── RadiologyDecode.java │ ├── TokenizeIt.java │ ├── CSVRadiology.java │ ├── CSVSentiment.java │ └── NEGazetteerCreate.java │ └── network │ ├── NLPSocketClient.java │ └── NLPSocketServer.java ├── LICENSE.txt ├── README.md └── pom.xml /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | target 4 | -------------------------------------------------------------------------------- /md/components/sentiment_analysis.md: -------------------------------------------------------------------------------- 1 | # Sentiment Analysis 2 | -------------------------------------------------------------------------------- /md/components/coreference_resolution.md: -------------------------------------------------------------------------------- 1 | # Coreference Resolution 2 | -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.validation.prefs: -------------------------------------------------------------------------------- 1 | disabled=06target 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /src/test/resources/dat/nlp4j.txt: -------------------------------------------------------------------------------- 1 | The NLP4J project provides a NLP toolkit for JVM languages. 
This project is under the Apache 2 license and is currently developed by the NLP Research Group at Emory University. -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/main/resources=UTF-8 4 | encoding//src/test/java=UTF-8 5 | encoding//src/test/resources=UTF-8 6 | encoding/=UTF-8 7 | -------------------------------------------------------------------------------- /md/quickstart/milestones.md: -------------------------------------------------------------------------------- 1 | # Milestones 2 | 3 | ## Version 1.1.0 (03/31/2016) 4 | 5 | * Word2Vec and Struct2Vec. 6 | * Semantic role labeling. 7 | * Sentiment analysis. 8 | 9 | ## Version 1.2.0 (04/30/2016) 10 | 11 | * Coreference resolution. 12 | -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.common.project.facet.core.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.common.component: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=INFO, A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.conversionPattern=%m 10 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=INFO, A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 
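# The conversion pattern below emits only the raw message (%m), with no timestamp, level, or trailing newline.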
8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.conversionPattern=%m 10 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 4 | org.eclipse.jdt.core.compiler.compliance=1.8 5 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 6 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 7 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 8 | org.eclipse.jdt.core.compiler.source=1.8 9 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2015, Emory University 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-pos.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | edu/emory/mathcs/nlp/lexica/en-ambiguity-classes-simplified-lowercase.xz 8 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 9 | 10 | 11 | 12 | edu/emory/mathcs/nlp/models/en-pos.xz 13 | 14 | 15 | -------------------------------------------------------------------------------- /md/components/semantic_role_labeling.md: -------------------------------------------------------------------------------- 1 | # Semantic Role Labeling 2 | 3 | Our semantic role labeler uses a higher-order argument pruning algorithm that significantly improves recall from the first-order argument pruning algorithm, yet keeps a similar labeling complexity in practice. Our labeler takes about 0.45 milliseconds for labeling all arguments of each predicate on an Intel Xeon 2.57GHz machine and shows state-of-the-art accuracy compared to other dependency-based labeling approaches. 4 | 5 | * [Transition-based Semantic Role Labeling Using Predicate Argument Clustering](http://aclweb.org/anthology/W11-0906), Jinho D. Choi, Martha Palmer, In Proceedings of the ACL Workshop on Relational Models of Semantics (RELMS'11), 37–45, 2011. 
6 | -------------------------------------------------------------------------------- /src/test/resources/decoder-test-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | edu/emory/mathcs/nlp/lexica/en-ambiguity-classes-simplified-lowercase.xz 8 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 9 | edu/emory/mathcs/nlp/lexica/en-word-embeddings-undigitalized.xz 10 | 11 | 12 | 13 | edu/emory/mathcs/nlp/models/en-pos.xz 14 | 15 | 16 | -------------------------------------------------------------------------------- /md/tutorial/depstate.md: -------------------------------------------------------------------------------- 1 | # DEPState 2 | 3 | [`DEPState`](../../java/edu/emory/mathcs/nlp/component/dep/DEPState.java) implements the parsing algorithm and holds the parsing states processed by the algorithm. 4 | 5 | ## Initialization 6 | 7 | ```java 8 | public DEPStateArcEager(N[] nodes) 9 | { 10 | super(nodes); 11 | 12 | stack = new IntArrayList(); 13 | input = 0; 14 | } 15 | ``` 16 | 17 | ## Save Oracle 18 | 19 | The oracle is saved as a list of [`DEPArc`](../../java/edu/emory/mathcs/nlp/component/dep/DEPArc.java). `oracle[0]` is preserved for the artificial root node and the rest holds the gold-standard head information for each node. 20 | 21 | 22 | ```java 23 | @Override 24 | public void saveOracle() 25 | { 26 | oracle = Arrays.stream(nodes).map(n -> n.clearDependencies()).toArray(DEPArc[]::new); 27 | } 28 | ``` 29 | 30 | # Arc-Eager 31 | 32 | implements the arc-eager algorithm ([Nivre 2008](http://www.mitpressjournals.org/doi/pdf/10.1162/coli.07-056-R1-07-027), Section 4.2), that is the most widely used projective parsing algorithm. -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | edu/emory/mathcs/nlp/lexica/en-ambiguity-classes-simplified-lowercase.xz 8 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 9 | edu/emory/mathcs/nlp/lexica/en-word-embeddings-undigitalized.xz 10 | edu/emory/mathcs/nlp/lexica/en-named-entity-gazetteers-simplified.xz 11 | 12 | 13 | 14 | edu/emory/mathcs/nlp/models/en-pos.xz 15 | edu/emory/mathcs/nlp/models/en-ner.xz 16 | edu/emory/mathcs/nlp/models/en-dep.xz 17 | 18 | 19 | -------------------------------------------------------------------------------- /md/tutorial/tensorflow.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | * Install [python 3.x](https://www.python.org). 
4 | * Install [pip](https://pip.pypa.io): 5 | 6 | ```bash 7 | # https://bootstrap.pypa.io/get-pip.py 8 | python3 get-pip.py 9 | ``` 10 | 11 | * Install [virtualenv](https://pypi.python.org/pypi/virtualenv): 12 | 13 | ```bash 14 | sudo pip3 install --upgrade virtualenv 15 | ``` 16 | 17 | 18 | * Create a virtual environment in the directory, `vnlp`: 19 | 20 | ```bash 21 | virtualenv --system-site-packages vnlp 22 | ``` 23 | 24 | * Activate the `vnlp` environment: 25 | 26 | ```bash 27 | source vnlp/bin/activate 28 | ``` 29 | 30 | * Install [tensorflow](https://www.tensorflow.org) under `vnlp`: 31 | 32 | ```bash 33 | # linux 34 | pip3 install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.8.0-cp27-none-linux_x86_64.whl 35 | # mac 36 | pip3 install --upgrade https://storage.googleapis.com/tensorflow/mac/tensorflow-0.8.0-py3-none-any.whl 37 | ``` 38 | 39 | * Install [scipy](pip3 install scipy) and [scikit-learn](http://scikit-learn.org) under `vnlp`: 40 | 41 | ``` 42 | pip3 install -U scipy 43 | pip3 install -U scikit-learn 44 | ``` -------------------------------------------------------------------------------- /src/test/resources/dat/nlp4j.txt.nlp: -------------------------------------------------------------------------------- 1 | 1 The the DT _ 3 det _ O 2 | 2 NLP4J nlp0j NNP pos2=NN 3 compound _ U-ORG 3 | 3 project project NN _ 4 nsubj _ O 4 | 4 provides provide VBZ _ 0 root _ O 5 | 5 a a DT _ 7 det _ O 6 | 6 NLP nlp NN pos2=NNP 7 compound _ O 7 | 7 toolkit toolkit NN _ 4 dobj _ O 8 | 8 for for IN _ 7 prep _ O 9 | 9 JVM jvm NN pos2=NNP 10 compound _ U-ORG 10 | 10 languages language NNS _ 8 pobj _ O 11 | 11 . . . _ 4 punct _ O 12 | 13 | 1 This this DT _ 2 det _ O 14 | 2 project project NN _ 3 nsubj _ O 15 | 3 is be VBZ _ 0 root _ O 16 | 4 under under IN _ 3 prep _ O 17 | 5 the the DT _ 8 det _ O 18 | 6 Apache apache NNP pos2=NN 8 nmod _ O 19 | 7 2 0 CD pos2=NNP 6 nmod _ O 20 | 8 license license NN pos2=NNS 4 pobj _ O 21 | 9 and and CC _ 3 cc _ O 22 | 10 is be VBZ _ 12 auxpass _ O 23 | 11 currently currently RB _ 12 advmod _ O 24 | 12 developed develop VBN _ 3 conj _ O 25 | 13 by by IN _ 12 agent _ O 26 | 14 the the DT _ 17 det _ B-ORG 27 | 15 NLP nlp NNP _ 17 compound _ I-ORG 28 | 16 Research research NNP _ 17 compound _ I-ORG 29 | 17 Group group NNP _ 13 pobj _ L-ORG 30 | 18 at at IN _ 17 prep _ O 31 | 19 Emory emory NNP _ 20 compound _ B-ORG 32 | 20 University university NNP _ 18 pobj _ L-ORG 33 | 21 . . . _ 3 punct _ O 34 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/bin/Version.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.bin; 17 | 18 | /** 19 | * @author Jinho D. 
Choi ({@code jinho.choi@emory.edu}) 20 | */ 21 | public class Version 22 | { 23 | public static void main(String[] args) 24 | { 25 | System.out.println("===================================="); 26 | System.out.println("NLP4J Version 1.1.2"); 27 | System.out.println("Webpage: http://nlp.mathcs.emory.edu"); 28 | System.out.println("Contact: choi@mathcs.emory.edu"); 29 | System.out.println("===================================="); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/decode/NLPDecoder.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.decode; 17 | 18 | import java.io.InputStream; 19 | 20 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 21 | 22 | /** 23 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 24 | */ 25 | public class NLPDecoder extends AbstractNLPDecoder 26 | { 27 | public NLPDecoder() {super();} 28 | 29 | public NLPDecoder(DecodeConfig config) 30 | { 31 | super(config); 32 | } 33 | 34 | public NLPDecoder(InputStream configuration) 35 | { 36 | super(new DecodeConfig(configuration)); 37 | } 38 | 39 | @Override 40 | public NLPNode create() 41 | { 42 | return new NLPNode(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /md/tutorial/maven.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Run a Java class using Maven 4 | 5 | * Specify the [JVM options](http://www.oracle.com/technetwork/articles/java/vmoptions-jsp-140102.html) in Maven. If you are using [Bash](https://www.gnu.org/software/bash/), export `MAVEN_OPTS`: 6 | 7 | ``` 8 | export MAVEN_OPTS='-Xmx8g -XX:+UseConcMarkSweepGC -XX:MaxPermSize=128m' 9 | ``` 10 | 11 | * Compile the Java project using Maven by running the following command from the top directory, where the [`pom.xml`](../../pom.xml) is located. The `target/classes` directory should be created after running this command if it does not already exist. 12 | 13 | ``` 14 | mvn compile 15 | ``` 16 | 17 | * Copy [`log4j.properties`](../../src/main/resources/configuration/log4j.properties) to `target/classes` if it is not already specified in your path. 18 | 19 | * Run an executable Java class using `mvn exec:java`. For instance, the following command executes [`POSTrain`](../../src/main/java/edu/emory/mathcs/nlp/bin/POSTrain.java) (see [part-of-speech tagging](../component/part_of_speech_tagging.md#training) for more details about the command). Note that the base filenames are used in this example, but use the filenames with their absolute paths if they are not getting recognized. 
20 | 21 | ``` 22 | mvn exec:java -Dexec.mainClass="edu.emory.mathcs.nlp.bin.POSTrain" -Dexec.args="-c config_train_pos.xml -t wsj_0001.dep -d wsj_0001.dep" 23 | ``` 24 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-sample.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 0.00001 13 | adagrad-mini-batch 14 | 0.02 15 | 0 16 | 17 | 1 18 | 5 19 | 0 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-sample-optimized.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 0.00001 13 | adagrad-mini-batch 14 | 0.02 15 | 0 16 | 17 | 1 18 | 3 19 | 0 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-doc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | /Users/jdchoi/Documents/EmoryNLP/nlp4j-english/src/main/resources/edu/emory/mathcs/nlp/lexica/en-word-embeddings-undigitalized.xz 14 | 15 | 16 | r3 17 | 18 | 19 | adagrad 20 | 0.01 21 | 0.001 22 | 0 23 | 20 24 | 0 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP4J 2 | 3 | The NLP4J project provides a NLP toolkit for JVM languages. This project is under the [Apache 2](http://www.apache.org/licenses/LICENSE-2.0) license and is currently developed by the [NLP Research Group](http://nlp.mathcs.emory.edu) at [Emory University](http://emory.edu). Please join our [forum](https://groups.google.com/forum/#!forum/emorynlp) to get notifications about new releases and give your feedback about this project. 4 | 5 | * Latest release: [1.1.2](http://search.maven.org/#search%7Cga%7C1%7Cnlp4j) (06/29/2016). 6 | * [Release notes](md/quickstart/release.md). 7 | * [Google groups](https://groups.google.com/forum/#!forum/emorynlp). 8 | 9 | ## Quick Start 10 | 11 | * [How to install](md/quickstart/install.md). 12 | * [How to train](md/quickstart/train.md). 13 | * [How to decode](md/quickstart/decode.md). 14 | * [NLP4J demo](http://nlp.mathcs.emory.edu:8080/nlp4j). 15 | 16 | ## Components 17 | 18 | * [Tokenization](https://github.com/emorynlp/tokenization). 19 | * [Morphological analysis](https://github.com/emorynlp/morphological_analysis). 20 | * [Part-of-speech tagging](md/components/part_of_speech_tagging.md). 21 | * [Named entity recognition](md/components/named_entity_recognition.md). 22 | * [Dependency parsing](md/components/dependency_parsing.md). 23 | * Semantic role labeling (coming soon). 24 | * Sentiment analysis (coming soon). 25 | * Word2Vec & Struct2Vec (coming soon). 26 | 27 | ## Supplements 28 | 29 | * [English lexica and models](md/supplements/english-lexica-models.md) (hosted in [bitbucket](https://bitbucket.org/emorynlp/nlp4j-english)). 30 | * [Data format](md/supplements/data-format.md). 
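## Quick Example

A minimal decoding sketch adapted from [`NLPDemo`](src/main/java/edu/emory/mathcs/nlp/bin/NLPDemo.java) in this repository; the class name `QuickExample` is illustrative, and the configuration path assumes the English lexica and models listed in `config-decode-en.xml`.

```java
import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.common.util.Joiner;
import edu.emory.mathcs.nlp.component.template.node.NLPNode;
import edu.emory.mathcs.nlp.decode.NLPDecoder;

public class QuickExample
{
    public static void main(String[] args)
    {
        // Decoding configuration listing the lexica and models to load.
        final String configFile = "src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml";

        // With config-decode-en.xml, the decoder tokenizes the input and runs
        // part-of-speech tagging, named entity recognition, and dependency parsing.
        NLPDecoder nlp4j = new NLPDecoder(IOUtils.createFileInputStream(configFile));

        // Decode one raw sentence; index 0 holds the artificial root node, so printing starts at 1.
        NLPNode[] nodes = nlp4j.decode("John bought a car for Mary.");
        System.out.println(Joiner.join(nodes, "\n", 1));
    }
}
```

The decoded output (see `src/test/resources/dat/nlp4j.txt.nlp`) follows the [data format](md/supplements/data-format.md) described under Supplements: one token per line with its lemma, part-of-speech tag, dependency head and label, semantic heads, and named entity tag.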
31 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/bin/NLPDemo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.bin; 17 | 18 | import edu.emory.mathcs.nlp.common.util.IOUtils; 19 | import edu.emory.mathcs.nlp.common.util.Joiner; 20 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 21 | import edu.emory.mathcs.nlp.decode.AbstractNLPDecoder; 22 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 23 | 24 | /** 25 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 26 | */ 27 | public class NLPDemo 28 | { 29 | static public void main(String[] args) throws Exception 30 | { 31 | final String configFile = "src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml"; 32 | final String inputFile = "src/test/resources/dat/nlp4j.txt"; 33 | 34 | NLPDecoder nlp4j = new NLPDecoder(IOUtils.createFileInputStream(configFile)); 35 | NLPNode[] nodes; 36 | 37 | String sentence = "John bought a car for Mary."; 38 | nodes = nlp4j.decode(sentence); 39 | System.out.println(Joiner.join(nodes, "\n", 1)+"\n"); 40 | nlp4j.decode(IOUtils.createFileInputStream(inputFile), System.out, AbstractNLPDecoder.FORMAT_RAW); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/component/morph/MorphologicalAnalyzer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.component.morph; 17 | 18 | import java.util.List; 19 | 20 | import edu.emory.mathcs.nlp.common.util.Language; 21 | import edu.emory.mathcs.nlp.component.morph.english.EnglishMorphAnalyzer; 22 | import edu.emory.mathcs.nlp.component.template.NLPComponent; 23 | import edu.emory.mathcs.nlp.component.template.node.AbstractNLPNode; 24 | 25 | /** 26 | * @author Jinho D. 
Choi ({@code jinho.choi@emory.edu}) 27 | */ 28 | public class MorphologicalAnalyzer> implements NLPComponent 29 | { 30 | private MorphAnalyzer analyzer; 31 | 32 | public MorphologicalAnalyzer(Language language) 33 | { 34 | analyzer = new EnglishMorphAnalyzer(); 35 | } 36 | 37 | @Override 38 | public void process(N[] nodes) 39 | { 40 | N node; 41 | 42 | for (int i=1; i document) 51 | { 52 | for (N[] nodes : document) 53 | process(nodes); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/zzz/RadiologyDecode.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.zzz; 17 | 18 | import edu.emory.mathcs.nlp.common.util.FileUtils; 19 | import edu.emory.mathcs.nlp.common.util.IOUtils; 20 | import edu.emory.mathcs.nlp.decode.AbstractNLPDecoder; 21 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 22 | 23 | /** 24 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 25 | */ 26 | public class RadiologyDecode 27 | { 28 | static public void main(String[] args) throws Exception 29 | { 30 | final String configFile = "/Users/jdchoi/Documents/EmoryNLP/nlp4j/src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-deident.xml"; 31 | final String inputDir = "/Users/jdchoi/Desktop/radiology/Q2"; 32 | final String inputExt = "txt"; 33 | final String outputExt = "tsv"; 34 | final String outputFormat = AbstractNLPDecoder.FORMAT_LINE; 35 | 36 | NLPDecoder nlp4j = new NLPDecoder(IOUtils.createFileInputStream(configFile)); 37 | 38 | for (String inputFile : FileUtils.getFileList(inputDir, inputExt)) 39 | { 40 | System.out.println(inputFile); 41 | String outputFile = inputFile+"."+outputExt; 42 | nlp4j.decode(IOUtils.createFileInputStream(inputFile), IOUtils.createFileOutputStream(outputFile), outputFormat); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/test/java/edu/emory/mathcs/nlp/decode/AbstractNLPDecoderTest.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** This data and information is proprietary to, and a valuable trade secret 3 | ** of, Basis Technology Corp. It is given in confidence by Basis Technology 4 | ** and may only be used as permitted under the license agreement under which 5 | ** it has been distributed, and in no other way. 6 | ** 7 | ** Copyright (c) 2015 Basis Technology Corporation All rights reserved. 8 | ** 9 | ** The technical data and information provided herein are provided with 10 | ** `limited rights', and the computer software provided herein is provided 11 | ** with `restricted rights' as those terms are defined in DAR and ASPR 12 | ** 7-104.9(a). 
13 | ******************************************************************************/ 14 | 15 | package edu.emory.mathcs.nlp.decode; 16 | 17 | import java.io.InputStream; 18 | import java.net.URL; 19 | 20 | import org.junit.Test; 21 | 22 | import com.google.common.io.Resources; 23 | 24 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 25 | import edu.emory.mathcs.nlp.component.template.reader.TSVReader; 26 | 27 | /** 28 | * 29 | */ 30 | public class AbstractNLPDecoderTest { 31 | 32 | @Test 33 | public void createTsv() throws Exception { 34 | URL configUrl = Resources.getResource("decoder-test-config.xml"); 35 | DecodeConfig config; 36 | try (InputStream configStream = Resources.asByteSource(configUrl).openStream()) { 37 | config = new DecodeConfig(configStream); 38 | } 39 | 40 | NLPDecoder decoder = new NLPDecoder(config); 41 | TSVReader reader = decoder.createTSVReader(); 42 | URL tsvUrl = Resources.getResource("dat/sample-dev.tsv"); 43 | try (InputStream tsvStream = Resources.asByteSource(tsvUrl).openStream()) { 44 | reader.open(tsvStream); 45 | reader.readDocument(); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /md/components/named_entity_recognition.md: -------------------------------------------------------------------------------- 1 | # Named Entity Recognition 2 | 3 | Our named entity recognizer uses both sparse and dense features extracted from named entity gazetteers, word clusters, and word embeddings. It processes over 47K tokens per second on an Intel Xeon 2.30GHz machine and shows the state-of-the-art accuracy (91.0% on the CoNLL'03 corpus). 4 | 5 | * [Dynamic Feature Induction: The Last Gist to the State-of-the-Art](http://naacl.org/naacl-hlt-2016/), Jinho D. Choi, Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (NAACL'16), San Diego, CA, 2016. 6 | * [Intrinsic and Extrinsic Evaluations of Word Embeddings](http://www.aaai.org/Conferences/AAAI/2016/aaai16accepted-papers.pdf), Michael Zhai, Johnny Tan, Jinho D. Choi, Proceedings of the AAAI 2015 Student Program, Phoenix, AZ, 2015. 7 | 8 | ## English Tags 9 | 10 | | Tag | Description | Version | 11 | | -------------- | ----------- | ------- | 12 | | `PERSON` | People, including fictional | 1.0.0 | | `NORP` | Nationalities or religious or political groups | 1.0.0 | | `FAC` | Buildings, airports, highways, bridges, etc. | 1.0.0 | | `ORG` | Companies, agencies, institutions, etc. | 1.0.0 | | `GPE` | Countries, cities, states | 1.0.0 | | `LOC` | Non-GPE locations, mountain ranges, bodies of water | 1.0.0 | | `PRODUCT` | Vehicles, weapons, foods, etc. (not services) | 1.0.0 | | `EVENT` | Named hurricanes, battles, wars, sports events, etc. | 1.0.0 | | `WORK OF ART` | Titles of books, songs, etc. 
| 1.0.0 | | `LAW` | Named documents made into laws | 1.0.0 | | `LANGUAGE` | Any named language | 1.0.0 | 13 | | `DATE` | Absolute or relative dates or periods | 1.0.0 | 14 | | `TIME` | Times smaller than a day | 1.0.0 | 15 | | `PERCENT` | Percentage (including "%") | 1.0.0 | 16 | | `MONEY` | Monetary values, including unit | 1.0.0 | 17 | | `QUANTITY` | Measurements, as of weight or distance | 1.0.0 | 18 | | `ORDINAL` | Ordinals (e.g., "first", "1st") | 1.0.0 | 19 | | `CARDINAL` | Numerals that do not fall under another type | 1.0.0 | 20 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/network/NLPSocketClient.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.network; 17 | 18 | import java.io.BufferedInputStream; 19 | import java.io.BufferedOutputStream; 20 | import java.io.DataInputStream; 21 | import java.io.DataOutputStream; 22 | import java.io.InputStream; 23 | import java.io.OutputStream; 24 | import java.net.Socket; 25 | 26 | /** 27 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 28 | */ 29 | public class NLPSocketClient 30 | { 31 | private final String SERVER_ADDRESS; 32 | private final int SERVER_PORT; 33 | 34 | public NLPSocketClient(String address, int port) 35 | { 36 | SERVER_ADDRESS = address; 37 | SERVER_PORT = port; 38 | } 39 | 40 | public String decode(String text, String format) 41 | { 42 | StringBuilder build = new StringBuilder(); 43 | 44 | try 45 | { 46 | String data = format+":"+text+NLPSocketServer.END; 47 | Socket socket = new Socket(SERVER_ADDRESS, SERVER_PORT); 48 | InputStream in = new DataInputStream(new BufferedInputStream(socket.getInputStream())); 49 | OutputStream out = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream())); 50 | 51 | out.write(data.getBytes()); 52 | out.flush(); 53 | 54 | byte[] buffer = new byte[2048]; 55 | int i; 56 | 57 | while ((i = in.read(buffer, 0, buffer.length)) >= 0) 58 | { 59 | build.append(new String(buffer, 0, i)); 60 | if (build.toString().endsWith(NLPSocketServer.END)) break; 61 | } 62 | 63 | socket.close(); 64 | } 65 | catch (Exception e) {e.printStackTrace();} 66 | 67 | return build.toString(); 68 | } 69 | 70 | static public void main(String[] args) 71 | { 72 | NLPSocketClient client = new NLPSocketClient("127.0.0.1", 8000); 73 | System.out.println(client.decode("UN peacekeepers abuse children", "raw")); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /md/supplements/data-format.md: -------------------------------------------------------------------------------- 1 | # Data Format 2 | 3 | ## Raw Format 4 | 5 | The `raw` format accepts texts in any format. 6 | 7 | ``` 8 | I'd like to meet Dr. Choi. He's a professor at Emory University. 
9 | ``` 10 | 11 | ## Sentence Format 12 | 13 | The `sen` format expects a sentence per line. 14 | 15 | ``` 16 | I'd like to meet Dr. Choi. 17 | He's a professor at Emory University. 18 | ``` 19 | 20 | ## Tab Separated Values Format 21 | 22 | The `tsv` format expects columns delimited by `\t` and sentences separated by `\n`. 23 | 24 | ```tsv 25 | 1 I I PRP _ 3 nsubj 3:A0;5:A0 O 26 | 2 'd would MD _ 3 aux 3:AM-MOD O 27 | 3 like like VB _ 0 root _ O 28 | 4 to to TO _ 5 aux _ O 29 | 5 meet meet VB _ 3 xcomp 3:A1 O 30 | 6 Dr. dr. NNP _ 7 compound _ O 31 | 7 Choi choi NNP _ 5 dobj 5:A1 U-PERSON 32 | 8 . . . _ 3 punct _ O 33 | 34 | 1 He he PRP _ 2 nsubj 2:A1 O 35 | 2 's 's VBZ _ 0 root _ O 36 | 3 a a DT _ 4 det _ O 37 | 4 professor professor NN _ 2 attr 2:A2 O 38 | 5 at at IN _ 4 prep _ O 39 | 6 Emory emory NNP _ 7 compound _ B-ORG 40 | 7 University university NNP _ 5 pobj _ L-ORG 41 | 8 . . . _ 2 punct _ O 42 | ``` 43 | 44 | The column fields are specified in the [configuration files](../../src/main/resources/configuration/) as follows: 45 | 46 | ```xml 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | ``` 60 | 61 | * `form`: word form. 62 | * `lemma`: lemma. 63 | * `pos`: part-of-speech tag. 64 | * `feats`: extra features; features are delimited by `|`, and keys and values are delimited by `=` (e.g., `k1=v1|k2=v2`). 65 | * `dhead`: dependency head token ID. 66 | * `deprel`: dependency label. 67 | * `sheads`: semantic heads; head IDs and labels are delimited by `:`. 68 | * `nament`: named entity tags in the BILOU notaiton. -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/decode/NLPUtils.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.decode; 17 | 18 | import java.io.InputStream; 19 | import java.io.ObjectInputStream; 20 | 21 | import edu.emory.mathcs.nlp.common.util.IOUtils; 22 | import edu.emory.mathcs.nlp.common.util.Joiner; 23 | import edu.emory.mathcs.nlp.common.util.Language; 24 | import edu.emory.mathcs.nlp.component.template.NLPComponent; 25 | import edu.emory.mathcs.nlp.component.template.OnlineComponent; 26 | import edu.emory.mathcs.nlp.component.template.feature.Field; 27 | import edu.emory.mathcs.nlp.component.template.node.AbstractNLPNode; 28 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 29 | import edu.emory.mathcs.nlp.component.template.state.NLPState; 30 | import edu.emory.mathcs.nlp.component.template.util.NLPFlag; 31 | import edu.emory.mathcs.nlp.tokenization.EnglishTokenizer; 32 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 33 | 34 | /** 35 | * @author Jinho D. 
Choi ({@code jinho.choi@emory.edu}) 36 | */ 37 | public class NLPUtils 38 | { 39 | static public String FEAT_POS_2ND = "pos2"; 40 | static public String FEAT_PREDICATE = "pred"; 41 | 42 | static public String toStringLine(NLPNode[] nodes, String delim, Field field) 43 | { 44 | return Joiner.join(nodes, delim, 1, nodes.length, n -> n.getValue(field)); 45 | } 46 | 47 | static public Tokenizer createTokenizer(Language language) 48 | { 49 | return new EnglishTokenizer(); 50 | } 51 | 52 | @SuppressWarnings("unchecked") 53 | static public ,S extends NLPState>NLPComponent getComponent(InputStream in) 54 | { 55 | ObjectInputStream oin = IOUtils.createObjectXZBufferedInputStream(in); 56 | OnlineComponent component = null; 57 | 58 | try 59 | { 60 | component = (OnlineComponent)oin.readObject(); 61 | component.setFlag(NLPFlag.DECODE); 62 | oin.close(); 63 | } 64 | catch (Exception e) {e.printStackTrace();} 65 | 66 | return component; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /md/tutorial/processing_state.md: -------------------------------------------------------------------------------- 1 | # Processing State 2 | 3 | ## NLPState 4 | 5 | A state object is created for every input (e.g., a sentence), providing information about the current state of the component (e.g., a pointer to the input token). Creating a separate state object for each input enables the component to be thread-safe. All state objects extend [`NLPState`](../../java/edu/emory/mathcs/nlp/component/util/state/NLPState.java), which takes two genetic types `N` and `L` representing the types of the input node (e.g., [`NLPNode`](../../java/edu/emory/mathcs/nlp/component/util/NLPNode.java)) and the label (e.g., `String`), respectively. 6 | 7 | ```java 8 | public abstract class NLPState 9 | ``` 10 | 11 | This class contains several abstract methods: 12 | 13 | ```java 14 | /** Clears and saves the gold-standard labels in the input nodes if available. */ 15 | public abstract void clearGoldLabels(); 16 | 17 | /** Moves onto the next state */ 18 | public abstract void next(); 19 | 20 | /** @return true if no more state can be processed; otherwise, false. */ 21 | public abstract boolean isTerminate(); 22 | 23 | /** @return the gold standard label for the current state. */ 24 | public abstract L getGoldLabel(); 25 | 26 | /** Assigns the specific label to the current state. */ 27 | public abstract void setLabel(L label); 28 | 29 | /** Evaluates all predictions given the current input. */ 30 | public abstract void evaluate(Eval eval); 31 | ``` 32 | 33 | See [NLP component](nlp_component.md) for more details about how these abstract methods are used. 34 | 35 | ## L2RState 36 | 37 | [`N2RState`](../../java/edu/emory/mathcs/nlp/component/util/state/N2RState.java) defines the left-to-right tagging strategy commonly used in NLP (e.g., part-of-speech tagging, named entity recognition). It extends [`NLPState`](#nlpstate) and takes a genetic type `N` representing the type of the input node (e.g., [`POSNode`](../../java/edu/emory/mathcs/nlp/component/pos/POSNode.java)). 38 | 39 | ```java 40 | public abstract class L2RState extends NLPState 41 | ``` 42 | 43 | This state keeps track of the pointer to the processing node, starting at `0`. It then moves onto the next state by incrementing the pointer to the next node. Finally, it terminates if there is no more node to process. 
44 | 45 | ```java 46 | protected int index = 0; 47 | 48 | @Override 49 | public void next() 50 | { 51 | index++; 52 | } 53 | 54 | @Override 55 | public boolean isTerminate() 56 | { 57 | return index >= nodes.length; 58 | } 59 | ``` 60 | See [`POSState`](../../java/edu/emory/mathcs/nlp/component/pos/POSState.java) for the example of a subclass inheriting this class. -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/bin/NLPDecode.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.bin; 17 | 18 | import java.util.Collections; 19 | import java.util.List; 20 | 21 | import org.kohsuke.args4j.Option; 22 | 23 | import edu.emory.mathcs.nlp.common.util.BinUtils; 24 | import edu.emory.mathcs.nlp.common.util.FileUtils; 25 | import edu.emory.mathcs.nlp.common.util.IOUtils; 26 | import edu.emory.mathcs.nlp.decode.AbstractNLPDecoder; 27 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 28 | 29 | /** 30 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 31 | */ 32 | public class NLPDecode 33 | { 34 | @Option(name="-c", usage="confinguration filename (required)", required=true, metaVar="") 35 | public String configuration_file; 36 | @Option(name="-i", usage="input path (required)", required=true, metaVar="") 37 | public String input_path; 38 | @Option(name="-ie", usage="input file extension (default: *)", required=false, metaVar="") 39 | public String input_ext = "*"; 40 | @Option(name="-oe", usage="output file extension (default: nlp)", required=false, metaVar="") 41 | public String output_ext = "nlp"; 42 | @Option(name="-format", usage="format of the input data (raw|line|tsv; default: raw)", required=false, metaVar="") 43 | private String format = AbstractNLPDecoder.FORMAT_RAW; 44 | @Option(name="-threads", usage="number of threads (default: 2)", required=false, metaVar="") 45 | protected int threads = 2; 46 | private NLPDecoder decoder; 47 | 48 | // ======================================== CONSTRUCTORS ======================================== 49 | 50 | public NLPDecode(String[] args) 51 | { 52 | BinUtils.initArgs(args, this); 53 | List filelist = FileUtils.getFileList(input_path, input_ext, false); 54 | Collections.sort(filelist); 55 | 56 | decoder = new NLPDecoder(IOUtils.createFileInputStream(configuration_file)); 57 | decoder.decode(filelist, output_ext, format, threads); 58 | } 59 | 60 | static public void main(String[] args) 61 | { 62 | new NLPDecode(args); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/decode/DecodeConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * 
you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.decode; 17 | 18 | import java.io.InputStream; 19 | 20 | import org.w3c.dom.Element; 21 | 22 | import edu.emory.mathcs.nlp.common.util.XMLUtils; 23 | import edu.emory.mathcs.nlp.component.template.config.NLPConfig; 24 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 25 | 26 | /** 27 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 28 | */ 29 | public class DecodeConfig extends NLPConfig 30 | { 31 | private String part_of_speech_tagging; 32 | private String named_entity_recognition; 33 | private String dependency_parsing; 34 | private String semantic_role_labeling; 35 | 36 | public DecodeConfig() {} 37 | 38 | public DecodeConfig(InputStream in) 39 | { 40 | super(in); 41 | initComponents(); 42 | } 43 | 44 | public void initComponents() 45 | { 46 | Element eModels = XMLUtils.getFirstElementByTagName(xml, "models"); 47 | 48 | setPartOfSpeechTagging (XMLUtils.getTextContentFromFirstElementByTagName(eModels, "pos")); 49 | setNamedEntityRecognition(XMLUtils.getTextContentFromFirstElementByTagName(eModels, "ner")); 50 | setDependencyParsing (XMLUtils.getTextContentFromFirstElementByTagName(eModels, "dep")); 51 | setSemanticRoleLabeling (XMLUtils.getTextContentFromFirstElementByTagName(eModels, "srl")); 52 | } 53 | 54 | public String getPartOfSpeechTagging() 55 | { 56 | return part_of_speech_tagging; 57 | } 58 | 59 | public String getNamedEntityRecognition() 60 | { 61 | return named_entity_recognition; 62 | } 63 | 64 | public String getDependencyParsing() 65 | { 66 | return dependency_parsing; 67 | } 68 | 69 | public String getSemanticRoleLabeling() 70 | { 71 | return semantic_role_labeling; 72 | } 73 | 74 | public void setPartOfSpeechTagging(String filename) 75 | { 76 | part_of_speech_tagging = filename; 77 | } 78 | 79 | public void setNamedEntityRecognition(String filename) 80 | { 81 | named_entity_recognition = filename; 82 | } 83 | 84 | public void setDependencyParsing(String filename) 85 | { 86 | dependency_parsing = filename; 87 | } 88 | 89 | public void setSemanticRoleLabeling(String filename) 90 | { 91 | semantic_role_labeling = filename; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /md/quickstart/install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | ## With Maven 4 | 5 | * Make sure [Java 8](http://www.oracle.com/technetwork/java/javase/) and [Maven 3](https://maven.apache.org) are installed on your machine. 6 | * Add the following dependency to `pom.xml`: 7 | 8 | ```xml 9 | 10 | edu.emory.mathcs.nlp 11 | nlp4j 12 | RELEASE 13 | 14 | ``` 15 | 16 | * For English models, add the following dependency to `pom.xml`. 
17 | 18 | ```xml 19 | 20 | edu.emory.mathcs.nlp 21 | nlp4j-english 22 | RELEASE 23 | 24 | ``` 25 | 26 | * Install the maven project: 27 | 28 | ```bash 29 | mvn clean install 30 | ``` 31 | 32 | * Run the following command: 33 | 34 | ```bash 35 | mvn exec:java -Dexec.mainClass="edu.emory.mathcs.nlp.bin.Version" 36 | ``` 37 | 38 | If you see the following message, it is properly installed. 39 | 40 | ``` 41 | [INFO] Scanning for projects... 42 | [INFO] 43 | [INFO] ------------------------------------------------------------------------ 44 | [INFO] Building nlp4j x.x.x 45 | [INFO] ------------------------------------------------------------------------ 46 | [INFO] 47 | [INFO] --- exec-maven-plugin:1.4.0:java (default-cli) @ nlp4j --- 48 | ==================================== 49 | Emory NLP Version x.x.x 50 | Webpage: http://nlp.mathcs.emory.edu 51 | Contact: jinho.choi@emory.edu 52 | ==================================== 53 | [INFO] ------------------------------------------------------------------------ 54 | [INFO] BUILD SUCCESS 55 | [INFO] ------------------------------------------------------------------------ 56 | [INFO] Total time: 0.739s 57 | [INFO] Finished at: Tue Nov 24 20:08:59 EST 2015 58 | [INFO] Final Memory: 11M/247M 59 | [INFO] ------------------------------------------------------------------------ 60 | ``` 61 | 62 | ## Without Maven 63 | 64 | * Make sure [Java 8](http://www.oracle.com/technetwork/java/javase/) is installed on your machine. 65 | * Download [`nlp4j.jar`](http://nlp.mathcs.emory.edu/nlp4j/nlp4j-1.1.2.jar), and add it to your classpath. If you are using [bash](https://www.gnu.org/software/bash/), export `CLASSPATH` as follows: 66 | 67 | ```bash 68 | export CLASSPATH=nlp4j-x.x.x.jar:. 69 | ``` 70 | 71 | * For English models, download [`nlp4j-english.jar`](http://search.maven.org/remotecontent?filepath=edu/emory/mathcs/nlp/nlp4j-english/1.1.2/nlp4j-english-1.1.2.jar), and add it to your classpath: 72 | 73 | ```bash 74 | export CLASSPATH=nlp4j-x.x.x.jar:nlp4j-english-x.x.x.jar:. 75 | ``` 76 | 77 | * Run the following command: 78 | 79 | ```bash 80 | java edu.emory.mathcs.nlp.bin.Version 81 | ``` 82 | 83 | If you see the following message, it is properly installed. 84 | 85 | ``` 86 | ==================================== 87 | NLP4J Version x.x.x 88 | Webpage: http://nlp.mathcs.emory.edu 89 | Contact: jinho.choi@emory.edu 90 | ==================================== 91 | ``` -------------------------------------------------------------------------------- /md/components/part_of_speech_tagging.md: -------------------------------------------------------------------------------- 1 | # Part-of-Speech Tagging 2 | 3 | Our part-of-speech tagger uses the generalized model from dynamic model selection and utilizes ambiguity classes trained on a large corpus. It processes over 82K tokens per second on an Intel Xeon 2.30GHz machine and shows the state-of-the-art accuracy (97.64% on the WSJ corpus). 4 | 5 | * [Dynamic Feature Induction: The Last Gist to the State-of-the-Art](http://naacl.org/naacl-hlt-2016/), Jinho D. Choi, Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (NAACL'16), San Diego, CA, 2016. 6 | * [Intrinsic and Extrinsic Evaluations of Word Embeddings](http://www.aaai.org/Conferences/AAAI/2016/aaai16accepted-papers.pdf), Michael Zhai, Johnny Tan, Jinho D. Choi, Proceedings of the AAAI 2015 Student Program, Phoenix, AZ, 2015. 
7 | * [Fast and Robust Part-of-Speech Tagging Using Dynamic Model Selection](http://aclweb.org/anthology-new/P/P12/P12-2071.pdf), Jinho D. Choi, Martha Palmer, Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL'12), 363-367, Jeju, Korea, 2012. 8 | 9 | ## English Tags 10 | 11 | | Tag | Description | Version | 12 | | ------- | ----------- | ------- | 13 | | `$` | Dollar | 1.0.0 | 14 | | `:` | Colon | 1.0.0 | 15 | | `,` | Comma | 1.0.0 | 16 | | `.` | Period | 1.0.0 | 17 | | ```` | Left quote | 1.0.0 | 18 | | `''` | Right quote | 1.0.0 | 19 | | `-LRB-` | Left bracket | 1.0.0 | 20 | | `-RRB-` | Right bracket | 1.0.0 | 21 | | `ADD` | Email | 1.0.0 | 22 | | `AFX` | Affix | 1.0.0 | 23 | | `CC` | Coordinating conjunction | 1.0.0 | 24 | | `CD` | Cardinal number | 1.0.0 | 25 | | `DT` | Determiner | 1.0.0 | 26 | | `EX` | Existential `there` | 1.0.0 | 27 | | `FW` | Foreign word | 1.0.0 | 28 | | `GW` | Go with | 1.0.0 | 29 | | `HYPH` | Hyphen | 1.0.0 | 30 | | `IN` | Preposition or subordinating conjunction | 1.0.0 | 31 | | `JJ` | Adjective | 1.0.0 | 32 | | `JJR` | Adjective, comparative | 1.0.0 | 33 | | `JJS` | Adjective, superlative | 1.0.0 | 34 | | `LS` | List item marker | 1.0.0 | 35 | | `MD` | Modal | 1.0.0 | 36 | | `NFP` | Superfluous punctuation | 1.0.0 | 37 | | `NN` | Noun, singular or mass | 1.0.0 | 38 | | `NNS` | Noun, plural | 1.0.0 | 39 | | `NNP` | Proper noun, singular | 1.0.0 | 40 | | `NNPS` | Proper noun, plural | 1.0.0 | 41 | | `PDT` | Predeterminer | 1.0.0 | 42 | | `POS` | Possessive ending | 1.0.0 | 43 | | `PRP` | Personal pronoun | 1.0.0 | 44 | | `PRP$` | Possessive pronoun | 1.0.0 | 45 | | `RB` | Adverb | 1.0.0 | 46 | | `RBR` | Adverb, comparative | 1.0.0 | 47 | | `RBS` | Adverb, superlative | 1.0.0 | 48 | | `RP` | Particle | 1.0.0 | 49 | | `SYM` | Symbol | 1.0.0 | 50 | | `TO` | To | 1.0.0 | 51 | | `UH` | Interjection | 1.0.0 | 52 | | `VB` | Verb, base form | 1.0.0 | 53 | | `VBD` | Verb, past tense | 1.0.0 | 54 | | `VBG` | Verb, gerund or present participle | 1.0.0 | 55 | | `VBN` | Verb, past participle | 1.0.0 | 56 | | `VBP` | Verb, non-3rd person singular present | 1.0.0 | 57 | | `VBZ` | Verb, 3rd person singular present | 1.0.0 | 58 | | `WDT` | Wh-determiner | 1.0.0 | 59 | | `WP` | Wh-pronoun | 1.0.0 | 60 | | `WP$` | Wh-pronoun, possessive | 1.0.0 | 61 | | `WRB` | Wh-adverb | 1.0.0 | 62 | | `XX` | Unknown | 1.0.0 | 63 | -------------------------------------------------------------------------------- /md/tutorial/nlp_component.md: -------------------------------------------------------------------------------- 1 | # NLP Component 2 | 3 | All components extend [`NLPComponent`](../../java/edu/emory/mathcs/nlp/component/util/NLPComponent.java), providing general methods for supervised NLP. This class takes three genetic types `N`, `L`, and `S` representing the types of the input nodes, the label, and the [processing state](processing_state.md), respectively. 4 | 5 | ```java 6 | public abstract class NLPComponent> implements Serializable 7 | ``` 8 | 9 | This class contains several abstract methods: 10 | 11 | ```java 12 | /** @return the processing state for the input nodes. */ 13 | protected abstract S createState(N[] nodes); 14 | 15 | /** @return the gold-standard label for training; otherwise, the predicted label. */ 16 | protected abstract L getLabel(S state, StringVector vector); 17 | 18 | /** Adds a training instance (label, x) to the statistical model. 
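 * Called only during training and bootstrapping (see the process method below).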
*/ 19 | protected abstract void addInstance(L label, StringVector vector); 20 | 21 | /** @return the vector consisting of all features extracted from the state. */ 22 | protected abstract StringVector extractFeatures(S state); 23 | ``` 24 | 25 | These abstract methods are used in the `process` method providing a genetic way for processing the NLP component. 26 | 27 | ```java 28 | public void process(N[] nodes) 29 | { 30 | S state = createState(nodes); 31 | if (!isDecode()) state.clearGoldLabels(); 32 | 33 | while (!state.isTerminate()) 34 | { 35 | StringVector vector = extractFeatures(state); 36 | if (isTrainOrBootstrap()) addInstance(state.getGoldLabel(), vector); 37 | L label = getLabel(state, vector); 38 | state.setLabel(label); 39 | state.next(); 40 | } 41 | 42 | if (isEvaluate()) state.evaluate(eval); 43 | } 44 | ``` 45 | 46 | * The `process` method takes an array of nodes with the genetic type `N`. 47 | 48 | ```java 49 | public void process(N[] nodes) 50 | ``` 51 | 52 | * It begins by creating a [processing state](processing_state.md). 53 | 54 | ```java 55 | S state = createState(nodes); 56 | ``` 57 | 58 | * It is important to clear out and save existing gold-standard labels before training; accidental usage of these labels can lead to inflated evaluation scores. 59 | 60 | ```java 61 | if (!isDecode()) state.clearGoldLabels(); 62 | ``` 63 | 64 | * The method iterates through every state as defined in the [processing state](processing_state.md). 65 | 66 | ```java 67 | while (!state.isTerminate()) 68 | { 69 | ... 70 | state.next(); 71 | } 72 | ``` 73 | 74 | * For each state, it creates a vector consisting of features extracted from the current state. 75 | 76 | ```java 77 | StringVector vector = extractFeatures(state); 78 | ``` 79 | 80 | * During training and bootstrapping, it adds the training instance to the statistical model. 81 | 82 | ```java 83 | if (isTrainOrBootstrap()) addInstance(state.getGoldLabel(), vector); 84 | ``` 85 | 86 | * Given the feature vector, it predicts the label of the current state either from the oracle or the statistical model. 87 | 88 | ```java 89 | L label = getLabel(state, vector); 90 | ``` 91 | 92 | * Finally, it assigns the label to the current state. 93 | 94 | ```java 95 | state.setLabel(label); 96 | ``` 97 | 98 | * During evaluation, the accuracy counts are updated to the evaluator. 99 | 100 | ```java 101 | if (isEvaluate()) state.evaluate(eval); 102 | ``` 103 | 104 | -------------------------------------------------------------------------------- /md/tutorial/create_nlp_component.md: -------------------------------------------------------------------------------- 1 | # Online Component 2 | 3 | All components extend [`NLPComponent`](../../java/edu/emory/mathcs/nlp/component/util/NLPComponent.java), providing general methods for supervised NLP. This class takes three genetic types `N`, `L`, and `S` representing the types of the input nodes, the label, and the [processing state](processing_state.md), respectively. 4 | 5 | ```java 6 | public abstract class NLPComponent> implements Serializable 7 | ``` 8 | 9 | This class contains several abstract methods: 10 | 11 | ```java 12 | /** @return the processing state for the input nodes. */ 13 | protected abstract S createState(N[] nodes); 14 | 15 | /** @return the gold-standard label for training; otherwise, the predicted label. */ 16 | protected abstract L getLabel(S state, StringVector vector); 17 | 18 | /** Adds a training instance (label, x) to the statistical model. 
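 * Called only during training and bootstrapping (see the process method below).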
*/ 19 | protected abstract void addInstance(L label, StringVector vector); 20 | 21 | /** @return the vector consisting of all features extracted from the state. */ 22 | protected abstract StringVector extractFeatures(S state); 23 | ``` 24 | 25 | These abstract methods are used in the `process` method providing a genetic way for processing the NLP component. 26 | 27 | ```java 28 | public void process(N[] nodes) 29 | { 30 | S state = createState(nodes); 31 | if (!isDecode()) state.clearGoldLabels(); 32 | 33 | while (!state.isTerminate()) 34 | { 35 | StringVector vector = extractFeatures(state); 36 | if (isTrainOrBootstrap()) addInstance(state.getGoldLabel(), vector); 37 | L label = getLabel(state, vector); 38 | state.setLabel(label); 39 | state.next(); 40 | } 41 | 42 | if (isEvaluate()) state.evaluate(eval); 43 | } 44 | ``` 45 | 46 | * The `process` method takes an array of nodes with the genetic type `N`. 47 | 48 | ```java 49 | public void process(N[] nodes) 50 | ``` 51 | 52 | * It begins by creating a [processing state](processing_state.md). 53 | 54 | ```java 55 | S state = createState(nodes); 56 | ``` 57 | 58 | * It is important to clear out and save existing gold-standard labels before training; accidental usage of these labels can lead to inflated evaluation scores. 59 | 60 | ```java 61 | if (!isDecode()) state.clearGoldLabels(); 62 | ``` 63 | 64 | * The method iterates through every state as defined in the [processing state](processing_state.md). 65 | 66 | ```java 67 | while (!state.isTerminate()) 68 | { 69 | ... 70 | state.next(); 71 | } 72 | ``` 73 | 74 | * For each state, it creates a vector consisting of features extracted from the current state. 75 | 76 | ```java 77 | StringVector vector = extractFeatures(state); 78 | ``` 79 | 80 | * During training and bootstrapping, it adds the training instance to the statistical model. 81 | 82 | ```java 83 | if (isTrainOrBootstrap()) addInstance(state.getGoldLabel(), vector); 84 | ``` 85 | 86 | * Given the feature vector, it predicts the label of the current state either from the oracle or the statistical model. 87 | 88 | ```java 89 | L label = getLabel(state, vector); 90 | ``` 91 | 92 | * Finally, it assigns the label to the current state. 93 | 94 | ```java 95 | state.setLabel(label); 96 | ``` 97 | 98 | * During evaluation, the accuracy counts are updated to the evaluator. 99 | 100 | ```java 101 | if (isEvaluate()) state.evaluate(eval); 102 | ``` 103 | 104 | -------------------------------------------------------------------------------- /md/components/dependency_parsing.md: -------------------------------------------------------------------------------- 1 | # Dependency Parsing 2 | 3 | Our dependency parser uses a transition-based, non-projective parsing algorithm showing a linear-time speed for both projective and non-projective parsing. It processes over 14K tokens per second on an Intel Xeon 2.30GHz machine, and shows the near state-of-the-art accuracy for greedy parsing (92.26% on the WSJ corpus). 4 | 5 | * [It Depends: Dependency Parser Comparison Using A Web-based Evaluation Tool](http://www.aclweb.org/anthology/P15-1038.pdf), Jinho D. Choi, Amanda Stent, Joel Tetreault, Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics (ACL'15), 387–396, Beijing, China, 2015. 6 | * [Transition-based Dependency Parsing with Selectional Branching](http://aclweb.org/anthology/P/P13/P13-1104.pdf), Jinho D. 
Choi, Andrew McCallum, Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (ACL'13), 1052-1062, Sofia, Bulgaria, 2013. 7 | * [Getting the Most out of Transition-based Dependency Parsing](http://aclweb.org/anthology-new/P/P11/P11-2121.pdf), Jinho D. Choi, Martha Palmer, Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL'11), 687-692, Portland, Oregon, 2011. 8 | 9 | ## English Labels 10 | 11 | | Label | Description | Version | 12 | | ----------- | ----------- | ------- | 13 | | `acl` | Clausal modifier of noun | 1.0.0 | 14 | | `acomp` | Adjectival complement | 1.0.0 | 15 | | `advcl` | Adverbial clause modifier | 1.0.0 | 16 | | `advmod` | Adverbial modifier | 1.0.0 | 17 | | `agent` | Agent (passive) | 1.0.0 | 18 | | `appos` | Appositional modifier | 1.0.0 | 19 | | `attr` | Attribute | 1.0.0 | 20 | | `aux` | Auxiliary verb | 1.0.0 | 21 | | `auxpass` | Auxiliary verb (passive) | 1.0.0 | 22 | | `case` | Case marker | 1.0.0 | 23 | | `cc` | Coordinating conjunction | 1.0.0 | 24 | | `ccomp` | Clausal complement | 1.0.0 | 25 | | `compound` | Compound word | 1.0.0 | 26 | | `conj` | Conjunct | 1.0.0 | 27 | | `csubj` | Clausal subject | 1.0.0 | 28 | | `csubjpass` | Clausal subject (passive) | 1.0.0 | 29 | | `dative` | Dative | 1.0.0 | 30 | | `dep` | Unclassified dependent | 1.0.0 | 31 | | `det` | Determiner | 1.0.0 | 32 | | `discourse` | Discourse element | 1.0.0 | 33 | | `dobj` | Direct Object | 1.0.0 | 34 | | `expl` | Expletive | 1.0.0 | 35 | | `mark` | Marker | 1.0.0 | 36 | | `meta` | Meta data | 1.0.0 | 37 | | `neg` | Negation modifier | 1.0.0 | 38 | | `nmod` | Modifier of nominal | 1.0.0 | 39 | | `npadvmod` | Noun phrase as adverbial modifier | 1.0.0 | 40 | | `nsubj` | Nominal subject | 1.0.0 | 41 | | `nsubjpass` | Nominal subject (passive) | 1.0.0 | 42 | | `oprd` | Object predicate | 1.0.0 | 43 | | `parataxis` | Parataxis | 1.0.0 | 44 | | `pcomp` | Preposition complement | 1.0.0 | 45 | | `pobj` | Preposition object | 1.0.0 | 46 | | `poss` | Possession modifier | 1.0.0 | 47 | | `preconj` | Precorrelative conjunction | 1.0.0 | 48 | | `predet` | Predeterminer | 1.0.0 | 49 | | `prep` | Prepositional modifier | 1.0.0 | 50 | | `prt` | Verb particle | 1.0.0 | 51 | | `punct` | Punctuation | 1.0.0 | 52 | | `qmod` | Modifier of quantifier | 1.0.0 | 53 | | `relcl` | Relative clause modifier | 1.0.0 | 54 | | `root` | Root | 1.0.0 | 55 | | `vocative` | Vocative modifier | 1.0.0 | 56 | | `xcomp` | Open clausal complement | 1.0.0 | -------------------------------------------------------------------------------- /md/supplements/english-lexica-models.md: -------------------------------------------------------------------------------- 1 | # English 2 | 3 | ## Lexica 4 | 5 | All lexica can be found [here](https://bitbucket.org/emorynlp/nlp4j-english/src): 6 | 7 | * `en-ambiguity-classes-simplified.xz`
: ambiguity classes for part-of-speech tagging with simplified word forms. 8 | * `en-ambiguity-classes-simplified-lowercase.xz`
: ambiguity classes for part-of-speech tagging with simplified lowercase word forms. 9 | * `en-brown-clusters-simplified-lowercase.xz`
: Brown clusters with simplified lowercase word forms. 10 | * `en-named-entity-gazetteers-simplified.xz`
: gazetteers for named entity recognition with simplified word forms. 11 | * `en-named-entity-gazetteers-simplified-lowercase.xz`
: gazetteers for named entity recognition with simplified lowercase word forms. 12 | * `en-stop-words-simplified-lowercase.xz`
: stop words with simplified lowercase word forms. 13 | * `en-word-embeddings-undigitalized.xz`
: word embeddings with undigitalized word forms. 14 | 15 | ## Models 16 | 17 | All models can be found [here](https://bitbucket.org/emorynlp/nlp4j-english/src): 18 | 19 | * `en-pos.xz`: part-of-speech tagging. 20 | * `en-ner.xz`: named entity recognition. 21 | * `en-dep.xz`: dependency parsing. 22 | 23 | Models are trained on the following corpora. 24 | 25 | | [OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) | Sentences | Tokens | Names | 26 | | -------------------------- | -----: | ------: | -----: | 27 | | Broadcasting conversations | 10,822 | 171,101 | 9,771 | 28 | | Broadcasting news | 10,344 | 206,029 | 19,670 | 29 | | News magazines | 6,672 | 163,627 | 10,736 | 30 | | Newswires | 34,438 | 875,800 | 77,496 | 31 | | Religious texts | 21,418 | 296,432 | 0 | 32 | | Telephone conversations | 8,963 | 85,444 | 2,021 | 33 | | Web texts | 12,448 | 284,951 | 8,170 | 34 | 35 | |    [English Web Treebank](https://catalog.ldc.upenn.edu/LDC2012T13)    | Sentences | Tokens | 36 | | --------- | ----: | -----: | 37 | | Answers | 2,699 | 43,916 | 38 | | Email | 2,983 | 44,168 | 39 | | Newsgroup | 1,996 | 37,816 | 40 | | Reviews | 2,915 | 44,337 | 41 | | Weblog | 1,753 | 38,770 | 42 | 43 | |           [QuestionBank](http://www.computing.dcu.ie/~jjudge/qtreebank/)           | Sentences | Tokens | 44 | | --------- | ----: | -----: | 45 | | Questions | 3,198 | 29,704 | 46 | 47 | |                [MiPACQ](http://clear.colorado.edu/compsem/index.php?page=endendsystems&sub=mipacq)                | Sentences | Tokens | 48 | | ------------------- | --------------: | -----------: | 49 | | Clinical questions | 1,600 | 30,138 | 50 | | Medpedia articles | 2,796 | 49,922 | 51 | | Clinical notes | 8,383 | 113,164 | 52 | | Pathological notes | 1,205 | 21,353 | 53 | 54 | | [SHARP](http://informatics.mayo.edu/sharp/index.php/Main_Page) | Sentences | Tokens | 55 | | -------------------------------------- | -----: | ------: | 56 | | Seattle group health notes   | 7,204 | 94,450 | 57 | | Clinical notes | 6,807 | 93,914 | 58 | | Stratified | 4,320 | 43,536 | 59 | | Stratified SGH | 13,662 | 139,403 | 60 | 61 | | [THYME](http://clear.colorado.edu/compsem/index.php?page=endendsystems&sub=temporal) | Sentences | Tokens | 62 | | ----------------------------- | -----: | ------: | 63 | | Clinical / pathological notes | 26,661 | 387,943 | 64 | | Brain cancer | 18,722 | 225,899 | 65 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/network/NLPSocketServer.java: -------------------------------------------------------------------------------- 1 | /** 2 | // * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package edu.emory.mathcs.nlp.network; 17 | 18 | import java.io.BufferedInputStream; 19 | import java.io.BufferedOutputStream; 20 | import java.io.DataInputStream; 21 | import java.io.DataOutputStream; 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | import java.io.OutputStream; 25 | import java.net.ServerSocket; 26 | import java.net.Socket; 27 | import java.util.concurrent.ExecutorService; 28 | import java.util.concurrent.Executors; 29 | 30 | import edu.emory.mathcs.nlp.common.util.IOUtils; 31 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 32 | 33 | 34 | /** 35 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 36 | */ 37 | public class NLPSocketServer 38 | { 39 | static public final String END = "!E@N#D$"; 40 | private NLPDecoder decoder; 41 | 42 | @SuppressWarnings("resource") 43 | public NLPSocketServer(InputStream configuration, int port, int threads) throws Exception 44 | { 45 | ExecutorService executor = Executors.newFixedThreadPool(threads); 46 | ServerSocket server = new ServerSocket(port); 47 | Socket client; 48 | 49 | decoder = new NLPDecoder(configuration); 50 | System.out.println("Listening..."); 51 | 52 | while (true) 53 | { 54 | client = server.accept(); 55 | executor.submit(new NLPTask(client)); 56 | } 57 | 58 | // executor.shutdown(); 59 | // server.close(); 60 | } 61 | 62 | class NLPTask implements Runnable 63 | { 64 | OutputStream out; 65 | InputStream in; 66 | Socket client; 67 | 68 | public NLPTask(Socket client) 69 | { 70 | try 71 | { 72 | in = new DataInputStream (new BufferedInputStream (client.getInputStream())); 73 | out = new DataOutputStream(new BufferedOutputStream(client.getOutputStream())); 74 | this.client = client; 75 | // System.out.println(client.getInetAddress().toString()); 76 | } 77 | catch (IOException e) {e.printStackTrace();} 78 | } 79 | 80 | @Override 81 | public void run() 82 | { 83 | StringBuilder build = new StringBuilder(); 84 | byte[] buffer = new byte[2048]; 85 | String s, format; 86 | int i, idx; 87 | 88 | try 89 | { 90 | while ((i = in.read(buffer, 0, buffer.length)) >= 0) 91 | { 92 | build.append(new String(buffer, 0, i)); 93 | 94 | if (build.toString().endsWith(END)) 95 | { 96 | idx = build.indexOf(":"); 97 | format = build.substring(0, idx); 98 | s = build.substring(idx+1, build.length()-END.length()); 99 | out.write(decoder.decodeByteArray(s, format)); 100 | out.close(); 101 | in.close(); 102 | break; 103 | } 104 | } 105 | } 106 | catch (IOException e) {e.printStackTrace();} 107 | } 108 | } 109 | 110 | static public void main(String[] args) throws Exception 111 | { 112 | final String configFile = args[0]; 113 | final int port = Integer.parseInt(args[1]); 114 | final int threads = Integer.parseInt(args[2]); 115 | new NLPSocketServer(IOUtils.createFileInputStream(configFile), port, threads); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-ner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 13 | edu/emory/mathcs/nlp/lexica/en-word-embeddings-undigitalized.xz 14 | edu/emory/mathcs/nlp/lexica/en-named-entity-gazetteers-simplified.xz 15 | 16 | 17 | 18 | 0.00001 19 | adagrad-mini-batch 20 | 0.02 21 | 0 22 | 23 | 5 24 | 20 25 | 0 26 | 27 | 28 | 29 | 86.98 30 | 0.01 31 | 2 32 | 0.05 33 | 0.005 34 | 35 | 36 | 37 | 38 | 39 | 40 | 
41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-pos.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | edu/emory/mathcs/nlp/lexica/en-ambiguity-classes-simplified-lowercase.xz 10 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 11 | 12 | 13 | 14 | 0.00001 15 | adagrad-mini-batch 16 | 0.02 17 | 2 18 | 19 | 5 20 | 40 21 | 0 22 | 23 | 24 | 25 | 97.48 26 | 0.01 27 | 2 28 | 0.05 29 | 0.005 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /md/quickstart/decode.md: -------------------------------------------------------------------------------- 1 | # Decode 2 | 3 | ## Command-Line 4 | 5 | The following command runs the NLP pipeline for tokenization, part-of-speech tagging, morphological analysis, named entity recognition, dependency parsing, and semantic role labeling: 6 | 7 | ```bash 8 | java edu.emory.mathcs.nlp.bin.NLPDecode -c -i [-ie -oe -format -threads ] 9 | 10 | -c : configuration filename (required) 11 | -i : input path (required) 12 | -ie : input file extension (default: *) 13 | -oe : output file extension (default: nlp) 14 | -format : format of the input data (raw|line|tsv; default: raw) 15 | -threads : number of threads (default: 2) 16 | ``` 17 | 18 | * `-c` specifies the configuration file (see [configuration](#configuration)). 19 | * `-i` specifies the input path pointing to either a file or a directory. When the path points to a file, only the specific file is processed. When the path points to a directory, all files with the file extension `-ie` under the specific directory are processed. 20 | * `-ie` specifies the input file extension. The default value `*` implies files with any extension. This option is used only when the input path `-i` points to a directory. 21 | * `-oe` specifies the output file extension appended to each input filename. The corresponding output file, consisting of the NLP output, will be generated. 22 | * `-format` specifies the format of the input file: `raw`, `line`, or `tsv` (see [data format](../supplements/data-format.md)). 23 | * `-threads` specifies the number of threads to be used. When multi-threads are used, each file is assigned to an individual thread. 24 | 25 | ## Example 26 | 27 | The following command takes [`nlp4j.txt`](../../src/test/resources/dat/nlp4j.txt) and generates [`nlp4j.txt.nlp`](../../src/test/resources/dat/nlp4j.txt.nlp) using [`config-decode-en.xml`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml). 
28 | 29 | ```bash 30 | $ java -Xmx4g -XX:+UseConcMarkSweepGC edu.emory.mathcs.nlp.bin.NLPDecode -c config-decode-general.xml -i emorynlp.txt 31 | 32 | Loading ambiguity classes 33 | Loading word clusters 34 | Loading word embeddings 35 | Loading named entity gazetteers 36 | Loading tokenizer 37 | Loading part-of-speech tagger 38 | Loading morphological analyzer 39 | Loading named entity recognizer 40 | Loading dependency parser 41 | 42 | nlp4j.txt 43 | ``` 44 | 45 | * Use the [`-XX:+UseConcMarkSweepGC`](http://www.oracle.com/technetwork/java/tuning-139912.html) option for JVM, which reduces the memory usage into a half. 46 | * Use [`log4j.properties`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/log4j.properties) for the [log4j](http://logging.apache.org/log4j/) configuration. 47 | * The output file is generated in the `tsv` format (see [data format](../supplements/data-format.md#tab-separated-values-format)). 48 | 49 | ## Configuration 50 | 51 | Sample configuration files for decoding can be found here: [`config-decode-*`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/). 52 | 53 | ```xml 54 | 55 | 56 | 57 | 58 | 59 | 60 | en-ambiguity-classes-simplified-lowercase.xz 61 | en-brown-clusters-simplified-lowercase.xz 62 | en-named-entity-gazetteers-simplified.xz 63 | en-word-embeddings-undigitalized.xz 64 | 65 | 66 | 67 | en-pos.xz 68 | en-ner.xz 69 | en-dep.xz 70 | 71 | 72 | ``` 73 | 74 | * ``: see [`configuration#tsv`](train.md#configuration). This does not need to be specified when `raw` or `sen` is used. 75 | * ``: see [`configuration#lexica`](train.md#configuration). 76 | * `` specifies the statistical model for each component (e.g., [english models](../supplements/english-lexica-models.md#models); see [`NLPMode`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/NLPMode.java)). -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/zzz/TokenizeIt.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package edu.emory.mathcs.nlp.zzz; 17 | 18 | import java.io.BufferedReader; 19 | import java.io.InputStream; 20 | import java.io.OutputStream; 21 | import java.io.PrintStream; 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | 25 | import edu.emory.mathcs.nlp.common.util.FileUtils; 26 | import edu.emory.mathcs.nlp.common.util.IOUtils; 27 | import edu.emory.mathcs.nlp.common.util.Splitter; 28 | import edu.emory.mathcs.nlp.component.template.node.FeatMap; 29 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 30 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 31 | import edu.emory.mathcs.nlp.tokenization.EnglishTokenizer; 32 | import edu.emory.mathcs.nlp.tokenization.Token; 33 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 34 | 35 | /** 36 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 37 | */ 38 | public class TokenizeIt 39 | { 40 | NLPDecoder decoder; 41 | Tokenizer tokenizer; 42 | 43 | public TokenizeIt(String configFile) 44 | { 45 | decoder = new NLPDecoder(IOUtils.createFileInputStream(configFile)); 46 | tokenizer = new EnglishTokenizer(); 47 | } 48 | 49 | public void convert(String inputDir, String outputDir) 50 | { 51 | for (String inputFile : FileUtils.getFileList(inputDir, "tsv")) 52 | { 53 | String outputFile = outputDir+"/"+FileUtils.getBaseName(inputFile); 54 | System.out.println(FileUtils.getBaseName(inputFile)); 55 | 56 | try 57 | { 58 | convert(IOUtils.createFileInputStream(inputFile), IOUtils.createFileOutputStream(outputFile)); 59 | } 60 | catch (Exception e) {e.printStackTrace();} 61 | } 62 | } 63 | 64 | public void convert(InputStream in, OutputStream out) throws Exception 65 | { 66 | BufferedReader reader = IOUtils.createBufferedReader(in); 67 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 68 | List list = new ArrayList<>(); 69 | NLPNode[] nodes; 70 | String line; 71 | String[] t; 72 | 73 | while ((line = reader.readLine()) != null && !(line = line.trim()).isEmpty()) 74 | { 75 | t = Splitter.splitTabs(line); 76 | list.add(new ItToken(t[0], t[1])); 77 | } 78 | 79 | for (List tokens : tokenizer.segmentize(list)) 80 | { 81 | nodes = decoder.toNodeArray(tokens, token -> create(token)); 82 | decoder.decode(nodes); 83 | check(nodes); 84 | fout.println(decoder.toString(nodes)+"\n"); 85 | } 86 | 87 | reader.close(); 88 | fout.close(); 89 | } 90 | 91 | public void check(NLPNode[] nodes) 92 | { 93 | for (int i=1; i> P_BEFORE, P_AFTER; 45 | Pattern NEW_LINE = Pattern.compile("\n"); 46 | 47 | public void categorize(String inputFile) throws Exception 48 | { 49 | CSVParser parser = new CSVParser(IOUtils.createBufferedReader(inputFile), CSVFormat.DEFAULT); 50 | List records = parser.getRecords(); 51 | StringJoiner join; 52 | CSVRecord record; 53 | 54 | for (int i=0; i<=500; i++) 55 | { 56 | if (i == 0) continue; 57 | record = records.get(i); 58 | join = new StringJoiner(" "); 59 | 60 | for (int j=2; j<7; j++) 61 | join.add(record.get(j)); 62 | 63 | System.out.println(join.toString()); 64 | } 65 | 66 | parser.close(); 67 | } 68 | 69 | public void tokenize(String inputFile, int outputStart) throws Exception 70 | { 71 | CSVParser parser = new CSVParser(IOUtils.createBufferedReader(inputFile), CSVFormat.DEFAULT); 72 | String inputPath = FileUtils.getPath(inputFile)+"/"; 73 | List records = parser.getRecords(); 74 | Tokenizer tokenizer = new EnglishTokenizer(); 75 | 76 | P_BEFORE = new ArrayList<>(); 77 | P_AFTER = new ArrayList<>(); 78 | for (String s : BEFORE) P_BEFORE.add(new Pair<>(Pattern.compile(s), "\n"+s)); 79 | 
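        // strings in AFTER get the newline appended instead; print() later splits on these
        // newlines so each matched string ends up on its own output line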
for (String s : AFTER ) P_AFTER .add(new Pair<>(Pattern.compile(s), s+"\n")); 80 | 81 | for (int i=0; i tokens : tokenizer.segmentize(records.get(i).get(0))) 86 | print(fout, tokens); 87 | 88 | fout.close(); 89 | } 90 | 91 | parser.close(); 92 | } 93 | 94 | String getOuputFilename(String inputPath, int index) 95 | { 96 | StringBuilder build = new StringBuilder(); 97 | 98 | build.append(inputPath); 99 | if (index < 1000) build.append(0); 100 | if (index < 100) build.append(0); 101 | if (index < 10) build.append(0); 102 | build.append(index); 103 | build.append(".txt"); 104 | 105 | return build.toString(); 106 | } 107 | 108 | void print(PrintStream fout, List tokens) 109 | { 110 | String s = Joiner.join(tokens, " "); 111 | 112 | for (Pair p : P_BEFORE) 113 | { 114 | Matcher m = p.o1.matcher(s); 115 | if (m.find()) s = m.replaceAll(p.o2); 116 | } 117 | 118 | for (Pair p : P_AFTER) 119 | { 120 | Matcher m = p.o1.matcher(s); 121 | if (m.find()) s = m.replaceAll(p.o2); 122 | } 123 | 124 | for (String t : NEW_LINE.split(s)) 125 | { 126 | t = t.trim(); 127 | if (!t.isEmpty()) fout.println(t.trim()); 128 | } 129 | } 130 | 131 | static public void main(String[] args) 132 | { 133 | // String inputFile = "/Users/jdchoi/Emory/radiology/tools/500/500-original.csv"; 134 | // String inputFile = "/Users/jdchoi/Emory/radiology/dat/radiology_report_151112_lemmon.csv"; 135 | 136 | String inputFile = "/Users/jdchoi/Emory/radiology/de-identification/1986/Remaining_1986Reports_FULL.csv"; 137 | 138 | try 139 | { 140 | CSVRadiology cvs = new CSVRadiology(); 141 | cvs.tokenize(inputFile, 500); 142 | // cvs.categorize(inputFile); 143 | } 144 | catch (Exception e) {e.printStackTrace();} 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /md/quickstart/release.md: -------------------------------------------------------------------------------- 1 | # Release Notes 2 | 3 | ## Version 1.1.2 (06/29/2016) 4 | 5 | * Bugfixes: [tokenization-issue-7](https://github.com/emorynlp/nlp4j-tokenization/issues/7) 6 | * Features: [tokenization-issue-6](https://github.com/emorynlp/nlp4j-tokenization/issues/6) 7 | * The tokenizer does not tokenize left/right brackets where the content inside is a single character or all numbers (e.g., `(a)`,`[12]`). 8 | 9 | ## Version 1.1.1 (04/29/2016) 10 | 11 | * Bugfixes: [core-pull-7](https://github.com/emorynlp/nlp4j-core/pull/7). 12 | * Features: [issue-3](https://github.com/emorynlp/nlp4j/issues/3/), [issue-6](https://github.com/emorynlp/nlp4j/issues/6). 13 | * [NLPNode](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/NLPNode.java) comes with several useful constructors. 14 | * The `segmentize` method in [Tokenizer](https://github.com/emorynlp/nlp4j-tokenization/blob/master/src/main/java/edu/emory/mathcs/nlp/tokenization/Tokenizer.java) takes the generic type of [Token](https://github.com/emorynlp/nlp4j-tokenization/blob/master/src/main/java/edu/emory/mathcs/nlp/tokenization/Token.java). 15 | 16 | ## Version 1.1.0 (04/20/2016) 17 | 18 | * All the statistical models are about twice smaller than the previous ones without compromising accuracy. The whole pipeline can be run in 4GB of RAM now. 19 | * [Training](train.md) automatically saves the best model in a single pass (no need to run training twice any more to save the best model). 
20 | * The [nlp4j-common](https://github.com/emorynlp/nlp4j-common) project is separated out from the [nlp4j-core](https://github.com/emorynlp/nlp4j-core) project. 21 | * [GlobalLexica](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/GlobalLexica.java) is no longer static, so it does not get conflicted by another process. 22 | * [NLPNode](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/NLPNode.java) extends [AbstractNLPNode](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/AbstractNLPNode.java), which allows to create your own custom node. Generics are added all over for this change (e.g., [NLPState](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/state/NLPState.java), [OnlineComponent](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/OnlineComponent.java)). 23 | * The part-of-speech tagger gives the 2nd-best predictions when the best predictions have low confidence (`pos2` in the extra feats). 24 | * Thanks to [Anatoly Vostryakov](https://github.com/avostryakov): [`adjective.exc`](https://github.com/emorynlp/nlp4j-morphology/blob/master/src/main/resources/edu/emory/mathcs/nlp/component/morph/english/adjective.exc) and [`adverb.base`](https://github.com/emorynlp/nlp4j-morphology/blob/master/src/main/resources/edu/emory/mathcs/nlp/component/morph/english/adverb.base) are cleaned up. 25 | * Thanks to [spraynasal](https://github.com/spraynasal): some bugs in tokenization are fixed [5](https://github.com/emorynlp/nlp4j-tokenization/pull/5). 26 | 27 | ## Version 1.0.0 (02/24/2016) 28 | 29 | * NLP4J is the successor of the widely used toolkit, [ClearNLP](https://github.com/clir/clearnlp), developed by the [NLP Research Group](http://nlp.mathcs.emory.edu) at Emory University. Please visit our [Github page](https://github.com/emorynlp/nlp4j) for more details about this project. 30 | * This version supports tokenization, part-of-speech tagging, morphological analysis, named entity recognition, and dependency parsing. The next release (March, 2016) will include supports for semantic role labeling and sentiment analysis, and the following release (April, 2016) will include supports for coreference resolution. 31 | * NLP4J makes it easy to train your own model. Please see [how to train](train.md) for more details about the training process. 32 | * Calling the decoding API is easier than ever. See [NLPDemo](../../src/main/java/edu/emory/mathcs/nlp/bin/NLPDemo.java) for more details. 33 | * The biggest difference between NLP4J and ClearNLP is in machine learning. NLP4J is capable of updating existing models with new training data, which is useful for domain adaptation. We also started implementing a deep learning package although we realized that the GPU support for Java is pretty limited and without a good GPU support, deep learning would make everything much slower. Please let us know if you'd like to contribute for this project. 34 | * One could consider the NLP4J project is a more stabilized version of ClearNLP. I have been using this package for the NLP course I teach, and my students (including undergrads) were able to develop new NLP components without much effort using the built-in APIs in NLP4J. We are preparing a tutorial for developing NLP components using NLP4J. 
35 | * We do not expect our tools would work perfectly out of box. We now have a good team working on this project. Please let us know if you'd like to collaborate so we can make this project more robust for you. 36 | * Please visit our [online demo](http://nlp.mathcs.emory.edu:8080/nlp4j). It parses 10K tokens with a couple of seconds and visualizes the dependency trees. 37 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/zzz/CSVSentiment.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.zzz; 17 | 18 | import java.io.PrintStream; 19 | import java.util.List; 20 | 21 | import org.apache.commons.csv.CSVFormat; 22 | import org.apache.commons.csv.CSVParser; 23 | import org.apache.commons.csv.CSVRecord; 24 | 25 | import edu.emory.mathcs.nlp.common.util.FileUtils; 26 | import edu.emory.mathcs.nlp.common.util.IOUtils; 27 | import edu.emory.mathcs.nlp.common.util.Joiner; 28 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 29 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 30 | import edu.emory.mathcs.nlp.tokenization.EnglishTokenizer; 31 | import edu.emory.mathcs.nlp.tokenization.Token; 32 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 33 | 34 | /** 35 | * @author Jinho D. 
Choi ({@code jinho.choi@emory.edu}) 36 | */ 37 | public class CSVSentiment 38 | { 39 | Tokenizer tokenizer; 40 | NLPDecoder decode; 41 | 42 | public CSVSentiment() 43 | { 44 | tokenizer = new EnglishTokenizer(); 45 | } 46 | 47 | public CSVSentiment(String configurationFile) 48 | { 49 | decode = new NLPDecoder(IOUtils.createFileInputStream(configurationFile)); 50 | } 51 | 52 | public void categorize(String inputFile) throws Exception 53 | { 54 | CSVParser parser = new CSVParser(IOUtils.createBufferedReader(inputFile), CSVFormat.DEFAULT); 55 | List records = parser.getRecords(); 56 | List document; 57 | String outputDir; 58 | PrintStream fout; 59 | CSVRecord record; 60 | 61 | System.out.println(inputFile); 62 | 63 | for (int i=0; i records = parser.getRecords(); 84 | List tokens; 85 | CSVRecord record; 86 | int label; 87 | 88 | System.out.println(inputFile); 89 | 90 | for (int i=0; i records = parser.getRecords(); 108 | CSVRecord record; 109 | 110 | System.out.println(inputFile); 111 | 112 | for (int i=0; i inputFiles = FileUtils.getFileList("/Users/jdchoi/Documents/Data/semeval-sentiment/csv", "csv"); 137 | 138 | try 139 | { 140 | // String configurationFile = "/Users/jdchoi/Documents/EmoryNLP/nlp4j/src/main/resources/edu/emory/mathcs/nlp/configuration/config-decode-en.xml"; 141 | // CSVSentiment cvs = new CSVSentiment(configurationFile); 142 | // for (String inputFile : inputFiles) cvs.categorize(inputFile); 143 | 144 | // CSVSentiment cvs = new CSVSentiment(); 145 | // for (String inputFile : inputFiles) cvs.toTSV(inputFile); 146 | 147 | CSVSentiment cvs = new CSVSentiment(); 148 | for (String inputFile : inputFiles) cvs.toTXT(inputFile); 149 | } 150 | catch (Exception e) {e.printStackTrace();} 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/bin/DEPEvaluate.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.bin; 17 | 18 | import edu.emory.mathcs.nlp.common.util.BinUtils; 19 | import edu.emory.mathcs.nlp.common.util.FileUtils; 20 | import edu.emory.mathcs.nlp.component.template.node.NLPNode; 21 | import edu.emory.mathcs.nlp.component.template.reader.TSVReader; 22 | import edu.emory.mathcs.nlp.decode.DecodeConfig; 23 | import edu.emory.mathcs.nlp.decode.NLPDecoder; 24 | import org.kohsuke.args4j.Option; 25 | 26 | import java.io.InputStream; 27 | import java.nio.file.Files; 28 | import java.nio.file.Paths; 29 | import java.nio.file.StandardOpenOption; 30 | import java.util.Collections; 31 | import java.util.List; 32 | 33 | /** 34 | * A command-line program that does LAS/UAS evaluation for dependency parsing. 35 | * By default, it allows NLP4J to predict the part of speech tags. Optionally, 36 | * it will use POS tags from the input TSV. 
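 * UAS counts a token as correct when its predicted head matches the gold head; LAS additionally
 * requires the predicted dependency label to match. When POS tags are predicted (the default),
 * POS accuracy is reported as well.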
37 | */ 38 | public class DEPEvaluate { 39 | 40 | @Option(name="-c", usage="configuration filename (required)", required=true, metaVar="") 41 | public String configuration_file; 42 | @Option(name="-i", usage="input path (required)", required=true, metaVar="") 43 | public String input_path; 44 | @Option(name="-goldPos", usage = "use gold pos tags") 45 | public boolean useGoldPos; 46 | 47 | private DEPEvaluate(String[] args) throws Exception { 48 | BinUtils.initArgs(args, this); 49 | List filelist = FileUtils.getFileList(input_path, "*", false); 50 | Collections.sort(filelist); 51 | 52 | DecodeConfig decodeConfig; 53 | try (InputStream config = Files.newInputStream(Paths.get(configuration_file), StandardOpenOption.READ)) { 54 | decodeConfig = new DecodeConfig(config); 55 | } 56 | 57 | NLPDecoder decoder = new NLPDecoder(decodeConfig); 58 | 59 | List sentences; 60 | try (InputStream is = Files.newInputStream(Paths.get(filelist.get(0)), StandardOpenOption.READ)) { 61 | TSVReader reader = new TSVReader(decodeConfig.getReaderFieldMap()) 62 | { 63 | @Override 64 | protected NLPNode create() {return new NLPNode();} 65 | }; 66 | 67 | reader.open(is); 68 | sentences = reader.readDocument(); 69 | } 70 | 71 | int uas = 0; 72 | int las = 0; 73 | int pos = 0; 74 | int total = 0; 75 | 76 | for (NLPNode[] sentence : sentences) { 77 | int[] goldHeads = new int[sentence.length]; 78 | String[] goldLabels = new String[sentence.length]; 79 | String[] goldPos = new String[sentence.length]; 80 | for (int x = 1; x < sentence.length; x++) { 81 | // capture gold and erase it so we recreate it in the decode. 82 | goldHeads[x] = sentence[x].getDependencyHead().getID(); 83 | sentence[x].setDependencyHead(null); 84 | goldLabels[x] = sentence[x].getDependencyLabel(); 85 | sentence[x].setDependencyLabel(null); 86 | // also forget the POS tag 87 | if (!useGoldPos) { 88 | goldPos[x] = sentence[x].getPartOfSpeechTag(); 89 | sentence[x].setPartOfSpeechTag(null); 90 | } 91 | } 92 | decoder.decode(sentence); 93 | for (int x = 1; x < sentence.length; x++) { 94 | total++; 95 | if (!useGoldPos) { 96 | if (goldPos[x].equals(sentence[x].getPartOfSpeechTag())) { 97 | pos++; 98 | } 99 | } 100 | 101 | if (goldHeads[x] == sentence[x].getDependencyHead().getID()) { 102 | uas++; 103 | if (goldLabels[x].equals(sentence[x].getDependencyLabel())) { 104 | las++; 105 | } 106 | } 107 | } 108 | } 109 | 110 | double uscore = ((double)uas)/total; 111 | double lscore = ((double)las)/total; 112 | if (!useGoldPos) { 113 | double posscore = ((double)pos)/total; 114 | System.out.format("UAS %.02f LAS %.02f POS %.02f total tokens %d%n", uscore, lscore, posscore, total); 115 | } else { 116 | System.out.format("UAS %.02f LAS %.02f total tokens %d%n", uscore, lscore, total); 117 | } 118 | } 119 | 120 | public static void main(String args[]) throws Exception { 121 | new DEPEvaluate(args); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | edu.emory.mathcs.nlp 5 | nlp4j 6 | 1.1.3-SNAPSHOT 7 | jar 8 | 9 | http://nlp.mathcs.emory.edu 10 | 11 | 12 | https://github.com/emorynlp/nlp4j 13 | scm:git:git://github.com/emorynlp/nlp4j.git 14 | scm:git:git@github.com:emorynlp/nlp4j.git 15 | HEAD 16 | 17 | 18 | 19 | org.sonatype.oss 20 | oss-parent 21 | 9 22 | 23 | 24 | 25 | 26 | The Apache Software License, Version 2.0 27 | http://www.apache.org/licenses/LICENSE-2.0.txt 28 | 29 | 30 | 31 | 32 | 33 | 
jinho.choi 34 | Jinho D. Choi 35 | {id}@emory.edu 36 | 37 | 38 | 39 | 40 | UTF-8 41 | 1.8 42 | 1.8 43 | 44 | 45 | 46 | 47 | 48 | src/main/resources 49 | 50 | **/* 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | org.eclipse.m2e 59 | lifecycle-mapping 60 | 1.0.0 61 | 62 | 63 | 64 | 65 | 66 | org.apache.maven.plugins 67 | maven-enforcer-plugin 68 | [1.0.0,) 69 | 70 | enforce 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | org.apache.maven.plugins 87 | maven-release-plugin 88 | 89 | true 90 | false 91 | release 92 | deploy 93 | 94 | 95 | 96 | org.apache.maven.plugins 97 | maven-compiler-plugin 98 | 3.5.1 99 | 100 | ${maven.compiler.source} 101 | ${maven.compiler.target} 102 | 103 | 104 | 105 | org.apache.maven.plugins 106 | maven-javadoc-plugin 107 | 2.10.3 108 | 109 | 110 | attach-javadocs 111 | 112 | jar 113 | 114 | 115 | -Xdoclint:none 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | org.slf4j 126 | slf4j-log4j12 127 | 1.7.21 128 | 129 | 130 | edu.emory.mathcs.nlp 131 | nlp4j-common 132 | 1.1.2 133 | 134 | 135 | edu.emory.mathcs.nlp 136 | nlp4j-core 137 | 1.1.2 138 | 139 | 140 | edu.emory.mathcs.nlp 141 | nlp4j-tokenization 142 | 1.1.2 143 | 144 | 145 | edu.emory.mathcs.nlp 146 | nlp4j-morphology 147 | 1.1.2 148 | 149 | 150 | junit 151 | junit 152 | 4.12 153 | 154 | 155 | com.google.guava 156 | guava 157 | 18.0 158 | test 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /src/test/resources/dat/sample-trn.tsv: -------------------------------------------------------------------------------- 1 | 1 The the DT _ 3 det _ B-ORG 2 | 2 Education education NNP _ 3 compound _ I-ORG 3 | 3 Directorate directorate NNP _ 8 nsubj 8:A0;9:A0 I-ORG 4 | 4 for for IN _ 3 prep _ I-ORG 5 | 5 Holy holy NNP _ 6 compound _ I-ORG 6 | 6 Mecca mecca NNP _ 4 pobj _ L-ORG 7 | 7 has have VBZ _ 8 aux _ O 8 | 8 finished finish VBN pb=finish.01 0 root _ O 9 | 9 preparing prepare VBG pb=prepare.01 8 xcomp 8:A1 O 10 | 10 a a DT _ 13 det _ O 11 | 11 new new JJ _ 13 amod _ O 12 | 12 computer computer NN _ 13 compound _ O 13 | 13 program program NN _ 9 dobj 15:A0;9:A1 O 14 | 14 to to TO _ 15 aux _ O 15 | 15 monitor monitor VB pb=monitor.01 13 relcl _ O 16 | 16 disadvantaged disadvantaged JJ _ 19 amod _ O 17 | 17 and and CC _ 16 cc _ O 18 | 18 deprived deprived JJ _ 16 conj _ O 19 | 19 students student NNS _ 15 dobj 15:A1 O 20 | 20 in in IN sem=LOC 19 prep _ O 21 | 21 schools school NNS _ 20 pobj 22:A1 O 22 | 22 supervised supervise VBN pb=supervise.01 21 acl _ O 23 | 23 by by IN _ 22 agent 22:A0 O 24 | 24 the the DT _ 25 det _ O 25 | 25 directorate directorate NN _ 23 pobj _ O 26 | 26 , , , _ 21 punct _ O 27 | 27 of of IN _ 32 prep _ O 28 | 28 which which WDT _ 27 pobj _ O 29 | 29 there there EX _ 30 expl _ O 30 | 30 are be VBP pb=be.02 21 relcl _ O 31 | 31 over over IN syn=PRD 30 prep 30:A1 B-QUANTITY 32 | 32 500 0 CD _ 31 pobj _ L-QUANTITY 33 | 33 . . . 
_ 8 punct _ O 34 | 35 | 1 Bakr bakr NNP _ 3 compound _ B-PERSON 36 | 2 Ibrahim ibrahim NNP _ 3 compound _ I-PERSON 37 | 3 Basfar basfar NNP _ 11 nsubj 11:A0 L-PERSON 38 | 4 , , , _ 3 punct _ O 39 | 5 Director director NNP _ 7 compound _ O 40 | 6 - - HYPH _ 7 punct _ O 41 | 7 General general NNP _ 3 appos _ O 42 | 8 of of IN _ 7 prep _ O 43 | 9 Education education NNP _ 8 pobj _ O 44 | 10 , , , _ 11 punct _ O 45 | 11 said say VBD pb=say.01 0 root _ O 46 | 12 the the DT _ 13 det _ O 47 | 13 program program NN _ 15 nsubjpass 15:A1 O 48 | 14 was be VBD _ 15 auxpass _ O 49 | 15 aimed aim VBN pb=aim.02 11 ccomp 11:A1 O 50 | 16 at at IN syn=CLR 15 prep 15:A2 O 51 | 17 identifying identify VBG pb=identify.01|syn=NOM 16 pcomp _ O 52 | 18 students student NNS _ 17 dobj 17:A1 O 53 | 19 in in IN _ 18 prep _ O 54 | 20 need need NN _ 19 pobj _ O 55 | 21 in in IN sem=LOC 18 prep _ O 56 | 22 schools school NNS _ 21 pobj 23:A1 O 57 | 23 run run VBN pb=run.01 22 acl _ O 58 | 24 by by IN _ 23 agent 23:A0 O 59 | 25 the the DT _ 26 det _ O 60 | 26 directorate directorate NN _ 24 pobj _ O 61 | 27 . . . _ 11 punct _ O 62 | 63 | 1 The the DT _ 2 det _ _ 64 | 2 program program NN _ 4 nsubj 4:A1 _ 65 | 3 will will MD _ 4 aux 4:AM-MOD _ 66 | 4 be be VB pb=be.01 0 root _ _ 67 | 5 an an DT _ 7 det _ _ 68 | 6 important important JJ _ 7 amod _ _ 69 | 7 resource resource NN syn=PRD 4 attr 4:A2 _ 70 | 8 for for IN _ 7 prep _ _ 71 | 9 all all DT _ 10 det _ _ 72 | 10 associations association NNS _ 8 pobj 15:A0;17:A0 _ 73 | 11 and and CC _ 10 cc _ _ 74 | 12 charitable charitable JJ _ 13 amod _ _ 75 | 13 organizations organization NNS _ 10 conj _ _ 76 | 14 who who WP _ 15 nsubj 15:R-A0;17:R-A0 _ 77 | 15 wish wish VBP pb=wish.01 10 relcl _ _ 78 | 16 to to TO _ 17 aux _ _ 79 | 17 identify identify VB pb=identify.01 15 xcomp 15:A1 _ 80 | 18 poor poor JJ _ 19 amod _ _ 81 | 19 students student NNS _ 17 dobj 17:A1 _ 82 | 20 in in IN _ 19 prep _ _ 83 | 21 need need NN _ 20 pobj _ _ 84 | 22 of of IN _ 21 prep _ _ 85 | 23 support support NN _ 22 pobj _ _ 86 | 24 . . . _ 4 punct _ _ 87 | 88 | 1 He he PRP _ 2 nsubj 2:A0 _ 89 | 2 said say VBD pb=say.01 0 root _ _ 90 | 3 that that IN _ 7 mark _ _ 91 | 4 school school NN _ 5 compound _ _ 92 | 5 principals principal NNS _ 7 nsubj 7:A0 _ 93 | 6 would would MD _ 7 aux 7:AM-MOD _ 94 | 7 supervise supervise VB pb=supervise.01 2 ccomp 2:A1 _ 95 | 8 its its PRP$ _ 9 poss _ _ 96 | 9 implementation implementation NN _ 7 dobj 7:A1 _ 97 | 10 to to TO _ 11 aux _ _ 98 | 11 assure assure VB pb=assure.01|sem=PRP 7 advcl 7:AM-PRP _ 99 | 12 the the DT _ 13 det _ _ 100 | 13 accuracy accuracy NN _ 11 dobj 11:A2 _ 101 | 14 and and CC _ 13 cc _ _ 102 | 15 correctness correctness NN _ 13 conj _ _ 103 | 16 of of IN _ 13 prep _ _ 104 | 17 data datum NNS _ 16 pobj _ _ 105 | 18 and and CC _ 11 cc _ _ 106 | 19 to to TO _ 20 aux _ _ 107 | 20 register register VB pb=register.02 11 conj _ _ 108 | 21 disadvantaged disadvantaged JJ _ 22 amod _ _ 109 | 22 students student NNS _ 20 dobj 20:A1 _ 110 | 23 and and CC _ 20 cc _ _ 111 | 24 attach attach VB pb=attach.01 20 conj _ _ 112 | 25 a a DT _ 26 det _ _ 113 | 26 copy copy NN _ 24 dobj 24:A1 _ 114 | 27 of of IN _ 26 prep _ _ 115 | 28 their their PRP$ _ 31 poss _ _ 116 | 29 family family NN _ 31 compound _ _ 117 | 30 identity identity NN _ 31 compound _ _ 118 | 31 card card NN _ 27 pobj _ _ 119 | 32 . . . 
_ 2 punct _ _ 120 | 121 | 1 The the DT _ 4 det _ O 122 | 2 Director director NNP _ 4 compound _ O 123 | 3 - - HYPH _ 4 punct _ O 124 | 4 General general NNP _ 10 nsubj 10:A0;13:A0 O 125 | 5 of of IN _ 4 prep _ O 126 | 6 Education education NNP _ 5 pobj _ O 127 | 7 for for IN _ 6 prep _ O 128 | 8 Holy holy NNP _ 9 compound _ B-GPE 129 | 9 Mecca mecca NNP _ 7 pobj _ L-GPE 130 | 10 went go VBD pb=go.06 0 root _ O 131 | 11 on on RP _ 10 prt 10:A2 O 132 | 12 to to TO _ 13 aux _ O 133 | 13 say say VB pb=say.01 10 xcomp 10:A1 O 134 | 14 that that IN _ 19 mark _ O 135 | 15 the the DT _ 16 det _ O 136 | 16 program program NN _ 19 nsubjpass 19:A1 O 137 | 17 would would MD _ 19 aux 19:AM-MOD O 138 | 18 be be VB _ 19 auxpass _ O 139 | 19 restricted restrict VBN pb=restrict.01 13 ccomp 13:A1 O 140 | 20 to to IN syn=CLR 19 prep 19:A2 O 141 | 21 students student NNS _ 20 pobj 27:A0 O 142 | 22 in in IN _ 21 prep _ O 143 | 23 need need NN _ 22 pobj _ O 144 | 24 of of IN _ 23 prep _ O 145 | 25 zakat zakat FW _ 24 pobj _ O 146 | 26 who who WP _ 27 nsubj 27:R-A0 O 147 | 27 receive receive VBP pb=receive.01 21 relcl _ O 148 | 28 no no DT _ 29 det _ O 149 | 29 assistance assistance NN _ 27 dobj 27:A1 O 150 | 30 from from IN syn=CLR 27 prep 27:A2 O 151 | 31 the the DT _ 32 det _ O 152 | 32 school school NN _ 30 pobj _ O 153 | 33 and and CC _ 21 cc _ O 154 | 34 those those DT _ 21 conj 37:A1 O 155 | 35 whose whose WP$ _ 36 poss _ O 156 | 36 fathers father NNS _ 37 nsubj 37:R-A1 O 157 | 37 are be VBP pb=be.01 34 relcl _ O 158 | 38 disabled disabled JJ syn=PRD 37 acomp 37:A2 O 159 | 39 , , , _ 38 punct _ O 160 | 40 in in IN _ 38 conj _ O 161 | 41 prison prison NN _ 40 pobj _ O 162 | 42 or or CC _ 40 cc _ O 163 | 43 absent absent JJ _ 40 conj _ O 164 | 44 and and CC _ 38 cc _ O 165 | 45 with with IN _ 38 conj _ O 166 | 46 no no DT _ 49 det _ O 167 | 47 other other JJ _ 49 amod _ O 168 | 48 financial financial JJ _ 49 amod _ O 169 | 49 support support NN _ 45 pobj _ O 170 | 50 . . . _ 10 punct _ O 171 | -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/zzz/NEGazetteerCreate.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package edu.emory.mathcs.nlp.zzz; 17 | 18 | import java.io.BufferedReader; 19 | import java.io.InputStream; 20 | import java.io.OutputStream; 21 | import java.io.PrintStream; 22 | import java.util.ArrayList; 23 | import java.util.Collections; 24 | import java.util.HashSet; 25 | import java.util.Iterator; 26 | import java.util.List; 27 | import java.util.Set; 28 | import java.util.stream.Collectors; 29 | 30 | import edu.emory.mathcs.nlp.common.constant.StringConst; 31 | import edu.emory.mathcs.nlp.common.util.CharUtils; 32 | import edu.emory.mathcs.nlp.common.util.IOUtils; 33 | import edu.emory.mathcs.nlp.common.util.Joiner; 34 | import edu.emory.mathcs.nlp.common.util.StringUtils; 35 | import edu.emory.mathcs.nlp.tokenization.EnglishTokenizer; 36 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 37 | 38 | /** 39 | * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) 40 | */ 41 | public class NEGazetteerCreate 42 | { 43 | Tokenizer tokenizer; 44 | 45 | public NEGazetteerCreate() 46 | { 47 | tokenizer = new EnglishTokenizer(); 48 | } 49 | 50 | public Set union(InputStream wiki, InputStream redirect, boolean skipColon, boolean single) throws Exception 51 | { 52 | Set set = new HashSet<>(); 53 | read(wiki, set, false, skipColon, single); 54 | read(redirect, set, true, skipColon, single); 55 | return set; 56 | } 57 | 58 | public void read(InputStream in, Set set, boolean redirect, boolean skipColon, boolean single) throws Exception 59 | { 60 | BufferedReader reader = IOUtils.createBufferedReader(in); 61 | List tokens; 62 | String line; 63 | 64 | while ((line = reader.readLine()) != null) 65 | { 66 | line = line.trim(); 67 | if (skipColon && (line.contains(":") || line.contains(" of "))) continue; 68 | if (redirect) line = splitRedirect(line); 69 | tokens = tokenizer.tokenize(line).stream().map(n -> n.getWordForm()).collect(Collectors.toList()); 70 | concatPeriod(tokens); 71 | trimTokens(tokens, single); 72 | if (!tokens.isEmpty()) set.add(Joiner.join(tokens, " ")); 73 | } 74 | 75 | System.out.println(set.size()); 76 | reader.close(); 77 | } 78 | 79 | private void concatPeriod(List tokens) 80 | { 81 | if (tokens.size() == 2 && tokens.get(1).equals(".") && tokens.get(0).contains(".")) 82 | { 83 | tokens.set(0, tokens.get(0)+"."); 84 | tokens.remove(1); 85 | } 86 | } 87 | 88 | public String splitRedirect(String s) 89 | { 90 | if (s.contains(StringConst.SPACE)) return s; 91 | StringBuilder build = new StringBuilder(); 92 | char[] cs = s.toCharArray(); 93 | int i, len = cs.length; 94 | 95 | for (i=0; i tokens, boolean single) 107 | { 108 | Iterator it = tokens.iterator(); 109 | int i, bIdx = -1; 110 | String s; 111 | 112 | for (i=0; i= 0) 117 | { 118 | tokens.subList(bIdx, i+1).clear(); 119 | break; 120 | } 121 | } 122 | 123 | while (it.hasNext()) 124 | { 125 | s = it.next(); 126 | 127 | if (StringUtils.containsPunctuationOnly(s)) 128 | it.remove(); 129 | else 130 | break; 131 | } 132 | 133 | for (i=tokens.size()-1; i>=0; i--) 134 | { 135 | if (StringUtils.containsPunctuationOnly(tokens.get(i))) 136 | tokens.remove(i); 137 | else 138 | break; 139 | } 140 | 141 | if (tokens.size() == 1 && ((single && !tokens.get(0).contains(".")) || StringUtils.containsDigitOnly(tokens.get(0)))) 142 | tokens.clear(); 143 | 144 | // if (tokens.size() == 1) System.out.println(tokens.get(0)); 145 | } 146 | 147 | public void print(OutputStream out, Set set) 148 | { 149 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 150 | List list = new ArrayList<>(set); 151 | Collections.sort(list); 152 
| 153 | for (String s : list) 154 | fout.println(s); 155 | 156 | fout.close(); 157 | } 158 | 159 | static public void main(String[] args) throws Exception 160 | { 161 | final String DIR = args[0]; 162 | 163 | NEGazetteerCreate dict = new NEGazetteerCreate(); 164 | Set set; 165 | String path; 166 | 167 | path = DIR+"/WikiArtWork"; 168 | System.out.println(path); 169 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, true); 170 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 171 | 172 | path = DIR+"/WikiFilms"; 173 | System.out.println(path); 174 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, true); 175 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 176 | 177 | path = DIR+"/WikiSongs"; 178 | System.out.println(path); 179 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, true); 180 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 181 | 182 | path = DIR+"/WikiManMadeObjectNames"; 183 | System.out.println(path); 184 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, false); 185 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 186 | 187 | path = DIR+"/WikiCompetitionsBattlesEvents"; 188 | System.out.println(path); 189 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), false, false); 190 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 191 | 192 | path = DIR+"/WikiLocations"; 193 | System.out.println(path); 194 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), true, false); 195 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 196 | 197 | path = DIR+"/WikiOrganizations"; 198 | System.out.println(path); 199 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), true, false); 200 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 201 | 202 | path = DIR+"/WikiPeople"; 203 | System.out.println(path); 204 | set= dict.union(IOUtils.createFileInputStream(path+".txt"), IOUtils.createFileInputStream(path+"Redirects.txt"), true, false); 205 | dict.print(IOUtils.createFileOutputStream(path+".union"), set); 206 | 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-dep.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz 14 | 15 | 16 | 17 | 0.00001 18 | adagrad-mini-batch 19 | 0.02 20 | 2 21 | 22 | 5 23 | 20 24 | 0 25 | 26 | 27 | 28 | 88.91 29 | 0.01 30 | 2 31 | 0.04 32 | 0.005 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 
| 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /md/tutorial/tree_based_nlp_component.md: -------------------------------------------------------------------------------- 1 | # Pleonastic It 2 | 3 | In this tutorial, we will create an NLP component that traverses every node in a dependency tree, and classifies it into a specific type of [pleonastic-it](https://github.com/emorynlp/pleonastic-it). Let us begin by cloning the [nlp4j-core](https://github.com/emorynlp/nlp4j-core) repository (if you haven't already). 4 | 5 | ```bash 6 | git clone https://github.com/emorynlp/nlp4j-core.git 7 | ``` 8 | 9 | ## Package 10 | 11 | Create a package [`pleonastic`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/pleonastic) under [`edu.emory.mathcs.nlp.component`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/). 12 | 13 | 14 | ## State 15 | 16 | Create a class [`PleonasticState`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/pleonastic/PleonasticState.java) extending [`NLPState`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/template/state/NLPState.java). Add the following member instances to the class (you will see the use of these instances later). 17 | 18 | ```java 19 | public class PleonasticState extends NLPState 20 | { 21 | static public final String KEY = "it"; 22 | private String[] oracle; 23 | private int input; 24 | } 25 | ``` 26 | 27 | Define a constructor that takes an array of [`NLPNode`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/NLPNode.java). `nodes[0]` is the artificial root, `nodes[1]` represents the first token in a sentence, and so on. `input` indicates the index of the node to be processed; it is initialized to `0`, pointing to the root node. `shift` finds the next node whose lemma is `it`. 28 | 29 | ```java 30 | public PleonasticState(NLPNode[] nodes) 31 | { 32 | super(nodes); 33 | input = 0; 34 | shift(); 35 | } 36 | 37 | private void shift() 38 | { 39 | for (input++; input n.removeFeat(FEAT_KEY)).toArray(String[]::new); 55 | return Arrays.stream(oracle).filter(o -> o != null).findFirst().isPresent(); 56 | } 57 | 58 | @Override 59 | public String getOracle() 60 | { 61 | return oracle[input]; 62 | } 63 | ``` 64 | 65 | Second, override `next`, which takes system or oracle predictions of the current state, applies the top prediction to the current state, and moves onto the next state. Then, override `isTerminal`, which returns `true` if no more state is available; in other words, no more input node is left to be processed. 66 | 67 | 68 | ```java 69 | /** 70 | * @param map retrieves the string label from its index. 71 | * @param yhat index of the top predicated label. 72 | * @param scores scores of all labels. 
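 * The chosen label is written into the node's feat map under the component's feature key,
 * and shift() then advances to the next node whose lemma is "it".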
73 | */ 74 | @Override 75 | public void next(LabelMap map, int yhat, float[] scores) 76 | { 77 | String label = map.getLabel(yhat); 78 | nodes[input].putFeat(FEAT_KEY, label); 79 | shift(); 80 | } 81 | 82 | @Override 83 | public boolean isTerminate() 84 | { 85 | return input >= nodes.length; 86 | } 87 | ``` 88 | 89 | Third, override `getNode`, which takes a [`FeatureItem`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/feature/FeatureItem.java) and returns the node indicated by the feature template. Given the input node `nodes[input]`, the feature template specifies the window size and the dependency relation of the node to extract features from. For instance, if `window` is `-1` and the dependency relation is `lmd`, it returns `nodes[input-1].getLeftMostDependent()` if it exists; otherwise, it returns `null`. 90 | 91 | ```java 92 | @Override 93 | public NLPNode getNode(FeatureItem item) 94 | { 95 | NLPNode node = getNode(input, item.window); 96 | return getRelativeNode(item, node); 97 | } 98 | ``` 99 | 100 | Finally, we override `evaluate`, which takes an [`Eval`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/eval/Eval.java) and evaluates the predictions made for this tree. Here, we use the built-in evaluator [`AccuracyEval`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/eval/AccuracyEval.java), which measures accuracy by counting the correct predictions. 101 | 102 | ```java 103 | @Override 104 | public void evaluate(Eval eval) 105 | { 106 | int correct = 0, total = 0; 107 | 108 | for (int i=1; i<nodes.length; i++) 109 | { 110 | String o = oracle[i]; 111 | 112 | if (o != null) 113 | { 114 | if (o.equals(nodes[i].getFeat(FEAT_KEY))) correct++; 115 | total++; 116 | } 117 | } 118 | 119 | ((AccuracyEval)eval).add(correct, total); 120 | } 121 | ``` 122 | 123 | ## Classifier 124 | 125 | Create a class [`PleonasticClassifier`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/pleonastic/PleonasticClassifier.java) extending `OnlineComponent` with `PleonasticState`. 126 | Define the default constructor as well as a constructor that takes an `InputStream` of the configuration file. 127 | 128 | ```java 129 | public class PleonasticClassifier extends OnlineComponent<PleonasticState> 130 | { 131 | private static final long serialVersionUID = 3585863417135590906L; 132 | 133 | public PleonasticClassifier() {} 134 | 135 | public PleonasticClassifier(InputStream configuration) 136 | { 137 | super(configuration); 138 | } 139 | } 140 | ``` 141 | 142 | Override `initState` using [`PleonasticState`](https://github.com/emorynlp/nlp4j-core/tree/master/src/main/java/edu/emory/mathcs/nlp/component/pleonastic/PleonasticState.java). 143 | 144 | ```java 145 | @Override 146 | protected PleonasticState initState(NLPNode[] nodes) 147 | { 148 | return new PleonasticState(nodes); 149 | } 150 | ``` 151 | 152 | Override `createEvaluator` using [`AccuracyEval`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/eval/AccuracyEval.java). 153 | 154 | ```java 155 | @Override 156 | public Eval createEvaluator() 157 | { 158 | return new AccuracyEval(); 159 | } 160 | ``` 161 | 162 | Override `postProcess` with an empty definition. 163 | 164 | ```java 165 | @Override 166 | protected void postProcess(PleonasticState state) {} 167 | ``` 168 | 169 | ## NLPMode 170 | 171 | Add the mode `pleonastic` to [`NLPMode`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/NLPMode.java). 172 | 173 | ```java 174 | public enum NLPMode 175 | { 176 | pos, // part-of-speech tagging 177 | ner, // named entity recognition 178 | dep, // dependency parsing 179 | srl, // semantic role labeling 180 | sentiment, // sentiment analysis 181 | pleonastic; // pleonastic-it classification 182 | } 183 | ``` 184 | 185 | ## Trainer 186 | 187 | Add the `pleonastic` case to the `createComponent` method in [`OnlineTrainer`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/train/OnlineTrainer.java), as shown below.
188 | 189 | ```java 190 | protected OnlineComponent createComponent(NLPMode mode, InputStream config) 191 | { 192 | switch (mode) 193 | { 194 | case pos: return (OnlineComponent)new POSTagger(config); 195 | case ner: return (OnlineComponent)new NERTagger(config); 196 | case dep: return (OnlineComponent)new DEPParser(config); 197 | case srl: return (OnlineComponent)new SRLParser(config); 198 | case sentiment : return (OnlineComponent)new SentimentAnalyzer(config); 199 | case pleonastic: return (OnlineComponent)new PleonasticClassifier(config); 200 | default : throw new IllegalArgumentException("Unsupported mode: "+mode); 201 | } 202 | } 203 | ``` -------------------------------------------------------------------------------- /md/quickstart/train.md: -------------------------------------------------------------------------------- 1 | # Train 2 | 3 | ## Command 4 | 5 | The following command trains an NLP component: 6 | 7 | ``` 8 | java edu.emory.mathcs.nlp.bin.NLPTrain -mode <mode> -c <configuration file> -t <training path> -d <development path> [-m <model file> -p <previous model file> -te <training file extension> -de <development file extension> -cv <cross-validation folds>] 9 | 10 | -c : configuration file (required) 11 | -m : output model file (optional) 12 | -p : previously trained model file (optional) 13 | -t : training path (required) 14 | -d : development path (optional) 15 | -te : training file extension (default: *) 16 | -de : development file extension (default: *) 17 | -cv : # of cross-validation folds (default: 0) 18 | -mode : component mode (required: pos|ner|dep|srl|sent) 19 | ``` 20 | 21 | * `-c` specifies the configuration file (see [configuration](#configuration)). 22 | * `-m` specifies the output model file (saved in the [xz](http://tukaani.org) format). The model is not saved unless this option is set. 23 | * `-p` specifies a previously trained model file. If this option is set, a new model is trained on top of the previous model. 24 | * `-t|d` specifies the training or development path, pointing to either a file or a directory. When the path points to a file, only that file is used for training. When the path points to a directory, all files under that directory whose extension matches `-te|de` are used. It is possible to train a model without a development set by leaving out the `-d` option (see the example below). 25 | * `-te|de` specifies the training or development file extension. The default value `*` matches files with any extension. This option is used only when the training or development path `-t|d` points to a directory. 26 | * `-cv` specifies the number of cross-validation folds. If this number is greater than `1`, cross-validation is performed on the training data. 27 | * `-mode` specifies the NLP component to be trained (see [`NLPMode`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/NLPMode.java)). 28 | 29 | ## Example 30 | 31 | The following command takes [`sample-trn.tsv`](../../src/test/resources/dat/sample-trn.tsv) and [`sample-dev.tsv`](../../src/test/resources/dat/sample-dev.tsv), trains a dependency parsing model using [`config-train-sample.xml`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/config-train-sample.xml), and saves the best model to `sample-dep.xz`.
32 | 33 | ``` 34 | $ java -Xmx1g -XX:+UseConcMarkSweepGC edu.emory.mathcs.nlp.bin.NLPTrain -mode dep -c config-train-sample.xml -t sample-trn.tsv -d sample-dev.tsv -m sample-dep.xz 35 | 36 | AdaGrad Mini-batch 37 | - Max epoch: 5 38 | - Mini-batch: 1 39 | - Learning rate: 0.02 40 | - LOLS: fixed = 0, decaying rate = 0.95 41 | - RDA: 1.0E-5 42 | Training: 0 43 | 0: 1: LAS = 22.22, UAS = 26.98, L = 34, SF = 1300, NZW = 1867, N/S = 15750 44 | 0: 2: LAS = 34.92, UAS = 39.68, L = 34, SF = 1410, NZW = 4578, N/S = 18000 45 | 0: 3: LAS = 38.89, UAS = 44.44, L = 34, SF = 1454, NZW = 6191, N/S = 21000 46 | 0: 4: LAS = 37.30, UAS = 41.27, L = 34, SF = 1550, NZW = 7751, N/S = 42000 47 | 0: 5: LAS = 37.30, UAS = 41.27, L = 34, SF = 1583, NZW = 8997, N/S = 63000 48 | 0: Best: 38.89, epoch = 3 49 | Saving the model 50 | ``` 51 | 52 | * Use the [`-XX:+UseConcMarkSweepGC`](http://www.oracle.com/technetwork/java/tuning-139912.html) option for the JVM, which roughly halves the memory usage. 53 | * Use [`log4j.properties`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/log4j.properties) for the [log4j](http://logging.apache.org/log4j/) configuration. 54 | * Once the training is done, `sample-dep.xz` should be created, which can then be specified in the configuration file for dependency parsing (see [how to decode](decode.md)). 55 | * `L`: number of labels. 56 | * `SF`: number of sparse features. 57 | * `NZW`: number of non-zero weights. 58 | * `N/S`: number of nodes processed per second. 59 | 60 | ## Configuration 61 | 62 | Sample configuration files for training can be found here: [`config-train-*`](../../src/main/resources/edu/emory/mathcs/nlp/configuration/). 63 | 64 | ```xml 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | en-ambiguity-classes-simplified-lowercase.xz 79 | en-brown-clusters-simplified-lowercase.xz 80 | en-word-embeddings-undigitalized.xz 81 | en-named-entity-gazetteers-simplified.xz 82 | 83 | 84 | 85 | adagrad-mini-batch 86 | 0.00001 87 | 0.02 88 | 2 89 | 90 | 40 91 | 5 92 | 0 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | ``` 104 | 105 | * `<tsv>` specifies the configuration for [`TSVReader`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/util/TSVReader.java). `index` specifies the index of the field, starting at 0. `field` specifies the name of the field (e.g., [`sample-trn.tsv`](../../src/test/resources/dat/sample-trn.tsv)): 106 | * `form`: word form. 107 | * `lemma`: lemma. 108 | * `pos`: part-of-speech tag. 109 | * `feats`: extra features. 110 | * `dhead`: dependency head ID. 111 | * `deprel`: dependency label. 112 | * `sheads`: semantic heads. 113 | * `nament`: named entity tag. 114 | 115 | * `<lexica>` specifies the lexica used globally across multiple components (e.g., [English lexica](../supplements/english-lexica-models.md#lexica)). `field` specifies the type of word forms used to generate these lexica (see [`NLPNode::getValue`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/node/NLPNode.java#L205)). 116 | * `ambiguity_classes`: ambiguity classes for part-of-speech tagging. 117 | * `word_clusters`: word clusters (e.g., Brown clusters). 118 | * `word_embeddings`: word embeddings (e.g., [word2vec](http://word2vec.googlecode.com)). 119 | * `named_entity_gazetteers`: gazetteers for named entity recognition. 120 | 121 | * `<optimizer>` specifies the optimizer used to train the statistical model.
122 | * `algorithm`: perceptron, softmax, adagrad, adagrad-mini-batch, adadelta-mini-batch, adagrad-regression. 123 | * `l1_regularization`: the [RDA](http://www.jmlr.org/papers/volume11/xiao10a/xiao10a.pdf) regularization parameter used for `adagrad-*`. 124 | * `learning_rate`: the learning rate. 125 | * `feature_cutoff`: features whose frequency is less than or equal to this cutoff are discarded from training. 126 | * `lols`: [locally optimal learning to search](http://jmlr.org/proceedings/papers/v37/changb15.pdf).
- `fixed`: use only gold labels for the specified number of epochs.
- `decaying`: decay the use of gold labels by the specified rate after every epoch. 127 | * `max_epochs`: the maximum number of epochs to be used for training. 128 | * `batch_size`: the number of sentences per mini-batch for the `*-mini-batch` algorithms. 129 | * `bias`: the bias value. 130 | 131 | * `<feature_template>` specifies the features used during training. 132 | 133 | ```xml 134 | 135 | ``` 136 | 137 | * `f#`: `#` must start with 0. When multiple features are joined, they must be numbered in consecutive order. 138 | * `source`: see [`Source.java`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/feature/Source.java). 139 | * `window`: the context window with respect to the source. 140 | * `relation`: see [`Relation.java`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/feature/Relation.java). 141 | * `field`: see [`Field.java`](https://github.com/emorynlp/nlp4j-core/blob/master/src/main/java/edu/emory/mathcs/nlp/component/template/feature/Field.java). 142 | * `value`: specifies the extra value of the field. -------------------------------------------------------------------------------- /src/main/java/edu/emory/mathcs/nlp/decode/AbstractNLPDecoder.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Emory University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package edu.emory.mathcs.nlp.decode; 17 | 18 | import java.io.BufferedReader; 19 | import java.io.ByteArrayInputStream; 20 | import java.io.ByteArrayOutputStream; 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | import java.io.OutputStream; 24 | import java.io.PrintStream; 25 | import java.lang.reflect.Array; 26 | import java.util.ArrayList; 27 | import java.util.List; 28 | import java.util.concurrent.ExecutorService; 29 | import java.util.concurrent.Executors; 30 | import java.util.function.Function; 31 | 32 | import edu.emory.mathcs.nlp.common.constant.StringConst; 33 | import edu.emory.mathcs.nlp.common.util.BinUtils; 34 | import edu.emory.mathcs.nlp.common.util.FileUtils; 35 | import edu.emory.mathcs.nlp.common.util.IOUtils; 36 | import edu.emory.mathcs.nlp.common.util.Joiner; 37 | import edu.emory.mathcs.nlp.common.util.Language; 38 | import edu.emory.mathcs.nlp.component.morph.MorphologicalAnalyzer; 39 | import edu.emory.mathcs.nlp.component.template.NLPComponent; 40 | import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexica; 41 | import edu.emory.mathcs.nlp.component.template.node.AbstractNLPNode; 42 | import edu.emory.mathcs.nlp.component.template.reader.TSVReader; 43 | import edu.emory.mathcs.nlp.tokenization.Token; 44 | import edu.emory.mathcs.nlp.tokenization.Tokenizer; 45 | 46 | /** 47 | * @author Jinho D.
Choi ({@code jinho.choi@emory.edu}) 48 | */ 49 | public abstract class AbstractNLPDecoder> 50 | { 51 | static final public String FORMAT_RAW = "raw"; 52 | static final public String FORMAT_LINE = "line"; 53 | static final public String FORMAT_TSV = "tsv"; 54 | 55 | volatile private List> components; 56 | volatile private Tokenizer tokenizer; 57 | private DecodeConfig decode_config; 58 | 59 | // ======================================== CONSTRUCTORS ======================================== 60 | 61 | public AbstractNLPDecoder() {} 62 | 63 | public AbstractNLPDecoder(DecodeConfig config) 64 | { 65 | init(config); 66 | } 67 | 68 | public AbstractNLPDecoder(InputStream configuration) 69 | { 70 | init(new DecodeConfig(configuration)); 71 | } 72 | 73 | public void init(DecodeConfig config) 74 | { 75 | List> components = new ArrayList<>(); 76 | Language language = config.getLanguage(); 77 | decode_config = config; 78 | 79 | components.add(new GlobalLexica<>(decode_config.getDocumentElement())); 80 | 81 | BinUtils.LOG.info("Loading tokenizer\n"); 82 | setTokenizer(NLPUtils.createTokenizer(language)); 83 | 84 | if (decode_config.getPartOfSpeechTagging() != null) 85 | { 86 | BinUtils.LOG.info("Loading part-of-speech tagger\n"); 87 | components.add(NLPUtils.getComponent(IOUtils.getInputStream(decode_config.getPartOfSpeechTagging()))); 88 | 89 | BinUtils.LOG.info("Loading morphological analyzer\n"); 90 | components.add(new MorphologicalAnalyzer<>(language)); 91 | } 92 | 93 | if (decode_config.getNamedEntityRecognition() != null) 94 | { 95 | BinUtils.LOG.info("Loading named entity recognizer\n"); 96 | components.add(NLPUtils.getComponent(IOUtils.getInputStream(decode_config.getNamedEntityRecognition()))); 97 | } 98 | 99 | if (decode_config.getDependencyParsing() != null) 100 | { 101 | BinUtils.LOG.info("Loading dependency parser\n"); 102 | components.add(NLPUtils.getComponent(IOUtils.getInputStream(decode_config.getDependencyParsing()))); 103 | } 104 | 105 | // if (decode_config.getSemanticRoleLabeling() != null) 106 | // { 107 | // BinUtils.LOG.info("Loading semantic role labeler\n"); 108 | // add(compoinent, , ); 109 | // components.add(NLPUtils.getComponent(IOUtils.getInputStream(decode_config.getSemanticRoleLabeling()))); 110 | // } 111 | 112 | setComponents(components); 113 | BinUtils.LOG.info("\n"); 114 | } 115 | 116 | // ======================================== GETTERS/SETTERS ======================================== 117 | 118 | public Tokenizer getTokenizer() 119 | { 120 | return tokenizer; 121 | } 122 | 123 | public List> getComponents() 124 | { 125 | return components; 126 | } 127 | 128 | public void setTokenizer(Tokenizer tokenizer) 129 | { 130 | this.tokenizer = tokenizer; 131 | } 132 | 133 | public void setComponents(List> components) 134 | { 135 | this.components = components; 136 | } 137 | 138 | // ======================================== DECODE ======================================== 139 | 140 | public void decode(List inputFiles, String outputExt, String format, int threads) 141 | { 142 | ExecutorService executor = Executors.newFixedThreadPool(threads); 143 | String outputFile; 144 | 145 | for (String inputFile : inputFiles) 146 | { 147 | outputFile = inputFile + StringConst.PERIOD + outputExt; 148 | executor.submit(new NLPTask(inputFile, outputFile, format)); 149 | } 150 | 151 | executor.shutdown(); 152 | } 153 | 154 | public String decode(String s, String format) 155 | { 156 | return new String(decodeByteArray(s, format)); 157 | } 158 | 159 | public byte[] decodeByteArray(String s, 
String format) 160 | { 161 | InputStream bin = new ByteArrayInputStream(s.getBytes()); 162 | ByteArrayOutputStream bout = new ByteArrayOutputStream(); 163 | 164 | decode(bin, bout, format); 165 | 166 | try 167 | { 168 | bin .close(); 169 | bout.close(); 170 | } 171 | catch (IOException e) {e.printStackTrace();} 172 | 173 | return bout.toByteArray(); 174 | } 175 | 176 | public void decode(InputStream in, OutputStream out, String format) 177 | { 178 | try 179 | { 180 | switch (format) 181 | { 182 | case FORMAT_RAW : decodeRaw (in, out); break; 183 | case FORMAT_LINE: decodeLine(in, out); break; 184 | case FORMAT_TSV : decodeTSV (createTSVReader(), in, out); break; 185 | } 186 | } 187 | catch (Exception e) {e.printStackTrace();} 188 | } 189 | 190 | public List decodeDocument(String s) throws IOException 191 | { 192 | return decodeDocument(new ByteArrayInputStream(s.getBytes())); 193 | } 194 | 195 | public List decodeDocument(InputStream in) throws IOException 196 | { 197 | List document = new ArrayList<>(); 198 | N[] nodes; 199 | 200 | for (List tokens : tokenizer.segmentize(in)) 201 | { 202 | nodes = toNodeArray(tokens); 203 | decode(nodes); 204 | document.add(nodes); 205 | } 206 | 207 | in.close(); 208 | return document; 209 | } 210 | 211 | public void decodeRaw(String s, OutputStream out) throws IOException 212 | { 213 | decodeRaw(new ByteArrayInputStream(s.getBytes()), out); 214 | } 215 | 216 | public void decodeRaw(InputStream in, OutputStream out) throws IOException 217 | { 218 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 219 | N[] nodes; 220 | 221 | for (List tokens : tokenizer.segmentize(in)) 222 | { 223 | nodes = toNodeArray(tokens); 224 | decode(nodes); 225 | fout.println(toString(nodes)+"\n"); 226 | } 227 | 228 | in.close(); 229 | fout.close(); 230 | } 231 | 232 | public void decodeLine(InputStream in, OutputStream out) throws IOException 233 | { 234 | BufferedReader reader = IOUtils.createBufferedReader(in); 235 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 236 | N[] nodes; 237 | String line; 238 | 239 | while ((line = reader.readLine()) != null) 240 | { 241 | nodes = decode(line); 242 | fout.println(toString(nodes)+"\n"); 243 | } 244 | 245 | reader.close(); 246 | fout.close(); 247 | } 248 | 249 | public void decodeTSV(TSVReader reader, InputStream in, OutputStream out) throws IOException 250 | { 251 | PrintStream fout = IOUtils.createBufferedPrintStream(out); 252 | N[] nodes; 253 | 254 | reader.open(in); 255 | 256 | while ((nodes = reader.next()) != null) 257 | { 258 | decode(nodes); 259 | fout.println(toString(nodes)+"\n"); 260 | } 261 | 262 | reader.close(); 263 | fout.close(); 264 | } 265 | 266 | public N[] decode(String sentence) 267 | { 268 | List tokens = tokenizer.tokenize(sentence); 269 | return decode(toNodeArray(tokens)); 270 | } 271 | 272 | public N[] decode(N[] nodes) 273 | { 274 | for (NLPComponent component : components) 275 | component.process(nodes); 276 | 277 | return nodes; 278 | } 279 | 280 | public N[] toNodeArray(List tokens) 281 | { 282 | return toNodeArray(tokens, t -> create(t)); 283 | } 284 | 285 | @SuppressWarnings("unchecked") 286 | public N[] toNodeArray(List tokens, Function f) 287 | { 288 | N node = create(); node.toRoot(); 289 | N[] nodes = (N[])Array.newInstance(node.getClass(), tokens.size() + 1); 290 | nodes[0] = node; // root 291 | 292 | for (int i=0,j=1; i createTSVReader() 313 | { 314 | return new TSVReader(decode_config.getReaderFieldMap()) 315 | { 316 | @Override 317 | protected N create() {return 
AbstractNLPDecoder.this.create();} 318 | }; 319 | } 320 | 321 | public String toString(N[] nodes) 322 | { 323 | return Joiner.join(nodes, "\n", 1); 324 | } 325 | 326 | class NLPTask implements Runnable 327 | { 328 | private String input_file; 329 | private String output_file; 330 | private String format; 331 | 332 | public NLPTask(String inputFile, String outputFile, String format) 333 | { 334 | this.input_file = inputFile; 335 | this.output_file = outputFile; 336 | this.format = format; 337 | } 338 | 339 | @Override 340 | public void run() 341 | { 342 | BinUtils.LOG.info(FileUtils.getBaseName(input_file)+"\n"); 343 | InputStream in = IOUtils.createFileInputStream (input_file); 344 | OutputStream out = IOUtils.createFileOutputStream(output_file); 345 | decode(in, out, format); 346 | } 347 | } 348 | } 349 | --------------------------------------------------------------------------------
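The decoding API defined in `AbstractNLPDecoder` can be exercised with only a few calls. The following is a minimal usage sketch, not taken from this repository: it assumes that `NLPDecoder` (in `edu.emory.mathcs.nlp.decode`) extends `AbstractNLPDecoder` and inherits its `InputStream` constructor and `decode(String, String)` method unchanged, and that a decode configuration such as `config-decode-en.xml` is available in the working directory. The wrapper class `DecodeSketch` is hypothetical and exists only for illustration.

```java
import java.io.InputStream;

import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.decode.NLPDecoder;

public class DecodeSketch
{
    static public void main(String[] args)
    {
        // Load a decode configuration; the filename here is an assumption for illustration.
        InputStream config = IOUtils.createFileInputStream("config-decode-en.xml");

        // The constructor loads the tokenizer and every component enabled in the configuration.
        NLPDecoder decoder = new NLPDecoder(config);

        // FORMAT_RAW treats the input as raw text: it is segmented, tokenized, and run through
        // all configured components, and the annotated nodes are returned one per line.
        String output = decoder.decode("The NLP4J project provides an NLP toolkit for JVM languages.", NLPDecoder.FORMAT_RAW);
        System.out.println(output);
    }
}
```

The same decoder instance can be reused across inputs; `decode(InputStream, OutputStream, String)` and `decodeDocument(String)` shown above cover file-level and in-memory processing, respectively.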