├── .gitignore
├── .travis.yml
├── README.md
├── pom.xml
└── src
    ├── main
        ├── assemblies
        │   └── assembly.xml
        ├── checkstyle
        │   └── checkstyle.xml
        ├── java
        │   └── com
        │   │   └── vader
        │   │       └── sentiment
        │   │           ├── analyzer
        │   │               ├── RawSentimentScores.java
        │   │               ├── SentimentAnalyzer.java
        │   │               ├── SentimentPolarities.java
        │   │               └── package-info.java
        │   │           ├── processor
        │   │               ├── InputAnalyzer.java
        │   │               ├── InputAnalyzerInterface.java
        │   │               ├── TextProperties.java
        │   │               └── package-info.java
        │   │           └── util
        │   │               ├── Constants.java
        │   │               ├── SentimentModifyingTokens.java
        │   │               ├── Utils.java
        │   │               ├── Valence.java
        │   │               └── package-info.java
        └── resources
        │   ├── log4j.properties
        │   └── vader_sentiment_lexicon.txt
    └── test
        ├── java
            └── com
            │   └── vader
            │       └── sentiment
            │           └── analyzer
            │               └── SentimentAnalyzerTest.java
        └── resources
            ├── GroundTruth
                ├── amazonReviewSnippets_GroundTruth.txt
                ├── movieReviewSnippets_GroundTruth.txt
                ├── nytEditorialSnippets_GroundTruth.txt
                └── tweets_GroundTruth.txt
            ├── amazonReviewSnippets_GroundTruth_vader.tsv
            ├── getNltkVader.py
            ├── log4j.properties
            ├── movieReviewSnippets_GroundTruth_vader.tsv
            ├── nytEditorialSnippets_GroundTruth_vader.tsv
            └── tweets_GroundTruth_vader.tsv


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.iml
3 | target
4 | build
5 | out
6 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: false
 2 | language: java
 3 | install: mvn install -Dgpg.skip=true
 4 | jdk:
 5 |   - oraclejdk8
 6 | 
 7 | cache:
 8 |   directories:
 9 |     - $HOME/.m2
10 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## VADER-Sentiment-Analysis in Java
  2 | 
  3 | [![Build Status](https://travis-ci.org/apanimesh061/VaderSentimentJava.svg?branch=master)](https://travis-ci.org/apanimesh061/VaderSentimentJava)
  4 | 
  5 | VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is _specifically attuned to sentiments expressed in social media_. It is fully open-sourced under the [MIT License](http://choosealicense.com/) (we sincerely appreciate all attributions and readily accept most contributions, but please don't hold us liable).
  6 | 
  7 | This is a JAVA port of the NLTK VADER sentiment analysis originally written in Python.
  8 | 
  9 |  - The [Original](https://github.com/cjhutto/vaderSentiment) python module by the paper's author C.J. Hutto
 10 |  - The [NLTK](http://www.nltk.org/_modules/nltk/sentiment/vader.html) source
 11 | 
 12 | For the testing I have compared the results of the NLTK module with this Java port.
 13 | 
 14 | ### Update (Oct 2021)
 15 | - - -
 16 | Releasing `v1.1.1`.
 17 | 
 18 | Thanks to @ArjohnKampman for helping to optimize some parts of the code. Since I was touching this repo after a long time, I noticed that a lot of the Maven dependencies and plugins were outdated, so I have updated them. `mvn package` still works so it should be fine.
 19 | 
 20 | I also noticed a lot of comments on not being able to use the library from Maven. I did upload a Jar to Nexus a long time back and I was having trouble doing that again since I think I've lost the pass-phrases needed to sign and upload the Jar to the Nexus. Luckily, I found a new solution [here](https://stackoverflow.com/a/28483461) which suggests using https://jitpack.io/ for public GitHub repositories. Turns out it is super simple to use it and get the package from GitHub. I wanted to make sure I unblock anyone who wants to use this package.
 21 | 
 22 | I created a test Maven project `test-mvn-pkg1` locally and added the following to its `pom.xml`:
 23 | 
 24 | ```
 25 | <?xml version="1.0" encoding="UTF-8"?>
 26 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 27 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 28 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 29 |     <modelVersion>4.0.0</modelVersion>
 30 | 
 31 |     <groupId>org.example</groupId>
 32 |     <artifactId>test-mvn-pkg1</artifactId>
 33 |     <version>1.0-SNAPSHOT</version>
 34 | 
 35 |     <repositories>
 36 |         <repository>
 37 |             <id>jitpack.io</id>
 38 |             <url>https://jitpack.io</url>
 39 |         </repository>
 40 |     </repositories>
 41 | 
 42 |     <dependencies>
 43 |         <dependency>
 44 |             <groupId>com.github.apanimesh061</groupId>
 45 |             <artifactId>VaderSentimentJava</artifactId>
 46 |             <version>v1.1.1</version>
 47 |         </dependency>
 48 |     </dependencies>
 49 | 
 50 | </project>
 51 | ```
 52 | Once Maven downloads the dependencies, you can easily use it in your code like:
 53 | 
 54 | ```
 55 | package org.example;
 56 | 
 57 | import com.vader.sentiment.analyzer.SentimentAnalyzer;
 58 | import com.vader.sentiment.analyzer.SentimentPolarities;
 59 | 
 60 | public class Test {
 61 |     public static void main(String[] args) {
 62 |         final SentimentPolarities sentimentPolarities =
 63 |             SentimentAnalyzer.getScoresFor("that's a rare and valuable feature.");
 64 |         System.out.println(sentimentPolarities);
 65 | 	// SentimentPolarities{positivePolarity=0.437, negativePolarity=0.0, neutralPolarity=0.563, compoundPolarity=0.4767}
 66 |     }
 67 | }
 68 | ```
 69 | 
 70 | I'll try the Nexus upload and figure out if I can create a new Maven repo altogether. Meanwhile, `jitpack` should work for anyone wanting to use the package.
 71 | 
 72 | 
 73 | ### Update (Jan 2018)
 74 | 
 75 | - - -
 76 | Based on a recommendation from @alexpetlenko, I uploaded the jar to Nexus as `vader-sentiment-analyzer-1.0`.
 77 | 
 78 | You can download the jar by adding the following to your `pom.xml`:
 79 | ```xml
 80 | <dependency>
 81 |   <groupId>com.github.apanimesh061</groupId>
 82 |   <artifactId>vader-sentiment-analyzer</artifactId>
 83 |   <version>1.0</version>
 84 | </dependency>
 85 | ```
 86 | 
 87 | Path to Jar: [vader-sentiment-analyzer-1.0.jar](https://oss.sonatype.org/service/local/repositories/releases/content/com/github/apanimesh061/vader-sentiment-analyzer/1.0/vader-sentiment-analyzer-1.0.jar)
 88 | 
 89 | ### Update (May 2017)
 90 | 
 91 | - - -
 92 | Major design refactorings resulting from addition of `checkstyle` to the project.
 93 | 
 94 | Also added JavaDocs to the project.
 95 | 
 96 | ### Update (Jan 2017)
 97 | 
 98 | - - -
 99 | 
100 | I have corrected a few bugs that I encountered when I was adding more tests.
101 | 
102 | The details are [here](https://github.com/apanimesh061/VaderSentimentJava/commit/d1d30c4ceeb356ec838f8abac70514bd21a92b4b).
103 | 
104 | This project now includes tests on text from:
105 | 
106 | 1. Amazon Reviews
107 | 2. Movie Reviews
108 | 3. NyTimes Editorial snippets
109 | 
110 | ### Introduction
111 | - - -
112 | 
113 | This README file describes the dataset of the paper:
114 | 
115 |   **VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text** <br />
116 |   (by C.J. Hutto and Eric Gilbert) <br />
117 |   Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. <br />
118 | 
119 | For questions, please contact: <br />
120 | 
121 | C.J. Hutto <br />
122 | Georgia Institute of Technology, Atlanta, GA 30032  <br />
123 | cjhutto [at] gatech [dot] edu <br />
124 | 
125 | ### Citation Information
126 | - - -
127 | 
128 | If you use either the dataset or any of the VADER sentiment analysis tools (VADER sentiment lexicon or Python code for rule-based sentiment analysis engine) in your research, please cite the above paper. For example:  <br />
129 | 
130 |   > <small> **Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.** </small><br />
131 | 
132 | ### Resources and Dataset Descriptions
133 | - - -
134 | 
135 | The compressed .tar.gz package includes **PRIMARY RESOURCES** (items 1-3) as well as additional **DATASETS AND TESTING RESOURCES** (items 4-13):
136 | 
137 | 1. [VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text](http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf) <br />
138 |     The original paper for the data set, see citation information (above).
139 | 
140 | 2. vader_sentiment_lexicon.txt <br />
141 |        Empirically validated by multiple independent human judges, VADER incorporates a "gold-standard" sentiment lexicon that is especially attuned to microblog-like contexts.  <br />
142 |     The VADER sentiment lexicon is sensitive to both the **polarity** and the **intensity** of sentiments
143 | 	expressed in social media contexts, and is also generally applicable to sentiment analysis
144 | 	in other domains. <br />
145 | 	   Manually creating (much less, validating) a comprehensive sentiment lexicon is
146 | 	a labor intensive and sometimes error prone process, so it is no wonder that many
147 | 	opinion mining researchers and practitioners rely so heavily on existing lexicons
148 | 	as primary resources. We are pleased to offer ours as a new resource. <br />
149 | 	   We begin by constructing a list inspired by examining existing well-established
150 | 	sentiment word-banks (LIWC, ANEW, and GI). To this, we next incorporate numerous
151 | 	lexical features common to sentiment expression in microblogs, including
152 | 	 - a full list of Western-style emoticons (for example, :-) denotes a smiley face
153 | 	   and generally indicates positive sentiment)
154 | 	 - sentiment-related acronyms and initialisms (e.g., LOL and WTF are both examples of
155 | 	   sentiment-laden initialisms)
156 | 	 - commonly used slang with sentiment value (e.g., nah, meh and giggly).
157 | 
158 | 	This process provided us with over 9,000 lexical feature candidates. Next, we assessed
159 | 	the general applicability of each feature candidate to sentiment expressions. We
160 | 	used a wisdom-of-the-crowd (WotC) approach (Surowiecki, 2004) to acquire a valid
161 | 	point estimate for the sentiment valence (intensity) of each context-free candidate
162 | 	feature. We collected intensity ratings on each of our candidate lexical features
163 | 	from ten independent human raters (for a total of 90,000+ ratings). Features were
164 | 	rated on a scale from "[–4] Extremely Negative" to "[4] Extremely Positive", with
165 | 	allowance for "[0] Neutral (or Neither, N/A)".  <br />
166 | 	   We kept every lexical feature that had a non-zero mean rating, and whose standard
167 | 	deviation was less than 2.5 as determined by the aggregate of ten independent raters.
168 | 	This left us with just over 7,500 lexical features with validated valence scores that
169 | 	indicated both the sentiment polarity (positive/negative), and the sentiment intensity
170 | 	on a scale from –4 to +4. For example, the word "okay" has a positive valence of 0.9,
171 | 	"good" is 1.9, and "great" is 3.1, whereas "horrible" is –2.5, the frowning emoticon :(
172 | 	is –2.2, and "sucks" and its slang derivative "sux" are both –1.5.
173 | 
174 | 3. vaderSentiment.py <br />
175 |     The Python code for the rule-based sentiment analysis engine. Implements the
176 | 	grammatical and syntactical rules described in the paper, incorporating empirically
177 | 	derived quantifications for the impact of each rule on the perceived intensity of
178 | 	sentiment in sentence-level text. Importantly, these heuristics go beyond what would
179 | 	normally be captured in a typical bag-of-words model. They incorporate **word-order
180 | 	sensitive relationships** between terms. For example, degree modifiers (also called
181 | 	intensifiers, booster words, or degree adverbs) impact sentiment intensity by either
182 | 	increasing or decreasing the intensity. Consider these examples: <br />
183 | 	   (a) "The service here is extremely good"  <br />
184 | 	   (b) "The service here is good" <br />
185 | 	   (c) "The service here is marginally good" <br />
186 | 	From Table 3 in the paper, we see that for 95% of the data, using a degree modifier
187 |     increases the positive sentiment intensity of example (a) by 0.227 to 0.36, with a
188 | 	mean difference of 0.293 on a rating scale from 1 to 4. Likewise, example (c) reduces
189 | 	the perceived sentiment intensity by 0.293, on average.
190 | 
191 | 4. tweets_GroundTruth.txt <br />
192 |     **NOTE**: This java module uses this file for testing. <br />
193 | 	FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, and TWEET-TEXT <br />
194 |     DESCRIPTION: includes "tweet-like" text as inspired by 4,000 tweets pulled from Twitter’s public timeline, plus 200 completely contrived tweet-like texts intended to specifically test syntactical and grammatical conventions of conveying differences in sentiment intensity. The "tweet-like" texts incorporate a fictitious username (@anonymous) in places where a username might typically appear, along with a fake URL ( http://url_removed ) in places where a URL might typically appear, as inspired by the original tweets. The ID and MEAN-SENTIMENT-RATING correspond to the raw sentiment rating data provided in 'tweets_anonDataRatings.txt' (described below).
195 | 
196 | 5. tweets_anonDataRatings.txt <br />
197 |     FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, STANDARD DEVIATION, and RAW-SENTIMENT-RATINGS <br />
198 | 	DESCRIPTION: Sentiment ratings from a minimum of 20 independent human raters (all pre-screened, trained, and quality checked for optimal inter-rater reliability).
199 | 
200 | 6. nytEditorialSnippets_GroundTruth.txt <br />
201 | 	FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, and TEXT-SNIPPET <br />
202 |     DESCRIPTION: includes 5,190 sentence-level snippets from 500 New York Times opinion news editorials/articles; we used the NLTK tokenizer to segment the articles into sentence phrases, and added sentiment intensity ratings. The ID and MEAN-SENTIMENT-RATING correspond to the raw sentiment rating data provided in 'nytEditorialSnippets_anonDataRatings.txt' (described below).
203 | 
204 | 7. nytEditorialSnippets_anonDataRatings.txt <br />
205 | 	FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, STANDARD DEVIATION, and RAW-SENTIMENT-RATINGS <br />
206 |     DESCRIPTION: Sentiment ratings from a minimum of 20 independent human raters (all pre-screened, trained, and quality checked for optimal inter-rater reliability).
207 | 
208 | 8. movieReviewSnippets_GroundTruth.txt <br />
209 | 	FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, and TEXT-SNIPPET <br />
210 |     DESCRIPTION: includes 10,605 sentence-level snippets from rotten.tomatoes.com. The snippets were derived from an original set of 2000 movie reviews (1000 positive and 1000 negative) in Pang & Lee (2004); we used the NLTK tokenizer to segment the reviews into sentence phrases, and added sentiment intensity ratings. The ID and MEAN-SENTIMENT-RATING correspond to the raw sentiment rating data provided in 'movieReviewSnippets_anonDataRatings.txt' (described below).
211 | 
212 | 9. movieReviewSnippets_anonDataRatings.txt <br />
213 | 	FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, STANDARD DEVIATION, and RAW-SENTIMENT-RATINGS <br />
214 |     DESCRIPTION: Sentiment ratings from a minimum of 20 independent human raters (all pre-screened, trained, and quality checked for optimal inter-rater reliability).
215 | 
216 | 10. amazonReviewSnippets_GroundTruth.txt <br />
217 | 	 FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, and TEXT-SNIPPET <br />
218 |      DESCRIPTION: includes 3,708 sentence-level snippets from 309 customer reviews on 5 different products. The reviews were originally used in Hu & Liu (2004); we added sentiment intensity ratings. The ID and MEAN-SENTIMENT-RATING correspond to the raw sentiment rating data provided in 'amazonReviewSnippets_anonDataRatings.txt' (described below).
219 | 
220 | 11. amazonReviewSnippets_anonDataRatings.txt <br />
221 | 	 FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, STANDARD DEVIATION, and RAW-SENTIMENT-RATINGS <br />
222 |      DESCRIPTION: Sentiment ratings from a minimum of 20 independent human raters (all pre-screened, trained, and quality checked for optimal inter-rater reliability).
223 | 
224 | 12. Comp.Social website with more papers/research: [Comp.Social](http://comp.social.gatech.edu/papers/)
225 | 	 
226 | 13. vader_sentiment_comparison_online_weblink <br />
227 |      A short-cut hyperlinked to the online (web-based) sentiment comparison using a "light" version of VADER. http://www.socialai.gatech.edu/apps/sentiment.html .
228 | 
229 | 
230 | ## Java Code EXAMPLE:
231 | 
232 | ```
233 | public static void main(String[] args) throws IOException {
234 |     ArrayList<String> sentences = new ArrayList<String>() {{
235 |         add("VADER is smart, handsome, and funny.");
236 |         add("VADER is smart, handsome, and funny!");
237 |         add("VADER is very smart, handsome, and funny.");
238 |         add("VADER is VERY SMART, handsome, and FUNNY.");
239 |         add("VADER is VERY SMART, handsome, and FUNNY!!!");
240 |         add("VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!");
241 |         add("The book was good.");
242 |         add("The book was kind of good.");
243 |         add("The plot was good, but the characters are uncompelling and the dialog is not great.");
244 |         add("A really bad, horrible book.");
245 |         add("At least it isn't a horrible book.");
246 |         add(":) and :D");
247 |         add("");
248 |         add("Today sux");
249 |         add("Today sux!");
250 |         add("Today SUX!");
251 |         add("Today kinda sux! But I'll get by, lol");
252 |     }};
253 | 
254 |     for (String sentence : sentences) {
255 |         System.out.println(sentence);
256 |         final SentimentPolarities sentimentPolarities =
257 | 			SentimentAnalyzer.getScoresFor(sentence);
258 |         System.out.println(sentimentPolarities);
259 |     }
260 | }
261 | ```
262 | 
263 | ### Online (web-based) Sentiment Comparison using VADER
264 | 
265 | http://www.socialai.gatech.edu/apps/sentiment.html .
266 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <modelVersion>4.0.0</modelVersion>
  6 | 
  7 |     <groupId>com.github.apanimesh061</groupId>
  8 |     <artifactId>vader-sentiment-analyzer</artifactId>
  9 |     <version>1.1.1</version>
 10 | 
 11 |     <packaging>jar</packaging>
 12 |     <description>
 13 |         Java port of Python NLTK Vader Sentiment Analyzer. VADER (Valence Aware Dictionary and sEntiment Reasoner)
 14 |         is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in
 15 |         social media, and works well on texts from other domains.
 16 |     </description>
 17 |     <url>https://github.com/apanimesh061/VaderSentimentJava/blob/master/README.md</url>
 18 |     <inceptionYear>2016</inceptionYear>
 19 | 
 20 |     <licenses>
 21 |         <license>
 22 |             <name>MIT License</name>
 23 |             <url>http://www.opensource.org/licenses/mit-license.php</url>
 24 |             <distribution>repo</distribution>
 25 |         </license>
 26 |     </licenses>
 27 | 
 28 |     <developers>
 29 |         <developer>
 30 |             <id>apanimesh061</id>
 31 |             <name>Animesh Pandey</name>
 32 |             <email>apanimesh061@gmail.com</email>
 33 |             <roles>
 34 |                 <role>Developer</role>
 35 |             </roles>
 36 |         </developer>
 37 |     </developers>
 38 | 
 39 |     <scm>
 40 |         <url>https://github.com/apanimesh061/VaderSentimentJava</url>
 41 |         <connection>scm:git:git://github.com/apanimesh061/VaderSentimentJava.git</connection>
 42 |         <developerConnection>scm:git:git@github.com:apanimesh061/VaderSentimentJava.git</developerConnection>
 43 |     </scm>
 44 | 
 45 |     <parent>
 46 |         <groupId>org.sonatype.oss</groupId>
 47 |         <artifactId>oss-parent</artifactId>
 48 |         <version>7</version>
 49 |     </parent>
 50 | 
 51 |     <properties>
 52 |         <maven.compiler.target>1.8</maven.compiler.target>
 53 |         <maven.compiler.source>1.8</maven.compiler.source>
 54 |         <elasticsearch.assembly.descriptor>
 55 |             ${project.basedir}/src/main/assemblies/assembly.xml
 56 |         </elasticsearch.assembly.descriptor>
 57 |         <checkstyle.config.location>
 58 |             ${project.basedir}/src/main/checkstyle/checkstyle.xml
 59 |         </checkstyle.config.location>
 60 |         <lucene.analyzers.common.version>
 61 |             6.6.0
 62 |         </lucene.analyzers.common.version>
 63 |     </properties>
 64 | 
 65 |     <distributionManagement>
 66 |         <snapshotRepository>
 67 |             <id>ossrh</id>
 68 |             <url>https://oss.sonatype.org/content/repositories/snapshots</url>
 69 |         </snapshotRepository>
 70 |         <repository>
 71 |             <id>ossrh</id>
 72 |             <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
 73 |         </repository>
 74 |     </distributionManagement>
 75 | 
 76 |     <build>
 77 |         <plugins>
 78 |             <plugin>
 79 |                 <groupId>org.sonatype.plugins</groupId>
 80 |                 <artifactId>nexus-staging-maven-plugin</artifactId>
 81 |                 <version>1.6.7</version>
 82 |                 <extensions>true</extensions>
 83 |                 <configuration>
 84 |                     <serverId>ossrh</serverId>
 85 |                     <nexusUrl>https://oss.sonatype.org/</nexusUrl>
 86 |                     <autoReleaseAfterClose>true</autoReleaseAfterClose>
 87 |                 </configuration>
 88 |             </plugin>
 89 | 
 90 |             <plugin>
 91 |                 <groupId>org.apache.maven.plugins</groupId>
 92 |                 <artifactId>maven-compiler-plugin</artifactId>
 93 |                 <version>3.8.1</version>
 94 |                 <configuration> <!-- language levels come from the maven.compiler.* properties declared in <properties> -->
 95 |                     <source>${maven.compiler.source}</source>
 96 |                     <target>${maven.compiler.target}</target>
 97 |                 </configuration>
 98 |             </plugin>
 99 | 
100 |             <plugin>
101 |                 <groupId>org.apache.maven.plugins</groupId>
102 |                 <artifactId>maven-surefire-plugin</artifactId>
103 |                 <version>3.0.0-M5</version>
104 |                 <configuration>
105 |                     <includes>
106 |                         <include>**/*Test*.java</include>
107 |                     </includes>
108 |                 </configuration>
109 |             </plugin>
110 | 
111 |             <plugin>
112 |                 <groupId>org.apache.maven.plugins</groupId>
113 |                 <artifactId>maven-source-plugin</artifactId>
114 |                 <version>3.2.1</version>
115 |                 <executions>
116 |                     <execution>
117 |                         <id>attach-sources</id>
118 |                         <goals>
119 |                             <goal>jar-no-fork</goal>
120 |                         </goals>
121 |                     </execution>
122 |                 </executions>
123 |             </plugin>
124 | 
125 |             <plugin>
126 |                 <groupId>org.apache.maven.plugins</groupId>
127 |                 <artifactId>maven-javadoc-plugin</artifactId>
128 |                 <version>3.3.1</version>
129 |                 <executions>
130 |                     <execution>
131 |                         <id>attach-javadocs</id>
132 |                         <goals>
133 |                             <goal>jar</goal>
134 |                         </goals>
135 |                     </execution>
136 |                 </executions>
137 |             </plugin>
138 | 
139 |             <plugin>
140 |                 <groupId>org.apache.maven.plugins</groupId>
141 |                 <artifactId>maven-dependency-plugin</artifactId>
142 |                 <version>3.2.0</version>
143 |                 <executions>
144 |                     <execution>
145 |                         <id>copy-dependencies</id>
146 |                         <phase>process-resources</phase>
147 |                         <goals>
148 |                             <goal>copy-dependencies</goal>
149 |                         </goals>
150 |                         <configuration>
151 |                             <excludeScope>provided</excludeScope>
152 |                             <outputDirectory>${project.build.directory}/lib/</outputDirectory>
153 |                         </configuration>
154 |                     </execution>
155 |                 </executions>
156 |             </plugin>
157 | 
158 |             <plugin>
159 |                 <groupId>org.apache.maven.plugins</groupId>
160 |                 <artifactId>maven-assembly-plugin</artifactId>
161 |                 <version>3.3.0</version>
162 |                 <configuration>
163 |                     <appendAssemblyId>false</appendAssemblyId>
164 |                     <outputDirectory>${project.build.directory}/releases/</outputDirectory>
165 |                     <descriptors>
166 |                         <descriptor>${elasticsearch.assembly.descriptor}</descriptor>
167 |                     </descriptors>
168 |                     <archive>
169 |                         <manifest>
170 |                             <mainClass>fully.qualified.MainClass</mainClass>
171 |                         </manifest>
172 |                     </archive>
173 |                 </configuration>
174 |                 <executions>
175 |                     <execution>
176 |                         <phase>package</phase>
177 |                         <goals>
178 |                             <goal>single</goal>
179 |                         </goals>
180 |                     </execution>
181 |                 </executions>
182 |             </plugin>
183 | 
184 |             <plugin>
185 |                 <groupId>org.apache.maven.plugins</groupId>
186 |                 <artifactId>maven-checkstyle-plugin</artifactId>
187 |                 <version>3.1.2</version>
188 |                 <dependencies>
189 |                     <!-- https://mvnrepository.com/artifact/com.puppycrawl.tools/checkstyle -->
190 |                     <dependency>
191 |                         <groupId>com.puppycrawl.tools</groupId>
192 |                         <artifactId>checkstyle</artifactId>
193 |                         <version>8.40</version>
194 |                     </dependency>
195 |                 </dependencies>
196 |                 <executions>
197 |                     <execution>
198 |                         <id>validate</id>
199 |                         <phase>validate</phase>
200 |                         <configuration>
201 |                             <configLocation>${checkstyle.config.location}</configLocation>
202 |                             <encoding>UTF-8</encoding>
203 |                             <consoleOutput>true</consoleOutput>
204 |                             <failsOnError>true</failsOnError>
205 |                         </configuration>
206 |                         <goals>
207 |                             <goal>check</goal>
208 |                         </goals>
209 |                     </execution>
210 |                 </executions>
211 |             </plugin>
212 | 
213 |             <plugin>
214 |                 <groupId>org.apache.maven.plugins</groupId>
215 |                 <artifactId>maven-gpg-plugin</artifactId>
216 |                 <version>3.0.1</version>
217 |                 <executions>
218 |                     <execution>
219 |                         <id>sign-artifacts</id>
220 |                         <phase>verify</phase>
221 |                         <goals>
222 |                             <goal>sign</goal>
223 |                         </goals>
224 |                     </execution>
225 |                 </executions>
226 |             </plugin>
227 | 
228 |         </plugins>
229 |     </build>
230 | 
231 |     <dependencies>
232 |         <dependency>
233 |             <groupId>org.apache.lucene</groupId>
234 |             <artifactId>lucene-analyzers-common</artifactId>
235 |             <version>${lucene.analyzers.common.version}</version>
236 |         </dependency>
237 |         <dependency>
238 |             <groupId>com.google.guava</groupId>
239 |             <artifactId>guava</artifactId>
240 |             <version>21.0</version>
241 |         </dependency>
242 |         <dependency>
243 |             <groupId>org.apache.commons</groupId>
244 |             <artifactId>commons-lang3</artifactId>
245 |             <version>3.6</version>
246 |         </dependency>
247 |         <dependency>
248 |             <groupId>org.slf4j</groupId>
249 |             <artifactId>slf4j-api</artifactId>
250 |             <version>1.7.25</version>
251 |         </dependency>
252 |         <dependency>
253 |             <groupId>org.slf4j</groupId>
254 |             <artifactId>slf4j-log4j12</artifactId>
255 |             <version>1.7.25</version> <!-- keep the binding on the same version as slf4j-api -->
256 |             <scope>provided</scope>
257 |         </dependency>
258 |         <dependency>
259 |             <groupId>junit</groupId>
260 |             <artifactId>junit</artifactId>
261 |             <version>4.12</version>
262 |             <scope>test</scope>
263 |         </dependency>
264 |     </dependencies>
265 | 
266 | </project>
267 | 


--------------------------------------------------------------------------------
/src/main/assemblies/assembly.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
 3 |           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |           xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
 5 |     <id>jar-with-dependencies</id>
 6 | 
 7 |     <formats>
 8 |         <format>jar</format>
 9 |     </formats>
10 | 
11 |     <includeBaseDirectory>false</includeBaseDirectory>
12 | 
13 |     <dependencySets>
14 | 
15 |         <dependencySet>
16 |             <outputDirectory>/</outputDirectory>
17 |             <useProjectArtifact>true</useProjectArtifact>
18 |             <useStrictFiltering>true</useStrictFiltering>
19 |             <unpack>true</unpack>
20 |             <scope>runtime</scope>
21 |             <unpackOptions>
22 |                 <excludes>
23 |                     <exclude>org.apache.lucene:lucene-analyzers-common</exclude>
24 |                     <exclude>commons-lang:commons-lang</exclude>
25 |                     <exclude>log4j:log4j</exclude>
26 |                     <exclude>junit:junit</exclude>
27 |                     <exclude>**/log4j.properties</exclude>
28 |                 </excludes>
29 |             </unpackOptions>
30 |         </dependencySet>
31 | 
32 |     </dependencySets>
33 | 
34 | </assembly>
35 | 


--------------------------------------------------------------------------------
/src/main/checkstyle/checkstyle.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0"?>
  2 | <!DOCTYPE module PUBLIC
  3 |         "-//Checkstyle//DTD Checkstyle Configuration 1.3//EN"
  4 |         "https://checkstyle.org/dtds/configuration_1_3.dtd">
  5 | 
  6 | <!--
  7 |     Checkstyle configuration that checks the Google coding conventions from Google Java Style
  8 |     that can be found at https://google.github.io/styleguide/javaguide.html
  9 | 
 10 |     Checkstyle is very configurable. Be sure to read the documentation at
 11 |     http://checkstyle.org (or in your downloaded distribution).
 12 | 
 13 |     To completely disable a check, just comment it out or delete it from the file.
 14 |     To suppress certain violations please review suppression filters.
 15 | 
 16 |     Authors: Max Vetrenko, Ruslan Diachenko, Roman Ivanov.
 17 |  -->
 18 | 
 19 | <module name="Checker">
 20 |     <property name="charset" value="UTF-8"/>
 21 | 
 22 |     <property name="severity" value="warning"/>
 23 | 
 24 |     <property name="fileExtensions" value="java, properties, xml"/>
 25 |     <!-- Excludes all 'module-info.java' files              -->
 26 |     <!-- See https://checkstyle.org/config_filefilters.html -->
 27 |     <module name="BeforeExecutionExclusionFileFilter">
 28 |         <property name="fileNamePattern" value="module\-info\.java$"/>
 29 |     </module>
 30 |     <!-- https://checkstyle.org/config_filters.html#SuppressionFilter -->
 31 |     <module name="SuppressionFilter">
 32 |         <property name="file" value="${org.checkstyle.google.suppressionfilter.config}"
 33 |                   default="checkstyle-suppressions.xml"/>
 34 |         <property name="optional" value="true"/>
 35 |     </module>
 36 | 
 37 |     <!-- Checks for whitespace                               -->
 38 |     <!-- See http://checkstyle.org/config_whitespace.html -->
 39 |     <module name="FileTabCharacter">
 40 |         <property name="eachLine" value="true"/>
 41 |     </module>
 42 | 
 43 |     <module name="LineLength">
 44 |         <property name="max" value="120"/>
 45 |         <property name="ignorePattern" value="^ *\* *[^ ]+$"/>
 46 |     </module>
 47 | 
 48 |     <module name="TreeWalker">
 49 |         <module name="OuterTypeFilename"/>
 50 |         <module name="IllegalTokenText">
 51 |             <property name="tokens" value="STRING_LITERAL, CHAR_LITERAL"/>
 52 |             <property name="format"
 53 |                       value="\\u00(09|0(a|A)|0(c|C)|0(d|D)|22|27|5(C|c))|\\(0(10|11|12|14|15|42|47)|134)"/>
 54 |             <property name="message"
 55 |                       value="Consider using special escape sequence instead of octal value or Unicode escaped value."/>
 56 |         </module>
 57 |         <module name="AvoidEscapedUnicodeCharacters">
 58 |             <property name="allowEscapesForControlCharacters" value="true"/>
 59 |             <property name="allowByTailComment" value="true"/>
 60 |             <property name="allowNonPrintableEscapes" value="true"/>
 61 |         </module>
 62 |         <module name="AvoidStarImport"/>
 63 |         <module name="OneTopLevelClass"/>
 64 |         <module name="NoLineWrap">
 65 |             <property name="tokens" value="PACKAGE_DEF, IMPORT, STATIC_IMPORT"/>
 66 |         </module>
 67 |         <module name="EmptyBlock">
 68 |             <property name="option" value="TEXT"/>
 69 |             <property name="tokens"
 70 |                       value="LITERAL_TRY, LITERAL_FINALLY, LITERAL_IF, LITERAL_ELSE, LITERAL_SWITCH"/>
 71 |         </module>
 72 |         <module name="NeedBraces">
 73 |             <property name="tokens"
 74 |                       value="LITERAL_DO, LITERAL_ELSE, LITERAL_FOR, LITERAL_IF, LITERAL_WHILE"/>
 75 |         </module>
 76 |         <module name="LeftCurly">
 77 |             <property name="tokens"
 78 |                       value="ANNOTATION_DEF, CLASS_DEF, CTOR_DEF, ENUM_CONSTANT_DEF, ENUM_DEF,
 79 |                     INTERFACE_DEF, LAMBDA, LITERAL_CASE, LITERAL_CATCH, LITERAL_DEFAULT,
 80 |                     LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF,
 81 |                     LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE, METHOD_DEF,
 82 |                     OBJBLOCK, STATIC_INIT, RECORD_DEF, COMPACT_CTOR_DEF"/>
 83 |         </module>
 84 |         <module name="RightCurly">
 85 |             <property name="id" value="RightCurlySame"/>
 86 |             <property name="tokens"
 87 |                       value="LITERAL_TRY, LITERAL_CATCH, LITERAL_FINALLY, LITERAL_IF, LITERAL_ELSE,
 88 |                     LITERAL_DO"/>
 89 |         </module>
 90 |         <module name="RightCurly">
 91 |             <property name="id" value="RightCurlyAlone"/>
 92 |             <property name="option" value="alone"/>
 93 |             <property name="tokens"
 94 |                       value="CLASS_DEF, METHOD_DEF, CTOR_DEF, LITERAL_FOR, LITERAL_WHILE, STATIC_INIT,
 95 |                     INSTANCE_INIT, ANNOTATION_DEF, ENUM_DEF, INTERFACE_DEF, RECORD_DEF,
 96 |                     COMPACT_CTOR_DEF"/>
 97 |         </module>
 98 |         <module name="SuppressionXpathSingleFilter">
 99 |             <!-- Suppression is required until https://github.com/checkstyle/checkstyle/issues/7541 is resolved. -->
100 |             <property name="id" value="RightCurlyAlone"/>
101 |             <property name="query" value="//RCURLY[parent::SLIST[count(./*)=1]
102 |                                      or preceding-sibling::*[last()][self::LCURLY]]"/>
103 |         </module>
104 |         <module name="WhitespaceAfter">
105 |             <property name="tokens"
106 |                       value="COMMA, SEMI, TYPECAST, LITERAL_IF, LITERAL_ELSE,
107 |                     LITERAL_WHILE, LITERAL_DO, LITERAL_FOR, DO_WHILE"/>
108 |         </module>
109 |         <module name="WhitespaceAround">
110 |             <property name="allowEmptyConstructors" value="true"/>
111 |             <property name="allowEmptyLambdas" value="true"/>
112 |             <property name="allowEmptyMethods" value="true"/>
113 |             <property name="allowEmptyTypes" value="true"/>
114 |             <property name="allowEmptyLoops" value="true"/>
115 |             <property name="ignoreEnhancedForColon" value="false"/>
116 |             <property name="tokens"
117 |                       value="ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN, BXOR,
118 |                     BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, DO_WHILE, EQUAL, GE, GT, LAMBDA, LAND,
119 |                     LCURLY, LE, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY,
120 |                     LITERAL_FOR, LITERAL_IF, LITERAL_RETURN, LITERAL_SWITCH, LITERAL_SYNCHRONIZED,
121 |                     LITERAL_TRY, LITERAL_WHILE, LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN,
122 |                     NOT_EQUAL, PLUS, PLUS_ASSIGN, QUESTION, RCURLY, SL, SLIST, SL_ASSIGN, SR,
123 |                     SR_ASSIGN, STAR, STAR_ASSIGN, LITERAL_ASSERT, TYPE_EXTENSION_AND"/>
124 |             <message key="ws.notFollowed"
125 |                      value="WhitespaceAround: ''{0}'' is not followed by whitespace. Empty blocks may only be represented as '{}' when not part of a multi-block statement (4.1.3)"/>
126 |             <message key="ws.notPreceded"
127 |                      value="WhitespaceAround: ''{0}'' is not preceded with whitespace."/>
128 |         </module>
129 |         <module name="OneStatementPerLine"/>
130 |         <module name="MultipleVariableDeclarations"/>
131 |         <module name="ArrayTypeStyle"/>
132 |         <module name="MissingSwitchDefault"/>
133 |         <module name="FallThrough"/>
134 |         <module name="UpperEll"/>
135 |         <module name="ModifierOrder"/>
136 |         <module name="EmptyLineSeparator">
137 |             <property name="tokens"
138 |                       value="PACKAGE_DEF, IMPORT, STATIC_IMPORT, CLASS_DEF, INTERFACE_DEF, ENUM_DEF,
139 |                     STATIC_INIT, INSTANCE_INIT, METHOD_DEF, CTOR_DEF, VARIABLE_DEF, RECORD_DEF,
140 |                     COMPACT_CTOR_DEF"/>
141 |             <property name="allowNoEmptyLineBetweenFields" value="true"/>
142 |         </module>
143 |         <module name="SeparatorWrap">
144 |             <property name="id" value="SeparatorWrapDot"/>
145 |             <property name="tokens" value="DOT"/>
146 |             <property name="option" value="nl"/>
147 |         </module>
148 |         <module name="SeparatorWrap">
149 |             <property name="id" value="SeparatorWrapComma"/>
150 |             <property name="tokens" value="COMMA"/>
151 |             <property name="option" value="EOL"/>
152 |         </module>
153 |         <module name="SeparatorWrap">
154 |             <!-- ELLIPSIS is EOL until https://github.com/google/styleguide/issues/258 -->
155 |             <property name="id" value="SeparatorWrapEllipsis"/>
156 |             <property name="tokens" value="ELLIPSIS"/>
157 |             <property name="option" value="EOL"/>
158 |         </module>
159 |         <module name="SeparatorWrap">
160 |             <!-- ARRAY_DECLARATOR is EOL until https://github.com/google/styleguide/issues/259 -->
161 |             <property name="id" value="SeparatorWrapArrayDeclarator"/>
162 |             <property name="tokens" value="ARRAY_DECLARATOR"/>
163 |             <property name="option" value="EOL"/>
164 |         </module>
165 |         <module name="SeparatorWrap">
166 |             <property name="id" value="SeparatorWrapMethodRef"/>
167 |             <property name="tokens" value="METHOD_REF"/>
168 |             <property name="option" value="nl"/>
169 |         </module>
170 |         <module name="PackageName">
171 |             <property name="format" value="^[a-z]+(\.[a-z][a-z0-9]*)*$"/>
172 |             <message key="name.invalidPattern"
173 |                      value="Package name ''{0}'' must match pattern ''{1}''."/>
174 |         </module>
175 |         <module name="TypeName">
176 |             <property name="tokens" value="CLASS_DEF, INTERFACE_DEF, ENUM_DEF,
177 |                     ANNOTATION_DEF, RECORD_DEF"/>
178 |             <message key="name.invalidPattern"
179 |                      value="Type name ''{0}'' must match pattern ''{1}''."/>
180 |         </module>
181 |         <module name="MemberName">
182 |             <property name="format" value="^[a-z][a-z0-9][a-zA-Z0-9]*$"/>
183 |             <message key="name.invalidPattern"
184 |                      value="Member name ''{0}'' must match pattern ''{1}''."/>
185 |         </module>
186 |         <module name="ParameterName">
187 |             <property name="format" value="^[a-z]([a-z0-9][a-zA-Z0-9]*)?$"/>
188 |             <message key="name.invalidPattern"
189 |                      value="Parameter name ''{0}'' must match pattern ''{1}''."/>
190 |         </module>
191 |         <module name="LambdaParameterName">
192 |             <property name="format" value="^[a-z]([a-z0-9][a-zA-Z0-9]*)?$"/>
193 |             <message key="name.invalidPattern"
194 |                      value="Lambda parameter name ''{0}'' must match pattern ''{1}''."/>
195 |         </module>
196 |         <module name="CatchParameterName">
197 |             <property name="format" value="^[a-z]([a-z0-9][a-zA-Z0-9]*)?$"/>
198 |             <message key="name.invalidPattern"
199 |                      value="Catch parameter name ''{0}'' must match pattern ''{1}''."/>
200 |         </module>
201 |         <module name="LocalVariableName">
202 |             <property name="format" value="^[a-z]([a-z0-9][a-zA-Z0-9]*)?$"/>
203 |             <message key="name.invalidPattern"
204 |                      value="Local variable name ''{0}'' must match pattern ''{1}''."/>
205 |         </module>
206 |         <module name="PatternVariableName">
207 |             <property name="format" value="^[a-z]([a-z0-9][a-zA-Z0-9]*)?$"/>
208 |             <message key="name.invalidPattern"
209 |                      value="Pattern variable name ''{0}'' must match pattern ''{1}''."/>
210 |         </module>
211 |         <module name="ClassTypeParameterName">
212 |             <property name="format" value="(^[A-Z][0-9]?)$|([A-Z][a-zA-Z0-9]*[T]$)"/>
213 |             <message key="name.invalidPattern"
214 |                      value="Class type name ''{0}'' must match pattern ''{1}''."/>
215 |         </module>
216 |         <module name="RecordComponentName">
217 |             <property name="format" value="^[a-z]([a-z0-9][a-zA-Z0-9]*)?$"/>
218 |             <message key="name.invalidPattern"
219 |                      value="Record component name ''{0}'' must match pattern ''{1}''."/>
220 |         </module>
221 |         <module name="RecordTypeParameterName">
222 |             <property name="format" value="(^[A-Z][0-9]?)$|([A-Z][a-zA-Z0-9]*[T]$)"/>
223 |             <message key="name.invalidPattern"
224 |                      value="Record type name ''{0}'' must match pattern ''{1}''."/>
225 |         </module>
226 |         <module name="MethodTypeParameterName">
227 |             <property name="format" value="(^[A-Z][0-9]?)$|([A-Z][a-zA-Z0-9]*[T]$)"/>
228 |             <message key="name.invalidPattern"
229 |                      value="Method type name ''{0}'' must match pattern ''{1}''."/>
230 |         </module>
231 |         <module name="InterfaceTypeParameterName">
232 |             <property name="format" value="(^[A-Z][0-9]?)$|([A-Z][a-zA-Z0-9]*[T]$)"/>
233 |             <message key="name.invalidPattern"
234 |                      value="Interface type name ''{0}'' must match pattern ''{1}''."/>
235 |         </module>
236 |         <module name="NoFinalizer"/>
237 |         <module name="GenericWhitespace">
238 |             <message key="ws.followed"
239 |                      value="GenericWhitespace ''{0}'' is followed by whitespace."/>
240 |             <message key="ws.preceded"
241 |                      value="GenericWhitespace ''{0}'' is preceded with whitespace."/>
242 |             <message key="ws.illegalFollow"
243 |                      value="GenericWhitespace ''{0}'' should followed by whitespace."/>
244 |             <message key="ws.notPreceded"
245 |                      value="GenericWhitespace ''{0}'' is not preceded with whitespace."/>
246 |         </module>
247 |         <module name="Indentation">
248 |             <property name="basicOffset" value="4"/>
249 |             <property name="braceAdjustment" value="0"/>
250 |             <property name="caseIndent" value="4"/>
251 |             <property name="throwsIndent" value="8"/>
252 |         </module>
253 |         <module name="AbbreviationAsWordInName">
254 |             <property name="ignoreFinal" value="false"/>
255 |             <property name="allowedAbbreviationLength" value="0"/>
256 |             <property name="tokens"
257 |                       value="CLASS_DEF, INTERFACE_DEF, ENUM_DEF, ANNOTATION_DEF, ANNOTATION_FIELD_DEF,
258 |                     PARAMETER_DEF, VARIABLE_DEF, METHOD_DEF, PATTERN_VARIABLE_DEF, RECORD_DEF,
259 |                     RECORD_COMPONENT_DEF"/>
260 |         </module>
261 |         <module name="OverloadMethodsDeclarationOrder"/>
262 |         <module name="VariableDeclarationUsageDistance"/>
263 |         <module name="CustomImportOrder">
264 |             <property name="specialImportsRegExp" value="(^org\.|^com\.)"/>
265 |             <property name="sortImportsInGroupAlphabetically" value="false"/>
266 |             <property name="separateLineBetweenGroups" value="true"/>
267 |         </module>
268 |         <module name="MethodParamPad">
269 |             <property name="tokens"
270 |                       value="CTOR_DEF, LITERAL_NEW, METHOD_CALL, METHOD_DEF,
271 |                     SUPER_CTOR_CALL, ENUM_CONSTANT_DEF, RECORD_DEF"/>
272 |         </module>
273 |         <module name="NoWhitespaceBefore">
274 |             <property name="tokens"
275 |                       value="COMMA, SEMI, POST_INC, POST_DEC, DOT,
276 |                     LABELED_STAT, METHOD_REF"/>
277 |             <property name="allowLineBreaks" value="true"/>
278 |         </module>
279 |         <module name="ParenPad">
280 |             <property name="tokens"
281 |                       value="ANNOTATION, ANNOTATION_FIELD_DEF, CTOR_CALL, CTOR_DEF, DOT, ENUM_CONSTANT_DEF,
282 |                     EXPR, LITERAL_CATCH, LITERAL_DO, LITERAL_FOR, LITERAL_IF, LITERAL_NEW,
283 |                     LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_WHILE, METHOD_CALL,
284 |                     METHOD_DEF, QUESTION, RESOURCE_SPECIFICATION, SUPER_CTOR_CALL, LAMBDA,
285 |                     RECORD_DEF"/>
286 |         </module>
287 |         <module name="OperatorWrap">
288 |             <property name="option" value="NL"/>
289 |             <property name="tokens"
290 |                       value="BAND, BOR, BSR, BXOR, DIV, EQUAL, GE, GT, LAND, LE, LITERAL_INSTANCEOF, LOR,
291 |                     LT, MINUS, MOD, NOT_EQUAL, PLUS, QUESTION, SL, SR, STAR, METHOD_REF "/>
292 |         </module>
293 |         <module name="AnnotationLocation">
294 |             <property name="id" value="AnnotationLocationMostCases"/>
295 |             <property name="tokens"
296 |                       value="CLASS_DEF, INTERFACE_DEF, ENUM_DEF, METHOD_DEF, CTOR_DEF,
297 |                       RECORD_DEF, COMPACT_CTOR_DEF"/>
298 |         </module>
299 |         <module name="AnnotationLocation">
300 |             <property name="id" value="AnnotationLocationVariables"/>
301 |             <property name="tokens" value="VARIABLE_DEF"/>
302 |             <property name="allowSamelineMultipleAnnotations" value="true"/>
303 |         </module>
304 |         <module name="NonEmptyAtclauseDescription"/>
305 |         <module name="InvalidJavadocPosition"/>
306 |         <module name="JavadocTagContinuationIndentation"/>
307 |         <module name="SummaryJavadoc"/>
308 |         <module name="JavadocParagraph"/>
309 |         <module name="RequireEmptyLineBeforeBlockTagGroup"/>
310 |         <module name="AtclauseOrder">
311 |             <property name="tagOrder" value="@param, @return, @throws, @deprecated"/>
312 |             <property name="target"
313 |                       value="CLASS_DEF, INTERFACE_DEF, ENUM_DEF, METHOD_DEF, CTOR_DEF, VARIABLE_DEF"/>
314 |         </module>
315 |         <module name="JavadocMethod">
316 |             <property name="scope" value="public"/>
317 |             <property name="allowMissingParamTags" value="true"/>
318 |             <property name="allowMissingReturnTag" value="true"/>
319 |             <property name="allowedAnnotations" value="Override, Test"/>
320 |             <property name="tokens" value="METHOD_DEF, CTOR_DEF, ANNOTATION_FIELD_DEF, COMPACT_CTOR_DEF"/>
321 |         </module>
322 |         <module name="MissingJavadocMethod">
323 |             <property name="scope" value="public"/>
324 |             <property name="minLineCount" value="2"/>
325 |             <property name="allowedAnnotations" value="Override, Test"/>
326 |             <property name="tokens" value="METHOD_DEF, CTOR_DEF, ANNOTATION_FIELD_DEF,
327 |                                    COMPACT_CTOR_DEF"/>
328 |         </module>
329 |         <module name="MissingJavadocType">
330 |             <property name="scope" value="protected"/>
331 |             <property name="tokens"
332 |                       value="CLASS_DEF, INTERFACE_DEF, ENUM_DEF,
333 |                       RECORD_DEF, ANNOTATION_DEF"/>
334 |             <property name="excludeScope" value="nothing"/>
335 |         </module>
336 |         <module name="MethodName">
337 |             <property name="format" value="^[a-z][a-z0-9][a-zA-Z0-9_]*$"/>
338 |             <message key="name.invalidPattern"
339 |                      value="Method name ''{0}'' must match pattern ''{1}''."/>
340 |         </module>
341 |         <module name="SingleLineJavadoc">
342 |             <property name="ignoreInlineTags" value="false"/>
343 |         </module>
344 |         <module name="EmptyCatchBlock">
345 |             <property name="exceptionVariableName" value="expected"/>
346 |         </module>
347 |         <module name="CommentsIndentation">
348 |             <property name="tokens" value="SINGLE_LINE_COMMENT, BLOCK_COMMENT_BEGIN"/>
349 |         </module>
350 |         <!-- https://checkstyle.org/config_filters.html#SuppressionXpathFilter -->
351 |         <module name="SuppressionXpathFilter">
352 |             <property name="file" value="${org.checkstyle.google.suppressionxpathfilter.config}"
353 |                       default="checkstyle-xpath-suppressions.xml"/>
354 |             <property name="optional" value="true"/>
355 |         </module>
356 |     </module>
357 | </module>
358 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/analyzer/RawSentimentScores.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * MIT License
 3 |  *
 4 |  * Copyright (c) 2021 Animesh Pandey
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 7 |  * of this software and associated documentation files (the "Software"), to deal
 8 |  * in the Software without restriction, including without limitation the rights
 9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 |  * copies of the Software, and to permit persons to whom the Software is
11 |  * furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in all
14 |  * copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 |  * SOFTWARE.
23 |  */
24 | 
25 | package com.vader.sentiment.analyzer;
26 | 
/**
 * Immutable holder for the three raw, non-normalized sentiment scores (positive, negative and neutral).
 *
 * @author Animesh Pandey
 */
public final class RawSentimentScores {
    /**
     * Raw (non-normalized) positive sentiment score.
     */
    private final float positiveScore;

    /**
     * Raw (non-normalized) negative sentiment score.
     */
    private final float negativeScore;

    /**
     * Raw (non-normalized) neutral sentiment score.
     */
    private final float neutralScore;

    /**
     * Builds a score holder; the three values are stored as given and never change afterwards.
     *
     * @param positiveScore raw positive score
     * @param negativeScore raw negative score
     * @param neutralScore  raw neutral score
     */
    public RawSentimentScores(float positiveScore, float negativeScore, float neutralScore) {
        this.positiveScore = positiveScore;
        this.negativeScore = negativeScore;
        this.neutralScore = neutralScore;
    }

    /**
     * Returns the raw positive score passed to the constructor.
     *
     * @return raw positive score
     */
    public float getPositiveScore() {
        return this.positiveScore;
    }

    /**
     * Returns the raw negative score passed to the constructor.
     *
     * @return raw negative score
     */
    public float getNegativeScore() {
        return this.negativeScore;
    }

    /**
     * Returns the raw neutral score passed to the constructor.
     *
     * @return raw neutral score
     */
    public float getNeutralScore() {
        return this.neutralScore;
    }
}
73 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/analyzer/SentimentAnalyzer.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2021 Animesh Pandey
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |  * of this software and associated documentation files (the "Software"), to deal
  8 |  * in the Software without restriction, including without limitation the rights
  9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |  * copies of the Software, and to permit persons to whom the Software is
 11 |  * furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in all
 14 |  * copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |  * SOFTWARE.
 23 |  */
 24 | 
 25 | package com.vader.sentiment.analyzer;
 26 | 
 27 | import java.io.IOException;
 28 | import java.util.ArrayList;
 29 | import java.util.Collections;
 30 | import java.util.List;
 31 | import org.apache.commons.lang3.StringUtils;
 32 | import org.slf4j.Logger;
 33 | import org.slf4j.LoggerFactory;
 34 | import com.google.common.base.Preconditions;
 35 | import com.vader.sentiment.processor.TextProperties;
 36 | import com.vader.sentiment.util.Constants;
 37 | import com.vader.sentiment.util.SentimentModifyingTokens;
 38 | import com.vader.sentiment.util.Utils;
 39 | import com.vader.sentiment.util.Valence;
 40 | 
 41 | /**
 42 |  * The SentimentAnalyzer class is the main class for VADER Sentiment analysis.
 43 |  *
 44 |  * @author Animesh Pandey
 45 |  * @see <a href="http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf">VADER: A Parsimonious Rule-based Model
 46 |  * for Sentiment Analysis of Social Media Text</a>
 47 |  */
 48 | //CHECKSTYLE.OFF: ExecutableStatementCount
 49 | //CHECKSTYLE.OFF: JavaNCSS
 50 | //CHECKSTYLE.OFF: CyclomaticComplexity
 51 | //CHECKSTYLE.OFF: NPathComplexity
 52 | public final class SentimentAnalyzer {
 53 |     /**
 54 |      * Logger for current class.
 55 |      */
 56 |     private static final Logger LOGGER = LoggerFactory.getLogger(SentimentAnalyzer.class);
 57 | 
 58 |     /**
 59 |      * All functions is this class are static. So, this class should have a private constructor.
 60 |      */
 61 |     private SentimentAnalyzer() {
 62 |     }
 63 | 
 64 |     /**
 65 |      * This method returns the polarity scores for a given input string.
 66 |      *
 67 |      * @param inputString the string to be analyzed.
 68 |      * @return an object of {@link SentimentPolarities} which will hold all the sentiment scores.
 69 |      */
 70 |     public static SentimentPolarities getScoresFor(String inputString) {
 71 |         return computeSentimentPolaritiesFor(inputString);
 72 |     }
 73 | 
 74 |     /**
 75 |      * Adjust valence if a token is in {@link Utils#BOOSTER_DICTIONARY} or is a yelling word (all caps).
 76 |      *
 77 |      * @param precedingToken  token
 78 |      * @param currentValence  valence to be adjusted
 79 |      * @param inputHasYelling true if the input string has any yelling words.
 80 |      * @return adjusted valence
 81 |      */
 82 |     private static float adjustValenceIfCapital(final String precedingToken, final float currentValence,
 83 |                                                 final boolean inputHasYelling) {
 84 |         float scalar = 0.0F;
 85 |         final String precedingTokenLower = precedingToken.toLowerCase();
 86 |         if (Utils.BOOSTER_DICTIONARY.containsKey(precedingTokenLower)) {
 87 |             scalar = Utils.BOOSTER_DICTIONARY.get(precedingTokenLower);
 88 |             if (currentValence < 0.0F) {
 89 |                 scalar = -scalar;
 90 |             }
 91 |             if (Utils.isUpper(precedingToken) && inputHasYelling) {
 92 |                 if (currentValence > 0.0F) {
 93 |                     scalar += Valence.ALL_CAPS_FACTOR.getValue();
 94 |                 } else {
 95 |                     scalar -= Valence.ALL_CAPS_FACTOR.getValue();
 96 |                 }
 97 |             }
 98 |         }
 99 |         return scalar;
100 |     }
101 | 
102 |     /**
103 |      * This method checks for phrases having
104 |      * - "never so current_word"
105 |      * - "never this current_word"
106 |      * - "never so this" etc.
107 |      *
108 |      * @param distance            gram window size
109 |      * @param currentItemPosition position of the current token
110 |      * @param wordsAndEmoticons   tokenized version of the input text
111 |      * @return true if any of the above phrases are found.
112 |      */
113 |     private static boolean areNeverPhrasesPresent(final int distance, final int currentItemPosition,
114 |                                                   final List<String> wordsAndEmoticons) {
115 |         if (distance == 1) {
116 |             final String wordAtDistanceTwoLeft =
117 |                 wordsAndEmoticons.get(currentItemPosition - Constants.PRECEDING_BIGRAM_WINDOW);
118 |             final String wordAtDistanceOneLeft =
119 |                 wordsAndEmoticons.get(currentItemPosition - Constants.PRECEDING_UNIGRAM_WINDOW);
120 |             return (wordAtDistanceTwoLeft.equals(SentimentModifyingTokens.NEVER.getValue()))
121 |                 && (wordAtDistanceOneLeft.equals(SentimentModifyingTokens.SO.getValue())
122 |                 || (wordAtDistanceOneLeft.equals(SentimentModifyingTokens.NEVER.getValue())));
123 |         } else if (distance == 2) {
124 |             final String wordAtDistanceThreeLeft = wordsAndEmoticons.get(currentItemPosition
125 |                 - Constants.PRECEDING_TRIGRAM_WINDOW);
126 |             final String wordAtDistanceTwoLeft =
127 |                 wordsAndEmoticons.get(currentItemPosition - Constants.PRECEDING_BIGRAM_WINDOW);
128 |             final String wordAtDistanceOneLeft =
129 |                 wordsAndEmoticons.get(currentItemPosition - Constants.PRECEDING_UNIGRAM_WINDOW);
130 |             return (wordAtDistanceThreeLeft.equals(SentimentModifyingTokens.NEVER.getValue()))
131 |                 && (wordAtDistanceTwoLeft.equals(SentimentModifyingTokens.SO.getValue())
132 |                 || wordAtDistanceTwoLeft.equals(SentimentModifyingTokens.THIS.getValue()))
133 |                 || (wordAtDistanceOneLeft.equals(SentimentModifyingTokens.SO.getValue())
134 |                 || wordAtDistanceOneLeft.equals(SentimentModifyingTokens.THIS.getValue()));
135 |         }
136 |         return false;
137 |     }
138 | 
139 |     /**
140 |      * Adjust the valence is there tokens contain any token which is a negative token or the bigrams and trigrams
141 |      * around a token are phrases that have "never" in them.
142 |      *
143 |      * @param currentValence      valence before
144 |      * @param distance            gram window size
145 |      * @param currentItemPosition position of the current token
146 |      * @param closeTokenIndex     token at the distance position from current item
147 |      * @param wordsAndEmoticons   tokenized version of the input text
148 |      * @return adjusted valence.
149 |      */
150 |     private static float dampValenceIfNegativeTokensFound(final float currentValence, final int distance,
151 |                                                           final int currentItemPosition, final int closeTokenIndex,
152 |                                                           final List<String> wordsAndEmoticons) {
153 |         float newValence = currentValence;
154 |         final boolean anyNeverPhrase = areNeverPhrasesPresent(distance, currentItemPosition, wordsAndEmoticons);
155 | 
156 |         if (!anyNeverPhrase) {
157 |             if (isNegative(wordsAndEmoticons.get(closeTokenIndex))) {
158 |                 newValence *= Valence.NEGATIVE_WORD_DAMPING_FACTOR.getValue();
159 |             }
160 |         } else {
161 |             final float neverPhraseAdjustment = (distance == 1)
162 |                 ? Valence.PRECEDING_BIGRAM_HAVING_NEVER_DAMPING_FACTOR.getValue()
163 |                 : Valence.PRECEDING_TRIGRAM_HAVING_NEVER_DAMPING_FACTOR.getValue();
164 |             newValence *= neverPhraseAdjustment;
165 |         }
166 | 
167 |         return newValence;
168 |     }
169 | 
170 |     /**
171 |      * This method builds the possible to n-grams starting from last token in the token list.
172 |      * VADER uses bi-grams and tri-grams only, so here minGramLength will be 2 and maxGramLength
173 |      * will be 3.
174 |      *
175 |      * @param tokenList                    The list of tokens for which we want to compute the n-grams.
176 |      * @param minGramLength                The minimum size of the possible n-grams.
177 |      * @param maxGramLength                The maximum size of the possible n-grams.
178 |      * @param startPosition                The position of the token from which we'll extract the tokens.
179 |      * @param maxDistanceFromStartPosition The max distance from the end of the current gram and the startPosition.
180 |      * @return list of all possible to minGramLength-grams and maxGramLength-grams starting from startPosition.
181 |      */
182 |     private static List<String> getLeftGrams(final List<String> tokenList, final int minGramLength,
183 |                                              final int maxGramLength, final int startPosition,
184 |                                              final int maxDistanceFromStartPosition) {
185 |         Preconditions.checkArgument(minGramLength > 0 && maxGramLength > 0,
186 |             "Left Gram lengths should not be negative or zero.");
187 |         Preconditions.checkArgument(maxGramLength >= minGramLength,
188 |             "Maximum left gram length should be at least equal to the minimum value.");
189 |         Preconditions.checkArgument(tokenList != null);
190 | 
191 |         final int noOfTokens = tokenList.size();
192 |         if (noOfTokens < minGramLength) {
193 |             return Collections.emptyList();
194 |         }
195 | 
196 |         final List<String> result = new ArrayList<>();
197 |         for (int end = startPosition; end > 0; end--) {
198 |             final int windowStart = end - minGramLength + 1;
199 |             final int windowEnd = end - maxGramLength;
200 |             String currentSuffix = tokenList.get(end);
201 |             for (int start = windowStart; start >= ((windowEnd < 0) ? 0 : Math.max(0, windowEnd) + 1); start--) {
202 |                 currentSuffix = tokenList.get(start) + Constants.SPACE_SEPARATOR + currentSuffix;
203 |                 result.add(currentSuffix);
204 |                 if ((startPosition - end) == maxDistanceFromStartPosition) {
205 |                     return result;
206 |                 }
207 |             }
208 |         }
209 |         return result;
210 |     }
211 | 
212 |     /**
213 |      * This method builds the first possible n-grams starting from startPosition in the token list.
214 |      * VADER uses bi-grams and tri-grams only, so here minGramLength will be 2 and maxGramLength
215 |      * will be 3.
216 |      *
217 |      * @param tokenList     The list of tokens for which we want to compute the n-grams.
218 |      * @param minGramLength The minimum size of the possible n-grams.
219 |      * @param maxGramLength The maximum size of the possible n-grams.
220 |      * @param startPosition The position of the token from which we'll extract the tokens.
221 |      * @return list of the first to minGramLength-grams and maxGramLength-grams starting from
222 |      */
223 |     private static List<String> getFirstRightGrams(final List<String> tokenList, final int minGramLength,
224 |                                                    final int maxGramLength, final int startPosition) {
225 |         Preconditions.checkArgument(minGramLength > 0 && maxGramLength > 0,
226 |             "Right Gram lengths should not be negative or zero.");
227 |         Preconditions.checkArgument(maxGramLength >= minGramLength,
228 |             "Maximum right gram length should be at least equal to the minimum value.");
229 |         Preconditions.checkArgument(tokenList != null);
230 | 
231 |         final int noOfTokens = tokenList.size();
232 |         if (noOfTokens < minGramLength) {
233 |             return Collections.emptyList();
234 |         }
235 | 
236 |         final List<String> result = new ArrayList<>();
237 |         final StringBuilder currentGram = new StringBuilder(tokenList.get(startPosition));
238 |         for (int i = minGramLength; i <= maxGramLength; i++) {
239 |             final int endPosition = startPosition + i - 1;
240 |             if (endPosition > tokenList.size() - 1) {
241 |                 break;
242 |             }
243 |             currentGram.append(Constants.SPACE_SEPARATOR).append(tokenList.get(endPosition));
244 |             result.add(currentGram.toString());
245 |         }
246 |         return result;
247 |     }
248 | 
249 |     /**
250 |      * We check if the idioms present in {@link Utils#SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY} are present in
251 |      * left bi/tri-grams sequences.
252 |      *
253 |      * @param currentValence    current valence before checking for idioms.
254 |      * @param leftGramSequences list of all the left bi/tri-grams.
255 |      * @return adjusted valence.
256 |      */
257 |     private static float adjustValenceIfLeftGramsHaveIdioms(final float currentValence,
258 |                                                             final List<String> leftGramSequences) {
259 |         float newValence = currentValence;
260 |         for (String leftGramSequence : leftGramSequences) {
261 |             if (Utils.SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY.containsKey(leftGramSequence)) {
262 |                 newValence = Utils.SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY.get(leftGramSequence);
263 |                 break;
264 |             }
265 |         }
266 | 
267 |         // Based on how getLeftGrams calculates grams, the bi-grams are at the all the even indices.
268 |         // VADER only deals with the 2 left most bi-grams in leftGramSequences.
269 |         for (int i = leftGramSequences.size() - 1; i <= 2; i--) {
270 |             if (Utils.BOOSTER_DICTIONARY.containsKey(leftGramSequences.get(i))) {
271 |                 newValence += Valence.DEFAULT_DAMPING.getValue();
272 |                 break;
273 |             }
274 |         }
275 | 
276 |         return newValence;
277 |     }
278 | 
279 |     /**
280 |      * Search if the any bi-gram/tri-grams around the currentItemPosition contains any idioms defined
281 |      * in {@link Utils#SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY} Adjust the current valence if there are
282 |      * any idioms found.
283 |      *
284 |      * @param currentValence      valence to be adjusted
285 |      * @param currentItemPosition current tokens position
286 |      * @param wordsAndEmoticons   tokenized version of the input text
287 |      * @param distance            max distance from the end of the current gram and the startPosition.
288 |      * @return adjusted valence
289 |      */
290 |     private static float adjustValenceIfIdiomsFound(final float currentValence, final int currentItemPosition,
291 |                                                     final List<String> wordsAndEmoticons, final int distance) {
292 |         float newValence;
293 | 
294 |         final List<String> leftGramSequences = getLeftGrams(wordsAndEmoticons, 2,
295 |             Constants.MAX_GRAM_WINDOW_SIZE, currentItemPosition, distance);
296 |         newValence = adjustValenceIfLeftGramsHaveIdioms(currentValence, leftGramSequences);
297 | 
298 |         final List<String> rightGramSequences = getFirstRightGrams(wordsAndEmoticons, 2,
299 |             Constants.MAX_GRAM_WINDOW_SIZE, currentItemPosition);
300 |         for (String rightGramSequence : rightGramSequences) {
301 |             if (Utils.SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY.containsKey(rightGramSequence)) {
302 |                 newValence = Utils.SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY.get(rightGramSequence);
303 |             }
304 |         }
305 | 
306 |         return newValence;
307 |     }
308 | 
    /**
     * Analyze each token/emoticon in the input string and calculate its valence.
     *
     * Exactly one valence is recorded per token. Tokens that are boosters, or the word "kind" followed by "of",
     * get a valence of 0.0. Tokens found in {@link Utils#WORD_VALENCE_DICTIONARY} start from their dictionary
     * valence, which is then adjusted for capitalization, for up to {@link Constants#MAX_GRAM_WINDOW_SIZE}
     * preceding tokens (boosters, negations, "never" phrases, idioms) and for "least" phrases. All other tokens
     * stay at 0.0. Finally the whole list is passed through {@code adjustValenceIfHasConjunction}.
     *
     * @param textProperties This object holds the tokenized version of a string.
     * @return the valence of each token as a list, in token order
     */
    private static List<Float> getTokenWiseSentiment(final TextProperties textProperties) {
        List<Float> sentiments = new ArrayList<>();
        final List<String> wordsAndEmoticons = textProperties.getWordsAndEmoticons();

        for (int currentItemPosition = 0; currentItemPosition < wordsAndEmoticons.size(); currentItemPosition++) {
            final String currentItem = wordsAndEmoticons.get(currentItemPosition);
            final String currentItemLower = currentItem.toLowerCase();
            float currentValence = 0.0F;

            LOGGER.debug("Current token, \"{}\" with index, i = {}", currentItem, currentItemPosition);
            LOGGER.debug("Sentiment State before \"kind of\" processing: {}", sentiments);

            /*
             * This section performs the following evaluation:
             * If the term at currentItemPosition is "kind" and is followed by "of", or the term is present in
             * {@link Utils#BOOSTER_DICTIONARY}, add the currentValence to the sentiment list and continue
             * with the next token.
             *
             * currentValence is still 0.0 at this point, so such a token's valence will be 0.0.
             */
            if ((currentItemPosition < wordsAndEmoticons.size() - 1
                && currentItemLower.equals(SentimentModifyingTokens.KIND.getValue())
                && wordsAndEmoticons.get(currentItemPosition + 1).toLowerCase()
                                    .equals(SentimentModifyingTokens.OF.getValue()))
                || Utils.BOOSTER_DICTIONARY.containsKey(currentItemLower)) {
                sentiments.add(currentValence);
                continue;
            }

            LOGGER.debug("Sentiment State after \"kind of\" processing: {}", sentiments);
            LOGGER.debug("Current Valence is {} for \"{}\"", currentValence, currentItem);

            /*
             * If current item in lowercase is in {@link Utils#WORD_VALENCE_DICTIONARY}, start from its
             * dictionary valence and apply the context-based adjustments below. Otherwise the token keeps
             * a valence of 0.0.
             */
            if (Utils.WORD_VALENCE_DICTIONARY.containsKey(currentItemLower)) {
                currentValence = Utils.WORD_VALENCE_DICTIONARY.get(currentItemLower);

                if (LOGGER.isDebugEnabled()) {
                    LOGGER.debug("Current currentItem isUpper(): {}", Utils.isUpper(currentItem));
                    LOGGER.debug("Current currentItem isYelling(): {}", textProperties.isYelling());
                }

                /*
                 * If current item is all in uppercase and the input string has yelling words,
                 * push currentValence further away from zero: positive valences grow, all others shrink.
                 */
                if (Utils.isUpper(currentItem) && textProperties.isYelling()) {
                    if (currentValence > 0.0) {
                        currentValence += Valence.ALL_CAPS_FACTOR.getValue();
                    } else {
                        currentValence -= Valence.ALL_CAPS_FACTOR.getValue();
                    }
                }

                LOGGER.debug("Current Valence post all CAPS checks: {}", currentValence);

                /*
                 * "distance" is the window size.
                 * e.g. "The plot was good, but the characters are uncompelling.",
                 * if the current item is "characters", then at:
                 *  - distance = 0, closeTokenIndex = 5
                 *  - distance = 1, closeTokenIndex = 4
                 *  - distance = 2, closeTokenIndex = 3
                 */
                int distance = 0;
                while (distance < Constants.MAX_GRAM_WINDOW_SIZE) {
                    int closeTokenIndex = currentItemPosition - (distance + 1);
                    if (closeTokenIndex < 0) {
                        // Wrap a negative index around to count from the end of the list. A wrapped index is
                        // never actually read: closeTokenIndex is negative exactly when
                        // currentItemPosition <= distance, which the guard below rejects.
                        closeTokenIndex = wordsAndEmoticons.size() - Math.abs(closeTokenIndex);
                    }

                    // Only preceding tokens that are not themselves sentiment-bearing words can modify
                    // the current token's valence.
                    if ((currentItemPosition > distance)
                        && !Utils.WORD_VALENCE_DICTIONARY.containsKey(wordsAndEmoticons.get(closeTokenIndex)
                                                                                       .toLowerCase())) {
                        LOGGER.debug("Current Valence pre gramBasedValence: {}", currentValence);
                        float gramBasedValence = adjustValenceIfCapital(wordsAndEmoticons.get(closeTokenIndex),
                            currentValence, textProperties.isYelling());
                        // NOTE(review): this logs currentValence, which has not changed since the previous
                        // log line; the value just computed is held in gramBasedValence.
                        LOGGER.debug("Current Valence post gramBasedValence: {}", currentValence);
                        /*
                         * At distance of 1, reduce current gram's valence by 5%.
                         * At distance of 2, reduce current gram's valence by 10%.
                         */
                        if (gramBasedValence != 0.0F) {
                            if (distance == 1) {
                                gramBasedValence *= Valence.ONE_WORD_DISTANCE_DAMPING_FACTOR.getValue();
                            } else if (distance == 2) {
                                gramBasedValence *= Valence.TWO_WORD_DISTANCE_DAMPING_FACTOR.getValue();
                            }
                        }
                        currentValence += gramBasedValence;

                        LOGGER.debug("Current Valence post gramBasedValence and distance "
                            + "based damping: {}", currentValence);

                        currentValence = dampValenceIfNegativeTokensFound(currentValence, distance,
                            currentItemPosition, closeTokenIndex, wordsAndEmoticons);

                        LOGGER.debug("Current Valence post \"never\" check: {}", currentValence);

                        /*
                         * At a distance of 2, we check for idioms in bi-grams and tri-grams around currentItemPosition.
                         */
                        if (distance == 2) {
                            currentValence = adjustValenceIfIdiomsFound(currentValence, currentItemPosition,
                                wordsAndEmoticons, distance);
                            LOGGER.debug("Current Valence post Idiom check: {}", currentValence);
                        }
                    }

                    distance++;
                }
                // Runs once per dictionary word, after all window-based adjustments have been applied.
                currentValence = adjustValenceIfHasAtLeast(currentItemPosition, wordsAndEmoticons, currentValence);
            }

            sentiments.add(currentValence);
        }
        LOGGER.debug("Sentiment state after first pass through tokens: {}", sentiments);

        // Second pass: rescale the valences on either side of a "but"/"BUT" token, if there is one.
        sentiments = adjustValenceIfHasConjunction(wordsAndEmoticons, sentiments);
        LOGGER.debug("Sentiment state after checking conjunctions: {}", sentiments);

        return sentiments;
    }
439 | 
440 |     /**
441 |      * This methods calculates the positive, negative and neutral sentiment from the sentiment values of the input
442 |      * string.
443 |      *
444 |      * @param tokenWiseSentimentState valence of the each token in input string
445 |      * @param punctuationAmplifier    valence adjustment factor for punctuations
446 |      * @return an object of the non-normalized scores as {@link RawSentimentScores}.
447 |      */
448 |     private static RawSentimentScores computeRawSentimentScores(final List<Float> tokenWiseSentimentState,
449 |                                                                 final float punctuationAmplifier) {
450 |         float positiveSentimentScore = 0.0F;
451 |         float negativeSentimentScore = 0.0F;
452 |         int neutralSentimentCount = 0;
453 |         for (Float valence : tokenWiseSentimentState) {
454 |             if (valence > 0.0F) {
455 |                 positiveSentimentScore += valence + 1.0F;
456 |             } else if (valence < 0.0F) {
457 |                 negativeSentimentScore += valence - 1.0F;
458 |             } else {
459 |                 neutralSentimentCount += 1;
460 |             }
461 |         }
462 | 
463 |         if (positiveSentimentScore > Math.abs(negativeSentimentScore)) {
464 |             positiveSentimentScore += punctuationAmplifier;
465 |         } else if (positiveSentimentScore < Math.abs(negativeSentimentScore)) {
466 |             negativeSentimentScore -= punctuationAmplifier;
467 |         }
468 | 
469 |         return new RawSentimentScores(positiveSentimentScore, negativeSentimentScore, (float) neutralSentimentCount);
470 |     }
471 | 
472 |     /**
473 |      * The compound score is computed by summing the valence scores of each word in the lexicon, adjusted
474 |      * according to the rules, and then normalized to be between -1 (most extreme negative) and +1
475 |      * (most extreme positive). This is the most useful metric if you want a single uni-dimensional measure
476 |      * of sentiment for a given sentence. Calling it a 'normalized, weighted composite score' is accurate.
477 |      *
478 |      * @param tokenWiseSentimentState valence for each token
479 |      * @param punctuationAmplifier    valence adjustment factor for punctuations
480 |      * @return raw compound polarity
481 |      */
482 |     private static float computeCompoundPolarityScore(final List<Float> tokenWiseSentimentState,
483 |                                                       final float punctuationAmplifier) {
484 |         /*
485 |          * Compute the total valence.
486 |          */
487 |         float totalValence = tokenWiseSentimentState.stream().reduce(0.0F, Float::sum);
488 |         LOGGER.debug("Total valence: {}", totalValence);
489 | 
490 |         if (totalValence > 0.0F) {
491 |             totalValence += punctuationAmplifier;
492 |         } else if (totalValence < 0.0F) {
493 |             totalValence -= punctuationAmplifier;
494 |         }
495 | 
496 |         return totalValence;
497 |     }
498 | 
499 |     /**
500 |      * Normalize the compound score and the other three raw sentiment scores.
501 |      *
502 |      * @param rawSentimentScores    multi-dimensional sentiment scores.
503 |      * @param compoundPolarityScore uni-dimensional sentiment score.
504 |      * @return normalized values of all the type of the sentiment scores in a object of {@link SentimentPolarities}.
505 |      */
506 |     private static SentimentPolarities normalizeAllScores(final RawSentimentScores rawSentimentScores,
507 |                                                           final float compoundPolarityScore) {
508 |         final float positiveSentimentScore = rawSentimentScores.getPositiveScore();
509 |         final float negativeSentimentScore = rawSentimentScores.getNegativeScore();
510 |         final int neutralSentimentCount = Math.round(rawSentimentScores.getNeutralScore());
511 | 
512 |         final float normalizationFactor = positiveSentimentScore + Math.abs(negativeSentimentScore)
513 |             + neutralSentimentCount;
514 | 
515 |         if (LOGGER.isDebugEnabled()) {
516 |             LOGGER.debug("Normalization Factor: {}", normalizationFactor);
517 |             LOGGER.debug("Pre-Normalized Scores: {} {} {} {}}",
518 |                 Math.abs(positiveSentimentScore),
519 |                 Math.abs(negativeSentimentScore),
520 |                 Math.abs(neutralSentimentCount),
521 |                 compoundPolarityScore
522 |             );
523 |         }
524 | 
525 |         final float absolutePositivePolarity = Math.abs(positiveSentimentScore / normalizationFactor);
526 |         final float absoluteNegativePolarity = Math.abs(negativeSentimentScore / normalizationFactor);
527 |         final float absoluteNeutralPolarity = Math.abs(neutralSentimentCount / normalizationFactor);
528 | 
529 |         LOGGER.debug("Pre-Round Scores: {} {} {} {}}",
530 |             absolutePositivePolarity,
531 |             absoluteNegativePolarity,
532 |             absoluteNeutralPolarity,
533 |             compoundPolarityScore
534 |         );
535 | 
536 |         final float normalizedPositivePolarity = roundDecimal(absolutePositivePolarity, 3);
537 |         final float normalizedNegativePolarity = roundDecimal(absoluteNegativePolarity, 3);
538 |         final float normalizedNeutralPolarity = roundDecimal(absoluteNeutralPolarity, 3);
539 | 
540 |         // Normalizing the compound score.
541 |         final float normalizedCompoundPolarity = roundDecimal(normalizeCompoundScore(compoundPolarityScore), 4);
542 | 
543 |         return new SentimentPolarities(normalizedPositivePolarity, normalizedNegativePolarity,
544 |             normalizedNeutralPolarity, normalizedCompoundPolarity);
545 |     }
546 | 
547 |     /**
548 |      * Convert the lower level token wise valence to a higher level polarity scores.
549 |      *
550 |      * @param tokenWiseSentimentStateParam the token wise scores of the input string
551 |      * @param punctuationAmplifier         valence adjustment factor for punctuations
552 |      * @return the positive, negative, neutral and compound polarity scores as a map
553 |      */
554 |     private static SentimentPolarities getPolarityScores(final List<Float> tokenWiseSentimentStateParam,
555 |                                                          final float punctuationAmplifier) {
556 |         final List<Float> tokenWiseSentimentState = Collections.unmodifiableList(tokenWiseSentimentStateParam);
557 |         LOGGER.debug("Final token-wise sentiment state: {}", tokenWiseSentimentState);
558 | 
559 |         final float compoundPolarity = computeCompoundPolarityScore(tokenWiseSentimentState, punctuationAmplifier);
560 |         final RawSentimentScores rawSentimentScores = computeRawSentimentScores(tokenWiseSentimentState,
561 |             punctuationAmplifier);
562 | 
563 |         return normalizeAllScores(rawSentimentScores, compoundPolarity);
564 |     }
565 | 
566 |     /**
567 |      * This function jointly performs the boosting if input string contains
568 |      * '!'s and/or '?'s and then returns the sum of the boosted scores from
569 |      * {@link SentimentAnalyzer#boostByExclamation(String)} and {@link SentimentAnalyzer#boostByQuestionMark(String)}.
570 |      *
571 |      * @param input the input string that needs to be processed.
572 |      * @return joint boosted score
573 |      */
574 |     private static float boostByPunctuation(String input) {
575 |         return boostByExclamation(input) + boostByQuestionMark(input);
576 |     }
577 | 
578 |     /**
579 |      * Valence boosting when '!' is found in the input string.
580 |      *
581 |      * @param input the input string that needs to be processed.
582 |      * @return boosting score
583 |      */
584 |     private static float boostByExclamation(String input) {
585 |         final int exclamationCount =
586 |             StringUtils.countMatches(input, SentimentModifyingTokens.EXCLAMATION_MARK.getValue());
587 |         return Math.min(exclamationCount, Constants.MAX_EXCLAMATION_MARKS)
588 |             * Valence.EXCLAMATION_BOOSTING.getValue();
589 |     }
590 | 
591 |     /**
592 |      * Valence boosting when '?' is found in the input string.
593 |      *
594 |      * @param input the input string that needs to be processed.
595 |      * @return boosting score
596 |      */
597 |     private static float boostByQuestionMark(String input) {
598 |         final int questionMarkCount =
599 |             StringUtils.countMatches(input, SentimentModifyingTokens.QUESTION_MARK.getValue());
600 |         float questionMarkAmplifier = 0.0F;
601 |         if (questionMarkCount > 1) {
602 |             if (questionMarkCount <= Constants.MAX_QUESTION_MARKS) {
603 |                 questionMarkAmplifier = questionMarkCount * Valence.QUESTION_MARK_MAX_COUNT_BOOSTING.getValue();
604 |             } else {
605 |                 questionMarkAmplifier = Valence.QUESTION_MARK_BOOSTING.getValue();
606 |             }
607 |         }
608 |         return questionMarkAmplifier;
609 |     }
610 | 
611 |     /**
612 |      * This methods manages the effect of contrastive conjunctions like "but" on the valence of a token.
613 |      * "VADER" only support "but/BUT" as a conjunction that modifies the valence.
614 |      *
615 |      * @param inputTokensParam             list of token and/or emoticons in the input string
616 |      * @param tokenWiseSentimentStateParam current token wise sentiment scores
617 |      * @return adjusted token wise sentiment scores
618 |      */
619 |     private static List<Float> adjustValenceIfHasConjunction(final List<String> inputTokensParam,
620 |                                                              final List<Float> tokenWiseSentimentStateParam) {
621 |         final List<String> inputTokens = Collections.unmodifiableList(inputTokensParam);
622 |         final List<Float> tokenWiseSentimentState = new ArrayList<>(tokenWiseSentimentStateParam);
623 | 
624 |         int indexOfConjunction = inputTokens.indexOf(SentimentModifyingTokens.BUT.getValue());
625 |         if (indexOfConjunction < 0) {
626 |             indexOfConjunction = inputTokens.indexOf(SentimentModifyingTokens.BUT.getValue().toUpperCase());
627 |         }
628 |         if (indexOfConjunction >= 0) {
629 |             for (int valenceIndex = 0; valenceIndex < tokenWiseSentimentState.size(); valenceIndex++) {
630 |                 float currentValence = tokenWiseSentimentState.get(valenceIndex);
631 |                 if (valenceIndex < indexOfConjunction) {
632 |                     currentValence *= Valence.PRE_CONJUNCTION_ADJUSTMENT_FACTOR.getValue();
633 |                 } else if (valenceIndex > indexOfConjunction) {
634 |                     currentValence *= Valence.POST_CONJUNCTION_ADJUSTMENT_FACTOR.getValue();
635 |                 }
636 |                 tokenWiseSentimentState.set(valenceIndex, currentValence);
637 |             }
638 |         }
639 |         return tokenWiseSentimentState;
640 |     }
641 | 
642 |     /**
643 |      * Check for the cases where you have phrases having "least" in the words preceding the token at
644 |      * currentItemPosition and accordingly adjust the valence.
645 |      *
646 |      * @param currentItemPosition    position of the token in wordsAndEmoticons around which we will search for "least"
647 |      *                               type phrases
648 |      * @param wordsAndEmoticonsParam list of token and/or emoticons in the input string
649 |      * @param currentValence         valence of the token at currentItemPosition
650 |      * @return adjusted currentValence
651 |      */
652 |     private static float adjustValenceIfHasAtLeast(final int currentItemPosition,
653 |                                                    final List<String> wordsAndEmoticonsParam,
654 |                                                    final float currentValence) {
655 |         final List<String> wordsAndEmoticons = Collections.unmodifiableList(wordsAndEmoticonsParam);
656 |         float valence = currentValence;
657 |         if (currentItemPosition > 1
658 |             && !Utils.WORD_VALENCE_DICTIONARY.containsKey(wordsAndEmoticons.get(currentItemPosition - 1)
659 |                                                                            .toLowerCase())
660 |             && wordsAndEmoticons.get(currentItemPosition - 1)
661 |                                 .toLowerCase().equals(SentimentModifyingTokens.LEAST.getValue())) {
662 |             if (!(wordsAndEmoticons.get(currentItemPosition - 2).toLowerCase()
663 |                                    .equals(SentimentModifyingTokens.AT.getValue())
664 |                 || wordsAndEmoticons.get(currentItemPosition - 2).toLowerCase()
665 |                                     .equals(SentimentModifyingTokens.VERY.getValue()))) {
666 |                 valence *= Valence.NEGATIVE_WORD_DAMPING_FACTOR.getValue();
667 |             }
668 |         } else if (currentItemPosition > 0
669 |             && !Utils.WORD_VALENCE_DICTIONARY.containsKey(wordsAndEmoticons.get(currentItemPosition - 1).toLowerCase())
670 |             && wordsAndEmoticons.get(currentItemPosition - 1).equals(SentimentModifyingTokens.LEAST.getValue())) {
671 |             valence *= Valence.NEGATIVE_WORD_DAMPING_FACTOR.getValue();
672 |         }
673 |         return valence;
674 |     }
675 | 
676 |     /**
677 |      * Check if token has "n't" in the end.
678 |      *
679 |      * @param token current token
680 |      * @return true iff token has "n't" in the end
681 |      */
682 |     private static boolean hasContraction(final String token) {
683 |         return token.endsWith(SentimentModifyingTokens.CONTRACTION.getValue());
684 |     }
685 | 
686 |     /**
687 |      * Decides whether a token expresses a negation. That is the case when the token is listed in
688 |      * {@link Utils#NEGATIVE_WORDS} or, if contractions are to be considered, when it ends with "n't".
689 |      *
690 |      * @param token             token under inspection
691 |      * @param checkContractions whether a trailing "n't" also qualifies the token as a negation
692 |      * @return true iff token is a listed negative word, or checkContractions is set and token ends with "n't"
693 |      */
694 |     private static boolean isNegative(final String token, final boolean checkContractions) {
695 |         // A listed negative word settles the answer irrespective of the contraction flag.
696 |         if (Utils.NEGATIVE_WORDS.contains(token)) {
697 |             return true;
698 |         }
699 |         return checkContractions && hasContraction(token);
700 |     }
701 | 
702 |     /**
703 |      * This is the default version of {@link SentimentAnalyzer#isNegative(String, boolean)}.
704 |      *
705 |      * @param token current token
706 |      * @return true iff token is in {@link Utils#NEGATIVE_WORDS} or token has "n't" in its end
707 |      */
708 |     private static boolean isNegative(final String token) {
709 |         return isNegative(token, true);
710 |     }
711 | 
712 |     /**
713 |      * Normalizes the total valence as score / sqrt(score^2 + alpha), alpha being the estimated maximum valence.
714 |      *
715 |      * @param score total valence that is to be normalized
716 |      * @param alpha estimated max value
717 |      * @return normalized value of score
718 |      */
719 |     private static float normalizeCompoundScore(final float score, final float alpha) {
720 |         final double denominator = Math.sqrt((score * score) + alpha);
721 |         return (float) (score / denominator);
722 |     }
723 | 
724 |     /**
725 |      * Default version of {@link SentimentAnalyzer#normalizeCompoundScore(float, float)} where alpha is 15.0.
726 |      *
727 |      * @param score score
728 |      * @return normalized value of score
729 |      */
730 |     private static float normalizeCompoundScore(final float score) {
731 |         return normalizeCompoundScore(score, Constants.DEFAULT_ALPHA);
732 |     }
733 | 
734 |     /**
735 |      * Rounds a float value to the requested number of decimal places.
736 |      *
737 |      * @param currentValue value that is to be rounded
738 |      * @param noOfPlaces   no. of decimal places to keep
739 |      * @return currentValue rounded to noOfPlaces decimal places
740 |      */
741 |     private static float roundDecimal(final float currentValue, final int noOfPlaces) {
742 |         final float scale = (float) Math.pow(10.0, noOfPlaces);
743 |         final int scaledAndRounded = Math.round(currentValue * scale);
744 |         return scaledAndRounded / scale;
745 |     }
746 | 
747 |     /**
748 |      * This is a composite function that computes token-wise sentiment scores and then converts them to
749 |      * higher level scores.
750 |      *
751 |      * @param inputString string that is to be processed; null is answered with all polarities set to zero.
752 |      * @return the positive, negative, neutral and compound polarity scores as {@link SentimentPolarities}
753 |      */
754 |     private static SentimentPolarities computeSentimentPolaritiesFor(String inputString) {
755 |         // Pre-processing wraps the input in a StringReader, which throws a NullPointerException for null.
756 |         if (inputString == null) {
757 |             return SentimentPolarities.emptySentimentState();
758 |         }
759 | 
760 |         // Parse the string using Lucene and get the text tokens.
761 |         final TextProperties inputStringProperties;
762 |         try {
763 |             inputStringProperties = new TextProperties(inputString);
764 |         } catch (IOException excp) {
765 |             LOGGER.error("There was an issue while pre-processing the inputString.", excp);
766 |             return SentimentPolarities.emptySentimentState();
767 |         }
768 | 
769 |         // Calculate the per-token valence.
770 |         final List<Float> tokenWiseSentiments = getTokenWiseSentiment(inputStringProperties);
771 |         if (tokenWiseSentiments.isEmpty()) {
772 |             return SentimentPolarities.emptySentimentState();
773 |         }
774 |         // Adjust the total valence score on the basis of the punctuations in the input string.
775 |         final float punctuationAmplifier = boostByPunctuation(inputString);
776 |         return getPolarityScores(tokenWiseSentiments, punctuationAmplifier);
777 |     }
773 | }
774 | //CHECKSTYLE.ON: ExecutableStatementCount
775 | //CHECKSTYLE.ON: JavaNCSS
776 | //CHECKSTYLE.ON: CyclomaticComplexity
777 | //CHECKSTYLE.ON: NPathComplexity
778 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/analyzer/SentimentPolarities.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2021 Animesh Pandey
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |  * of this software and associated documentation files (the "Software"), to deal
  8 |  * in the Software without restriction, including without limitation the rights
  9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |  * copies of the Software, and to permit persons to whom the Software is
 11 |  * furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in all
 14 |  * copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |  * SOFTWARE.
 23 |  */
 24 | 
 25 | package com.vader.sentiment.analyzer;
 26 | 
 27 | /**
 28 |  * This class object will store the normalized scores that we get from {@link RawSentimentScores}.
 29 |  * The positivePolarity, neutralPolarity, and negativePolarity scores are ratios for proportions
 30 |  * of text that fall in each category (so these should all add up to be 1... or close to it with
 31 |  * float operation). These are the most useful metrics if you want multidimensional measures of
 32 |  * sentiment for a given sentence.
 33 |  * The compoundPolarity score is computed by summing the valence scores of each word in the lexicon,
 34 |  * adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and +1
 35 |  * (most extreme positive). This is the most useful metric if you want a single uni-dimensional measure of
 36 |  * sentiment for a given sentence. Calling it a "normalized, weighted composite score" is accurate.
 37 |  *
 38 |  * @author Animesh Pandey
 39 |  */
 40 | public final class SentimentPolarities {
 41 |     /**
 42 |      * Proportion of the text that is positive.
 43 |      */
 44 |     private final float positivePolarity;
 45 | 
 46 |     /**
 47 |      * Proportion of the text that is negative.
 48 |      */
 49 |     private final float negativePolarity;
 50 | 
 51 |     /**
 52 |      * Proportion of the text that is neutral.
 53 |      */
 54 |     private final float neutralPolarity;
 55 | 
 56 |     /**
 57 |      * Compound score of the text.
 58 |      */
 59 |     private final float compoundPolarity;
 60 | 
 61 |     /**
 62 |      * Builds an instance that holds the four given polarity values.
 63 |      *
 64 |      * @param positivePolarity proportion of text that is positive.
 65 |      * @param negativePolarity proportion of text that is negative.
 66 |      * @param neutralPolarity  proportion of text that is neutral.
 67 |      * @param compoundPolarity compound score.
 68 |      */
 69 |     public SentimentPolarities(float positivePolarity, float negativePolarity, float neutralPolarity,
 70 |                                float compoundPolarity) {
 71 |         this.positivePolarity = positivePolarity;
 72 |         this.negativePolarity = negativePolarity;
 73 |         this.neutralPolarity = neutralPolarity;
 74 |         this.compoundPolarity = compoundPolarity;
 75 |     }
 76 | 
 77 |     /**
 78 |      * Result for a string that is empty, null or has only un-identified tokens: every polarity is zero.
 79 |      *
 80 |      * @return an object of {@link SentimentPolarities} class with all polarities set to 0.0.
 81 |      */
 82 |     public static SentimentPolarities emptySentimentState() {
 83 |         final float noPolarity = 0.0F;
 84 |         return new SentimentPolarities(noPolarity, noPolarity, noPolarity, noPolarity);
 85 |     }
 86 | 
 87 |     public float getPositivePolarity() {
 88 |         return this.positivePolarity;
 89 |     }
 90 | 
 91 |     public float getNegativePolarity() {
 92 |         return this.negativePolarity;
 93 |     }
 94 | 
 95 |     public float getNeutralPolarity() {
 96 |         return this.neutralPolarity;
 97 |     }
 98 | 
 99 |     public float getCompoundPolarity() {
100 |         return this.compoundPolarity;
101 |     }
102 | 
103 |     @Override
104 |     public String toString() {
105 |         final StringBuilder text = new StringBuilder("SentimentPolarities{");
106 |         text.append("positivePolarity=").append(positivePolarity);
107 |         text.append(", negativePolarity=").append(negativePolarity);
108 |         text.append(", neutralPolarity=").append(neutralPolarity);
109 |         text.append(", compoundPolarity=").append(compoundPolarity);
110 |         return text.append('}').toString();
111 |     }
112 | }
113 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/analyzer/package-info.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * MIT License
 3 |  *
 4 |  * Copyright (c) 2021 Animesh Pandey
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 7 |  * of this software and associated documentation files (the "Software"), to deal
 8 |  * in the Software without restriction, including without limitation the rights
 9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 |  * copies of the Software, and to permit persons to whom the Software is
11 |  * furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in all
14 |  * copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 |  * SOFTWARE.
23 |  */
24 | 
25 | /**
26 |  * Package containing analyzer classes.
27 |  *
28 |  * @author Animesh Pandey
29 |  */
30 | package com.vader.sentiment.analyzer;
31 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/processor/InputAnalyzer.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * MIT License
 3 |  *
 4 |  * Copyright (c) 2021 Animesh Pandey
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 7 |  * of this software and associated documentation files (the "Software"), to deal
 8 |  * in the Software without restriction, including without limitation the rights
 9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 |  * copies of the Software, and to permit persons to whom the Software is
11 |  * furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in all
14 |  * copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 |  * SOFTWARE.
23 |  */
24 | 
25 | package com.vader.sentiment.processor;
26 | 
27 | import java.io.IOException;
28 | import java.io.StringReader;
29 | import java.util.function.Consumer;
30 | import org.apache.lucene.analysis.TokenStream;
31 | import org.apache.lucene.analysis.Tokenizer;
32 | import org.apache.lucene.analysis.core.WhitespaceTokenizer;
33 | import org.apache.lucene.analysis.miscellaneous.LengthFilter;
34 | import org.apache.lucene.analysis.standard.StandardTokenizer;
35 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
36 | 
37 | /**
38 |  * This class defines a Lucene analyzer that is applied on the input string in
39 |  * {@link com.vader.sentiment.analyzer.SentimentAnalyzer}.
40 |  *
41 |  * @author Animesh Pandey
42 |  */
43 | class InputAnalyzer implements InputAnalyzerInterface {
44 |     /**
45 |      * Feeds the input string to the given Lucene tokenizer and passes every resulting token to the consumer.
46 |      * The tokens are routed through a {@link LengthFilter} whose minimum length is 2.
47 |      *
48 |      * @param inputString   The input string to be pre-processed with Lucene tokenizer
49 |      * @param tokenizer     The tokenizer to use for processing the input string
50 |      * @param tokenConsumer The consumer of the tokens
51 |      * @throws IOException if Lucene's tokenizer encounters any error
52 |      */
53 |     protected void tokenize(final String inputString, final Tokenizer tokenizer,
54 |                             final Consumer<String> tokenConsumer) throws IOException {
55 |         final StringReader inputReader = new StringReader(inputString);
56 |         tokenizer.setReader(inputReader);
57 | 
58 |         try (TokenStream stream = new LengthFilter(tokenizer, 2, Integer.MAX_VALUE)) {
59 |             final CharTermAttribute currentTerm = stream.addAttribute(CharTermAttribute.class);
60 |             stream.reset();
61 | 
62 |             // Advance the stream token by token; the attribute always reflects the current token.
63 |             for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
64 |                 tokenConsumer.accept(currentTerm.toString());
65 |             }
66 | 
67 |             stream.end();
68 |         }
69 |     }
70 | 
71 |     /**
72 |      * Tokenizes on white spaces only, by means of Lucene's {@link WhitespaceTokenizer}.
73 |      * {@inheritDoc}
74 |      */
75 |     @Override
76 |     public void keepPunctuation(final String inputString, final Consumer<String> tokenConsumer) throws IOException {
77 |         final Tokenizer whitespaceTokenizer = new WhitespaceTokenizer();
78 |         tokenize(inputString, whitespaceTokenizer, tokenConsumer);
79 |     }
80 | 
81 |     /**
82 |      * Tokenizes by means of Lucene's {@link StandardTokenizer}, which splits on white space
83 |      * and also removes punctuations.
84 |      * {@inheritDoc}
85 |      */
86 |     @Override
87 |     public void removePunctuation(final String inputString, final Consumer<String> tokenConsumer) throws IOException {
88 |         final Tokenizer standardTokenizer = new StandardTokenizer();
89 |         tokenize(inputString, standardTokenizer, tokenConsumer);
90 |     }
91 | }
87 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/processor/InputAnalyzerInterface.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * MIT License
 3 |  *
 4 |  * Copyright (c) 2021 Animesh Pandey
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 7 |  * of this software and associated documentation files (the "Software"), to deal
 8 |  * in the Software without restriction, including without limitation the rights
 9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 |  * copies of the Software, and to permit persons to whom the Software is
11 |  * furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in all
14 |  * copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 |  * SOFTWARE.
23 |  */
24 | 
25 | package com.vader.sentiment.processor;
26 | 
27 | import java.io.IOException;
28 | import java.util.function.Consumer;
29 | 
30 | /**
31 |  * This interface defines two ways of splitting up a raw string into tokens.
32 |  *
33 |  * @author Animesh Pandey
34 |  */
35 | interface InputAnalyzerInterface {
36 |     /**
37 |      * Tokenizes the input string without removing punctuation and passes the tokens to the consumer.
38 |      *
39 |      * @param inputString   The input string to be tokenized
40 |      * @param tokenConsumer The consumer that receives the tokens
41 |      * @throws IOException if the underlying tokenizer encounters any error
42 |      */
43 |     void keepPunctuation(String inputString, Consumer<String> tokenConsumer) throws IOException;
44 | 
45 |     /**
46 |      * Tokenizes the input string with punctuation removal and passes the tokens to the consumer.
47 |      *
48 |      * @param inputString   The input string to be tokenized
49 |      * @param tokenConsumer The consumer that receives the tokens
50 |      * @throws IOException if the underlying tokenizer encounters any error
51 |      */
52 |     void removePunctuation(String inputString, Consumer<String> tokenConsumer) throws IOException;
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/processor/TextProperties.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2021 Animesh Pandey
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |  * of this software and associated documentation files (the "Software"), to deal
  8 |  * in the Software without restriction, including without limitation the rights
  9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |  * copies of the Software, and to permit persons to whom the Software is
 11 |  * furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in all
 14 |  * copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |  * SOFTWARE.
 23 |  */
 24 | 
 25 | package com.vader.sentiment.processor;
 26 | 
 27 | import java.io.IOException;
 28 | import java.util.ArrayList;
 29 | import java.util.HashSet;
 30 | import java.util.List;
 31 | import java.util.Set;
 32 | import com.vader.sentiment.util.Utils;
 33 | 
 34 | /**
 35 |  * The TextProperties class implements the pre-processing steps of the input string for sentiment analysis.
 36 |  * It utilizes the Lucene analyzer to perform processing on the input string.
 37 |  *
 38 |  * @author Animesh Pandey
 39 |  */
 40 | public final class TextProperties {
 41 |     /**
 42 |      * The raw string whose properties are extracted.
 43 |      */
 44 |     private final String inputText;
 45 | 
 46 |     /**
 47 |      * Tokens and emoticons obtained from {@link TextProperties#inputText}.
 48 |      */
 49 |     private List<String> wordsAndEmoticons;
 50 | 
 51 |     /**
 52 |      * Distinct tokens obtained from {@link TextProperties#inputText} with punctuation removal.
 53 |      * Emoticons are removed here.
 54 |      */
 55 |     private Set<String> wordsOnly;
 56 | 
 57 |     /**
 58 |      * Flag that tells if the current string has yelling words.
 59 |      */
 60 |     private boolean hasYellWords;
 61 | 
 62 |     /**
 63 |      * Tokenizes the given input string and determines whether it has yelling words.
 64 |      *
 65 |      * @param inputText the input string
 66 |      * @throws IOException if there is an issue with the lucene analyzers
 67 |      */
 68 |     public TextProperties(final String inputText) throws IOException {
 69 |         this.inputText = inputText;
 70 |         setWordsAndEmoticons();
 71 |         final boolean yelling = hasCapDifferential(getWordsAndEmoticons());
 72 |         setHasYellWords(yelling);
 73 |     }
 74 | 
 75 |     /**
 76 |      * Tokenize the input text in two steps:
 77 |      * 1. Tokenize on white spaces only, so that punctuations and hence emoticons are preserved.
 78 |      * 2. Strip a punctuation that is directly adjacent to a token if what remains is a punctuation-free token.
 79 |      * e.g. going!!!! -> going OR !?!?there -> there
 80 |      *
 81 |      * @param unTokenizedText           original text to be analyzed.
 82 |      * @param tokensWithoutPunctuations tokenized version of the input which has no punctuations.
 83 |      * @return tokenized version which preserves all the punctuations so that emoticons are preserved.
 84 |      * @throws IOException if there was an issue while Lucene was processing unTokenizedText
 85 |      */
 86 |     private List<String> tokensAftersKeepingEmoticons(final String unTokenizedText,
 87 |                                                       final Set<String> tokensWithoutPunctuations) throws IOException {
 88 |         final List<String> tokens = new ArrayList<>();
 89 |         new InputAnalyzer().keepPunctuation(unTokenizedText, tokens::add);
 90 |         for (int position = 0; position < tokens.size(); position++) {
 91 |             tokens.set(position, stripPunctuations(tokens.get(position), tokensWithoutPunctuations));
 92 |         }
 93 |         return tokens;
 94 |     }
 95 | 
 96 |     /**
 97 |      * Strips an entry of {@link Utils#PUNCTUATIONS} from the start or the end of the token, provided that
 98 |      * the remainder is one of the punctuation-free tokens.
 99 |      * e.g. going!!!! -> going OR !?!?there -> there
100 |      *
101 |      * @param token                     token that potentially includes punctuations.
102 |      * @param tokensWithoutPunctuations tokenized version of the input which has no punctuations.
103 |      * @return the token with any such punctuation removed from it, or the original token otherwise
104 |      */
105 |     private String stripPunctuations(String token, Set<String> tokensWithoutPunctuations) {
106 |         for (final String punct : Utils.PUNCTUATIONS) {
107 |             final String candidate;
108 |             if (token.startsWith(punct)) {
109 |                 candidate = token.substring(punct.length());
110 |             } else if (token.endsWith(punct)) {
111 |                 candidate = token.substring(0, token.length() - punct.length());
112 |             } else {
113 |                 // This punctuation is not adjacent to the token.
114 |                 continue;
115 |             }
116 |             if (tokensWithoutPunctuations.contains(candidate)) {
117 |                 return candidate;
118 |             }
119 |         }
120 |         return token;
121 |     }
122 | 
123 |     /**
124 |      * This method tokenizes the input string, preserving the punctuation marks using a custom Lucene analyzer.
125 |      * The punctuation-free tokens are computed first, as they are needed for stripping adjacent punctuations.
126 |      *
127 |      * @throws IOException if something goes wrong in the Lucene analyzer.
128 |      * @see InputAnalyzer#tokenize(String, org.apache.lucene.analysis.Tokenizer, java.util.function.Consumer)
129 |      */
130 |     private void setWordsAndEmoticons() throws IOException {
131 |         setWordsOnly();
132 |         final List<String> tokens = tokensAftersKeepingEmoticons(inputText, wordsOnly);
133 |         this.wordsAndEmoticons = tokens;
134 |     }
135 | 
136 |     /**
137 |      * This method tokenizes the input string, removing the special characters as well.
138 |      *
139 |      * @throws IOException iff there is an error while using Lucene analyzers.
140 |      * @see InputAnalyzer#removePunctuation(String, java.util.function.Consumer)
141 |      */
142 |     private void setWordsOnly() throws IOException {
143 |         final Set<String> punctuationFreeTokens = new HashSet<>();
144 |         this.wordsOnly = punctuationFreeTokens;
145 |         new InputAnalyzer().removePunctuation(inputText, punctuationFreeTokens::add);
146 |     }
147 | 
148 |     public List<String> getWordsAndEmoticons() {
149 |         return this.wordsAndEmoticons;
150 |     }
151 | 
152 |     @SuppressWarnings("unused")
153 |     public Set<String> getWordsOnly() {
154 |         return this.wordsOnly;
155 |     }
156 | 
157 |     public boolean isYelling() {
158 |         return this.hasYellWords;
159 |     }
160 | 
161 |     private void setHasYellWords(boolean hasYellWords) {
162 |         this.hasYellWords = hasYellWords;
163 |     }
164 | 
165 |     /**
166 |      * Return true iff some, but not all, of the tokens are all-caps (as judged by {@link Utils#isUpper}),
167 |      * i.e. the input has yelling words.
168 |      * e.g. [GET, THE, HELL, OUT] returns false
169 |      * [GET, the, HELL, OUT] returns true
170 |      * [get, the, hell, out] returns false
171 |      *
172 |      * @param tokenList a list of strings
173 |      * @return boolean value
174 |      */
175 |     private boolean hasCapDifferential(List<String> tokenList) {
176 |         int notAllCaps = 0;
177 |         for (final String token : tokenList) {
178 |             if (!Utils.isUpper(token)) {
179 |                 notAllCaps++;
180 |             }
181 |         }
182 |         // Yelling requires at least one all-caps token next to at least one token that is not all-caps.
183 |         return (notAllCaps > 0) && (notAllCaps < tokenList.size());
184 |     }
185 | }
178 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/processor/package-info.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * MIT License
 3 |  *
 4 |  * Copyright (c) 2021 Animesh Pandey
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 7 |  * of this software and associated documentation files (the "Software"), to deal
 8 |  * in the Software without restriction, including without limitation the rights
 9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 |  * copies of the Software, and to permit persons to whom the Software is
11 |  * furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in all
14 |  * copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 |  * SOFTWARE.
23 |  */
24 | 
25 | /**
26 |  * Package containing classes that pre-process (tokenize) the input text.
27 |  *
28 |  * @author Animesh Pandey
29 |  */
30 | package com.vader.sentiment.processor;
31 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/Constants.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2021 Animesh Pandey
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |  * of this software and associated documentation files (the "Software"), to deal
  8 |  * in the Software without restriction, including without limitation the rights
  9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |  * copies of the Software, and to permit persons to whom the Software is
 11 |  * furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in all
 14 |  * copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |  * SOFTWARE.
 23 |  */
 24 | 
 25 | package com.vader.sentiment.util;
 26 | 
 27 | import java.util.regex.Pattern;
 28 | 
 29 | /**
 30 |  * This class defines constants that are used during the computation of the sentiment scores.
 31 |  *
 32 |  * @author Animesh Pandey
 33 |  */
 34 | public final class Constants {
 35 |     /**
 36 |      * Max allowed question marks in a string.
 37 |      * Beyond this value the effect of the question marks will be considered the same.
 38 |      *
 39 |      * @see SentimentModifyingTokens#QUESTION_MARK
 40 |      */
 41 |     public static final int MAX_QUESTION_MARKS = 3;
 42 | 
 43 |     /**
 44 |      * Window size for preceding trigram.
 45 |      */
 46 |     public static final int PRECEDING_TRIGRAM_WINDOW = 3;
 47 | 
 48 |     /**
 49 |      * Window size for preceding bigram.
 50 |      */
 51 |     public static final int PRECEDING_BIGRAM_WINDOW = 2;
 52 | 
 53 |     /**
 54 |      * Window size for preceding unigram.
 55 |      */
 56 |     public static final int PRECEDING_UNIGRAM_WINDOW = 1;
 57 | 
 58 |     /**
 59 |      * Maximum number of exclamation marks that could be processed.
 60 |      */
 61 |     public static final int MAX_EXCLAMATION_MARKS = 4;
 62 | 
 63 |     /**
 64 |      * This is the window size within which processing will be done.
 65 |      * This means that we would be dealing only with unigrams, bigrams and
 66 |      * trigrams.
 67 |      */
 68 |     public static final int MAX_GRAM_WINDOW_SIZE = 3;
 69 | 
 70 |     /**
 71 |      * This alpha approximates the max expected value for a sentiment score.
 72 |      */
 73 |     public static final float DEFAULT_ALPHA = 15.0F;
 74 | 
 75 |     /**
 76 |      * This regex matches a single-line string that contains at least one ASCII letter (a-z or A-Z).
 77 |      */
 78 |     public static final Pattern NON_NUMERIC_STRING_REGEX = Pattern.compile(".*[a-zA-Z]+.*");
 79 | 
 80 |     /**
 81 |      * This string defines the prefix of a plain HTTP URL.
 82 |      */
 83 |     public static final String HTTP_URL_PREFIX = "http://";
 84 | 
 85 |     /**
 86 |      * This string defines the prefix of an HTTPS URL.
 87 |      */
 88 |     public static final String HTTPS_URL_PREFIX = "https://";
 89 | 
 90 |     /**
 91 |      * The separator for a word N-gram.
 92 |      */
 93 |     public static final String SPACE_SEPARATOR = " ";
 94 | 
 95 |     /**
 96 |      * Private constructor for utility class.
 97 |      */
 98 |     private Constants() {
 99 | 
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/SentimentModifyingTokens.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * MIT License
 3 |  *
 4 |  * Copyright (c) 2021 Animesh Pandey
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 7 |  * of this software and associated documentation files (the "Software"), to deal
 8 |  * in the Software without restriction, including without limitation the rights
 9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 |  * copies of the Software, and to permit persons to whom the Software is
11 |  * furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in all
14 |  * copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 |  * SOFTWARE.
23 |  */
24 | 
25 | package com.vader.sentiment.util;
26 | 
/**
 * Tokens that modify the valence of other tokens when they are found in the same string.
 *
 * @author Animesh Pandey
 */
//CHECKSTYLE.OFF: Javadoc*
public enum SentimentModifyingTokens {
    // Words.
    NEVER("never"),
    SO("so"),
    THIS("this"),
    AT("at"),
    LEAST("least"),
    KIND("kind"),
    OF("of"),
    VERY("very"),
    BUT("but"),
    // Punctuation marks.
    EXCLAMATION_MARK("!"),
    QUESTION_MARK("?"),
    // Suffix of a contracted negation.
    CONTRACTION("n't");

    // Literal text of the token; handed out unchanged by getValue().
    private final String token;

    SentimentModifyingTokens(final String token) {
        this.token = token;
    }

    // Returns the literal text this constant was declared with.
    public String getValue() {
        return this.token;
    }
}
//CHECKSTYLE.ON: Javadoc*
58 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/Utils.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2021 Animesh Pandey
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |  * of this software and associated documentation files (the "Software"), to deal
  8 |  * in the Software without restriction, including without limitation the rights
  9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |  * copies of the Software, and to permit persons to whom the Software is
 11 |  * furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in all
 14 |  * copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |  * SOFTWARE.
 23 |  */
 24 | 
 25 | package com.vader.sentiment.util;
 26 | 
 27 | import java.io.BufferedReader;
 28 | import java.io.IOException;
 29 | import java.io.InputStream;
 30 | import java.io.InputStreamReader;
 31 | import java.nio.charset.StandardCharsets;
 32 | import java.util.Collections;
 33 | import java.util.HashMap;
 34 | import java.util.Map;
 35 | import java.util.Set;
 36 | 
 37 | import org.apache.commons.lang3.StringUtils;
 38 | import org.slf4j.LoggerFactory;
 39 | import com.google.common.collect.ImmutableMap;
 40 | import com.google.common.collect.ImmutableSet;
 41 | 
 42 | /**
 43 |  * This class contains the constants that are the used by the sentiment analyzer.
 44 |  * The constants are same as the ones used in the official python implementation
 45 |  *
 46 |  * @author Animesh Pandey
 47 |  * @see <a href="http://www.nltk.org/_modules/nltk/sentiment/vader.html">NLTK Source</a>
 48 |  * @see <a href="https://github.com/cjhutto/vaderSentiment/blob/master/vaderSentiment/vaderSentiment.py">
 49 |  * vaderSentiment Python module</a>
 50 |  */
 51 | public final class Utils {
 52 | 
 53 |     /**
 54 |      * Set of possible punctuation marks.
 55 |      */
 56 |     public static final Set<String> PUNCTUATIONS = ImmutableSet.of(".", "!", "?", ",", ";", ":", "-", "'",
 57 |         "\"", "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?");
 58 | 
 59 |     /**
 60 |      * Set of negative words.
 61 |      */
 62 |     public static final Set<String> NEGATIVE_WORDS =
 63 |         ImmutableSet.of("aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
 64 |             "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't", "dont", "hadnt",
 65 |             "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither", "don't", "hadn't", "hasn't",
 66 |             "haven't", "isn't", "mightn't", "mustn't", "neednt", "needn't", "never", "none", "nope",
 67 |             "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt", "uhuh", "wasnt",
 68 |             "werent", "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't", "without",
 69 |             "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite");
 70 | 
 71 |     /**
 72 |      * This dictionary holds a token and its corresponding boosting/dampening factor for sentiment scoring.
 73 |      */
 74 |     public static final Map<String, Float> BOOSTER_DICTIONARY = ImmutableMap.<String, Float>builder()
 75 |         .put("decidedly", Valence.DEFAULT_BOOSTING.getValue())
 76 |         .put("uber", Valence.DEFAULT_BOOSTING.getValue())
 77 |         .put("barely", Valence.DEFAULT_DAMPING.getValue())
 78 |         .put("particularly", Valence.DEFAULT_BOOSTING.getValue())
 79 |         .put("enormously", Valence.DEFAULT_BOOSTING.getValue())
 80 |         .put("less", Valence.DEFAULT_DAMPING.getValue())
 81 |         .put("absolutely", Valence.DEFAULT_BOOSTING.getValue())
 82 |         .put("kinda", Valence.DEFAULT_DAMPING.getValue())
 83 |         .put("flipping", Valence.DEFAULT_BOOSTING.getValue())
 84 |         .put("awfully", Valence.DEFAULT_BOOSTING.getValue())
 85 |         .put("purely", Valence.DEFAULT_BOOSTING.getValue())
 86 |         .put("majorly", Valence.DEFAULT_BOOSTING.getValue())
 87 |         .put("substantially", Valence.DEFAULT_BOOSTING.getValue())
 88 |         .put("partly", Valence.DEFAULT_DAMPING.getValue())
 89 |         .put("remarkably", Valence.DEFAULT_BOOSTING.getValue())
 90 |         .put("really", Valence.DEFAULT_BOOSTING.getValue())
 91 |         .put("sort of", Valence.DEFAULT_DAMPING.getValue())
 92 |         .put("little", Valence.DEFAULT_DAMPING.getValue())
 93 |         .put("fricking", Valence.DEFAULT_BOOSTING.getValue())
 94 |         .put("sorta", Valence.DEFAULT_DAMPING.getValue())
 95 |         .put("amazingly", Valence.DEFAULT_BOOSTING.getValue())
 96 |         .put("kind of", Valence.DEFAULT_DAMPING.getValue())
 97 |         .put("just enough", Valence.DEFAULT_DAMPING.getValue())
 98 |         .put("fucking", Valence.DEFAULT_BOOSTING.getValue())
 99 |         .put("occasionally", Valence.DEFAULT_DAMPING.getValue())
100 |         .put("somewhat", Valence.DEFAULT_DAMPING.getValue())
101 |         .put("kindof", Valence.DEFAULT_DAMPING.getValue())
102 |         .put("friggin", Valence.DEFAULT_BOOSTING.getValue())
103 |         .put("incredibly", Valence.DEFAULT_BOOSTING.getValue())
104 |         .put("totally", Valence.DEFAULT_BOOSTING.getValue())
105 |         .put("marginally", Valence.DEFAULT_DAMPING.getValue())
106 |         .put("more", Valence.DEFAULT_BOOSTING.getValue())
107 |         .put("considerably", Valence.DEFAULT_BOOSTING.getValue())
108 |         .put("fabulously", Valence.DEFAULT_BOOSTING.getValue())
109 |         .put("hardly", Valence.DEFAULT_DAMPING.getValue())
110 |         .put("very", Valence.DEFAULT_BOOSTING.getValue())
111 |         .put("sortof", Valence.DEFAULT_DAMPING.getValue())
112 |         .put("kind-of", Valence.DEFAULT_DAMPING.getValue())
113 |         .put("scarcely", Valence.DEFAULT_DAMPING.getValue())
114 |         .put("thoroughly", Valence.DEFAULT_BOOSTING.getValue())
115 |         .put("quite", Valence.DEFAULT_BOOSTING.getValue())
116 |         .put("most", Valence.DEFAULT_BOOSTING.getValue())
117 |         .put("completely", Valence.DEFAULT_BOOSTING.getValue())
118 |         .put("frigging", Valence.DEFAULT_BOOSTING.getValue())
119 |         .put("intensely", Valence.DEFAULT_BOOSTING.getValue())
120 |         .put("utterly", Valence.DEFAULT_BOOSTING.getValue())
121 |         .put("highly", Valence.DEFAULT_BOOSTING.getValue())
122 |         .put("extremely", Valence.DEFAULT_BOOSTING.getValue())
123 |         .put("unbelievably", Valence.DEFAULT_BOOSTING.getValue())
124 |         .put("almost", Valence.DEFAULT_DAMPING.getValue())
125 |         .put("especially", Valence.DEFAULT_BOOSTING.getValue())
126 |         .put("fully", Valence.DEFAULT_BOOSTING.getValue())
127 |         .put("frickin", Valence.DEFAULT_BOOSTING.getValue())
128 |         .put("tremendously", Valence.DEFAULT_BOOSTING.getValue())
129 |         .put("exceptionally", Valence.DEFAULT_BOOSTING.getValue())
130 |         .put("flippin", Valence.DEFAULT_BOOSTING.getValue())
131 |         .put("hella", Valence.DEFAULT_BOOSTING.getValue())
132 |         .put("so", Valence.DEFAULT_BOOSTING.getValue())
133 |         .put("greatly", Valence.DEFAULT_BOOSTING.getValue())
134 |         .put("hugely", Valence.DEFAULT_BOOSTING.getValue())
135 |         .put("deeply", Valence.DEFAULT_BOOSTING.getValue())
136 |         .put("unusually", Valence.DEFAULT_BOOSTING.getValue())
137 |         .put("entirely", Valence.DEFAULT_BOOSTING.getValue())
138 |         .put("slightly", Valence.DEFAULT_DAMPING.getValue())
139 |         .put("effing", Valence.DEFAULT_BOOSTING.getValue())
140 |         .build();
141 | 
142 |     /**
143 |      * Idioms with their respective valencies.
144 |      */
145 |     //CHECKSTYLE.OFF: MagicNumber
146 |     public static final Map<String, Float> SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY =
147 |         ImmutableMap.<String, Float>builder()
148 |             .put("cut the mustard", 2f)
149 |             .put("bad ass", 1.5f)
150 |             .put("kiss of death", -1.5f)
151 |             .put("yeah right", -2f)
152 |             .put("the bomb", 3f)
153 |             .put("hand to mouth", -2f)
154 |             .put("the shit", 3f)
155 |             .build();
156 |     //CHECKSTYLE.ON: MagicNumber
157 | 
158 |     /**
159 |      * Tokens with their respective valencies.
160 |      */
161 |     public static final Map<String, Float> WORD_VALENCE_DICTIONARY = readLexiconFile();
162 | 
163 |     /**
164 |      * Private constructor for utility class.
165 |      */
166 |     private Utils() {
167 | 
168 |     }
169 | 
170 |     /**
171 |      * This function returns false if the input token:
172 |      * 1. is a URL starting with "http://" or "HTTP://"
173 |      * 2. is a number as string
174 |      * 3. has one character in lower case
175 |      *
176 |      * @param token input token
177 |      * @return true iff none of the above conditions occur
178 |      */
179 |     public static boolean isUpper(String token) {
180 |         if (StringUtils.startsWithIgnoreCase(token, Constants.HTTP_URL_PREFIX)) {
181 |             return false;
182 |         }
183 |         if (StringUtils.startsWithIgnoreCase(token, Constants.HTTPS_URL_PREFIX)) {
184 |             return false;
185 |         }
186 |         if (!Constants.NON_NUMERIC_STRING_REGEX.matcher(token).matches()) {
187 |             return false;
188 |         }
189 |         for (int i = 0; i < token.length(); i++) {
190 |             if (Character.isLowerCase(token.charAt(i))) {
191 |                 return false;
192 |             }
193 |         }
194 |         return true;
195 |     }
196 | 
197 |     /**
198 |      * This function reads in a file that stores lexicon and their corresponding valence intensity.
199 |      * Each pair of lexicon and its valence is then stored as key-value pairs in a HashMap.
200 |      *
201 |      * @return map of lexicons with their corresponding valence
202 |      */
203 |     private static Map<String, Float> readLexiconFile() {
204 |         final InputStream lexFile = Utils.class.getClassLoader()
205 |                                                .getResourceAsStream("vader_sentiment_lexicon.txt");
206 |         final Map<String, Float> lexDictionary = new HashMap<>();
207 |         if (lexFile != null) {
208 |             try (BufferedReader br = new BufferedReader(new InputStreamReader(lexFile, StandardCharsets.UTF_8))) {
209 |                 String line;
210 |                 while ((line = br.readLine()) != null) {
211 |                     final String[] lexFileData = line.split("\\t");
212 |                     final String currentText = lexFileData[0];
213 |                     final Float currentTextValence = Float.parseFloat(lexFileData[1]);
214 |                     lexDictionary.put(currentText, currentTextValence);
215 |                 }
216 |             } catch (IOException ex) {
217 |                 LoggerFactory.getLogger(Utils.class).error("vader_sentiment_lexicon.txt file not found", ex);
218 |             }
219 |         }
220 |         return Collections.unmodifiableMap(lexDictionary);
221 |     }
222 | }
223 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/Valence.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2021 Animesh Pandey
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |  * of this software and associated documentation files (the "Software"), to deal
  8 |  * in the Software without restriction, including without limitation the rights
  9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |  * copies of the Software, and to permit persons to whom the Software is
 11 |  * furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in all
 14 |  * copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |  * SOFTWARE.
 23 |  */
 24 | 
 25 | package com.vader.sentiment.util;
 26 | 
 27 | /**
 28 |  * List of default values of valence modifiers.
 29 |  * This list has values as well as factors that modify the valence.
 30 |  *
 31 |  * @author Animesh Pandey
 32 |  */
 33 | public enum Valence {
 34 |     /**
 35 |      * This denotes the default valence for token that boost.
 36 |      */
 37 |     DEFAULT_BOOSTING(0.293F),
 38 | 
 39 |     /**
 40 |      * This denotes the default valence for token that damp.
 41 |      */
 42 |     DEFAULT_DAMPING(-0.293F),
 43 | 
 44 |     /**
 45 |      * Boosting factor for strings having a '?'.
 46 |      */
 47 |     ALL_CAPS_FACTOR(0.733F),
 48 | 
 49 |     /**
 50 |      * If a negative word is encountered, its valence is reduced by this factor.
 51 |      */
 52 |     NEGATIVE_WORD_DAMPING_FACTOR(-0.74F),
 53 | 
 54 |     /**
 55 |      * Boosting factor for strings having a '!'.
 56 |      */
 57 |     EXCLAMATION_BOOSTING(0.292F),
 58 | 
 59 |     /**
 60 |      * Boosting factor for strings having a '?'.
 61 |      */
 62 |     QUESTION_MARK_BOOSTING(0.96F),
 63 | 
 64 |     /**
 65 |      * Boosting factor for strings having 3 or more '?'s.
 66 |      */
 67 |     QUESTION_MARK_MAX_COUNT_BOOSTING(0.18F),
 68 | 
 69 |     /**
 70 |      * If the preceding trigram has a "never" type phrase, increase the negative valence by 25%.
 71 |      */
 72 |     PRECEDING_TRIGRAM_HAVING_NEVER_DAMPING_FACTOR(1.25F),
 73 | 
 74 |     /**
 75 |      * If the preceding bigram has a "never" type phrase, increase the negative valence by 50%.
 76 |      */
 77 |     PRECEDING_BIGRAM_HAVING_NEVER_DAMPING_FACTOR(1.5F),
 78 | 
 79 |     /**
 80 |      * At distance of 1 from current token, reduce current gram's valence by 5%.
 81 |      */
 82 |     ONE_WORD_DISTANCE_DAMPING_FACTOR(0.95F),
 83 | 
 84 |     /**
 85 |      * At distance of 2 from current token, reduce current gram's valence by 10%.
 86 |      */
 87 |     TWO_WORD_DISTANCE_DAMPING_FACTOR(0.9F),
 88 | 
 89 |     /**
 90 |      * If the conjunction is after the current token then reduce valence by 50%.
 91 |      */
 92 |     PRE_CONJUNCTION_ADJUSTMENT_FACTOR(0.5F),
 93 | 
 94 |     /**
 95 |      * If the conjunction is before the current token then increase valence by 50%.
 96 |      */
 97 |     POST_CONJUNCTION_ADJUSTMENT_FACTOR(1.5F);
 98 | 
 99 |     /**
100 |      * Valence value.
101 |      */
102 |     private final float value;
103 | 
104 |     /**
105 |      * Enum constructor.
106 |      *
107 |      * @param value valence value
108 |      */
109 |     Valence(float value) {
110 |         this.value = value;
111 |     }
112 | 
113 |     public float getValue() {
114 |         return value;
115 |     }
116 | }
117 | 


--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/package-info.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * MIT License
 3 |  *
 4 |  * Copyright (c) 2021 Animesh Pandey
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 7 |  * of this software and associated documentation files (the "Software"), to deal
 8 |  * in the Software without restriction, including without limitation the rights
 9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 |  * copies of the Software, and to permit persons to whom the Software is
11 |  * furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in all
14 |  * copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 |  * SOFTWARE.
23 |  */
24 | 
25 | /**
26 |  * Package containing utility classes.
27 |  *
28 |  * @author Animesh Pandey
29 |  */
30 | package com.vader.sentiment.util;
31 | 


--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Root logger option
 2 | log4j.rootLogger=ERROR, stdout, file
 3 | 
 4 | # Redirect log messages to console
 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
 6 | log4j.appender.stdout.Target=System.out
 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
 9 | 
10 | # Redirect log messages to a log file, support file rolling.
11 | log4j.appender.file=org.apache.log4j.RollingFileAppender
12 | log4j.appender.file.File=/tmp/vader-sentiment-app.log
13 | log4j.appender.file.MaxFileSize=5MB
14 | log4j.appender.file.MaxBackupIndex=10
15 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
16 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
17 | 


--------------------------------------------------------------------------------
/src/test/java/com/vader/sentiment/analyzer/SentimentAnalyzerTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2021 Animesh Pandey
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |  * of this software and associated documentation files (the "Software"), to deal
  8 |  * in the Software without restriction, including without limitation the rights
  9 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |  * copies of the Software, and to permit persons to whom the Software is
 11 |  * furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in all
 14 |  * copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |  * SOFTWARE.
 23 |  */
 24 | 
 25 | package com.vader.sentiment.analyzer;
 26 | 
 27 | import java.io.BufferedReader;
 28 | import java.io.InputStream;
 29 | import java.io.InputStreamReader;
 30 | import java.io.IOException;
 31 | import java.nio.charset.StandardCharsets;
 32 | import java.nio.file.Files;
 33 | import java.nio.file.Paths;
 34 | import java.util.ArrayList;
 35 | import java.util.List;
 36 | import java.util.concurrent.TimeUnit;
 37 | 
 38 | import org.junit.Assert;
 39 | import org.junit.BeforeClass;
 40 | import org.junit.Test;
 41 | import org.slf4j.Logger;
 42 | import org.slf4j.LoggerFactory;
 43 | 
 44 | /**
 45 |  * This tests confirms if the port from Python NLTK was correct.
 46 |  * The sentiment scores are pre-computed for Python and them compared
 47 |  * with same text input using the Java implementation.
 48 |  * The sentiment scores are supposed to be equal.
 49 |  * <p>
 50 |  * NOTE: There are some issues with floating point precision differences
 51 |  * between Python and Java.
 52 |  *
 53 |  * @author Animesh Pandey
 54 |  * @see <a href=http://github.com/apanimesh061/VaderSentimentJava/commit/d1d30c4ceeb356ec838f8abac70514bd21a92b4b>
 55 |  * http://github.com/apanimesh061/VaderSentimentJava/commit/d1d30c4ceeb356ec838f8abac70514bd21a92b4b
 56 |  * </a>
 57 |  */
 58 | public class SentimentAnalyzerTest {
 59 |     private static final ClassLoader loader = SentimentAnalyzerTest.class.getClassLoader();
 60 |     private static List<String> testFiles = new ArrayList<>();
 61 |     private static Logger logger = LoggerFactory.getLogger(SentimentAnalyzerTest.class);
 62 | 
 63 |     @BeforeClass
 64 |     public static void setUpTestFiles() {
 65 |         testFiles.add("amazonReviewSnippets_GroundTruth_vader.tsv");
 66 |         testFiles.add("movieReviewSnippets_GroundTruth_vader.tsv");
 67 |         testFiles.add("nytEditorialSnippets_GroundTruth_vader.tsv");
 68 |         testFiles.add("tweets_GroundTruth_vader.tsv");
 69 |     }
 70 | 
 71 |     @Test
 72 |     public void readGroundTruth() {
 73 |         for (String fileName : testFiles) {
 74 |             InputStream inputStream = loader.getResourceAsStream(fileName);
 75 |             try (BufferedReader br = new BufferedReader(new InputStreamReader(inputStream))) {
 76 |                 String line;
 77 |                 while ((line = br.readLine()) != null) {
 78 |                     String[] gtFileData = line.split("\\t");
 79 | 
 80 |                     float expectedNegativeScore = Float.parseFloat(gtFileData[1]);
 81 |                     float expectedNeutralScore = Float.parseFloat(gtFileData[2]);
 82 |                     float expectedPositiveScore = Float.parseFloat(gtFileData[3]);
 83 |                     float expectedCompoundScore = Float.parseFloat(gtFileData[4]);
 84 |                     String inputString = gtFileData[5];
 85 | 
 86 |                     SentimentPolarities inputStringPolarity = SentimentAnalyzer.getScoresFor(inputString);
 87 |                     float actualNegativeScore = inputStringPolarity.getNegativePolarity();
 88 |                     float actualPositiveScore = inputStringPolarity.getPositivePolarity();
 89 |                     float actualNeutralScore = inputStringPolarity.getNeutralPolarity();
 90 |                     float actualCompoundScore = inputStringPolarity.getCompoundPolarity();
 91 | 
 92 |                     Assert.assertFalse(
 93 |                         getErrorMessage(inputString, actualNegativeScore, expectedNegativeScore, "Negative Score"),
 94 |                         error(actualNegativeScore, expectedNegativeScore)
 95 |                     );
 96 |                     Assert.assertFalse(
 97 |                         getErrorMessage(inputString, actualPositiveScore, expectedPositiveScore, "Positive Score"),
 98 |                         error(actualPositiveScore, expectedPositiveScore)
 99 |                     );
100 |                     Assert.assertFalse(
101 |                         getErrorMessage(inputString, actualNeutralScore, expectedNeutralScore, "Neutral Score"),
102 |                         error(actualNeutralScore, expectedNeutralScore)
103 |                     );
104 |                     Assert.assertFalse(
105 |                         getErrorMessage(inputString, actualCompoundScore, expectedCompoundScore, "Compound Score"),
106 |                         error(actualCompoundScore, expectedCompoundScore)
107 |                     );
108 |                 }
109 |             } catch (IOException e) {
110 |                 e.printStackTrace();
111 |             }
112 |             logger.info("Test passed for {}", fileName);
113 |         }
114 |     }
115 | 
116 |     private String getErrorMessage(String message, float actual, float expected, String type) {
117 |         return String.format("Test String: %s ==> %s (actual = %s, expected = %s)", message, type, actual, expected);
118 |     }
119 | 
120 |     /**
121 |      * Count the number of digits in the fractional section.
122 |      *
123 |      * @param value float value
124 |      * @return length of fractional part of decimal number.
125 |      */
126 |     private static int fractionalPartLength(float value) {
127 |         String text = Float.toString(Math.abs(value));
128 |         return text.length() - text.indexOf('.') - 1;
129 |     }
130 | 
131 |     /**
132 |      * Due to Floating Point Precision errors results used to differ by 1
133 |      * e.g. 0.0345 from NLTK might be 0.0344 or 0.0346 when calculated
134 |      * in Java. This was mainly due to rounding off errors.
135 |      * To handle this the difference between two values should not be
136 |      * greater than 1.
137 |      * <p>
138 |      * error(0.0345, 0.0344) => false
139 |      * error(0.0345, 0.0346) => false
140 |      * error(0.0345, 0.0348) => true
141 |      *
142 |      * @param actual     actual value
143 |      * @param experiment experiment value
144 |      * @return true if the difference between actual and experiment is
145 |      * greater than 1.0
146 |      */
147 |     private boolean error(float actual, float experiment) {
148 |         int maxPlaces = Math.max(fractionalPartLength(actual), fractionalPartLength(experiment));
149 |         return ((Math.abs(Math.abs(actual * maxPlaces) - Math.abs(experiment * maxPlaces))) > 1.0);
150 |     }
151 | 
152 |     public static void main(String[] files)
153 |         throws Exception {
154 |         for (String file : files) {
155 |             System.out.printf("Analyzing file %s...%n", file);
156 |             byte[] fileBytes = Files.readAllBytes(Paths.get(file));
157 |             String text = new String(fileBytes, StandardCharsets.UTF_8);
158 |             long startTime = System.nanoTime();
159 |             SentimentPolarities sp = SentimentAnalyzer.getScoresFor(text);
160 |             long endTime = System.nanoTime();
161 |             System.out.printf("%s (%,d ms)%n", sp, TimeUnit.NANOSECONDS.toMillis(endTime - startTime));
162 |         }
163 |     }
164 | }
165 | 


--------------------------------------------------------------------------------
/src/test/resources/getNltkVader.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from nltk.sentiment.vader import SentimentIntensityAnalyzer
 4 | from unidecode import unidecode
 5 | 
"""
This script uses NLTK to get the VADER sentiment polarities for every ground truth file listed in
ground_truth_file_list, including the 4000 Tweets from "tweets_GroundTruth.txt".
DATASET: http://comp.social.gatech.edu/papers/hutto_ICWSM_2014.tar.gz
PAPER: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf

Each "<name>_vader.tsv" file created by this script (e.g. tweets_GroundTruth_vader.tsv) serves as the
ground truth for comparing results of the Java port of the NLTK VADER sentiment analyzer.
"""
14 | 
# Analyzer instance used by the processing loop to score each line.
sid = SentimentIntensityAnalyzer()

# Ground truth inputs, given as relative paths; the order here is the processing order.
ground_truth_file_list = list(
    "GroundTruth/" + dataset + "_GroundTruth.txt"
    for dataset in (
        "tweets",
        "amazonReviewSnippets",
        "movieReviewSnippets",
        "nytEditorialSnippets",
    )
)
23 | 
24 | 
def remove_non_ascii(text):
    """
    Decode UTF-8 input and pass it through unidecode.

    :param text: UTF-8 encoded bytes (what the script reads from the ground truth files),
        or text that is already decoded
    :return: the result of unidecode for the decoded text
    """
    # The "unicode" builtin exists only on Python 2; bytes.decode gives the same
    # result there and also works on Python 3.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return unidecode(text)
27 | 
28 | 
# For every ground truth file, write "<name>_vader.tsv" into the current working directory with
# one row per input line: id, neg, neu, pos, compound and the ASCII-transliterated text.
for test_file in ground_truth_file_list:
    # "GroundTruth/tweets_GroundTruth.txt" -> "tweets_GroundTruth"
    current_file = test_file.split("/")[1].split(".")[0]
    output_filename = current_file + "_vader.tsv"
    # Both files stay in binary mode so the bytes/text boundary is explicit and the
    # script behaves the same on Python 2 and Python 3.
    with open(output_filename, "wb") as csv_file:
        with open(test_file, "rb") as tweets:
            # Iterate the file lazily instead of loading every line with readlines().
            for line in tweets:
                # Each line is expected to hold exactly three tab separated fields;
                # the middle one is not used.
                tweet_id, _, tweet = line.split(b"\t")
                # remove_non_ascii receives the raw UTF-8 bytes and returns text.
                tweet = remove_non_ascii(tweet.strip())
                ss = sid.polarity_scores(tweet)
                row = u"\t".join([tweet_id.decode("utf-8"), str(ss["neg"]), str(ss["neu"]), str(ss["pos"]),
                                  str(ss["compound"]), tweet.strip()]) + u"\n"
                csv_file.write(row.encode("utf-8"))
        # print() with a single pre-joined string produces the same output as the former
        # Python 2 print statement and is also valid on Python 3.
        print(" ".join(["Created output for ", test_file, "as", output_filename]))
41 | 


--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Root logger option
 2 | log4j.rootLogger=ERROR, stdout, file
 3 | 
 4 | # Redirect log messages to console
 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
 6 | log4j.appender.stdout.Target=System.out
 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
 9 | 
10 | # Redirect log messages to a log file, support file rolling.
11 | log4j.appender.file=org.apache.log4j.RollingFileAppender
12 | log4j.appender.file.File=E:\\es_source\\vader-sentiment-app.log
13 | log4j.appender.file.MaxFileSize=5MB
14 | log4j.appender.file.MaxBackupIndex=10
15 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
16 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
17 | 


--------------------------------------------------------------------------------