lexDictionary = new HashMap<>();
207 | if (lexFile != null) {
208 | try (BufferedReader br = new BufferedReader(new InputStreamReader(lexFile, StandardCharsets.UTF_8))) {
209 | String line;
210 | while ((line = br.readLine()) != null) {
211 | final String[] lexFileData = line.split("\\t");
212 | final String currentText = lexFileData[0];
213 | final Float currentTextValence = Float.parseFloat(lexFileData[1]);
214 | lexDictionary.put(currentText, currentTextValence);
215 | }
216 | } catch (IOException ex) {
217 | LoggerFactory.getLogger(Utils.class).error("vader_sentiment_lexicon.txt file not found", ex);
218 | }
219 | }
220 | return Collections.unmodifiableMap(lexDictionary);
221 | }
222 | }
223 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/Valence.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.util;
26 |
/**
 * Default valences and the factors used to modify a valence.
 * Each constant wraps a single {@code float} that is read through {@link #getValue()}.
 *
 * @author Animesh Pandey
 */
public enum Valence {
    /** Default valence for a token that boosts. */
    DEFAULT_BOOSTING(0.293F),

    /** Default valence for a token that damps. */
    DEFAULT_DAMPING(-0.293F),

    /**
     * Boosting factor for tokens written in ALL CAPS.
     * NOTE(review): inferred from the constant's name; confirm against the analyzer.
     */
    ALL_CAPS_FACTOR(0.733F),

    /** When a negative word is encountered, its valence is reduced by this factor. */
    NEGATIVE_WORD_DAMPING_FACTOR(-0.74F),

    /** Boosting factor for strings containing a '!'. */
    EXCLAMATION_BOOSTING(0.292F),

    /** Boosting factor for strings containing a '?'. */
    QUESTION_MARK_BOOSTING(0.96F),

    /** Boosting factor for strings containing 3 or more '?'s. */
    QUESTION_MARK_MAX_COUNT_BOOSTING(0.18F),

    /** When the preceding trigram has a "never" type phrase, the negative valence grows by 25%. */
    PRECEDING_TRIGRAM_HAVING_NEVER_DAMPING_FACTOR(1.25F),

    /** When the preceding bigram has a "never" type phrase, the negative valence grows by 50%. */
    PRECEDING_BIGRAM_HAVING_NEVER_DAMPING_FACTOR(1.5F),

    /** At a distance of 1 from the current token, the current gram's valence shrinks by 5%. */
    ONE_WORD_DISTANCE_DAMPING_FACTOR(0.95F),

    /** At a distance of 2 from the current token, the current gram's valence shrinks by 10%. */
    TWO_WORD_DISTANCE_DAMPING_FACTOR(0.9F),

    /** When the conjunction comes after the current token, the valence shrinks by 50%. */
    PRE_CONJUNCTION_ADJUSTMENT_FACTOR(0.5F),

    /** When the conjunction comes before the current token, the valence grows by 50%. */
    POST_CONJUNCTION_ADJUSTMENT_FACTOR(1.5F);

    /** The number carried by this constant. */
    private final float value;

    /**
     * Binds a constant to its number.
     *
     * @param valenceValue valence value
     */
    Valence(final float valenceValue) {
        value = valenceValue;
    }

    /**
     * Returns the number carried by this constant.
     *
     * @return the valence value or factor
     */
    public float getValue() {
        return this.value;
    }
}
117 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | /**
26 | * Package containing utility classes.
27 | *
28 | * @author Animesh Pandey
29 | */
30 | package com.vader.sentiment.util;
31 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=ERROR, stdout, file
3 |
4 | # Redirect log messages to console
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
10 | # Redirect log messages to a log file, support file rolling.
11 | log4j.appender.file=org.apache.log4j.RollingFileAppender
12 | log4j.appender.file.File=/tmp/vader-sentiment-app.log
13 | log4j.appender.file.MaxFileSize=5MB
14 | log4j.appender.file.MaxBackupIndex=10
15 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
16 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
17 |
--------------------------------------------------------------------------------
/src/test/java/com/vader/sentiment/analyzer/SentimentAnalyzerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.analyzer;
26 |
27 | import java.io.BufferedReader;
28 | import java.io.InputStream;
29 | import java.io.InputStreamReader;
30 | import java.io.IOException;
31 | import java.nio.charset.StandardCharsets;
32 | import java.nio.file.Files;
33 | import java.nio.file.Paths;
34 | import java.util.ArrayList;
35 | import java.util.List;
36 | import java.util.concurrent.TimeUnit;
37 |
38 | import org.junit.Assert;
39 | import org.junit.BeforeClass;
40 | import org.junit.Test;
41 | import org.slf4j.Logger;
42 | import org.slf4j.LoggerFactory;
43 |
44 | /**
45 | * This tests confirms if the port from Python NLTK was correct.
46 | * The sentiment scores are pre-computed for Python and them compared
47 | * with same text input using the Java implementation.
48 | * The sentiment scores are supposed to be equal.
49 | *
50 | * NOTE: There are some issues with floating point precision differences
51 | * between Python and Java.
52 | *
53 | * @author Animesh Pandey
54 | * @see
55 | * http://github.com/apanimesh061/VaderSentimentJava/commit/d1d30c4ceeb356ec838f8abac70514bd21a92b4b
56 | *
57 | */
58 | public class SentimentAnalyzerTest {
59 | private static final ClassLoader loader = SentimentAnalyzerTest.class.getClassLoader();
60 | private static List testFiles = new ArrayList<>();
61 | private static Logger logger = LoggerFactory.getLogger(SentimentAnalyzerTest.class);
62 |
63 | @BeforeClass
64 | public static void setUpTestFiles() {
65 | testFiles.add("amazonReviewSnippets_GroundTruth_vader.tsv");
66 | testFiles.add("movieReviewSnippets_GroundTruth_vader.tsv");
67 | testFiles.add("nytEditorialSnippets_GroundTruth_vader.tsv");
68 | testFiles.add("tweets_GroundTruth_vader.tsv");
69 | }
70 |
71 | @Test
72 | public void readGroundTruth() {
73 | for (String fileName : testFiles) {
74 | InputStream inputStream = loader.getResourceAsStream(fileName);
75 | try (BufferedReader br = new BufferedReader(new InputStreamReader(inputStream))) {
76 | String line;
77 | while ((line = br.readLine()) != null) {
78 | String[] gtFileData = line.split("\\t");
79 |
80 | float expectedNegativeScore = Float.parseFloat(gtFileData[1]);
81 | float expectedNeutralScore = Float.parseFloat(gtFileData[2]);
82 | float expectedPositiveScore = Float.parseFloat(gtFileData[3]);
83 | float expectedCompoundScore = Float.parseFloat(gtFileData[4]);
84 | String inputString = gtFileData[5];
85 |
86 | SentimentPolarities inputStringPolarity = SentimentAnalyzer.getScoresFor(inputString);
87 | float actualNegativeScore = inputStringPolarity.getNegativePolarity();
88 | float actualPositiveScore = inputStringPolarity.getPositivePolarity();
89 | float actualNeutralScore = inputStringPolarity.getNeutralPolarity();
90 | float actualCompoundScore = inputStringPolarity.getCompoundPolarity();
91 |
92 | Assert.assertFalse(
93 | getErrorMessage(inputString, actualNegativeScore, expectedNegativeScore, "Negative Score"),
94 | error(actualNegativeScore, expectedNegativeScore)
95 | );
96 | Assert.assertFalse(
97 | getErrorMessage(inputString, actualPositiveScore, expectedPositiveScore, "Positive Score"),
98 | error(actualPositiveScore, expectedPositiveScore)
99 | );
100 | Assert.assertFalse(
101 | getErrorMessage(inputString, actualNeutralScore, expectedNeutralScore, "Neutral Score"),
102 | error(actualNeutralScore, expectedNeutralScore)
103 | );
104 | Assert.assertFalse(
105 | getErrorMessage(inputString, actualCompoundScore, expectedCompoundScore, "Compound Score"),
106 | error(actualCompoundScore, expectedCompoundScore)
107 | );
108 | }
109 | } catch (IOException e) {
110 | e.printStackTrace();
111 | }
112 | logger.info("Test passed for {}", fileName);
113 | }
114 | }
115 |
116 | private String getErrorMessage(String message, float actual, float expected, String type) {
117 | return String.format("Test String: %s ==> %s (actual = %s, expected = %s)", message, type, actual, expected);
118 | }
119 |
120 | /**
121 | * Count the number of digits in the fractional section.
122 | *
123 | * @param value float value
124 | * @return length of fractional part of decimal number.
125 | */
126 | private static int fractionalPartLength(float value) {
127 | String text = Float.toString(Math.abs(value));
128 | return text.length() - text.indexOf('.') - 1;
129 | }
130 |
131 | /**
132 | * Due to Floating Point Precision errors results used to differ by 1
133 | * e.g. 0.0345 from NLTK might be 0.0344 or 0.0346 when calculated
134 | * in Java. This was mainly due to rounding off errors.
135 | * To handle this the difference between two values should not be
136 | * greater than 1.
137 | *
138 | * error(0.0345, 0.0344) => false
139 | * error(0.0345, 0.0346) => false
140 | * error(0.0345, 0.0348) => true
141 | *
142 | * @param actual actual value
143 | * @param experiment experiment value
144 | * @return true if the difference between actual and experiment is
145 | * greater than 1.0
146 | */
147 | private boolean error(float actual, float experiment) {
148 | int maxPlaces = Math.max(fractionalPartLength(actual), fractionalPartLength(experiment));
149 | return ((Math.abs(Math.abs(actual * maxPlaces) - Math.abs(experiment * maxPlaces))) > 1.0);
150 | }
151 |
152 | public static void main(String[] files)
153 | throws Exception {
154 | for (String file : files) {
155 | System.out.printf("Analyzing file %s...%n", file);
156 | byte[] fileBytes = Files.readAllBytes(Paths.get(file));
157 | String text = new String(fileBytes, StandardCharsets.UTF_8);
158 | long startTime = System.nanoTime();
159 | SentimentPolarities sp = SentimentAnalyzer.getScoresFor(text);
160 | long endTime = System.nanoTime();
161 | System.out.printf("%s (%,d ms)%n", sp, TimeUnit.NANOSECONDS.toMillis(endTime - startTime));
162 | }
163 | }
164 | }
165 |
--------------------------------------------------------------------------------
/src/test/resources/getNltkVader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from nltk.sentiment.vader import SentimentIntensityAnalyzer
4 | from unidecode import unidecode
5 |
6 | """
7 | This script uses the NLTK to get the sentiment polarities of 4000 Tweets from "tweets_GroundTruth.txt"
8 | DATASET: http://comp.social.gatech.edu/papers/hutto_ICWSM_2014.tar.gz
9 | PAPER: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf
10 |
11 | The file tweets_GroundTruth_vader.tsv created using this script serves as the ground truth for comparing
12 | results of the Java port of the NLTK VADER sentiment analyzer.
13 | """
14 |
# One analyzer instance, reused for every line of every input file.
sid = SentimentIntensityAnalyzer()

# Input files, built from the dataset names; the order is the processing order.
ground_truth_file_list = list(
    "GroundTruth/%s_GroundTruth.txt" % dataset
    for dataset in (
        "tweets",
        "amazonReviewSnippets",
        "movieReviewSnippets",
        "nytEditorialSnippets",
    )
)
23 |
24 |
def remove_non_ascii(text):
    """Decode ``text`` as UTF-8 and return the result of passing it through ``unidecode``."""
    decoded = unicode(text, encoding="utf-8")
    return unidecode(decoded)
27 |
28 |
29 | for test_file in ground_truth_file_list:
30 | current_file = test_file.split("/")[1].split(".")[0]
31 | output_filename = current_file + "_vader.tsv"
32 | with open(output_filename, "wb") as csv_file:
33 | with open(test_file, "rb") as tweets:
34 | for line in tweets.readlines():
35 | tweet_id, _, tweet = line.split("\t")
36 | tweet = remove_non_ascii(tweet.strip())
37 | ss = sid.polarity_scores(tweet)
38 | csv_file.write("\t".join([tweet_id, str(ss["neg"]), str(ss["neu"]), str(ss["pos"]), str(ss["compound"]),
39 | tweet.strip()]) + "\n")
40 | print "Created output for ", test_file, "as", output_filename
41 |
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=ERROR, stdout, file
3 |
4 | # Redirect log messages to console
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
# Redirect log messages to a log file, support file rolling.
# The file goes to the JVM temp directory: log4j substitutes ${java.io.tmpdir}
# from the system properties, so the test run no longer depends on a
# machine-specific drive and folder.
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=${java.io.tmpdir}/vader-sentiment-app.log
log4j.appender.file.MaxFileSize=5MB
log4j.appender.file.MaxBackupIndex=10
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
17 |
--------------------------------------------------------------------------------