├── .gitignore
├── .travis.yml
├── src
├── test
│ ├── resources
│ │ ├── log4j.properties
│ │ └── getNltkVader.py
│ └── java
│ │ └── com
│ │ └── vader
│ │ └── sentiment
│ │ └── analyzer
│ │ └── SentimentAnalyzerTest.java
└── main
│ ├── resources
│ └── log4j.properties
│ ├── assemblies
│ └── assembly.xml
│ ├── java
│ └── com
│ │ └── vader
│ │ └── sentiment
│ │ ├── util
│ │ ├── package-info.java
│ │ ├── SentimentModifyingTokens.java
│ │ ├── Constants.java
│ │ ├── Valence.java
│ │ └── Utils.java
│ │ ├── analyzer
│ │ ├── package-info.java
│ │ ├── RawSentimentScores.java
│ │ ├── SentimentPolarities.java
│ │ └── SentimentAnalyzer.java
│ │ └── processor
│ │ ├── package-info.java
│ │ ├── InputAnalyzerInterface.java
│ │ ├── InputAnalyzer.java
│ │ └── TextProperties.java
│ └── checkstyle
│ └── checkstyle.xml
├── pom.xml
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.iml
3 | target
4 | build
5 | out
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: java
3 | install: mvn install -Dgpg.skip=true
4 | jdk:
5 | - oraclejdk8
6 |
7 | cache:
8 | directories:
9 | - $HOME/.m2
10 |
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=ERROR, stdout, file
3 |
4 | # Redirect log messages to console
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
10 | # Redirect log messages to a rolling log file in the JVM temp directory (portable across operating systems).
11 | log4j.appender.file=org.apache.log4j.RollingFileAppender
12 | log4j.appender.file.File=${java.io.tmpdir}/vader-sentiment-app.log
13 | log4j.appender.file.MaxFileSize=5MB
14 | log4j.appender.file.MaxBackupIndex=10
15 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
16 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
17 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=ERROR, stdout, file
3 |
4 | # Redirect log messages to console
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
10 | # Redirect log messages to a log file, support file rolling.
11 | log4j.appender.file=org.apache.log4j.RollingFileAppender
12 | log4j.appender.file.File=/tmp/vader-sentiment-app.log
13 | log4j.appender.file.MaxFileSize=5MB
14 | log4j.appender.file.MaxBackupIndex=10
15 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
16 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
17 |
--------------------------------------------------------------------------------
/src/main/assemblies/assembly.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | jar-with-dependencies
6 |
7 |
8 | jar
9 |
10 |
11 | false
12 |
13 |
14 |
15 |
16 | /
17 | true
18 | true
19 | true
20 | runtime
21 |
22 |
23 | org.apache.lucene:lucene-analyzers-common
24 | commons-lang:commons-lang
25 | log4j:log4j
26 | junit:junit
27 | **/log4j.properties
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | /**
26 | * Package containing utility classes.
27 | *
28 | * @author Animesh Pandey
29 | */
30 | package com.vader.sentiment.util;
31 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/analyzer/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | /**
26 | * Package containing analyzer classes.
27 | *
28 | * @author Animesh Pandey
29 | */
30 | package com.vader.sentiment.analyzer;
31 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/processor/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | /**
26 | * Package containing classes that pre-process (tokenize) the input text.
27 | *
28 | * @author Animesh Pandey
29 | */
30 | package com.vader.sentiment.processor;
31 |
--------------------------------------------------------------------------------
/src/test/resources/getNltkVader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from nltk.sentiment.vader import SentimentIntensityAnalyzer
4 | from unidecode import unidecode
5 |
6 | """
7 | This script uses NLTK to get the sentiment polarities of the texts in the ground-truth files listed below (e.g. the 4000 Tweets from "tweets_GroundTruth.txt")
8 | DATASET: http://comp.social.gatech.edu/papers/hutto_ICWSM_2014.tar.gz
9 | PAPER: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf
10 |
11 | The "<name>_vader.tsv" files (e.g. tweets_GroundTruth_vader.tsv) created using this script serve as the ground truth for comparing
12 | results of the Java port of the NLTK VADER sentiment analyzer.
13 | """
14 |
15 | sid = SentimentIntensityAnalyzer()
16 |
17 | ground_truth_file_list = [
18 | "GroundTruth/tweets_GroundTruth.txt",
19 | "GroundTruth/amazonReviewSnippets_GroundTruth.txt",
20 | "GroundTruth/movieReviewSnippets_GroundTruth.txt",
21 | "GroundTruth/nytEditorialSnippets_GroundTruth.txt"
22 | ]
23 |
24 |
25 | def remove_non_ascii(text):  # NOTE(review): relies on the Python 2 builtin `unicode`
26 | return unidecode(unicode(text, encoding="utf-8"))  # decode the input as UTF-8, then pass it through unidecode
27 |
28 |
29 | for test_file in ground_truth_file_list:  # one output TSV is written per ground-truth file
30 | current_file = test_file.split("/")[1].split(".")[0]  # file name without directory and extension
31 | output_filename = current_file + "_vader.tsv"
32 | with open(output_filename, "wb") as csv_file:
33 | with open(test_file, "rb") as tweets:
34 | for line in tweets.readlines():
35 | tweet_id, _, tweet = line.split("\t")  # expects exactly three tab-separated columns; the middle one is ignored
36 | tweet = remove_non_ascii(tweet.strip())
37 | ss = sid.polarity_scores(tweet)
38 | csv_file.write("\t".join([tweet_id, str(ss["neg"]), str(ss["neu"]), str(ss["pos"]), str(ss["compound"]),
39 | tweet.strip()]) + "\n")  # output columns: id, neg, neu, pos, compound, text
40 | print "Created output for ", test_file, "as", output_filename  # Python 2 print statement
41 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/SentimentModifyingTokens.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.util;
26 |
27 | /**
28 | * This is the list of tokens that modify the valence of other tokens if found in the same string.
29 | *
30 | * @author Animesh Pandey
31 | */
32 | //CHECKSTYLE.OFF: Javadoc*
33 | public enum SentimentModifyingTokens {
34 | NEVER("never"),
35 | SO("so"),
36 | THIS("this"),
37 | AT("at"),
38 | LEAST("least"),
39 | KIND("kind"),
40 | OF("of"),
41 | VERY("very"),
42 | BUT("but"),
43 | EXCLAMATION_MARK("!"),
44 | QUESTION_MARK("?"),
45 | CONTRACTION("n't");
46 |
47 | private final String value; // literal text of the token, as passed to the constructor
48 |
49 | SentimentModifyingTokens(String value) {
50 | this.value = value;
51 | }
52 |
53 | public String getValue() {
54 | return value;
55 | }
56 | }
57 | //CHECKSTYLE.ON: Javadoc*
58 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/processor/InputAnalyzerInterface.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.processor;
26 |
27 | import java.io.IOException;
28 | import java.util.function.Consumer;
29 |
30 | /**
31 | * This interface defines two ways of splitting up a raw string: keeping or removing punctuation.
32 | *
33 | * @author Animesh Pandey
34 | */
35 | interface InputAnalyzerInterface {
36 | /**
37 | * This method performs tokenization without punctuation removal.
38 | *
39 | * @param inputString The input string to be pre-processed with Lucene tokenizer
40 | * @param tokenConsumer The consumer that receives the tokens
41 | * @throws IOException if Lucene's analyzer encounters any error
42 | */
43 | void keepPunctuation(String inputString, Consumer tokenConsumer) throws IOException;
44 |
45 | /**
46 | * This method performs tokenization with punctuation removal.
47 | *
48 | * @param inputString The input string to be pre-processed with Lucene tokenizer
49 | * @param tokenConsumer The consumer that receives the tokens
50 | * @throws IOException if Lucene's analyzer encounters any error
51 | */
52 | void removePunctuation(String inputString, Consumer tokenConsumer) throws IOException;
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/analyzer/RawSentimentScores.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.analyzer;
26 |
27 | /**
28 | * Immutable holder for the three types of raw sentiment scores, which are non-normalized.
29 | *
30 | * @author Animesh Pandey
31 | */
32 | public final class RawSentimentScores {
33 | /**
34 | * This is the raw positive sentiment score.
35 | */
36 | private final float positiveScore;
37 |
38 | /**
39 | * This is the raw negative sentiment score.
40 | */
41 | private final float negativeScore;
42 |
43 | /**
44 | * This is the raw neutral sentiment score.
45 | */
46 | private final float neutralScore;
47 |
48 | /**
49 | * Creates an object of this class and sets all the fields.
50 | *
51 | * @param positiveScore raw positive score
52 | * @param negativeScore raw negative score
53 | * @param neutralScore raw neutral score
54 | */
55 | public RawSentimentScores(float positiveScore, float negativeScore, float neutralScore) {
56 | this.positiveScore = positiveScore;
57 | this.negativeScore = negativeScore;
58 | this.neutralScore = neutralScore;
59 | }
60 |
61 | public float getPositiveScore() {
62 | return positiveScore;
63 | }
64 |
65 | public float getNegativeScore() {
66 | return negativeScore;
67 | }
68 |
69 | public float getNeutralScore() {
70 | return neutralScore;
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/Constants.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.util;
26 |
27 | import java.util.regex.Pattern;
28 |
29 | /**
30 | * This class defines constants that are used during the computation of the sentiment scores.
31 | *
32 | * @author Animesh Pandey
33 | */
34 | public final class Constants {
35 | /**
36 | * Max allowed question marks in a string.
37 | * Beyond this value the effect of the question marks will be considered the same.
38 | *
39 | * @see SentimentModifyingTokens#QUESTION_MARK
40 | */
41 | public static final int MAX_QUESTION_MARKS = 3;
42 |
43 | /**
44 | * Window size for preceding trigram.
45 | */
46 | public static final int PRECEDING_TRIGRAM_WINDOW = 3;
47 |
48 | /**
49 | * Window size for preceding bigram.
50 | */
51 | public static final int PRECEDING_BIGRAM_WINDOW = 2;
52 |
53 | /**
54 | * Window size for preceding unigram.
55 | */
56 | public static final int PRECEDING_UNIGRAM_WINDOW = 1;
57 |
58 | /**
59 | * Maximum number of exclamation marks that could be processed.
60 | */
61 | public static final int MAX_EXCLAMATION_MARKS = 4;
62 |
63 | /**
64 | * This is the window size within which processing will be done.
65 | * This means that we would be dealing only with unigrams, bigrams and
66 | * trigrams.
67 | */
68 | public static final int MAX_GRAM_WINDOW_SIZE = 3;
69 |
70 | /**
71 | * This alpha approximates the max expected value for a sentiment score.
72 | */
73 | public static final float DEFAULT_ALPHA = 15.0F;
74 |
75 | /**
76 | * This regex matches a string that contains at least one ASCII letter (a-z or A-Z).
77 | */
78 | public static final Pattern NON_NUMERIC_STRING_REGEX = Pattern.compile(".*[a-zA-Z]+.*");
79 |
80 | /**
81 | * This string defines the prefix for a string that has an HTTP URL.
82 | */
83 | public static final String HTTP_URL_PREFIX = "http://";
84 |
85 | /**
86 | * This string defines the prefix for a string that has an HTTPS URL.
87 | */
88 | public static final String HTTPS_URL_PREFIX = "https://";
89 |
90 | /**
91 | * The separator for a word N-gram.
92 | */
93 | public static final String SPACE_SEPARATOR = " ";
94 |
95 | /**
96 | * Private constructor that prevents instantiation of this utility class.
97 | */
98 | private Constants() {
99 |
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/Valence.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.util;
26 |
27 | /**
28 | * List of default values of valence modifiers.
29 | * This list has values as well as factors that modify the valence.
30 | *
31 | * @author Animesh Pandey
32 | */
33 | public enum Valence {
34 | /**
35 | * This denotes the default valence for tokens that boost.
36 | */
37 | DEFAULT_BOOSTING(0.293F),
38 |
39 | /**
40 | * This denotes the default valence for tokens that dampen.
41 | */
42 | DEFAULT_DAMPING(-0.293F),
43 |
44 | /**
45 | * Boosting factor for text emphasised with ALL CAPS.
46 | */
47 | ALL_CAPS_FACTOR(0.733F),
48 |
49 | /**
50 | * If a negative word is encountered, its valence is reduced by this factor.
51 | */
52 | NEGATIVE_WORD_DAMPING_FACTOR(-0.74F),
53 |
54 | /**
55 | * Boosting factor for strings having a '!'.
56 | */
57 | EXCLAMATION_BOOSTING(0.292F),
58 |
59 | /**
60 | * Boosting factor for strings having a '?'.
61 | */
62 | QUESTION_MARK_BOOSTING(0.96F),
63 |
64 | /**
65 | * Boosting factor for strings having 3 or more '?'s.
66 | */
67 | QUESTION_MARK_MAX_COUNT_BOOSTING(0.18F),
68 |
69 | /**
70 | * If the preceding trigram has a "never" type phrase, increase the negative valence by 25%.
71 | */
72 | PRECEDING_TRIGRAM_HAVING_NEVER_DAMPING_FACTOR(1.25F),
73 |
74 | /**
75 | * If the preceding bigram has a "never" type phrase, increase the negative valence by 50%.
76 | */
77 | PRECEDING_BIGRAM_HAVING_NEVER_DAMPING_FACTOR(1.5F),
78 |
79 | /**
80 | * At a distance of 1 from the current token, reduce the current gram's valence by 5%.
81 | */
82 | ONE_WORD_DISTANCE_DAMPING_FACTOR(0.95F),
83 |
84 | /**
85 | * At a distance of 2 from the current token, reduce the current gram's valence by 10%.
86 | */
87 | TWO_WORD_DISTANCE_DAMPING_FACTOR(0.9F),
88 |
89 | /**
90 | * If the conjunction is after the current token then reduce valence by 50%.
91 | */
92 | PRE_CONJUNCTION_ADJUSTMENT_FACTOR(0.5F),
93 |
94 | /**
95 | * If the conjunction is before the current token then increase valence by 50%.
96 | */
97 | POST_CONJUNCTION_ADJUSTMENT_FACTOR(1.5F);
98 |
99 | /**
100 | * Valence value.
101 | */
102 | private final float value;
103 |
104 | /**
105 | * Enum constructor.
106 | *
107 | * @param value valence value
108 | */
109 | Valence(float value) {
110 | this.value = value;
111 | }
112 |
113 | public float getValue() {
114 | return value;
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/processor/InputAnalyzer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.processor;
26 |
27 | import java.io.IOException;
28 | import java.io.StringReader;
29 | import java.util.function.Consumer;
30 | import org.apache.lucene.analysis.TokenStream;
31 | import org.apache.lucene.analysis.Tokenizer;
32 | import org.apache.lucene.analysis.core.WhitespaceTokenizer;
33 | import org.apache.lucene.analysis.miscellaneous.LengthFilter;
34 | import org.apache.lucene.analysis.standard.StandardTokenizer;
35 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
36 |
37 | /**
38 | * This class defines a Lucene analyzer that is applied on the input string in
39 | * {@link com.vader.sentiment.analyzer.SentimentAnalyzer}.
40 | *
41 | * @author Animesh Pandey
42 | */
43 | class InputAnalyzer implements InputAnalyzerInterface {
44 | /**
45 | * This function applies a Lucene tokenizer that splits a string into tokens of at least two characters.
46 | *
47 | * @param inputString The input string to be pre-processed with Lucene tokenizer
48 | * @param tokenizer The tokenizer to use for processing the input string
49 | * @param tokenConsumer The consumer that receives each token as a String
50 | * @throws IOException if Lucene's tokenizer encounters any error
51 | */
52 | protected void tokenize(final String inputString, final Tokenizer tokenizer,
53 | final Consumer tokenConsumer) throws IOException {
54 | tokenizer.setReader(new StringReader(inputString));
55 |
56 | try (TokenStream tokenStream = new LengthFilter(tokenizer, 2, Integer.MAX_VALUE)) {
57 | final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
58 | tokenStream.reset();
59 |
60 | while (tokenStream.incrementToken()) {
61 | tokenConsumer.accept(charTermAttribute.toString());
62 | }
63 |
64 | tokenStream.end();
65 | }
66 | }
67 |
68 | /**
69 | * Performs tokenization using Lucene's {@link WhitespaceTokenizer}, which splits the input on white space.
70 | * {@inheritDoc}
71 | */
72 | @Override
73 | public void keepPunctuation(final String inputString, final Consumer tokenConsumer) throws IOException {
74 | tokenize(inputString, new WhitespaceTokenizer(), tokenConsumer);
75 | }
76 |
77 | /**
78 | * Performs tokenization using Lucene's {@link StandardTokenizer}, which splits the input on white space
79 | * and also removes punctuation.
80 | * {@inheritDoc}
81 | */
82 | @Override
83 | public void removePunctuation(final String inputString, final Consumer tokenConsumer) throws IOException {
84 | tokenize(inputString, new StandardTokenizer(), tokenConsumer);
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/analyzer/SentimentPolarities.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.analyzer;
26 |
/**
 * Immutable holder for the normalized scores that are derived from {@link RawSentimentScores}.
 *
 * <p>The positive, neutral and negative polarities are the proportions of the text that fall in each
 * category, so they should add up to 1 (or close to it, given floating point arithmetic). These are the
 * most useful metrics if you want multidimensional measures of sentiment for a given sentence.
 *
 * <p>The compound polarity is computed by summing the valence scores of each word in the lexicon,
 * adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and
 * +1 (most extreme positive). It is the most useful metric if you want a single uni-dimensional measure
 * of sentiment for a given sentence. Calling it a "normalized, weighted composite score" is accurate.
 *
 * <p>Two instances are equal when all four of their scores are equal.
 *
 * @author Animesh Pandey
 */
public final class SentimentPolarities {
    /**
     * Multiplier used to combine the individual field hashes in {@link #hashCode()}.
     */
    private static final int HASH_MULTIPLIER = 31;

    /**
     * Proportion of the text that is positive.
     */
    private final float positivePolarity;

    /**
     * Proportion of the text that is negative.
     */
    private final float negativePolarity;

    /**
     * Proportion of the text that is neutral.
     */
    private final float neutralPolarity;

    /**
     * Normalized compound score.
     */
    private final float compoundPolarity;

    /**
     * Creates an object of this class and sets all the fields.
     *
     * @param positivePolarity proportion of text that is positive.
     * @param negativePolarity proportion of text that is negative.
     * @param neutralPolarity  proportion of text that is neutral.
     * @param compoundPolarity compound score.
     */
    public SentimentPolarities(float positivePolarity, float negativePolarity, float neutralPolarity,
                               float compoundPolarity) {
        this.positivePolarity = positivePolarity;
        this.negativePolarity = negativePolarity;
        this.neutralPolarity = neutralPolarity;
        this.compoundPolarity = compoundPolarity;
    }

    /**
     * Returns the state used when the string to be processed is empty, null or contains only
     * un-identified tokens. In that case all the polarities are set to zero.
     *
     * @return an object of {@link SentimentPolarities} class with all polarities set to 0.0.
     */
    public static SentimentPolarities emptySentimentState() {
        return new SentimentPolarities(0.0F, 0.0F, 0.0F, 0.0F);
    }

    /**
     * @return proportion of the text that is positive.
     */
    public float getPositivePolarity() {
        return positivePolarity;
    }

    /**
     * @return proportion of the text that is negative.
     */
    public float getNegativePolarity() {
        return negativePolarity;
    }

    /**
     * @return proportion of the text that is neutral.
     */
    public float getNeutralPolarity() {
        return neutralPolarity;
    }

    /**
     * @return the normalized compound score.
     */
    public float getCompoundPolarity() {
        return compoundPolarity;
    }

    /**
     * Compares the four scores field by field using {@link Float#compare(float, float)}.
     *
     * @param other the object to compare with
     * @return true iff {@code other} is a {@link SentimentPolarities} holding the same four scores
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof SentimentPolarities)) {
            return false;
        }
        final SentimentPolarities that = (SentimentPolarities) other;
        return Float.compare(positivePolarity, that.positivePolarity) == 0
            && Float.compare(negativePolarity, that.negativePolarity) == 0
            && Float.compare(neutralPolarity, that.neutralPolarity) == 0
            && Float.compare(compoundPolarity, that.compoundPolarity) == 0;
    }

    @Override
    public int hashCode() {
        int result = Float.hashCode(positivePolarity);
        result = HASH_MULTIPLIER * result + Float.hashCode(negativePolarity);
        result = HASH_MULTIPLIER * result + Float.hashCode(neutralPolarity);
        result = HASH_MULTIPLIER * result + Float.hashCode(compoundPolarity);
        return result;
    }

    @Override
    public String toString() {
        return "SentimentPolarities{"
            + "positivePolarity=" + positivePolarity
            + ", negativePolarity=" + negativePolarity
            + ", neutralPolarity=" + neutralPolarity
            + ", compoundPolarity=" + compoundPolarity
            + '}';
    }
}
113 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/processor/TextProperties.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.processor;
26 |
27 | import java.io.IOException;
28 | import java.util.ArrayList;
29 | import java.util.HashSet;
30 | import java.util.List;
31 | import java.util.Set;
32 | import com.vader.sentiment.util.Utils;
33 |
34 | /**
35 | * The TextProperties class implements the pre-processing steps of the input string for sentiment analysis.
36 | * It utilizes the Lucene analyzer to perform processing on the input string.
37 | *
38 | * @author Animesh Pandey
39 | */
40 | public final class TextProperties {
41 | /**
42 | * String whose properties will be extracted.
43 | */
44 | private final String inputText;
45 |
46 | /**
47 | * List of tokens and emoticons extracted from the {@link TextProperties#inputText}.
48 | */
49 | private List wordsAndEmoticons;
50 |
51 | /**
52 | * Set of tokens extracted from the {@link TextProperties#inputText}.
53 | * Emoticons are removed here.
54 | */
55 | private Set wordsOnly;
56 |
57 | /**
58 | * Flags that specifies if the current string has yelling words.
59 | */
60 | private boolean hasYellWords;
61 |
62 | /**
63 | * Parameterized constructor accepting the input string that will be processed.
64 | *
65 | * @param inputText the input string
66 | * @throws IOException if there is an issue with the lucene analyzers
67 | */
68 | public TextProperties(final String inputText) throws IOException {
69 | this.inputText = inputText;
70 | setWordsAndEmoticons();
71 | setHasYellWords(hasCapDifferential(getWordsAndEmoticons()));
72 | }
73 |
74 | /**
75 | * Tokenize the input text in two steps:
76 | * 1. Use Lucene analyzer to tokenize while preserving the punctuations, so that the emoticons are preserved.
77 | * 2. Remove punctuations from a token, if adjacent to it without a space and replace it with the original token.
78 | * e.g. going!!!! -> going OR !?!?there -> there
79 | *
80 | * @param unTokenizedText original text to be analyzed.
81 | * @param tokensWithoutPunctuations tokenized version of the input which has no punctuations.
82 | * @return tokenized version which preserves all the punctuations so that emoticons are preserved.
83 | * @throws IOException if there was an issue while Lucene was processing unTokenizedText
84 | */
85 | private List tokensAftersKeepingEmoticons(final String unTokenizedText,
86 | final Set tokensWithoutPunctuations) throws IOException {
87 | final List wordsAndEmoticonsList = new ArrayList<>();
88 | new InputAnalyzer().keepPunctuation(unTokenizedText, wordsAndEmoticonsList::add);
89 | wordsAndEmoticonsList.replaceAll(t -> stripPunctuations(t, tokensWithoutPunctuations));
90 | return wordsAndEmoticonsList;
91 | }
92 |
93 | /**
94 | * Remove punctuations from a token, if adjacent to it without a space and replace it with the original token.
95 | * e.g. going!!!! -> going OR !?!?there -> there
96 | *
97 | * @param token token that potentially includes punctuations.
98 | * @param tokensWithoutPunctuations tokenized version of the input which has no punctuations.
99 | * @return the token with any such punctuation removed from it, or the original token otherwise
100 | */
101 | private String stripPunctuations(String token, Set tokensWithoutPunctuations) {
102 | for (final String punct : Utils.PUNCTUATIONS) {
103 | if (token.startsWith(punct)) {
104 | final String strippedToken = token.substring(punct.length());
105 | if (tokensWithoutPunctuations.contains(strippedToken)) {
106 | return strippedToken;
107 | }
108 | } else if (token.endsWith(punct)) {
109 | final String strippedToken = token.substring(0, token.length() - punct.length());
110 | if (tokensWithoutPunctuations.contains(strippedToken)) {
111 | return strippedToken;
112 | }
113 | }
114 | }
115 | return token;
116 | }
117 |
118 | /**
119 | * This method tokenizes the input string, preserving the punctuation marks using a custom Lucene analyzer.
120 | *
121 | * @throws IOException if something goes wrong in the Lucene analyzer.
122 | * @see InputAnalyzer#tokenize(String, org.apache.lucene.analysis.Tokenizer, java.util.function.Consumer)
123 | */
124 | private void setWordsAndEmoticons() throws IOException {
125 | setWordsOnly();
126 | this.wordsAndEmoticons = tokensAftersKeepingEmoticons(inputText, wordsOnly);
127 | }
128 |
129 | /**
130 | * This method tokenizes the input string, removing the special characters as well.
131 | *
132 | * @throws IOException iff there is an error which using Lucene analyzers.
133 | * @see InputAnalyzer#removePunctuation(String, java.util.function.Consumer)
134 | */
135 | private void setWordsOnly() throws IOException {
136 | this.wordsOnly = new HashSet<>();
137 | new InputAnalyzer().removePunctuation(inputText, wordsOnly::add);
138 | }
139 |
140 | public List getWordsAndEmoticons() {
141 | return wordsAndEmoticons;
142 | }
143 |
144 | @SuppressWarnings("unused")
145 | public Set getWordsOnly() {
146 | return wordsOnly;
147 | }
148 |
149 | public boolean isYelling() {
150 | return hasYellWords;
151 | }
152 |
153 | private void setHasYellWords(boolean hasYellWords) {
154 | this.hasYellWords = hasYellWords;
155 | }
156 |
157 | /**
158 | * Return true iff the input has yelling words i.e. all caps in the tokens,
159 | * but all the token should not be in upper case.
160 | * e.g. [GET, THE, HELL, OUT] returns false
161 | * [GET, the, HELL, OUT] returns true
162 | * [get, the, hell, out] returns false
163 | *
164 | * @param tokenList a list of strings
165 | * @return boolean value
166 | */
167 | private boolean hasCapDifferential(List tokenList) {
168 | int countAllCaps = 0;
169 | for (String token : tokenList) {
170 | if (Utils.isUpper(token)) {
171 | countAllCaps++;
172 | }
173 | }
174 | final int capDifferential = tokenList.size() - countAllCaps;
175 | return (capDifferential > 0) && (capDifferential < tokenList.size());
176 | }
177 | }
178 |
--------------------------------------------------------------------------------
/src/test/java/com/vader/sentiment/analyzer/SentimentAnalyzerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.analyzer;
26 |
27 | import java.io.BufferedReader;
28 | import java.io.InputStream;
29 | import java.io.InputStreamReader;
30 | import java.io.IOException;
31 | import java.nio.charset.StandardCharsets;
32 | import java.nio.file.Files;
33 | import java.nio.file.Paths;
34 | import java.util.ArrayList;
35 | import java.util.List;
36 | import java.util.concurrent.TimeUnit;
37 |
38 | import org.junit.Assert;
39 | import org.junit.BeforeClass;
40 | import org.junit.Test;
41 | import org.slf4j.Logger;
42 | import org.slf4j.LoggerFactory;
43 |
44 | /**
45 | * This tests confirms if the port from Python NLTK was correct.
46 | * The sentiment scores are pre-computed for Python and them compared
47 | * with same text input using the Java implementation.
48 | * The sentiment scores are supposed to be equal.
49 | *
50 | * NOTE: There are some issues with floating point precision differences
51 | * between Python and Java.
52 | *
53 | * @author Animesh Pandey
54 | * @see
55 | * http://github.com/apanimesh061/VaderSentimentJava/commit/d1d30c4ceeb356ec838f8abac70514bd21a92b4b
56 | *
57 | */
58 | public class SentimentAnalyzerTest {
59 | private static final ClassLoader loader = SentimentAnalyzerTest.class.getClassLoader();
60 | private static List testFiles = new ArrayList<>();
61 | private static Logger logger = LoggerFactory.getLogger(SentimentAnalyzerTest.class);
62 |
63 | @BeforeClass
64 | public static void setUpTestFiles() {
65 | testFiles.add("amazonReviewSnippets_GroundTruth_vader.tsv");
66 | testFiles.add("movieReviewSnippets_GroundTruth_vader.tsv");
67 | testFiles.add("nytEditorialSnippets_GroundTruth_vader.tsv");
68 | testFiles.add("tweets_GroundTruth_vader.tsv");
69 | }
70 |
71 | @Test
72 | public void readGroundTruth() {
73 | for (String fileName : testFiles) {
74 | InputStream inputStream = loader.getResourceAsStream(fileName);
75 | try (BufferedReader br = new BufferedReader(new InputStreamReader(inputStream))) {
76 | String line;
77 | while ((line = br.readLine()) != null) {
78 | String[] gtFileData = line.split("\\t");
79 |
80 | float expectedNegativeScore = Float.parseFloat(gtFileData[1]);
81 | float expectedNeutralScore = Float.parseFloat(gtFileData[2]);
82 | float expectedPositiveScore = Float.parseFloat(gtFileData[3]);
83 | float expectedCompoundScore = Float.parseFloat(gtFileData[4]);
84 | String inputString = gtFileData[5];
85 |
86 | SentimentPolarities inputStringPolarity = SentimentAnalyzer.getScoresFor(inputString);
87 | float actualNegativeScore = inputStringPolarity.getNegativePolarity();
88 | float actualPositiveScore = inputStringPolarity.getPositivePolarity();
89 | float actualNeutralScore = inputStringPolarity.getNeutralPolarity();
90 | float actualCompoundScore = inputStringPolarity.getCompoundPolarity();
91 |
92 | Assert.assertFalse(
93 | getErrorMessage(inputString, actualNegativeScore, expectedNegativeScore, "Negative Score"),
94 | error(actualNegativeScore, expectedNegativeScore)
95 | );
96 | Assert.assertFalse(
97 | getErrorMessage(inputString, actualPositiveScore, expectedPositiveScore, "Positive Score"),
98 | error(actualPositiveScore, expectedPositiveScore)
99 | );
100 | Assert.assertFalse(
101 | getErrorMessage(inputString, actualNeutralScore, expectedNeutralScore, "Neutral Score"),
102 | error(actualNeutralScore, expectedNeutralScore)
103 | );
104 | Assert.assertFalse(
105 | getErrorMessage(inputString, actualCompoundScore, expectedCompoundScore, "Compound Score"),
106 | error(actualCompoundScore, expectedCompoundScore)
107 | );
108 | }
109 | } catch (IOException e) {
110 | e.printStackTrace();
111 | }
112 | logger.info("Test passed for {}", fileName);
113 | }
114 | }
115 |
116 | private String getErrorMessage(String message, float actual, float expected, String type) {
117 | return String.format("Test String: %s ==> %s (actual = %s, expected = %s)", message, type, actual, expected);
118 | }
119 |
120 | /**
121 | * Count the number of digits in the fractional section.
122 | *
123 | * @param value float value
124 | * @return length of fractional part of decimal number.
125 | */
126 | private static int fractionalPartLength(float value) {
127 | String text = Float.toString(Math.abs(value));
128 | return text.length() - text.indexOf('.') - 1;
129 | }
130 |
131 | /**
132 | * Due to Floating Point Precision errors results used to differ by 1
133 | * e.g. 0.0345 from NLTK might be 0.0344 or 0.0346 when calculated
134 | * in Java. This was mainly due to rounding off errors.
135 | * To handle this the difference between two values should not be
136 | * greater than 1.
137 | *
138 | * error(0.0345, 0.0344) => false
139 | * error(0.0345, 0.0346) => false
140 | * error(0.0345, 0.0348) => true
141 | *
142 | * @param actual actual value
143 | * @param experiment experiment value
144 | * @return true if the difference between actual and experiment is
145 | * greater than 1.0
146 | */
147 | private boolean error(float actual, float experiment) {
148 | int maxPlaces = Math.max(fractionalPartLength(actual), fractionalPartLength(experiment));
149 | return ((Math.abs(Math.abs(actual * maxPlaces) - Math.abs(experiment * maxPlaces))) > 1.0);
150 | }
151 |
152 | public static void main(String[] files)
153 | throws Exception {
154 | for (String file : files) {
155 | System.out.printf("Analyzing file %s...%n", file);
156 | byte[] fileBytes = Files.readAllBytes(Paths.get(file));
157 | String text = new String(fileBytes, StandardCharsets.UTF_8);
158 | long startTime = System.nanoTime();
159 | SentimentPolarities sp = SentimentAnalyzer.getScoresFor(text);
160 | long endTime = System.nanoTime();
161 | System.out.printf("%s (%,d ms)%n", sp, TimeUnit.NANOSECONDS.toMillis(endTime - startTime));
162 | }
163 | }
164 | }
165 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.github.apanimesh061
8 | vader-sentiment-analyzer
9 | 1.1.1
10 |
11 | jar
12 |
13 | Java port of Python NLTK Vader Sentiment Analyzer. VADER (Valence Aware Dictionary and sEntiment Reasoner)
14 | is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in
15 | social media, and works well on texts from other domains.
16 |
17 | https://github.com/apanimesh061/VaderSentimentJava/blob/master/README.md
18 | 2016
19 |
20 |
21 |
22 | MIT License
23 | http://www.opensource.org/licenses/mit-license.php
24 | repo
25 |
26 |
27 |
28 |
29 |
30 | apanimesh061
31 | Animesh Pandey
32 | apanimesh061@gmail.com
33 |
34 | Developer
35 |
36 |
37 |
38 |
39 |
40 | https://github.com/apanimesh061/VaderSentimentJava
41 | scm:git:git://github.com/apanimesh061/VaderSentimentJava.git
42 | scm:git:git@github.com:apanimesh061/VaderSentimentJava.git
43 |
44 |
45 |
46 | org.sonatype.oss
47 | oss-parent
48 | 7
49 |
50 |
51 |
52 | 1.8
53 | 1.8
54 |
55 | ${project.basedir}/src/main/assemblies/assembly.xml
56 |
57 |
58 | ${project.basedir}/src/main/checkstyle/checkstyle.xml
59 |
60 |
61 | 6.6.0
62 |
63 |
64 |
65 |
66 |
67 | ossrh
68 | https://oss.sonatype.org/content/repositories/snapshots
69 |
70 |
71 | ossrh
72 | https://oss.sonatype.org/service/local/staging/deploy/maven2/
73 |
74 |
75 |
76 |
77 |
78 |
79 | org.sonatype.plugins
80 | nexus-staging-maven-plugin
81 | 1.6.7
82 | true
83 |
84 | ossrh
85 | https://oss.sonatype.org/
86 | true
87 |
88 |
89 |
90 |
91 | org.apache.maven.plugins
92 | maven-compiler-plugin
93 | 3.8.1
94 |
95 | ${maven.compiler.target}
96 | ${maven.compiler.target}
97 |
98 |
99 |
100 |
101 | org.apache.maven.plugins
102 | maven-surefire-plugin
103 | 3.0.0-M5
104 |
105 |
106 | **/*Test*.java
107 |
108 |
109 |
110 |
111 |
112 | org.apache.maven.plugins
113 | maven-source-plugin
114 | 3.2.1
115 |
116 |
117 | attach-sources
118 |
119 | jar-no-fork
120 |
121 |
122 |
123 |
124 |
125 |
126 | org.apache.maven.plugins
127 | maven-javadoc-plugin
128 | 3.3.1
129 |
130 |
131 | attach-javadocs
132 |
133 | jar
134 |
135 |
136 |
137 |
138 |
139 |
140 | org.apache.maven.plugins
141 | maven-dependency-plugin
142 | 3.2.0
143 |
144 |
145 | copy-dependencies
146 | process-resources
147 |
148 | copy-dependencies
149 |
150 |
151 | provided
152 | ${project.build.directory}/lib/
153 |
154 |
155 |
156 |
157 |
158 |
159 | org.apache.maven.plugins
160 | maven-assembly-plugin
161 | 3.3.0
162 |
163 | false
164 | ${project.build.directory}/releases/
165 |
166 | ${elasticsearch.assembly.descriptor}
167 |
168 |
169 |
170 | fully.qualified.MainClass
171 |
172 |
173 |
174 |
175 |
176 | package
177 |
178 | single
179 |
180 |
181 |
182 |
183 |
184 |
185 | org.apache.maven.plugins
186 | maven-checkstyle-plugin
187 | 3.1.2
188 |
189 |
190 |
191 | com.puppycrawl.tools
192 | checkstyle
193 | 8.40
194 |
195 |
196 |
197 |
198 | validate
199 | validate
200 |
201 | ${checkstyle.config.location}
202 | UTF-8
203 | true
204 | true
205 |
206 |
207 | check
208 |
209 |
210 |
211 |
212 |
213 |
214 | org.apache.maven.plugins
215 | maven-gpg-plugin
216 | 3.0.1
217 |
218 |
219 | sign-artifacts
220 | verify
221 |
222 | sign
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 | org.apache.lucene
234 | lucene-analyzers-common
235 | ${lucene.analyzers.common.version}
236 |
237 |
238 | com.google.guava
239 | guava
240 | 21.0
241 |
242 |
243 | org.apache.commons
244 | commons-lang3
245 | 3.6
246 |
247 |
248 | org.slf4j
249 | slf4j-api
250 | 1.7.25
251 |
252 |
253 | org.slf4j
254 | slf4j-log4j12
255 | 1.6.4
256 | provided
257 |
258 |
259 | junit
260 | junit
261 | 4.12
262 | test
263 |
264 |
265 |
266 |
267 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/util/Utils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.util;
26 |
27 | import java.io.BufferedReader;
28 | import java.io.IOException;
29 | import java.io.InputStream;
30 | import java.io.InputStreamReader;
31 | import java.nio.charset.StandardCharsets;
32 | import java.util.Collections;
33 | import java.util.HashMap;
34 | import java.util.Map;
35 | import java.util.Set;
36 |
37 | import org.apache.commons.lang3.StringUtils;
38 | import org.slf4j.LoggerFactory;
39 | import com.google.common.collect.ImmutableMap;
40 | import com.google.common.collect.ImmutableSet;
41 |
42 | /**
43 | * This class contains the constants that are the used by the sentiment analyzer.
44 | * The constants are same as the ones used in the official python implementation
45 | *
46 | * @author Animesh Pandey
47 | * @see NLTK Source
48 | * @see
49 | * vaderSentiment Python module
50 | */
51 | public final class Utils {
52 |
53 | /**
54 | * Set of possible punctuation marks.
55 | */
56 | public static final Set PUNCTUATIONS = ImmutableSet.of(".", "!", "?", ",", ";", ":", "-", "'",
57 | "\"", "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?");
58 |
59 | /**
60 | * Set of negative words.
61 | */
62 | public static final Set NEGATIVE_WORDS =
63 | ImmutableSet.of("aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
64 | "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't", "dont", "hadnt",
65 | "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither", "don't", "hadn't", "hasn't",
66 | "haven't", "isn't", "mightn't", "mustn't", "neednt", "needn't", "never", "none", "nope",
67 | "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt", "uhuh", "wasnt",
68 | "werent", "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't", "without",
69 | "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite");
70 |
71 | /**
72 | * This dictionary holds a token and its corresponding boosting/dampening factor for sentiment scoring.
73 | */
74 | public static final Map BOOSTER_DICTIONARY = ImmutableMap.builder()
75 | .put("decidedly", Valence.DEFAULT_BOOSTING.getValue())
76 | .put("uber", Valence.DEFAULT_BOOSTING.getValue())
77 | .put("barely", Valence.DEFAULT_DAMPING.getValue())
78 | .put("particularly", Valence.DEFAULT_BOOSTING.getValue())
79 | .put("enormously", Valence.DEFAULT_BOOSTING.getValue())
80 | .put("less", Valence.DEFAULT_DAMPING.getValue())
81 | .put("absolutely", Valence.DEFAULT_BOOSTING.getValue())
82 | .put("kinda", Valence.DEFAULT_DAMPING.getValue())
83 | .put("flipping", Valence.DEFAULT_BOOSTING.getValue())
84 | .put("awfully", Valence.DEFAULT_BOOSTING.getValue())
85 | .put("purely", Valence.DEFAULT_BOOSTING.getValue())
86 | .put("majorly", Valence.DEFAULT_BOOSTING.getValue())
87 | .put("substantially", Valence.DEFAULT_BOOSTING.getValue())
88 | .put("partly", Valence.DEFAULT_DAMPING.getValue())
89 | .put("remarkably", Valence.DEFAULT_BOOSTING.getValue())
90 | .put("really", Valence.DEFAULT_BOOSTING.getValue())
91 | .put("sort of", Valence.DEFAULT_DAMPING.getValue())
92 | .put("little", Valence.DEFAULT_DAMPING.getValue())
93 | .put("fricking", Valence.DEFAULT_BOOSTING.getValue())
94 | .put("sorta", Valence.DEFAULT_DAMPING.getValue())
95 | .put("amazingly", Valence.DEFAULT_BOOSTING.getValue())
96 | .put("kind of", Valence.DEFAULT_DAMPING.getValue())
97 | .put("just enough", Valence.DEFAULT_DAMPING.getValue())
98 | .put("fucking", Valence.DEFAULT_BOOSTING.getValue())
99 | .put("occasionally", Valence.DEFAULT_DAMPING.getValue())
100 | .put("somewhat", Valence.DEFAULT_DAMPING.getValue())
101 | .put("kindof", Valence.DEFAULT_DAMPING.getValue())
102 | .put("friggin", Valence.DEFAULT_BOOSTING.getValue())
103 | .put("incredibly", Valence.DEFAULT_BOOSTING.getValue())
104 | .put("totally", Valence.DEFAULT_BOOSTING.getValue())
105 | .put("marginally", Valence.DEFAULT_DAMPING.getValue())
106 | .put("more", Valence.DEFAULT_BOOSTING.getValue())
107 | .put("considerably", Valence.DEFAULT_BOOSTING.getValue())
108 | .put("fabulously", Valence.DEFAULT_BOOSTING.getValue())
109 | .put("hardly", Valence.DEFAULT_DAMPING.getValue())
110 | .put("very", Valence.DEFAULT_BOOSTING.getValue())
111 | .put("sortof", Valence.DEFAULT_DAMPING.getValue())
112 | .put("kind-of", Valence.DEFAULT_DAMPING.getValue())
113 | .put("scarcely", Valence.DEFAULT_DAMPING.getValue())
114 | .put("thoroughly", Valence.DEFAULT_BOOSTING.getValue())
115 | .put("quite", Valence.DEFAULT_BOOSTING.getValue())
116 | .put("most", Valence.DEFAULT_BOOSTING.getValue())
117 | .put("completely", Valence.DEFAULT_BOOSTING.getValue())
118 | .put("frigging", Valence.DEFAULT_BOOSTING.getValue())
119 | .put("intensely", Valence.DEFAULT_BOOSTING.getValue())
120 | .put("utterly", Valence.DEFAULT_BOOSTING.getValue())
121 | .put("highly", Valence.DEFAULT_BOOSTING.getValue())
122 | .put("extremely", Valence.DEFAULT_BOOSTING.getValue())
123 | .put("unbelievably", Valence.DEFAULT_BOOSTING.getValue())
124 | .put("almost", Valence.DEFAULT_DAMPING.getValue())
125 | .put("especially", Valence.DEFAULT_BOOSTING.getValue())
126 | .put("fully", Valence.DEFAULT_BOOSTING.getValue())
127 | .put("frickin", Valence.DEFAULT_BOOSTING.getValue())
128 | .put("tremendously", Valence.DEFAULT_BOOSTING.getValue())
129 | .put("exceptionally", Valence.DEFAULT_BOOSTING.getValue())
130 | .put("flippin", Valence.DEFAULT_BOOSTING.getValue())
131 | .put("hella", Valence.DEFAULT_BOOSTING.getValue())
132 | .put("so", Valence.DEFAULT_BOOSTING.getValue())
133 | .put("greatly", Valence.DEFAULT_BOOSTING.getValue())
134 | .put("hugely", Valence.DEFAULT_BOOSTING.getValue())
135 | .put("deeply", Valence.DEFAULT_BOOSTING.getValue())
136 | .put("unusually", Valence.DEFAULT_BOOSTING.getValue())
137 | .put("entirely", Valence.DEFAULT_BOOSTING.getValue())
138 | .put("slightly", Valence.DEFAULT_DAMPING.getValue())
139 | .put("effing", Valence.DEFAULT_BOOSTING.getValue())
140 | .build();
141 |
142 | /**
143 | * Idioms with their respective valencies.
144 | */
145 | //CHECKSTYLE.OFF: MagicNumber
146 | public static final Map SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY =
147 | ImmutableMap.builder()
148 | .put("cut the mustard", 2f)
149 | .put("bad ass", 1.5f)
150 | .put("kiss of death", -1.5f)
151 | .put("yeah right", -2f)
152 | .put("the bomb", 3f)
153 | .put("hand to mouth", -2f)
154 | .put("the shit", 3f)
155 | .build();
156 | //CHECKSTYLE.ON: MagicNumber
157 |
158 | /**
159 | * Tokens with their respective valencies.
160 | */
161 | public static final Map WORD_VALENCE_DICTIONARY = readLexiconFile();
162 |
163 | /**
164 | * Private constructor for utility class.
165 | */
166 | private Utils() {
167 |
168 | }
169 |
170 | /**
171 | * This function returns false if the input token:
172 | * 1. is a URL starting with "http://" or "HTTP://"
173 | * 2. is a number as string
174 | * 3. has one character in lower case
175 | *
176 | * @param token input token
177 | * @return true iff none of the above conditions occur
178 | */
179 | public static boolean isUpper(String token) {
180 | if (StringUtils.startsWithIgnoreCase(token, Constants.HTTP_URL_PREFIX)) {
181 | return false;
182 | }
183 | if (StringUtils.startsWithIgnoreCase(token, Constants.HTTPS_URL_PREFIX)) {
184 | return false;
185 | }
186 | if (!Constants.NON_NUMERIC_STRING_REGEX.matcher(token).matches()) {
187 | return false;
188 | }
189 | for (int i = 0; i < token.length(); i++) {
190 | if (Character.isLowerCase(token.charAt(i))) {
191 | return false;
192 | }
193 | }
194 | return true;
195 | }
196 |
197 | /**
198 | * This function reads in a file that stores lexicon and their corresponding valence intensity.
199 | * Each pair of lexicon and its valence is then stored as key-value pairs in a HashMap.
200 | *
201 | * @return map of lexicons with their corresponding valence
202 | */
203 | private static Map readLexiconFile() {
204 | final InputStream lexFile = Utils.class.getClassLoader()
205 | .getResourceAsStream("vader_sentiment_lexicon.txt");
206 | final Map lexDictionary = new HashMap<>();
207 | if (lexFile != null) {
208 | try (BufferedReader br = new BufferedReader(new InputStreamReader(lexFile, StandardCharsets.UTF_8))) {
209 | String line;
210 | while ((line = br.readLine()) != null) {
211 | final String[] lexFileData = line.split("\\t");
212 | final String currentText = lexFileData[0];
213 | final Float currentTextValence = Float.parseFloat(lexFileData[1]);
214 | lexDictionary.put(currentText, currentTextValence);
215 | }
216 | } catch (IOException ex) {
217 | LoggerFactory.getLogger(Utils.class).error("vader_sentiment_lexicon.txt file not found", ex);
218 | }
219 | }
220 | return Collections.unmodifiableMap(lexDictionary);
221 | }
222 | }
223 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## VADER-Sentiment-Analysis in Java
2 |
3 | [Build Status](https://travis-ci.org/apanimesh061/VaderSentimentJava)
4 |
5 | VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is _specifically attuned to sentiments expressed in social media_. It is fully open-sourced under the [MIT License](http://choosealicense.com/) (we sincerely appreciate all attributions and readily accept most contributions, but please don't hold us liable).
6 |
7 | This is a JAVA port of the NLTK VADER sentiment analysis originally written in Python.
8 |
9 | - The [Original](https://github.com/cjhutto/vaderSentiment) python module by the paper's author C.J. Hutto
10 | - The [NLTK](http://www.nltk.org/_modules/nltk/sentiment/vader.html) source
11 |
12 | For the testing I have compared the results of the NLTK module with this Java port.
13 |
14 | ### Update (Oct 2021)
15 | - - -
16 | Releasing `v1.1.1`.
17 |
18 | Thanks to @ArjohnKampman for helping in optimizing some parts of the code. Since I was touching this repo after a long time, I noticed that a lot of the Maven dependencies and plugins were outdated, so I have updated them. `mvn package` still works so it should be fine.
19 |
20 | I also noticed a lot of comments on not being able to use the library from Maven. I did upload a Jar to Nexus a long time back and I was having trouble doing that again since I think I've lost the pass-phrases needed to sign and upload the Jar to the Nexus. Luckily, I found a new solution [here](https://stackoverflow.com/a/28483461) which suggests using https://jitpack.io/ for public GitHub repositories. Turns out it is super simple to use it and get the package from GitHub. I wanted to make sure I unblock anyone who wants to use this package.
21 |
22 | I created a test Maven project `test-mvn-pkg1` locally and added the following to its `pom.xml`:
23 |
24 | ```
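25 | <?xml version="1.0" encoding="UTF-8"?>
26 | <project xmlns="http://maven.apache.org/POM/4.0.0"
27 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
28 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
29 |     <modelVersion>4.0.0</modelVersion>
30 |
31 |     <groupId>org.example</groupId>
32 |     <artifactId>test-mvn-pkg1</artifactId>
33 |     <version>1.0-SNAPSHOT</version>
34 |
35 |     <repositories>
36 |         <repository>
37 |             <id>jitpack.io</id>
38 |             <url>https://jitpack.io</url>
39 |         </repository>
40 |     </repositories>
41 |
42 |     <dependencies>
43 |         <dependency>
44 |             <groupId>com.github.apanimesh061</groupId>
45 |             <artifactId>VaderSentimentJava</artifactId>
46 |             <version>v1.1.1</version>
47 |         </dependency>
48 |     </dependencies>
49 |
50 | </project>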
51 | ```
52 | Once Maven downloads the dependencies, you can easily use it in your code like:
53 |
54 | ```
55 | package org.example;
56 |
57 | import com.vader.sentiment.analyzer.SentimentAnalyzer;
58 | import com.vader.sentiment.analyzer.SentimentPolarities;
59 |
60 | public class Test {
61 | public static void main(String[] args) {
62 | final SentimentPolarities sentimentPolarities =
63 | SentimentAnalyzer.getScoresFor("that's a rare and valuable feature.");
64 | System.out.println(sentimentPolarities);
65 | // SentimentPolarities{positivePolarity=0.437, negativePolarity=0.0, neutralPolarity=0.563, compoundPolarity=0.4767}
66 | }
67 | }
68 | ```
69 |
70 | I'll try the Nexus upload and figure out if I can create a new Maven repo altogether. Meanwhile, `jitpack` should work for anyone wanting to use the package.
71 |
72 |
73 | ### Update (Jan 2018)
74 |
75 | - - -
76 | Based on a recommendation from @alexpetlenko, I uploaded the jar to Nexus as `vader-sentiment-analyzer-1.0`.
77 |
78 | You can download the jar by adding the following to your `pom.xml`:
79 | ```xml
80 | <dependency>
81 |     <groupId>com.github.apanimesh061</groupId>
82 |     <artifactId>vader-sentiment-analyzer</artifactId>
83 |     <version>1.0</version>
84 | </dependency>
85 | ```
86 |
87 | Path to Jar: [vader-sentiment-analyzer-1.0.jar](https://oss.sonatype.org/service/local/repositories/releases/content/com/github/apanimesh061/vader-sentiment-analyzer/1.0/vader-sentiment-analyzer-1.0.jar)
88 |
89 | ### Update (May 2017)
90 |
91 | - - -
92 | Major design refactorings resulting from addition of `checkstyle` to the project.
93 |
94 | Also added JavaDocs to the project.
95 |
96 | ### Update (Jan 2017)
97 |
98 | - - -
99 |
100 | I have corrected a few bugs that I encountered when I was adding more tests.
101 |
102 | The details are [here](https://github.com/apanimesh061/VaderSentimentJava/commit/d1d30c4ceeb356ec838f8abac70514bd21a92b4b).
103 |
104 | This project now includes tests on text from:
105 |
106 | 1. Amazon Reviews
107 | 2. Movie Reviews
108 | 3. NyTimes Editorial snippets
109 |
110 | ### Introduction
111 | - - -
112 |
113 | This README file describes the dataset of the paper:
114 |
115 | **VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text**
116 | (by C.J. Hutto and Eric Gilbert)
117 | Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
118 |
119 | For questions, please contact:
120 |
121 | C.J. Hutto
122 | Georgia Institute of Technology, Atlanta, GA 30032
123 | cjhutto [at] gatech [dot] edu
124 |
125 | ### Citation Information
126 | - - -
127 |
128 | If you use either the dataset or any of the VADER sentiment analysis tools (VADER sentiment lexicon or Python code for rule-based sentiment analysis engine) in your research, please cite the above paper. For example:
129 |
130 | > **Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.**
131 |
132 | ### Resources and Dataset Descriptions
133 | - - -
134 |
135 | The compressed .tar.gz package includes **PRIMARY RESOURCES** (items 1-3) as well as additional **DATASETS AND TESTING RESOURCES** (items 4-12):
136 |
137 | 1. [VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text](http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf)
138 | The original paper for the data set, see citation information (above).
139 |
140 | 2. vader_sentiment_lexicon.txt
141 | Empirically validated by multiple independent human judges, VADER incorporates a "gold-standard" sentiment lexicon that is especially attuned to microblog-like contexts.
142 | The VADER sentiment lexicon is sensitive to both the **polarity** and the **intensity** of sentiments
143 | expressed in social media contexts, and is also generally applicable to sentiment analysis
144 | in other domains.
145 | Manually creating (much less, validating) a comprehensive sentiment lexicon is
146 | a labor intensive and sometimes error prone process, so it is no wonder that many
147 | opinion mining researchers and practitioners rely so heavily on existing lexicons
148 | as primary resources. We are pleased to offer ours as a new resource.
149 | We begin by constructing a list inspired by examining existing well-established
150 | sentiment word-banks (LIWC, ANEW, and GI). To this, we next incorporate numerous
151 | lexical features common to sentiment expression in microblogs, including
152 | - a full list of Western-style emoticons (for example, :-) denotes a smiley face
153 | and generally indicates positive sentiment)
154 | - sentiment-related acronyms and initialisms (e.g., LOL and WTF are both examples of
155 | sentiment-laden initialisms)
156 | - commonly used slang with sentiment value (e.g., nah, meh and giggly).
157 |
158 | This process provided us with over 9,000 lexical feature candidates. Next, we assessed
159 | the general applicability of each feature candidate to sentiment expressions. We
160 | used a wisdom-of-the-crowd (WotC) approach (Surowiecki, 2004) to acquire a valid
161 | point estimate for the sentiment valence (intensity) of each context-free candidate
162 | feature. We collected intensity ratings on each of our candidate lexical features
163 | from ten independent human raters (for a total of 90,000+ ratings). Features were
164 | rated on a scale from "[–4] Extremely Negative" to "[4] Extremely Positive", with
165 | allowance for "[0] Neutral (or Neither, N/A)".
166 | We kept every lexical feature that had a non-zero mean rating, and whose standard
167 | deviation was less than 2.5 as determined by the aggregate of ten independent raters.
168 | This left us with just over 7,500 lexical features with validated valence scores that
169 | indicated both the sentiment polarity (positive/negative), and the sentiment intensity
170 | on a scale from –4 to +4. For example, the word "okay" has a positive valence of 0.9,
171 | "good" is 1.9, and "great" is 3.1, whereas "horrible" is –2.5, the frowning emoticon :(
172 | is –2.2, and "sucks" and its slang derivative "sux" are both –1.5.
173 |
174 | 3. vaderSentiment.py
175 | The Python code for the rule-based sentiment analysis engine. Implements the
176 | grammatical and syntactical rules described in the paper, incorporating empirically
177 | derived quantifications for the impact of each rule on the perceived intensity of
178 | sentiment in sentence-level text. Importantly, these heuristics go beyond what would
179 | normally be captured in a typical bag-of-words model. They incorporate **word-order
180 | sensitive relationships** between terms. For example, degree modifiers (also called
181 | intensifiers, booster words, or degree adverbs) impact sentiment intensity by either
182 | increasing or decreasing the intensity. Consider these examples:
183 | (a) "The service here is extremely good"
184 | (b) "The service here is good"
185 | (c) "The service here is marginally good"
186 | From Table 3 in the paper, we see that for 95% of the data, using a degree modifier
187 | increases the positive sentiment intensity of example (a) by 0.227 to 0.36, with a
188 | mean difference of 0.293 on a rating scale from 1 to 4. Likewise, example (c) reduces
189 | the perceived sentiment intensity by 0.293, on average.
190 |
191 | 4. tweets_GroundTruth.txt
192 | **NOTE**: This java module uses this file for testing.
193 | FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, and TWEET-TEXT
194 | DESCRIPTION: includes "tweet-like" text as inspired by 4,000 tweets pulled from Twitter’s public timeline, plus 200 completely contrived tweet-like texts intended to specifically test syntactical and grammatical conventions of conveying differences in sentiment intensity. The "tweet-like" texts incorporate a fictitious username (@anonymous) in places where a username might typically appear, along with a fake URL ( http://url_removed ) in places where a URL might typically appear, as inspired by the original tweets. The ID and MEAN-SENTIMENT-RATING correspond to the raw sentiment rating data provided in 'tweets_anonDataRatings.txt' (described below).
195 |
196 | 5. tweets_anonDataRatings.txt
197 | FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, STANDARD DEVIATION, and RAW-SENTIMENT-RATINGS
198 | DESCRIPTION: Sentiment ratings from a minimum of 20 independent human raters (all pre-screened, trained, and quality checked for optimal inter-rater reliability).
199 |
200 | 6. nytEditorialSnippets_GroundTruth.txt
201 | FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, and TEXT-SNIPPET
202 | DESCRIPTION: includes 5,190 sentence-level snippets from 500 New York Times opinion news editorials/articles; we used the NLTK tokenizer to segment the articles into sentence phrases, and added sentiment intensity ratings. The ID and MEAN-SENTIMENT-RATING correspond to the raw sentiment rating data provided in 'nytEditorialSnippets_anonDataRatings.txt' (described below).
203 |
204 | 7. nytEditorialSnippets_anonDataRatings.txt
205 | FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, STANDARD DEVIATION, and RAW-SENTIMENT-RATINGS
206 | DESCRIPTION: Sentiment ratings from a minimum of 20 independent human raters (all pre-screened, trained, and quality checked for optimal inter-rater reliability).
207 |
208 | 8. movieReviewSnippets_GroundTruth.txt
209 | FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, and TEXT-SNIPPET
210 | DESCRIPTION: includes 10,605 sentence-level snippets from rotten.tomatoes.com. The snippets were derived from an original set of 2000 movie reviews (1000 positive and 1000 negative) in Pang & Lee (2004); we used the NLTK tokenizer to segment the reviews into sentence phrases, and added sentiment intensity ratings. The ID and MEAN-SENTIMENT-RATING correspond to the raw sentiment rating data provided in 'movieReviewSnippets_anonDataRatings.txt' (described below).
211 |
212 | 9. movieReviewSnippets_anonDataRatings.txt
213 | FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, STANDARD DEVIATION, and RAW-SENTIMENT-RATINGS
214 | DESCRIPTION: Sentiment ratings from a minimum of 20 independent human raters (all pre-screened, trained, and quality checked for optimal inter-rater reliability).
215 |
216 | 10. amazonReviewSnippets_GroundTruth.txt
217 | FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, and TEXT-SNIPPET
218 | DESCRIPTION: includes 3,708 sentence-level snippets from 309 customer reviews on 5 different products. The reviews were originally used in Hu & Liu (2004); we added sentiment intensity ratings. The ID and MEAN-SENTIMENT-RATING correspond to the raw sentiment rating data provided in 'amazonReviewSnippets_anonDataRatings.txt' (described below).
219 |
220 | 11. amazonReviewSnippets_anonDataRatings.txt
221 | FORMAT: the file is tab delimited with ID, MEAN-SENTIMENT-RATING, STANDARD DEVIATION, and RAW-SENTIMENT-RATINGS
222 | DESCRIPTION: Sentiment ratings from a minimum of 20 independent human raters (all pre-screened, trained, and quality checked for optimal inter-rater reliability).
223 |
224 | 12. Comp.Social website with more papers/research: [Comp.Social](http://comp.social.gatech.edu/papers/)
225 |
226 | 13. vader_sentiment_comparison_online_weblink
227 | A short-cut hyperlinked to the online (web-based) sentiment comparison using a "light" version of VADER. http://www.socialai.gatech.edu/apps/sentiment.html .
228 |
229 |
230 | ## Java Code EXAMPLE:
231 |
232 | ```
233 | public static void main(String[] args) throws IOException {
234 | ArrayList<String> sentences = new ArrayList<String>() {{
235 | add("VADER is smart, handsome, and funny.");
236 | add("VADER is smart, handsome, and funny!");
237 | add("VADER is very smart, handsome, and funny.");
238 | add("VADER is VERY SMART, handsome, and FUNNY.");
239 | add("VADER is VERY SMART, handsome, and FUNNY!!!");
240 | add("VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!");
241 | add("The book was good.");
242 | add("The book was kind of good.");
243 | add("The plot was good, but the characters are uncompelling and the dialog is not great.");
244 | add("A really bad, horrible book.");
245 | add("At least it isn't a horrible book.");
246 | add(":) and :D");
247 | add("");
248 | add("Today sux");
249 | add("Today sux!");
250 | add("Today SUX!");
251 | add("Today kinda sux! But I'll get by, lol");
252 | }};
253 |
254 | for (String sentence : sentences) {
255 | System.out.println(sentence);
256 | final SentimentPolarities sentimentPolarities =
257 | SentimentAnalyzer.getScoresFor(sentence);
258 | System.out.println(sentimentPolarities);
259 | }
260 | }
261 | ```
262 |
263 | ### Online (web-based) Sentiment Comparison using VADER
264 |
265 | http://www.socialai.gatech.edu/apps/sentiment.html .
266 |
--------------------------------------------------------------------------------
/src/main/checkstyle/checkstyle.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
54 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
71 |
72 |
73 |
75 |
76 |
77 |
83 |
84 |
85 |
86 |
89 |
90 |
91 |
92 |
93 |
97 |
98 |
99 |
100 |
101 |
103 |
104 |
105 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
124 |
126 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
174 |
175 |
176 |
178 |
180 |
181 |
182 |
183 |
185 |
186 |
187 |
188 |
190 |
191 |
192 |
193 |
195 |
196 |
197 |
198 |
200 |
201 |
202 |
203 |
205 |
206 |
207 |
208 |
210 |
211 |
212 |
213 |
215 |
216 |
217 |
218 |
220 |
221 |
222 |
223 |
225 |
226 |
227 |
228 |
230 |
231 |
232 |
233 |
235 |
236 |
237 |
238 |
240 |
242 |
244 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
272 |
273 |
274 |
277 |
278 |
279 |
280 |
286 |
287 |
288 |
289 |
292 |
293 |
294 |
295 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
328 |
329 |
330 |
331 |
334 |
335 |
336 |
337 |
338 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
354 |
355 |
356 |
357 |
358 |
--------------------------------------------------------------------------------
/src/main/java/com/vader/sentiment/analyzer/SentimentAnalyzer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MIT License
3 | *
4 | * Copyright (c) 2021 Animesh Pandey
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in all
14 | * copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | * SOFTWARE.
23 | */
24 |
25 | package com.vader.sentiment.analyzer;
26 |
27 | import java.io.IOException;
28 | import java.util.ArrayList;
29 | import java.util.Collections;
30 | import java.util.List;
31 | import org.apache.commons.lang3.StringUtils;
32 | import org.slf4j.Logger;
33 | import org.slf4j.LoggerFactory;
34 | import com.google.common.base.Preconditions;
35 | import com.vader.sentiment.processor.TextProperties;
36 | import com.vader.sentiment.util.Constants;
37 | import com.vader.sentiment.util.SentimentModifyingTokens;
38 | import com.vader.sentiment.util.Utils;
39 | import com.vader.sentiment.util.Valence;
40 |
41 | /**
42 | * The SentimentAnalyzer class is the main class for VADER Sentiment analysis.
43 | *
44 | * @author Animesh Pandey
45 | * @see <a href="http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf">VADER: A Parsimonious Rule-based Model
46 | * for Sentiment Analysis of Social Media Text</a>
47 | */
48 | //CHECKSTYLE.OFF: ExecutableStatementCount
49 | //CHECKSTYLE.OFF: JavaNCSS
50 | //CHECKSTYLE.OFF: CyclomaticComplexity
51 | //CHECKSTYLE.OFF: NPathComplexity
52 | public final class SentimentAnalyzer {
53 | /**
54 | * Logger for current class.
55 | */
56 | private static final Logger LOGGER = LoggerFactory.getLogger(SentimentAnalyzer.class);
57 |
/**
 * Private constructor that prevents instantiation: the members of this class visible here are static,
 * so no instance is ever needed.
 */
private SentimentAnalyzer() {
}
63 |
64 | /**
65 | * This method returns the polarity scores for a given input string.
66 | *
67 | * @param inputString the string to be analyzed.
68 | * @return an object of {@link SentimentPolarities} which will hold all the sentiment scores.
69 | */
70 | public static SentimentPolarities getScoresFor(String inputString) {
71 | return computeSentimentPolaritiesFor(inputString);
72 | }
73 |
74 | /**
75 | * Adjust valence if a token is in {@link Utils#BOOSTER_DICTIONARY} or is a yelling word (all caps).
76 | *
77 | * @param precedingToken token
78 | * @param currentValence valence to be adjusted
79 | * @param inputHasYelling true if the input string has any yelling words.
80 | * @return adjusted valence
81 | */
82 | private static float adjustValenceIfCapital(final String precedingToken, final float currentValence,
83 | final boolean inputHasYelling) {
84 | float scalar = 0.0F;
85 | final String precedingTokenLower = precedingToken.toLowerCase();
86 | if (Utils.BOOSTER_DICTIONARY.containsKey(precedingTokenLower)) {
87 | scalar = Utils.BOOSTER_DICTIONARY.get(precedingTokenLower);
88 | if (currentValence < 0.0F) {
89 | scalar = -scalar;
90 | }
91 | if (Utils.isUpper(precedingToken) && inputHasYelling) {
92 | if (currentValence > 0.0F) {
93 | scalar += Valence.ALL_CAPS_FACTOR.getValue();
94 | } else {
95 | scalar -= Valence.ALL_CAPS_FACTOR.getValue();
96 | }
97 | }
98 | }
99 | return scalar;
100 | }
101 |
102 | /**
103 | * This method checks for phrases having
104 | * - "never so current_word"
105 | * - "never this current_word"
106 | * - "never so this" etc.
107 | *
108 | * @param distance gram window size
109 | * @param currentItemPosition position of the current token
110 | * @param wordsAndEmoticons tokenized version of the input text
111 | * @return true if any of the above phrases are found.
112 | */
113 | private static boolean areNeverPhrasesPresent(final int distance, final int currentItemPosition,
114 | final List wordsAndEmoticons) {
115 | if (distance == 1) {
116 | final String wordAtDistanceTwoLeft =
117 | wordsAndEmoticons.get(currentItemPosition - Constants.PRECEDING_BIGRAM_WINDOW);
118 | final String wordAtDistanceOneLeft =
119 | wordsAndEmoticons.get(currentItemPosition - Constants.PRECEDING_UNIGRAM_WINDOW);
120 | return (wordAtDistanceTwoLeft.equals(SentimentModifyingTokens.NEVER.getValue()))
121 | && (wordAtDistanceOneLeft.equals(SentimentModifyingTokens.SO.getValue())
122 | || (wordAtDistanceOneLeft.equals(SentimentModifyingTokens.NEVER.getValue())));
123 | } else if (distance == 2) {
124 | final String wordAtDistanceThreeLeft = wordsAndEmoticons.get(currentItemPosition
125 | - Constants.PRECEDING_TRIGRAM_WINDOW);
126 | final String wordAtDistanceTwoLeft =
127 | wordsAndEmoticons.get(currentItemPosition - Constants.PRECEDING_BIGRAM_WINDOW);
128 | final String wordAtDistanceOneLeft =
129 | wordsAndEmoticons.get(currentItemPosition - Constants.PRECEDING_UNIGRAM_WINDOW);
130 | return (wordAtDistanceThreeLeft.equals(SentimentModifyingTokens.NEVER.getValue()))
131 | && (wordAtDistanceTwoLeft.equals(SentimentModifyingTokens.SO.getValue())
132 | || wordAtDistanceTwoLeft.equals(SentimentModifyingTokens.THIS.getValue()))
133 | || (wordAtDistanceOneLeft.equals(SentimentModifyingTokens.SO.getValue())
134 | || wordAtDistanceOneLeft.equals(SentimentModifyingTokens.THIS.getValue()));
135 | }
136 | return false;
137 | }
138 |
139 | /**
140 | * Adjust the valence is there tokens contain any token which is a negative token or the bigrams and trigrams
141 | * around a token are phrases that have "never" in them.
142 | *
143 | * @param currentValence valence before
144 | * @param distance gram window size
145 | * @param currentItemPosition position of the current token
146 | * @param closeTokenIndex token at the distance position from current item
147 | * @param wordsAndEmoticons tokenized version of the input text
148 | * @return adjusted valence.
149 | */
150 | private static float dampValenceIfNegativeTokensFound(final float currentValence, final int distance,
151 | final int currentItemPosition, final int closeTokenIndex,
152 | final List wordsAndEmoticons) {
153 | float newValence = currentValence;
154 | final boolean anyNeverPhrase = areNeverPhrasesPresent(distance, currentItemPosition, wordsAndEmoticons);
155 |
156 | if (!anyNeverPhrase) {
157 | if (isNegative(wordsAndEmoticons.get(closeTokenIndex))) {
158 | newValence *= Valence.NEGATIVE_WORD_DAMPING_FACTOR.getValue();
159 | }
160 | } else {
161 | final float neverPhraseAdjustment = (distance == 1)
162 | ? Valence.PRECEDING_BIGRAM_HAVING_NEVER_DAMPING_FACTOR.getValue()
163 | : Valence.PRECEDING_TRIGRAM_HAVING_NEVER_DAMPING_FACTOR.getValue();
164 | newValence *= neverPhraseAdjustment;
165 | }
166 |
167 | return newValence;
168 | }
169 |
170 | /**
171 | * This method builds the possible to n-grams starting from last token in the token list.
172 | * VADER uses bi-grams and tri-grams only, so here minGramLength will be 2 and maxGramLength
173 | * will be 3.
174 | *
175 | * @param tokenList The list of tokens for which we want to compute the n-grams.
176 | * @param minGramLength The minimum size of the possible n-grams.
177 | * @param maxGramLength The maximum size of the possible n-grams.
178 | * @param startPosition The position of the token from which we'll extract the tokens.
179 | * @param maxDistanceFromStartPosition The max distance from the end of the current gram and the startPosition.
180 | * @return list of all possible to minGramLength-grams and maxGramLength-grams starting from startPosition.
181 | */
182 | private static List getLeftGrams(final List tokenList, final int minGramLength,
183 | final int maxGramLength, final int startPosition,
184 | final int maxDistanceFromStartPosition) {
185 | Preconditions.checkArgument(minGramLength > 0 && maxGramLength > 0,
186 | "Left Gram lengths should not be negative or zero.");
187 | Preconditions.checkArgument(maxGramLength >= minGramLength,
188 | "Maximum left gram length should be at least equal to the minimum value.");
189 | Preconditions.checkArgument(tokenList != null);
190 |
191 | final int noOfTokens = tokenList.size();
192 | if (noOfTokens < minGramLength) {
193 | return Collections.emptyList();
194 | }
195 |
196 | final List result = new ArrayList<>();
197 | for (int end = startPosition; end > 0; end--) {
198 | final int windowStart = end - minGramLength + 1;
199 | final int windowEnd = end - maxGramLength;
200 | String currentSuffix = tokenList.get(end);
201 | for (int start = windowStart; start >= ((windowEnd < 0) ? 0 : Math.max(0, windowEnd) + 1); start--) {
202 | currentSuffix = tokenList.get(start) + Constants.SPACE_SEPARATOR + currentSuffix;
203 | result.add(currentSuffix);
204 | if ((startPosition - end) == maxDistanceFromStartPosition) {
205 | return result;
206 | }
207 | }
208 | }
209 | return result;
210 | }
211 |
212 | /**
213 | * This method builds the first possible n-grams starting from startPosition in the token list.
214 | * VADER uses bi-grams and tri-grams only, so here minGramLength will be 2 and maxGramLength
215 | * will be 3.
216 | *
217 | * @param tokenList The list of tokens for which we want to compute the n-grams.
218 | * @param minGramLength The minimum size of the possible n-grams.
219 | * @param maxGramLength The maximum size of the possible n-grams.
220 | * @param startPosition The position of the token from which we'll extract the tokens.
221 | * @return list of the first to minGramLength-grams and maxGramLength-grams starting from
222 | */
223 | private static List getFirstRightGrams(final List tokenList, final int minGramLength,
224 | final int maxGramLength, final int startPosition) {
225 | Preconditions.checkArgument(minGramLength > 0 && maxGramLength > 0,
226 | "Right Gram lengths should not be negative or zero.");
227 | Preconditions.checkArgument(maxGramLength >= minGramLength,
228 | "Maximum right gram length should be at least equal to the minimum value.");
229 | Preconditions.checkArgument(tokenList != null);
230 |
231 | final int noOfTokens = tokenList.size();
232 | if (noOfTokens < minGramLength) {
233 | return Collections.emptyList();
234 | }
235 |
236 | final List result = new ArrayList<>();
237 | final StringBuilder currentGram = new StringBuilder(tokenList.get(startPosition));
238 | for (int i = minGramLength; i <= maxGramLength; i++) {
239 | final int endPosition = startPosition + i - 1;
240 | if (endPosition > tokenList.size() - 1) {
241 | break;
242 | }
243 | currentGram.append(Constants.SPACE_SEPARATOR).append(tokenList.get(endPosition));
244 | result.add(currentGram.toString());
245 | }
246 | return result;
247 | }
248 |
249 | /**
250 | * We check if the idioms present in {@link Utils#SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY} are present in
251 | * left bi/tri-grams sequences.
252 | *
253 | * @param currentValence current valence before checking for idioms.
254 | * @param leftGramSequences list of all the left bi/tri-grams.
255 | * @return adjusted valence.
256 | */
257 | private static float adjustValenceIfLeftGramsHaveIdioms(final float currentValence,
258 | final List leftGramSequences) {
259 | float newValence = currentValence;
260 | for (String leftGramSequence : leftGramSequences) {
261 | if (Utils.SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY.containsKey(leftGramSequence)) {
262 | newValence = Utils.SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY.get(leftGramSequence);
263 | break;
264 | }
265 | }
266 |
267 | // Based on how getLeftGrams calculates grams, the bi-grams are at the all the even indices.
268 | // VADER only deals with the 2 left most bi-grams in leftGramSequences.
269 | for (int i = leftGramSequences.size() - 1; i <= 2; i--) {
270 | if (Utils.BOOSTER_DICTIONARY.containsKey(leftGramSequences.get(i))) {
271 | newValence += Valence.DEFAULT_DAMPING.getValue();
272 | break;
273 | }
274 | }
275 |
276 | return newValence;
277 | }
278 |
279 | /**
280 | * Search if the any bi-gram/tri-grams around the currentItemPosition contains any idioms defined
281 | * in {@link Utils#SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY} Adjust the current valence if there are
282 | * any idioms found.
283 | *
284 | * @param currentValence valence to be adjusted
285 | * @param currentItemPosition current tokens position
286 | * @param wordsAndEmoticons tokenized version of the input text
287 | * @param distance max distance from the end of the current gram and the startPosition.
288 | * @return adjusted valence
289 | */
290 | private static float adjustValenceIfIdiomsFound(final float currentValence, final int currentItemPosition,
291 | final List wordsAndEmoticons, final int distance) {
292 | float newValence;
293 |
294 | final List leftGramSequences = getLeftGrams(wordsAndEmoticons, 2,
295 | Constants.MAX_GRAM_WINDOW_SIZE, currentItemPosition, distance);
296 | newValence = adjustValenceIfLeftGramsHaveIdioms(currentValence, leftGramSequences);
297 |
298 | final List rightGramSequences = getFirstRightGrams(wordsAndEmoticons, 2,
299 | Constants.MAX_GRAM_WINDOW_SIZE, currentItemPosition);
300 | for (String rightGramSequence : rightGramSequences) {
301 | if (Utils.SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY.containsKey(rightGramSequence)) {
302 | newValence = Utils.SENTIMENT_LADEN_IDIOMS_VALENCE_DICTIONARY.get(rightGramSequence);
303 | }
304 | }
305 |
306 | return newValence;
307 | }
308 |
309 | /**
310 | * Analyze each token/emoticon in the input string and calculate its valence.
311 | *
312 | * @param textProperties This objects holds the tokenized version of a string.
313 | * @return the valence of each token as a list
314 | */
315 | private static List getTokenWiseSentiment(final TextProperties textProperties) {
316 | List sentiments = new ArrayList<>();
317 | final List wordsAndEmoticons = textProperties.getWordsAndEmoticons();
318 |
319 | for (int currentItemPosition = 0; currentItemPosition < wordsAndEmoticons.size(); currentItemPosition++) {
320 | final String currentItem = wordsAndEmoticons.get(currentItemPosition);
321 | final String currentItemLower = currentItem.toLowerCase();
322 | float currentValence = 0.0F;
323 |
324 | LOGGER.debug("Current token, \"{}\" with index, i = {}", currentItem, currentItemPosition);
325 | LOGGER.debug("Sentiment State before \"kind of\" processing: {}", sentiments);
326 |
327 | /*
328 | * This section performs the following evaluation:
329 | * If the term at currentItemPosition is followed by "kind of" or the it is present in
330 | * {@link Utils#BoosterDictionary}, add the currentValence to sentiment array and break
331 | * to the next loop.
332 | *
333 | * If currentValence was 0.0, then current word's valence will also be 0.0.
334 | */
335 | if ((currentItemPosition < wordsAndEmoticons.size() - 1
336 | && currentItemLower.equals(SentimentModifyingTokens.KIND.getValue())
337 | && wordsAndEmoticons.get(currentItemPosition + 1).toLowerCase()
338 | .equals(SentimentModifyingTokens.OF.getValue()))
339 | || Utils.BOOSTER_DICTIONARY.containsKey(currentItemLower)) {
340 | sentiments.add(currentValence);
341 | continue;
342 | }
343 |
344 | LOGGER.debug("Sentiment State after \"kind of\" processing: {}", sentiments);
345 | LOGGER.debug("Current Valence is {} for \"{}\"", currentValence, currentItem);
346 |
347 | /*
348 | * If current item in lowercase is in {@link Utils#WordValenceDictionary}...
349 | */
350 | if (Utils.WORD_VALENCE_DICTIONARY.containsKey(currentItemLower)) {
351 | currentValence = Utils.WORD_VALENCE_DICTIONARY.get(currentItemLower);
352 |
353 | if (LOGGER.isDebugEnabled()) {
354 | LOGGER.debug("Current currentItem isUpper(): {}", Utils.isUpper(currentItem));
355 | LOGGER.debug("Current currentItem isYelling(): {}", textProperties.isYelling());
356 | }
357 |
358 | /*
359 | * If current item is all in uppercase and the input string has yelling words,
360 | * accordingly adjust currentValence.
361 | */
362 | if (Utils.isUpper(currentItem) && textProperties.isYelling()) {
363 | if (currentValence > 0.0) {
364 | currentValence += Valence.ALL_CAPS_FACTOR.getValue();
365 | } else {
366 | currentValence -= Valence.ALL_CAPS_FACTOR.getValue();
367 | }
368 | }
369 |
370 | LOGGER.debug("Current Valence post all CAPS checks: {}", currentValence);
371 |
372 | /*
373 | * "distance" is the window size.
374 | * e.g. "The plot was good, but the characters are uncompelling.",
375 | * if the current item is "characters", then at:
376 | * - distance = 0, closeTokenIndex = 5
377 | * - distance = 1, closeTokenIndex = 4
378 | * - distance = 2, closeTokenIndex = 3
379 | */
380 | int distance = 0;
381 | while (distance < Constants.MAX_GRAM_WINDOW_SIZE) {
382 | int closeTokenIndex = currentItemPosition - (distance + 1);
383 | if (closeTokenIndex < 0) {
384 | closeTokenIndex = wordsAndEmoticons.size() - Math.abs(closeTokenIndex);
385 | }
386 |
387 | if ((currentItemPosition > distance)
388 | && !Utils.WORD_VALENCE_DICTIONARY.containsKey(wordsAndEmoticons.get(closeTokenIndex)
389 | .toLowerCase())) {
390 | LOGGER.debug("Current Valence pre gramBasedValence: {}", currentValence);
391 | float gramBasedValence = adjustValenceIfCapital(wordsAndEmoticons.get(closeTokenIndex),
392 | currentValence, textProperties.isYelling());
393 | LOGGER.debug("Current Valence post gramBasedValence: {}", currentValence);
394 | /*
395 | * At distance of 1, reduce current gram's valence by 5%.
396 | * At distance of 2, reduce current gram's valence by 10%.
397 | */
398 | if (gramBasedValence != 0.0F) {
399 | if (distance == 1) {
400 | gramBasedValence *= Valence.ONE_WORD_DISTANCE_DAMPING_FACTOR.getValue();
401 | } else if (distance == 2) {
402 | gramBasedValence *= Valence.TWO_WORD_DISTANCE_DAMPING_FACTOR.getValue();
403 | }
404 | }
405 | currentValence += gramBasedValence;
406 |
407 | LOGGER.debug("Current Valence post gramBasedValence and distance "
408 | + "based damping: {}", currentValence);
409 |
410 | currentValence = dampValenceIfNegativeTokensFound(currentValence, distance,
411 | currentItemPosition, closeTokenIndex, wordsAndEmoticons);
412 |
413 | LOGGER.debug("Current Valence post \"never\" check: {}", currentValence);
414 |
415 | /*
416 | * At a distance of 2, we check for idioms in bi-grams and tri-grams around currentItemPosition.
417 | */
418 | if (distance == 2) {
419 | currentValence = adjustValenceIfIdiomsFound(currentValence, currentItemPosition,
420 | wordsAndEmoticons, distance);
421 | LOGGER.debug("Current Valence post Idiom check: {}", currentValence);
422 | }
423 | }
424 |
425 | distance++;
426 | }
427 | currentValence = adjustValenceIfHasAtLeast(currentItemPosition, wordsAndEmoticons, currentValence);
428 | }
429 |
430 | sentiments.add(currentValence);
431 | }
432 | LOGGER.debug("Sentiment state after first pass through tokens: {}", sentiments);
433 |
434 | sentiments = adjustValenceIfHasConjunction(wordsAndEmoticons, sentiments);
435 | LOGGER.debug("Sentiment state after checking conjunctions: {}", sentiments);
436 |
437 | return sentiments;
438 | }
439 |
440 | /**
441 | * This methods calculates the positive, negative and neutral sentiment from the sentiment values of the input
442 | * string.
443 | *
444 | * @param tokenWiseSentimentState valence of the each token in input string
445 | * @param punctuationAmplifier valence adjustment factor for punctuations
446 | * @return an object of the non-normalized scores as {@link RawSentimentScores}.
447 | */
448 | private static RawSentimentScores computeRawSentimentScores(final List tokenWiseSentimentState,
449 | final float punctuationAmplifier) {
450 | float positiveSentimentScore = 0.0F;
451 | float negativeSentimentScore = 0.0F;
452 | int neutralSentimentCount = 0;
453 | for (Float valence : tokenWiseSentimentState) {
454 | if (valence > 0.0F) {
455 | positiveSentimentScore += valence + 1.0F;
456 | } else if (valence < 0.0F) {
457 | negativeSentimentScore += valence - 1.0F;
458 | } else {
459 | neutralSentimentCount += 1;
460 | }
461 | }
462 |
463 | if (positiveSentimentScore > Math.abs(negativeSentimentScore)) {
464 | positiveSentimentScore += punctuationAmplifier;
465 | } else if (positiveSentimentScore < Math.abs(negativeSentimentScore)) {
466 | negativeSentimentScore -= punctuationAmplifier;
467 | }
468 |
469 | return new RawSentimentScores(positiveSentimentScore, negativeSentimentScore, (float) neutralSentimentCount);
470 | }
471 |
472 | /**
473 | * The compound score is computed by summing the valence scores of each word in the lexicon, adjusted
474 | * according to the rules, and then normalized to be between -1 (most extreme negative) and +1
475 | * (most extreme positive). This is the most useful metric if you want a single uni-dimensional measure
476 | * of sentiment for a given sentence. Calling it a 'normalized, weighted composite score' is accurate.
477 | *
478 | * @param tokenWiseSentimentState valence for each token
479 | * @param punctuationAmplifier valence adjustment factor for punctuations
480 | * @return raw compound polarity
481 | */
482 | private static float computeCompoundPolarityScore(final List tokenWiseSentimentState,
483 | final float punctuationAmplifier) {
484 | /*
485 | * Compute the total valence.
486 | */
487 | float totalValence = tokenWiseSentimentState.stream().reduce(0.0F, Float::sum);
488 | LOGGER.debug("Total valence: {}", totalValence);
489 |
490 | if (totalValence > 0.0F) {
491 | totalValence += punctuationAmplifier;
492 | } else if (totalValence < 0.0F) {
493 | totalValence -= punctuationAmplifier;
494 | }
495 |
496 | return totalValence;
497 | }
498 |
499 | /**
500 | * Normalize the compound score and the other three raw sentiment scores.
501 | *
502 | * @param rawSentimentScores multi-dimensional sentiment scores.
503 | * @param compoundPolarityScore uni-dimensional sentiment score.
504 | * @return normalized values of all the type of the sentiment scores in a object of {@link SentimentPolarities}.
505 | */
506 | private static SentimentPolarities normalizeAllScores(final RawSentimentScores rawSentimentScores,
507 | final float compoundPolarityScore) {
508 | final float positiveSentimentScore = rawSentimentScores.getPositiveScore();
509 | final float negativeSentimentScore = rawSentimentScores.getNegativeScore();
510 | final int neutralSentimentCount = Math.round(rawSentimentScores.getNeutralScore());
511 |
512 | final float normalizationFactor = positiveSentimentScore + Math.abs(negativeSentimentScore)
513 | + neutralSentimentCount;
514 |
515 | if (LOGGER.isDebugEnabled()) {
516 | LOGGER.debug("Normalization Factor: {}", normalizationFactor);
517 | LOGGER.debug("Pre-Normalized Scores: {} {} {} {}}",
518 | Math.abs(positiveSentimentScore),
519 | Math.abs(negativeSentimentScore),
520 | Math.abs(neutralSentimentCount),
521 | compoundPolarityScore
522 | );
523 | }
524 |
525 | final float absolutePositivePolarity = Math.abs(positiveSentimentScore / normalizationFactor);
526 | final float absoluteNegativePolarity = Math.abs(negativeSentimentScore / normalizationFactor);
527 | final float absoluteNeutralPolarity = Math.abs(neutralSentimentCount / normalizationFactor);
528 |
529 | LOGGER.debug("Pre-Round Scores: {} {} {} {}}",
530 | absolutePositivePolarity,
531 | absoluteNegativePolarity,
532 | absoluteNeutralPolarity,
533 | compoundPolarityScore
534 | );
535 |
536 | final float normalizedPositivePolarity = roundDecimal(absolutePositivePolarity, 3);
537 | final float normalizedNegativePolarity = roundDecimal(absoluteNegativePolarity, 3);
538 | final float normalizedNeutralPolarity = roundDecimal(absoluteNeutralPolarity, 3);
539 |
540 | // Normalizing the compound score.
541 | final float normalizedCompoundPolarity = roundDecimal(normalizeCompoundScore(compoundPolarityScore), 4);
542 |
543 | return new SentimentPolarities(normalizedPositivePolarity, normalizedNegativePolarity,
544 | normalizedNeutralPolarity, normalizedCompoundPolarity);
545 | }
546 |
547 | /**
548 | * Convert the lower level token wise valence to a higher level polarity scores.
549 | *
550 | * @param tokenWiseSentimentStateParam the token wise scores of the input string
551 | * @param punctuationAmplifier valence adjustment factor for punctuations
552 | * @return the positive, negative, neutral and compound polarity scores as a map
553 | */
554 | private static SentimentPolarities getPolarityScores(final List tokenWiseSentimentStateParam,
555 | final float punctuationAmplifier) {
556 | final List tokenWiseSentimentState = Collections.unmodifiableList(tokenWiseSentimentStateParam);
557 | LOGGER.debug("Final token-wise sentiment state: {}", tokenWiseSentimentState);
558 |
559 | final float compoundPolarity = computeCompoundPolarityScore(tokenWiseSentimentState, punctuationAmplifier);
560 | final RawSentimentScores rawSentimentScores = computeRawSentimentScores(tokenWiseSentimentState,
561 | punctuationAmplifier);
562 |
563 | return normalizeAllScores(rawSentimentScores, compoundPolarity);
564 | }
565 |
566 | /**
567 | * This function jointly performs the boosting if input string contains
568 | * '!'s and/or '?'s and then returns the sum of the boosted scores from
569 | * {@link SentimentAnalyzer#boostByExclamation(String)} and {@link SentimentAnalyzer#boostByQuestionMark(String)}.
570 | *
571 | * @param input the input string that needs to be processed.
572 | * @return joint boosted score
573 | */
574 | private static float boostByPunctuation(String input) {
575 | return boostByExclamation(input) + boostByQuestionMark(input);
576 | }
577 |
578 | /**
579 | * Valence boosting when '!' is found in the input string.
580 | *
581 | * @param input the input string that needs to be processed.
582 | * @return boosting score
583 | */
584 | private static float boostByExclamation(String input) {
585 | final int exclamationCount =
586 | StringUtils.countMatches(input, SentimentModifyingTokens.EXCLAMATION_MARK.getValue());
587 | return Math.min(exclamationCount, Constants.MAX_EXCLAMATION_MARKS)
588 | * Valence.EXCLAMATION_BOOSTING.getValue();
589 | }
590 |
591 | /**
592 | * Valence boosting when '?' is found in the input string.
593 | *
594 | * @param input the input string that needs to be processed.
595 | * @return boosting score
596 | */
597 | private static float boostByQuestionMark(String input) {
598 | final int questionMarkCount =
599 | StringUtils.countMatches(input, SentimentModifyingTokens.QUESTION_MARK.getValue());
600 | float questionMarkAmplifier = 0.0F;
601 | if (questionMarkCount > 1) {
602 | if (questionMarkCount <= Constants.MAX_QUESTION_MARKS) {
603 | questionMarkAmplifier = questionMarkCount * Valence.QUESTION_MARK_MAX_COUNT_BOOSTING.getValue();
604 | } else {
605 | questionMarkAmplifier = Valence.QUESTION_MARK_BOOSTING.getValue();
606 | }
607 | }
608 | return questionMarkAmplifier;
609 | }
610 |
611 | /**
612 | * This methods manages the effect of contrastive conjunctions like "but" on the valence of a token.
613 | * "VADER" only support "but/BUT" as a conjunction that modifies the valence.
614 | *
615 | * @param inputTokensParam list of token and/or emoticons in the input string
616 | * @param tokenWiseSentimentStateParam current token wise sentiment scores
617 | * @return adjusted token wise sentiment scores
618 | */
619 | private static List adjustValenceIfHasConjunction(final List inputTokensParam,
620 | final List tokenWiseSentimentStateParam) {
621 | final List inputTokens = Collections.unmodifiableList(inputTokensParam);
622 | final List tokenWiseSentimentState = new ArrayList<>(tokenWiseSentimentStateParam);
623 |
624 | int indexOfConjunction = inputTokens.indexOf(SentimentModifyingTokens.BUT.getValue());
625 | if (indexOfConjunction < 0) {
626 | indexOfConjunction = inputTokens.indexOf(SentimentModifyingTokens.BUT.getValue().toUpperCase());
627 | }
628 | if (indexOfConjunction >= 0) {
629 | for (int valenceIndex = 0; valenceIndex < tokenWiseSentimentState.size(); valenceIndex++) {
630 | float currentValence = tokenWiseSentimentState.get(valenceIndex);
631 | if (valenceIndex < indexOfConjunction) {
632 | currentValence *= Valence.PRE_CONJUNCTION_ADJUSTMENT_FACTOR.getValue();
633 | } else if (valenceIndex > indexOfConjunction) {
634 | currentValence *= Valence.POST_CONJUNCTION_ADJUSTMENT_FACTOR.getValue();
635 | }
636 | tokenWiseSentimentState.set(valenceIndex, currentValence);
637 | }
638 | }
639 | return tokenWiseSentimentState;
640 | }
641 |
642 | /**
643 | * Check for the cases where you have phrases having "least" in the words preceding the token at
644 | * currentItemPosition and accordingly adjust the valence.
645 | *
646 | * @param currentItemPosition position of the token in wordsAndEmoticons around which we will search for "least"
647 | * type phrases
648 | * @param wordsAndEmoticonsParam list of token and/or emoticons in the input string
649 | * @param currentValence valence of the token at currentItemPosition
650 | * @return adjusted currentValence
651 | */
652 | private static float adjustValenceIfHasAtLeast(final int currentItemPosition,
653 | final List wordsAndEmoticonsParam,
654 | final float currentValence) {
655 | final List wordsAndEmoticons = Collections.unmodifiableList(wordsAndEmoticonsParam);
656 | float valence = currentValence;
657 | if (currentItemPosition > 1
658 | && !Utils.WORD_VALENCE_DICTIONARY.containsKey(wordsAndEmoticons.get(currentItemPosition - 1)
659 | .toLowerCase())
660 | && wordsAndEmoticons.get(currentItemPosition - 1)
661 | .toLowerCase().equals(SentimentModifyingTokens.LEAST.getValue())) {
662 | if (!(wordsAndEmoticons.get(currentItemPosition - 2).toLowerCase()
663 | .equals(SentimentModifyingTokens.AT.getValue())
664 | || wordsAndEmoticons.get(currentItemPosition - 2).toLowerCase()
665 | .equals(SentimentModifyingTokens.VERY.getValue()))) {
666 | valence *= Valence.NEGATIVE_WORD_DAMPING_FACTOR.getValue();
667 | }
668 | } else if (currentItemPosition > 0
669 | && !Utils.WORD_VALENCE_DICTIONARY.containsKey(wordsAndEmoticons.get(currentItemPosition - 1).toLowerCase())
670 | && wordsAndEmoticons.get(currentItemPosition - 1).equals(SentimentModifyingTokens.LEAST.getValue())) {
671 | valence *= Valence.NEGATIVE_WORD_DAMPING_FACTOR.getValue();
672 | }
673 | return valence;
674 | }
675 |
676 | /**
677 | * Check if token has "n't" in the end.
678 | *
679 | * @param token current token
680 | * @return true iff token has "n't" in the end
681 | */
682 | private static boolean hasContraction(final String token) {
683 | return token.endsWith(SentimentModifyingTokens.CONTRACTION.getValue());
684 | }
685 |
686 | /**
687 | * Check if token belongs to a pre-defined list of negative words. e.g. {@link Utils#NEGATIVE_WORDS}
688 | * and also checks if the token has "n't" in the end.
689 | *
690 | * @param token current token
691 | * @param checkContractions flag to check "n't" in end of token
692 | * @return true iff token is in newNegWords or if checkContractions is true, token should have "n't" in its end
693 | */
694 | private static boolean isNegative(final String token, final boolean checkContractions) {
695 | final boolean result = Utils.NEGATIVE_WORDS.contains(token);
696 | if (!checkContractions) {
697 | return result;
698 | }
699 | return result || hasContraction(token);
700 | }
701 |
702 | /**
703 | * This is the default version of {@link SentimentAnalyzer#isNegative(String, boolean)}.
704 | *
705 | * @param token current token
706 | * @return true iff token is in {@link Utils#NEGATIVE_WORDS} or token has "n't" in its end
707 | */
708 | private static boolean isNegative(final String token) {
709 | return isNegative(token, true);
710 | }
711 |
712 | /**
713 | * Normalize the total valence of the input string, where alpha is the estimated maximum value of valence.
714 | *
715 | * @param score score
716 | * @param alpha estimated max value
717 | * @return normalized value of score
718 | */
719 | private static float normalizeCompoundScore(final float score, final float alpha) {
720 | final double normalizedScore = score / Math.sqrt((score * score) + alpha);
721 | return (float) normalizedScore;
722 | }
723 |
724 | /**
725 | * Default version of {@link SentimentAnalyzer#normalizeCompoundScore(float, float)} where alpha is 15.0.
726 | *
727 | * @param score score
728 | * @return normalized value of score
729 | */
730 | private static float normalizeCompoundScore(final float score) {
731 | return normalizeCompoundScore(score, Constants.DEFAULT_ALPHA);
732 | }
733 |
734 | /**
735 | * This method rounds of a float value to defined no. of places.
736 | *
737 | * @param currentValue current float values
738 | * @param noOfPlaces no. of decimal places
739 | * @return rounded float value
740 | */
741 | private static float roundDecimal(final float currentValue, final int noOfPlaces) {
742 | final float factor = (float) Math.pow(10.0, (double) noOfPlaces);
743 | final float number = Math.round(currentValue * factor);
744 | return number / factor;
745 | }
746 |
747 | /**
748 | * This is a composite function that computes token-wise sentiment scores and then converts that to
749 | * higher level scores.
750 | *
751 | * @param inputString string that is to be processed.
752 | * @return the positive, negative, neutral and compound polarity scores as {@link SentimentPolarities}
753 | */
754 | private static SentimentPolarities computeSentimentPolaritiesFor(String inputString) {
755 | // Parse the string using Lucene and get the text tokens.
756 | final TextProperties inputStringProperties;
757 | try {
758 | inputStringProperties = new TextProperties(inputString);
759 | } catch (IOException excp) {
760 | LOGGER.error("There was an issue while pre-processing the inputString.", excp);
761 | return SentimentPolarities.emptySentimentState();
762 | }
763 |
764 | // Calculate the per-token valence.
765 | final List tokenWiseSentiments = getTokenWiseSentiment(inputStringProperties);
766 | if (tokenWiseSentiments.isEmpty()) {
767 | return SentimentPolarities.emptySentimentState();
768 | }
769 | // Adjust the total valence score on the basis of the punctuations in the input string.
770 | final float punctuationAmplifier = boostByPunctuation(inputString);
771 | return getPolarityScores(tokenWiseSentiments, punctuationAmplifier);
772 | }
773 | }
774 | //CHECKSTYLE.ON: ExecutableStatementCount
775 | //CHECKSTYLE.ON: JavaNCSS
776 | //CHECKSTYLE.ON: CyclomaticComplexity
777 | //CHECKSTYLE.ON: NPathComplexity
778 |
--------------------------------------------------------------------------------