) null));
55 |
56 | attributes.add(new Attribute("pos"));
57 | attributes.add(new Attribute("neg"));
58 |
59 |
60 |
61 |
62 |
63 | Instances dataset = new Instances(
64 | "6HumanCoded Dataset", attributes, 0); // The
65 | // last
66 | // attribute
67 |
68 | BufferedReader bf = new BufferedReader(new FileReader(collectionPath));
69 | String line=bf.readLine();
70 | while ((line = bf.readLine()) != null) {
71 | String parts[] = line.split("\t");
72 |
73 | if(parts.length==3){
74 | String content=parts[2];
75 | int pos=Integer.parseInt(parts[0].trim());
76 | int neg=Integer.parseInt(parts[1].trim());
77 |
78 |
79 |
80 | double values[] = new double[3];
81 |
82 | values[0] = dataset.attribute(0).addStringValue(content);
83 | values[1] = pos;
84 | values[2] = neg;
85 |
86 | Instance inst = new DenseInstance(1, values);
87 | dataset.add(inst);
88 |
89 | }
90 |
91 | }
92 |
93 |
94 | bf.close();
95 |
96 | return dataset;
97 | }
98 |
99 | /**
100 | * Main method for testing this class.
101 | *
102 | * should contain the path of input dataset and the name of
103 | * target file scheme (see Evaluation)
104 | * @param args arguments
105 | */
106 | static public void main(String args[]) {
107 |
108 | if (args.length == 2) {
109 |
110 | TweetCollectionToArff ta = new HumanCodedToArff();
111 |
112 | try {
113 | Instances dataset = ta.createDataset(args[0]);
114 | ArffSaver saver = new ArffSaver();
115 | saver.setInstances(dataset);
116 |
117 | saver.setFile(new File(args[1]));
118 | saver.writeBatch();
119 |
120 | } catch (Exception e) {
121 | // TODO Auto-generated catch block
122 | e.printStackTrace();
123 | }
124 |
125 | }
126 |
127 | }
128 |
129 | }
130 |
--------------------------------------------------------------------------------
/doc/deprecated-list.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Deprecated List
7 |
8 |
9 |
10 |
11 |
12 |
22 |
23 | JavaScript is disabled on your browser.
24 |
25 |
26 |
43 |
70 |
71 |
75 |
76 |
93 |
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/PTCMTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.filters.AbstractFilterTest;
24 | import weka.filters.Filter;
25 |
26 | import junit.framework.Test;
27 | import junit.framework.TestSuite;
28 |
29 | import java.io.File;
30 |
31 | /**
32 | * Tests PTCM. Run from the command line with:
33 | * java weka.filters.unsupervised.attribute.PTCMTest
34 | *
35 | * AffectiveTweets package must either be installed or
36 | * JVM must be started in AffectiveTweets directory.
37 | *
38 | * @author FracPete and eibe
39 | * @version $Revision: 9568 $
40 | */
41 | public class PTCMTest extends AbstractFilterTest {
42 |
43 | public PTCMTest(String name) {
44 | super(name);
45 | }
46 |
47 | /** Creates a default PTCM filter */
48 | public Filter getFilter() {
49 | Filter f = null;
50 |
51 | // Check to see if the test is run from directory containing build_package.xml
52 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
53 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
54 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
55 | f = new PTCM();
56 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
57 | } else {
58 | f = new PTCM(); // Hope that the package is installed.
59 | }
60 | return f;
61 | }
62 |
63 | /**
64 | * PTCM is not suitable for use in a FilteredClassifier, so this just creates a dummy
65 | * FilteredClassifier so that the tests run through.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | result.setFilter(new weka.filters.AllFilter());
75 | result.setClassifier(new weka.classifiers.rules.ZeroR());
76 |
77 | return result;
78 | }
79 |
80 | /**
81 | * Called by JUnit before each test method. Sets up the Instances object to use based on
82 | * one of the datasets that comes with the package.
83 | *
84 | * @throws Exception if an error occurs reading the example instances.
85 | */
86 | protected void setUp() throws Exception {
87 | super.setUp();
88 |
89 | // Check to see if the test is run from directory containing build_package.xml
90 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
91 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
92 | } else { // Hope that package is installed.
93 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
94 | }
95 |
96 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
97 | }
98 |
99 | public static Test suite() {
100 | return new TestSuite(PTCMTest.class);
101 | }
102 |
103 | public static void main(String[] args){
104 | junit.textui.TestRunner.run(suite());
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/TweetCentroidTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.filters.AbstractFilterTest;
24 | import weka.filters.Filter;
25 |
26 | import junit.framework.Test;
27 | import junit.framework.TestSuite;
28 |
29 | import java.io.File;
30 |
31 | /**
32 | * Tests TweetCentroid. Run from the command line with:
33 | * java weka.filters.unsupervised.attribute.TweetCentroidTest
34 | *
35 | * AffectiveTweets package must either be installed or
36 | * JVM must be started in AffectiveTweets directory.
37 | *
38 | * @author FracPete and eibe
39 | * @version $Revision: 9568 $
40 | */
41 | public class TweetCentroidTest extends AbstractFilterTest {
42 |
43 | public TweetCentroidTest(String name) {
44 | super(name);
45 | }
46 |
47 | /** Creates a default TweetCentroid filter */
48 | public Filter getFilter() {
49 | Filter f = null;
50 |
51 | // Check to see if the test is run from directory containing build_package.xml
52 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
53 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
54 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
55 | f = new TweetCentroid();
56 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
57 | } else {
58 | f = new TweetCentroid(); // Hope that the package is installed.
59 | }
60 | return f;
61 | }
62 |
63 | /**
64 | * TweetCentroid is not suitable for use in a FilteredClassifier, so this just creates a dummy
65 | * FilteredClassifier so that the tests run through.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | result.setFilter(new weka.filters.AllFilter());
75 | result.setClassifier(new weka.classifiers.rules.ZeroR());
76 |
77 | return result;
78 | }
79 |
80 | /**
81 | * Called by JUnit before each test method. Sets up the Instances object to use based on
82 | * one of the datasets that comes with the package.
83 | *
84 | * @throws Exception if an error occurs reading the example instances.
85 | */
86 | protected void setUp() throws Exception {
87 | super.setUp();
88 |
89 | // Check to see if the test is run from directory containing build_package.xml
90 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
91 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
92 | } else { // Hope that package is installed.
93 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
94 | }
95 |
96 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
97 | }
98 |
99 | public static Test suite() {
100 | return new TestSuite(TweetCentroidTest.class);
101 | }
102 |
103 | public static void main(String[] args){
104 | junit.textui.TestRunner.run(suite());
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/ASATest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.filters.AbstractFilterTest;
24 | import weka.filters.Filter;
25 |
26 | import junit.framework.Test;
27 | import junit.framework.TestSuite;
28 |
29 | import java.io.File;
30 |
31 | /**
32 | * Tests ASA. Run from the command line with:
33 | * java weka.filters.unsupervised.attribute.ASATest
34 | *
35 | * AffectiveTweets package must either be installed or
36 | * JVM must be started in AffectiveTweets directory.
37 | *
38 | * @author FracPete and eibe
39 | * @version $Revision: 9568 $
40 | */
41 | public class ASATest extends AbstractFilterTest {
42 |
43 | public ASATest(String name) {
44 | super(name);
45 | }
46 |
47 | /** Creates a default ASA filter */
48 | public Filter getFilter() {
49 | Filter f = null;
50 |
51 | // Check to see if the test is run from directory containing build_package.xml
52 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
53 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
54 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
55 | f = new ASA();
56 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
57 | } else {
58 | f = new ASA(); // Hope that the package is installed.
59 | }
60 | return f;
61 | }
62 |
63 | /**
64 | * ASA is not suitable for use in a FilteredClassifier, so this just creates a dummy
65 | * FilteredClassifier so that the tests run through.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | result.setFilter(new weka.filters.AllFilter());
75 | result.setClassifier(new weka.classifiers.rules.ZeroR());
76 |
77 | return result;
78 | }
79 |
80 | /**
81 | * Called by JUnit before each test method. Sets up the Instances object to use based on
82 | * one of the datasets that comes with the package.
83 | *
84 | * @throws Exception if an error occurs reading the example instances.
85 | */
86 | protected void setUp() throws Exception {
87 | super.setUp();
88 |
89 | // Check to see if the test is run from directory containing build_package.xml
90 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
91 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
92 | } else { // Hope that package is installed.
93 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
94 | }
95 |
96 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
97 | }
98 |
99 |
100 | /* (non-Javadoc)
101 | * @see weka.filters.AbstractFilterTest#testBuffered()
102 | */
103 | public void testBuffered(){}
104 |
105 | public static Test suite() {
106 | return new TestSuite(ASATest.class);
107 | }
108 |
109 | public static void main(String[] args){
110 | junit.textui.TestRunner.run(suite());
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
/src/main/java/weka/core/converters/SemEvalToArff.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * SemEvalToArff.java
18 | * Copyright (C) 1999-2018 University of Waikato, Hamilton, New Zealand
19 | *
20 | */
21 |
22 |
23 |
24 | package weka.core.converters;
25 |
26 | import java.io.BufferedReader;
27 | import java.io.File;
28 | import java.io.FileReader;
29 | import java.util.ArrayList;
30 |
31 | import weka.core.Attribute;
32 | import weka.core.DenseInstance;
33 | import weka.core.Instance;
34 | import weka.core.Instances;
35 | import weka.core.converters.ArffSaver;
36 |
37 | /**
38 | * Builds an arff dataset from the SemEval collection of tweets for sentiment
39 | * analysis. More info about the task:
40 | * https://www.cs.york.ac.uk/semeval-2013/task2/
41 | *
42 | * @author Felipe Bravo-Marquez (fjb11 at students.waikato.ac.nz)
43 | * @version 1.0
44 | */
45 |
46 | public class SemEvalToArff extends TweetCollectionToArff {
47 |
48 | /* (non-Javadoc)
49 | * @see weka.core.converters.TweetCollectionToArff#createDataset(java.lang.String)
50 | */
51 | @Override
52 | public Instances createDataset(String collectionPath) throws Exception {
53 |
54 | ArrayList attributes = new ArrayList();
55 |
56 | // The content of the tweet
57 | attributes.add(new Attribute("content", (ArrayList) null));
58 |
59 | // The target label
60 | ArrayList label = new ArrayList();
61 | label.add("positive");
62 | label.add("neutral");
63 | label.add("negative");
64 |
65 | attributes.add(new Attribute("Class", label));
66 | Instances dataset = new Instances(
67 | "Twitter Sentiment Analysis SemEval Dataset", attributes, 0); // The
68 | // last
69 | // attribute
70 |
71 | BufferedReader bf = new BufferedReader(new FileReader(collectionPath));
72 | String line;
73 | while ((line = bf.readLine()) != null) {
74 | String parts[] = line.split("\t");
75 |
76 | String content = parts[3];
77 | String target = parts[2];
78 |
79 | double values[] = new double[2];
80 |
81 | // add the content
82 | values[0] = dataset.attribute(0).addStringValue(content);
83 |
84 | // add the label
85 | if (target.equals("positive")) {
86 | values[1] = dataset.attribute(1).indexOfValue("positive");
87 | } else if (target.equals("neutral") || target.equals("objective")
88 | || target.equals("objective-OR-neutral")) {
89 | values[1] = dataset.attribute(1).indexOfValue("neutral");
90 | } else {
91 | values[1] = dataset.attribute(1).indexOfValue("negative");
92 | }
93 |
94 | Instance inst = new DenseInstance(1, values);
95 | dataset.add(inst);
96 |
97 | }
98 |
99 | // set the class index
100 | dataset.setClassIndex(dataset.numAttributes() - 1);
101 |
102 | bf.close();
103 |
104 | return dataset;
105 | }
106 |
107 | /**
108 | * Main method for testing this class.
109 | *
110 | *
111 | * should contain the path of input dataset and the name of
112 | * target file scheme (see Evaluation)
113 | *@param args arguments
114 | */
115 | static public void main(String args[]) {
116 |
117 | if (args.length == 2) {
118 |
119 | TweetCollectionToArff ta = new SemEvalToArff();
120 |
121 | try {
122 | Instances dataset = ta.createDataset(args[0]);
123 | ArffSaver saver = new ArffSaver();
124 | saver.setInstances(dataset);
125 |
126 | saver.setFile(new File(args[1]));
127 | saver.writeBatch();
128 |
129 | } catch (Exception e) {
130 | // TODO Auto-generated catch block
131 | e.printStackTrace();
132 | }
133 |
134 | }
135 |
136 | }
137 |
138 | }
139 |
--------------------------------------------------------------------------------
/src/main/java/affective/core/PolarityLexiconEvaluator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * PolarityLexiconEvaluator.java
18 | * Copyright (C) 1999-2016 University of Waikato, Hamilton, New Zealand
19 | *
20 | */
21 |
22 |
23 | package affective.core;
24 |
25 | import java.io.BufferedReader;
26 | import java.io.FileInputStream;
27 | import java.io.IOException;
28 | import java.io.InputStreamReader;
29 | import java.util.ArrayList;
30 | import java.util.HashMap;
31 | import java.util.List;
32 | import java.util.Map;
33 | import java.util.zip.GZIPInputStream;
34 |
35 |
36 | /**
37 | *
38 | * This class is used for evaluating the polarity lexicons with positive and negative
39 | * nominal entries.
40 | *
41 | *
42 | *
43 | * @author Felipe Bravo-Marquez (fbravoma@waikato.ac.nz)
44 | * @version $Revision: 1 $
45 | */
46 | public class PolarityLexiconEvaluator extends LexiconEvaluator {
47 |
48 | /** For serialization. */
49 | private static final long serialVersionUID = 5921580335557644894L;
50 |
51 | /** A mapping between words and the sentiment label. */
52 | protected Map dict;
53 |
54 |
55 | /**
56 | * initializes the Object
57 | *
58 | * @param file the file with the lexicon
59 | * @param name the prefix for all the attributes calculated from this lexicon
60 | */
61 | public PolarityLexiconEvaluator(String file,String name) {
62 | super(file,name);
63 | this.dict = new HashMap();
64 |
65 | this.featureNames=new ArrayList();
66 | this.featureNames.add(name+"-posCount");
67 | this.featureNames.add(name+"-negCount");
68 |
69 | }
70 |
71 |
72 | /* (non-Javadoc)
73 | * @see affective.core.LexiconEvaluator#processDict()
74 | */
75 | public void processDict() throws IOException {
76 | // first, we open the file
77 | FileInputStream fin = new FileInputStream(this.path);
78 | GZIPInputStream gzis = new GZIPInputStream(fin);
79 | InputStreamReader xover = new InputStreamReader(gzis);
80 | BufferedReader bf = new BufferedReader(xover);
81 |
82 | String line;
83 | while ((line = bf.readLine()) != null) {
84 | String pair[] = line.split("\t");
85 | this.dict.put(pair[0], pair[1]);
86 |
87 | }
88 | bf.close();
89 | xover.close();
90 | gzis.close();
91 | fin.close();
92 |
93 | }
94 |
95 | /**
96 | * returns the sentiment associated with a word
97 | *
98 | * @param word the input word
99 | * @return the value for the word
100 | */
101 | public String retrieveValue(String word) {
102 | if (!this.dict.containsKey(word)) {
103 | return "not_found";
104 | } else {
105 | return this.dict.get(word);
106 | }
107 |
108 | }
109 |
110 |
111 | /* (non-Javadoc)
112 | * @see affective.core.LexiconEvaluator#evaluateTweet(java.util.List)
113 | */
114 | @Override
115 | public Map evaluateTweet(List tokens) {
116 | Map sentCount = new HashMap();
117 |
118 | double negCount = 0.0;
119 | double posCount = 0.0;
120 |
121 | for (String w : tokens) {
122 | String pol = this.retrieveValue(w);
123 | if (pol.equals("positive")) {
124 | posCount++;
125 | } else if (pol.equals("negative")) {
126 | negCount++;
127 | }
128 | }
129 |
130 | sentCount.put(this.name+"-posCount", posCount);
131 | sentCount.put(this.name+"-negCount", negCount);
132 |
133 | return sentCount;
134 | }
135 |
136 |
137 | /**
138 | * Gets the dictionary mapping the words to their sentiment
139 | *
140 | * @return the dictionary.
141 | */
142 | public Map getDict() {
143 | return this.dict;
144 | }
145 |
146 | }
147 |
--------------------------------------------------------------------------------
/src/main/java/affective/core/IntensityLexiconEvaluator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * IntensityLexiconEvaluator.java
18 | * Copyright (C) 1999-2018 University of Waikato, Hamilton, New Zealand
19 | *
20 | */
21 |
22 |
23 |
24 | package affective.core;
25 |
26 | import java.io.BufferedReader;
27 | import java.io.FileInputStream;
28 | import java.io.IOException;
29 | import java.io.InputStreamReader;
30 | import java.util.ArrayList;
31 | import java.util.HashMap;
32 | import java.util.List;
33 | import java.util.Map;
34 | import java.util.zip.GZIPInputStream;
35 |
36 |
37 | /**
38 | *
39 | * This class is used for evaluating lexicons with numerical sentiment scores.
40 | *
41 | *
42 | *
43 | * @author Felipe Bravo-Marquez (fbravoma@waikato.ac.nz)
44 | * @version $Revision: 1 $
45 | */
46 | public class IntensityLexiconEvaluator extends LexiconEvaluator {
47 |
48 |
49 | /** For serialization. */
50 | private static final long serialVersionUID = -2094228012480778199L;
51 |
52 | /** The dictionary. */
53 | protected Map dict;
54 |
55 | /**
56 | * Initializes the Object
57 | *
58 | * @param file the file with the lexicon
59 | * @param name the prefix for all the attributes calculated from this lexicon
60 | */
61 | public IntensityLexiconEvaluator(String file,String name) {
62 | super(file,name);
63 | this.dict = new HashMap();
64 |
65 | this.featureNames=new ArrayList();
66 | this.featureNames.add(name+"-posScore");
67 | this.featureNames.add(name+"-negScore");
68 |
69 | }
70 |
71 |
72 | /* (non-Javadoc)
73 | * @see affective.core.LexiconEvaluator#processDict()
74 | */
75 | @Override
76 | public void processDict() throws IOException {
77 | // first, we open the file
78 | FileInputStream fin = new FileInputStream(this.path);
79 | GZIPInputStream gzis = new GZIPInputStream(fin);
80 | InputStreamReader xover = new InputStreamReader(gzis);
81 | BufferedReader bf = new BufferedReader(xover);
82 |
83 | String line;
84 | while ((line = bf.readLine()) != null) {
85 | String pair[] = line.split("\t");
86 | this.dict.put(pair[0], pair[1]);
87 |
88 | }
89 | bf.close();
90 | xover.close();
91 | gzis.close();
92 | fin.close();
93 |
94 | }
95 |
96 | /**
97 | * returns the score associated with a word
98 | *
99 | * @param word the input word
100 | * @return the value for the word
101 | */
102 | public String retrieveValue(String word) {
103 | if (!this.dict.containsKey(word)) {
104 | return "not_found";
105 | } else {
106 | return this.dict.get(word);
107 | }
108 |
109 | }
110 |
111 |
112 | /* (non-Javadoc)
113 | * @see affective.core.LexiconEvaluator#evaluateTweet(java.util.List)
114 | */
115 | @Override
116 | public Map evaluateTweet(List tokens) {
117 | Map strengthScores = new HashMap();
118 | double posScore = 0;
119 | double negScore = 0;
120 | for (String w : tokens) {
121 | String pol = this.retrieveValue(w);
122 | if (!pol.equals("not_found")) {
123 | double value = Double.parseDouble(pol);
124 | if (value > 0) {
125 | posScore += value;
126 | } else {
127 | negScore += value;
128 | }
129 | }
130 | }
131 | strengthScores.put(name+"-posScore", posScore);
132 | strengthScores.put(name+"-negScore", negScore);
133 |
134 | return strengthScores;
135 | }
136 |
137 | /**
138 | * Gets the dictionary mapping the words to their vectors
139 | *
140 | * @return the dictionary.
141 | */
142 | public Map getDict() {
143 | return this.dict;
144 | }
145 |
146 |
147 | }
148 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## About
4 |
5 | [AffectiveTweets](https://affectivetweets.cms.waikato.ac.nz/) is a [WEKA](http://www.cs.waikato.ac.nz/~ml/weka/) package for analyzing emotion and sentiment in tweets written in English.
6 |
7 | The package implements WEKA filters for calculating state-of-the-art affective analysis features from tweets that can be fed into machine learning algorithms. Many of these features were drawn from the [NRC-Canada System](http://saifmohammad.com/WebPages/NRC-Canada-Sentiment.htm). It also implements methods for building affective lexicons and distant supervision methods for training affective models from unlabelled tweets.
8 |
9 |
10 | The package was made available as the official baseline system for the [WASSA-2017](http://optima.jrc.it/wassa2017/) Shared Task on Emotion Intensity [(EmoInt)](http://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html) and for [SemEval-2018](http://alt.qcri.org/semeval2018/) Task 1: [Affect in Tweets](http://www.saifmohammad.com/WebPages/affectintweets.htm).
11 |
12 | Five participating teams used AffectiveTweets in WASSA-2017 to generate feature vectors, including the teams that eventually ranked first, second, and third. For SemEval-2018, the package was used by 15 teams.
13 |
14 | [https://affectivetweets.cms.waikato.ac.nz/](https://affectivetweets.cms.waikato.ac.nz/)
15 |
16 | ## Using AffectiveTweets
17 |
18 | * [About](https://affectivetweets.cms.waikato.ac.nz/#about)
19 | * [Installation](https://affectivetweets.cms.waikato.ac.nz/install/)
20 | * [Examples](https://affectivetweets.cms.waikato.ac.nz/examples/)
21 |
22 |
23 |
24 | ## Relevant Papers
25 |
26 | The most relevant papers on which this package is based are:
27 |
28 |
29 | * [Sentiment Analysis of Short Informal Texts](http://saifmohammad.com/WebDocs/NRC-Sentiment-JAIR-2014.pdf). Svetlana Kiritchenko, Xiaodan Zhu and Saif Mohammad. Journal of Artificial Intelligence Research, volume 50, pages 723-762, August 2014. [BibTeX](http://saifmohammad.com/WebDocs/JAIR14-bibtex.txt)
30 | * [Meta-Level Sentiment Models for Big Social Data Analysis](http://www.sciencedirect.com/science/article/pii/S0950705114002068). F. Bravo-Marquez, M. Mendoza and B. Poblete. Knowledge-Based Systems Volume 69, October 2014, Pages 86–99. [BibTex](http://dblp.uni-trier.de/rec/bib2/journals/kbs/Bravo-MarquezMP14.bib)
31 | * [Stance and sentiment in tweets](http://saifmohammad.com/WebDocs/1605.01655v1.pdf). Saif M. Mohammad, Parinaz Sobhani, and Svetlana Kiritchenko. 2017. Special Section of the ACM Transactions on Internet Technology on Argumentation in Social Media 17(3). [BibTeX](http://saifmohammad.com/WebPages/Abstracts/stance-toit.bib.txt)
32 | * [Sentiment strength detection for the social Web](http://dl.acm.org/citation.cfm?id=2336261). Thelwall, M., Buckley, K., & Paltoglou, G. (2012). Journal of the American Society for Information Science and Technology, 63(1), 163-173. [BibTex](http://dblp.uni-trier.de/rec/bib2/journals/jasis/ThelwallBP12.bib)
33 |
34 |
35 |
36 |
37 |
38 | ## Citation
39 | - Please cite the following paper if using this package in an academic publication:
40 |
41 | - F. Bravo-Marquez, E. Frank, B. Pfahringer, and S. M. Mohammad [AffectiveTweets: a WEKA Package for Analyzing Affect in Tweets](http://jmlr.org/papers/v20/18-450.html), In *Journal of Machine Learning Research* Volume 20(92), pages 1−6, 2019. ([pdf](https://felipebravom.com/publications/jmlr2019.pdf))
42 |
43 | You are also welcome to cite a previous publication describing the package:
44 |
45 | - S. M. Mohammad and F. Bravo-Marquez [Emotion Intensities in Tweets](http://anthology.aclweb.org/S/S17/S17-1007.pdf), In *\*Sem '17: Proceedings of the sixth joint conference on lexical and computational semantics (\*Sem)*, August 2017, Vancouver, Canada. ([pdf](https://felipebravom.com/publications/starsem2017.pdf))
46 |
47 | You should also cite the papers describing any of the lexicons or resources you are using with this package.
48 |
49 | * Here is the [BibTex](https://affectivetweets.cms.waikato.ac.nz/fullBio.bib.txt) entry for the package along with the entries for the resources included in the package.
50 |
51 | * Here is the [BibTex](https://affectivetweets.cms.waikato.ac.nz/shortBio.bib.txt) entry just for the package.
52 |
53 |
54 |
55 | ## Contact
56 | * Email: fbravoma at waikato.ac.nz
57 | * If you have questions about Weka please refer to the Weka [mailing list](https://list.waikato.ac.nz/mailman/listinfo/wekalist).
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
/src/main/java/affective/core/SWN3LexiconEvaluator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * SWN3LexiconEvaluator.java
18 | * Copyright (C) 1999-2018 University of Waikato, Hamilton, New Zealand
19 | *
20 | */
21 |
22 |
23 | package affective.core;
24 |
25 | import java.io.BufferedReader;
26 | import java.io.FileInputStream;
27 | import java.io.IOException;
28 | import java.io.InputStreamReader;
29 | import java.util.ArrayList;
30 | import java.util.HashMap;
31 | import java.util.List;
32 | import java.util.Map;
33 | import java.util.zip.GZIPInputStream;
34 |
35 | /**
36 | *
37 | * This class is used for evaluating SentiWordnet.
38 | *
39 | *
40 | *
41 | * @author Felipe Bravo-Marquez (fbravoma@waikato.ac.nz)
42 | * @version $Revision: 1 $
43 | */
44 | public class SWN3LexiconEvaluator extends LexiconEvaluator{
45 |
46 | /** For serialization. */
47 | private static final long serialVersionUID = 1576067300486821206L;
48 |
49 | /** The dictionary. */
50 | protected Map dict;
51 |
52 | /**
53 | * initializes the Object
54 | *
55 | * @param path the file with the lexicon
56 | * @param name the prefix for all the attributes calculated from this lexicon
57 | */
58 | public SWN3LexiconEvaluator(String path, String name) {
59 | super(path,name);
60 |
61 | this.dict = new HashMap();
62 |
63 | this.featureNames=new ArrayList();
64 | this.featureNames.add(name+"-posScore");
65 | this.featureNames.add(name+"-negScore");
66 | }
67 |
68 |
69 | /* (non-Javadoc)
70 | * @see affective.core.LexiconEvaluator#processDict()
71 | */
72 | @Override
73 | public void processDict() throws IOException {
74 |
75 |
76 | FileInputStream fin = new FileInputStream(this.path);
77 | GZIPInputStream gzis = new GZIPInputStream(fin);
78 | InputStreamReader xover = new InputStreamReader(gzis);
79 | BufferedReader bf = new BufferedReader(xover);
80 |
81 |
82 | String line = "";
83 |
84 | // discard comments
85 | while ((line = bf.readLine()) != null) {
86 | if (line.startsWith("#") || line.startsWith(" #")) {
87 | continue;
88 | }
89 |
90 | String[] data = line.split("\t");
91 |
92 | // Difference between positive and negative score for one particular Synset
93 | Double polScore = Double.parseDouble(data[2])
94 | - Double.parseDouble(data[3]);
95 |
96 | // extract all the synset terms
97 | String[] sysSetTerms = data[4].split(" ");
98 | for (String w : sysSetTerms) {
99 | String[] w_n = w.split("#");
100 |
101 | String word=w_n[0];
102 | // the word's rank, small values indicate a more popular meaning
103 | // More popular word receive a higher weight
104 | int rank = Integer.parseInt(w_n[1]);
105 |
106 | if (this.dict.containsKey(word)) {
107 | Double prevScore=this.dict.get(word);
108 | this.dict.put(word, prevScore + polScore/(1+rank));
109 | } else {
110 | this.dict.put(word, polScore/(1+rank));
111 | }
112 | }
113 | }
114 |
115 | bf.close();
116 | xover.close();
117 | gzis.close();
118 | fin.close();
119 | }
120 |
121 |
122 | /* (non-Javadoc)
123 | * @see affective.core.LexiconEvaluator#evaluateTweet(java.util.List)
124 | */
125 | @Override
126 | public Map evaluateTweet(List tokens) {
127 | Map strengthScores = new HashMap();
128 | double posScore = 0;
129 | double negScore = 0;
130 | for (String w : tokens) {
131 |
132 | if (this.dict.containsKey(w)) {
133 | double value = this.dict.get(w);
134 | if (value > 0) {
135 | posScore += value;
136 | } else {
137 | negScore += value;
138 | }
139 | }
140 |
141 | }
142 | strengthScores.put(name+"-posScore", posScore);
143 | strengthScores.put(name+"-negScore", negScore);
144 |
145 | return strengthScores;
146 | }
147 |
148 | }
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/LabelWordVectorsTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.filters.AbstractFilterTest;
24 | import weka.filters.Filter;
25 |
26 | import junit.framework.Test;
27 | import junit.framework.TestSuite;
28 |
29 | import java.io.File;
30 |
31 | /**
32 | * Tests LabelWordVectors. Run from the command line with:
33 | * java weka.filters.unsupervised.attribute.LabelWordVectorsTest
34 | *
35 | * AffectiveTweets package must either be installed or
36 | * JVM must be started in AffectiveTweets directory.
37 | *
38 | * @author FracPete and eibe
39 | * @version $Revision: 9568 $
40 | */
41 | /**
42 | * @author fbravoma
43 | *
44 | */
45 | public class LabelWordVectorsTest extends AbstractFilterTest {
46 |
47 | public LabelWordVectorsTest(String name) {
48 | super(name);
49 | }
50 |
51 | /** Creates a default LabelWordVectors filter */
52 | public Filter getFilter() {
53 | Filter f = null;
54 |
55 | // Check to see if the test is run from directory containing build_package.xml
56 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
57 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
58 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
59 | f = new LabelWordVectors();
60 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
61 | } else {
62 | f = new LabelWordVectors(); // Hope that the package is installed.
63 | }
64 | return f;
65 | }
66 |
67 | /**
68 | * LabelWordVectors is not suitable for use in a FilteredClassifier, so this just creates a dummy
69 | * FilteredClassifier so that the tests run through.
70 | *
71 | * @return the configured FilteredClassifier
72 | */
73 | protected FilteredClassifier getFilteredClassifier() {
74 | FilteredClassifier result;
75 |
76 | result = new FilteredClassifier();
77 |
78 | result.setFilter(new weka.filters.AllFilter());
79 | result.setClassifier(new weka.classifiers.rules.ZeroR());
80 |
81 | return result;
82 | }
83 |
84 | /**
85 | * Called by JUnit before each test method. Sets up the Instances object to use based on
86 | * one of the datasets that comes with the package.
87 | *
88 | * @throws Exception if an error occurs reading the example instances.
89 | */
90 | protected void setUp() throws Exception {
91 | super.setUp();
92 |
93 | // Check to see if the test is run from directory containing build_package.xml
94 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
95 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
96 | } else { // Hope that package is installed.
97 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
98 | }
99 |
100 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
101 | }
102 |
103 |
104 | /* (non-Javadoc)
105 | * @see weka.filters.AbstractFilterTest#testBuffered()
106 | */
107 | public void testBuffered() {}
108 |
109 | /* (non-Javadoc)
110 | * @see weka.filters.AbstractFilterTest#testRegression()
111 | */
112 | public void testRegression(){}
113 |
114 |
115 | public static Test suite() {
116 | return new TestSuite(LabelWordVectorsTest.class);
117 | }
118 |
119 | public static void main(String[] args){
120 | junit.textui.TestRunner.run(suite());
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/LexiconDistantSupervisionTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.filters.AbstractFilterTest;
24 | import weka.filters.Filter;
25 |
26 | import junit.framework.Test;
27 | import junit.framework.TestSuite;
28 |
29 | import java.io.File;
30 |
31 | /**
32 | * Tests LexiconDistantSupervision. Run from the command line with:
33 | * java weka.filters.unsupervised.attribute.LexiconDistantSupervisionTest
34 | *
35 | * AffectiveTweets package must either be installed or
36 | * JVM must be started in AffectiveTweets directory.
37 | *
38 | * @author FracPete and eibe
39 | * @version $Revision: 9568 $
40 | */
41 | public class LexiconDistantSupervisionTest extends AbstractFilterTest {
42 |
43 | public LexiconDistantSupervisionTest(String name) {
44 | super(name);
45 | }
46 |
47 | /** Creates a default LexiconDistantSupervision filter */
48 | public Filter getFilter() {
49 | Filter f = null;
50 |
51 | // Check to see if the test is run from directory containing build_package.xml
52 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
53 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
54 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
55 | f = new LexiconDistantSupervision();
56 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
57 | } else {
58 | f = new LexiconDistantSupervision(); // Hope that the package is installed.
59 | }
60 | return f;
61 | }
62 |
63 | /**
64 | * LexiconDistantSupervision is not suitable for use in a FilteredClassifier, so this just creates a dummy
65 | * FilteredClassifier so that the tests run through.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | result.setFilter(new weka.filters.AllFilter());
75 | result.setClassifier(new weka.classifiers.rules.ZeroR());
76 |
77 | return result;
78 | }
79 |
80 | /**
81 | * Called by JUnit before each test method. Sets up the Instances object to use based on
82 | * one of the datasets that comes with the package.
83 | *
84 | * @throws Exception if an error occurs reading the example instances.
85 | */
86 | protected void setUp() throws Exception {
87 | super.setUp();
88 |
89 | // Check to see if the test is run from directory containing build_package.xml
90 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
91 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
92 | } else { // Hope that package is installed.
93 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
94 | }
95 |
96 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
97 | }
98 |
99 |
100 | /* (non-Javadoc)
101 | * @see weka.filters.AbstractFilterTest#testBatchFiltering()
102 | */
103 | public void testBatchFiltering(){}
104 |
105 | /* (non-Javadoc)
106 | * @see weka.filters.AbstractFilterTest#testBatchFilteringLarger()
107 | */
108 | public void testBatchFilteringLarger(){}
109 |
110 |
111 | /* (non-Javadoc)
112 | * @see weka.filters.AbstractFilterTest#testBatchFilteringSmaller()
113 | */
114 | public void testBatchFilteringSmaller(){}
115 |
116 |
117 | public static Test suite() {
118 | return new TestSuite(LexiconDistantSupervisionTest.class);
119 | }
120 |
121 | public static void main(String[] args){
122 | junit.textui.TestRunner.run(suite());
123 | }
124 | }
125 |
--------------------------------------------------------------------------------
/src/main/java/weka/core/converters/NRCAffectToArff.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * NRCAffectToArff.java
18 | * Copyright (C) 1999-2018 University of Waikato, Hamilton, New Zealand
19 | *
20 | */
21 |
22 | package weka.core.converters;
23 |
24 | import java.io.BufferedReader;
25 | import java.io.File;
26 | import java.io.FileReader;
27 | import java.util.ArrayList;
28 | import java.util.Arrays;
29 | import java.util.HashMap;
30 | import java.util.Map;
31 |
32 | import weka.core.Attribute;
33 | import weka.core.DenseInstance;
34 | import weka.core.Instance;
35 | import weka.core.Instances;
36 |
37 | /**
38 | * Builds an arff dataset from the NRC Affective Lexicon.
39 | * analysis.
40 | *
41 | * @author Felipe Bravo-Marquez (fbravoma@waikato.ac.nz)
42 | * @version 1.0
43 | */
44 | public class NRCAffectToArff {
45 |
46 | /**
47 | * Creates a Weka Instances object from the lexicon.
48 | * @param collectionPath the file path of the lexicon.
49 | * @return an Instances object
50 | * @throws Exception if a wrong file is used.
51 | */
52 | public Instances createDataset(String collectionPath) throws Exception {
53 |
54 | ArrayList attributes = new ArrayList();
55 |
56 | // The content of the tweet
57 | attributes.add(new Attribute("term", (ArrayList) null));
58 | attributes.add(new Attribute("angerScore"));
59 | attributes.add(new Attribute("fearScore"));
60 | attributes.add(new Attribute("sadnessScore"));
61 | attributes.add(new Attribute("joyScore"));
62 |
63 |
64 | Instances dataset = new Instances(
65 | "The NRC Affect Intensity Lexicon v0.5. More info at:www.saifmohammad.com/WebPages/AffectIntensity.htm", attributes, 0);
66 |
67 | Map> mapper=new HashMap>();
68 |
69 | BufferedReader bf = new BufferedReader(new FileReader(collectionPath));
70 | String line;
71 | while ((line = bf.readLine()) != null) {
72 | String parts[] = line.split("\t");
73 | String term=parts[0];
74 | Double score= Double.parseDouble(parts[1]);
75 | String affectDim=parts[2];
76 | // System.out.println(term+" "+score+" "+affectDim);
77 |
78 | if(!mapper.containsKey(term)){
79 | Map scoreVals=new HashMap();
80 | scoreVals.put(affectDim, score);
81 | mapper.put(term,scoreVals);
82 | }
83 | else{
84 | Map scoreVals=mapper.get(term);
85 | scoreVals.put(affectDim, score);
86 | }
87 |
88 | }
89 |
90 |
91 | String[] sortedWords=mapper.keySet().toArray(new String[0]);
92 | Arrays.sort(sortedWords);
93 |
94 | for(String word:sortedWords){
95 | Map scoreVals=mapper.get(word);
96 | double angerScore= scoreVals.containsKey("anger")?scoreVals.get("anger"):weka.core.Utils.missingValue();
97 | double fearScore= scoreVals.containsKey("fear")?scoreVals.get("fear"):weka.core.Utils.missingValue();
98 | double sadnessScore= scoreVals.containsKey("sadness")?scoreVals.get("sadness"):weka.core.Utils.missingValue();
99 | double joyScore= scoreVals.containsKey("joy")?scoreVals.get("joy"):weka.core.Utils.missingValue();
100 |
101 | double values[] = new double[5];
102 | values[0]=dataset.attribute(0).addStringValue(word);
103 | values[1]=angerScore;
104 | values[2]=fearScore;
105 | values[3]=sadnessScore;
106 | values[4]=joyScore;
107 |
108 | Instance inst = new DenseInstance(1, values);
109 | dataset.add(inst);
110 |
111 | }
112 |
113 |
114 |
115 | bf.close();
116 |
117 | return dataset;
118 | }
119 |
120 |
121 | /**
122 | * Main method for testing this class.
123 | *
124 | * should contain the path of input dataset and the name of
125 | * target file scheme (see Evaluation)
126 | * @param args arguments
127 | */
128 | static public void main(String args[]) {
129 |
130 | if (args.length == 2) {
131 |
132 | NRCAffectToArff na = new NRCAffectToArff();
133 |
134 | try {
135 | Instances dataset = na.createDataset(args[0]);
136 | ArffSaver saver = new ArffSaver();
137 | saver.setInstances(dataset);
138 |
139 | saver.setFile(new File(args[1]));
140 | saver.writeBatch();
141 |
142 | } catch (Exception e) {
143 | // TODO Auto-generated catch block
144 | e.printStackTrace();
145 | }
146 |
147 | }
148 |
149 | }
150 |
151 |
152 |
153 | }
154 |
--------------------------------------------------------------------------------
/doc/test/package-tree.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | test Class Hierarchy
7 |
8 |
9 |
10 |
11 |
12 |
22 |
23 | JavaScript is disabled on your browser.
24 |
25 |
26 |
43 |
70 |
71 |
78 |
79 |
Class Hierarchy
80 |
81 | java.lang.Object
82 |
83 | test.AffectiveTestRunner
84 | junit.framework.Assert
85 |
86 | junit.framework.TestCase (implements junit.framework.Test)
87 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
115 |
142 |
143 |
144 |
145 |
--------------------------------------------------------------------------------
/doc/test/package-summary.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | test
7 |
8 |
9 |
10 |
11 |
12 |
22 |
23 | JavaScript is disabled on your browser.
24 |
25 |
26 |
43 |
70 |
71 |
74 |
97 |
98 |
115 |
142 |
143 |
144 |
145 |
--------------------------------------------------------------------------------
/doc/overview-summary.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Overview
7 |
8 |
9 |
10 |
11 |
12 |
22 |
23 | JavaScript is disabled on your browser.
24 |
25 |
26 |
43 |
70 |
71 |
102 |
103 |
120 |
121 |
122 | Prev
123 | Next
124 |
125 |
129 |
132 |
133 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
--------------------------------------------------------------------------------
/doc/allclasses-noframe.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | All Classes
7 |
8 |
9 |
10 |
11 |
12 | All Classes
13 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/doc/constant-values.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Constant Field Values
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
17 |
18 |
19 |
20 |
21 |
31 |
32 | JavaScript is disabled on your browser.
33 |
34 |
35 |
36 |
53 |
54 |
58 |
62 |
65 |
71 |
72 |
82 |
83 | JavaScript is disabled on your browser.
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
96 |
100 |
101 |
118 |
119 |
120 | Prev
121 | Next
122 |
123 |
127 |
130 |
131 |
141 |
142 | JavaScript is disabled on your browser.
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
--------------------------------------------------------------------------------
/benchmark/nltk_scikit_ngram_liu.py:
--------------------------------------------------------------------------------
1 | # This program is free software: you can redistribute it and/or modify
2 | # it under the terms of the GNU General Public License as published by
3 | # the Free Software Foundation, either version 3 of the License, or
4 | # (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful,
7 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 | # GNU General Public License for more details.
10 | #
11 | # You should have received a copy of the GNU General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | # Authors: Felipe Bravo-Marquez
15 |
16 |
17 | import pandas as pd
18 | from nltk.tokenize import TweetTokenizer
19 | from nltk.sentiment.util import mark_negation
20 | from nltk.corpus import opinion_lexicon
21 |
22 | from sklearn.feature_extraction.text import CountVectorizer
23 | from sklearn.linear_model import LogisticRegression
24 | from sklearn.pipeline import Pipeline, FeatureUnion
25 | from sklearn.base import BaseEstimator, TransformerMixin
26 | from sklearn.metrics import confusion_matrix, cohen_kappa_score, classification_report
27 | import numpy as np
28 |
29 |
#####################################################################################
#
# Train a linear model using n-grams features + features derived from Bing Liu's lexicon
#
######################################################################################
# The opinion lexicon has to be available to NLTK; if it is not, download it once with:
#import nltk
#nltk.download('opinion_lexicon')


# load training and testing datasets as a pandas dataframe
# (only columns 2 and 3 of the tab-separated files are kept: the label and the tweet text)
train_data = pd.read_csv("dataset/twitter-train-B.txt", header=None, delimiter="\t",usecols=(2,3), names=("sent","tweet"))
test_data = pd.read_csv("dataset/twitter-test-gold-B.tsv", header=None, delimiter="\t",usecols=(2,3), names=("sent","tweet"))

# replace objective-OR-neutral and objective to neutral (training labels only)
train_data.sent = train_data.sent.replace(['objective-OR-neutral','objective'],['neutral','neutral'])

# use a Twitter-specific tokenizer
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
61 |
62 |
63 |
64 |
class LiuFeatureExtractor(BaseEstimator, TransformerMixin):
    """Takes in a corpus of tweets and calculates features using Bing Liu's lexicon"""

    def __init__(self, tokenizer):
        # tokenizer: object exposing a tokenize(str) method
        self.tokenizer = tokenizer
        # sets give constant-time membership tests in liu_score
        self.pos_set = set(opinion_lexicon.positive())
        self.neg_set = set(opinion_lexicon.negative())

    def liu_score(self, sentence):
        """Calculates the number of positive and negative words in the sentence using Bing Liu's Lexicon.

        Returns a two-element list [pos_words, neg_words]. A token found in the
        positive set is counted as positive only, even if it is also in the
        negative set.
        """
        tokenized_sent = self.tokenizer.tokenize(sentence)
        pos_words = 0
        neg_words = 0
        for word in tokenized_sent:
            if word in self.pos_set:
                pos_words += 1
            elif word in self.neg_set:
                neg_words += 1
        return [pos_words, neg_words]

    def transform(self, X, y=None):
        """Applies liu_score to every tweet in X.

        Returns an integer array of shape (len(X), 2) holding the positive and
        negative word counts of each tweet. The reshape keeps the result
        two-dimensional even when X is empty.
        """
        values = [self.liu_score(tweet) for tweet in X]
        return np.array(values, dtype=int).reshape(-1, 2)

    def fit(self, X, y=None):
        """This function must return `self` unless we expect the transform function to perform a
        different action on training and testing partitions (e.g., when we calculate unigram features,
        the dictionary is only extracted from the first batch)"""
        return self
98 |
99 |
100 |
101 |
102 |
# Feature extractors: lexicon word counts and 1- to 4-gram counts over the tweet tokens
liu_feat = LiuFeatureExtractor(tokenizer)
vectorizer = CountVectorizer(
    tokenizer=tokenizer.tokenize,
    preprocessor=mark_negation,
    ngram_range=(1, 4),
)

# One-vs-rest logistic regression on top of the concatenated features
log_mod = LogisticRegression(solver='liblinear', multi_class='ovr')
liu_ngram_clf = Pipeline([
    ('feats', FeatureUnion([
        ('ngram', vectorizer),
        ('liu', liu_feat),
    ])),
    ('clf', log_mod),
])

# Train on the training tweets and predict the labels of the test tweets
liu_ngram_clf.fit(train_data.tweet, train_data.sent)
pred_liu_ngram = liu_ngram_clf.predict(test_data.tweet)

# Evaluate the predictions against the gold labels
conf_liu_ngram = confusion_matrix(test_data.sent, pred_liu_ngram)
kappa_liu_ngram = cohen_kappa_score(test_data.sent, pred_liu_ngram)
class_rep_liu_ngram = classification_report(test_data.sent, pred_liu_ngram)

# Report
print('Confusion Matrix for Logistic Regression + ngrams + features from Bing Liu\'s Lexicon')
print(conf_liu_ngram)
print('Classification Report')
print(class_rep_liu_ngram)
print('kappa:' + str(kappa_liu_ngram))
124 |
125 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/TweetNLPPOSTaggerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.core.Instances;
24 | import weka.filters.AbstractFilterTest;
25 | import weka.filters.Filter;
26 |
27 | import junit.framework.Test;
28 | import junit.framework.TestSuite;
29 |
30 | import java.io.File;
31 |
32 | /**
33 | * Tests TweetNLPPOSTagger. Run from the command line with:
34 | * java weka.filters.unsupervised.attribute.TweetNLPPOSTaggerTest
35 | *
36 | * AffectiveTweets package must either be installed or
37 | * JVM must be started in AffectiveTweets directory.
38 | *
39 | * @author FracPete and eibe
40 | * @version $Revision: 9568 $
41 | */
42 | public class TweetNLPPOSTaggerTest extends AbstractFilterTest {
43 |
44 | public TweetNLPPOSTaggerTest(String name) {
45 | super(name);
46 | }
47 |
48 | /** Creates a default TweetNLPPOSTagger filter */
49 | public Filter getFilter() {
50 | Filter f = null;
51 |
52 | // Check to see if the test is run from directory containing build_package.xml
53 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
54 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
55 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
56 | f = new TweetToSparseFeatureVector();
57 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
58 | } else {
59 | f = new TweetNLPPOSTagger(); // Hope that the package is installed.
60 | }
61 | return f;
62 | }
63 |
64 | /**
65 | * Test for the FilteredClassifier used with this filter.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | weka.filters.MultiFilter mf = new weka.filters.MultiFilter();
75 | Filter[] filters = new Filter[2];
76 | filters[0] = getFilter();
77 | weka.filters.unsupervised.attribute.RemoveType rt = new weka.filters.unsupervised.attribute.RemoveType(); // Need to remove string attributes because they are kept by this filter.
78 | filters[1] = rt;
79 | mf.setFilters(filters);
80 | result.setFilter(mf);
81 | result.setClassifier(new weka.classifiers.functions.SMO());
82 |
83 | return result;
84 | }
85 |
86 | /**
87 | * Data to be used for FilteredClassifier test.
88 | *
89 | * @return the configured FilteredClassifier
90 | */
91 | protected Instances getFilteredClassifierData() throws Exception {
92 | Instances result;
93 |
94 | // Check to see if the test is run from directory containing build_package.xml
95 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
96 | result = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
97 | } else { // Hope that package is installed.
98 | result = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
99 | }
100 |
101 | result.setClassIndex(result.numAttributes() - 1);
102 |
103 | return result;
104 | }
105 |
106 | /**
107 | * Called by JUnit before each test method. Sets up the Instances object to use based on
108 | * one of the datasets that comes with the package.
109 | *
110 | * @throws Exception if an error occurs reading the example instances.
111 | */
112 | protected void setUp() throws Exception {
113 | super.setUp();
114 |
115 | // Check to see if the test is run from directory containing build_package.xml
116 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
117 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
118 | } else { // Hope that package is installed.
119 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
120 | }
121 |
122 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
123 | }
124 |
125 | public static Test suite() {
126 | return new TestSuite(TweetNLPPOSTaggerTest.class);
127 | }
128 |
129 | public static void main(String[] args){
130 | junit.textui.TestRunner.run(suite());
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/TweetToSparseFeatureVectorTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.core.Instances;
24 | import weka.filters.AbstractFilterTest;
25 | import weka.filters.Filter;
26 |
27 | import junit.framework.Test;
28 | import junit.framework.TestSuite;
29 |
30 | import java.io.File;
31 |
32 | /**
33 | * Tests TweetToSparseFeatureVector. Run from the command line with:
34 | * java weka.filters.unsupervised.attribute.TweetToSparseFeatureVectorTest
35 | *
36 | * AffectiveTweets package must either be installed or
37 | * JVM must be started in AffectiveTweets directory.
38 | *
39 | * @author FracPete and eibe
40 | * @version $Revision: 9568 $
41 | */
42 | public class TweetToSparseFeatureVectorTest extends AbstractFilterTest {
43 |
44 | public TweetToSparseFeatureVectorTest(String name) {
45 | super(name);
46 | }
47 |
48 | /** Creates a default TweetToSparseFeatureVector filter */
49 | public Filter getFilter() {
50 | Filter f = null;
51 |
52 | // Check to see if the test is run from directory containing build_package.xml
53 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
54 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
55 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
56 | f = new TweetToSparseFeatureVector();
57 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
58 | } else {
59 | f = new TweetToSparseFeatureVector(); // Hope that the package is installed.
60 | }
61 | return f;
62 | }
63 |
64 | /**
65 | * Test for the FilteredClassifier used with this filter.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | weka.filters.MultiFilter mf = new weka.filters.MultiFilter();
75 | Filter[] filters = new Filter[2];
76 | filters[0] = getFilter();
77 | weka.filters.unsupervised.attribute.RemoveType rt = new weka.filters.unsupervised.attribute.RemoveType(); // Need to remove string attributes because they are kept by this filter.
78 | filters[1] = rt;
79 | mf.setFilters(filters);
80 | result.setFilter(mf);
81 | result.setClassifier(new weka.classifiers.functions.SMO());
82 |
83 | return result;
84 | }
85 |
86 | /**
87 | * Data to be used for FilteredClassifier test.
88 | *
89 | * @return the configured FilteredClassifier
90 | */
91 | protected Instances getFilteredClassifierData() throws Exception {
92 | Instances result;
93 |
94 | // Check to see if the test is run from directory containing build_package.xml
95 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
96 | result = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
97 | } else { // Hope that package is installed.
98 | result = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
99 | }
100 |
101 | result.setClassIndex(result.numAttributes() - 1);
102 |
103 | return result;
104 | }
105 |
106 | /**
107 | * Called by JUnit before each test method. Sets up the Instances object to use based on
108 | * one of the datasets that comes with the package.
109 | *
110 | * @throws Exception if an error occurs reading the example instances.
111 | */
112 | protected void setUp() throws Exception {
113 | super.setUp();
114 |
115 | // Check to see if the test is run from directory containing build_package.xml
116 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
117 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
118 | } else { // Hope that package is installed.
119 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
120 | }
121 |
122 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
123 | }
124 |
125 | public static Test suite() {
126 | return new TestSuite(TweetToSparseFeatureVectorTest.class);
127 | }
128 |
129 | public static void main(String[] args){
130 | junit.textui.TestRunner.run(suite());
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/supervised/attribute/PMILexiconExpanderTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.supervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.core.Instances;
24 | import weka.filters.AbstractFilterTest;
25 | import weka.filters.Filter;
26 |
27 | import junit.framework.Test;
28 | import junit.framework.TestSuite;
29 |
30 | import java.io.File;
31 |
32 | /**
33 | * Tests PMILexiconExpander. Run from the command line with:
34 | * java weka.filters.unsupervised.attribute.PMILexiconExpanderTest
35 | *
36 | * AffectiveTweets package must either be installed or
37 | * JVM must be started in AffectiveTweets directory.
38 | *
39 | * @author FracPete and eibe
40 | * @version $Revision: 9568 $
41 | */
42 | public class PMILexiconExpanderTest extends AbstractFilterTest {
43 |
44 | public PMILexiconExpanderTest(String name) {
45 | super(name);
46 | }
47 |
48 | /** Creates a default PMILexiconExpander filter */
49 | public Filter getFilter() {
50 | Filter f = null;
51 |
52 | // Check to see if the test is run from directory containing build_package.xml
53 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
54 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
55 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
56 | f = new PMILexiconExpander();
57 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
58 | } else {
59 | f = new PMILexiconExpander(); // Hope that the package is installed.
60 | }
61 | return f;
62 | }
63 |
64 | /**
65 | * Test for the FilteredClassifier used with this filter.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | weka.filters.MultiFilter mf = new weka.filters.MultiFilter();
75 | Filter[] filters = new Filter[2];
76 | filters[0] = getFilter();
77 | weka.filters.unsupervised.attribute.RemoveType rt = new weka.filters.unsupervised.attribute.RemoveType(); // Need to remove string attributes because they are kept by this filter.
78 | filters[1] = rt;
79 | mf.setFilters(filters);
80 | result.setFilter(mf);
81 | result.setClassifier(new weka.classifiers.functions.SMO());
82 |
83 | return result;
84 | }
85 |
86 | /**
87 | * Data to be used for FilteredClassifier test.
88 | *
89 | * @return the configured FilteredClassifier
90 | */
91 | protected Instances getFilteredClassifierData() throws Exception {
92 | Instances result;
93 |
94 | // Check to see if the test is run from directory containing build_package.xml
95 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
96 | result = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
97 | } else { // Hope that package is installed.
98 | result = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
99 | }
100 |
101 | result.setClassIndex(result.numAttributes() - 1);
102 |
103 | return result;
104 | }
105 |
106 | /**
107 | * Called by JUnit before each test method. Sets up the Instances object to use based on
108 | * one of the datasets that comes with the package.
109 | *
110 | * @throws Exception if an error occurs reading the example instances.
111 | */
112 | protected void setUp() throws Exception {
113 | super.setUp();
114 |
115 | // Check to see if the test is run from directory containing build_package.xml
116 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
117 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
118 | } else { // Hope that package is installed.
119 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
120 | }
121 |
122 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
123 | }
124 |
125 | public void testFilteredClassifier() {
126 |
127 | }
128 |
129 | public static Test suite() {
130 | return new TestSuite(PMILexiconExpanderTest.class);
131 | }
132 |
133 | public static void main(String[] args){
134 | junit.textui.TestRunner.run(suite());
135 | }
136 | }
137 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/TweetToEmbeddingsFeatureVectorTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.core.Instances;
24 | import weka.filters.AbstractFilterTest;
25 | import weka.filters.Filter;
26 |
27 | import junit.framework.Test;
28 | import junit.framework.TestSuite;
29 |
30 | import java.io.File;
31 |
32 | /**
33 | * Tests TweetToEmbeddingsFeatureVector. Run from the command line with:
34 | * java weka.filters.unsupervised.attribute.TweetToEmbeddingsFeatureVectorTest
35 | *
36 | * AffectiveTweets package must either be installed or
37 | * JVM must be started in AffectiveTweets directory.
38 | *
39 | * @author FracPete and eibe
40 | * @version $Revision: 9568 $
41 | */
42 | public class TweetToEmbeddingsFeatureVectorTest extends AbstractFilterTest {
43 |
44 | public TweetToEmbeddingsFeatureVectorTest(String name) {
45 | super(name);
46 | }
47 |
48 | /** Creates a default TweetToEmbeddingsFeatureVector filter */
49 | public Filter getFilter() {
50 | Filter f = null;
51 |
52 | // Check to see if the test is run from directory containing build_package.xml
53 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
54 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
55 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
56 | f = new TweetToEmbeddingsFeatureVector();
57 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
58 | } else {
59 | f = new TweetToEmbeddingsFeatureVector(); // Hope that the package is installed.
60 | }
61 | return f;
62 | }
63 |
64 | /**
65 | * Test for the FilteredClassifier used with this filter.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | weka.filters.MultiFilter mf = new weka.filters.MultiFilter();
75 | Filter[] filters = new Filter[2];
76 | filters[0] = getFilter();
77 | weka.filters.unsupervised.attribute.RemoveType rt = new weka.filters.unsupervised.attribute.RemoveType(); // Need to remove string attributes because they are kept by this filter.
78 | filters[1] = rt;
79 | mf.setFilters(filters);
80 | result.setFilter(mf);
81 | result.setClassifier(new weka.classifiers.functions.SMO());
82 |
83 | return result;
84 | }
85 |
86 | /**
87 | * Data to be used for FilteredClassifier test.
88 | *
89 | * @return the configured FilteredClassifier
90 | */
91 | protected Instances getFilteredClassifierData() throws Exception {
92 | Instances result;
93 |
94 | // Check to see if the test is run from directory containing build_package.xml
95 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
96 | result = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
97 | } else { // Hope that package is installed.
98 | result = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
99 | }
100 |
101 | result.setClassIndex(result.numAttributes() - 1);
102 |
103 | return result;
104 | }
105 |
106 | /**
107 | * Called by JUnit before each test method. Sets up the Instances object to use based on
108 | * one of the datasets that comes with the package.
109 | *
110 | * @throws Exception if an error occurs reading the example instances.
111 | */
112 | protected void setUp() throws Exception {
113 | super.setUp();
114 |
115 | // Check to see if the test is run from directory containing build_package.xml
116 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
117 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
118 | } else { // Hope that package is installed.
119 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
120 | }
121 |
122 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
123 | }
124 |
125 | public static Test suite() {
126 | return new TestSuite(TweetToEmbeddingsFeatureVectorTest.class);
127 | }
128 |
129 | public static void main(String[] args){
130 | junit.textui.TestRunner.run(suite());
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/benchmark/nltk_scikit_liu_vader.py:
--------------------------------------------------------------------------------
1 | # This program is free software: you can redistribute it and/or modify
2 | # it under the terms of the GNU General Public License as published by
3 | # the Free Software Foundation, either version 3 of the License, or
4 | # (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful,
7 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 | # GNU General Public License for more details.
10 | #
11 | # You should have received a copy of the GNU General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | # Authors: Felipe Bravo-Marquez
15 |
16 |
17 | import pandas as pd
18 | from nltk.tokenize import TweetTokenizer
19 | from nltk.sentiment import SentimentIntensityAnalyzer
20 | from nltk.corpus import opinion_lexicon
21 |
22 | from sklearn.linear_model import LogisticRegression
23 | from sklearn.pipeline import Pipeline, FeatureUnion
24 | from sklearn.base import BaseEstimator, TransformerMixin
25 | from sklearn.metrics import confusion_matrix, cohen_kappa_score, classification_report
26 | import numpy as np
27 |
28 |
29 |
30 |
# Load the training and testing datasets as pandas dataframes. Both files are read as
# tab-separated text without a header row; only columns 2 and 3 are kept and named
# "sent" (label) and "tweet" (text).
_TSV_OPTIONS = dict(header=None, delimiter="\t", usecols=(2, 3), names=("sent", "tweet"))
train_data = pd.read_csv("dataset/twitter-train-B.txt", **_TSV_OPTIONS)
test_data = pd.read_csv("dataset/twitter-test-gold-B.tsv", **_TSV_OPTIONS)

# Map the labels 'objective-OR-neutral' and 'objective' to 'neutral' (training data only).
train_data["sent"] = train_data["sent"].replace(
    {'objective-OR-neutral': 'neutral', 'objective': 'neutral'}
)

# Use a Twitter-specific tokenizer.
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
40 |
41 |
42 |
43 | #####################################################################################
44 | #
45 | # Train a linear model using features from Bing Liu's lexicon + the Vader method
46 | #
47 | ######################################################################################
48 | #nltk.download('vader_lexicon')
49 |
50 |
51 |
52 |
class LiuFeatureExtractor(BaseEstimator, TransformerMixin):
    """Takes in a corpus of tweets and calculates features using Bing Liu's lexicon.

    Each tweet is turned into two features: the number of tokens found in the positive
    word list and the number of tokens found in the negative word list.
    """

    def __init__(self, tokenizer):
        """Stores the tokenizer and loads both word lists of the opinion lexicon.

        :param tokenizer: object whose ``tokenize(sentence)`` method returns the tokens of a tweet
        """
        self.tokenizer = tokenizer
        # Sets give constant-time membership tests in liu_score.
        self.pos_set = set(opinion_lexicon.positive())
        self.neg_set = set(opinion_lexicon.negative())

    def liu_score(self, sentence):
        """Calculates the number of positive and negative words in the sentence using Bing Liu's Lexicon.

        :param sentence: text of one tweet
        :return: list ``[pos_words, neg_words]``
        """
        pos_words = 0
        neg_words = 0
        for word in self.tokenizer.tokenize(sentence):
            if word in self.pos_set:
                pos_words += 1
            elif word in self.neg_set:
                # Only reached when the word is not in the positive set, so a word present
                # in both sets is counted as positive only.
                neg_words += 1
        return [pos_words, neg_words]

    def transform(self, X, y=None):
        """Applies liu_score to every tweet in X.

        :param X: iterable of tweet texts
        :param y: ignored
        :return: array with one row per tweet and two columns (positive count, negative count)
        """
        values = [self.liu_score(tweet) for tweet in X]
        # reshape keeps the result two-dimensional (0 rows, 2 columns) when X is empty.
        return np.array(values).reshape(-1, 2)

    def fit(self, X, y=None):
        """This function must return `self` unless we expect the transform function to perform a
        different action on training and testing partitions (e.g., when we calculate unigram features,
        the dictionary is only extracted from the first batch)"""
        return self
86 |
87 |
88 |
89 |
class VaderFeatureExtractor(BaseEstimator, TransformerMixin):
    """Takes in a corpus of tweets and calculates features using the Vader method"""

    # Keys read from the polarity_scores result, in the order used for the feature columns.
    _SCORE_KEYS = ("neg", "neu", "pos", "compound")

    def __init__(self, tokenizer):
        """Stores the tokenizer and creates the Vader analyzer.

        :param tokenizer: kept as an attribute; the methods of this class do not use it
        """
        self.tokenizer = tokenizer
        self.sid = SentimentIntensityAnalyzer()

    def vader_score(self, sentence):
        """Calculates sentiment scores for a sentence using the Vader method.

        :param sentence: text of one tweet
        :return: list with the scores for the keys in ``_SCORE_KEYS``, in that order
        """
        pol_scores = self.sid.polarity_scores(sentence)
        # Read the scores by key so the column order does not depend on dict ordering.
        return [pol_scores[key] for key in self._SCORE_KEYS]

    def transform(self, X, y=None):
        """Applies vader_score to every tweet in X.

        :param X: iterable of tweet texts
        :param y: ignored
        :return: array with one row per tweet and one column per key in ``_SCORE_KEYS``
        """
        values = [self.vader_score(tweet) for tweet in X]
        # reshape keeps the result two-dimensional when X is empty.
        return np.array(values).reshape(-1, len(self._SCORE_KEYS))

    def fit(self, X, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
114 |
115 |
116 |
117 |
118 |
# Feature extractors: Vader scores and Bing Liu lexicon counts, sharing one tokenizer.
vader_feat = VaderFeatureExtractor(tokenizer)
liu_feat = LiuFeatureExtractor(tokenizer)

# Both feature blocks are combined and fed to a logistic regression classifier.
log_mod = LogisticRegression(solver='liblinear', multi_class='ovr')
feature_union = FeatureUnion([('vader', vader_feat), ('liu', liu_feat)])
vader_liu_clf = Pipeline([('feats', feature_union), ('clf', log_mod)])

# Train on the training tweets, then predict the labels of the test tweets.
vader_liu_clf.fit(train_data.tweet, train_data.sent)
pred_vader_liu = vader_liu_clf.predict(test_data.tweet)

# Evaluate the predictions against the test labels.
gold_labels = test_data.sent
conf_vader_liu = confusion_matrix(gold_labels, pred_vader_liu)
kappa_vader_liu = cohen_kappa_score(gold_labels, pred_vader_liu)
class_rep_vader_liu = classification_report(gold_labels, pred_vader_liu)

print('Confusion Matrix for Logistic Regression + Vader + features from Bing Liu\'s Lexicon')
print(conf_vader_liu)
print('Classification Report')
print(class_rep_vader_liu)
print('kappa:' + str(kappa_vader_liu))
141 |
142 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/TweetToSentiStrengthFeatureVectorTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.core.Instances;
24 | import weka.filters.AbstractFilterTest;
25 | import weka.filters.Filter;
26 |
27 | import junit.framework.Test;
28 | import junit.framework.TestSuite;
29 |
30 | import java.io.File;
31 |
32 | /**
33 | * Tests TweetToSentiStrengthFeatureVector. Run from the command line with:
34 | * java weka.filters.unsupervised.attribute.TweetToWordCountFeatureVectorTest
35 | *
36 | * AffectiveTweets package must either be installed or
37 | * JVM must be started in AffectiveTweets directory.
38 | *
39 | * @author FracPete and eibe
40 | * @version $Revision: 9568 $
41 | */
42 | public class TweetToSentiStrengthFeatureVectorTest extends AbstractFilterTest {
43 |
44 | public TweetToSentiStrengthFeatureVectorTest(String name) {
45 | super(name);
46 | }
47 |
48 | /** Creates a default TweetToSentiStrengthFeatureVector filter */
49 | public Filter getFilter() {
50 | Filter f = null;
51 |
52 | // Check to see if the test is run from directory containing build_package.xml
53 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
54 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
55 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
56 | f = new TweetToSentiStrengthFeatureVector();
57 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
58 | } else {
59 | f = new TweetToSentiStrengthFeatureVector(); // Hope that the package is installed.
60 | }
61 | return f;
62 | }
63 |
64 | /**
65 | * Test for the FilteredClassifier used with this filter.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | weka.filters.MultiFilter mf = new weka.filters.MultiFilter();
75 | Filter[] filters = new Filter[2];
76 | filters[0] = getFilter();
77 | weka.filters.unsupervised.attribute.RemoveType rt = new weka.filters.unsupervised.attribute.RemoveType(); // Need to remove string attributes because they are kept by this filter.
78 | filters[1] = rt;
79 | mf.setFilters(filters);
80 | result.setFilter(mf);
81 | result.setClassifier(new weka.classifiers.functions.SMO());
82 |
83 | return result;
84 | }
85 |
86 | /**
87 | * Data to be used for FilteredClassifier test.
88 | *
89 | * @return the configured FilteredClassifier
90 | */
91 | protected Instances getFilteredClassifierData() throws Exception {
92 | Instances result;
93 |
94 | // Check to see if the test is run from directory containing build_package.xml
95 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
96 | result = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
97 | } else { // Hope that package is installed.
98 | result = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
99 | }
100 |
101 | result.setClassIndex(result.numAttributes() - 1);
102 |
103 | return result;
104 | }
105 |
106 | /**
107 | * Called by JUnit before each test method. Sets up the Instances object to use based on
108 | * one of the datasets that comes with the package.
109 | *
110 | * @throws Exception if an error occurs reading the example instances.
111 | */
112 | protected void setUp() throws Exception {
113 | super.setUp();
114 |
115 | // Check to see if the test is run from directory containing build_package.xml
116 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
117 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
118 | } else { // Hope that package is installed.
119 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
120 | }
121 |
122 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
123 | }
124 |
125 | public static Test suite() {
126 | return new TestSuite(TweetToSentiStrengthFeatureVectorTest.class);
127 | }
128 |
129 | public static void main(String[] args){
130 | junit.textui.TestRunner.run(suite());
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/TweetToWordListCountFeatureVectorTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.core.Instances;
24 | import weka.filters.AbstractFilterTest;
25 | import weka.filters.Filter;
26 |
27 | import junit.framework.Test;
28 | import junit.framework.TestSuite;
29 |
30 | import java.io.File;
31 |
32 | /**
33 | * Tests TweetToWordListCountFeatureVectorTest. Run from the command line with:
34 | * java weka.filters.unsupervised.attribute.TweetToWordListCountFeatureVectorTest
35 | *
36 | * AffectiveTweets package must either be installed or
37 | * JVM must be started in AffectiveTweets directory.
38 | *
39 | * @author FracPete and eibe
40 | * @version $Revision: 9568 $
41 | */
42 | public class TweetToWordListCountFeatureVectorTest extends AbstractFilterTest {
43 |
44 | public TweetToWordListCountFeatureVectorTest(String name) {
45 | super(name);
46 | }
47 |
48 | /** Creates a default TweetToSentiStrengthFeatureVector filter */
49 | public Filter getFilter() {
50 | Filter f = null;
51 |
52 | // Check to see if the test is run from directory containing build_package.xml
53 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
54 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
55 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
56 | f = new TweetToWordListCountFeatureVector();
57 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
58 | } else {
59 | f = new TweetToWordListCountFeatureVector(); // Hope that the package is installed.
60 | }
61 | return f;
62 | }
63 |
64 | /**
65 | * Test for the FilteredClassifier used with this filter.
66 | *
67 | * @return the configured FilteredClassifier
68 | */
69 | protected FilteredClassifier getFilteredClassifier() {
70 | FilteredClassifier result;
71 |
72 | result = new FilteredClassifier();
73 |
74 | weka.filters.MultiFilter mf = new weka.filters.MultiFilter();
75 | Filter[] filters = new Filter[2];
76 | filters[0] = getFilter();
77 | weka.filters.unsupervised.attribute.RemoveType rt = new weka.filters.unsupervised.attribute.RemoveType(); // Need to remove string attributes because they are kept by this filter.
78 | filters[1] = rt;
79 | mf.setFilters(filters);
80 | result.setFilter(mf);
81 | result.setClassifier(new weka.classifiers.functions.SMO());
82 |
83 | return result;
84 | }
85 |
86 | /**
87 | * Data to be used for FilteredClassifier test.
88 | *
89 | * @return the configured FilteredClassifier
90 | */
91 | protected Instances getFilteredClassifierData() throws Exception {
92 | Instances result;
93 |
94 | // Check to see if the test is run from directory containing build_package.xml
95 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
96 | result = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
97 | } else { // Hope that package is installed.
98 | result = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
99 | }
100 |
101 | result.setClassIndex(result.numAttributes() - 1);
102 |
103 | return result;
104 | }
105 |
106 | /**
107 | * Called by JUnit before each test method. Sets up the Instances object to use based on
108 | * one of the datasets that comes with the package.
109 | *
110 | * @throws Exception if an error occurs reading the example instances.
111 | */
112 | protected void setUp() throws Exception {
113 | super.setUp();
114 |
115 | // Check to see if the test is run from directory containing build_package.xml
116 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
117 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
118 | } else { // Hope that package is installed.
119 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
120 | }
121 |
122 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
123 | }
124 |
125 | public static Test suite() {
126 | return new TestSuite(TweetToWordListCountFeatureVectorTest.class);
127 | }
128 |
129 | public static void main(String[] args){
130 | junit.textui.TestRunner.run(suite());
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/src/main/java/affective/core/CSVEmbeddingHandler.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * CSVEmbeddingHandler.java
18 | * Copyright (C) 1999-2018 University of Waikato, Hamilton, New Zealand
19 | *
20 | */
21 |
22 |
23 |
24 | package affective.core;
25 |
26 | import it.unimi.dsi.fastutil.doubles.AbstractDoubleList;
27 | import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
28 |
29 | import java.io.BufferedReader;
30 | import java.io.File;
31 | import java.io.FileInputStream;
32 | import java.io.InputStreamReader;
33 | import java.util.zip.GZIPInputStream;
34 |
35 | import weka.core.OptionMetadata;
36 | import weka.core.SingleIndex;
37 | import weka.core.WekaPackageManager;
38 |
39 |
40 | /**
41 | *
42 | * This class is used for handling word vector or embeddings stored in gzipped files.
43 | *
44 | *
45 | *
46 | *
47 | * @author Felipe Bravo-Marquez (fbravoma@waikato.ac.nz)
48 | * @version $Revision: 1 $
49 | */
50 | public class CSVEmbeddingHandler extends EmbeddingHandler {
51 |
52 | /** For serialization **/
53 | private static final long serialVersionUID = -2458037798910799631L;
54 |
55 | /** Default path to where resources are stored. */
56 | public static String RESOURCES_FOLDER_NAME = WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "AffectiveTweets" + File.separator + "resources";
57 |
58 |
59 | /** Embedding File Name. **/
60 | protected File embeddingsFile=new File(RESOURCES_FOLDER_NAME + File.separator + "w2v.twitter.edinburgh.100d.csv.gz");
61 |
62 |
63 | /** The separator String **/
64 | protected String separator="TAB";
65 |
66 |
67 | /** the index of the string attribute to be processed */
68 | protected SingleIndex wordNameIndex = new SingleIndex("last");
69 |
70 |
71 |
72 |
73 | /**
74 | * Returns a string describing this filter.
75 | *
76 | * @return a description of the filter suitable for displaying in the
77 | * explorer/experimenter gui
78 | */
79 | public String globalInfo() {
80 | return "This object handles word embeddings in csv.gz format. \n";
81 | }
82 |
83 |
84 |
85 |
86 |
87 | /* (non-Javadoc)
88 | * @see affective.core.EmbeddingHandler#createDict()
89 | */
90 | public void createDict() throws Exception {
91 |
92 | FileInputStream fin = new FileInputStream(this.embeddingsFile);
93 | GZIPInputStream gzis = new GZIPInputStream(fin);
94 | InputStreamReader xover = new InputStreamReader(gzis);
95 | BufferedReader bf = new BufferedReader(xover);
96 |
97 | this.separator = this.separator.equals("TAB")?"\t":this.separator;
98 |
99 |
100 | String line;
101 | boolean firstLine=true;
102 | while ((line = bf.readLine()) != null) {
103 | String parts[]=line.split(this.separator);
104 |
105 | AbstractDoubleList wordVector=new DoubleArrayList();
106 | if(firstLine){
107 | this.dimensions=parts.length-1;
108 |
109 | this.wordNameIndex.setUpper(this.dimensions);
110 | firstLine=false;
111 | }
112 |
113 | // only consider lines with right number of dimensions
114 | if(parts.length-1==this.dimensions){
115 | for(int i=0;i .
14 | */
15 |
16 | /*
17 | * TweetToWordListCountFeatureVector.java
18 | * Copyright (C) 1999-2019 University of Waikato, Hamilton, New Zealand
19 | *
20 | */
21 |
22 | package weka.filters.unsupervised.attribute;
23 |
24 |
25 |
26 | import java.util.ArrayList;
27 | import java.util.HashSet;
28 | import java.util.List;
29 | import java.util.Set;
30 |
31 | import java.util.Arrays;
32 |
33 | import weka.core.Attribute;
34 | import weka.core.Instance;
35 | import weka.core.Instances;
36 | import weka.core.OptionMetadata;
37 | import weka.core.SparseInstance;
38 |
39 |
40 |
41 | /**
42 | *
43 | * @author Felipe Bravo-Marquez (fbravoma@waikato.ac.nz)
44 | */
45 |
46 |
47 | public class TweetToWordListCountFeatureVector extends TweetToFeatureVector {
48 |
49 | /** For serialization. */
50 | private static final long serialVersionUID = -573366510055859430L;
51 |
52 | /** The given word list as a comma separated string. */
53 | public String wordList = "love,happy,great";
54 |
55 |
56 |
57 |
58 |
59 | /**
60 | * Returns a string describing this filter.
61 | *
62 | * @return a description of the filter suitable for displaying in the
63 | * explorer/experimenter gui
64 | */
65 | @Override
66 | public String globalInfo() {
67 | return "A simple filter that counts occurrences of words from a given list.";
68 | }
69 |
70 |
71 |
72 |
73 | /* (non-Javadoc)
74 | * @see weka.filters.SimpleFilter#determineOutputFormat(weka.core.Instances)
75 | */
76 | @Override
77 | protected Instances determineOutputFormat(Instances inputFormat)
78 | throws Exception {
79 |
80 | ArrayList att = new ArrayList();
81 |
82 | // Adds all attributes of the inputformat
83 | for (int i = 0; i < inputFormat.numAttributes(); i++) {
84 | att.add(inputFormat.attribute(i));
85 | }
86 |
87 | // adds the new attribute
88 | att.add(new Attribute("wordListCount"));
89 |
90 | Instances result = new Instances(inputFormat.relationName(), att, 0);
91 |
92 | // set the class index
93 | result.setClassIndex(inputFormat.classIndex());
94 |
95 | return result;
96 | }
97 |
98 |
99 |
100 | /* (non-Javadoc)
101 | * @see weka.filters.SimpleFilter#process(weka.core.Instances)
102 | */
103 | @Override
104 | protected Instances process(Instances instances) throws Exception {
105 |
106 |
107 | // set upper value for text index
108 | m_textIndex.setUpper(instances.numAttributes() - 1);
109 |
110 | Instances result = getOutputFormat();
111 |
112 |
113 | // reference to the content of the message, users index start from zero
114 | Attribute attrCont = instances.attribute(this.m_textIndex.getIndex());
115 |
116 |
117 |
118 | for (int i = 0; i < instances.numInstances(); i++) {
119 |
120 | // copy all attribute values from the original dataset
121 | double[] values = new double[result.numAttributes()];
122 | for (int n = 0; n < instances.numAttributes(); n++)
123 | values[n] = instances.instance(i).value(n);
124 |
125 |
126 | String content = instances.instance(i).stringValue(attrCont);
127 | // tokenize the content
128 | List words = affective.core.Utils.tokenize(content, this.toLowerCase, this.standarizeUrlsUsers, this.reduceRepeatedLetters, this.m_tokenizer,this.m_stemmer,this.m_stopwordsHandler);
129 |
130 | // convert the list of words into a HashSet
131 | Set wordSet = new HashSet(Arrays.asList(wordList.split(",")));
132 |
133 | // count all the occurrences of words from the list
134 | int wordCounter = 0;
135 | for(String word:words){
136 | if(wordSet.contains(word))
137 | wordCounter++;
138 | }
139 |
140 |
141 | // add the value to the last attribute
142 | values[values.length - 1] = wordCounter;
143 |
144 |
145 | Instance inst = new SparseInstance(1, values);
146 |
147 | inst.setDataset(result);
148 |
149 | // copy possible strings, relational values...
150 | copyValues(inst, false, instances, result);
151 |
152 | result.add(inst);
153 |
154 | }
155 |
156 | return result;
157 | }
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 | /**
166 | * Main method for testing this class.
167 | *
168 | * @param args should contain arguments to the filter: use -h for help
169 | */
170 | public static void main(String[] args) {
171 | runFilter(new TweetToWordListCountFeatureVector(), args);
172 | }
173 |
174 |
175 | // OptionMetada allows setting parameters from within the command-line interface
176 | @OptionMetadata(displayName = "wordlist",
177 | description = "The list with the words to count separated by a comma symbol.",
178 | commandLineParamName = "wordlist", commandLineParamSynopsis = "-wordlist ",
179 | displayOrder = 6)
180 | public String getWordList() {
181 | return wordList;
182 | }
183 | public void setWordList(String wordList) {
184 | this.wordList = wordList;
185 | }
186 |
187 |
188 |
189 | }
190 |
--------------------------------------------------------------------------------
/src/test/java/weka/filters/unsupervised/attribute/TweetToInputLexiconFeatureVectorTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
18 | */
19 |
20 | package weka.filters.unsupervised.attribute;
21 |
22 | import weka.classifiers.meta.FilteredClassifier;
23 | import weka.core.Instances;
24 | import weka.filters.AbstractFilterTest;
25 | import weka.filters.Filter;
26 |
27 | import junit.framework.Test;
28 | import junit.framework.TestSuite;
29 |
30 | import java.io.File;
31 |
32 |
33 | /**
34 | * Tests TweetToInputLexiconFeatureVector. Run from the command line with:
35 | * java weka.filters.unsupervised.attribute.TweetToInputLexiconFeatureVectorTest
36 | *
37 | * AffectiveTweets package must either be installed or
38 | * JVM must be started in AffectiveTweets directory.
39 | *
40 | * @author FracPete and eibe
41 | * @version $Revision: 9568 $
42 | */
43 | public class TweetToInputLexiconFeatureVectorTest extends AbstractFilterTest {
44 |
45 | public TweetToInputLexiconFeatureVectorTest(String name) {
46 | super(name);
47 | }
48 |
49 | /** Creates a default TweetToInputLexiconFeatureVector filter */
50 | public Filter getFilter() {
51 | Filter f = null;
52 |
53 | // Check to see if the test is run from directory containing build_package.xml
54 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
55 | File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
56 | weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
57 | f = new TweetToInputLexiconFeatureVector();
58 | weka.core.WekaPackageManager.PACKAGES_DIR = backup;
59 | } else {
60 | f = new TweetToInputLexiconFeatureVector(); // Hope that the package is installed.
61 | }
62 | return f;
63 | }
64 |
65 | /**
66 | * Test for the FilteredClassifier used with this filter.
67 | *
68 | * @return the configured FilteredClassifier
69 | */
70 | protected FilteredClassifier getFilteredClassifier() {
71 | FilteredClassifier result;
72 |
73 | result = new FilteredClassifier();
74 |
75 | weka.filters.MultiFilter mf = new weka.filters.MultiFilter();
76 | Filter[] filters = new Filter[2];
77 | filters[0] = getFilter();
78 | weka.filters.unsupervised.attribute.RemoveType rt = new weka.filters.unsupervised.attribute.RemoveType(); // Need to remove string attributes because they are kept by this filter.
79 | filters[1] = rt;
80 | mf.setFilters(filters);
81 | result.setFilter(mf);
82 | result.setClassifier(new weka.classifiers.functions.SMO());
83 |
84 | return result;
85 | }
86 |
87 | /**
88 | * Data to be used for FilteredClassifier test.
89 | *
90 | * @return the configured FilteredClassifier
91 | */
92 | protected Instances getFilteredClassifierData() throws Exception {
93 | Instances result;
94 |
95 | // Check to see if the test is run from directory containing build_package.xml
96 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
97 | result = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
98 | } else { // Hope that package is installed.
99 | result = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
100 | }
101 |
102 | result.setClassIndex(result.numAttributes() - 1);
103 |
104 | return result;
105 | }
106 |
107 | /**
108 | * Called by JUnit before each test method. Sets up the Instances object to use based on
109 | * one of the datasets that comes with the package.
110 | *
111 | * @throws Exception if an error occurs reading the example instances.
112 | */
113 | protected void setUp() throws Exception {
114 | super.setUp();
115 |
116 | // Check to see if the test is run from directory containing build_package.xml
117 | if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
118 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
119 | } else { // Hope that package is installed.
120 | m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
121 | }
122 |
123 | m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
124 | }
125 |
126 |
127 | /* (non-Javadoc)
128 | * @see weka.filters.AbstractFilterTest#testBuffered()
129 | */
130 | public void testBuffered(){}
131 |
132 |
133 | /* (non-Javadoc)
134 | * @see weka.filters.AbstractFilterTest#testRegression()
135 | */
136 | public void testRegression(){}
137 |
138 |
139 | public static Test suite() {
140 | return new TestSuite(TweetToInputLexiconFeatureVectorTest.class);
141 | }
142 |
143 | public static void main(String[] args){
144 | junit.textui.TestRunner.run(suite());
145 | }
146 | }
147 |
--------------------------------------------------------------------------------
/wekarefs/weka/filters/unsupervised/attribute/LexiconDistantSupervisionTest.ref:
--------------------------------------------------------------------------------
1 | @relation 'Twitter Sentiment Analysis Sentiment140 Test: Dataset. More info at: http://help.sentiment140.com/-weka.filters.unsupervised.attribute.LexiconDistantSupervision-lex../AffectiveTweets/lexicons/arff_lexicons/emoticons.arff-polattpolarity-negvalnegative-posvalpositive-removeMatchingWord-I1-tokenizerweka.core.tokenizers.TweetNLPTokenizer'
2 |
3 | @attribute content string
4 | @attribute topic string
5 | @attribute class {negative,neutral,positive}
6 | @attribute polarity {negative,positive}
7 |
8 | @data
9 | {0 '@kenburbary You\'ll love your Kindle2. I\'ve had mine for a few months and never looked back. The new big one is huge! No need for remorse! ',1 kindle2,2 positive,3 positive}
10 | {0 '@mikefish Fair enough. But i have the Kindle2 and I think it\'s perfect ',1 kindle2,2 positive,3 positive}
11 | {0 'downloading apps for my iphone! So much fun There literally is an app for just about anything.',1 'iphone app',2 positive,3 positive}
12 | {0 'I\'m listening to \"P.Y.T\" by Danny Gokey <3 <3 <3 Aww, he\'s so amazing. I <3 him so much ',1 'Danny Gokey',2 positive,3 positive}
13 | {0 'is going to sleep then on a bike ride',1 sleep,2 positive,3 positive}
14 | {0 'Hello Twitter API ',1 '\"twitter api\"',2 positive,3 positive}
15 | {0 'RT @SmartChickPDX: Was just told that Nike layoffs started today ',1 nike}
16 | {0 'Back when I worked for Nike we had one fav word : JUST DO IT! ',1 nike,2 positive,3 positive}
17 | {0 'Class... The 50d is supposed to come today ',1 50d,2 positive,3 positive}
18 | {0 'needs someone to explain lambda calculus to him! ',1 'lambda calculus'}
19 | {0 'insects have infected my spinach plant ',1 insects}
20 | {0 'I hate revision, it\'s so boring! I am totally unprepared for my exam tomorrow Things are not looking good...',1 exam}
21 | {0 'Higher physics exam tommorow, not lookin forward to it much ',1 exam}
22 | {0 'It\'s a bank holiday, yet I\'m only out of work now. Exam season sucks',1 exam}
23 | {0 'i srsly hate the stupid twitter API timeout thing, soooo annoying!!!!! ',1 'twitter api'}
24 | {0 'Night at the Museum tonite instead of UP. oh well. that 4 yr old better enjoy it. LOL',1 '\"night at the museum\"'}
25 | {0 'Ahh...got rid of stupid time warner today & now taking a nap while the roomies cook for me. Pretty good end for a monday ',1 'time warner',3 positive}
26 | {0 'Recovering from surgery..wishing @julesrenner was here ',1 surgery}
27 | {0 'My wrist still hurts. I have to get it looked at. I HATE the dr/dentist/scary places. Time to watch Eagle eye. If you want to join, txt!',1 dentist,2 positive}
28 | {0 '@kirstiealley my dentist is great but she\'s expensive...',1 dentist}
29 | {0 'is studing math tomorrow exam and dentist ',1 dentist,2 positive,3 positive}
30 | {0 'Going to the dentist later.',1 dentist}
31 | {0 'Found NOTHING at Nike Factory Off to Banana Republic Outlet! http/myloc.me/2zic',1 nike}
32 | {0 'According to the create a school, Notre Dame will have 7 receivers in NCAA 10 at 84 or higher rating *sweet*',1 'notre dame school',2 positive,3 positive}
33 | {0 '@siratomofbones we tried but Time Warner wasn\'t being nice so we recorded today. ',1 'time warner',3 positive}
34 | {0 'Safari 4 is fast Even on my shitty AT&T tethering.',1 at&t,3 positive}
35 | {0 '@ArunBasilLal I love Google Translator too ! Good day mate !',1 google,2 positive,3 positive}
36 | {0 'My Kindle2 came and I LOVE it! ',1 kindle2,2 positive,3 positive}
37 | {0 'Obama is quite a good comedian! check out his dinner speech on CNN very funny jokes.',1 obama,2 positive,3 positive}
38 | {0 'Obama\'s got JOKES!! haha just got to watch a bit of his after dinner speech from last night... i\'m in love with mr. president ',1 obama,2 positive,3 positive}
39 | {0 '@ambcharlesfield lol. Ah my skin is itchy damn lawnmowing.',1 itchy}
40 | {0 '@dannygokey I love you DANNY GOKEY!! ',1 'Danny Gokey',2 positive,3 positive}
41 | {0 '@Fraggle312 oh those are awesome! i so wish they weren\'t owned by nike ',1 nike}
42 | {0 '@mitzs hey bud np I do so love my 50D, although I\'d love a 5D mkII more',1 50d,2 positive,3 positive}
43 | {0 '@jonduenas @robynlyn just got us a 50D for the office. ',1 50d,2 positive,3 positive}
44 | {0 'Learning about lambda calculus ',1 'lambda calculus',2 positive,3 positive}
45 | {0 'Just had McDonalds for dinner. It was goooood. Big Mac Meal. ',1 mcdonalds,2 positive,3 positive}
46 | {0 'Stopped to have lunch at McDonalds. Chicken Nuggetssss! yummmmmy.',1 mcdonalds,2 positive,3 positive}
47 | {0 'my exam went good. @HelloLeonie: your prayers worked ',1 exam,2 positive,3 positive}
48 | {0 'Only one exam left, and i am so happy for it ',1 exam,2 positive,3 positive}
49 | {0 '@mashable I never did thank you for including me in your Top 100 Twitter Authors! You Rock! (& I New Wave ) http://bit.ly/EOrFV',1 mashable,2 positive,3 positive}
50 | {0 'HTML 5 Demos! Lots of great stuff to come! Yes, I\'m excited. http://htmlfive.appspot.com #io2009 #googleio',1 googleio,2 positive,3 positive}
51 | {0 '#RantsAndRaves The worst thing about GM (concord / pleasant hill / martinez is the fucking UAW. .. http://buzzup.com/4ueb',1 gm}
52 | {0 'Just got home from chick-fil-a with the boys. Damn my internets down stupid time warner',1 'time warner'}
53 | {0 'confirmed: it\'s Time Warner\'s fault, not Facebook\'s, that fb is taking about 3 minutes to load. so tempted to switch to verizon ',1 'time warner'}
54 | {0 'this dentist\'s office is cold ',1 dentist}
55 | {0 'dropped her broccoli walking home from safeway! so depressed',1 safeway,2 neutral}
56 | {0 'Nike rocks. I\'m super grateful for what I\'ve done with them & the European Division of NIKE is BEYOND! @whitSTYLES @muchasmuertes',1 nike,2 positive,3 positive}
57 | {0 '@sheridanmarfil - its not so much my obsession with cell phones, but the iphone! i\'m a slave to at&t forever because of it. ',1 at&t,3 positive}
58 | {0 'Ahhh... back in a *real* text editing environment. I <3 LaTeX.',1 latex,2 positive,3 positive}
59 |
--------------------------------------------------------------------------------
/src/main/java/weka/core/tokenizers/TweetNLPTokenizer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This program is free software: you can redistribute it and/or modify
3 | * it under the terms of the GNU General Public License as published by
4 | * the Free Software Foundation, either version 3 of the License, or
5 | * (at your option) any later version.
6 | *
7 | * This program is distributed in the hope that it will be useful,
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | * GNU General Public License for more details.
11 | *
12 | * You should have received a copy of the GNU General Public License
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
14 | */
15 |
16 | /*
17 | * TweetNLPTokenizer.java
18 | * Copyright (C) 1999-2018 University of Waikato, Hamilton, New Zealand
19 | *
20 | */
21 |
22 | package weka.core.tokenizers;
23 |
24 |
25 | import java.util.Iterator;
26 | import java.util.List;
27 |
28 | import cmu.arktweetnlp.Twokenize;
29 |
30 | import weka.core.RevisionUtils;
31 | import weka.core.TechnicalInformation;
32 | import weka.core.TechnicalInformation.Type;
33 |
34 |
35 |
36 |
37 | /**
38 | * A Twitter-specific tokenizer based on the CMU TwitterNLP library: http://www.cs.cmu.edu/~ark/TweetNLP/
39 | *
40 | *
41 | *
42 | * BibTeX:
43 | *
44 | * @InProceedings{twitterNLP,
45 | * Title = {Part-of-speech tagging for twitter: Annotation, features, and experiments},
46 | * Author = {Gimpel, Kevin and Schneider, Nathan and O'Connor, Brendan and Das, Dipanjan and Mills, Daniel and Eisenstein, Jacob and Heilman, Michael and Yogatama, Dani and Flanigan, Jeffrey and Smith, Noah A},
47 | * Booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: short papers-Volume 2},
48 | * Year = {2011},
49 | * Organization = {Association for Computational Linguistics},
50 | * Pages = {42--47}
51 | * }
52 | *
53 |
54 | *
55 | *
56 | * @author Felipe Bravo-Marquez (fbravoma@waikato.ac.nz)
57 | * @version $Revision: 1 $
58 | */
59 | public class TweetNLPTokenizer extends Tokenizer {
60 |
61 | /** For serialization. **/
62 | private static final long serialVersionUID = 4352757127093531518L;
63 |
64 |
65 | /** the actual tokenizer */
66 | protected transient Iterator m_tokenIterator;
67 |
68 |
69 |
70 | /**
71 | * Returns a string describing this tokenizer.
72 | *
73 | * @return a description of the filter suitable for displaying in the
74 | * explorer/experimenter gui
75 | */
76 | @Override
77 | public String globalInfo() {
78 | return "A Twitter-specific tokenizer based on the CMU TweetNLP library.\n" + getTechnicalInformation().toString();
79 | }
80 |
81 |
82 |
83 | /**
84 | * Returns an instance of a TechnicalInformation object, containing
85 | * detailed information about the technical background of this class,
86 | * e.g., paper reference or book this class is based on.
87 | *
88 | * @return the technical information about this class
89 | */
90 | public TechnicalInformation getTechnicalInformation() {
91 | TechnicalInformation result;
92 |
93 | result = new TechnicalInformation(Type.INPROCEEDINGS);
94 | result.setValue(TechnicalInformation.Field.AUTHOR, "Gimpel, Kevin and Schneider, Nathan and O'Connor, Brendan and Das, Dipanjan and Mills, Daniel and Eisenstein, Jacob and Heilman, Michael and Yogatama, Dani and Flanigan, Jeffrey and Smith, Noah A");
95 | result.setValue(TechnicalInformation.Field.TITLE, "Part-of-speech tagging for twitter: Annotation, features, and experiments");
96 | result.setValue(TechnicalInformation.Field.YEAR, "2011");
97 | result.setValue(TechnicalInformation.Field.URL, "http://www.cs.cmu.edu/~ark/TweetNLP/");
98 | result.setValue(TechnicalInformation.Field.NOTE, "The Weka tokenizer works with version 0.32 of TweetNLP.");
99 |
100 | return result;
101 | }
102 |
103 |
104 |
105 | /**
106 | * Tests if this enumeration contains more elements.
107 | *
108 | * @return true if and only if this enumeration object contains at least one
109 | * more element to provide; false otherwise.
110 | */
111 | public boolean hasMoreElements() {
112 | return this.m_tokenIterator.hasNext();
113 | }
114 |
115 | /**
116 | * Returns the next element of this enumeration if this enumeration object has
117 | * at least one more element to provide.
118 | *
119 | * @return the next element of this enumeration.
120 | */
121 | @Override
122 | public String nextElement() {
123 | return this.m_tokenIterator.next();
124 | }
125 |
126 | /**
127 | * Sets the string to tokenize. Tokenization happens immediately.
128 | *
129 | * @param s the string to tokenize
130 | */
131 | @Override
132 | public void tokenize(String s) {
133 |
134 | List words=Twokenize.tokenizeRawTweetText(s);
135 | this.m_tokenIterator=words.iterator();
136 |
137 |
138 | }
139 |
140 |
141 |
142 |
143 | /**
144 | * Returns the revision string.
145 | *
146 | * @return the revision
147 | */
148 | public String getRevision() {
149 | return RevisionUtils.extract("$Revision: 1 $");
150 | }
151 |
152 |
153 | /**
154 | * Runs the tokenizer with the given options and strings to tokenize. The
155 | * tokens are printed to stdout.
156 | *
157 | * @param args the commandline options and strings to tokenize
158 | */
159 | public static void main(String[] args) {
160 | runTokenizer(new TweetNLPTokenizer(), args);
161 | }
162 |
163 | }
164 |
--------------------------------------------------------------------------------
/src/test/resources/wekarefs/weka/filters/unsupervised/attribute/LexiconDistantSupervisionTest.ref:
--------------------------------------------------------------------------------
1 | @relation 'Twitter Sentiment Analysis Sentiment140 Test: Dataset. More info at: http://help.sentiment140.com/-weka.filters.unsupervised.attribute.LexiconDistantSupervision-lex../AffectiveTweets/lexicons/arff_lexicons/emoticons.arff-polattpolarity-negvalnegative-posvalpositive-removeMatchingWord-I1-tokenizerweka.core.tokenizers.TweetNLPTokenizer'
2 |
3 | @attribute content string
4 | @attribute topic string
5 | @attribute class {negative,neutral,positive}
6 | @attribute polarity {negative,positive}
7 |
8 | @data
9 | {0 '@kenburbary You\'ll love your Kindle2. I\'ve had mine for a few months and never looked back. The new big one is huge! No need for remorse! ',1 kindle2,2 positive,3 positive}
10 | {0 '@mikefish Fair enough. But i have the Kindle2 and I think it\'s perfect ',1 kindle2,2 positive,3 positive}
11 | {0 'downloading apps for my iphone! So much fun There literally is an app for just about anything.',1 'iphone app',2 positive,3 positive}
12 | {0 'I\'m listening to \"P.Y.T\" by Danny Gokey <3 <3 <3 Aww, he\'s so amazing. I <3 him so much ',1 'Danny Gokey',2 positive,3 positive}
13 | {0 'is going to sleep then on a bike ride',1 sleep,2 positive,3 positive}
14 | {0 'Hello Twitter API ',1 '\"twitter api\"',2 positive,3 positive}
15 | {0 'RT @SmartChickPDX: Was just told that Nike layoffs started today ',1 nike}
16 | {0 'Back when I worked for Nike we had one fav word : JUST DO IT! ',1 nike,2 positive,3 positive}
17 | {0 'Class... The 50d is supposed to come today ',1 50d,2 positive,3 positive}
18 | {0 'needs someone to explain lambda calculus to him! ',1 'lambda calculus'}
19 | {0 'insects have infected my spinach plant ',1 insects}
20 | {0 'I hate revision, it\'s so boring! I am totally unprepared for my exam tomorrow Things are not looking good...',1 exam}
21 | {0 'Higher physics exam tommorow, not lookin forward to it much ',1 exam}
22 | {0 'It\'s a bank holiday, yet I\'m only out of work now. Exam season sucks',1 exam}
23 | {0 'i srsly hate the stupid twitter API timeout thing, soooo annoying!!!!! ',1 'twitter api'}
24 | {0 'Night at the Museum tonite instead of UP. oh well. that 4 yr old better enjoy it. LOL',1 '\"night at the museum\"'}
25 | {0 'Ahh...got rid of stupid time warner today & now taking a nap while the roomies cook for me. Pretty good end for a monday ',1 'time warner',3 positive}
26 | {0 'Recovering from surgery..wishing @julesrenner was here ',1 surgery}
27 | {0 'My wrist still hurts. I have to get it looked at. I HATE the dr/dentist/scary places. Time to watch Eagle eye. If you want to join, txt!',1 dentist,2 positive}
28 | {0 '@kirstiealley my dentist is great but she\'s expensive...',1 dentist}
29 | {0 'is studing math tomorrow exam and dentist ',1 dentist,2 positive,3 positive}
30 | {0 'Going to the dentist later.',1 dentist}
31 | {0 'Found NOTHING at Nike Factory Off to Banana Republic Outlet! http/myloc.me/2zic',1 nike}
32 | {0 'According to the create a school, Notre Dame will have 7 receivers in NCAA 10 at 84 or higher rating *sweet*',1 'notre dame school',2 positive,3 positive}
33 | {0 '@siratomofbones we tried but Time Warner wasn\'t being nice so we recorded today. ',1 'time warner',3 positive}
34 | {0 'Safari 4 is fast Even on my shitty AT&T tethering.',1 at&t,3 positive}
35 | {0 '@ArunBasilLal I love Google Translator too ! Good day mate !',1 google,2 positive,3 positive}
36 | {0 'My Kindle2 came and I LOVE it! ',1 kindle2,2 positive,3 positive}
37 | {0 'Obama is quite a good comedian! check out his dinner speech on CNN very funny jokes.',1 obama,2 positive,3 positive}
38 | {0 'Obama\'s got JOKES!! haha just got to watch a bit of his after dinner speech from last night... i\'m in love with mr. president ',1 obama,2 positive,3 positive}
39 | {0 '@ambcharlesfield lol. Ah my skin is itchy damn lawnmowing.',1 itchy}
40 | {0 '@dannygokey I love you DANNY GOKEY!! ',1 'Danny Gokey',2 positive,3 positive}
41 | {0 '@Fraggle312 oh those are awesome! i so wish they weren\'t owned by nike ',1 nike}
42 | {0 '@mitzs hey bud np I do so love my 50D, although I\'d love a 5D mkII more',1 50d,2 positive,3 positive}
43 | {0 '@jonduenas @robynlyn just got us a 50D for the office. ',1 50d,2 positive,3 positive}
44 | {0 'Learning about lambda calculus ',1 'lambda calculus',2 positive,3 positive}
45 | {0 'Just had McDonalds for dinner. It was goooood. Big Mac Meal. ',1 mcdonalds,2 positive,3 positive}
46 | {0 'Stopped to have lunch at McDonalds. Chicken Nuggetssss! yummmmmy.',1 mcdonalds,2 positive,3 positive}
47 | {0 'my exam went good. @HelloLeonie: your prayers worked ',1 exam,2 positive,3 positive}
48 | {0 'Only one exam left, and i am so happy for it ',1 exam,2 positive,3 positive}
49 | {0 '@mashable I never did thank you for including me in your Top 100 Twitter Authors! You Rock! (& I New Wave ) http://bit.ly/EOrFV',1 mashable,2 positive,3 positive}
50 | {0 'HTML 5 Demos! Lots of great stuff to come! Yes, I\'m excited. http://htmlfive.appspot.com #io2009 #googleio',1 googleio,2 positive,3 positive}
51 | {0 '#RantsAndRaves The worst thing about GM (concord / pleasant hill / martinez is the fucking UAW. .. http://buzzup.com/4ueb',1 gm}
52 | {0 'Just got home from chick-fil-a with the boys. Damn my internets down stupid time warner',1 'time warner'}
53 | {0 'confirmed: it\'s Time Warner\'s fault, not Facebook\'s, that fb is taking about 3 minutes to load. so tempted to switch to verizon ',1 'time warner'}
54 | {0 'this dentist\'s office is cold ',1 dentist}
55 | {0 'dropped her broccoli walking home from safeway! so depressed',1 safeway,2 neutral}
56 | {0 'Nike rocks. I\'m super grateful for what I\'ve done with them & the European Division of NIKE is BEYOND! @whitSTYLES @muchasmuertes',1 nike,2 positive,3 positive}
57 | {0 '@sheridanmarfil - its not so much my obsession with cell phones, but the iphone! i\'m a slave to at&t forever because of it. ',1 at&t,3 positive}
58 | {0 'Ahhh... back in a *real* text editing environment. I <3 LaTeX.',1 latex,2 positive,3 positive}
59 |
--------------------------------------------------------------------------------