├── README.md
├── Report
│   ├── Personality_Attribution_Report.pdf
│   └── Predicting Personality Traits of Authors from Text.pdf
└── code and data
    ├── essays.csv
    ├── mlp_avg_res.xlsx
    ├── mlp_baseline.py
    ├── mlp_grid_search.py
    ├── mlp_iterative.py
    ├── mlp_kfold.py
    ├── mlp_simple.py
    ├── naive_grid_search.py
    ├── naive_iterative_para_opt.py
    ├── naive_simple.py
    ├── naive_stopwords_stemmed.py
    ├── svc_kfold.py
    └── w2v_features.txt
/README.md:
--------------------------------------------------------------------------------
1 | # Personality-Attribution-using-Natural-Language-Processing
2 |
3 | ## A project by Chirayu Desai, Akhilesh Hegde and Yuzhou Yin
4 |
5 | All scripts take the personality trait to predict as a command line argument and assume that the data files they require are in the same directory.
6 |
7 | Example usage: python file-name.py trait
8 | trait can take values from 0 to 4, denoting the trait for which the user wants to run the model:
9 | 0: Extraversion
10 | 1: Neuroticism
11 | 2: Agreeableness
12 | 3: Conscientiousness
13 | 4: Openness
14 |
15 | The details of the various ways and options of running each file can be found in the docstring at the top of that file.
16 |
17 | essays.csv is our dataset file.
18 |
19 |
20 | For Naive Bayes, the parameters and features were decided by running naive_grid_search.py, naive_iterative_para_opt.py and naive_stopwords_stemmed.py for each trait.
21 | naive_simple.py runs each trait with the optimal value of the smoothing parameter to generate the published results.
22 |
23 | For the multilayer perceptron classifier, the parameters and features were decided by running mlp_baseline.py, mlp_grid_search.py, mlp_kfold.py and mlp_iterative.py for each trait.
24 | The published results are the average of multiple runs of mlp_simple.py. The detailed results of each run can be found in mlp_avg_res.xlsx.
25 |
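26 | For example, to run the tuned models for Extraversion (trait 0), assuming essays.csv is in the working directory (the values below are the tuned defaults from the scripts):
27 |
28 |     python naive_simple.py 0 0.07
29 |     python mlp_simple.py 0 tanh adaptive lbfgs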
--------------------------------------------------------------------------------
/Report/Personality_Attribution_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/862eb0ffe53655dd7351bb3abf9581b09935fa3f/Report/Personality_Attribution_Report.pdf
--------------------------------------------------------------------------------
/Report/Predicting Personality Traits of Authors from Text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/862eb0ffe53655dd7351bb3abf9581b09935fa3f/Report/Predicting Personality Traits of Authors from Text.pdf
--------------------------------------------------------------------------------
/code and data/essays.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/862eb0ffe53655dd7351bb3abf9581b09935fa3f/code and data/essays.csv
--------------------------------------------------------------------------------
/code and data/mlp_avg_res.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/862eb0ffe53655dd7351bb3abf9581b09935fa3f/code and data/mlp_avg_res.xlsx
--------------------------------------------------------------------------------
/code and data/mlp_baseline.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | import numpy as np
3 | from nltk.tokenize import word_tokenize
4 | from sklearn.model_selection import train_test_split
5 | from numpy import zeros
6 | from keras.preprocessing.text import Tokenizer
7 | from keras import layers, Sequential
8 | from matplotlib import pyplot
9 |
10 | def get_common_words(text):
11 | text = nltk.word_tokenize(text)
12 | text = nltk.Text(text)
13 | informative_words = [word.lower() for word in text if len(word) > 5]
14 | return set(informative_words)
15 |
16 | X = []
17 | y = []
18 |
19 | # Read data from file (the essay text itself contains commas, so fields are peeled off from the right with rsplit)
20 | with open('essays.csv', encoding='latin-1') as f:
21 | i = 0
22 | for row in f:
23 | entry = []
24 | for j in range(5):
25 | if j == 0:
26 | val = (row.rsplit(",", 1)[-1])[:-1]
27 | else:
28 | val = row.rsplit(",", 1)[-1]
29 | entry.insert(0, val)
30 | row = row.rsplit(",")[:-1]
31 | row = ",".join(row)
32 | entry.insert(0, row.split(",",1)[0])
33 | entry.insert(1, row.split(",",1)[1])
34 | i += 1
35 | res = get_common_words(entry[1])
36 | X.append(res)
37 | y.append(entry[2:])
38 | X = X[1:]
39 | y = y[1:]
40 |
41 |
42 | # Use pre-computed aggregated word2vec vectors as features (these replace the word sets built above)
43 | l = []
44 | with open("w2v_features.txt") as f:
45 | contents = f.read()
46 | contents = contents.split("\n")
47 | for i in contents:
48 | if len(i) > 0:
49 | vec = i.split(",")
50 | vec = [float(x) for x in vec]
51 | l.append(np.array(vec))
52 | X = np.array(l)
53 |
54 | # Split into training and test sets
55 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
56 |
57 |
58 | # Function used to vectorize label vectors
59 | def vectorize_labels(labels, dimension=5):
60 | results = zeros((len(labels), dimension))
61 | for i, label in enumerate(labels):
62 | for j in range(5):
63 | if label[j] == 'y':
64 | results[i, j] = 1
65 | else:
66 | results[i, j] = 0
67 | return results
68 |
69 | y_train_labels = vectorize_labels(y_train)
70 | y_test_labels = vectorize_labels(y_test)
71 |
72 |
73 | # Create model using Keras
74 | model = Sequential()
75 | model.add(layers.Dense(8, activation='relu', input_shape=(X.shape[1], )))
76 | model.add(layers.Dense(10, activation='relu'))
77 | model.add(layers.Dense(5, activation='sigmoid'))
78 | model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
79 |
80 | # Training
81 | history = model.fit(X_train, y_train_labels, epochs=100, batch_size=100, validation_split=0.1)
82 |
83 | # Evaluation
84 | predictions = model.predict(X_test)
85 | labels = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']
86 | for i, test_rec in enumerate(X_test):
87 |     s = ""
88 |     for j, val in enumerate(predictions[i]):
89 |         if val > 0.5:
90 |             s = s + labels[j] + " "
91 |     print(s)  # print the predicted trait labels for each test essay
92 | # Plot the accuracy and loss curves
93 | def plot_accuracy(history):
94 | pyplot.plot(history.history['acc'])
95 | pyplot.plot(history.history['val_acc'])
96 | pyplot.title('model accuracy')
97 | pyplot.ylabel('accuracy')
98 | pyplot.xlabel('epoch')
99 | pyplot.legend(['training', 'validation'], loc='lower right')
100 | pyplot.show()
101 |
102 | def plot_loss(history):
103 | pyplot.plot(history.history['loss'])
104 | pyplot.plot(history.history['val_loss'])
105 | pyplot.title('model loss')
106 | pyplot.ylabel('loss')
107 | pyplot.xlabel('epoch')
108 | pyplot.legend(['training', 'validation'], loc='upper right')
109 | pyplot.show()
110 |
111 | # Summarize history for accuracy
112 | plot_accuracy(history)
113 |
114 | # Summarize history for loss
115 | plot_loss(history)
116 |
117 | # Read results for the test set predictions
118 | results = model.evaluate(X_test, y_test_labels)
119 | print(model.metrics_names)
120 | print(results)
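121 |
122 | # Note: w2v_features.txt is expected to contain one comma-separated, pre-aggregated
123 | # word2vec vector per essay (see the loading loop above). The script that generated it
124 | # is not part of this repository; a minimal, hypothetical sketch using gensim to average
125 | # the per-token vectors of each essay (tokenized_essays is an assumed name) might look like:
126 | #
127 | #     from gensim.models import Word2Vec
128 | #     model = Word2Vec(tokenized_essays, vector_size=100, min_count=1)
129 | #     with open("w2v_features.txt", "w") as out:
130 | #         for tokens in tokenized_essays:
131 | #             vecs = [model.wv[w] for w in tokens if w in model.wv]
132 | #             out.write(",".join(str(v) for v in np.mean(vecs, axis=0)) + "\n")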
--------------------------------------------------------------------------------
/code and data/mlp_grid_search.py:
--------------------------------------------------------------------------------
1 | """
2 | mlp_grid_search.py: Runs Multilayer Perceptron Neural Network to predict a user selected personality trait.
3 |
4 | Performs grid-search cross validation to figure out optimal values of hyper-parameters
5 |
6 | The data essays are converted to lowercase before use.
7 | The labels are Binarized
8 |
9 | Example usage: python mlp_grid_search.py <trait>
10 |
11 | <trait> can take values from 0 to 4, denoting the trait for which the user wants to run the model:
12 | 0: Extraversion
13 | 1: Neuroticism
14 | 2: Agreeableness
15 | 3: Conscientiousness
16 | 4: Openness
17 |
18 | """
19 | __author__ = "Chirayu Desai"
20 |
21 | import sys
22 | import warnings
23 | import numpy as np
24 | import pandas as pd
25 | from sklearn.neural_network import MLPClassifier
26 | from sklearn.model_selection import GridSearchCV
27 | from sklearn.metrics import accuracy_score
28 | from sklearn.metrics import classification_report
29 | from sklearn.feature_extraction.text import TfidfVectorizer
30 | from sklearn.model_selection import train_test_split
31 |
32 |
33 | warnings.filterwarnings('ignore')
34 |
35 | # read data from csv
36 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
37 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')
38 |
39 |
40 | def get_choice(choice):
41 | """
42 | Get the user's choice of trait to predict, based on the provided command line option
43 | :param choice: the value of the command line option
44 | :return: the trait label and its default (activation, learning rate, solver) parameters
45 | """
46 | return {
47 | '0': ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
48 | '1': ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
49 | '2': ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
50 | '3': ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
51 | '4': ('Openness', ('relu', 'invscaling', 'lbfgs'))
52 | }.get(choice, (None, None))
53 |
54 |
55 | def classify(trait_arg):
56 | """
57 | Runs MLP classifier with provided parameters
58 | :param trait_arg: the trait to predict
59 | """
60 | x = df['essay'][1:]
61 | x = x.str.lower()
62 | y = df[trait_arg][1:]
63 | # binarize labels
64 | y = np.where(y == 'n', 0, 1)
65 |
66 | print("Predicting ", trait_arg)
67 | print("Test set, Train Set ratio: 1:3")
68 |
69 | # Test train split in 25 : 75 ratio
70 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)
71 |
72 | # TF-IDF vectorizer
73 | vectorizer = TfidfVectorizer()
74 | xx_train = vectorizer.fit_transform(x_train)
75 | xx_test = vectorizer.transform(x_test)
76 |
77 | # Lists of Possible Hyper-parameter values
78 | activation_types = ['identity', 'logistic', 'tanh', 'relu']
79 | learning_rates = ['constant', 'invscaling', 'adaptive']
80 | solver_types = ['lbfgs', 'sgd', 'adam']
81 | hidden_layers = [(20,), (40,), (60,), (6, 10), (10, 20), (20, 40), (50, 100), (75, 150, 300), (50, 100, 150, 200)]
82 | iterations = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 500]
83 |
84 | parameters = [{'activation': activation_types, 'learning_rate': learning_rates,
85 | 'solver': solver_types, 'hidden_layer_sizes': hidden_layers, 'max_iter': iterations}]
86 |
87 | scorers = ['accuracy', 'precision', 'recall', 'f1']
88 |
89 | # Tune for each scorer
90 | for scorer in scorers:
91 | print("Tuning hyper-parameters for %s" % scorer)
92 | print()
93 |
94 | # Grid Search with 10-fold cross validation
95 | clf = GridSearchCV(MLPClassifier(), parameters, cv=10, scoring=scorer, n_jobs=4)
96 | clf.fit(xx_train, y_train)
97 |
98 | print("Best parameters set found on development set:")
99 | print()
100 | print(clf.best_params_)
101 | print()
102 | print("Grid scores on development set:")
103 | print()
104 | means = clf.cv_results_['mean_test_score']
105 | standard_deviation = clf.cv_results_['std_test_score']
106 | for mean, std, params in zip(means, standard_deviation, clf.cv_results_['params']):
107 | print("%0.3f (+/-%0.03f) for %r"
108 | % (mean, std * 2, params))
109 | print()
110 |
111 | print("Detailed classification report:")
112 | print()
113 | print("The model is trained on the full development set.")
114 | print("The scores are computed on the full evaluation set.")
115 | print()
116 | predictions = clf.predict(xx_test)
117 | print(classification_report(y_test, predictions))
118 | print(accuracy_score(y_test, predictions))
119 | print()
120 |
121 |
122 | if __name__ == "__main__":
123 |
124 | if not len(sys.argv) > 1:
125 | print("No command line Arguments Provided")
126 | elif len(sys.argv) == 2:
127 | trait_index = sys.argv[1]
128 | trait, params = get_choice(trait_index)
129 | if trait is None:
130 | print("Trait index value should be between 0 and 4")
131 | else:
132 | classify(trait)
133 | else:
134 | print("Incorrect command line arguments")
135 |
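136 | # Example (assumes essays.csv is in the working directory):
137 | #     python mlp_grid_search.py 4
138 | # tunes the MLP hyper-parameters for Openness. Note that the full grid (4 activations
139 | # x 3 learning rates x 3 solvers x 9 layer layouts x 14 iteration caps) is large, so
140 | # a complete run can take a long time.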
--------------------------------------------------------------------------------
/code and data/mlp_iterative.py:
--------------------------------------------------------------------------------
1 | """
2 | mlp_iterative.py: Runs Multilayer Perceptron Neural Network to predict a user selected personality trait.
3 |
4 | Performs iterative scoring to estimate best parameters
5 | The data essays are converted to lowercase before use
6 |
7 | Example usage: python mlp_iterative.py <trait>
8 |
9 | <trait> can take values from 0 to 4, denoting the trait for which the user wants to run the model:
10 | 0: Extraversion
11 | 1: Neuroticism
12 | 2: Agreeableness
13 | 3: Conscientiousness
14 | 4: Openness
15 |
16 | """
17 | __author__ = "Chirayu Desai"
18 |
19 | import sys
20 | import warnings
21 | import pandas as pd
22 | from itertools import product
23 | from sklearn.neural_network import MLPClassifier
24 | from sklearn.metrics import accuracy_score
25 | from sklearn.metrics import classification_report
26 | from sklearn.feature_extraction.text import TfidfVectorizer
27 | from sklearn.model_selection import train_test_split
28 |
29 | warnings.filterwarnings('ignore')
30 |
31 | # read data from csv
32 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
33 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')
34 |
35 |
36 | def get_choice(choice):
37 | """
38 | Get the user's choice of trait to predict, based on the provided command line option
39 | :param choice: the value of the command line option
40 | :return: the trait label and its default (activation, learning rate, solver) parameters
41 | """
42 | return {
43 | '0': ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
44 | '1': ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
45 | '2': ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
46 | '3': ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
47 | '4': ('Openness', ('relu', 'invscaling', 'lbfgs'))
48 | }.get(choice, (None, None))
49 |
50 |
51 | def classify(trait_arg, activation_arg, learning_rate_arg, solver_arg):
52 | """
53 | Runs MLP classifier with provided parameters
54 | :param trait_arg: the trait to predict
55 | :param activation_arg: the activation function
56 | :param learning_rate_arg: the type of learning for neural network
57 | :param solver_arg: the type of solver to be used
58 | """
59 | x = df['essay'][1:]
60 | x = x.str.lower()
61 | y = df[trait_arg][1:]
62 |
63 | print("Predicting ", trait_arg, " with arguments = ", activation_arg, "\t", learning_rate_arg, "\t", solver_arg)
64 | print("Test set, Train Set ratio: 1:3")
65 |
66 | # Test train split in 25 : 75 ratio
67 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)
68 |
69 | # TF-IDF vectorizer
70 | vectorizer = TfidfVectorizer()
71 | xx_train = vectorizer.fit_transform(x_train)
72 | xx_test = vectorizer.transform(x_test)
73 |
74 | # Multilayer Perceptron Classifier with single hidden layer size = 60
75 | classifier = MLPClassifier(activation=activation_arg, alpha=0.0001, hidden_layer_sizes=(60,),
76 | learning_rate=learning_rate_arg, max_iter=20, solver=solver_arg)
77 | classifier.fit(xx_train, y_train)
78 |
79 | predictions = classifier.predict(xx_test)
80 | print("Confusion Matrix:")
81 | print(classification_report(y_test, predictions))
82 | score = accuracy_score(y_test, predictions)
83 | print("Accuracy:", score)
84 |
85 |
86 | if __name__ == "__main__":
87 |
88 | if not len(sys.argv) > 1:
89 | print("No command line Arguments Provided")
90 | elif len(sys.argv) == 2:
91 | trait_index = sys.argv[1]
92 | trait, params = get_choice(trait_index)
93 | if trait is None:
94 | print("Trait index value should be between 0 and 4")
95 | else:
96 | performance = dict()
97 |
98 | activation_types = ['identity', 'logistic', 'tanh', 'relu']
99 | learning_rates = ['constant', 'invscaling', 'adaptive']
100 | solver_types = ['lbfgs', 'sgd', 'adam']
101 |
102 | a_l_s = product(activation_types, learning_rates, solver_types)
103 | for params in a_l_s:
104 | ac, lr, sl = params
105 | classify(trait, ac, lr, sl)
106 | else:
107 | print("Incorrect command line arguments")
108 |
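109 | # Example:
110 | #     python mlp_iterative.py 1
111 | # trains and scores one MLP for Neuroticism for each of the 36 (activation,
112 | # learning rate, solver) combinations and prints a report for each.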
--------------------------------------------------------------------------------
/code and data/mlp_kfold.py:
--------------------------------------------------------------------------------
1 | """
2 | mlp_kfold.py: Runs Multilayer Perceptron Neural Network to predict a user selected personality trait.
3 |
4 | Performs k-fold cross validation to figure out optimal values of hyper-parameters
5 |
6 | The data essays are converted to lowercase before use.
7 | The labels are Binarized
8 |
9 | Example usage: python mlp_kfold.py <trait>
10 |
11 | <trait> can take values from 0 to 4, denoting the trait for which the user wants to run the model:
12 | 0: Extraversion
13 | 1: Neuroticism
14 | 2: Agreeableness
15 | 3: Conscientiousness
16 | 4: Openness
17 |
18 | """
19 | __author__ = "Chirayu Desai"
20 |
21 |
22 | import sys
23 | import warnings
24 | import operator
25 | import pandas as pd
26 | from sklearn.model_selection import KFold
27 | from sklearn.neural_network import MLPClassifier
28 | from sklearn.metrics import accuracy_score
29 | from sklearn.metrics import classification_report
30 | from sklearn.feature_extraction.text import TfidfVectorizer
31 | from collections import OrderedDict
32 | from itertools import product
33 |
34 |
35 | warnings.filterwarnings('ignore')
36 |
37 | # read data from csv
38 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
39 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')
40 |
41 |
42 | def get_choice(choice):
43 | """
44 | Get the user's choice of trait to predict, based on the provided command line option
45 | :param choice: the value of the command line option
46 | :return: the trait label and its default (activation, learning rate, solver) parameters
47 | """
48 | return {
49 | '0': ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
50 | '1': ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
51 | '2': ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
52 | '3': ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
53 | '4': ('Openness', ('relu', 'invscaling', 'lbfgs'))
54 | }.get(choice, (None, None))
55 |
56 |
57 | def parse_classification_report(classification_report_arg):
58 | """
59 | Source: StackOverflow
60 | Parse a sklearn classification report into a dict keyed by class name
61 | and containing a tuple (precision, recall, fscore, support) for each class
62 | :param classification_report_arg: the generated classification report
63 | """
64 | lines = classification_report_arg.split('\n')
65 | # Remove empty lines
66 | lines = list(filter(lambda l: not len(l.strip()) == 0, lines))
67 |
68 | # Starts with a header, then score for each class and finally an average
69 | header = lines[0]
70 | cls_lines = lines[1:-1]
71 | avg_line = lines[-1]
72 |
73 | assert header.split() == ['precision', 'recall', 'f1-score', 'support']
74 | assert avg_line.split()[0] == 'avg'
75 |
76 | # We cannot simply use split because class names can have spaces. So instead
77 | # figure the width of the class field by looking at the indentation of the
78 | # precision header
79 | cls_field_width = len(header) - len(header.lstrip())
80 |
81 | # Now, collect all the class names and score in a dict
82 |
83 | def parse_line(l):
84 | """Parse a line of classification_report"""
85 | cls_name = l[:cls_field_width].strip()
86 | precision, recall, fscore, support = l[cls_field_width:].split()
87 | precision = float(precision)
88 | recall = float(recall)
89 | fscore = float(fscore)
90 | support = int(support)
91 | return (cls_name, precision, recall, fscore, support)
92 |
93 | data = OrderedDict()
94 | for l in cls_lines:
95 | ret = parse_line(l)
96 | cls_name = ret[0]
97 | scores = ret[1:]
98 | data[cls_name] = scores
99 |
100 | # average
101 | data['avg'] = parse_line(avg_line)[1:]
102 | return data
103 |
104 |
105 | def classify(trait_arg, params_arg, performance_dict):
106 | """
107 | Runs a classifier on given activation type
108 | :param trait_arg: the trait to predict
109 | :param params_arg: the values of activation_type, learning_rate_type, solver_type
110 | :param performance_dict: to store performance for each combination
111 | """
112 |
113 | activation_type, learning_rate_type, solver_type = params_arg
114 | print("Using Parameters : ", params_arg)
115 | x = df['essay'][1:]
116 | x = x.str.lower()
117 | y = df[trait_arg][1:]
118 |
119 | print("Predicting ", trait_arg)
120 |
121 | # TF-IDF vectorizer
122 | vectorizer = TfidfVectorizer()
123 |
124 | # 10 fold
125 | kf = KFold(n_splits=10)
126 |
127 | # MLP classifier
128 | classifier = MLPClassifier(activation=activation_type, batch_size='auto',
129 | hidden_layer_sizes=(60,), learning_rate=learning_rate_type, max_iter=20,
130 | random_state=None, solver=solver_type)
131 |
132 | ind = 0
133 | precision_dict = dict()
134 | recall_dict = dict()
135 | accuracy_dict = dict()
136 | for train_indices, test_indices in kf.split(x, y):
137 | x_train, x_test, y_train, y_test = x[train_indices][1:], x[test_indices].tolist()[1:], \
138 | y[train_indices][1:], y[test_indices].tolist()[1:]
139 | train_x_vector = vectorizer.fit_transform(x_train)
140 | test_X_vector = vectorizer.transform(x_test)
141 | classifier.fit(train_x_vector, y_train)
142 | guess = classifier.predict(test_X_vector)
143 | rep = classification_report(y_test, guess)
144 | a, b, c, d = dict(parse_classification_report(rep))['avg']
145 | precision_dict[ind] = a
146 | recall_dict[ind] = b
147 | accuracy_dict[ind] = accuracy_score(y_test, guess)
148 | ind = ind + 1
149 |
150 | p, r, a = (float(sum(precision_dict.values())) / 10), (float(sum(recall_dict.values())) / 10)\
151 | , (float(sum(accuracy_dict.values())) / 10)
152 | performance_dict[params_arg] = (p + r + a) * 33.33
153 | print("Precision : ", p * 100, ", Recall : ", r * 100, ", Accuracy : ", a * 100)
154 | return performance_dict
155 |
156 |
157 | if __name__ == "__main__":
158 |
159 | if not len(sys.argv) > 1:
160 | print("No command line Arguments Provided")
161 | elif len(sys.argv) == 2:
162 | trait_index = sys.argv[1]
163 | trait, params = get_choice(trait_index)
164 | if trait is None:
165 | print("Trait index value should be between 0 and 4")
166 | else:
167 | performance = dict()
168 |
169 | activation_types = ['identity', 'logistic', 'tanh', 'relu']
170 | learning_rates = ['constant', 'invscaling', 'adaptive']
171 | solver_types = ['lbfgs', 'sgd', 'adam']
172 |
173 | a_l_s = product(activation_types, learning_rates, solver_types)
174 |
175 | for param in a_l_s:
176 | performance = classify(trait, param, performance)
177 |
178 | best = max(performance.items(), key=operator.itemgetter(1))[0]
179 | print(best, " seems to perform the best.")
180 | else:
181 | print("Incorrect command line arguments")
182 |
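183 | # Note: parse_classification_report above scrapes sklearn's formatted text output,
184 | # which is brittle across sklearn versions. The same weighted averages are available
185 | # directly from the metrics API, e.g.:
186 | #     from sklearn.metrics import precision_recall_fscore_support
187 | #     p, r, f, _ = precision_recall_fscore_support(y_test, guess, average='weighted')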
--------------------------------------------------------------------------------
/code and data/mlp_simple.py:
--------------------------------------------------------------------------------
1 | """
2 | mlp_simple.py: Runs Multilayer Perceptron Neural Network to predict a user selected personality trait.
3 |
4 | The data essays are converted to lowercase before use
5 |
6 | Example usage: 1) python mlp_simple.py <trait>
7 |                OR
8 |                2) python mlp_simple.py <trait> <activation> <learning_rate> <solver>
9 |
10 | <trait> can take values from 0 to 4, denoting the trait for which the user wants to run the model:
11 | 0: Extraversion
12 | 1: Neuroticism
13 | 2: Agreeableness
14 | 3: Conscientiousness
15 | 4: Openness
16 |
17 | <activation> is one of ['identity', 'logistic', 'tanh', 'relu']
18 |
19 | <learning_rate> is one of ['constant', 'invscaling', 'adaptive']
20 |
21 | <solver> is one of ['lbfgs', 'sgd', 'adam']
22 |
23 | """
24 | __author__ = "Chirayu Desai"
25 |
26 | import sys
27 | import warnings
28 | import pandas as pd
29 | from sklearn.neural_network import MLPClassifier
30 | from sklearn.metrics import accuracy_score
31 | from sklearn.metrics import classification_report
32 | from sklearn.feature_extraction.text import TfidfVectorizer
33 | from sklearn.model_selection import train_test_split
34 |
35 | warnings.filterwarnings('ignore')
36 |
37 | # read data from csv
38 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
39 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')
40 |
41 |
42 | def get_choice(choice):
43 | """
44 | Get the user's choice of trait to predict, based on the provided command line option
45 | :param choice: the value of the command line option
46 | :return: the trait label and its default (activation, learning rate, solver) parameters
47 | """
48 | return {
49 | '0': ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
50 | '1': ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
51 | '2': ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
52 | '3': ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
53 | '4': ('Openness', ('relu', 'invscaling', 'lbfgs'))
54 | }.get(choice, (None, None))
55 |
56 |
57 | def classify(trait_arg, activation_arg, learning_rate_arg, solver_arg):
58 | """
59 | Runs MLP classifier with provided parameters
60 | :param trait_arg: the trait to predict
61 | :param activation_arg: the activation function
62 | :param learning_rate_arg: the type of learning for neural network
63 | :param solver_arg: the type of solver to be used
64 | """
65 | x = df['essay'][1:]
66 | x = x.str.lower()
67 | y = df[trait_arg][1:]
68 |
69 | print("Predicting ", trait_arg, " with arguments = ", activation_arg, "\t", learning_rate_arg, "\t", solver_arg)
70 | print("Test set, Train Set ratio: 1:3")
71 |
72 | # Test train split in 25 : 75 ratio
73 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)
74 |
75 | # TF-IDF vectorizer
76 | vectorizer = TfidfVectorizer()
77 | xx_train = vectorizer.fit_transform(x_train)
78 | xx_test = vectorizer.transform(x_test)
79 |
80 | # Multilayer Perceptron Classifier with single hidden layer size = 60
81 | classifier = MLPClassifier(activation=activation_arg, alpha=0.0001, hidden_layer_sizes=(60,),
82 | learning_rate=learning_rate_arg, max_iter=20, solver=solver_arg)
83 | classifier.fit(xx_train, y_train)
84 |
85 | predictions = classifier.predict(xx_test)
86 | print("Confusion Matrix:")
87 | print(classification_report(y_test, predictions))
88 | score = accuracy_score(y_test, predictions)
89 | print("Accuracy:", score)
90 |
91 |
92 | if __name__ == "__main__":
93 |
94 | if not len(sys.argv) > 1:
95 | print("No command line Arguments Provided")
96 | elif len(sys.argv) == 2:
97 | trait_index = sys.argv[1]
98 | trait, params = get_choice(trait_index)
99 | if trait is None:
100 | print("Trait index value should be between 0 and 4")
101 | else:
102 | ac, lr, sl = params
103 | classify(trait, ac, lr, sl)
104 | elif len(sys.argv) == 5:
105 | trait_index = sys.argv[1]
106 | trait, params = get_choice(trait_index)
107 | if trait is None:
108 | print("Trait index value should be between 0 and 4")
109 | else:
110 | ac, lr, sl = sys.argv[2], sys.argv[3], sys.argv[4]
111 | classify(trait, ac, lr, sl)
112 | else:
113 | print("Incorrect command line arguments")
114 |
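115 | # Examples (assume essays.csv is in the working directory):
116 | #     python mlp_simple.py 0                     # tuned defaults: tanh, adaptive, lbfgs
117 | #     python mlp_simple.py 0 relu constant adam  # explicitly chosen hyper-parameters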
--------------------------------------------------------------------------------
/code and data/naive_grid_search.py:
--------------------------------------------------------------------------------
1 | """
2 | naive_grid_search.py: Runs Multinomial Naive Bayes Classifier to predict a user selected personality trait.
3 |
4 | Performs grid-search cross validation to figure out good values of alpha
5 |
6 | The data essays is converted to lowercase before use
7 | The labels are binarized
8 |
9 | Example usage: python naive_grid_search.py <trait>
10 |
11 | <trait> can take values from 0 to 4, denoting the trait for which the user wants to run the model:
12 | 0: Extraversion
13 | 1: Neuroticism
14 | 2: Agreeableness
15 | 3: Conscientiousness
16 | 4: Openness
17 |
18 | """
19 | __author__ = "Chirayu Desai"
20 |
21 | import sys
22 | import numpy as np
23 | import pandas as pd
24 | from sklearn.naive_bayes import MultinomialNB
25 | from sklearn.metrics import accuracy_score
26 | from sklearn.metrics import classification_report
27 | from sklearn.feature_extraction.text import TfidfVectorizer
28 | from sklearn.model_selection import train_test_split
29 | from sklearn.model_selection import GridSearchCV
30 |
31 | # read data from csv
32 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
33 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')
34 |
35 |
36 | def get_choice(choice):
37 | """
38 | Get the user's choice of trait to predict, based on the provided command line option
39 | :param choice: the value of command line option
40 | :return: the trait label and default alpha value
41 | """
42 | return {
43 | '0': ('Extraversion', 0.07),
44 | '1': ('Neuroticism', 0.27),
45 | '2': ('Agreeableness', 0.11),
46 | '3': ('Conscientiousness', 0.09),
47 | '4': ('Openness', 0.45)
48 | }.get(choice, (None, None))
49 |
50 |
51 | def classify(trait_arg):
52 | """
53 | Runs Naive Bayes classifier with grid search CV for the provided trait
54 | :param trait_arg: the trait to predict
55 | """
56 | print("Predicting for trait: ", trait_arg)
57 | x = df['essay'][1:]
58 | x = x.str.lower()
59 | y = df[trait_arg][1:]
60 |
61 | # binarize labels
62 | y = np.where(y == 'n', 0, 1)
63 |
64 | # Hyper-parameter options
65 | params = np.arange(0.01, 2.01, 0.01, dtype=float).tolist()
66 | params = [float(f'{x:.3f}') for x in params]
67 | print("Alpha Values : ", params)
68 | alphas = [{'alpha': params}]
69 | scorers = ['accuracy']
70 |
71 | print("Test set, Train Set ratio: 1:3")
72 |
73 | # Test train split in 25 : 75 ratio
74 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)
75 |
76 | # TF-IDF vectorizer
77 | vectorizer = TfidfVectorizer()
78 | xx_train = vectorizer.fit_transform(x_train)
79 | xx_test = vectorizer.transform(x_test)
80 |
81 | for scorer in scorers:
82 | print("Tuning hyper-parameters for %s" % scorer)
83 | print()
84 |
85 | # Grid Search with 10-fold cross validation
86 | clf = GridSearchCV(MultinomialNB(), alphas, cv=10, scoring=scorer)
87 | clf.fit(xx_train, y_train)
88 |
89 | print("Best parameters set found on development set:")
90 | print()
91 | print(clf.best_params_)
92 | print()
93 | print("Grid scores on development set:")
94 | print()
95 | means = clf.cv_results_['mean_test_score']
96 | standard_deviations = clf.cv_results_['std_test_score']
97 | for mean, std, params in zip(means, standard_deviations, clf.cv_results_['params']):
98 | print("%0.3f (+/-%0.03f) for %r"
99 | % (mean, std * 2, params))
100 | print()
101 |
102 | print("Detailed classification report:")
103 | print()
104 | print("The model is trained on the full development set.")
105 | print("The scores are computed on the full evaluation set.")
106 | print()
107 | predictions = clf.predict(xx_test)
108 | print(classification_report(y_test, predictions))
109 | print(accuracy_score(y_test, predictions))
110 | print()
111 |
112 |
113 | if __name__ == "__main__":
114 |
115 | if not len(sys.argv) > 1:
116 | print("No command line Arguments Provided")
117 | else:
118 | trait_index = sys.argv[1]
119 |
120 | trait, default_alpha = get_choice(trait_index)
121 | if trait is None:
122 | print("Trait index value should be between 0 and 4")
123 | else:
124 | classify(trait)
125 |
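126 | # Example:
127 | #     python naive_grid_search.py 0
128 | # grid-searches alpha over [0.01, 2.0] in steps of 0.01 with 10-fold cross validation
129 | # for Extraversion, then reports scores on the held-out test split.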
--------------------------------------------------------------------------------
/code and data/naive_iterative_para_opt.py:
--------------------------------------------------------------------------------
1 | """
2 | naive_iterative_para_opt.py: Runs Multinomial Naive Bayes Classifier to predict a user selected personality trait.
3 |
4 | Performs iterative evaluation over a range of alpha to figure out good values of alpha
5 |
6 | The data essays is converted to lowercase before use
7 |
8 | Example usage: python naive_iterative_para_opt.py <trait>
9 |
10 | <trait> can take values from 0 to 4, denoting the trait for which the user wants to run the model:
11 | 0: Extraversion
12 | 1: Neuroticism
13 | 2: Agreeableness
14 | 3: Conscientiousness
15 | 4: Openness
16 |
17 | """
18 | __author__ = "Chirayu Desai"
19 |
20 | import sys
21 | import operator
22 | import numpy as np
23 | import pandas as pd
24 | from sklearn.naive_bayes import MultinomialNB
25 | from sklearn.metrics import accuracy_score
26 | from sklearn.feature_extraction.text import TfidfVectorizer
27 | from sklearn.model_selection import train_test_split
28 |
29 | # read data from csv
30 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
31 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')
32 |
33 |
34 | def get_choice(choice):
35 | """
36 | Get the user's choice of trait to predict, based on the provided command line option
37 | :param choice: the value of command line option
38 | :return: the trait label and default alpha value
39 | """
40 | return {
41 | '0': ('Extraversion', 0.07),
42 | '1': ('Neuroticism', 0.27),
43 | '2': ('Agreeableness', 0.11),
44 | '3': ('Conscientiousness', 0.09),
45 | '4': ('Openness', 0.45)
46 | }.get(choice, (None, None))
47 |
48 |
49 | def classify(trait_arg):
50 | """
51 | Runs Naive Bayes classifier with iterative search for provided trait
52 | :param trait_arg: the trait to predict
53 | """
54 | print("Predicting for trait: ", trait_arg)
55 | x = df['essay'][1:]
56 | x = x.str.lower()
57 | y = df[trait_arg][1:]
58 |
59 | # Range of alpha values
60 | params = np.arange(0.01, 2.01, 0.01, dtype=float).tolist()
61 | params = [float(f'{x:.3f}') for x in params]
62 |
63 | print("Test set, Train Set ratio: 1:3")
64 |
65 | # Test train split in 25 : 75 ratio
66 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)
67 | scores = dict()
68 |
69 | for alpha in params:
70 |
71 | # TF-IDF vectorizer
72 | vectorizer = TfidfVectorizer()
73 | xx_train = vectorizer.fit_transform(x_train)
74 | xx_test = vectorizer.transform(x_test)
75 |
76 | # Multinomial Naive Bayes Classifier
77 | classifier = MultinomialNB(alpha=alpha)
78 | classifier.fit(xx_train, y_train)
79 |
80 | predictions = classifier.predict(xx_test)
81 |
82 | # print(classification_report(y_test, predictions))
83 | score = accuracy_score(y_test, predictions)
84 | print("Alpha: ", alpha, " \t Accuracy: ", score)
85 | scores[alpha] = score
86 |
87 | print('Best Alpha: ', max(scores.items(), key=operator.itemgetter(1))[0], ' with Accuracy : ', max(scores.values()))
88 |
89 |
90 | if __name__ == "__main__":
91 |
92 | if not len(sys.argv) > 1:
93 | print("No command line Arguments Provided")
94 | else:
95 | trait_index = sys.argv[1]
96 |
97 | trait, default_alpha = get_choice(trait_index)
98 | if trait is None:
99 | print("Trait index value should be between 0 and 4")
100 | else:
101 | classify(trait)
102 |
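103 | # Example:
104 | #     python naive_iterative_para_opt.py 3
105 | # fits one classifier per alpha in [0.01, 2.0] for Conscientiousness and prints the
106 | # alpha with the best test-set accuracy.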
--------------------------------------------------------------------------------
/code and data/naive_simple.py:
--------------------------------------------------------------------------------
1 | """
2 | naive_simple.py: Runs Simple Multinomial Naive Bayes Classifier to predict a user selected personality trait.
3 |
4 | The data essays are converted to lowercase before use
5 |
6 | Example usage: python naive_simple.py <trait> [<alpha>]
7 |
8 | <trait> can take values from 0 to 4, denoting the trait for which the user wants to run the model:
9 | 0: Extraversion
10 | 1: Neuroticism
11 | 2: Agreeableness
12 | 3: Conscientiousness
13 | 4: Openness
14 |
15 | <alpha> is an optional smoothing parameter; it accepts values from 0.0001 to 2.0000
16 | """
17 | __author__ = "Chirayu Desai"
18 |
19 | import sys
20 | import pandas as pd
21 | from sklearn.naive_bayes import MultinomialNB
22 | from sklearn.metrics import accuracy_score
23 | from sklearn.metrics import classification_report
24 | from sklearn.feature_extraction.text import TfidfVectorizer
25 | from sklearn.model_selection import train_test_split
26 |
27 | # read data from csv
28 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
29 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')
30 |
31 |
32 | def get_choice(choice):
33 | """
34 | Get the user's choice of trait to predict, based on the provided command line option
35 | :param choice: the value of command line option
36 | :return: the trait label and default alpha value
37 | """
38 | return {
39 | '0': ('Extraversion', 0.07),
40 | '1': ('Neuroticism', 0.27),
41 | '2': ('Agreeableness', 0.11),
42 | '3': ('Conscientiousness', 0.09),
43 | '4': ('Openness', 0.45)
44 | }.get(choice, (None, None))
45 |
46 |
47 | def classify(trait_arg, alpha):
48 | """
49 | Runs Naive Bayes classifier with provided parameters
50 | :param trait_arg: the trait to predict
51 | :param alpha: the alpha value to be used for smoothing
52 | """
53 | x = df['essay'][1:]
54 | x = x.str.lower()
55 | y = df[trait_arg][1:]
56 |
57 | print("Predicting ", trait_arg, " with alpha = ", alpha)
58 | print("Test set, Train Set ratio: 1:3")
59 |
60 | # Test train split in 25 : 75 ratio
61 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)
62 |
63 | # TF-IDF vectorizer
64 | vectorizer = TfidfVectorizer()
65 | xx_train = vectorizer.fit_transform(x_train)
66 | xx_test = vectorizer.transform(x_test)
67 |
68 | # Multinomial Naive Bayes Classifier
69 | classifier = MultinomialNB(alpha=alpha)
70 | classifier.fit(xx_train, y_train)
71 |
72 | predictions = classifier.predict(xx_test)
73 | print("Confusion Matrix:")
74 | print(classification_report(y_test, predictions))
75 | score = accuracy_score(y_test, predictions)
76 | print("Accuracy:", score)
77 |
78 |
79 | if __name__ == "__main__":
80 |
81 | if not len(sys.argv) > 1:
82 | print("No command line Arguments Provided")
83 | else:
84 | trait_index = sys.argv[1]
85 |
86 | if len(sys.argv) > 2:
87 | custom_alpha = float(sys.argv[2])
88 | else:
89 | custom_alpha = None
90 |
91 | if custom_alpha is not None:
92 | if 0.0001 <= custom_alpha < 2.0001:
93 | trait, default_alpha = get_choice(trait_index)
94 | if trait is None:
95 | print("Trait index value should be between 0 and 4")
96 | else:
97 | classify(trait, custom_alpha)
98 | else:
99 | print("Please Enter Alpha Values between 0.0001 and 2.0000")
100 | else:
101 | trait, default_alpha = get_choice(trait_index)
102 | if trait is None:
103 | print("Trait index value should be between 0 and 4")
104 | else:
105 | classify(trait, default_alpha)
106 |
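107 | # Examples:
108 | #     python naive_simple.py 2        # Agreeableness with its tuned alpha (0.11)
109 | #     python naive_simple.py 2 0.5    # Agreeableness with a custom alpha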
--------------------------------------------------------------------------------
/code and data/naive_stopwords_stemmed.py:
--------------------------------------------------------------------------------
1 | """
2 | naive_stopwords_stemmed.py: Runs a Multinomial Naive Bayes Classifier to predict a user selected personality trait.
3 | The data essays are preprocessed by removing stop words and stemming the words.
4 |
5 | Example usage: python naive_stopwords_stemmed.py <trait> [<alpha>]
6 |
7 | <trait> can take values from 0 to 4, denoting the trait for which the user wants to run the model:
8 | 0: Extraversion
9 | 1: Neuroticism
10 | 2: Agreeableness
11 | 3: Conscientiousness
12 | 4: Openness
13 |
14 | <alpha> is an optional smoothing parameter; it accepts values from 0.0001 to 2.0000
15 | """
16 | __author__ = "Chirayu Desai"
17 |
18 | import sys
19 | import pandas as pd
20 | from sklearn.naive_bayes import MultinomialNB
21 | from sklearn.metrics import accuracy_score
22 | from sklearn.metrics import classification_report
23 | from sklearn.feature_extraction.text import TfidfVectorizer
24 | from sklearn.model_selection import train_test_split
25 | from nltk.corpus import stopwords
26 | from nltk.stem.snowball import SnowballStemmer
27 |
28 | # stemmer
29 | stemmer = SnowballStemmer("english")
30 |
31 | # read data from csv
32 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
33 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')
34 |
35 |
36 | def get_choice(choice):
37 | """
38 | Get the user's choice of trait to predict, based on the provided command line option
39 | :param choice: the value of command line option
40 | :return: the trait label and default alpha value
41 | """
42 | return {
43 | '0': ('Extraversion', 0.07),
44 | '1': ('Neuroticism', 0.27),
45 | '2': ('Agreeableness', 0.11),
46 | '3': ('Conscientiousness', 0.09),
47 | '4': ('Openness', 0.45)
48 | }.get(choice, (None, None))
49 |
50 |
51 | def classify(trait_arg, alpha):
52 | """
53 | Runs Naive Bayes classifier with provided parameters
54 | :param trait_arg: the trait to predict
55 | :param alpha: the alpha value to be used for smoothing
56 | """
57 | x = df['essay'][1:]
58 | x = x.str.lower()
59 | x = x.apply(lambda k: ' '.join([stemmer.stem(word) for word in k.split() if word not in
60 | stopwords.words('english')]))
61 |
62 | y = df[trait_arg][1:]
63 |
64 | print("Predicting ", trait_arg, " with alpha = ", alpha)
65 | print("Test set, Train Set ratio: 1:3")
66 |
67 | # Test train split in 25 : 75 ratio
68 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)
69 |
70 | # TF-IDF vectorizer
71 | vectorizer = TfidfVectorizer()
72 | xx_train = vectorizer.fit_transform(x_train)
73 | xx_test = vectorizer.transform(x_test)
74 |
75 | # Multinomial Naive Bayes Classifier
76 | classifier = MultinomialNB(alpha=alpha)
77 | classifier.fit(xx_train, y_train)
78 |
79 | predictions = classifier.predict(xx_test)
80 | print("Confusion Matrix:")
81 | print(classification_report(y_test, predictions))
82 | score = accuracy_score(y_test, predictions)
83 | print("Accuracy:", score)
84 |
85 |
86 | if __name__ == "__main__":
87 |
88 | if not len(sys.argv) > 1:
89 | print("No command line Arguments Provided")
90 | else:
91 | trait_index = sys.argv[1]
92 |
93 | if len(sys.argv) > 2:
94 | custom_alpha = float(sys.argv[2])
95 | else:
96 | custom_alpha = None
97 |
98 | if custom_alpha is not None:
99 | if 0.0001 <= custom_alpha < 2.0001:
100 | trait, default_alpha = get_choice(trait_index)
101 | if trait is None:
102 | print("Trait index value should be between 0 and 4")
103 | else:
104 | classify(trait, custom_alpha)
105 | else:
106 | print("Please Enter Alpha Values between 0.0001 and 2.0000")
107 | else:
108 | trait, default_alpha = get_choice(trait_index)
109 | if trait is None:
110 | print("Trait index value should be between 0 and 4")
111 | else:
112 | classify(trait, default_alpha)
113 |
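114 | # Notes: this script requires the NLTK stopword corpus (nltk.download('stopwords')).
115 | # Also, stopwords.words('english') above is re-evaluated for every word; hoisting it
116 | # into a set once would make preprocessing much faster, e.g.:
117 | #     stop_set = set(stopwords.words('english'))
118 | #     x = x.apply(lambda k: ' '.join(stemmer.stem(w) for w in k.split() if w not in stop_set))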
--------------------------------------------------------------------------------
/code and data/svc_kfold.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | import numpy as np
4 | from sklearn import preprocessing
5 | from sklearn.pipeline import Pipeline
6 | from sklearn.feature_extraction.text import CountVectorizer
7 | from sklearn.svm import LinearSVC
8 | from sklearn.feature_extraction.text import TfidfTransformer
9 | from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
10 | from sklearn.model_selection import train_test_split
11 | from sklearn.metrics import accuracy_score
12 | from sklearn.model_selection import KFold
13 |
14 |
15 | # Read data from csv
16 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
17 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')
18 |
19 | def get_choice(choice):
20 | """
21 | Get the user's choice of trait to predict, based on the provided command line option
22 | :param choice: the value of the command line option
23 | :return: the trait label
24 | """
25 | return {
26 | '0': 'Extraversion',
27 | '1': 'Neuroticism',
28 | '2': 'Agreeableness',
29 | '3': 'Conscientiousness',
30 | '4': 'Openness'
31 | }.get(choice, None)
32 |
33 | def fetch_prediction_results(y_test, predicted):
34 | """
35 | Compute the evaluation metrics
36 | "param y_test : The test label vector"
37 | "param predicted: The predicted label vector"
38 | "return: the metrics computed as a multiple argument return value"
39 | """
40 | tp = 0
41 | fp = 0
42 | tn = 0
43 | fn = 0
44 |
45 | for i in range(len(predicted)):
46 | if predicted[i] == 'y' and y_test[i] == 'y': tp += 1
47 | elif predicted[i] == 'y' and y_test[i] == 'n': fp += 1
48 | elif predicted[i] == 'n' and y_test[i] == 'n': tn += 1
49 | else: fn += 1
50 |
51 | # Calculate metrics from raw counts of the confusion matrix
52 | accuracy = accuracy_score(y_test, predicted)
53 | precision = tp / (tp + fp)
54 | recall = tp / (tp + fn)
55 | f1 = (2 * precision * recall) / (precision + recall)
56 | return accuracy, precision, recall, f1
57 |
58 | def classify(trait_arg):
59 | """
60 | Run SVC classification with 10-fold cross validation
61 | :param trait_arg: the personality trait for which we want to do prediction
62 | """
63 | print("Predicting ", trait_arg)
64 | X = np.array(df['essay'])
65 | y = df[trait_arg]
66 |
67 | # Initialize k-fold cross validator
68 | k = 10
69 | kf = KFold(n_splits=k)
70 | mlb = preprocessing.MultiLabelBinarizer()
71 |
72 | accuracy_l = []
73 | precision_l = []
74 | recall_l = []
75 |
76 | # Run the classifier prediction for k splits
77 | for train_indices, test_indices in kf.split(X, y):
78 |
79 | X_train, X_test, y_train, y_test = X[train_indices][1:], X[test_indices].tolist()[1:], \
80 | y[train_indices][1:], y[test_indices].tolist()[1:]
81 |
82 | classifier = Pipeline([
83 | ('vectorizer', CountVectorizer()),
84 | ('tfidf', TfidfTransformer()),
85 | ('clf', OneVsRestClassifier(LinearSVC()))
86 | ])
87 |
88 | classifier.fit(X_train, y_train)
89 |
90 | predicted = classifier.predict(X_test)
91 | a, p, r, f = fetch_prediction_results(y_test, predicted)
92 | accuracy_l.append(a)
93 | precision_l.append(p)
94 | recall_l.append(r)
95 |
96 | # Average the results
97 | acc = sum(accuracy_l) / k
98 | prec = sum(precision_l) / k
99 | recall = sum(recall_l) / k
100 | f1 = (2 * prec * recall) / (prec + recall)
101 |
102 | return acc, prec, recall, f1
103 |
104 | if __name__ == "__main__":
105 |
106 | if not len(sys.argv) > 1:
107 | print("No command line Arguments Provided")
108 | elif len(sys.argv) == 2:
109 | trait_index = sys.argv[1]
110 | trait = get_choice(trait_index)
111 | if trait is None:
112 | print("Trait index value should be between 0 and 4")
113 | else:
114 | results = classify(trait)
115 | print(list(zip(["Accuracy", "Precision", "Recall", "F1 score"], results)))
116 | else:
117 | print("Incorrect command line arguments")
118 |
119 |
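120 | # Example:
121 | #     python svc_kfold.py 4
122 | # runs the CountVectorizer -> TF-IDF -> LinearSVC pipeline with 10-fold cross
123 | # validation for Openness and prints the averaged accuracy, precision, recall and F1.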
--------------------------------------------------------------------------------