├── README.md
├── Report
│   ├── Personality_Attribution_Report.pdf
│   └── Predicting Personality Traits of Authors from Text.pdf
└── code and data
    ├── essays.csv
    ├── mlp_avg_res.xlsx
    ├── mlp_baseline.py
    ├── mlp_grid_search.py
    ├── mlp_iterative.py
    ├── mlp_kfold.py
    ├── mlp_simple.py
    ├── naive_grid_search.py
    ├── naive_iterative_para_opt.py
    ├── naive_simple.py
    ├── naive_stopwords_stemmed.py
    ├── svc_kfold.py
    └── w2v_features.txt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Personality-Attribution-using-Natural-Language-Processing

## A project by Chirayu Desai, Akhilesh Hegde and Yuzhou Yin

All scripts take one of the personality traits to predict as a command line argument, and assume that the data files they require are in the same directory.

Example usage: 1) python file-name.py trait
trait can take values from 0 to 4, denoting the trait for which the user wants to run the model:
0: Extraversion
1: Neuroticism
2: Agreeableness
3: Conscientiousness
4: Openness

The various ways of running each file, and the available options, are described in the docstring at the top of each file.

essays.csv is our dataset file.

For Naive Bayes, the parameters and features were decided by running naive_grid_search.py, naive_iterative_para_opt.py and naive_stopwords_stemmed.py for each trait.
naive_simple.py runs for each trait with the optimal value of the smoothing parameter to generate the published results.

For the multilayer perceptron classifier, the parameters and features were decided by running mlp_baseline.py, mlp_grid_search.py, mlp_kfold.py and mlp_iterative.py for each trait.
The published results are the average over multiple runs of mlp_simple.py; the detailed results of each run can be found in mlp_avg_res.xlsx.
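
For reference, a minimal sketch of how the scripts load the dataset (the column names below are the ones the scripts themselves assign; the first row of essays.csv is a header row, which the scripts drop):

```python
import pandas as pd

df = pd.read_csv('essays.csv',
                 names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                        'Agreeableness', 'Conscientiousness', 'Openness'],
                 encoding='latin-1')
essays = df['essay'][1:]          # raw essay text, one row per author
labels = df['Extraversion'][1:]   # 'y'/'n' flag for the chosen trait
```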
--------------------------------------------------------------------------------
/Report/Personality_Attribution_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/862eb0ffe53655dd7351bb3abf9581b09935fa3f/Report/Personality_Attribution_Report.pdf
--------------------------------------------------------------------------------
/Report/Predicting Personality Traits of Authors from Text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/862eb0ffe53655dd7351bb3abf9581b09935fa3f/Report/Predicting Personality Traits of Authors from Text.pdf
--------------------------------------------------------------------------------
/code and data/essays.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/862eb0ffe53655dd7351bb3abf9581b09935fa3f/code and data/essays.csv
--------------------------------------------------------------------------------
/code and data/mlp_avg_res.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/862eb0ffe53655dd7351bb3abf9581b09935fa3f/code and data/mlp_avg_res.xlsx
--------------------------------------------------------------------------------
/code and data/mlp_baseline.py:
--------------------------------------------------------------------------------
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from numpy import zeros
from keras import layers, Sequential
from matplotlib import pyplot


def get_common_words(text):
    """Return the set of lower-cased words longer than five characters."""
    text = nltk.word_tokenize(text)
    text = nltk.Text(text)
    informative_words = [word.lower() for word in text if len(word) > 5]
    return set(informative_words)


X = []
y = []

# Read data from file: each row ends with five comma-separated y/n trait
# labels, preceded by the author id and the (comma-containing) essay text.
with open('essays.csv', encoding='latin-1') as f:
    i = 0
    for row in f:
        entry = []
        # Peel the five trait labels off the right-hand end of the row
        for j in range(5):
            if j == 0:
                val = (row.rsplit(",", 1)[-1])[:-1]  # strip the trailing newline
            else:
                val = row.rsplit(",", 1)[-1]
            entry.insert(0, val)
            row = ",".join(row.rsplit(",")[:-1])
        # What remains is "author_id,essay"
        entry.insert(0, row.split(",", 1)[0])
        entry.insert(1, row.split(",", 1)[1])
        i += 1
        res = get_common_words(entry[1])
        X.append(res)
        y.append(entry[2:])
# Drop the header row
X = X[1:]
y = y[1:]


# Use w2v aggregated word vectors as features, replacing the word sets above
l = []
with open("w2v_features.txt") as f:
    contents = f.read()
    contents = contents.split("\n")
    for i in contents:
        if len(i) > 0:
            vec = i.split(",")
            vec = [float(x) for x in vec]
            l.append(np.array(vec))
X = np.array(l)
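
# NOTE (hedged sketch, not part of the original repository): w2v_features.txt
# is consumed above, but the script that generated it is not included. One
# plausible way to produce such per-essay vectors, assuming gensim 4's
# word2vec, would be to average the vectors of each essay's tokens:
#
#     from gensim.models import Word2Vec
#     tokenized = [nltk.word_tokenize(essay.lower()) for essay in essays]
#     w2v = Word2Vec(tokenized, vector_size=300, min_count=2)
#     with open("w2v_features.txt", "w") as out:
#         for tokens in tokenized:
#             vecs = [w2v.wv[t] for t in tokens if t in w2v.wv]
#             out.write(",".join(str(v) for v in np.mean(vecs, axis=0)) + "\n")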

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# Turn the 'y'/'n' label strings into a binary indicator matrix
def vectorize_labels(labels, dimension=5):
    results = zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        for j in range(dimension):
            results[i, j] = 1 if label[j] == 'y' else 0
    return results


y_train_labels = vectorize_labels(y_train)
y_test_labels = vectorize_labels(y_test)


# Create model using Keras
model = Sequential()
model.add(layers.Dense(8, activation='relu', input_shape=(X.shape[1],)))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(5, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Training
history = model.fit(X_train, y_train_labels, epochs=100, batch_size=100, validation_split=0.1)

# Evaluation: assemble the predicted trait labels for each test essay
predictions = model.predict(X_test)
labels = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']
for i, test_rec in enumerate(X_test):
    s = ""
    for j, val in enumerate(predictions[i]):
        if val > 0.5:
            s = s + labels[j] + " "


# Plot the accuracy and loss curves
# (older Keras logs these under 'acc'/'val_acc'; newer versions use
# 'accuracy'/'val_accuracy')
def plot_accuracy(history):
    pyplot.plot(history.history['acc'])
    pyplot.plot(history.history['val_acc'])
    pyplot.title('model accuracy')
    pyplot.ylabel('accuracy')
    pyplot.xlabel('epoch')
    pyplot.legend(['training', 'validation'], loc='lower right')
    pyplot.show()


def plot_loss(history):
    pyplot.plot(history.history['loss'])
    pyplot.plot(history.history['val_loss'])
    pyplot.title('model loss')
    pyplot.ylabel('loss')
    pyplot.xlabel('epoch')
    pyplot.legend(['training', 'validation'], loc='upper right')
    pyplot.show()


# Summarize history for accuracy
plot_accuracy(history)

# Summarize history for loss
plot_loss(history)

# Report results for the test set predictions
results = model.evaluate(X_test, y_test_labels)
print(model.metrics_names)
print(results)
--------------------------------------------------------------------------------
/code and data/mlp_grid_search.py:
--------------------------------------------------------------------------------
"""
mlp_grid_search.py: Runs a multilayer perceptron neural network to predict a user selected personality trait.

Performs grid-search cross validation to figure out optimal values of the hyper-parameters.

The essay data is converted to lowercase before use.
The labels are binarized.

Example usage: 1) python mlp_grid_search.py <trait>

<trait> can take values from 0 to 4 based on the trait for which the user wants to run the model, denoted by:
0: Extraversion
1: Neuroticism
2: Agreeableness
3: Conscientiousness
4: Openness

"""
__author__ = "Chirayu Desai"

import sys
import warnings
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


warnings.filterwarnings('ignore')

# read data from csv ('ansi' is a Windows-only codec alias; 'latin-1' is portable)
df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                                      'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')


def get_choice(choice):
    """
    Get the user's choice of which trait to predict based on the provided command line option
    :param choice: the value of the command line option
    :return: the trait label and the default (activation, learning_rate, solver) parameters
    """
    return {
        '0': ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
        '1': ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
        '2': ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
        '3': ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
        '4': ('Openness', ('relu', 'invscaling', 'lbfgs'))
    }.get(choice, (None, None))


def classify(trait_arg):
    """
    Runs grid search over the MLP classifier's hyper-parameters for the given trait
    :param trait_arg: the trait to predict
    """
    x = df['essay'][1:]
    x = x.str.lower()
    y = df[trait_arg][1:]
    # binarize labels
    y = np.where(y == 'n', 0, 1)

    print("Predicting ", trait_arg)
    print("Test set, train set ratio: 1:3")

    # Test/train split in a 25:75 ratio
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)

    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    xx_train = vectorizer.fit_transform(x_train)
    xx_test = vectorizer.transform(x_test)

    # Lists of possible hyper-parameter values
    activation_types = ['identity', 'logistic', 'tanh', 'relu']
    learning_rates = ['constant', 'invscaling', 'adaptive']
    solver_types = ['lbfgs', 'sgd', 'adam']
    hidden_layers = [(20,), (40,), (60,), (6, 10), (10, 20), (20, 40), (50, 100), (75, 150, 300), (50, 100, 150, 200)]
    iterations = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 500]

    parameters = [{'activation': activation_types, 'learning_rate': learning_rates,
                   'solver': solver_types, 'hidden_layer_sizes': hidden_layers, 'max_iter': iterations}]

    scorers = ['accuracy', 'precision', 'recall', 'f1']

    # Tune for each scorer
    for scorer in scorers:
        print("Tuning hyper-parameters for %s" % scorer)
        print()

        # Grid search with 10-fold cross validation
        clf = GridSearchCV(MLPClassifier(), parameters, cv=10, scoring=scorer, n_jobs=4)
        clf.fit(xx_train, y_train)

        print("Best parameter set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        standard_deviation = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, standard_deviation, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        predictions = clf.predict(xx_test)
        print(classification_report(y_test, predictions))
        print(accuracy_score(y_test, predictions))
        print()


if __name__ == "__main__":

    if not len(sys.argv) > 1:
        print("No command line arguments provided")
    elif len(sys.argv) == 2:
        trait_index = sys.argv[1]
        trait, params = get_choice(trait_index)
        if trait is None:
            print("Trait index value should be between 0 and 4")
        else:
            classify(trait)
    else:
        print("Incorrect command line arguments")
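
# NOTE (hedged sketch, not part of the original script): the grid above has
# 4 * 3 * 3 * 9 * 14 = 4536 combinations, each fit 10 times per scorer, which
# is expensive. If an approximate search is acceptable, sklearn's
# RandomizedSearchCV samples a fixed budget of combinations instead:
#
#     from sklearn.model_selection import RandomizedSearchCV
#     clf = RandomizedSearchCV(MLPClassifier(), parameters[0], n_iter=100,
#                              cv=10, scoring=scorer, n_jobs=4, random_state=11)
#     clf.fit(xx_train, y_train)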
--------------------------------------------------------------------------------
/code and data/mlp_iterative.py:
--------------------------------------------------------------------------------
"""
mlp_iterative.py: Runs a multilayer perceptron neural network to predict a user selected personality trait.

Performs iterative scoring over all parameter combinations to estimate the best parameters.
The essay data is converted to lowercase before use.

Example usage: 1) python mlp_iterative.py <trait>

<trait> can take values from 0 to 4 based on the trait for which the user wants to run the model, denoted by:
0: Extraversion
1: Neuroticism
2: Agreeableness
3: Conscientiousness
4: Openness

"""
__author__ = "Chirayu Desai"

import sys
import warnings
import pandas as pd
from itertools import product
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# read data from csv ('ansi' is a Windows-only codec alias; 'latin-1' is portable)
df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                                      'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')


def get_choice(choice):
    """
    Get the user's choice of which trait to predict based on the provided command line option
    :param choice: the value of the command line option
    :return: the trait label and the default (activation, learning_rate, solver) parameters
    """
    return {
        '0': ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
        '1': ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
        '2': ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
        '3': ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
        '4': ('Openness', ('relu', 'invscaling', 'lbfgs'))
    }.get(choice, (None, None))


def classify(trait_arg, activation_arg, learning_rate_arg, solver_arg):
    """
    Runs the MLP classifier with the provided parameters
    :param trait_arg: the trait to predict
    :param activation_arg: the activation function
    :param learning_rate_arg: the learning-rate schedule for the neural network
    :param solver_arg: the solver to be used
    """
    x = df['essay'][1:]
    x = x.str.lower()
    y = df[trait_arg][1:]

    print("Predicting ", trait_arg, " with arguments = ", activation_arg, "\t", learning_rate_arg, "\t", solver_arg)
    print("Test set, train set ratio: 1:3")

    # Test/train split in a 25:75 ratio
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)

    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    xx_train = vectorizer.fit_transform(x_train)
    xx_test = vectorizer.transform(x_test)

    # Multilayer perceptron classifier with a single hidden layer of size 60
    classifier = MLPClassifier(activation=activation_arg, alpha=0.0001, hidden_layer_sizes=(60,),
                               learning_rate=learning_rate_arg, max_iter=20, solver=solver_arg)
    classifier.fit(xx_train, y_train)

    predictions = classifier.predict(xx_test)
    print("Classification report:")
    print(classification_report(y_test, predictions))
    score = accuracy_score(y_test, predictions)
    print("Accuracy:", score)


if __name__ == "__main__":

    if not len(sys.argv) > 1:
        print("No command line arguments provided")
    elif len(sys.argv) == 2:
        trait_index = sys.argv[1]
        trait, params = get_choice(trait_index)
        if trait is None:
            print("Trait index value should be between 0 and 4")
        else:
            activation_types = ['identity', 'logistic', 'tanh', 'relu']
            learning_rates = ['constant', 'invscaling', 'adaptive']
            solver_types = ['lbfgs', 'sgd', 'adam']

            # Try every (activation, learning rate, solver) combination
            a_l_s = product(activation_types, learning_rates, solver_types)
            for params in a_l_s:
                ac, lr, sl = params
                classify(trait, ac, lr, sl)
    else:
        print("Incorrect command line arguments")
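
# NOTE (hedged sketch, not part of the original script): classify() above only
# prints its scores, so the best combination has to be read off the console.
# If classify() returned the accuracy score, the loop could track the winner
# the way mlp_kfold.py does, e.g.:
#
#     performance = {}
#     for params in product(activation_types, learning_rates, solver_types):
#         performance[params] = classify(trait, *params)
#     best = max(performance, key=performance.get)
#     print(best, " seems to perform the best.")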
--------------------------------------------------------------------------------
/code and data/mlp_kfold.py:
--------------------------------------------------------------------------------
"""
mlp_kfold.py: Runs a multilayer perceptron neural network to predict a user selected personality trait.

Performs k-fold cross validation to figure out optimal values of the hyper-parameters.

The essay data is converted to lowercase before use.

Example usage: 1) python mlp_kfold.py <trait>

<trait> can take values from 0 to 4 based on the trait for which the user wants to run the model, denoted by:
0: Extraversion
1: Neuroticism
2: Agreeableness
3: Conscientiousness
4: Openness

"""
__author__ = "Chirayu Desai"


import sys
import warnings
import operator
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import OrderedDict
from itertools import product


warnings.filterwarnings('ignore')

# read data from csv ('ansi' is a Windows-only codec alias; 'latin-1' is portable)
df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                                      'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')


def get_choice(choice):
    """
    Get the user's choice of which trait to predict based on the provided command line option
    :param choice: the value of the command line option
    :return: the trait label and the default (activation, learning_rate, solver) parameters
    """
    return {
        '0': ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
        '1': ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
        '2': ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
        '3': ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
        '4': ('Openness', ('relu', 'invscaling', 'lbfgs'))
    }.get(choice, (None, None))


def parse_classification_report(classification_report_arg):
    """
    Adapted from StackOverflow.
    Parse an sklearn classification report into a dict keyed by class name
    and containing a tuple (precision, recall, fscore, support) for each class
    :param classification_report_arg: the generated classification report
    """
    lines = classification_report_arg.split('\n')
    # Remove empty lines
    lines = list(filter(lambda l: not len(l.strip()) == 0, lines))

    # Starts with a header, then a score line for each class, and finally an average line
    header = lines[0]
    cls_lines = lines[1:-1]
    avg_line = lines[-1]

    assert header.split() == ['precision', 'recall', 'f1-score', 'support']
    assert avg_line.split()[0] == 'avg'

    # We cannot simply use split because class names can have spaces. Instead,
    # figure out the width of the class-name field from the indentation of the
    # precision header.
    cls_field_width = len(header) - len(header.lstrip())

    # Now, collect all the class names and scores in a dict

    def parse_line(l):
        """Parse one line of the classification report"""
        cls_name = l[:cls_field_width].strip()
        precision, recall, fscore, support = l[cls_field_width:].split()
        precision = float(precision)
        recall = float(recall)
        fscore = float(fscore)
        support = int(support)
        return (cls_name, precision, recall, fscore, support)

    data = OrderedDict()
    for l in cls_lines:
        ret = parse_line(l)
        cls_name = ret[0]
        scores = ret[1:]
        data[cls_name] = scores

    # average line
    data['avg'] = parse_line(avg_line)[1:]
    return data
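
# NOTE (hedged, not part of the original script): newer versions of sklearn
# (>= 0.20) can return the report as a dict directly, which avoids the text
# parsing above:
#
#     rep = classification_report(y_test, guess, output_dict=True)
#     precision = rep['weighted avg']['precision']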

def classify(trait_arg, params_arg, performance_dict):
    """
    Runs a 10-fold cross-validated MLP classifier for one parameter combination
    :param trait_arg: the trait to predict
    :param params_arg: the (activation_type, learning_rate_type, solver_type) combination
    :param performance_dict: stores the performance of each combination
    """

    activation_type, learning_rate_type, solver_type = params_arg
    print("Using parameters: ", params_arg)
    # Drop the header row and re-index so that KFold's positional indices line up
    x = df['essay'][1:].reset_index(drop=True)
    x = x.str.lower()
    y = df[trait_arg][1:].reset_index(drop=True)

    print("Predicting ", trait_arg)

    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # 10 folds
    kf = KFold(n_splits=10)

    # MLP classifier
    classifier = MLPClassifier(activation=activation_type, batch_size='auto',
                               hidden_layer_sizes=(60,), learning_rate=learning_rate_type, max_iter=20,
                               random_state=None, solver=solver_type)

    ind = 0
    precision_dict = dict()
    recall_dict = dict()
    accuracy_dict = dict()
    for train_indices, test_indices in kf.split(x, y):
        x_train, x_test = x.iloc[train_indices], x.iloc[test_indices].tolist()
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices].tolist()
        train_x_vector = vectorizer.fit_transform(x_train)
        test_x_vector = vectorizer.transform(x_test)
        classifier.fit(train_x_vector, y_train)
        guess = classifier.predict(test_x_vector)
        rep = classification_report(y_test, guess)
        precision, recall, fscore, support = parse_classification_report(rep)['avg']
        precision_dict[ind] = precision
        recall_dict[ind] = recall
        accuracy_dict[ind] = accuracy_score(y_test, guess)
        ind = ind + 1

    # Average precision, recall and accuracy over the 10 folds
    p = float(sum(precision_dict.values())) / 10
    r = float(sum(recall_dict.values())) / 10
    a = float(sum(accuracy_dict.values())) / 10
    # Combined score: the mean of the three metrics, expressed as a percentage
    performance_dict[params_arg] = (p + r + a) * 33.33
    print("Precision: ", p * 100, ", Recall: ", r * 100, ", Accuracy: ", a * 100)
    return performance_dict


if __name__ == "__main__":

    if not len(sys.argv) > 1:
        print("No command line arguments provided")
    elif len(sys.argv) == 2:
        trait_index = sys.argv[1]
        trait, params = get_choice(trait_index)
        if trait is None:
            print("Trait index value should be between 0 and 4")
        else:
            performance = dict()

            activation_types = ['identity', 'logistic', 'tanh', 'relu']
            learning_rates = ['constant', 'invscaling', 'adaptive']
            solver_types = ['lbfgs', 'sgd', 'adam']

            a_l_s = product(activation_types, learning_rates, solver_types)

            for param in a_l_s:
                performance = classify(trait, param, performance)

            best = max(performance.items(), key=operator.itemgetter(1))[0]
            print(best, " seems to perform the best.")
    else:
        print("Incorrect command line arguments")
--------------------------------------------------------------------------------
/code and data/mlp_simple.py:
--------------------------------------------------------------------------------
"""
mlp_simple.py: Runs a multilayer perceptron neural network to predict a user selected personality trait.

The essay data is converted to lowercase before use.

Example usage: 1) python mlp_simple.py <trait>
               OR
               2) python mlp_simple.py <trait> <activation> <learning_rate> <solver>

<trait> can take values from 0 to 4 based on the trait for which the user wants to run the model, denoted by:
0: Extraversion
1: Neuroticism
2: Agreeableness
3: Conscientiousness
4: Openness

<activation> is one of ['identity', 'logistic', 'tanh', 'relu']

<learning_rate> is one of ['constant', 'invscaling', 'adaptive']

<solver> is one of ['lbfgs', 'sgd', 'adam']

"""
__author__ = "Chirayu Desai"

import sys
import warnings
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# read data from csv ('ansi' is a Windows-only codec alias; 'latin-1' is portable)
df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                                      'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')


def get_choice(choice):
    """
    Get the user's choice of which trait to predict based on the provided command line option
    :param choice: the value of the command line option
    :return: the trait label and the default (activation, learning_rate, solver) parameters
    """
    return {
        '0': ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
        '1': ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
        '2': ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
        '3': ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
        '4': ('Openness', ('relu', 'invscaling', 'lbfgs'))
    }.get(choice, (None, None))


def classify(trait_arg, activation_arg, learning_rate_arg, solver_arg):
    """
    Runs the MLP classifier with the provided parameters
    :param trait_arg: the trait to predict
    :param activation_arg: the activation function
    :param learning_rate_arg: the learning-rate schedule for the neural network
    :param solver_arg: the solver to be used
    """
    x = df['essay'][1:]
    x = x.str.lower()
    y = df[trait_arg][1:]

    print("Predicting ", trait_arg, " with arguments = ", activation_arg, "\t", learning_rate_arg, "\t", solver_arg)
    print("Test set, train set ratio: 1:3")

    # Test/train split in a 25:75 ratio
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)

    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    xx_train = vectorizer.fit_transform(x_train)
    xx_test = vectorizer.transform(x_test)

    # Multilayer perceptron classifier with a single hidden layer of size 60
    classifier = MLPClassifier(activation=activation_arg, alpha=0.0001, hidden_layer_sizes=(60,),
                               learning_rate=learning_rate_arg, max_iter=20, solver=solver_arg)
    classifier.fit(xx_train, y_train)

    predictions = classifier.predict(xx_test)
| print("Confusion Matrix:") 87 | print(classification_report(y_test, predictions)) 88 | score = accuracy_score(y_test, predictions) 89 | print("Accuracy:", score) 90 | 91 | 92 | if __name__ == "__main__": 93 | 94 | if not len(sys.argv) > 1: 95 | print("No command line Arguments Provided") 96 | elif len(sys.argv) == 2: 97 | trait_index = sys.argv[1] 98 | trait, params = get_choice(trait_index) 99 | if trait is None: 100 | print("Trait index value should be between 0 and 4") 101 | else: 102 | ac, lr, sl = params 103 | classify(trait, ac, lr, sl) 104 | elif len(sys.argv) == 5: 105 | trait_index = sys.argv[1] 106 | trait, params = get_choice(trait_index) 107 | if trait is None: 108 | print("Trait index value should be between 0 and 4") 109 | else: 110 | ac, lr, sl = sys.argv[2], sys.argv[3], sys.argv[4] 111 | classify(trait, ac, lr, sl) 112 | else: 113 | print("Incorrect command line arguments") 114 | -------------------------------------------------------------------------------- /code and data/naive_grid_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | naive_grid_search.py: Runs Multinomial Naive Bayes Classifier to predict a user selected personality trait. 3 | 4 | Performs grid-search cross validation to figure out good values of alpha 5 | 6 | The data essays is converted to lowercase before use 7 | The labels are binarized 8 | 9 | Example usage: python naive_grid_search.py 10 | 11 | can take vales form 0 to 4 based on the trait for which the user wants to run model denoted by: 12 | 0: Extraversion 13 | 1: Neuroticism 14 | 2: Agreeableness 15 | 3: Conscientiousness 16 | 4: Openness 17 | 18 | """ 19 | __author__ = "Chirayu Desai" 20 | 21 | import sys 22 | import numpy as np 23 | import pandas as pd 24 | from sklearn.naive_bayes import MultinomialNB 25 | from sklearn.metrics import accuracy_score 26 | from sklearn.metrics import classification_report 27 | from sklearn.feature_extraction.text import TfidfVectorizer 28 | from sklearn.model_selection import train_test_split 29 | from sklearn.model_selection import GridSearchCV 30 | 31 | # read data from csv 32 | df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism', 33 | 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='ansi') 34 | 35 | 36 | def get_choice(choice): 37 | """ 38 | Get the users choice for which trait to predict based on provided command line option 39 | :param choice: the value of command line option 40 | :return: the trait label and default alpha value 41 | """ 42 | return { 43 | '0': ('Extraversion', 0.07), 44 | '1': ('Neuroticism', 0.27), 45 | '2': ('Agreeableness', 0.11), 46 | '3': ('Conscientiousness', 0.09), 47 | '4': ('Openness', 0.45) 48 | }.get(choice, (None, None)) 49 | 50 | 51 | def classify(trait_arg): 52 | """ 53 | Runs Naive Bayes classifier with gird search cv for provided trait 54 | :param trait_arg: the trait to predict 55 | """ 56 | print("Predicting for trait: ", trait_arg) 57 | x = df['essay'][1:] 58 | x = x.str.lower() 59 | y = df[trait_arg][1:] 60 | 61 | # binarize labels 62 | y = np.where(y == 'n', 0, 1) 63 | 64 | # Hyper-parameter options 65 | params = np.arange(0.01, 2.01, 0.01, dtype=float).tolist() 66 | params = [float(f'{x:.3f}') for x in params] 67 | print("Alpha Values : ", params) 68 | alphas = [{'alpha': params}] 69 | scorers = ['accuracy'] 70 | 71 | print("Test set, Train Set ratio: 1:3") 72 | 73 | # Test train split in 25 : 75 ratio 74 | x_train, x_test, y_train, y_test = train_test_split(x, y, 
--------------------------------------------------------------------------------
/code and data/naive_grid_search.py:
--------------------------------------------------------------------------------
"""
naive_grid_search.py: Runs a multinomial Naive Bayes classifier to predict a user selected personality trait.

Performs grid-search cross validation to figure out good values of alpha.

The essay data is converted to lowercase before use.
The labels are binarized.

Example usage: python naive_grid_search.py <trait>

<trait> can take values from 0 to 4 based on the trait for which the user wants to run the model, denoted by:
0: Extraversion
1: Neuroticism
2: Agreeableness
3: Conscientiousness
4: Openness

"""
__author__ = "Chirayu Desai"

import sys
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# read data from csv ('ansi' is a Windows-only codec alias; 'latin-1' is portable)
df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                                      'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')


def get_choice(choice):
    """
    Get the user's choice of which trait to predict based on the provided command line option
    :param choice: the value of the command line option
    :return: the trait label and default alpha value
    """
    return {
        '0': ('Extraversion', 0.07),
        '1': ('Neuroticism', 0.27),
        '2': ('Agreeableness', 0.11),
        '3': ('Conscientiousness', 0.09),
        '4': ('Openness', 0.45)
    }.get(choice, (None, None))


def classify(trait_arg):
    """
    Runs the Naive Bayes classifier with grid-search cross validation for the provided trait
    :param trait_arg: the trait to predict
    """
    print("Predicting for trait: ", trait_arg)
    x = df['essay'][1:]
    x = x.str.lower()
    y = df[trait_arg][1:]

    # binarize labels
    y = np.where(y == 'n', 0, 1)

    # Hyper-parameter options: alpha from 0.01 to 2.00 in steps of 0.01
    params = np.arange(0.01, 2.01, 0.01, dtype=float).tolist()
    params = [float(f'{p:.3f}') for p in params]
    print("Alpha values: ", params)
    alphas = [{'alpha': params}]
    scorers = ['accuracy']

    print("Test set, train set ratio: 1:3")

    # Test/train split in a 25:75 ratio
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)

    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    xx_train = vectorizer.fit_transform(x_train)
    xx_test = vectorizer.transform(x_test)

    for scorer in scorers:
        print("Tuning hyper-parameters for %s" % scorer)
        print()

        # Grid search with 10-fold cross validation
        clf = GridSearchCV(MultinomialNB(), alphas, cv=10, scoring=scorer)
        clf.fit(xx_train, y_train)

        print("Best parameter set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        standard_deviations = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, standard_deviations, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        predictions = clf.predict(xx_test)
        print(classification_report(y_test, predictions))
        print(accuracy_score(y_test, predictions))
        print()


if __name__ == "__main__":

    if not len(sys.argv) > 1:
        print("No command line arguments provided")
    else:
        trait_index = sys.argv[1]

        trait, default_alpha = get_choice(trait_index)
        if trait is None:
            print("Trait index value should be between 0 and 4")
        else:
            classify(trait)
--------------------------------------------------------------------------------
/code and data/naive_iterative_para_opt.py:
--------------------------------------------------------------------------------
"""
naive_iterative_para_opt.py: Runs a multinomial Naive Bayes classifier to predict a user selected personality trait.

Performs iterative evaluation over a range of alpha values to figure out good values of alpha.

The essay data is converted to lowercase before use.

Example usage: python naive_iterative_para_opt.py <trait>

<trait> can take values from 0 to 4 based on the trait for which the user wants to run the model, denoted by:
0: Extraversion
1: Neuroticism
2: Agreeableness
3: Conscientiousness
4: Openness

"""
__author__ = "Chirayu Desai"

import sys
import operator
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# read data from csv ('ansi' is a Windows-only codec alias; 'latin-1' is portable)
df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                                      'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')


def get_choice(choice):
    """
    Get the user's choice of which trait to predict based on the provided command line option
    :param choice: the value of the command line option
    :return: the trait label and default alpha value
    """
    return {
        '0': ('Extraversion', 0.07),
        '1': ('Neuroticism', 0.27),
        '2': ('Agreeableness', 0.11),
        '3': ('Conscientiousness', 0.09),
        '4': ('Openness', 0.45)
    }.get(choice, (None, None))


def classify(trait_arg):
    """
    Runs the Naive Bayes classifier with an iterative search over alpha for the provided trait
    :param trait_arg: the trait to predict
    """
    print("Predicting for trait: ", trait_arg)
    x = df['essay'][1:]
    x = x.str.lower()
    y = df[trait_arg][1:]

    # Range of alpha values: 0.01 to 2.00 in steps of 0.01
    params = np.arange(0.01, 2.01, 0.01, dtype=float).tolist()
    params = [float(f'{p:.3f}') for p in params]

    print("Test set, train set ratio: 1:3")

    # Test/train split in a 25:75 ratio
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)
    scores = dict()

    # The TF-IDF features do not depend on alpha, so vectorize once outside the loop
    vectorizer = TfidfVectorizer()
    xx_train = vectorizer.fit_transform(x_train)
    xx_test = vectorizer.transform(x_test)

    for alpha in params:
        # Multinomial Naive Bayes classifier
        classifier = MultinomialNB(alpha=alpha)
        classifier.fit(xx_train, y_train)

        predictions = classifier.predict(xx_test)
        score = accuracy_score(y_test, predictions)
        print("Alpha: ", alpha, " \t Accuracy: ", score)
        scores[alpha] = score

    print('Best alpha: ', max(scores.items(), key=operator.itemgetter(1))[0], ' with accuracy: ', max(scores.values()))


if __name__ == "__main__":

    if not len(sys.argv) > 1:
        print("No command line arguments provided")
    else:
        trait_index = sys.argv[1]

        trait, default_alpha = get_choice(trait_index)
        if trait is None:
            print("Trait index value should be between 0 and 4")
        else:
            classify(trait)
--------------------------------------------------------------------------------
/code and data/naive_simple.py:
--------------------------------------------------------------------------------
"""
naive_simple.py: Runs a simple multinomial Naive Bayes classifier to predict a user selected personality trait.

The essay data is converted to lowercase before use.

Example usage: python naive_simple.py <trait> <alpha>
(<alpha> is optional; the tuned default for the trait is used if omitted)

<trait> can take values from 0 to 4 based on the trait for which the user wants to run the model, denoted by:
0: Extraversion
1: Neuroticism
2: Agreeableness
3: Conscientiousness
4: Openness

<alpha> is the smoothing parameter; it allows values from 0.0001 to 2.0000
"""
__author__ = "Chirayu Desai"

import sys
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# read data from csv ('ansi' is a Windows-only codec alias; 'latin-1' is portable)
df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                                      'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')


def get_choice(choice):
    """
    Get the user's choice of which trait to predict based on the provided command line option
    :param choice: the value of the command line option
    :return: the trait label and default alpha value
    """
    return {
        '0': ('Extraversion', 0.07),
        '1': ('Neuroticism', 0.27),
        '2': ('Agreeableness', 0.11),
        '3': ('Conscientiousness', 0.09),
        '4': ('Openness', 0.45)
    }.get(choice, (None, None))


def classify(trait_arg, alpha):
    """
    Runs the Naive Bayes classifier with the provided parameters
    :param trait_arg: the trait to predict
    :param alpha: the alpha value to be used for smoothing
    """
    x = df['essay'][1:]
    x = x.str.lower()
    y = df[trait_arg][1:]

    print("Predicting ", trait_arg, " with alpha = ", alpha)
    print("Test set, train set ratio: 1:3")

    # Test/train split in a 25:75 ratio
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)

    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    xx_train = vectorizer.fit_transform(x_train)
    xx_test = vectorizer.transform(x_test)

    # Multinomial Naive Bayes classifier
    classifier = MultinomialNB(alpha=alpha)
    classifier.fit(xx_train, y_train)

    predictions = classifier.predict(xx_test)
    print("Classification report:")
    print(classification_report(y_test, predictions))
    score = accuracy_score(y_test, predictions)
    print("Accuracy:", score)


if __name__ == "__main__":

    if not len(sys.argv) > 1:
        print("No command line arguments provided")
    else:
        trait_index = sys.argv[1]

        if len(sys.argv) > 2:
            custom_alpha = float(sys.argv[2])
        else:
            custom_alpha = None

        if custom_alpha is not None:
            if 0.0001 <= custom_alpha <= 2.0:
                trait, default_alpha = get_choice(trait_index)
                if trait is None:
                    print("Trait index value should be between 0 and 4")
                else:
                    classify(trait, custom_alpha)
            else:
                print("Please enter an alpha value between 0.0001 and 2.0000")
        else:
            trait, default_alpha = get_choice(trait_index)
            if trait is None:
                print("Trait index value should be between 0 and 4")
            else:
                classify(trait, default_alpha)
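
# NOTE (hedged sketch, not part of the original script): the vectorize/fit
# steps above could equivalently be expressed as an sklearn Pipeline, which
# keeps the TF-IDF step and the classifier together as one estimator:
#
#     from sklearn.pipeline import Pipeline
#     model = Pipeline([('tfidf', TfidfVectorizer()),
#                       ('nb', MultinomialNB(alpha=alpha))])
#     model.fit(x_train, y_train)
#     predictions = model.predict(x_test)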
--------------------------------------------------------------------------------
/code and data/naive_stopwords_stemmed.py:
--------------------------------------------------------------------------------
"""
naive_stopwords_stemmed.py: Runs a multinomial Naive Bayes classifier to predict a user selected personality trait.
The essay data is preprocessed by removing stop words and stemming the remaining words.

Example usage: python naive_stopwords_stemmed.py <trait> <alpha>
(<alpha> is optional; the tuned default for the trait is used if omitted)

<trait> can take values from 0 to 4 based on the trait for which the user wants to run the model, denoted by:
0: Extraversion
1: Neuroticism
2: Agreeableness
3: Conscientiousness
4: Openness

<alpha> is the smoothing parameter; it allows values from 0.0001 to 2.0000
"""
__author__ = "Chirayu Desai"

import sys
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# stemmer
stemmer = SnowballStemmer("english")

# precompute the stop-word set once; a set lookup per word is much faster than
# calling stopwords.words('english') inside the preprocessing loop
stop_words = set(stopwords.words('english'))

# read data from csv ('ansi' is a Windows-only codec alias; 'latin-1' is portable)
df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                                      'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')


def get_choice(choice):
    """
    Get the user's choice of which trait to predict based on the provided command line option
    :param choice: the value of the command line option
    :return: the trait label and default alpha value
    """
    return {
        '0': ('Extraversion', 0.07),
        '1': ('Neuroticism', 0.27),
        '2': ('Agreeableness', 0.11),
        '3': ('Conscientiousness', 0.09),
        '4': ('Openness', 0.45)
    }.get(choice, (None, None))


def classify(trait_arg, alpha):
    """
    Runs the Naive Bayes classifier with the provided parameters
    :param trait_arg: the trait to predict
    :param alpha: the alpha value to be used for smoothing
    """
    x = df['essay'][1:]
    x = x.str.lower()
    # drop stop words and stem what remains
    x = x.apply(lambda k: ' '.join([stemmer.stem(word) for word in k.split() if word not in stop_words]))

    y = df[trait_arg][1:]

    print("Predicting ", trait_arg, " with alpha = ", alpha)
    print("Test set, train set ratio: 1:3")

    # Test/train split in a 25:75 ratio
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)

    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    xx_train = vectorizer.fit_transform(x_train)
    xx_test = vectorizer.transform(x_test)

    # Multinomial Naive Bayes classifier
    classifier = MultinomialNB(alpha=alpha)
    classifier.fit(xx_train, y_train)

    predictions = classifier.predict(xx_test)
    print("Classification report:")
    print(classification_report(y_test, predictions))
    score = accuracy_score(y_test, predictions)
    print("Accuracy:", score)


if __name__ == "__main__":

    if not len(sys.argv) > 1:
        print("No command line arguments provided")
    else:
        trait_index = sys.argv[1]

        if len(sys.argv) > 2:
            custom_alpha = float(sys.argv[2])
        else:
            custom_alpha = None

        if custom_alpha is not None:
            if 0.0001 <= custom_alpha <= 2.0:
                trait, default_alpha = get_choice(trait_index)
                if trait is None:
                    print("Trait index value should be between 0 and 4")
                else:
                    classify(trait, custom_alpha)
            else:
                print("Please enter an alpha value between 0.0001 and 2.0000")
        else:
            trait, default_alpha = get_choice(trait_index)
            if trait is None:
                print("Trait index value should be between 0 and 4")
            else:
                classify(trait, default_alpha)
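
# NOTE (hedged example, not part of the original script): on a small sample the
# preprocessing above behaves roughly like this (exact stems depend on the
# SnowballStemmer version):
#
#     text = "i was running to my classes this morning"
#     ' '.join(stemmer.stem(w) for w in text.split() if w not in stop_words)
#     # -> 'run class morn'   ("i", "was", "to", "my", "this" are stop words)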
--------------------------------------------------------------------------------
/code and data/svc_kfold.py:
--------------------------------------------------------------------------------
import sys
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold


# Read data from csv
df = pd.read_csv('essays.csv', names=['author_id', 'essay', 'Extraversion', 'Neuroticism',
                                      'Agreeableness', 'Conscientiousness', 'Openness'], encoding='latin-1')


def get_choice(choice):
    """
    Get the user's choice of which trait to predict based on the provided command line option
    :param choice: the value of the command line option
    :return: the trait label
    """
    return {
        '0': 'Extraversion',
        '1': 'Neuroticism',
        '2': 'Agreeableness',
        '3': 'Conscientiousness',
        '4': 'Openness'
    }.get(choice)  # None for an out-of-range choice


def fetch_prediction_results(y_test, predicted):
    """
    Compute the evaluation metrics from raw confusion-matrix counts
    :param y_test: the test label vector
    :param predicted: the predicted label vector
    :return: accuracy, precision, recall and F1 as a tuple
    """
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    # Count true/false positives and negatives, with 'y' as the positive class
    for i in range(len(predicted)):
        if predicted[i] == 'y' and y_test[i] == 'y':
            tp += 1
        elif predicted[i] == 'y' and y_test[i] == 'n':
            fp += 1
        elif predicted[i] == 'n' and y_test[i] == 'n':
            tn += 1
        else:
            fn += 1

    # Calculate metrics from the raw counts of the confusion matrix
    accuracy = accuracy_score(y_test, predicted)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = (2 * precision * recall) / (precision + recall)
    return accuracy, precision, recall, f1
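
# NOTE (hedged, not part of the original script): sklearn computes the same
# quantities directly; with 'y' as the positive class this is equivalent to the
# hand-rolled counts above:
#
#     from sklearn.metrics import precision_recall_fscore_support
#     precision, recall, f1, _ = precision_recall_fscore_support(
#         y_test, predicted, pos_label='y', average='binary')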

def classify(trait_arg):
    """
    Run SVC classification with k-fold cross validation over 10 folds
    :param trait_arg: the personality trait for which we want to do prediction
    """
    print("Predicting ", trait_arg)
    # Drop the header row once up front so that no fold trains on it
    X = np.array(df['essay'][1:])
    y = np.array(df[trait_arg][1:])

    # Initialize the k-fold cross validator
    k = 10
    kf = KFold(n_splits=k)

    accuracy_l = []
    precision_l = []
    recall_l = []

    # Run the classifier prediction for the k splits
    for train_indices, test_indices in kf.split(X, y):

        X_train, X_test = X[train_indices], X[test_indices].tolist()
        y_train, y_test = y[train_indices], y[test_indices].tolist()

        # Bag-of-words counts -> TF-IDF weighting -> linear SVM
        classifier = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC()))
        ])

        classifier.fit(X_train, y_train)

        predicted = classifier.predict(X_test)
        # The per-fold F1 is unused; the overall F1 is recomputed from the
        # averaged precision and recall below
        a, p, r, _ = fetch_prediction_results(y_test, predicted)
        accuracy_l.append(a)
        precision_l.append(p)
        recall_l.append(r)

    # Average the results over the k folds
    acc = sum(accuracy_l) / k
    prec = sum(precision_l) / k
    recall = sum(recall_l) / k
    f1 = (2 * prec * recall) / (prec + recall)

    return acc, prec, recall, f1


if __name__ == "__main__":

    if not len(sys.argv) > 1:
        print("No command line arguments provided")
    elif len(sys.argv) == 2:
        trait_index = sys.argv[1]
        trait = get_choice(trait_index)
        if trait is None:
            print("Trait index value should be between 0 and 4")
        else:
            results = classify(trait)
            print(list(zip(["Accuracy", "Precision", "Recall", "F1 score"], results)))
    else:
        print("Incorrect command line arguments")
--------------------------------------------------------------------------------