├── dataset
│   ├── .gitignore
│   ├── proxies.py
│   ├── 3_visualize_data.py
│   ├── requirements.txt
│   ├── 2_clean_data.py
│   ├── 0_submission_scraper.py
│   └── 1_comment_scraper.py
├── LICENSE
└── nn
    ├── sklearn_train.py
    └── train.py

/dataset/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

/0_submission_data_1546300800_1577836800.csv
/1_submission_data_1546300800_1577836800.csv
/aita_clean.csv
--------------------------------------------------------------------------------
/dataset/proxies.py:
--------------------------------------------------------------------------------
import random

proxy_list = [
    'http://p.webshare.io:19999'
]

def random_proxy():
    i = random.randint(0, len(proxy_list) - 1)
    p = {
        'http': proxy_list[i]
    }
    return p

def remove_proxy(proxy):
    proxy_list.remove(proxy)
    print(f'Removed {proxy} -- {len(proxy_list)} proxies left')
--------------------------------------------------------------------------------
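A minimal usage sketch for proxies.py, mirroring the way 1_comment_scraper.py consumes it; the target URL below is a placeholder, and the single webshare.io entry in proxy_list is assumed to have been replaced with working proxies.

import requests

from proxies import random_proxy, remove_proxy

proxy = random_proxy()                      # e.g. {'http': 'http://p.webshare.io:19999'}
try:
    r = requests.get('http://example.com',  # placeholder URL, not the Pushshift API
                     proxies=proxy, timeout=(3, 7))
    print(r.status_code)
except requests.RequestException:
    # Drop a failing proxy from the pool, as the commented-out scraper code does
    remove_proxy(proxy['http'])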
/dataset/3_visualize_data.py:
--------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt

def main():
    random = pd.read_csv('2_aita_random.csv')
    stratified = pd.read_csv('2_aita_stratified.csv')

    # Compare the YTA vote-share distributions of the two samples
    plt.hist(
        [random['yta_percent'], stratified['yta_percent']],
        label=['random', 'stratified']
    )
    plt.legend(loc='upper right')
    plt.show()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 David Mo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/dataset/requirements.txt:
--------------------------------------------------------------------------------
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: win-64
blas=1.0=mkl
brotlipy=0.7.0=py38he774522_1000
ca-certificates=2020.7.22=0
certifi=2020.6.20=py38_0
cffi=1.14.2=py38h7a1dbc1_0
chardet=3.0.4=py38_1003
cryptography=3.0=py38h7a1dbc1_0
idna=2.10=py_0
intel-openmp=2020.2=254
mkl=2020.2=256
mkl-service=2.3.0=py38hb782905_0
mkl_fft=1.1.0=py38h45dec08_0
mkl_random=1.1.1=py38h47e9c7a_0
numpy=1.19.1=py38h5510c5b_0
numpy-base=1.19.1=py38ha3acd2a_0
openssl=1.1.1g=he774522_1
pandas=1.1.1=py38ha925a31_0
pip=20.2.2=py38_0
pycparser=2.20=py_2
pyopenssl=19.1.0=py_1
pysocks=1.7.1=py38_0
python=3.8.5=he1778fa_0
python-dateutil=2.8.1=py_0
pytz=2020.1=py_0
requests=2.24.0=py_0
setuptools=49.6.0=py38_0
six=1.15.0=py_0
sqlite=3.33.0=h2a8f88b_0
tqdm=4.48.2=pypi_0
update-checker=0.18.0=pypi_0
urllib3=1.25.10=py_0
vc=14.1=h0510ff6_4
vs2015_runtime=14.16.27012=hf0eaf9b_3
websocket-client=0.57.0=pypi_0
wheel=0.35.1=py_0
win_inet_pton=1.1.0=py38_0
wincertstore=0.2=py38_0
zlib=1.2.11=h62dcd97_4
--------------------------------------------------------------------------------
/nn/sklearn_train.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score

df = pd.read_csv('2_aita_stratified.csv')

# Concatenate title and body into a single text feature
X = (df['title'] + df['body']).values
X = X.astype(str)
Y = df['verdict'].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print('Loaded data')

vectorizer = TfidfVectorizer(binary=False, max_df=0.95, stop_words='english')
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

print('Vectorized data')

scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear', random_state=0, C=5, penalty='l2', max_iter=1000)
model = scikit_log_reg.fit(X_train, Y_train)

# Alternative baseline:
# model = MultinomialNB().fit(X_train, Y_train)

print('Finished training')

Y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(Y_test, Y_pred))
print(confusion_matrix(Y_test, Y_pred))
--------------------------------------------------------------------------------
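Once the logistic-regression model above has been fitted, the same TfidfVectorizer can score a new post. A short sketch, assuming it is appended to sklearn_train.py (or run in the same session); the input string is made up.

# Reuse the fitted TF-IDF vectorizer and logistic-regression model from above.
new_post = 'AITA for using a made-up example post?'     # hypothetical input
features = vectorizer.transform([new_post]).toarray()   # same preprocessing as training
print(model.predict(features))                          # predicted verdict label
print(model.predict_proba(features))                    # class probabilities (order: model.classes_)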
/dataset/2_clean_data.py:
--------------------------------------------------------------------------------
import csv

import pandas as pd

def clean(df):
    df_use = df.copy()

    print(f'There are {len(df_use)} posts')

    # Remove any edits that may give away the answer (e.g. "edit: okay you're right, I'm the asshole")
    df_use['body'] = df_use['body'].str.replace('(edit|update).*?(YTA|a-|ass|\\sta\\s)(.*)', '', case=False, regex=True)

    # Replace negative verdict scores with zero
    df_use.loc[df_use['yta_score'] < 0, 'yta_score'] = 0
    df_use.loc[df_use['nta_score'] < 0, 'nta_score'] = 0

    # Remove any post whose combined verdict score is 10 or less
    df_use = df_use[df_use['yta_score'] + df_use['nta_score'] > 10]
    print(f'After removing posts with a combined verdict score of 10 or less, there are {len(df_use)} posts left')

    # Remove any deleted or removed posts
    gone_list = ['[deleted]', '[removed]', '']
    df_use = df_use[~df_use['body'].str.strip().isin(gone_list)]
    df_use = df_use.dropna()
    print(f'After removing empty posts, there are {len(df_use)} posts left')

    # Sort by timestamp
    df_use = df_use.sort_values(by=['timestamp'])

    # Create yta and nta percent, then derive the majority verdict
    df_use['yta_percent'] = df_use['yta_score'] / (df_use['yta_score'] + df_use['nta_score'])
    df_use['nta_percent'] = 1 - df_use['yta_percent']
    df_use.loc[df_use['yta_percent'] > 0.5, 'verdict'] = 'yta'
    df_use.loc[df_use['yta_percent'] <= 0.5, 'verdict'] = 'nta'

    return df_use

def random_sample(df, n_samples=20000):
    return df.sample(n=n_samples, random_state=1)

def stratified_sample(df, n_samples=20000):
    # Sample the same number of posts from each verdict class
    n = min(n_samples // 2, df['verdict'].value_counts().min())
    df_ = df.groupby('verdict').apply(lambda x: x.sample(n))
    return df_

def main():
    print('Reading data')
    raw = pd.read_csv('1_submission_data_1546300800_1577836800.csv')

    print('Cleaning data')
    clean_data = clean(raw)
    clean_data.to_csv('2_aita_clean.csv', index=False, quoting=csv.QUOTE_ALL)

    print('Sampling data')
    random_data = random_sample(clean_data)
    random_data.to_csv('2_aita_random.csv', index=False, quoting=csv.QUOTE_ALL)

    stratified_data = stratified_sample(clean_data)
    stratified_data.to_csv('2_aita_stratified.csv', index=False, quoting=csv.QUOTE_ALL)

    print('Done')

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
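A toy worked example of the verdict derivation in clean(): a post with yta_score=30 and nta_score=10 gets yta_percent = 30 / (30 + 10) = 0.75 > 0.5 and is labelled 'yta'. The two rows below are made up.

import pandas as pd

toy = pd.DataFrame({'yta_score': [30, 2], 'nta_score': [10, 48]})
toy['yta_percent'] = toy['yta_score'] / (toy['yta_score'] + toy['nta_score'])
toy['nta_percent'] = 1 - toy['yta_percent']
toy.loc[toy['yta_percent'] > 0.5, 'verdict'] = 'yta'
toy.loc[toy['yta_percent'] <= 0.5, 'verdict'] = 'nta'
print(toy)   # yta_percent 0.75 / 0.04, verdicts 'yta' / 'nta'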
/dataset/0_submission_scraper.py:
--------------------------------------------------------------------------------
import json
import time
import csv
import requests

def get_data(mode, args, fields):
    url = f'http://api.pushshift.io/reddit/{mode}/search/{args}&fields={",".join(fields)}&size=100'
    try:
        r = requests.get(url)
        status = r.status_code
    except Exception:
        status = -1
    if status != 200:
        # Back off briefly and retry on any error
        print(f'Error {status}')
        time.sleep(0.5)
        return get_data(mode, args, fields)

    data = json.loads(r.text)
    return data['data']

def get_submissions(subreddit, after, before):
    after, before = str(after), str(before)
    args = f'?after={after}&before={before}&subreddit={subreddit}' + \
           f'&sort=asc&sort_type=created_utc'
    fields = ['id', 'created_utc', 'title', 'selftext', 'author', 'num_comments', 'score']

    return get_data('submission', args, fields)

def scrape_submission(submission_data):
    submission_id = submission_data.get('id')
    timestamp = submission_data.get('created_utc')
    title = submission_data.get('title')
    body = submission_data.get('selftext')
    author = submission_data.get('author')
    num_comments = submission_data.get('num_comments')
    score = submission_data.get('score')
    yta_score, nta_score = 0, 0

    data = [submission_id, timestamp, title, body, author, score, num_comments, yta_score, nta_score]

    return data

def scrape_submissions(subreddit, after, before):
    print(f'Scraping submissions of r/{subreddit} from {after} to {before}')

    f_out = open(f'0_submission_data_{after}_{before}.csv', 'w', newline='', encoding='utf-8')
    writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
    header = ['id', 'timestamp', 'title', 'body', 'author', 'score', 'num_comments', 'yta_score', 'nta_score']
    writer.writerow(header)

    scraped_submissions = list()

    while after < before:
        submissions = get_submissions(subreddit, after, before)
        if not submissions:
            break

        for submission in submissions:
            scraped_submission = scrape_submission(submission)
            scraped_submissions.append(scraped_submission)

        # Advance the cursor to the timestamp of the last post returned
        after = submissions[-1]['created_utc']

        print(f'{len(scraped_submissions)} posts scraped')

    print(f'Done scraping submissions of r/{subreddit} from {after} to {before}')
    writer.writerows(scraped_submissions)
    f_out.close()

def main():
    start_epoch = 1546300800  # January 1, 2019
    end_epoch = 1577836800    # January 1, 2020

    scrape_submissions('amitheasshole', start_epoch, end_epoch)

    print('Done')

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
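For reference, get_submissions() above ends up requesting a URL of the form shown in the first comment below. A short single-page sketch, assuming it is appended to the script above and that the Pushshift endpoint is still reachable:

# http://api.pushshift.io/reddit/submission/search/?after=1546300800&before=1577836800
#   &subreddit=amitheasshole&sort=asc&sort_type=created_utc&fields=id,created_utc,...&size=100
page = get_submissions('amitheasshole', 1546300800, 1577836800)
if page:
    # scrape_submissions() paginates by moving `after` to the created_utc of the last post
    print(len(page), 'posts; next after =', page[-1]['created_utc'])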
/nn/train.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('aita_stratified.csv')

# Concatenate title and body into a single text feature
X = (df['title'] + df['body']).values
X = X.astype(str)

Y = df['verdict'].values
# 'verdict' is assumed to already be a 0/1 integer label here (1 = yta);
# the 'yta'/'nta' strings written by 2_clean_data.py would need to be mapped first.
Y = np.asarray(Y).astype(np.int32)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print('Loaded data')

vocab_size = 10000
oov_token = '<OOV>'
max_length = 500

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train, maxlen=max_length, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post', truncating='post')

word_index = tokenizer.word_index
print('Tokenized data,', len(word_index), 'words')

def create_embedding_matrix(filepath, word_index, embedding_dim):
    # Build an embedding matrix from pre-trained GloVe vectors for words in the vocabulary
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

embedding_dim = 100
# glove.twitter.27B.100d.txt has to be downloaded separately
embedding_matrix = create_embedding_matrix('glove.twitter.27B.100d.txt', word_index, embedding_dim)

# Fraction of the vocabulary that was found in the GloVe file
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
embedding_coverage = nonzero_elements / (len(word_index) + 1)
print('Embedding vocabulary coverage: ' + str(embedding_coverage))

model = keras.Sequential()
model.add(layers.Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.Dropout(0.7))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

res = model.fit(X_train, Y_train, epochs=20, verbose=True, validation_data=(X_test, Y_test), batch_size=512)

plt.plot(res.history['accuracy'])
plt.plot(res.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

Y_pred = model.predict(X_test, verbose=1)

x_ax = range(len(Y_pred))
plt.scatter(x_ax, Y_test, s=5, color='blue', label='original')
plt.plot(x_ax, Y_pred, lw=0.8, color='red', label='predicted')
plt.legend()
plt.show()

cf_matrix = confusion_matrix(Y_test, (Y_pred > 0.5).astype(np.int32))
print(cf_matrix)
sns.heatmap(cf_matrix, annot=True)
plt.show()

model.save('aita_classifier')
--------------------------------------------------------------------------------
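A sketch of scoring a new post with the model saved above; it assumes the same tokenizer and max_length are still in memory (train.py does not persist the tokenizer), and it treats an output above 0.5 as 'yta', matching the 0/1 label encoding assumed in the script. The input string is made up.

from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

loaded = keras.models.load_model('aita_classifier')
sample = ['AITA for using a made-up example post?']          # hypothetical input
seq = pad_sequences(tokenizer.texts_to_sequences(sample),
                    maxlen=max_length, padding='post', truncating='post')
prob = loaded.predict(seq)[0][0]
print('yta' if prob > 0.5 else 'nta', prob)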
/dataset/1_comment_scraper.py:
--------------------------------------------------------------------------------
import json
import time
import string
import csv
import requests

from tqdm import tqdm
from multiprocessing import Pool
from proxies import random_proxy, remove_proxy

def get_data(mode, args, fields):
    url = f'http://api.pushshift.io/reddit/{mode}/search/{args}&fields={",".join(fields)}&size=100'
    proxy = random_proxy()
    timeout = (3, 7)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

    try:
        r = requests.get(url, proxies=proxy, timeout=timeout, headers=headers, allow_redirects=False)
        status = r.status_code
    except Exception as e:
        #print(f'{e} on {proxy["http"]}')
        status = -1
    if status != 200:
        #if status != -1:
            #print(f'{status} on {proxy["http"]}')
            #remove_proxy(proxy['http'])
        time.sleep(0.25)
        return get_data(mode, args, fields)

    #print(f'Fetched data from "{url}" on "{proxy["http"]}"')
    data = json.loads(r.text)
    return data['data']

def get_comments(id, after):
    after = str(after)
    args = f'?after={after}&link_id={id}' + \
           f'&sort=asc&sort_type=created_utc'
    fields = ['created_utc', 'body', 'author', 'score']

    return get_data('comment', args, fields)

def get_all_comments(id):
    after = 0
    comments = list()

    while True:
        new_comments = get_comments(id, after)
        if not new_comments:
            break

        for comment in new_comments:
            comments.append(comment)

        after = comments[-1]['created_utc']

    return comments

def scrape_comment(submission_data):
    yta_score, nta_score = 0, 0

    if int(submission_data['num_comments']) > 0:
        comments = get_all_comments(submission_data['id'])
        for comment in comments:
            if comment['author'].lower() == 'automoderator':
                continue
            # Strip punctuation and count verdict keywords, weighted by comment score
            comment_text = comment['body'].translate(str.maketrans('', '', string.punctuation))
            words = comment_text.lower().split()
            if 'yta' in words or 'esh' in words:
                yta_score += int(comment['score'])
            if 'nta' in words or 'nah' in words:
                nta_score += int(comment['score'])

    submission_data['yta_score'] = yta_score
    submission_data['nta_score'] = nta_score

    return submission_data

def scrape_comments(subreddit, after, before):
    print(f'Scraping comments of r/{subreddit} from {after} to {before}')

    f_out = open(f'1_submission_data_{after}_{before}.csv', 'w', newline='', encoding='utf-8')
    header = ['id', 'timestamp', 'title', 'body', 'author', 'score', 'num_comments', 'yta_score', 'nta_score']
    writer = csv.DictWriter(f_out, quoting=csv.QUOTE_ALL, fieldnames=header)
    writer.writeheader()

    processes = 32
    submissions = list()
    scraped_submissions = list()

    with open(f'0_submission_data_{after}_{before}.csv', 'r', newline='', encoding='utf-8') as f_in:
        reader = csv.DictReader(f_in)
        for row in reader:
            submissions.append(row)

    p = Pool(processes=processes)

    with tqdm(total=len(submissions)) as progressbar:
        for entry in p.imap_unordered(scrape_comment, submissions):
            scraped_submissions.append(entry)
            progressbar.update()

    p.close()
    print(f'Done scraping comments of r/{subreddit} from {after} to {before}')

    for row in scraped_submissions:
        writer.writerow(row)
    f_out.close()

def main():
    start_epoch = 1546300800  # January 1, 2019
    end_epoch = 1577836800    # January 1, 2020

    scrape_comments('amitheasshole', start_epoch, end_epoch)

    print('Done')

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
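A worked example of the verdict counting in scrape_comment() above: punctuation is stripped, the comment is lower-cased and split into words, and 'yta'/'esh' matches add the comment's score to yta_score while 'nta'/'nah' matches add it to nta_score. The comment dict below is made up.

import string

comment = {'author': 'some_redditor', 'body': 'NTA. He was out of line.', 'score': 152}
words = comment['body'].translate(str.maketrans('', '', string.punctuation)).lower().split()
yta_score = comment['score'] if ('yta' in words or 'esh' in words) else 0
nta_score = comment['score'] if ('nta' in words or 'nah' in words) else 0
print(words)                  # ['nta', 'he', 'was', 'out', 'of', 'line']
print(yta_score, nta_score)   # 0 152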