├── README.md
├── reddit_antiwork.py
├── reddit_ask_reddit.py
├── reddit_astrology.py
├── reddit_birds_are_not_real.py
├── reddit_booksuggestions.py
├── reddit_conspiracy.py
├── reddit_coronavirus.py
├── reddit_cricket.py
├── reddit_cryptocurrency.py
├── reddit_data_science.py
├── reddit_euro_2020.py
├── reddit_extract_content.py
├── reddit_fanatasy_premier_league.py
├── reddit_i_dont_work_here_lady.py
├── reddit_justnomil.py
├── reddit_pfizer_vaccine.csv
├── reddit_pfizer_vaccine.py
├── reddit_politics.py
├── reddit_tales_from_the_job.py
├── reddit_tokyo_2020.py
└── reddit_vaccine_myths.py

/README.md:
--------------------------------------------------------------------------------
# reddit_extract_content

Extract posts and comments from a Reddit subreddit and save them to a CSV dataset
--------------------------------------------------------------------------------
/reddit_antiwork.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='antiwork', items_limit=4000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_antiwork.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_ask_reddit.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='AskReddit', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "ask_reddit.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = 
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_astrology.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='Astrology', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_astrology.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_birds_are_not_real.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='BirdsArentReal', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_birds_arent_real.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_booksuggestions.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='booksuggestions', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_booksuggestions.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_conspiracy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | 

def build_dataset(reddit, search_words='ConspiracyTheory', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_ct.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_coronavirus.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='Coronavirus', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}
print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_coronavirus.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_cricket.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='Cricket', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | 
topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_cricket.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_cryptocurrency.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='CryptoCurrency', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | 
topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_cc.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_data_science.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='datascience', items_limit=4000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=4000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 

def update_and_save_dataset(topics_df):
    file_path = "data_science.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_euro_2020.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='Euro2020', items_limit=5000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=5000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_euro_2020.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_extract_content.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='WallStreetBets', items_limit=1000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_wsb.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_fanatasy_premier_league.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='FantasyPL', items_limit=5000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=5000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_fpl.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_i_dont_work_here_lady.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
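    # All credentials come from environment variables, so no secrets are
    # committed to the repository; export the five variables below first.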
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='IDontWorkHereLady', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_i_don_t_work_here_lady.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_justnomil.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='JUSTNOMIL', items_limit=4000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_justnomil.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_pfizer_vaccine.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='PfizerVaccine', items_limit=3000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=3000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_pfizer_vaccine.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_politics.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='politics', items_limit=5000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=5000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_politics.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_tales_from_the_job.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='talesfromthejob', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in 

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_tales_from_the_job.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_tokyo_2020.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='Tokyo2020', items_limit=3000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=3000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_tokyo_2020.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_vaccine_myths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='VaccineMyths', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_vm.csv" 
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep="last")
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------