├── README.md
├── reddit_antiwork.py
├── reddit_ask_reddit.py
├── reddit_astrology.py
├── reddit_birds_are_not_real.py
├── reddit_booksuggestions.py
├── reddit_conspiracy.py
├── reddit_coronavirus.py
├── reddit_cricket.py
├── reddit_cryptocurrency.py
├── reddit_data_science.py
├── reddit_euro_2020.py
├── reddit_extract_content.py
├── reddit_fanatasy_premier_league.py
├── reddit_i_dont_work_here_lady.py
├── reddit_justnomil.py
├── reddit_pfizer_vaccine.csv
├── reddit_pfizer_vaccine.py
├── reddit_politics.py
├── reddit_tales_from_the_job.py
├── reddit_tokyo_2020.py
└── reddit_vaccine_myths.py

/README.md:
--------------------------------------------------------------------------------
# reddit_extract_content

Extract posts and comments from a Reddit subreddit and save them to a CSV dataset
--------------------------------------------------------------------------------
/reddit_antiwork.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='antiwork', items_limit=4000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_antiwork.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_ask_reddit.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='AskReddit', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "ask_reddit.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = 
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_astrology.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='Astrology', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_astrology.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_birds_are_not_real.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='BirdsArentReal', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_birds_arent_real.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_booksuggestions.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='booksuggestions', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_booksuggestions.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_conspiracy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | 

def build_dataset(reddit, search_words='ConspiracyTheory', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_ct.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_coronavirus.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='Coronavirus', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}
print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_coronavirus.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_cricket.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='Cricket', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | 
topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_cricket.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_cryptocurrency.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='CryptoCurrency', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | 
topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_cc.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_data_science.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='datascience', items_limit=4000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=4000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 

def update_and_save_dataset(topics_df):
    file_path = "data_science.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_euro_2020.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='Euro2020', items_limit=5000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=5000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_euro_2020.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_extract_content.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='WallStreetBets', items_limit=1000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_wsb.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_fanatasy_premier_league.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='FantasyPL', items_limit=5000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=5000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_fpl.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_i_dont_work_here_lady.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
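    # All credentials come from environment variables, so no secrets are
    # committed to the repository; export the five variables below first.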
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='IDontWorkHereLady', items_limit=2000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_i_don_t_work_here_lady.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_justnomil.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='JUSTNOMIL', items_limit=4000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_justnomil.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_pfizer_vaccine.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='PfizerVaccine', items_limit=3000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=3000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_pfizer_vaccine.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_politics.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='politics', items_limit=5000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=5000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_politics.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_tales_from_the_job.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='talesfromthejob', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in 

    for comment in tqdm(subreddit.comments(limit=2000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
        topics_dict["body"].append(comment.body)

    topics_df = pd.DataFrame(topics_dict)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df


def update_and_save_dataset(topics_df):
    file_path = "reddit_tales_from_the_job.csv"
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------
/reddit_tokyo_2020.py:
--------------------------------------------------------------------------------
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm


def get_date(created):
    return dt.datetime.fromtimestamp(created)


def reddit_connection():
    personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"]
    client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"]
    user_agent = os.environ["REDDIT_APP_NAME"]
    username = os.environ["REDDIT_USER_NAME"]
    password = os.environ["REDDIT_LOGIN_PASSWORD"]

    reddit = praw.Reddit(client_id=personal_use_script,
                         client_secret=client_secret,
                         user_agent=user_agent,
                         username=username,
                         password=password)
    return reddit


def build_dataset(reddit, search_words='Tokyo2020', items_limit=3000):

    # Collect reddit posts
    subreddit = reddit.subreddit(search_words)
    new_subreddit = subreddit.new(limit=items_limit)
    topics_dict = {"title": [],
                   "score": [],
                   "id": [], "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": []}

    print("retrieve new reddit posts ...")
    for submission in tqdm(new_subreddit):
        topics_dict["title"].append(submission.title)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)

    for comment in tqdm(subreddit.comments(limit=3000)):
        topics_dict["title"].append("Comment")
        topics_dict["score"].append(comment.score)
        topics_dict["id"].append(comment.id)
        topics_dict["url"].append("")
        topics_dict["comms_num"].append(0)
        topics_dict["created"].append(comment.created)
topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_tokyo_2020.csv" 68 | if os.path.exists(file_path): 69 | topics_old_df = pd.read_csv(file_path) 70 | print(f"past reddit posts: {topics_old_df.shape}") 71 | topics_all_df = pd.concat([topics_old_df, topics_df], axis=0) 72 | print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}") 73 | topics_new_df = topics_all_df.drop_duplicates(subset = ["id"], keep='last', inplace=False) 74 | print(f"all reddit posts: {topics_new_df.shape}") 75 | topics_new_df.to_csv(file_path, index=False) 76 | else: 77 | print(f"reddit posts: {topics_df.shape}") 78 | topics_df.to_csv(file_path, index=False) 79 | 80 | 81 | if __name__ == "__main__": 82 | reddit = reddit_connection() 83 | topics_data_df = build_dataset(reddit) 84 | update_and_save_dataset(topics_data_df) 85 | -------------------------------------------------------------------------------- /reddit_vaccine_myths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import praw 3 | import pandas as pd 4 | import datetime as dt 5 | from tqdm import tqdm 6 | import time 7 | 8 | 9 | def get_date(created): 10 | return dt.datetime.fromtimestamp(created) 11 | 12 | 13 | def reddit_connection(): 14 | personal_use_script = os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"] 15 | client_secret = os.environ["REDDIT_SECRET_KEY_27_CHARS"] 16 | user_agent = os.environ["REDDIT_APP_NAME"] 17 | username = os.environ["REDDIT_USER_NAME"] 18 | password = os.environ["REDDIT_LOGIN_PASSWORD"] 19 | 20 | reddit = praw.Reddit(client_id=personal_use_script, \ 21 | client_secret=client_secret, \ 22 | user_agent=user_agent, \ 23 | username=username, \ 24 | password='') 25 | return reddit 26 | 27 | 28 | def build_dataset(reddit, search_words='VaccineMyths', items_limit=2000): 29 | 30 | # Collect reddit posts 31 | subreddit = reddit.subreddit(search_words) 32 | new_subreddit = subreddit.new(limit=items_limit) 33 | topics_dict = { "title":[], 34 | "score":[], 35 | "id":[], "url":[], 36 | "comms_num": [], 37 | "created": [], 38 | "body":[]} 39 | 40 | print(f"retreive new reddit posts ...") 41 | for submission in tqdm(new_subreddit): 42 | topics_dict["title"].append(submission.title) 43 | topics_dict["score"].append(submission.score) 44 | topics_dict["id"].append(submission.id) 45 | topics_dict["url"].append(submission.url) 46 | topics_dict["comms_num"].append(submission.num_comments) 47 | topics_dict["created"].append(submission.created) 48 | topics_dict["body"].append(submission.selftext) 49 | 50 | for comment in tqdm(subreddit.comments(limit=2000)): 51 | topics_dict["title"].append("Comment") 52 | topics_dict["score"].append(comment.score) 53 | topics_dict["id"].append(comment.id) 54 | topics_dict["url"].append("") 55 | topics_dict["comms_num"].append(0) 56 | topics_dict["created"].append(comment.created) 57 | topics_dict["body"].append(comment.body) 58 | 59 | topics_df = pd.DataFrame(topics_dict) 60 | print(f"new reddit posts retrieved: {len(topics_df)}") 61 | topics_df['timestamp'] = topics_df['created'].apply(lambda x: get_date(x)) 62 | 63 | return topics_df 64 | 65 | 66 | def update_and_save_dataset(topics_df): 67 | file_path = "reddit_vm.csv" 
    if os.path.exists(file_path):
        topics_old_df = pd.read_csv(file_path)
        print(f"past reddit posts: {topics_old_df.shape}")
        topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
        print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
        topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep="last")
        print(f"all reddit posts: {topics_new_df.shape}")
        topics_new_df.to_csv(file_path, index=False)
    else:
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    reddit = reddit_connection()
    topics_data_df = build_dataset(reddit)
    update_and_save_dataset(topics_data_df)
--------------------------------------------------------------------------------