├── Facebook.py └── README.md /Facebook.py: -------------------------------------------------------------------------------- 1 | #---- Author: Bhagyashree Borate ------ 2 | #--- Facebook script to collect data from Facebook Public pages and cannot recieve private data. 3 | #The file should be provided with the API KEYS from Facebook Graph API to work correctly. 4 | #The page_id should be given with the total number of public pages to collect data from. 5 | # How to run? - Just run the script as it is without any arguments at runtime. i.e. python Facebook.py 6 | # ---- PYTHON LIBRARIES ---- 7 | import urllib.request 8 | import json 9 | import datetime 10 | import csv 11 | import time 12 | import re 13 | import os 14 | global acc_name, fan_count 15 | 16 | # ------ ------- 17 | 18 | 19 | # -- Facebook app Id and secret key ( TO BE KEPT CONFIDENTIAL ) 20 | app_id = "your app_id goes here" 21 | app_secret = "your app_secret" 22 | access_token = app_id + "|" + app_secret 23 | # -------- 24 | 25 | 26 | 27 | #---- Mention id of page which you want to get data from --- 28 | 29 | page_id = ['sortedfood', ..., 'name_of_facebook_pages'] #you can find this name in the profile link for particular page 30 | 31 | 32 | # --- Function to get Id, Name, TOTAL LIKES, Link of the PAGE ---- 33 | 34 | def get_FB_Page_Details(page_id,access_token): 35 | api_endpoint = "https://graph.facebook.com/v2.4/" 36 | fb_graph_url = api_endpoint+page_id+"?fields=id,name,fan_count,link&access_token="+access_token 37 | 38 | try: 39 | api_request = urllib.request.Request(fb_graph_url) 40 | api_response = urllib.request.urlopen(api_request) 41 | 42 | try: 43 | data = json.loads(api_response.read()) 44 | return data 45 | except (ValueError, KeyError, TypeError): 46 | return "0" 47 | 48 | except IOError as e: 49 | if hasattr(e, 'code'): 50 | return "no" 51 | elif hasattr(e, 'reason'): 52 | return "no" 53 | 54 | 55 | 56 | #--- Funtion to check if requested URL exists -- 57 | 58 | def request_until_succeed(url): 59 | req = urllib.request.Request(url) 60 | success = False 61 | while success is False: 62 | try: 63 | response = urllib.request.urlopen(req) 64 | if response.getcode() == 200: 65 | success = True 66 | except Exception as e: 67 | time.sleep(5) 68 | 69 | #print("Error for URL %s: %s" % (url, datetime.datetime.now())) 70 | 71 | return response.read().decode('utf-8') 72 | 73 | 74 | 75 | # ----- Get Details of POSTS ------ 76 | def get_FB_Page_Post_Details(page_id, access_token, num_statuses): 77 | 78 | # construct the URL string 79 | base = "https://graph.facebook.com" 80 | node = "/" + page_id + "/feed" 81 | parameters = "/?fields=place,message_tags,message,link,created_time,type,name,id,likes.summary(true).limit(0),reactions.type(LOVE).limit(0).summary(1).as(love),reactions.type(HAHA).limit(0).summary(1).as(haha),reactions.type(WOW).limit(0).summary(1).as(wow),reactions.type(ANGRY).limit(0).summary(1).as(angry),comments.limit(1).summary(true),shares&limit=%s&access_token=%s" % (num_statuses, access_token) # URL DATA 82 | url = base + node + parameters 83 | 84 | # retrieve data 85 | data = json.loads(request_until_succeed(url)) 86 | 87 | return data 88 | 89 | 90 | def getFacebookPagePostData(i,access_token): # Get Details of Particular Post 91 | # construct the URL string 92 | base = "https://graph.facebook.com/v2.6/?fields=place,message_tags,message,link,created_time,type,name,id,likes.summary(true).limit(0),reactions.type(LOVE).limit(0).summary(1).as(love),reactions.type(HAHA).limit(0).summary(1).as(haha),reactions.type(WOW).limit(0).summary(1).as(wow),reactions.type(ANGRY).limit(0).summary(1).as(angry),comments.limit(1).summary(true),shares&id="+i+"&access_token="+access_token 93 | # retrieve data 94 | data = json.loads(request_until_succeed(base)) 95 | 96 | return data 97 | 98 | 99 | # ---- main function where counts and other data is retrived --- 100 | 101 | def get_FB_Page_Post_Data(page_id,account, page_likes, status): 102 | 103 | status_id = status['id'] 104 | s = status_id.split('_') 105 | post_url = "https://www.facebook.com/"+page_id+"/posts/"+s[1] 106 | 107 | status_message = '' if 'message' not in status.keys() else status['message'].encode('utf-8') 108 | status_message1 = '' if 'message' not in status.keys() else status['message'] 109 | link_name = '' if 'name' not in status.keys() else status['name'].encode('utf-8') 110 | status_type = status['type'] 111 | 112 | 113 | location = 0 if 'place' not in status.keys() else 1 114 | messagetags = '' if 'message_tags' not in status.keys() else status['message_tags'] 115 | 116 | tags = [] 117 | if len(messagetags)>0: 118 | for i in range(len(messagetags)): 119 | if any('name' in d for d in messagetags): 120 | tags.append(messagetags[i]['name']) 121 | 122 | else: 123 | tags = [] 124 | else: 125 | tags = [] 126 | 127 | #tagged people count 128 | with_people = len(tags) 129 | 130 | #get hashtags from status 131 | x = re.compile(r'\B#\w+') 132 | hashtags = len(x.findall(status_message1)) 133 | hashtag_list = x.findall(status_message1) 134 | hashtag_text = ' '.join(hashtag_list) 135 | 136 | 137 | #get Links from status 138 | urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', status_message1) 139 | links = [] 140 | for url in urls: 141 | links.append(url) 142 | 143 | 144 | #total links 145 | num_links = len(links) 146 | link_data = [] 147 | #link1 2 3 148 | if len(links)>0: 149 | if len(links)==3: 150 | for i in range(0,3): 151 | link_data.append(links[i]) 152 | elif len(links)< 3: 153 | count = 3 - len(links) 154 | for j in range(len(links)): 155 | link_data.append(links[j]) 156 | for j in range(0,count): 157 | link_data.append(" ") 158 | else: 159 | for i in range(0,3): 160 | if i>2: 161 | break 162 | else: 163 | link_data.append(" ") 164 | 165 | link1 = link_data[0] 166 | link2 = link_data[1] 167 | link3 = link_data[2] 168 | 169 | data_collected_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 170 | 171 | #time conversion - CREATION time of post 172 | status_published = datetime.datetime.strptime(status['created_time'],'%Y-%m-%dT%H:%M:%S+0000') 173 | status_published = status_published + datetime.timedelta(hours=-5) # EST 174 | status_published = status_published.strftime('%Y-%m-%d %H:%M:%S') # best time format for spreadsheet programs 175 | 176 | 177 | num_likes = 0 if 'likes' not in status.keys() else status['likes']['summary']['total_count'] 178 | num_loves = 0 if 'love' not in status.keys() else status['love']['summary']['total_count'] 179 | num_haha = 0 if 'haha' not in status.keys() else status['haha']['summary']['total_count'] 180 | num_wow = 0 if 'wow' not in status.keys() else status['wow']['summary']['total_count'] 181 | num_sad = 0 if 'sad' not in status.keys() else status['sad']['summary']['total_count'] 182 | num_angry = 0 if 'angry' not in status.keys() else status['angry']['summary']['total_count'] 183 | num_comments = 0 if 'comments' not in status.keys() else status['comments']['summary']['total_count'] 184 | num_shares = 0 if 'shares' not in status.keys() else status['shares']['count'] 185 | 186 | # return a tuple of all processed data 187 | return (account,page_likes, status_id, post_url, data_collected_time, status_published, 188 | num_likes,num_loves, num_haha,num_wow,num_sad,num_angry,num_comments, num_shares,status_message, hashtags, hashtag_text, num_links,link1,link2,link3) 189 | 190 | 191 | #---main call function -- 192 | 193 | def get_FB_Page_Posts(page_id,access_token): 194 | try: 195 | flag = 0 196 | while True: 197 | now = datetime.datetime.now() 198 | if (now.hour == 19 and now.minute == 24 and now.second==50) or (now.hour == 12 and now.minute == 00 and now.second == 00): #checking if current time is 0:00 i.e. 12AM or 12:00 i.e. 12:00 PM 199 | for i in page_id: 200 | has_next_page = True 201 | num_processed = 0 # keep a count on how many we've processed 202 | script_starttime = datetime.datetime.now() 203 | statuses = get_FB_Page_Post_Details(i, access_token, 30) #here 4 means 4 posts can be fetched at a time 204 | flag = 0 205 | with open('facebook_details.csv', 'a', newline='') as file: 206 | w = csv.writer(file) 207 | for status in statuses['data']: 208 | if flag == 0: 209 | page_data = get_FB_Page_Details(i, access_token) 210 | account = str(page_data['name']) 211 | page_likes = str(page_data['fan_count']) 212 | flag = 1 213 | acc_name = account 214 | fan_count = page_likes 215 | else: 216 | account = "" 217 | page_likes ="" 218 | w.writerow(get_FB_Page_Post_Data(i, acc_name, fan_count, status)) 219 | 220 | # output progress 221 | num_processed += 1 222 | if num_processed % 1000 == 0: 223 | print("%s Statuses Processed: %s" % (num_processed, datetime.datetime.now())) 224 | print("\nDone!\n%s Statuses Processed in %s" % (num_processed, datetime.datetime.now() - script_starttime)) 225 | 226 | else: 227 | a=0 228 | except KeyboardInterrupt: 229 | print("exit!") 230 | 231 | 232 | #---CREATE EXCEL SHEET --- 233 | with open('facebook_details.csv', 'w', newline='') as file: 234 | w = csv.writer(file) 235 | w.writerow(["Account", "#pagelike", "post_id", "url","Data Collected Time","Post Creation Time","#likes","#loves", "#haha",",#wow","#sad","angry","num_comments", "num_shares","text","#hashtags","hashtag_text","#links","link1","link2","link3"]) 236 | # --- start of program ---- 237 | 238 | get_FB_Page_Posts(page_id,access_token) 239 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Facebook-Web-Scraper-Python 2 | I have created a Facebook Page and Posts scraper in Python. The python script collects posts of the facebook page and also other details like Comments, likes, haha, wow's, texts, links, hashtags, etc. 3 | 4 | The scraper collect details of Public Facebook Page using Facebook Graph API. In order to use the script, you should have a Facebook Graph API activated and their Security credentials as mentioned below in the code. 5 | 6 | To get the credentials for Graph API you must have a Facebook Developers Account. Here you will create a Developer account and also your intended app for facebook scraping. Once the account and app is created you can access your app_id and app_secret key in settings of that particular app. Once you get this details, you have to add them in app_id and app_secret variables in the script. 7 | 8 | Once the script is run the data is collected and stored in a CSV format with the name facebook_details.csv in the directory where the script is stored. 9 | 10 | Note: My script collects data from Facebook at 12 AM and 12 PM twice a day. If you dont want the script to continously run and collect data at specific times then you can remove the timer set in the script. 11 | 12 | The results fetched are: 13 | 14 | # Facebook Account details - 15 | Facebook Account Name, Facebook total Page likes, Data Collection time. 16 | # Post details - 17 | Facebook Post id, Post URL, Post Creation Time, total Post likes, total Post loves, total Post haha' s, total Post wow's, total Post sad's, total Post angary's, total Post comments, total Post shares, Post Text, Total post Hashtags, Hashtag texts, total links in post and 3 separate links in separate cells. 18 | 19 | # How to run the script - 20 | 1. Download Facebook.py file 21 | 2. The Facebook graph api version is v2.4 Make sure you app supports this version 22 | 3. Update the app_id and app_secret with your app's id and secret key 23 | 4. Update the page_id list with the name of Facebook Page e.g. Suppose page you want to scrape is - https://www.facebook.com/sortedfood then give page_id as 'sortedfood' in the page_id list. You can add multiple page ids in this list to collect data for all the pages at once. Make sure the page is public and name you are giving is correct. 24 | 5. In the code there is a line - statuses = getFacebookPageFeedData(i, access_token, 30) this line collects data for maximum of 30 posts. You can change this number to 1 - 500 to fetch 1-500 posts at time for one page. 25 | 6. If you dont want script to run on specified time then modify the code by removing 26 | //**while True: 27 | now = datetime.datetime.now() 28 | if (now.hour == 19 and now.minute == 24 and now.second==50) or (now.hour == 12 and now.minute == 00 and now.second == 00): **// 29 | 30 | lines from the code. (Modify the code as per your own requirements) 31 | --------------------------------------------------------------------------------