├── .gitignore
├── README.md
├── sample_usage.py
└── instagram_api_functions.py

/.gitignore:
--------------------------------------------------------------------------------
*.txt
*.mp4
*.jpg
*.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Instagram API

This repository contains sample code, written while helping CSCAR clients, for extracting data from Instagram. It uses the unofficial Instagram API for Python located at https://github.com/LevPasha/Instagram-API-python. I use the API to get the JSON response and then extract a subset of that response.

You also need the [Google Maps module](https://github.com/googlemaps/google-maps-services-python) (for geocoding), or you can comment out the relevant lines of code.

Example code is provided in `sample_usage.py`. You need to provide your login information in `instagram_api_functions.py` before starting.

`instagram_api_functions.py` contains functions to do the following (a minimal usage sketch follows the list):
- Get the user id for a username based on an exact match.
- Get the timestamp of a user's first post.
- Get metadata about a user.
- Iterate through a user's timeline and extract posts and metadata.
- Download Instagram media given a URL.
- Iterate through a user's timeline and download the associated media.
- Get the media id for a given URL.
- Get comments for a particular post (media).
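A minimal sketch of the intended workflow (it assumes valid login credentials have been entered in `instagram_api_functions.py`; the username below is only a placeholder):

```python
import instagram_api_functions as IG

# Map an exact username to its numeric user id.
profiles = IG.get_user_id(['some_username'])
uid = profiles['some_username']

# Walk the user's timeline into a pandas DataFrame (no geocoding).
posts = IG.get_user_posts(uid, geocode=False)

# Fetch the comments for the first post in the DataFrame.
comments = IG.get_post_comments(posts['media_id'].iloc[0])
```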
--------------------------------------------------------------------------------
/sample_usage.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import instagram_api_functions as IG
import pandas as pd

pd.options.display.max_rows = 16

#%% Get ids of users
usernames = ['umichhockey']
profiles = IG.get_user_id(usernames)

#%% Specify user (with a single username, uid is simply that user's id)
for username in profiles.keys():
    print(username)
    uid = profiles[username]

#%% Get user posts
posts = IG.get_user_posts(uid, geocode=False)
posts['text'] = posts['text'].apply(lambda x: x.replace('\n', ' ').replace('\r', ' '))
posts.to_csv('{}_posts.txt'.format(username), index=False, sep='|', encoding='utf-8')

#%% Download media for specific posts
if posts.shape[0] > 0:  # check if there are any posts
    media_ids = posts['media_id'].tolist()
    IG.get_user_media(uid, filter=True, media_ids=media_ids)

    # Get comments for specific posts
    with open('{}_comments.txt'.format(username), 'w', encoding='utf-8') as fout:
        for i, media_id in enumerate(media_ids, start=1):
            header_flag = (i == 1)  # write the header line only once
            print('{}, Media {}'.format(i, media_id))
            comments = IG.get_post_comments(media_id)
            comments.insert(0, 'media_id', media_id)
            comments.to_csv(fout, index=False, sep='|', header=header_flag)
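#%% Get user metadata (an added example, not part of the original workflow:
# note that get_user_metadata reads candidate max_id values from
# 'max_id_timestamps.txt', so that file must exist before running this cell)
metadata = IG.get_user_metadata(profiles)
print(metadata)
metadata.to_csv('{}_metadata.txt'.format(username), index=False, sep='|', encoding='utf-8')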
--------------------------------------------------------------------------------
/instagram_api_functions.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
This script contains functions to do the following:
    Get the user id for a username based on an exact match.
    Get the timestamp of a user's first post.
    Get metadata about a user.
    Iterate through a user's timeline and extract posts and metadata.
    Download Instagram media given a URL.
    Iterate through a user's timeline and download the associated media.
    Get the media id for a given URL.
    Get comments for a particular post (media).

You will need to install the Instagram API module and/or the Google Maps
module (for geocoding), located on GitHub at:
https://github.com/LevPasha/Instagram-API-python
https://github.com/googlemaps/google-maps-services-python

IMPORTANT:
You also need to include your login information after the import statements.
"""

from InstagramAPI import InstagramAPI
import os
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import googlemaps

username = 'username'  # insert your login information here
pwd = 'password'
API = InstagramAPI(username, pwd)
API.login()

#%%
def get_user_id(usernames):
    """
    Get the user id for each username based on an exact match.

    Parameters
    ----------
    usernames: list
        List of usernames

    Returns
    -------
    dictionary
        key, value = username, user id
    """

    profiles = {}
    if isinstance(usernames, str):  # a bare string would otherwise be split into characters
        usernames = [usernames]
    for username in usernames:
        API.fbUserSearch(username)
        response = API.LastJson
        for account in response['users']:
            if account['user']['username'] == username:
                profiles[username] = account['user']['pk']
                break
    return profiles


def get_first_post_timestamp(uid):
    """
    Get the timestamp of the first post for a user.

    Reads candidate starting points from 'max_id_timestamps.txt', a CSV whose
    second column holds max_id values, and pages backwards through the feed
    from each one until the oldest post is reached.

    Parameters
    ----------
    uid: str
        User id

    Returns
    -------
    tuple
        two-tuple of timestamp and max_id
    """

    timestamp = None
    max_id = ''  # avoids a NameError if the CSV has no rows
    ts = pd.read_csv('max_id_timestamps.txt')
    for row in ts.itertuples(index=False):
        max_id = row[1]
        while True:
            API.getUserFeed(uid, maxid=max_id)
            response = API.LastJson
            if response['num_results'] == 0:
                break
            item = response['items'][-1]
            caption = item['caption']
            seconds = caption['created_at'] if caption else item['taken_at']
            tiempo = datetime.fromtimestamp(seconds)
            timestamp = tiempo.strftime('%Y-%m-%d %H:%M:%S')
            max_id = item['id']
            print(timestamp, max_id)
            if not response['more_available']:
                break
        if timestamp:
            break
    return (timestamp, max_id)


def get_user_metadata(profiles):
    """
    Gets metadata about a user.

    Parameters
    ----------
    profiles: dict
        Dictionary of users (username -> user id)

    Returns
    -------
    DataFrame
        Columns: ['username','posts','followers','following','first_post',
        'max_id']
    """

    metadata = []
    for username in profiles.keys():
        print('Processing {}'.format(username))
        uid = profiles[username]
        API.getUsernameInfo(uid)
        response = API.LastJson
        posts = response['user']['media_count']
        followers = response['user']['follower_count']
        following = response['user']['following_count']
        timestamp, max_id = get_first_post_timestamp(uid)
        metadata.append((username, posts, followers, following, timestamp, max_id))
    columns = ['username', 'posts', 'followers', 'following', 'first_post', 'max_id']
    return pd.DataFrame(metadata, columns=columns)


def get_user_posts(uid, *, max_id='', count=15000, geocode=False):
    """
    Iterates through a user's timeline and extracts posts and metadata.

    Parameters
    ----------
    uid: int
        User id
    max_id: str
        Return posts earlier than this max_id
    count: int
        Maximum number of posts to return
    geocode: boolean
        Whether to reverse-geocode the lon/lat coordinates, if present, using
        the Google Maps API. True for geocoding. False for no geocoding.

    Returns
    -------
    DataFrame
        Columns: ['media_id','shortcode','timestamp','weekday','lon',
        'lat','address','like_count','comment_count','media_type',
        'duration','views','photos','text']
    """

    if geocode:
        apikey = os.getenv('GOOGLE_MAP_API_KEY')  # insert own key HERE
        gmaps = googlemaps.Client(apikey)
    data = []
    counter = 0
    if not isinstance(max_id, str):
        max_id = str(max_id)
    while counter < count:
        print('Post {} {}'.format(counter, max_id))
        API.getUserFeed(uid, maxid=max_id)
        response = API.LastJson
        if not response.get('items'):  # empty page, nothing left to process
            break
        for i, item in enumerate(response['items'], start=counter+1):
            media_id = item['pk']
            shortcode = item['code']
            if item['caption']:
                seconds = item['caption']['created_at']
                txt = item['caption']['text']
            else:
                seconds = item['taken_at']
                txt = ''
            tiempo = datetime.fromtimestamp(seconds)
            timestamp = tiempo.strftime('%Y-%m-%d %H:%M:%S')
            weekday = tiempo.isoweekday()
            if 'lng' in item:
                lon, lat = item['lng'], item['lat']
                address = gmaps.reverse_geocode((lat, lon))[0]['formatted_address'] if geocode else None
            else:
                lon, lat, address = None, None, None
            likes = item['like_count']
            comments = item.get('comment_count', 0)
            media = item['media_type']
            duration = item.get('video_duration', None)
            views = item.get('view_count', None)
            photos = len(item['carousel_media']) if media == 8 else 1  # media_type 8 is a carousel
            data.append((media_id, shortcode, timestamp, weekday, lon, lat,
                         address, likes, comments, media, duration, views,
                         photos, txt))
            if i >= count:
                break
        counter = i
        if response['more_available']:
            max_id = response['next_max_id']
        else:
            break

    columns = ['media_id', 'shortcode', 'timestamp', 'weekday', 'lon', 'lat',
               'address', 'like_count', 'comment_count', 'media_type',
               'duration', 'views', 'photos', 'text']
    return pd.DataFrame(data, columns=columns)
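#%% A geocoding note (an added sketch: the environment variable name comes
# from get_user_posts above; the key value below is a placeholder):
#
#   import os
#   os.environ['GOOGLE_MAP_API_KEY'] = '<your key>'
#   posts = get_user_posts(uid, geocode=True)  # adds addresses via reverse geocoding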
def download_media(url, filename, *, photo=True):
    """
    Downloads Instagram media given a URL.

    Parameters
    ----------
    url: str
        Url of media
    filename: str
        Filename of downloaded media
    photo: boolean
        Whether media is a photo. True if photo. False if video.

    Returns
    -------
    None
        Writes the downloaded media to `filename`.
    """

    R = requests.get(url, stream=not photo)  # stream videos so they are written in chunks
    if R.status_code == 404:
        print('url not found for media {}'.format(filename))
        return
    else:
        R.raise_for_status()
    with open(filename, 'wb') as fout:
        if photo:
            fout.write(R.content)
        else:  # video
            for chunk in R.iter_content(chunk_size=255):
                if chunk:  # filter out keep-alive chunks
                    fout.write(chunk)
    print('{} downloaded'.format(filename))


def get_user_media(uid, *, max_id='', count=15000, filter=False, media_ids=[]):
    """
    Iterates through a user's timeline and downloads the associated media.

    Parameters
    ----------
    uid: int
        User id
    max_id: str
        Return media earlier than this max_id
    count: int
        Maximum number of media to return
    filter: boolean
        Whether to filter for particular posts
    media_ids: list
        List of post ids to filter for (filter argument needs to be True)
    """

    counter = 0
    media_count = 0
    if not isinstance(max_id, str):
        max_id = str(max_id)
    if filter and not isinstance(media_ids, list):
        media_ids = list(media_ids)
    while counter < count:
        print('Post {} {}'.format(counter, max_id))
        API.getUserFeed(uid, maxid=max_id)
        response = API.LastJson
        if not response.get('items'):  # empty page, nothing left to process
            break
        for i, item in enumerate(response['items'], start=counter+1):
            media_id = item['pk']
            if filter:  # filtering media
                assert type(media_id) == type(media_ids[0])  # ids must be comparable
                if media_id not in media_ids:
                    continue
                else:
                    media_count += 1
            media = item['media_type']
            if media == 1:  # single photo (not a carousel)
                filename = '{}_{}_{:0>2}.jpg'.format(media_id, media, 1)
                url = item['image_versions2']['candidates'][0]['url']
                download_media(url, filename)
            elif media == 8:  # carousel
                for k, photo in enumerate(item['carousel_media'], start=1):
                    filename = '{}_{}_{:0>2}.jpg'.format(media_id, media, k)
                    url = photo['image_versions2']['candidates'][0]['url']
                    download_media(url, filename)
            elif media == 2:  # video
                if 'video_dash_manifest' in item:
                    html = item['video_dash_manifest']
                    soup = BeautifulSoup(html, 'html.parser')
                    videolinks = soup.find_all('baseurl')
                    if videolinks:
                        url = videolinks[0].text
                    else:
                        raise ValueError('Could NOT find video link for {}'.format(media_id))
                elif 'video_versions' in item:
                    url = item['video_versions'][0]['url']
                else:
                    raise KeyError('Unknown key to access video url for {}'.format(media_id))
                extension = os.path.splitext(url)[-1]
                filename = '{}{}'.format(media_id, extension)
                download_media(url, filename, photo=False)
            else:
                raise ValueError('Unknown media type for {}'.format(media_id))
            if i >= count:
                break
        if filter:  # checking if all requested media has been obtained
            if media_count == len(media_ids):
                return
        counter = i
        if response['more_available']:
            max_id = response['next_max_id']
        else:
            return


def get_media_id(url):
    """
    Gets the media id for a given URL, via Instagram's oEmbed endpoint.

    Parameters
    ----------
    url: str
        Url of media

    Returns
    -------
    str
        Media id
    """

    callback_url = 'http://www.google.com'
    get_mediaid_url = 'http://api.instagram.com/oembed?callback={}&url={}'.format(callback_url, url)
    R = requests.get(get_mediaid_url)
    R.raise_for_status()
    response = R.json()
    return response['media_id']


def get_post_comments(media_id, *, max_id='', count=100000):
    """
    Get comments for a particular post (media).

    Parameters
    ----------
    media_id: str
        Media id
    max_id: str
        Return comments earlier than this max_id
    count: int
        Maximum number of comments to return

    Returns
    -------
    DataFrame
        Columns: ['timestamp','name','userid','text']
        Sorted by timestamp
    """

    columns = ['timestamp', 'name', 'userid', 'text']
    comments = []
    counter = 0
    if not isinstance(media_id, str):
        media_id = str(media_id)
    if not isinstance(max_id, str):
        max_id = str(max_id)
    while counter < count:
        print('Comment {}'.format(counter))
        API.getMediaComments(media_id, max_id=max_id)
        response = API.LastJson
        if not response.get('comments'):  # empty page; keep anything collected so far
            break
        for i, comment in enumerate(response['comments'], start=counter+1):
            seconds = comment['created_at']
            tiempo = datetime.fromtimestamp(seconds)
            timestamp = tiempo.strftime('%Y-%m-%d %H:%M:%S')
            user = comment['user']['full_name']
            userid = comment['user_id']
            txt = comment['text']
            comments.append((timestamp, user, userid, txt))
            if i >= count:
                break
        counter = i
        if response['has_more_comments']:
            max_id = response['next_max_id']
        else:
            break
    df = pd.DataFrame(comments, columns=columns)
    df.sort_values('timestamp', inplace=True)
    df['text'] = df['text'].apply(lambda x: x.replace('\n', ' ').replace('\r', ' '))
    return df
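
#%% An added end-to-end sketch (the URL below is a placeholder, not a real
# post): look up a media id from a post URL, then fetch its comments.
if __name__ == '__main__':
    post_url = 'https://www.instagram.com/p/SHORTCODE/'  # placeholder shortcode
    mid = get_media_id(post_url)
    print(get_post_comments(mid))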
--------------------------------------------------------------------------------