├── .gitignore
├── README.md
├── sample_usage.py
└── instagram_api_functions.py

/.gitignore:
--------------------------------------------------------------------------------
*.txt
*.mp4
*.jpg
*.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Instagram API

This repository contains sample code, written while helping CSCAR clients, for extracting data from Instagram. It uses the unofficial Instagram API for Python located at https://github.com/LevPasha/Instagram-API-python. I use the API to get the JSON response and then extract a subset of that response.

You also need the [Google Maps module](https://github.com/googlemaps/google-maps-services-python) (for geocoding), or you can comment out the relevant lines of code.

Example code is provided in `sample_usage.py`. You need to provide your login information in `instagram_api_functions.py` before starting.

`instagram_api_functions.py` contains functions to do the following (a minimal usage sketch follows the list):
- Get the user id for a username based on an exact match.
- Get the timestamp of a user's first post.
- Get metadata about a user.
- Iterate through a user's timeline and extract posts and metadata.
- Download Instagram media given a URL.
- Iterate through a user's timeline and download the associated media.
- Get the media id for a given URL.
- Get comments for a particular post (media).
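A minimal sketch of the intended workflow (it assumes valid login credentials have been entered in `instagram_api_functions.py`; the username below is only a placeholder):

```python
import instagram_api_functions as IG

# Map an exact username to its numeric user id.
profiles = IG.get_user_id(['some_username'])
uid = profiles['some_username']

# Walk the user's timeline into a pandas DataFrame (no geocoding).
posts = IG.get_user_posts(uid, geocode=False)

# Fetch the comments for the first post in the DataFrame.
comments = IG.get_post_comments(posts['media_id'].iloc[0])
```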
--------------------------------------------------------------------------------
/sample_usage.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import instagram_api_functions as IG
import pandas as pd

pd.options.display.max_rows = 16

#%% Get ids of users
usernames = ['umichhockey']
profiles = IG.get_user_id(usernames)

#%% Specify user (with a single username, uid is simply that user's id)
for username in profiles.keys():
    print(username)
    uid = profiles[username]

#%% Get user posts
posts = IG.get_user_posts(uid, geocode=False)
posts['text'] = posts['text'].apply(lambda x: x.replace('\n', ' ').replace('\r', ' '))
posts.to_csv('{}_posts.txt'.format(username), index=False, sep='|', encoding='utf-8')

#%% Download media for specific posts
if posts.shape[0] > 0:  # check if there are any posts
    media_ids = posts['media_id'].tolist()
    IG.get_user_media(uid, filter=True, media_ids=media_ids)

    # Get comments for specific posts
    with open('{}_comments.txt'.format(username), 'w', encoding='utf-8') as fout:
        for i, media_id in enumerate(media_ids, start=1):
            header_flag = (i == 1)  # write the header line only once
            print('{}, Media {}'.format(i, media_id))
            comments = IG.get_post_comments(media_id)
            comments.insert(0, 'media_id', media_id)
            comments.to_csv(fout, index=False, sep='|', header=header_flag)
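#%% Get user metadata (an added example, not part of the original workflow:
# note that get_user_metadata reads candidate max_id values from
# 'max_id_timestamps.txt', so that file must exist before running this cell)
metadata = IG.get_user_metadata(profiles)
print(metadata)
metadata.to_csv('{}_metadata.txt'.format(username), index=False, sep='|', encoding='utf-8')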
--------------------------------------------------------------------------------
/instagram_api_functions.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
This script contains functions to do the following:
    Get the user id for a username based on an exact match.
    Get the timestamp of a user's first post.
    Get metadata about a user.
    Iterate through a user's timeline and extract posts and metadata.
    Download Instagram media given a URL.
    Iterate through a user's timeline and download the associated media.
    Get the media id for a given URL.
    Get comments for a particular post (media).

You will need to install the Instagram API module and/or the Google Maps
module (for geocoding), located on GitHub at:
https://github.com/LevPasha/Instagram-API-python
https://github.com/googlemaps/google-maps-services-python

IMPORTANT:
You also need to include your login information after the import statements.
"""

from InstagramAPI import InstagramAPI
import os
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import googlemaps

username = 'username'  # insert your login information here
pwd = 'password'
API = InstagramAPI(username, pwd)
API.login()

#%%
def get_user_id(usernames):
    """
    Get the user id for each username based on an exact match.

    Parameters
    ----------
    usernames: list
        List of usernames

    Returns
    -------
    dictionary
        key, value = username, user id
    """

    profiles = {}
    if isinstance(usernames, str):  # a bare string would otherwise be split into characters
        usernames = [usernames]
    for username in usernames:
        API.fbUserSearch(username)
        response = API.LastJson
        for account in response['users']:
            if account['user']['username'] == username:
                profiles[username] = account['user']['pk']
                break
    return profiles


def get_first_post_timestamp(uid):
    """
    Get the timestamp of the first post for a user.

    Reads candidate starting points from 'max_id_timestamps.txt', a CSV whose
    second column holds max_id values, and pages backwards through the feed
    from each one until the oldest post is reached.

    Parameters
    ----------
    uid: str
        User id

    Returns
    -------
    tuple
        two-tuple of timestamp and max_id
    """

    timestamp = None
    max_id = ''  # avoids a NameError if the CSV has no rows
    ts = pd.read_csv('max_id_timestamps.txt')
    for row in ts.itertuples(index=False):
        max_id = row[1]
        while True:
            API.getUserFeed(uid, maxid=max_id)
            response = API.LastJson
            if response['num_results'] == 0:
                break
            item = response['items'][-1]
            caption = item['caption']
            seconds = caption['created_at'] if caption else item['taken_at']
            tiempo = datetime.fromtimestamp(seconds)
            timestamp = tiempo.strftime('%Y-%m-%d %H:%M:%S')
            max_id = item['id']
            print(timestamp, max_id)
            if not response['more_available']:
                break
        if timestamp:
            break
    return (timestamp, max_id)


def get_user_metadata(profiles):
    """
    Gets metadata about a user.

    Parameters
    ----------
    profiles: dict
        Dictionary of users (username -> user id)

    Returns
    -------
    DataFrame
        Columns: ['username','posts','followers','following','first_post',
        'max_id']
    """

    metadata = []
    for username in profiles.keys():
        print('Processing {}'.format(username))
        uid = profiles[username]
        API.getUsernameInfo(uid)
        response = API.LastJson
        posts = response['user']['media_count']
        followers = response['user']['follower_count']
        following = response['user']['following_count']
        timestamp, max_id = get_first_post_timestamp(uid)
        metadata.append((username, posts, followers, following, timestamp, max_id))
    columns = ['username', 'posts', 'followers', 'following', 'first_post', 'max_id']
    return pd.DataFrame(metadata, columns=columns)


def get_user_posts(uid, *, max_id='', count=15000, geocode=False):
    """
    Iterates through a user's timeline and extracts posts and metadata.

    Parameters
    ----------
    uid: int
        User id
    max_id: str
        Return posts earlier than this max_id
    count: int
        Maximum number of posts to return
    geocode: boolean
        Whether to reverse-geocode the lon/lat coordinates, if present, using
        the Google Maps API. True for geocoding. False for no geocoding.

    Returns
    -------
    DataFrame
        Columns: ['media_id','shortcode','timestamp','weekday','lon',
        'lat','address','like_count','comment_count','media_type',
        'duration','views','photos','text']
    """

    if geocode:
        apikey = os.getenv('GOOGLE_MAP_API_KEY')  # insert own key HERE
        gmaps = googlemaps.Client(apikey)
    data = []
    counter = 0
    if not isinstance(max_id, str):
        max_id = str(max_id)
    while counter < count:
        print('Post {} {}'.format(counter, max_id))
        API.getUserFeed(uid, maxid=max_id)
        response = API.LastJson
        if not response.get('items'):  # empty page, nothing left to process
            break
        for i, item in enumerate(response['items'], start=counter+1):
            media_id = item['pk']
            shortcode = item['code']
            if item['caption']:
                seconds = item['caption']['created_at']
                txt = item['caption']['text']
            else:
                seconds = item['taken_at']
                txt = ''
            tiempo = datetime.fromtimestamp(seconds)
            timestamp = tiempo.strftime('%Y-%m-%d %H:%M:%S')
            weekday = tiempo.isoweekday()
            if 'lng' in item:
                lon, lat = item['lng'], item['lat']
                address = gmaps.reverse_geocode((lat, lon))[0]['formatted_address'] if geocode else None
            else:
                lon, lat, address = None, None, None
            likes = item['like_count']
            comments = item.get('comment_count', 0)
            media = item['media_type']
            duration = item.get('video_duration', None)
            views = item.get('view_count', None)
            photos = len(item['carousel_media']) if media == 8 else 1  # media_type 8 is a carousel
            data.append((media_id, shortcode, timestamp, weekday, lon, lat,
                         address, likes, comments, media, duration, views,
                         photos, txt))
            if i >= count:
                break
        counter = i
        if response['more_available']:
            max_id = response['next_max_id']
        else:
            break

    columns = ['media_id', 'shortcode', 'timestamp', 'weekday', 'lon', 'lat',
               'address', 'like_count', 'comment_count', 'media_type',
               'duration', 'views', 'photos', 'text']
    return pd.DataFrame(data, columns=columns)
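#%% A geocoding note (an added sketch: the environment variable name comes
# from get_user_posts above; the key value below is a placeholder):
#
#   import os
#   os.environ['GOOGLE_MAP_API_KEY'] = '<your key>'
#   posts = get_user_posts(uid, geocode=True)  # adds addresses via reverse geocoding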
def download_media(url, filename, *, photo=True):
    """
    Downloads Instagram media given a URL.

    Parameters
    ----------
    url: str
        Url of media
    filename: str
        Filename of downloaded media
    photo: boolean
        Whether media is a photo. True if photo. False if video.

    Returns
    -------
    None
        Writes the downloaded media to `filename`.
    """

    R = requests.get(url, stream=not photo)  # stream videos so they are written in chunks
    if R.status_code == 404:
        print('url not found for media {}'.format(filename))
        return
    else:
        R.raise_for_status()
    with open(filename, 'wb') as fout:
        if photo:
            fout.write(R.content)
        else:  # video
            for chunk in R.iter_content(chunk_size=255):
                if chunk:  # filter out keep-alive chunks
                    fout.write(chunk)
    print('{} downloaded'.format(filename))


def get_user_media(uid, *, max_id='', count=15000, filter=False, media_ids=[]):
    """
    Iterates through a user's timeline and downloads the associated media.

    Parameters
    ----------
    uid: int
        User id
    max_id: str
        Return media earlier than this max_id
    count: int
        Maximum number of media to return
    filter: boolean
        Whether to filter for particular posts
    media_ids: list
        List of post ids to filter for (filter argument needs to be True)
    """

    counter = 0
    media_count = 0
    if not isinstance(max_id, str):
        max_id = str(max_id)
    if filter and not isinstance(media_ids, list):
        media_ids = list(media_ids)
    while counter < count:
        print('Post {} {}'.format(counter, max_id))
        API.getUserFeed(uid, maxid=max_id)
        response = API.LastJson
        if not response.get('items'):  # empty page, nothing left to process
            break
        for i, item in enumerate(response['items'], start=counter+1):
            media_id = item['pk']
            if filter:  # filtering media
                assert type(media_id) == type(media_ids[0])  # ids must be comparable
                if media_id not in media_ids:
                    continue
                else:
                    media_count += 1
            media = item['media_type']
            if media == 1:  # single photo (not a carousel)
                filename = '{}_{}_{:0>2}.jpg'.format(media_id, media, 1)
                url = item['image_versions2']['candidates'][0]['url']
                download_media(url, filename)
            elif media == 8:  # carousel
                for k, photo in enumerate(item['carousel_media'], start=1):
                    filename = '{}_{}_{:0>2}.jpg'.format(media_id, media, k)
                    url = photo['image_versions2']['candidates'][0]['url']
                    download_media(url, filename)
            elif media == 2:  # video
                if 'video_dash_manifest' in item:
                    html = item['video_dash_manifest']
                    soup = BeautifulSoup(html, 'html.parser')
                    videolinks = soup.find_all('baseurl')
                    if videolinks:
                        url = videolinks[0].text
                    else:
                        raise ValueError('Could NOT find video link for {}'.format(media_id))
                elif 'video_versions' in item:
                    url = item['video_versions'][0]['url']
                else:
                    raise KeyError('Unknown key to access video url for {}'.format(media_id))
                extension = os.path.splitext(url)[-1]
                filename = '{}{}'.format(media_id, extension)
                download_media(url, filename, photo=False)
            else:
                raise ValueError('Unknown media type for {}'.format(media_id))
            if i >= count:
                break
        if filter:  # checking if all requested media has been obtained
            if media_count == len(media_ids):
                return
        counter = i
        if response['more_available']:
            max_id = response['next_max_id']
        else:
            return


def get_media_id(url):
    """
    Gets the media id for a given URL, via Instagram's oEmbed endpoint.

    Parameters
    ----------
    url: str
        Url of media

    Returns
    -------
    str
        Media id
    """

    callback_url = 'http://www.google.com'
    get_mediaid_url = 'http://api.instagram.com/oembed?callback={}&url={}'.format(callback_url, url)
    R = requests.get(get_mediaid_url)
    R.raise_for_status()
    response = R.json()
    return response['media_id']


def get_post_comments(media_id, *, max_id='', count=100000):
    """
    Get comments for a particular post (media).

    Parameters
    ----------
    media_id: str
        Media id
    max_id: str
        Return comments earlier than this max_id
    count: int
        Maximum number of comments to return

    Returns
    -------
    DataFrame
        Columns: ['timestamp','name','userid','text']
        Sorted by timestamp
    """

    columns = ['timestamp', 'name', 'userid', 'text']
    comments = []
    counter = 0
    if not isinstance(media_id, str):
        media_id = str(media_id)
    if not isinstance(max_id, str):
        max_id = str(max_id)
    while counter < count:
        print('Comment {}'.format(counter))
        API.getMediaComments(media_id, max_id=max_id)
        response = API.LastJson
        if not response.get('comments'):  # empty page; keep anything collected so far
            break
        for i, comment in enumerate(response['comments'], start=counter+1):
            seconds = comment['created_at']
            tiempo = datetime.fromtimestamp(seconds)
            timestamp = tiempo.strftime('%Y-%m-%d %H:%M:%S')
            user = comment['user']['full_name']
            userid = comment['user_id']
            txt = comment['text']
            comments.append((timestamp, user, userid, txt))
            if i >= count:
                break
        counter = i
        if response['has_more_comments']:
            max_id = response['next_max_id']
        else:
            break
    df = pd.DataFrame(comments, columns=columns)
    df.sort_values('timestamp', inplace=True)
    df['text'] = df['text'].apply(lambda x: x.replace('\n', ' ').replace('\r', ' '))
    return df
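
#%% An added end-to-end sketch (the URL below is a placeholder, not a real
# post): look up a media id from a post URL, then fetch its comments.
if __name__ == '__main__':
    post_url = 'https://www.instagram.com/p/SHORTCODE/'  # placeholder shortcode
    mid = get_media_id(post_url)
    print(get_post_comments(mid))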
--------------------------------------------------------------------------------