├── InstAnalytics.py
└── README.md


/InstAnalytics.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | from selenium import webdriver
  5 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6 | from selenium.webdriver.common.action_chains import ActionChains
  7 | from bs4 import BeautifulSoup
  8 | from datetime import datetime
  9 | import json, time, os, re
 10 | 
 11 | 
 12 | 
 13 | # List of users
 14 | users = ['yotta_life']
 15 | 
 16 | 
 17 | 
 18 | # ----------------------------------------
 19 | #  InstAnalytics function
 20 | # ----------------------------------------
 21 | 
 22 | def InstAnalytics():
 23 | 
 24 | 	# Launch browser
 25 | 	browser = webdriver.PhantomJS(desired_capabilities=dcap)
 26 | 
 27 | 	for user in users:
 28 | 
 29 | 		# Load JSON
 30 | 		with open('InstAnalytics.json') as iaFile:
 31 | 			iaDictionary = json.load(iaFile)
 32 | 
 33 | 		# Backup JSON
 34 | 		with open('InstAnalytics_backup.json', 'w') as iaFile:
 35 | 			json.dump(iaDictionary, iaFile, indent=4)
 36 | 
 37 | 		# User's profile
 38 | 		browser.get('https://instagram.com/' + user)
 39 | 		time.sleep(0.5)
 40 | 
 41 | 		# Soup
 42 | 		soup = BeautifulSoup(browser.page_source, 'html.parser')
 43 | 
 44 | 		# User's statistics
 45 | 		postsT     = soup.html.body.span.section.main.article.header.findAll('div', recursive=False)[1].ul.findAll('li', recursive=False)[0].span.findAll('span', recursive=False)[1].getText()
 46 | 		followersT = soup.html.body.span.section.main.article.header.findAll('div', recursive=False)[1].ul.findAll('li', recursive=False)[1].span.findAll('span', recursive=False)[1].getText()
 47 | 		followingT = soup.html.body.span.section.main.article.header.findAll('div', recursive=False)[1].ul.findAll('li', recursive=False)[2].span.findAll('span', recursive=False)[1].getText()
 48 | 
 49 | 		# Remove all non-numeric characters
 50 | 		posts     = int(re.sub('[^0-9]', '', postsT))
 51 | 		followers = int(re.sub('[^0-9]', '', followersT))
 52 | 		following = int(re.sub('[^0-9]', '', followingT))
 53 | 
 54 | 		# Convert k to thousands and m to millions
 55 | 		if 'k' in postsT: 	  posts     = posts     * 1000
 56 | 		if 'k' in followersT: followers = followers * 1000
 57 | 		if 'k' in followingT: following = following * 1000
 58 | 		if 'm' in postsT: 	  posts     = posts     * 1000000
 59 | 		if 'm' in followersT: followers = followers * 1000000
 60 | 		if 'm' in followingT: following = following * 1000000
 61 | 
 62 | 		if posts > 12:
 63 | 			# Click the 'Load more' button
 64 | 			browser.find_element_by_xpath('/html/body/span/section/main/article/div/div[3]/a').click()
 65 | 
 66 | 		if posts > 24:
 67 | 			# Load more by scrolling to the bottom of the page
 68 | 			for i in range (0, (posts-24)//12):
 69 | 				browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
 70 | 				time.sleep(0.1)
 71 | 				browser.execute_script('window.scrollTo(0, 0)')
 72 | 				time.sleep(0.5)
 73 | 
 74 | 		browser.execute_script('window.scrollTo(0, 0)')
 75 | 
 76 | 		# Soup
 77 | 		soup = BeautifulSoup(browser.page_source, 'html.parser')
 78 | 
 79 | 		# User's photos statistics
 80 | 
 81 | 		links = []
 82 | 		for link in soup.html.body.span.section.main.article.findAll('a'):
 83 | 			if link.get('href')[:3] == '/p/': links.append(link.get('href'))
 84 | 
 85 | 		photosDic = []
 86 | 		pLikesT = 0
 87 | 		pCounter = 0
 88 | 
 89 | 		for link in links:
 90 | 			# Photo Id
 91 | 			pId = link.split("/")[2]
 92 | 			# Hover over a photo reveals Likes & Comments
 93 | 			time.sleep(0.2)
 94 | 			photo = browser.find_element_by_xpath('//a[contains(@href, "' + pId + '")]')
 95 | 			time.sleep(0.2)
 96 | 			ActionChains(browser).move_to_element(photo).perform()
 97 | 			# Soup
 98 | 			soup = BeautifulSoup(browser.page_source, 'html.parser')
 99 | 			# Likes
100 | 			pLikes    = int(re.sub('[^0-9]', '', soup.html.body.span.section.main.article.findAll('div', recursive=False)[0].findAll('div', recursive=False)[0].findAll('a')[pCounter].find('ul').findAll('li', recursive=False)[0].findAll('span', recursive=False)[1].getText()))
101 | 			# Comments
102 | 			pComments = int(re.sub('[^0-9]', '', soup.html.body.span.section.main.article.findAll('div', recursive=False)[0].findAll('div', recursive=False)[0].findAll('a')[pCounter].find('ul').findAll('li', recursive=False)[1].findAll('span', recursive=False)[1].getText()))
103 | 			# Photo dictionary
104 | 			photoDic = {
105 | 				'pId': pId,
106 | 				'pLikes': pLikes,
107 | 				'pComments': pComments
108 | 			}
109 | 			photosDic.append(photoDic)
110 | 			# Total likes
111 | 			pLikesT += pLikes
112 | 			# Simple counter
113 | 			pCounter += 1
114 | 
115 | 		# Dictionary
116 | 		userDic = {
117 | 			'username': user,
118 | 			'date': datetime.now().strftime(timeFormat),
119 | 			'data': {
120 | 				'posts': posts,
121 | 				'followers': followers,
122 | 				'following': following,
123 | 				'pLikesT': pLikesT,
124 | 				'photos': photosDic
125 | 			}
126 | 		}
127 | 
128 | 		# Add data to JSON
129 | 		iaDictionary.append(userDic)
130 | 		with open('InstAnalytics.json', 'w') as iaFile:
131 | 			json.dump(iaDictionary, iaFile, indent=4)
132 | 
133 | 		print '|', user
134 | 
135 | 	# Quit browser
136 | 	browser.quit()
137 | 
138 | 	# Remove ghostdriver.log
139 | 	if os.path.isfile('ghostdriver.log') == True:
140 | 		os.remove('ghostdriver.log')
141 | 
142 | 
143 | 
144 | # ----------------------------------------
145 | #  Main
146 | # ----------------------------------------
147 | 
148 | if __name__ == '__main__':
149 | 
150 | 	# Desired capabilities for PhantomJS
151 | 	dcap = dict(DesiredCapabilities.PHANTOMJS)
152 | 	dcap['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36'
153 | 
154 | 	timeFormat = "%Y-%m-%d"
155 | 
156 | 	# Check if the JSON file exists, otherwise create it
157 | 	if os.path.isfile('InstAnalytics.json') == False:
158 | 		iaDictionary = []
159 | 		with open('InstAnalytics.json', 'w') as iaFile:
160 | 			json.dump(iaDictionary, iaFile, indent=4)
161 | 
162 | 	print 'Scrapping data from', users, 'account(s) every day at 11pm\n'
163 | 	
164 | 	while True:
165 | 		# Scheduled, every day at 11pm
166 | 		if datetime.now().hour == 23:
167 | 			print datetime.now().strftime(timeFormat),
168 | 			try:
169 | 				InstAnalytics()
170 | 				time.sleep(82800) # Sleep for 23 hours
171 | 			except Exception, e:
172 | 				print 'Error', e
173 | 				time.sleep(30) # Retry after 30s
174 | 		else:
175 | 			time.sleep(60) # Check every minute
176 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # InstAnalytics
 2 | 
 3 | ## About
 4 | 
 5 | Each day, this Python script scraps the web version of Instagram, to get the number of posts, followers, following + likes and comments per photo from any **public** account. The data is then stored in a JSON file (`data/InstAnalytics.json`) so you can get track its growth.
 6 | 
 7 | As I made it to run on my Raspberry Pi, it uses PhantomJS, a lightweight headless browser (which is perfect for a RPi, especially in terms of resources consumption).
 8 | 
 9 | For more info, check out my [blog post](http://nbyim.com/monitor-instagram-accounts-without-using-api).
10 | 
11 | ## Requirements
12 | 
13 | Before you can run **InstAnalytics.py**, you will need to install a few Python dependencies.
14 | 
15 | Note: Python 2.7.9 and later (on the python2 series), and Python 3.4 and later include pip by default, so you may have pip already. Otherwise, you can install [easy_install](https://pythonhosted.org/setuptools/easy_install.html) `sudo apt-get install python-setuptools` to install [pip](https://pypi.python.org/pypi/pip) `sudo easy_install pip`.
16 | 
17 | - [BeautifulSoup4](https://pypi.python.org/pypi/beautifulsoup4), for parsing html: `pip install BeautifulSoup4`
18 | - [Selenium](http://www.seleniumhq.org/), for browser automation: `pip install Selenium`
19 | 
20 | PhantomJS:
21 | - On Windows, download the binary from the [official website](http://phantomjs.org) and put it in the same folder than **InstAnalytics.py**.
22 | - On OS X Yosemite, the binary provided by the PhantomJS crew doesn't work (*selenium.common.exceptions.WebDriverException: Message: 'Can not connect to GhostDriver'*). You can either compile it by yourself or download the binary provided by the awesome [eugene1g](https://github.com/eugene1g/phantomjs/releases). Then put it in the `/usr/local` folder.
23 | - It's the same for Raspbian : compile it and put it in the `/usr/bin` folder or download the binary provided by the awesome [spfaffly](https://github.com/spfaffly/phantomjs-linux-armv6l).
24 | 
25 | If you want to built your own binaries, here is the [build instructions](http://phantomjs.org/build.html) for PhantomJS.
26 | 
27 | If you plan to change the browser to Firefox or Chrome, edit the line `browser = webdriver.PhantomJS(desired_capabilities=dcap)` to `browser = webdriver.Firefox()` or `browser = webdriver.Chrome()`. To use Firefox you don't need anything more. For Chrome, first get the [webdriver](https://sites.google.com/a/chromium.org/chromedriver/downloads) then put it in the same folder than **InstAnalytics.py** if you are on Windows, or in the `/usr/local` folder if you are on OS X.
28 | 
29 | ## Configuration
30 | 
31 | Before you run **InstAnalytics.py**, edit the `users = ['yotta_life']` list to add as much as you want public Instagram accounts. It's that simple!
32 | 
33 | ## JSON output example
34 | 
35 | ```JSON
36 | 
37 | [
38 |     {
39 |         "username": "yotta_life", 
40 |         "date": "2016-04-21", 
41 |         "data": {
42 |             "following": 231, 
43 |             "followers": 649000, 
44 |             "pLikesT": 2029474, 
45 |             "posts": 608, 
46 |             "photos": [
47 |                 {
48 |                     "pId": "BEZLlc8sU_v", 
49 |                     "pLikes": 4486, 
50 |                     "pComments": 205
51 |                 }, 
52 |                 ...
53 |                 {
54 |                     "pId": "uuhChFMU92", 
55 |                     "pLikes": 282, 
56 |                     "pComments": 19
57 |                 }
58 |             ]
59 |         }
60 |     }
61 | ]
62 | ```
63 | 
64 | The complete output is available here: [https://gist.github.com/helloitsim/c3a6ce83e302a1279398b896c1c36ccb](https://gist.github.com/helloitsim/c3a6ce83e302a1279398b896c1c36ccb).
65 | 


--------------------------------------------------------------------------------