├── InstAnalytics.py └── README.md /InstAnalytics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from selenium import webdriver 5 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 6 | from selenium.webdriver.common.action_chains import ActionChains 7 | from bs4 import BeautifulSoup 8 | from datetime import datetime 9 | import json, time, os, re 10 | 11 | 12 | 13 | # List of users 14 | users = ['yotta_life'] 15 | 16 | 17 | 18 | # ---------------------------------------- 19 | # InstAnalytics function 20 | # ---------------------------------------- 21 | 22 | def InstAnalytics(): 23 | 24 | # Launch browser 25 | browser = webdriver.PhantomJS(desired_capabilities=dcap) 26 | 27 | for user in users: 28 | 29 | # Load JSON 30 | with open('InstAnalytics.json') as iaFile: 31 | iaDictionary = json.load(iaFile) 32 | 33 | # Backup JSON 34 | with open('InstAnalytics_backup.json', 'w') as iaFile: 35 | json.dump(iaDictionary, iaFile, indent=4) 36 | 37 | # User's profile 38 | browser.get('https://instagram.com/' + user) 39 | time.sleep(0.5) 40 | 41 | # Soup 42 | soup = BeautifulSoup(browser.page_source, 'html.parser') 43 | 44 | # User's statistics 45 | postsT = soup.html.body.span.section.main.article.header.findAll('div', recursive=False)[1].ul.findAll('li', recursive=False)[0].span.findAll('span', recursive=False)[1].getText() 46 | followersT = soup.html.body.span.section.main.article.header.findAll('div', recursive=False)[1].ul.findAll('li', recursive=False)[1].span.findAll('span', recursive=False)[1].getText() 47 | followingT = soup.html.body.span.section.main.article.header.findAll('div', recursive=False)[1].ul.findAll('li', recursive=False)[2].span.findAll('span', recursive=False)[1].getText() 48 | 49 | # Remove all non-numeric characters 50 | posts = int(re.sub('[^0-9]', '', postsT)) 51 | followers = int(re.sub('[^0-9]', '', followersT)) 52 | following = int(re.sub('[^0-9]', '', followingT)) 53 | 54 | # Convert k to thousands and m to millions 55 | if 'k' in postsT: posts = posts * 1000 56 | if 'k' in followersT: followers = followers * 1000 57 | if 'k' in followingT: following = following * 1000 58 | if 'm' in postsT: posts = posts * 1000000 59 | if 'm' in followersT: followers = followers * 1000000 60 | if 'm' in followingT: following = following * 1000000 61 | 62 | if posts > 12: 63 | # Click the 'Load more' button 64 | browser.find_element_by_xpath('/html/body/span/section/main/article/div/div[3]/a').click() 65 | 66 | if posts > 24: 67 | # Load more by scrolling to the bottom of the page 68 | for i in range (0, (posts-24)//12): 69 | browser.execute_script('window.scrollTo(0, document.body.scrollHeight)') 70 | time.sleep(0.1) 71 | browser.execute_script('window.scrollTo(0, 0)') 72 | time.sleep(0.5) 73 | 74 | browser.execute_script('window.scrollTo(0, 0)') 75 | 76 | # Soup 77 | soup = BeautifulSoup(browser.page_source, 'html.parser') 78 | 79 | # User's photos statistics 80 | 81 | links = [] 82 | for link in soup.html.body.span.section.main.article.findAll('a'): 83 | if link.get('href')[:3] == '/p/': links.append(link.get('href')) 84 | 85 | photosDic = [] 86 | pLikesT = 0 87 | pCounter = 0 88 | 89 | for link in links: 90 | # Photo Id 91 | pId = link.split("/")[2] 92 | # Hover over a photo reveals Likes & Comments 93 | time.sleep(0.2) 94 | photo = browser.find_element_by_xpath('//a[contains(@href, "' + pId + '")]') 95 | time.sleep(0.2) 96 | ActionChains(browser).move_to_element(photo).perform() 97 | # Soup 98 | soup = BeautifulSoup(browser.page_source, 'html.parser') 99 | # Likes 100 | pLikes = int(re.sub('[^0-9]', '', soup.html.body.span.section.main.article.findAll('div', recursive=False)[0].findAll('div', recursive=False)[0].findAll('a')[pCounter].find('ul').findAll('li', recursive=False)[0].findAll('span', recursive=False)[1].getText())) 101 | # Comments 102 | pComments = int(re.sub('[^0-9]', '', soup.html.body.span.section.main.article.findAll('div', recursive=False)[0].findAll('div', recursive=False)[0].findAll('a')[pCounter].find('ul').findAll('li', recursive=False)[1].findAll('span', recursive=False)[1].getText())) 103 | # Photo dictionary 104 | photoDic = { 105 | 'pId': pId, 106 | 'pLikes': pLikes, 107 | 'pComments': pComments 108 | } 109 | photosDic.append(photoDic) 110 | # Total likes 111 | pLikesT += pLikes 112 | # Simple counter 113 | pCounter += 1 114 | 115 | # Dictionary 116 | userDic = { 117 | 'username': user, 118 | 'date': datetime.now().strftime(timeFormat), 119 | 'data': { 120 | 'posts': posts, 121 | 'followers': followers, 122 | 'following': following, 123 | 'pLikesT': pLikesT, 124 | 'photos': photosDic 125 | } 126 | } 127 | 128 | # Add data to JSON 129 | iaDictionary.append(userDic) 130 | with open('InstAnalytics.json', 'w') as iaFile: 131 | json.dump(iaDictionary, iaFile, indent=4) 132 | 133 | print '|', user 134 | 135 | # Quit browser 136 | browser.quit() 137 | 138 | # Remove ghostdriver.log 139 | if os.path.isfile('ghostdriver.log') == True: 140 | os.remove('ghostdriver.log') 141 | 142 | 143 | 144 | # ---------------------------------------- 145 | # Main 146 | # ---------------------------------------- 147 | 148 | if __name__ == '__main__': 149 | 150 | # Desired capabilities for PhantomJS 151 | dcap = dict(DesiredCapabilities.PHANTOMJS) 152 | dcap['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36' 153 | 154 | timeFormat = "%Y-%m-%d" 155 | 156 | # Check if the JSON file exists, otherwise create it 157 | if os.path.isfile('InstAnalytics.json') == False: 158 | iaDictionary = [] 159 | with open('InstAnalytics.json', 'w') as iaFile: 160 | json.dump(iaDictionary, iaFile, indent=4) 161 | 162 | print 'Scrapping data from', users, 'account(s) every day at 11pm\n' 163 | 164 | while True: 165 | # Scheduled, every day at 11pm 166 | if datetime.now().hour == 23: 167 | print datetime.now().strftime(timeFormat), 168 | try: 169 | InstAnalytics() 170 | time.sleep(82800) # Sleep for 23 hours 171 | except Exception, e: 172 | print 'Error', e 173 | time.sleep(30) # Retry after 30s 174 | else: 175 | time.sleep(60) # Check every minute 176 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # InstAnalytics 2 | 3 | ## About 4 | 5 | Each day, this Python script scraps the web version of Instagram, to get the number of posts, followers, following + likes and comments per photo from any **public** account. The data is then stored in a JSON file (`data/InstAnalytics.json`) so you can get track its growth. 6 | 7 | As I made it to run on my Raspberry Pi, it uses PhantomJS, a lightweight headless browser (which is perfect for a RPi, especially in terms of resources consumption). 8 | 9 | For more info, check out my [blog post](http://nbyim.com/monitor-instagram-accounts-without-using-api). 10 | 11 | ## Requirements 12 | 13 | Before you can run **InstAnalytics.py**, you will need to install a few Python dependencies. 14 | 15 | Note: Python 2.7.9 and later (on the python2 series), and Python 3.4 and later include pip by default, so you may have pip already. Otherwise, you can install [easy_install](https://pythonhosted.org/setuptools/easy_install.html) `sudo apt-get install python-setuptools` to install [pip](https://pypi.python.org/pypi/pip) `sudo easy_install pip`. 16 | 17 | - [BeautifulSoup4](https://pypi.python.org/pypi/beautifulsoup4), for parsing html: `pip install BeautifulSoup4` 18 | - [Selenium](http://www.seleniumhq.org/), for browser automation: `pip install Selenium` 19 | 20 | PhantomJS: 21 | - On Windows, download the binary from the [official website](http://phantomjs.org) and put it in the same folder than **InstAnalytics.py**. 22 | - On OS X Yosemite, the binary provided by the PhantomJS crew doesn't work (*selenium.common.exceptions.WebDriverException: Message: 'Can not connect to GhostDriver'*). You can either compile it by yourself or download the binary provided by the awesome [eugene1g](https://github.com/eugene1g/phantomjs/releases). Then put it in the `/usr/local` folder. 23 | - It's the same for Raspbian : compile it and put it in the `/usr/bin` folder or download the binary provided by the awesome [spfaffly](https://github.com/spfaffly/phantomjs-linux-armv6l). 24 | 25 | If you want to built your own binaries, here is the [build instructions](http://phantomjs.org/build.html) for PhantomJS. 26 | 27 | If you plan to change the browser to Firefox or Chrome, edit the line `browser = webdriver.PhantomJS(desired_capabilities=dcap)` to `browser = webdriver.Firefox()` or `browser = webdriver.Chrome()`. To use Firefox you don't need anything more. For Chrome, first get the [webdriver](https://sites.google.com/a/chromium.org/chromedriver/downloads) then put it in the same folder than **InstAnalytics.py** if you are on Windows, or in the `/usr/local` folder if you are on OS X. 28 | 29 | ## Configuration 30 | 31 | Before you run **InstAnalytics.py**, edit the `users = ['yotta_life']` list to add as much as you want public Instagram accounts. It's that simple! 32 | 33 | ## JSON output example 34 | 35 | ```JSON 36 | 37 | [ 38 | { 39 | "username": "yotta_life", 40 | "date": "2016-04-21", 41 | "data": { 42 | "following": 231, 43 | "followers": 649000, 44 | "pLikesT": 2029474, 45 | "posts": 608, 46 | "photos": [ 47 | { 48 | "pId": "BEZLlc8sU_v", 49 | "pLikes": 4486, 50 | "pComments": 205 51 | }, 52 | ... 53 | { 54 | "pId": "uuhChFMU92", 55 | "pLikes": 282, 56 | "pComments": 19 57 | } 58 | ] 59 | } 60 | } 61 | ] 62 | ``` 63 | 64 | The complete output is available here: [https://gist.github.com/helloitsim/c3a6ce83e302a1279398b896c1c36ccb](https://gist.github.com/helloitsim/c3a6ce83e302a1279398b896c1c36ccb). 65 | --------------------------------------------------------------------------------