├── requirements.txt
├── auth.json.example
├── utils
│   ├── set_path.sh
│   ├── get_phantomjs.sh
│   └── get_gecko.sh
├── .gitignore
├── test.sh
├── .travis.yml
├── LICENSE
├── README.md
└── instagramcrawler.py
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.21.0
selenium==3.4.0
--------------------------------------------------------------------------------
/auth.json.example:
--------------------------------------------------------------------------------
{
    "username": "",
    "password": ""
}
--------------------------------------------------------------------------------
/utils/set_path.sh:
--------------------------------------------------------------------------------

PATH=${PATH}:${PWD}/geckodriver
PATH=${PATH}:${PWD}/phantomjs-2.1.1-linux-x86_64/bin

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Default data directory
data

# prevent terrible mistakes
auth.json

# driver logs
geckodriver.log
ghostdriver.log
--------------------------------------------------------------------------------
/utils/get_phantomjs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# from https://stackoverflow.com/a/45273545/470341

wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2

--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
echo "Query account 'instagram', download 20 photos and their captions"
python instagramcrawler.py -q 'instagram' -n 20 -c

echo "Query hashtag '#breakfast' and download 20 photos"
python instagramcrawler.py -q '#breakfast' -n 20
--------------------------------------------------------------------------------
/utils/get_gecko.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# following travis.yml https://github.com/iammrhelo/InstagramCrawler/blob/master/.travis.yml#L11-L14

wget https://github.com/mozilla/geckodriver/releases/download/v0.16.0/geckodriver-v0.16.0-linux64.tar.gz
mkdir -p geckodriver && tar zxvf geckodriver-v0.16.0-linux64.tar.gz -C geckodriver

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "2.7"
  - "3.5"
addons:
  firefox: "53.0"
before_script:
  - "export DISPLAY=:99.0"
  - "sh -e /etc/init.d/xvfb start"
  - sleep 3 # give xvfb some time to start
before_install:
  - wget https://github.com/mozilla/geckodriver/releases/download/v0.16.0/geckodriver-v0.16.0-linux64.tar.gz
  - mkdir geckodriver && tar zxvf geckodriver-v0.16.0-linux64.tar.gz -C geckodriver
  - export PATH=$PATH:$PWD/geckodriver
install: "pip install -r requirements.txt"
script: sh test.sh
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Antonie Lin
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 05/03/2019 Repo is now archived.

I am now officially archiving this repo after a long time of, well, not maintaining it.

---
# InstagramCrawler
A non-API Python program to crawl public photos, posts, followers, and following

##### Login to crawl followers/following
To crawl followers or following, you will need to log in with your credentials, either by filling in 'auth.json' or by typing them in by hand (as you would when simply browsing Instagram).

To set up 'auth.json', copy 'auth.json.example' to 'auth.json' and fill in your username and password.

##### PhantomJS for headless browser
To run with a headless browser, install [phantomjs](http://phantomjs.org/) and add '-l' to the arguments.

### Examples:
Download the first 100 photos and their captions (the user's posts, if any) from the username "instagram"

###### NOTE: When I ran this on the public account 'instagram', it somehow stopped at caption 29
```
$ python instagramcrawler.py -q 'instagram' -c -n 100
```
Search for the hashtag "#breakfast" and download the first 50 photos
```
$ python instagramcrawler.py -q '#breakfast' -n 50
```
Record the first 30 followers of the username "instagram" (requires login)
```
$ python instagramcrawler.py -q 'instagram' -t 'followers' -n 30 -a auth.json
```

### Full usage:
```
usage: instagramcrawler.py [-h] [-d DIR] [-q QUERY] [-t CRAWL_TYPE] [-n NUMBER] [-c] [-l] [-a AUTHENTICATION] [-f FIREFOX_PATH]
```
- [-d DIR]: the directory to save crawling results, default is './data/[query]'
- [-q QUERY]: username, or add '#' to search for a hashtag, e.g. 'username', '#hashtag'
- [-t CRAWL_TYPE]: crawl type, options: 'photos' | 'followers' | 'following'
- [-n NUMBER]: number of posts, followers, or following to crawl
- [-c]: add this flag to download captions (what the user wrote to describe their photos)
- [-a AUTHENTICATION]: path to a JSON file containing your Instagram credentials, see 'auth.json.example'
- [-l HEADLESS]: if set, uses the PhantomJS driver to run the script headless
- [-f FIREFOX_PATH]: path to the **binary** (not the script) of Firefox on your system (see this issue in Selenium https://github.com/SeleniumHQ/selenium/issues/3884#issuecomment-296988595); see the example commands just below
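
For example, you can re-run the hashtag query above headless with PhantomJS, or point the crawler at a specific Firefox binary when Selenium cannot find one by itself. This is only a sketch: the first command assumes phantomjs is already on your PATH, and the Firefox path in the second is an illustration, so use wherever Firefox actually lives on your system:
```
$ python instagramcrawler.py -q '#breakfast' -n 50 -l
$ python instagramcrawler.py -q '#breakfast' -n 50 -f /usr/bin/firefox
```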

### Installation
There are two required packages: selenium & requests

###### NOTE: I used selenium = 3.4 and geckodriver = 0.16 (which fixes a bug present in earlier geckodriver versions)
```
$ pip install -r requirements.txt
```

###### Optional: fetch geckodriver and phantomjs if they are not already present on your system
```
bash utils/get_gecko.sh
bash utils/get_phantomjs.sh
source utils/set_path.sh
```
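
Putting it all together, a possible end-to-end session from the repository root could look like the following. This is a sketch assuming a Linux shell, an installed Firefox (the geckodriver route rather than PhantomJS), and that you replace the placeholder credentials in auth.json with your own before the last step:
```
$ pip install -r requirements.txt
$ bash utils/get_gecko.sh
$ source utils/set_path.sh
$ cp auth.json.example auth.json   # then edit auth.json and fill in your credentials
$ python instagramcrawler.py -q 'instagram' -t 'followers' -n 30 -a auth.json
```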

--------------------------------------------------------------------------------
/instagramcrawler.py:
--------------------------------------------------------------------------------
from __future__ import division

import argparse
import codecs
from collections import defaultdict
import json
import os
import re
import sys
import time
try:
    from urlparse import urljoin
    from urllib import urlretrieve
except ImportError:
    from urllib.parse import urljoin
    from urllib.request import urlretrieve

import requests
import selenium
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# HOST
HOST = 'http://www.instagram.com'

# SELENIUM CSS SELECTOR
CSS_LOAD_MORE = "a._1cr2e._epyes"
CSS_RIGHT_ARROW = "a[class='_de018 coreSpriteRightPaginationArrow']"
FIREFOX_FIRST_POST_PATH = "//div[contains(@class, '_8mlbc _vbtk2 _t5r8b')]"
TIME_TO_CAPTION_PATH = "../../../div/ul/li/span"

# FOLLOWERS/FOLLOWING RELATED
CSS_EXPLORE = "a[href='/explore/']"
CSS_LOGIN = "a[href='/accounts/login/']"
CSS_FOLLOWERS = "a[href='/{}/followers/']"
CSS_FOLLOWING = "a[href='/{}/following/']"
FOLLOWER_PATH = "//div[contains(text(), 'Followers')]"
FOLLOWING_PATH = "//div[contains(text(), 'Following')]"

# JAVASCRIPT COMMANDS
SCROLL_UP = "window.scrollTo(0, 0);"
SCROLL_DOWN = "window.scrollTo(0, document.body.scrollHeight);"


class url_change(object):
    """
    Used for caption scraping
    """
    def __init__(self, prev_url):
        self.prev_url = prev_url

    def __call__(self, driver):
        return self.prev_url != driver.current_url


class InstagramCrawler(object):
    """
    Crawler class
    """
    def __init__(self, headless=True, firefox_path=None):
        if headless:
            print("headless mode on")
            self._driver = webdriver.PhantomJS()
        else:
            # credit to https://github.com/SeleniumHQ/selenium/issues/3884#issuecomment-296990844
            binary = FirefoxBinary(firefox_path)
            self._driver = webdriver.Firefox(firefox_binary=binary)

        self._driver.implicitly_wait(10)
        self.data = defaultdict(list)

    def login(self, authentication=None):
        """
        authentication: path to authentication json file
        """
        self._driver.get(urljoin(HOST, "accounts/login/"))

        if authentication:
            print("Username and password loaded from {}".format(authentication))
            with open(authentication, 'r') as fin:
                auth_dict = json.loads(fin.read())
            # Input username
            username_input = WebDriverWait(self._driver, 5).until(
                EC.presence_of_element_located((By.NAME, 'username'))
            )
            username_input.send_keys(auth_dict['username'])
            # Input password
            password_input = WebDriverWait(self._driver, 5).until(
                EC.presence_of_element_located((By.NAME, 'password'))
            )
            password_input.send_keys(auth_dict['password'])
            # Submit
            password_input.submit()
        else:
            print("Type your username and password by hand to login!")
            print("You have a minute to do so!")

        print("")
        WebDriverWait(self._driver, 60).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, CSS_EXPLORE))
        )

    def quit(self):
        self._driver.quit()

    def crawl(self, dir_prefix, query, crawl_type, number, caption, authentication):
        print("dir_prefix: {}, query: {}, crawl_type: {}, number: {}, caption: {}, authentication: {}"
              .format(dir_prefix, query, crawl_type, number, caption, authentication))

        if crawl_type == "photos":
            # Browse target page
            self.browse_target_page(query)
            # Scroll down until target number of photos is reached
            self.scroll_to_num_of_posts(number)
            # Scrape photo links
            self.scrape_photo_links(number, is_hashtag=query.startswith("#"))
            # Scrape captions if specified
            if caption is True:
                self.click_and_scrape_captions(number)

        elif crawl_type in ["followers", "following"]:
            # Need to login first before crawling followers/following
            print("You will need to login to crawl {}".format(crawl_type))
            self.login(authentication)

            # Then browse target page
            assert not query.startswith(
                '#'), "Hashtag does not have followers/following!"
            self.browse_target_page(query)
            # Scrape followers or following
            self.scrape_followers_or_following(crawl_type, query, number)
        else:
            print("Unknown crawl type: {}".format(crawl_type))
            self.quit()
            return
        # Save to directory
        print("Saving...")
        self.download_and_save(dir_prefix, query, crawl_type)

        # Quit driver
        print("Quitting driver...")
        self.quit()

    def browse_target_page(self, query):
        # Browse Hashtags
        if query.startswith('#'):
            relative_url = urljoin('explore/tags/', query.strip('#'))
        else:  # Browse user page
            relative_url = query

        target_url = urljoin(HOST, relative_url)

        self._driver.get(target_url)

    def scroll_to_num_of_posts(self, number):
        # Get total number of posts of page
        num_info = re.search(r'\], "count": \d+',
                             self._driver.page_source).group()
        num_of_posts = int(re.findall(r'\d+', num_info)[0])
        print("posts: {}, number: {}".format(num_of_posts, number))
        number = number if number < num_of_posts else num_of_posts

        # scroll page until reached
        loadmore = WebDriverWait(self._driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, CSS_LOAD_MORE))
        )
        loadmore.click()

        num_to_scroll = int((number - 12) / 12) + 1
        for _ in range(num_to_scroll):
            self._driver.execute_script(SCROLL_DOWN)
            time.sleep(0.2)
            self._driver.execute_script(SCROLL_UP)
            time.sleep(0.2)

    def scrape_photo_links(self, number, is_hashtag=False):
        print("Scraping photo links...")
        encased_photo_links = re.finditer(r'src="([https]+:...[\/\w \.-]*..[\/\w \.-]*'
                                          r'..[\/\w \.-]*..[\/\w \.-].jpg)', self._driver.page_source)

        photo_links = [m.group(1) for m in encased_photo_links]

        print("Number of photo_links: {}".format(len(photo_links)))

        begin = 0 if is_hashtag else 1

        self.data['photo_links'] = photo_links[begin:number + begin]

    def click_and_scrape_captions(self, number):
        print("Scraping captions...")
        captions = []

        for post_num in range(number):
            sys.stdout.write("\033[F")
            print("Scraping captions {} / {}".format(post_num + 1, number))
            if post_num == 0:  # Click on the first post
                # Chrome
                # self._driver.find_element_by_class_name('_ovg3g').click()
                self._driver.find_element_by_xpath(
                    FIREFOX_FIRST_POST_PATH).click()

                if number != 1:
                    WebDriverWait(self._driver, 5).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, CSS_RIGHT_ARROW)
                        )
                    )

            elif number != 1:  # Click Right Arrow to move to next post
                url_before = self._driver.current_url
                self._driver.find_element_by_css_selector(
                    CSS_RIGHT_ARROW).click()

                # Wait until the page has loaded
                try:
                    WebDriverWait(self._driver, 10).until(
                        url_change(url_before))
                except TimeoutException:
                    print("Time out in caption scraping at number {}".format(post_num))
                    break

            # Parse caption
            try:
                time_element = WebDriverWait(self._driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "time"))
                )
                caption = time_element.find_element_by_xpath(
                    TIME_TO_CAPTION_PATH).text
            except NoSuchElementException:  # Forbidden
                print("Caption not found in the {} photo".format(post_num))
                caption = ""

            captions.append(caption)

        self.data['captions'] = captions

    def scrape_followers_or_following(self, crawl_type, query, number):
        print("Scraping {}...".format(crawl_type))
        if crawl_type == "followers":
            FOLLOW_ELE = CSS_FOLLOWERS
            FOLLOW_PATH = FOLLOWER_PATH
        elif crawl_type == "following":
            FOLLOW_ELE = CSS_FOLLOWING
            FOLLOW_PATH = FOLLOWING_PATH

        # Locate follow list
        follow_ele = WebDriverWait(self._driver, 5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, FOLLOW_ELE.format(query)))
        )

        # when no number defined, check the total items
        if number == 0:
            number = int(''.join(filter(str.isdigit, str(follow_ele.text))))
            print("getting all " + str(number) + " items")

        # open desired list
        follow_ele.click()

        title_ele = WebDriverWait(self._driver, 5).until(
            EC.presence_of_element_located(
                (By.XPATH, FOLLOW_PATH))
        )
        List = title_ele.find_element_by_xpath(
            '..').find_element_by_tag_name('ul')
        List.click()

        # Loop through list till target number is reached
        num_of_shown_follow = len(List.find_elements_by_xpath('*'))
        while len(List.find_elements_by_xpath('*')) < number:
            element = List.find_elements_by_xpath('*')[-1]
            # Work around for now => should use selenium's Expected Conditions!
            try:
                element.send_keys(Keys.PAGE_DOWN)
            except Exception:
                time.sleep(0.1)

        follow_items = []
        for ele in List.find_elements_by_xpath('*')[:number]:
            follow_items.append(ele.text.split('\n')[0])

        self.data[crawl_type] = follow_items

    def download_and_save(self, dir_prefix, query, crawl_type):
        # Check if is hashtag
        dir_name = query.lstrip(
            '#') + '.hashtag' if query.startswith('#') else query

        dir_path = os.path.join(dir_prefix, dir_name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        print("Saving to directory: {}".format(dir_path))

        # Save Photos
        for idx, photo_link in enumerate(self.data['photo_links'], 0):
            sys.stdout.write("\033[F")
            print("Downloading {} images to {}".format(idx + 1, dir_path))
            # Filename
            _, ext = os.path.splitext(photo_link)
            filename = str(idx) + ext
            filepath = os.path.join(dir_path, filename)
            # Send image request
            urlretrieve(photo_link, filepath)

        # Save Captions
        for idx, caption in enumerate(self.data['captions'], 0):

            filename = str(idx) + '.txt'
            filepath = os.path.join(dir_path, filename)

            with codecs.open(filepath, 'w', encoding='utf-8') as fout:
                fout.write(caption + '\n')

        # Save followers/following
        filename = crawl_type + '.txt'
        filepath = os.path.join(dir_path, filename)
        if len(self.data[crawl_type]):
            with codecs.open(filepath, 'w', encoding='utf-8') as fout:
                for fol in self.data[crawl_type]:
                    fout.write(fol + '\n')


def main():
    # Arguments #
    parser = argparse.ArgumentParser(description='Instagram Crawler')
    parser.add_argument('-d', '--dir_prefix', type=str,
                        default='./data/', help='directory to save results')
    parser.add_argument('-q', '--query', type=str, default='instagram',
                        help="target to crawl, add '#' for hashtags")
    parser.add_argument('-t', '--crawl_type', type=str,
                        default='photos', help="Options: 'photos' | 'followers' | 'following'")
    parser.add_argument('-n', '--number', type=int, default=0,
                        help='Number of posts to download: integer')
    parser.add_argument('-c', '--caption', action='store_true',
                        help='Add this flag to download caption when downloading photos')
    parser.add_argument('-l', '--headless', action='store_true',
                        help='If set, will use PhantomJS driver to run script as headless')
    parser.add_argument('-a', '--authentication', type=str, default=None,
                        help='path to authentication json file')
    parser.add_argument('-f', '--firefox_path', type=str, default=None,
                        help='path to the Firefox binary (not the launcher script)')
    args = parser.parse_args()
    # End Argparse #

    crawler = InstagramCrawler(headless=args.headless, firefox_path=args.firefox_path)
    crawler.crawl(dir_prefix=args.dir_prefix,
                  query=args.query,
                  crawl_type=args.crawl_type,
                  number=args.number,
                  caption=args.caption,
                  authentication=args.authentication)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------