├── requirements.txt
├── auth.json.example
├── utils
│   ├── set_path.sh
│   ├── get_phantomjs.sh
│   └── get_gecko.sh
├── .gitignore
├── test.sh
├── .travis.yml
├── LICENSE
├── README.md
└── instagramcrawler.py
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.21.0
selenium==3.4.0
--------------------------------------------------------------------------------
/auth.json.example:
--------------------------------------------------------------------------------
{
    "username": "",
    "password": ""
}
--------------------------------------------------------------------------------
/utils/set_path.sh:
--------------------------------------------------------------------------------

PATH=${PATH}:${PWD}/geckodriver
PATH=${PATH}:${PWD}/phantomjs-2.1.1-linux-x86_64/bin

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Default data directory
data

# prevent terrible mistakes
auth.json

# driver logs
geckodriver.log
ghostdriver.log
--------------------------------------------------------------------------------
/utils/get_phantomjs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# from https://stackoverflow.com/a/45273545/470341

wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2

--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
echo "Query account 'instagram', download 20 photos and their captions"
python instagramcrawler.py -q 'instagram' -n 20 -c

echo "Query hashtag '#breakfast' and download 20 photos"
python instagramcrawler.py -q '#breakfast' -n 20
--------------------------------------------------------------------------------
/utils/get_gecko.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# following travis.yml https://github.com/iammrhelo/InstagramCrawler/blob/master/.travis.yml#L11-L14

wget https://github.com/mozilla/geckodriver/releases/download/v0.16.0/geckodriver-v0.16.0-linux64.tar.gz
mkdir -p geckodriver && tar zxvf geckodriver-v0.16.0-linux64.tar.gz -C geckodriver

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "2.7"
  - "3.5"
addons:
  firefox: "53.0"
before_script:
  - "export DISPLAY=:99.0"
  - "sh -e /etc/init.d/xvfb start"
  - sleep 3 # give xvfb some time to start
before_install:
  - wget https://github.com/mozilla/geckodriver/releases/download/v0.16.0/geckodriver-v0.16.0-linux64.tar.gz
  - mkdir geckodriver && tar zxvf geckodriver-v0.16.0-linux64.tar.gz -C geckodriver
  - export PATH=$PATH:$PWD/geckodriver
install: "pip install -r requirements.txt"
script: sh test.sh
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Antonie Lin
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 05/03/2019 Repo is now archived.

I am now officially archiving this repo after a long time of, well, not maintaining it.

---
# InstagramCrawler
A non-API Python program to crawl public photos, posts, followers, and following

##### Login to crawl followers/following
To crawl followers or following, you will need to log in with your credentials, either by filling in 'auth.json' or by typing them in by hand (as you would when simply browsing Instagram).

To set up 'auth.json', copy 'auth.json.example' to 'auth.json' and fill in your username and password.

##### PhantomJS for headless browser
To run with a headless browser, install [phantomjs](http://phantomjs.org/) and add '-l' to the arguments.

### Examples:
Download the first 100 photos and their captions (the user's posts, if any) from the username "instagram"

###### NOTE: When I ran this on the public account 'instagram', it somehow stopped at caption 29
```
$ python instagramcrawler.py -q 'instagram' -c -n 100
```
Search for the hashtag "#breakfast" and download the first 50 photos
```
$ python instagramcrawler.py -q '#breakfast' -n 50
```
Record the first 30 followers of the username "instagram" (requires login)
```
$ python instagramcrawler.py -q 'instagram' -t 'followers' -n 30 -a auth.json
```

### Full usage:
```
usage: instagramcrawler.py [-h] [-d DIR] [-q QUERY] [-t CRAWL_TYPE] [-n NUMBER] [-c] [-l] [-a AUTHENTICATION] [-f FIREFOX_PATH]
```
- [-d DIR]: the directory to save crawling results, default is './data/[query]'
- [-q QUERY]: username, or add '#' to search for a hashtag, e.g. 'username', '#hashtag'
- [-t CRAWL_TYPE]: crawl type, options: 'photos' | 'followers' | 'following'
- [-n NUMBER]: number of posts, followers, or following to crawl
- [-c]: add this flag to download captions (what the user wrote to describe their photos)
- [-a AUTHENTICATION]: path to a JSON file containing your Instagram credentials, see 'auth.json.example'
- [-l HEADLESS]: if set, uses the PhantomJS driver to run the script headless
- [-f FIREFOX_PATH]: path to the **binary** (not the script) of Firefox on your system (see this issue in Selenium https://github.com/SeleniumHQ/selenium/issues/3884#issuecomment-296988595); see the example commands just below
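
For example, you can re-run the hashtag query above headless with PhantomJS, or point the crawler at a specific Firefox binary when Selenium cannot find one by itself. This is only a sketch: the first command assumes phantomjs is already on your PATH, and the Firefox path in the second is an illustration, so use wherever Firefox actually lives on your system:
```
$ python instagramcrawler.py -q '#breakfast' -n 50 -l
$ python instagramcrawler.py -q '#breakfast' -n 50 -f /usr/bin/firefox
```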

### Installation
There are two required packages: selenium & requests

###### NOTE: I used selenium = 3.4 and geckodriver = 0.16 (which fixes a bug present in earlier geckodriver versions)
```
$ pip install -r requirements.txt
```

###### Optional: fetch geckodriver and phantomjs if they are not already present on your system
```
bash utils/get_gecko.sh
bash utils/get_phantomjs.sh
source utils/set_path.sh
```
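
Putting it all together, a possible end-to-end session from the repository root could look like the following. This is a sketch assuming a Linux shell, an installed Firefox (the geckodriver route rather than PhantomJS), and that you replace the placeholder credentials in auth.json with your own before the last step:
```
$ pip install -r requirements.txt
$ bash utils/get_gecko.sh
$ source utils/set_path.sh
$ cp auth.json.example auth.json   # then edit auth.json and fill in your credentials
$ python instagramcrawler.py -q 'instagram' -t 'followers' -n 30 -a auth.json
```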

--------------------------------------------------------------------------------
/instagramcrawler.py:
--------------------------------------------------------------------------------
from __future__ import division

import argparse
import codecs
from collections import defaultdict
import json
import os
import re
import sys
import time
try:
    from urlparse import urljoin
    from urllib import urlretrieve
except ImportError:
    from urllib.parse import urljoin
    from urllib.request import urlretrieve

import requests
import selenium
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# HOST
HOST = 'http://www.instagram.com'

# SELENIUM CSS SELECTOR
CSS_LOAD_MORE = "a._1cr2e._epyes"
CSS_RIGHT_ARROW = "a[class='_de018 coreSpriteRightPaginationArrow']"
FIREFOX_FIRST_POST_PATH = "//div[contains(@class, '_8mlbc _vbtk2 _t5r8b')]"
TIME_TO_CAPTION_PATH = "../../../div/ul/li/span"

# FOLLOWERS/FOLLOWING RELATED
CSS_EXPLORE = "a[href='/explore/']"
CSS_LOGIN = "a[href='/accounts/login/']"
CSS_FOLLOWERS = "a[href='/{}/followers/']"
CSS_FOLLOWING = "a[href='/{}/following/']"
FOLLOWER_PATH = "//div[contains(text(), 'Followers')]"
FOLLOWING_PATH = "//div[contains(text(), 'Following')]"

# JAVASCRIPT COMMANDS
SCROLL_UP = "window.scrollTo(0, 0);"
SCROLL_DOWN = "window.scrollTo(0, document.body.scrollHeight);"


class url_change(object):
    """
    Used for caption scraping
    """
    def __init__(self, prev_url):
        self.prev_url = prev_url

    def __call__(self, driver):
        return self.prev_url != driver.current_url


class InstagramCrawler(object):
    """
    Crawler class
    """
    def __init__(self, headless=True, firefox_path=None):
        if headless:
            print("headless mode on")
            self._driver = webdriver.PhantomJS()
        else:
            # credit to https://github.com/SeleniumHQ/selenium/issues/3884#issuecomment-296990844
            binary = FirefoxBinary(firefox_path)
            self._driver = webdriver.Firefox(firefox_binary=binary)

        self._driver.implicitly_wait(10)
        self.data = defaultdict(list)

    def login(self, authentication=None):
        """
        authentication: path to authentication json file
        """
        self._driver.get(urljoin(HOST, "accounts/login/"))

        if authentication:
            print("Username and password loaded from {}".format(authentication))
            with open(authentication, 'r') as fin:
                auth_dict = json.loads(fin.read())
            # Input username
            username_input = WebDriverWait(self._driver, 5).until(
                EC.presence_of_element_located((By.NAME, 'username'))
            )
            username_input.send_keys(auth_dict['username'])
            # Input password
            password_input = WebDriverWait(self._driver, 5).until(
                EC.presence_of_element_located((By.NAME, 'password'))
            )
            password_input.send_keys(auth_dict['password'])
            # Submit
            password_input.submit()
        else:
            print("Type your username and password by hand to login!")
            print("You have a minute to do so!")

        print("")
        WebDriverWait(self._driver, 60).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, CSS_EXPLORE))
        )

    def quit(self):
        self._driver.quit()

    def crawl(self, dir_prefix, query, crawl_type, number, caption, authentication):
        print("dir_prefix: {}, query: {}, crawl_type: {}, number: {}, caption: {}, authentication: {}"
              .format(dir_prefix, query, crawl_type, number, caption, authentication))

        if crawl_type == "photos":
            # Browse target page
            self.browse_target_page(query)
            # Scroll down until target number of photos is reached
            self.scroll_to_num_of_posts(number)
            # Scrape photo links
            self.scrape_photo_links(number, is_hashtag=query.startswith("#"))
            # Scrape captions if specified
            if caption is True:
                self.click_and_scrape_captions(number)

        elif crawl_type in ["followers", "following"]:
            # Need to login first before crawling followers/following
            print("You will need to login to crawl {}".format(crawl_type))
            self.login(authentication)

            # Then browse target page
            assert not query.startswith(
                '#'), "Hashtag does not have followers/following!"
            self.browse_target_page(query)
            # Scrape followers or following
            self.scrape_followers_or_following(crawl_type, query, number)
        else:
            print("Unknown crawl type: {}".format(crawl_type))
            self.quit()
            return
        # Save to directory
        print("Saving...")
        self.download_and_save(dir_prefix, query, crawl_type)

        # Quit driver
        print("Quitting driver...")
        self.quit()

    def browse_target_page(self, query):
        # Browse Hashtags
        if query.startswith('#'):
            relative_url = urljoin('explore/tags/', query.strip('#'))
        else:  # Browse user page
            relative_url = query

        target_url = urljoin(HOST, relative_url)

        self._driver.get(target_url)

    def scroll_to_num_of_posts(self, number):
        # Get total number of posts of page
        num_info = re.search(r'\], "count": \d+',
                             self._driver.page_source).group()
        num_of_posts = int(re.findall(r'\d+', num_info)[0])
        print("posts: {}, number: {}".format(num_of_posts, number))
        number = number if number < num_of_posts else num_of_posts

        # scroll page until reached
        loadmore = WebDriverWait(self._driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, CSS_LOAD_MORE))
        )
        loadmore.click()

        num_to_scroll = int((number - 12) / 12) + 1
        for _ in range(num_to_scroll):
            self._driver.execute_script(SCROLL_DOWN)
            time.sleep(0.2)
            self._driver.execute_script(SCROLL_UP)
            time.sleep(0.2)

    def scrape_photo_links(self, number, is_hashtag=False):
        print("Scraping photo links...")
        encased_photo_links = re.finditer(r'src="([https]+:...[\/\w \.-]*..[\/\w \.-]*'
                                          r'..[\/\w \.-]*..[\/\w \.-].jpg)', self._driver.page_source)

        photo_links = [m.group(1) for m in encased_photo_links]

        print("Number of photo_links: {}".format(len(photo_links)))

        begin = 0 if is_hashtag else 1

        self.data['photo_links'] = photo_links[begin:number + begin]

    def click_and_scrape_captions(self, number):
        print("Scraping captions...")
        captions = []

        for post_num in range(number):
            sys.stdout.write("\033[F")
            print("Scraping captions {} / {}".format(post_num + 1, number))
            if post_num == 0:  # Click on the first post
                # Chrome
                # self._driver.find_element_by_class_name('_ovg3g').click()
                self._driver.find_element_by_xpath(
                    FIREFOX_FIRST_POST_PATH).click()

                if number != 1:
                    WebDriverWait(self._driver, 5).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, CSS_RIGHT_ARROW)
                        )
                    )

            elif number != 1:  # Click Right Arrow to move to next post
                url_before = self._driver.current_url
                self._driver.find_element_by_css_selector(
                    CSS_RIGHT_ARROW).click()

                # Wait until the page has loaded
                try:
                    WebDriverWait(self._driver, 10).until(
                        url_change(url_before))
                except TimeoutException:
                    print("Time out in caption scraping at number {}".format(post_num))
                    break

            # Parse caption
            try:
                time_element = WebDriverWait(self._driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "time"))
                )
                caption = time_element.find_element_by_xpath(
                    TIME_TO_CAPTION_PATH).text
            except NoSuchElementException:  # Forbidden
                print("Caption not found in the {} photo".format(post_num))
                caption = ""

            captions.append(caption)

        self.data['captions'] = captions

    def scrape_followers_or_following(self, crawl_type, query, number):
        print("Scraping {}...".format(crawl_type))
        if crawl_type == "followers":
            FOLLOW_ELE = CSS_FOLLOWERS
            FOLLOW_PATH = FOLLOWER_PATH
        elif crawl_type == "following":
            FOLLOW_ELE = CSS_FOLLOWING
            FOLLOW_PATH = FOLLOWING_PATH

        # Locate follow list
        follow_ele = WebDriverWait(self._driver, 5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, FOLLOW_ELE.format(query)))
        )

        # when no number defined, check the total items
        if number == 0:
            number = int(''.join(filter(str.isdigit, str(follow_ele.text))))
            print("getting all " + str(number) + " items")

        # open desired list
        follow_ele.click()

        title_ele = WebDriverWait(self._driver, 5).until(
            EC.presence_of_element_located(
                (By.XPATH, FOLLOW_PATH))
        )
        List = title_ele.find_element_by_xpath(
            '..').find_element_by_tag_name('ul')
        List.click()

        # Loop through list till target number is reached
        num_of_shown_follow = len(List.find_elements_by_xpath('*'))
        while len(List.find_elements_by_xpath('*')) < number:
            element = List.find_elements_by_xpath('*')[-1]
            # Work around for now => should use selenium's Expected Conditions!
            try:
                element.send_keys(Keys.PAGE_DOWN)
            except Exception:
                time.sleep(0.1)

        follow_items = []
        for ele in List.find_elements_by_xpath('*')[:number]:
            follow_items.append(ele.text.split('\n')[0])

        self.data[crawl_type] = follow_items

    def download_and_save(self, dir_prefix, query, crawl_type):
        # Check if is hashtag
        dir_name = query.lstrip(
            '#') + '.hashtag' if query.startswith('#') else query

        dir_path = os.path.join(dir_prefix, dir_name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        print("Saving to directory: {}".format(dir_path))

        # Save Photos
        for idx, photo_link in enumerate(self.data['photo_links'], 0):
            sys.stdout.write("\033[F")
            print("Downloading {} images to {}".format(idx + 1, dir_path))
            # Filename
            _, ext = os.path.splitext(photo_link)
            filename = str(idx) + ext
            filepath = os.path.join(dir_path, filename)
            # Send image request
            urlretrieve(photo_link, filepath)

        # Save Captions
        for idx, caption in enumerate(self.data['captions'], 0):

            filename = str(idx) + '.txt'
            filepath = os.path.join(dir_path, filename)

            with codecs.open(filepath, 'w', encoding='utf-8') as fout:
                fout.write(caption + '\n')

        # Save followers/following
        filename = crawl_type + '.txt'
        filepath = os.path.join(dir_path, filename)
        if len(self.data[crawl_type]):
            with codecs.open(filepath, 'w', encoding='utf-8') as fout:
                for fol in self.data[crawl_type]:
                    fout.write(fol + '\n')


def main():
    # Arguments #
    parser = argparse.ArgumentParser(description='Instagram Crawler')
    parser.add_argument('-d', '--dir_prefix', type=str,
                        default='./data/', help='directory to save results')
    parser.add_argument('-q', '--query', type=str, default='instagram',
                        help="target to crawl, add '#' for hashtags")
    parser.add_argument('-t', '--crawl_type', type=str,
                        default='photos', help="Options: 'photos' | 'followers' | 'following'")
    parser.add_argument('-n', '--number', type=int, default=0,
                        help='Number of posts to download: integer')
    parser.add_argument('-c', '--caption', action='store_true',
                        help='Add this flag to download caption when downloading photos')
    parser.add_argument('-l', '--headless', action='store_true',
                        help='If set, will use PhantomJS driver to run script as headless')
    parser.add_argument('-a', '--authentication', type=str, default=None,
                        help='path to authentication json file')
    parser.add_argument('-f', '--firefox_path', type=str, default=None,
                        help='path to the Firefox binary (not the launcher script)')
    args = parser.parse_args()
    # End Argparse #

    crawler = InstagramCrawler(headless=args.headless, firefox_path=args.firefox_path)
    crawler.crawl(dir_prefix=args.dir_prefix,
                  query=args.query,
                  crawl_type=args.crawl_type,
                  number=args.number,
                  caption=args.caption,
                  authentication=args.authentication)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------