├── .gitignore ├── .idea ├── .gitignore ├── Instagram-Comments-Scraper.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── LICENSE.md ├── README.md ├── __pycache__ └── excel_exporter.cpython-36.pyc ├── _config.yml ├── excel_exporter.py ├── requirements.txt └── scraper.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.xlsx 3 | __pycache__/ 4 | ~lock.* 5 | chromedriver_linux64.zip 6 | .venv 7 | .idea 8 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Default ignored files 3 | /workspace.xml -------------------------------------------------------------------------------- /.idea/Instagram-Comments-Scraper.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 14 | 15 | 16 | 18 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE.md: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Agi Maulana 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > I don't actively maintain this repository. 2 | > 3 | > 4 | > Why this scraper doesn't work on my side? 5 | > 6 | > As I observed on the current Instagram website the website could respond with different elements or pages so this scraper might not work for some people. 7 | 8 | # Instagram Comments Scraper 9 | 10 | ## Installation 11 | 1. 
Clone: 12 | `git clone git@github.com:AgiMaulana/Instagram-Comments-Scraper.git` 13 | 14 | or `git clone https://github.com/AgiMaulana/Instagram-Comments-Scraper.git` 15 | 16 | or download the [zip](https://github.com/AgiMaulana/Instagram-Comments-Scraper/archive/master.zip) 17 | 18 | 3. Create Virtual Environment (Recommended)
19 | - `pip install virtualenv` 20 | - `virtualenv .venv` 21 | 22 | 4. Activate the virtual environment 23 | - `source .venv/bin/activate` 24 | 25 | 5. Install dependencies 26 | - `pip install -r requirements.txt` 27 | 28 | 6. Login (edit these lines in `scraper.py`) 29 | - `username.send_keys ('USER-NAME')`: replace `USER-NAME` with your username 30 | - `password.send_keys('PASSWORD')`: replace `PASSWORD` with your password 31 | - We don't store your password 32 | 33 | 7. Run 34 | - `python scraper.py post-url total-load-more-click` 35 | 36 | Replace `post-url` with the URL of the target post and `total-load-more-click` with the number of times the "load more comments" button should be clicked.
import os.path

import pandas as pd
from pandas import ExcelWriter


def export(names, comments, fname='comments.xlsx'):
    """Write scraped comments to an Excel workbook.

    Builds a two-column table (``name``, ``comment``) in which row *i*
    pairs ``names[i]`` with ``comments[i]`` and saves it with
    ``DataFrame.to_excel``. An existing file at ``fname`` is overwritten,
    and the DataFrame index is written as the first column (pandas
    default).

    Args:
        names: Sequence of comment author names.
        comments: Sequence of comment texts, the same length as ``names``.
        fname: Path of the workbook to write. Defaults to
            ``'comments.xlsx'`` in the current working directory, which is
            what this function always used before the parameter existed.

    Raises:
        ValueError: Raised by pandas when ``names`` and ``comments`` differ
            in length.
    """
    df = pd.DataFrame({'name': names, 'comment': comments})
    df.to_excel(fname)
"""Scrape the comments of an Instagram post and export them to Excel.

Usage:
    python scraper.py post-url [total-load-more-click]

``post-url`` is the URL of the post to scrape. ``total-load-more-click`` is
the maximum number of times the "load more comments" button is clicked
(defaults to 0 when omitted).
"""
import sys
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

import excel_exporter

LOGIN_URL = "https://www.instagram.com/"

# Absolute XPath of the "load more comments" button. It mirrors the page
# layout at the time of writing, so it breaks whenever Instagram changes
# its markup; it is kept in one place so it only has to be updated once.
LOAD_MORE_XPATH = (
    '/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]'
    '/section/main/div/div[1]/div/div[2]/div/div[2]/div/div/ul/li/div/button'
)


def log_in(driver):
    """Open the Instagram home page and submit the login form."""
    driver.get(LOGIN_URL)
    time.sleep(2)

    # Element ids on the login page change on every reload, so the fields
    # are located by their ``name`` attribute instead of by XPath.
    username = driver.find_element(By.NAME, "username")
    username.send_keys('USER-NAME')

    password = driver.find_element(By.NAME, "password")
    password.send_keys('PASSWORD')
    password.submit()

    # Give the login request time to finish before navigating away.
    time.sleep(10)


def load_more_comments(driver, max_clicks):
    """Click the "load more comments" button up to ``max_clicks`` times.

    This is best effort: when the button is missing, goes stale or cannot
    be clicked, the Selenium error is printed and scraping continues with
    whatever comments are already on the page.
    """
    try:
        button = driver.find_element(By.XPATH, LOAD_MORE_XPATH)
        print("Found {}".format(button))
        clicks = 0
        while button.is_displayed() and clicks < max_clicks:
            button.click()
            time.sleep(7)
            # The button is re-rendered after each load, so look it up again.
            button = driver.find_element(By.XPATH, LOAD_MORE_XPATH)
            print(clicks)
            print("Found {}".format(button))
            clicks += 1
    except WebDriverException as e:
        print(e)


def collect_comments(driver):
    """Return ``(names, comments)`` for the comments currently on the page.

    The first scraped entry is discarded, as the original script did
    (presumably it is the post caption rather than a comment -- confirm
    against the live page). Both lists are empty when nothing was found.
    """
    user_names = []
    user_comments = []
    for item in driver.find_elements(By.CLASS_NAME, '_a9ym'):
        container = item.find_element(By.CLASS_NAME, '_a9zr')
        name = container.find_element(By.CLASS_NAME, '_a9zc').text
        content = container.find_element(By.TAG_NAME, 'span').text
        user_names.append(name)
        user_comments.append(content.replace('\n', ' ').strip())

    # Slicing instead of pop(0) keeps an empty result from raising IndexError.
    return user_names[1:], user_comments[1:]


def main(argv):
    """Run the scraper; ``argv`` has the same layout as ``sys.argv``.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    usage = "usage: python scraper.py post-url [total-load-more-click]"
    if len(argv) < 2:
        print(usage, file=sys.stderr)
        return 2
    post_url = argv[1]

    # Validate the click count before the browser is started so that a typo
    # does not cost a full login round-trip.
    try:
        max_clicks = int(argv[2]) if len(argv) > 2 else 0
    except ValueError:
        print("total-load-more-click must be an integer", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 2

    service = webdriver.FirefoxService(executable_path="path_to_geckodriver")
    driver = webdriver.Firefox(service=service)
    try:
        log_in(driver)

        driver.get(post_url)
        time.sleep(4)

        load_more_comments(driver, max_clicks)
        user_names, user_comments = collect_comments(driver)
        excel_exporter.export(user_names, user_comments)
    finally:
        # quit() ends the whole WebDriver session (browser and geckodriver),
        # and the finally clause runs even when a step above raises.
        driver.quit()
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))