├── .gitignore
├── .idea
├── .gitignore
├── Instagram-Comments-Scraper.iml
├── inspectionProfiles
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
└── vcs.xml
├── LICENSE.md
├── README.md
├── __pycache__
└── excel_exporter.cpython-36.pyc
├── _config.yml
├── excel_exporter.py
├── requirements.txt
└── scraper.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.xlsx
3 | __pycache__/
4 | ~lock.*
5 | chromedriver_linux64.zip
6 | .venv
7 | .idea
8 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Default ignored files
3 | /workspace.xml
--------------------------------------------------------------------------------
/.idea/Instagram-Comments-Scraper.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Agi Maulana
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | > I don't actively maintain this repository.
2 | >
3 | >
4 | > Why doesn't this scraper work on my side?
5 | >
6 | > As I have observed, the current Instagram website can respond with different elements or pages, so this scraper might not work for everyone.
7 |
8 | # Instagram Comments Scraper
9 |
10 | ## Installation
11 | 1. Clone:
12 | `git clone git@github.com:AgiMaulana/Instagram-Comments-Scraper.git`
13 |
14 | or `git clone https://github.com/AgiMaulana/Instagram-Comments-Scraper.git`
15 |
16 | or download the [zip](https://github.com/AgiMaulana/Instagram-Comments-Scraper/archive/master.zip)
17 |
18 | 2. Create Virtual Environment (Recommended)
19 | - `pip install virtualenv`
20 | - `virtualenv .venv`
21 |
22 | 3. Activate the virtual environment
23 | - `source .venv/bin/activate`
24 |
25 | 4. Install dependencies
26 | - `pip install -r requirements.txt`
27 |
28 | 5. Login
29 | - `username.send_keys ('USER-NAME')` replace with your username
30 | - `password.send_keys('PASSWORD')` replace with your password
31 | - We don't store your password
32 |
33 | 6. Run
34 | - `python scraper.py post-url total-load-more-click`
35 |
36 | Change the URL with your post target.
37 | For example: `python scraper.py https://www.instagram.com/p/CBHH2KjI6BW/ 5`
38 |
39 | 7. Deactivate the virtual environment
40 | - `deactivate`
41 |
42 | ## License
43 | This project is under the [MIT License](https://github.com/AgiMaulana/instagram-comments-scraper/blob/master/LICENSE.md)
44 |
--------------------------------------------------------------------------------
/__pycache__/excel_exporter.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AgiMaulana/Instagram-Comments-Scraper/6b6b1a13ae783b45ebae9031aa5b7e9ab6e02363/__pycache__/excel_exporter.cpython-36.pyc
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/excel_exporter.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pandas import ExcelWriter
3 | import os.path
4 |
def export(names, comments):
    """Write parallel lists of commenter names and comment texts to comments.xlsx.

    Args:
        names: list of commenter display names.
        comments: list of comment bodies, parallel to *names*.

    The file is overwritten on every call. The old commented-out logic that
    merged in rows from a previous run was dead code and has been removed.
    """
    fname = 'comments.xlsx'
    df = pd.DataFrame({'name': names, 'comment': comments})
    # index=False: the row index carries no information here and would
    # otherwise be written out as a spurious unnamed first column.
    df.to_excel(fname, index=False)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | selenium
2 | pandas
3 | xlrd
4 | openpyxl
--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
"""Scrape the comments of a single Instagram post and export them to Excel.

Usage:
    python scraper.py <post-url> <total-load-more-clicks>

Example:
    python scraper.py https://www.instagram.com/p/CBHH2KjI6BW/ 5
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import sys

import excel_exporter

# Instagram regenerates element ids on every page load, so the login fields
# are located by their stable `name` attributes and comments by class names.
# The "load more comments" button, however, is only reachable via this
# absolute XPath; it is hoisted here so it exists in exactly one place.
LOAD_MORE_XPATH = (
    '/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]'
    '/section/main/div/div[1]/div/div[2]/div/div[2]/div/div/ul/li/div/button'
)


def main():
    """Log in, open the target post, expand comments, and export them."""
    if len(sys.argv) < 3:
        print("Usage: python scraper.py <post-url> <total-load-more-clicks>")
        sys.exit(1)

    post_url = sys.argv[1]
    max_clicks = int(sys.argv[2])

    service = webdriver.FirefoxService(executable_path="path_to_geckodriver")
    driver = webdriver.Firefox(service=service)
    try:
        driver.get("https://www.instagram.com/")
        time.sleep(2)

        # Log in. Replace the placeholders with real credentials
        # (see the README); credentials are never stored.
        username = driver.find_element(By.NAME, "username")
        username.send_keys('USER-NAME')
        password = driver.find_element(By.NAME, "password")
        password.send_keys('PASSWORD')
        password.submit()
        time.sleep(10)

        driver.get(post_url)
        time.sleep(4)

        # Click "load more comments" up to max_clicks times.
        try:
            load_more_comment = driver.find_element(By.XPATH, LOAD_MORE_XPATH)
            print("Found {}".format(str(load_more_comment)))
            i = 0
            while load_more_comment.is_displayed() and i < max_clicks:
                load_more_comment.click()
                time.sleep(7)
                load_more_comment = driver.find_element(By.XPATH, LOAD_MORE_XPATH)
                print(i)
                print("Found {}".format(str(load_more_comment)))
                i += 1
        except Exception as e:
            # The button disappears once every comment is loaded (or the page
            # layout changed); either way, continue with what is visible.
            print(e)

        # Collect commenter names and comment texts from the loaded page.
        user_names = []
        user_comments = []
        for c in driver.find_elements(By.CLASS_NAME, '_a9ym'):
            container = c.find_element(By.CLASS_NAME, '_a9zr')
            name = container.find_element(By.CLASS_NAME, '_a9zc').text
            content = container.find_element(By.TAG_NAME, 'span').text
            user_names.append(name)
            user_comments.append(content.replace('\n', ' ').strip())

        # The first scraped entry is the post caption, not a comment — drop it.
        # Guarded so an empty result no longer crashes with IndexError.
        if user_names:
            user_names.pop(0)
            user_comments.pop(0)

        excel_exporter.export(user_names, user_comments)
    finally:
        # quit() (not close()) ends the whole WebDriver session even when an
        # earlier step raised, so no browser process is leaked.
        driver.quit()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------