├── README.md
├── Instagram Scraper - Comments Only.ipynb
└── Insta_scraper_V2.ipynb

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# instagram-photo-reel-webscraping
📷 💾 Python bulk instagram scraper for photos and videos using Selenium and BS4.
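
## Setup

Both notebooks log in with credentials read from a local `config.py` (imported as `import config`). A minimal sketch of that file, assuming only `username` and `password` are needed:

```python
# config.py -- hypothetical example; keep real credentials out of version control
username = "your_instagram_username"
password = "your_instagram_password"
```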
--------------------------------------------------------------------------------
/Instagram Scraper - Comments Only.ipynb:
--------------------------------------------------------------------------------
"    # Find the <pre> tag containing the JSON data\n",
"    \n",
"    # Extract the JSON data from the <pre> tag\n",
"\n",
"    # Parse the JSON data\n",
"    \n",
"    # Add json to the list\n",
"    \n",
"    # Error handling\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "395bd4f1",
"metadata": {},
"outputs": [],
"source": [
"# Lists to store URLs and corresponding dates\n",
"\n",
"# Iterate through each JSON payload in the list\n",
"    \n",
"    # Extract the list from the 'items' key\n",
"    \n",
"    \n",
"    # Iterate through each item in the 'items' list\n",
"    \n",
"        # Extract the date the item was taken\n",
"\n",
"        # Check if 'carousel_media' is present\n",
"        \n",
"        # Iterate through each media in the 'carousel_media' list\n",
"        \n",
"            # Extract the image URL from the media\n",
"            \n",
"            # Check if the image_url field is found inside the 'carousel_media' list\n",
"\n",
"                # Add the image URL and corresponding date to the lists\n",
"                \n",
"            # Extract the video URL from the media\n",
"            \n",
"                # Add the video URL and corresponding date to the lists\n",
"\n",
"        # Handle cases of a single image, instead of a carousel\n",
"        \n",
"            # Add the image URL and corresponding date to the lists\n",
"\n",
"        # Check if 'video_versions' key exists\n",
"        "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70173379",
"metadata": {},
"outputs": [],
"source": [
"# Create a directory to store downloaded files\n",
"\n",
"# Create subfolders for images and videos\n",
"\n",
"# Initialize counters for images and videos\n",
"\n",
"\n",
"# Iterate through URLs in the all_urls list and download media\n",
"\n",
"    # Extract file extension from the URL\n",
"\n",
"    # Determine the file name based on the URL\n",
"    \n",
"        # Default to the main download directory for other file types\n",
"\n",
"    # Save the file to the appropriate folder\n",
"\n",
"    \n",
"    # Write the content of the response to the file\n",
"    \n",
"\n",
"# Print a message indicating the number of downloaded files and the download directory"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
--------------------------------------------------------------------------------
/Insta_scraper_V2.ipynb:
--------------------------------------------------------------------------------
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "729a2f1e",
"metadata": {},
"outputs": [],
"source": [
"# Import dependencies\n",
"from selenium import webdriver\n",
"from selenium.webdriver.common.keys import Keys\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.wait import WebDriverWait\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.common.exceptions import NoSuchElementException, TimeoutException\n",
"import time\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import re\n",
"import config\n",
"import json\n",
"import os\n",
"from urllib.parse import urlparse\n",
"import csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "833b157f",
"metadata": {},
"outputs": [],
"source": [
"# Start a Chrome session (on Selenium 4.6+, Selenium Manager fetches a matching\n",
"# chromedriver automatically; on older versions, pass a Service with an explicit\n",
"# chromedriver path)\n",
"driver = webdriver.Chrome()\n",
"\n",
"# Open Instagram\n",
"driver.get(\"https://www.instagram.com/\")"
]
},
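{
"cell_type": "markdown",
"id": "3f8a2b9c",
"metadata": {},
"source": [
"`Options` is imported above but never used. A hedged sketch of how it could be wired in, e.g. to fix the window size or run headless (Instagram's login flow may behave differently headless). If used, this replaces the plain `webdriver.Chrome()` call in the previous cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d4e0f21",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch (assumes Selenium 4+): configure Chrome via Options\n",
"# instead of the bare webdriver.Chrome() call above.\n",
"options = Options()\n",
"options.add_argument(\"--window-size=1280,1000\")\n",
"# options.add_argument(\"--headless=new\")  # uncomment to run without a visible window\n",
"\n",
"# driver = webdriver.Chrome(options=options)  # uncomment to use these options"
]
},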
{
"cell_type": "code",
"execution_count": null,
"id": "fbe88437",
"metadata": {},
"outputs": [],
"source": [
"# Target the username and password fields\n",
"username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, \"input[name='username']\")))\n",
"password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, \"input[name='password']\")))\n",
"\n",
"# Enter the username and password\n",
"username.clear()\n",
"username.send_keys(config.username)\n",
"password.clear()\n",
"password.send_keys(config.password)\n",
"\n",
"# Target the login button and click it\n",
"WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, \"button[type='submit']\"))).click()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4813345",
"metadata": {},
"outputs": [],
"source": [
"# Dismiss the post-login prompt by clicking its \"Not Now\" button\n",
"not_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), \"Not Now\")]')))\n",
"\n",
"not_button.click()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df9ee85d",
"metadata": {},
"outputs": [],
"source": [
"# Wait up to 10 seconds for the search button to be clickable on the web page\n",
"search_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'svg[aria-label=\"Search\"]')))\n",
"\n",
"# Click the search button once it becomes clickable\n",
"search_button.click()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aed26dce",
"metadata": {},
"outputs": [],
"source": [
"# Target the search input field\n",
"searchbox = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, \"//input[@placeholder='Search']\")))\n",
"searchbox.clear()\n",
"\n",
"# Search for the @handle or keyword\n",
"keyword = \"@sample-handle\"\n",
"searchbox.send_keys(keyword)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f06c2e34",
"metadata": {},
"outputs": [],
"source": [
"# Strip the leading \"@\" so the text lookup matches the displayed handle\n",
"if keyword.startswith(\"@\"):\n",
"    keyword = keyword[1:]\n",
"\n",
"# Wait for the first result whose text matches the keyword\n",
"first_result = WebDriverWait(driver, 10).until(\n",
"    EC.element_to_be_clickable((By.XPATH, f'//span[text()=\"{keyword}\"]'))\n",
")\n",
"\n",
"# Click the found element (assuming it is the desired search result)\n",
"first_result.click()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c79c06c",
"metadata": {},
"outputs": [],
"source": [
"# Get the initial page height\n",
"initial_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"\n",
"# Create a list to store a parsed snapshot of the page after each scroll\n",
"soups = []\n",
"\n",
"while True:\n",
"    # Scroll down to the bottom of the page\n",
"    driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
"\n",
"    # Wait for a moment to allow new content to load (adjust as needed)\n",
"    time.sleep(5)\n",
"\n",
"    # Grab the current page HTML\n",
"    html = driver.page_source\n",
"\n",
"    # Create a BeautifulSoup object from the scraped HTML\n",
"    soups.append(BeautifulSoup(html, 'html.parser'))\n",
"\n",
"    # Get the current page height\n",
"    current_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"\n",
"    if current_height == initial_height:\n",
"        break  # Exit the loop when the page can't scroll any further\n",
"\n",
"    initial_height = current_height  # Update the height for the next iteration"
]
},
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "id": "d7fa7a7b",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "# List to store the post image URLs\n",
168 | "post_urls = []\n",
169 | "\n",
170 | "for soup in soups:\n",
171 | " # Find all anchor elements with href attributes\n",
172 | " anchors = soup.find_all('a', href=True)\n",
173 | " \n",
174 | " # Filter URLs that start with \"/p/\" or \"/reel/\"\n",
175 | " post_urls.extend([anchor['href'] for anchor in anchors if anchor['href'].startswith((\"/p/\", \"/reel/\"))])\n",
176 | "\n",
177 | "# Convert the list to a set to remove duplicates\n",
178 | "unique_post_urls = list(set(post_urls))\n",
179 | "\n",
180 | "print(f\"before: {len(post_urls)}, after: {len(unique_post_urls)}\")"
181 | ]
182 | },
{
"cell_type": "code",
"execution_count": null,
"id": "801f4c86",
"metadata": {},
"outputs": [],
"source": [
"json_list = []\n",
"\n",
"# Query parameters that (unofficially, as of writing) make Instagram return\n",
"# the post as raw JSON instead of HTML; this trick may stop working at any time\n",
"query_parameters = \"__a=1&__d=dis\"\n",
"\n",
"# Go through all unique post URLs\n",
"for url in unique_post_urls:\n",
"    try:\n",
"        # Build the post URL with the JSON query parameters appended\n",
"        # (url already starts with \"/p/\" or \"/reel/\")\n",
"        modified_url = \"https://www.instagram.com\" + url + \"?\" + query_parameters\n",
"\n",
"        # Load the URL\n",
"        driver.get(modified_url)\n",
"\n",
"        # Wait for a moment to allow the content to load (adjust as needed)\n",
"        time.sleep(1)\n",
"\n",
"        # Wait for the <pre> tag containing the JSON data\n",
"        WebDriverWait(driver, 10).until(\n",
"            EC.presence_of_element_located((By.XPATH, '//pre'))\n",
"        )\n",
"        pre_tag = driver.find_element(By.XPATH, '//pre')\n",
"\n",
"        # Extract the JSON data from the <pre> tag\n",
"        json_script = pre_tag.text\n",
"\n",
"        # Parse the JSON data\n",
"        json_parsed = json.loads(json_script)\n",
"\n",
"        # Add the parsed JSON to the list\n",
"        json_list.append(json_parsed)\n",
"    except (NoSuchElementException, TimeoutException, json.JSONDecodeError) as e:\n",
"        print(f\"Error processing URL {url}: {e}\")\n"
]
},
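{
"cell_type": "markdown",
"id": "9b7c5a3e",
"metadata": {},
"source": [
"Fetching is slow (one page load per post), so it can help to cache the raw payloads. A minimal sketch, assuming a scratch file named `json_cache.json` (the name is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e8d6c4a",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: persist the fetched JSON payloads so the parsing step\n",
"# below can be re-run without hitting Instagram again (filename is arbitrary).\n",
"with open(\"json_cache.json\", \"w\", encoding=\"utf-8\") as f:\n",
"    json.dump(json_list, f)\n",
"\n",
"# To reload later:\n",
"# with open(\"json_cache.json\", encoding=\"utf-8\") as f:\n",
"#     json_list = json.load(f)"
]
},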
{
"cell_type": "code",
"execution_count": null,
"id": "4c032fe0",
"metadata": {},
"outputs": [],
"source": [
"# Lists to store URLs and corresponding dates\n",
"all_urls = []\n",
"all_dates = []\n",
"\n",
"# Iterate through each JSON payload in the list\n",
"for json_data in json_list:\n",
"\n",
"    # Extract the list from the 'items' key\n",
"    item_list = json_data.get('items', [])\n",
"\n",
"    # Iterate through each item in the 'items' list\n",
"    for item in item_list:\n",
"\n",
"        # Extract the date the item was taken\n",
"        date_taken = item.get('taken_at')\n",
"\n",
"        # Check if 'carousel_media' is present\n",
"        carousel_media = item.get('carousel_media', [])\n",
"\n",
"        # Iterate through each media entry in the 'carousel_media' list\n",
"        for media in carousel_media:\n",
"\n",
"            # Extract the image URL from the media entry\n",
"            image_url = media.get('image_versions2', {}).get('candidates', [{}])[0].get('url')\n",
"\n",
"            # Check that an image URL was found inside the media entry\n",
"            if image_url:\n",
"                # Add the image URL and corresponding date to the lists\n",
"                all_urls.append(image_url)\n",
"                all_dates.append(date_taken)\n",
"                print(\"carousel image added\")\n",
"\n",
"            # Extract the video URL from the media entry\n",
"            video_versions = media.get('video_versions', [])\n",
"            if video_versions:\n",
"                video_url = video_versions[0].get('url')\n",
"                if video_url:\n",
"                    # Add the video URL and corresponding date to the lists\n",
"                    all_urls.append(video_url)\n",
"                    all_dates.append(date_taken)\n",
"                    print(\"carousel video added\")\n",
"\n",
"        # Handle single-media posts, instead of carousels (guarded so carousel\n",
"        # posts that also carry top-level keys aren't counted twice)\n",
"        if not carousel_media:\n",
"\n",
"            image_url = item.get('image_versions2', {}).get('candidates', [{}])[0].get('url')\n",
"            if image_url:\n",
"                # Add the image URL and corresponding date to the lists\n",
"                all_urls.append(image_url)\n",
"                all_dates.append(date_taken)\n",
"                print(\"single image added\")\n",
"\n",
"            # Check if the 'video_versions' key exists\n",
"            video_versions = item.get('video_versions', [])\n",
"            if video_versions:\n",
"                video_url = video_versions[0].get('url')\n",
"                if video_url:\n",
"                    all_urls.append(video_url)\n",
"                    all_dates.append(date_taken)\n",
"                    print(\"video added\")\n",
"\n",
"# Print or use all collected URLs as needed\n",
"print(len(all_urls))"
]
},
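{
"cell_type": "markdown",
"id": "c5b3a1f9",
"metadata": {},
"source": [
"`taken_at` appears to be a Unix epoch timestamp, so the filenames below embed raw integers. An optional sketch for converting the dates to a readable form first:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a9e4d2b",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: convert the timestamps in all_dates to readable dates\n",
"# (assumption: taken_at is seconds since the epoch, UTC).\n",
"from datetime import datetime, timezone\n",
"\n",
"readable_dates = [\n",
"    datetime.fromtimestamp(ts, tz=timezone.utc).strftime(\"%Y-%m-%d\") if ts else \"unknown\"\n",
"    for ts in all_dates\n",
"]\n",
"print(readable_dates[:5])"
]
},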
{
"cell_type": "code",
"execution_count": null,
"id": "df201782",
"metadata": {},
"outputs": [],
"source": [
"# Create a directory to store downloaded files\n",
"download_dir = keyword\n",
"os.makedirs(download_dir, exist_ok=True)\n",
"\n",
"# Create subfolders for images and videos\n",
"image_dir = os.path.join(download_dir, \"images\")\n",
"video_dir = os.path.join(download_dir, \"videos\")\n",
"os.makedirs(image_dir, exist_ok=True)\n",
"os.makedirs(video_dir, exist_ok=True)\n",
"\n",
"# Initialize counters for images and videos\n",
"image_counter = 1\n",
"video_counter = 1\n",
"\n",
"# Iterate through the URLs in all_urls and download each media file\n",
"for index, url in enumerate(all_urls):\n",
"    response = requests.get(url, stream=True)\n",
"    response.raise_for_status()\n",
"\n",
"    # Extract the file extension from the URL path\n",
"    url_path = urlparse(url).path\n",
"    file_extension = os.path.splitext(url_path)[1]\n",
"\n",
"    # Pick the destination folder and file name from the extension,\n",
"    # keeping the original extension rather than forcing one\n",
"    if file_extension.lower() in {'.jpg', '.jpeg', '.png', '.gif'}:\n",
"        file_name = f\"{all_dates[index]}-img-{image_counter}{file_extension}\"\n",
"        destination_folder = image_dir\n",
"        image_counter += 1\n",
"    elif file_extension.lower() in {'.mp4', '.avi', '.mkv', '.mov'}:\n",
"        file_name = f\"{all_dates[index]}-vid-{video_counter}{file_extension}\"\n",
"        destination_folder = video_dir\n",
"        video_counter += 1\n",
"    else:\n",
"        # Default to the main download directory for other file types\n",
"        file_name = f\"{all_dates[index]}{file_extension}\"\n",
"        destination_folder = download_dir\n",
"\n",
"    # Save the file to the appropriate folder\n",
"    file_path = os.path.join(destination_folder, file_name)\n",
"\n",
"    # Stream the response content to the file in chunks\n",
"    with open(file_path, 'wb') as file:\n",
"        for chunk in response.iter_content(chunk_size=8192):\n",
"            if chunk:\n",
"                file.write(chunk)\n",
"\n",
"    print(f\"Downloaded: {file_path}\")\n",
"\n",
"# Report the number of downloaded files and the download directory\n",
"print(f\"Downloaded {len(all_urls)} files to {download_dir}\")"
]
},
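{
"cell_type": "markdown",
"id": "d8f1c6e3",
"metadata": {},
"source": [
"`csv` is imported at the top but never used in this version. One plausible use, sketched here, is exporting the collected URL/date pairs alongside the media (the filename is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b2a9c7d",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: export the collected URL/date pairs to a CSV file\n",
"# using the csv import above (filename is arbitrary).\n",
"csv_path = os.path.join(download_dir, \"media_urls.csv\")\n",
"with open(csv_path, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n",
"    writer = csv.writer(f)\n",
"    writer.writerow([\"url\", \"taken_at\"])\n",
"    writer.writerows(zip(all_urls, all_dates))\n",
"print(f\"Wrote {len(all_urls)} rows to {csv_path}\")"
]
}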
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
--------------------------------------------------------------------------------