├── .gitignore
├── requirements.txt
├── automate.sh
├── platform_finder.py
├── automate.bat
├── README.md
└── amazon_scraper.py

/.gitignore:
--------------------------------------------------------------------------------
myenv
__pycache__
products
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kanugurajesh/Amazon-Scraper/HEAD/requirements.txt
--------------------------------------------------------------------------------
/automate.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Use this script only on Linux to push code to GitHub
git add .
git commit -m "adder"
git push -u origin main
--------------------------------------------------------------------------------
/platform_finder.py:
--------------------------------------------------------------------------------
# The script below detects which operating system you are running on

import platform
os_name = platform.system()
print("Operating System:", os_name)
--------------------------------------------------------------------------------
/automate.bat:
--------------------------------------------------------------------------------
@echo off

REM Use this script only on Windows to push code to GitHub

git add .
git commit -m "adder"
git push -u origin main
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Amazon Scraper

This is an Amazon scraper built with Selenium. It scrapes product data from the website and writes it to a CSV file. Selenium is used because it drives a real browser, which makes the traffic look more like a human visitor, and it is a mature testing framework that offers many advantages for scraping. Beautiful Soup is then used to extract the data from the downloaded pages.

# Project Setup

1. The code is the same for Windows and Linux, but you need the ChromeDriver build for your platform. Download ChromeDriver from https://chromedriver.chromium.org/downloads and place the chromedriver file in the root folder of the project (a quick way to verify the setup is sketched after these steps).

2. Set up a Python virtual environment with the command `python -m venv myenv`

3. Activate the virtual environment: on Windows run `myenv/Scripts/activate.ps1`, on Linux run `source myenv/bin/activate`

4. Install the required Python modules inside the virtual environment with `pip install -r requirements.txt`
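
A quick way to confirm that ChromeDriver and Selenium can talk to each other is a short throwaway script like the one below. This is only a sketch, not part of the repository: it assumes Selenium 4 and a `chromedriver` binary (or `chromedriver.exe` on Windows) sitting in the project root, and the file name `check_driver.py` is just an illustration.

```python
# check_driver.py -- hypothetical helper, not shipped with the project
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Point Selenium at the chromedriver downloaded in step 1
# (use "./chromedriver.exe" on Windows)
service = Service("./chromedriver")
driver = webdriver.Chrome(service=service)

driver.get("https://www.amazon.in")
print("Loaded page title:", driver.title)
driver.quit()
```

If the browser opens and the title prints, ChromeDriver and Selenium are set up correctly.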

# Usage

1. To run the project on Windows, run the command `python amazon_scraper.py`
2. To run the project on Linux, run the command `python3 amazon_scraper.py`

# Project Working

1. On the first run, Selenium visits every product link in the search results and writes each product's page source into an HTML file in the products directory

2. In the next step, the data is extracted from those HTML files and written to the products.csv file

# Project Files

1. amazon_scraper.py is the main Python file; it scrapes the pages and writes the data to the CSV file

2. requirements.txt lists all the modules the project needs in order to run without errors

3. automate.bat is used to push code to GitHub on Windows

4. automate.sh is used to push code to GitHub on Linux

# Project Working Video

https://drive.google.com/file/d/1xS28dAszifytomf69G2MfKjBfMia_it8/view?usp=sharing

## Contributing

This project welcomes contributions from everyone

## Technologies Used

- Python
- Selenium
- Beautiful Soup

## 🔗 Links

[![portfolio](https://img.shields.io/badge/my_portfolio-000?style=for-the-badge&logo=ko-fi&logoColor=white)](https://rajeshportfolio.me/)
[![linkedin](https://img.shields.io/badge/linkedin-0A66C2?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/rajesh-kanugu-aba8a3254/)
[![twitter](https://img.shields.io/badge/twitter-1DA1F2?style=for-the-badge&logo=twitter&logoColor=white)](https://twitter.com/exploringengin1)

## Authors

- [@kanugurajesh](https://github.com/kanugurajesh)

## Support

For support, you can buy me a coffee

## License

This project is licensed under the MIT License
--------------------------------------------------------------------------------
/amazon_scraper.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from lxml import etree
import time
import csv
import os

# URL of the Amazon search results page to scrape
default_url = "https://www.amazon.in/s?k=bags&crid=3RXHWM4CTEH4C&sprefix=bags%2Caps%2C433&ref=nb_sb_noss_1"
file_name = "store"

# list of product urls, filled in the order the products are scraped
product_url = []

start = 1
# Enter the number of result pages you want to scrape
end = 2
directory_name = "products"

# initializing the chromedriver
# options = webdriver.ChromeOptions()
driver = webdriver.Chrome()
driver.implicitly_wait(10)

# create the output directory if it does not exist yet
def check_and_create_directory(directory_name):
    if not os.path.exists(directory_name):
        os.mkdir(directory_name)

check_and_create_directory(directory_name)

# write the current page source to an html file
def file_writer(name, val):
    with open(f"{name}_{val}.html", "w", encoding='utf-8') as f:
        sourcer = driver.page_source
        f.write(sourcer)

# visit every product link on the current results page and save its page source
def page_allocator(name, val):
    prod_val = 1
    elements = driver.find_elements(By.CSS_SELECTOR, '.a-link-normal.s-underline-text.s-underline-link-text.s-link-style.a-text-normal')
    urls = [i.get_attribute('href') for i in elements]
    for url in urls:
        driver.get(url)
        file_writer(f"./products/{name}", f"{val}_{prod_val}")
        prod_val += 1
        product_url.append(driver.current_url)

# first results page
driver.get(default_url)
page_allocator(file_name, start)
start += 1

# iterating through the remaining result pages
while start <= end:
    urls = f"https://www.amazon.in/s?k=bags&page={start}&crid=ULWLFTP4YQBC&qid=1688637618&sprefix=bags%2Caps%2C206&ref=sr_pg_1"
    driver.get(urls)
    page_allocator(file_name, start)
    start += 1

# closing the browser and ending the webdriver session
driver.quit()

# The code below writes the scraped data to the csv file

# creating a new csv file with a header row
with open('products.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    header = ["s.no", "product url", "product name", "product price", "rating", "number of reviews", "description", "asin", "manufacturer"]
    writer.writerow(header)

# serial number used for the rows in the csv
num = 1

# extract data from one saved html file and append it to the csv;
# returns the serial number to use for the next row
def parser_writer(file_path, num):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        # creating a dom tree for xpath lookups
        dom = etree.HTML(str(soup))

        # getting all the elements with attributes
        product_name = soup.find(class_='a-size-large product-title-word-break').text.strip()
        product_rating_count = soup.find(id='acrCustomerReviewText').text.split(" ")[0].strip()

        product_price_symbol = soup.find(class_='a-price-symbol').text
        product_price = soup.find(class_='a-price-whole').text
        product_rating = soup.find(class_='a-size-base a-color-base').text

        product_asin = dom.xpath('//*[@id="detailBullets_feature_div"]/ul/li[4]/span/span[2]')[0].text
        product_manufacturer = dom.xpath('//*[@id="detailBullets_feature_div"]/ul/li[8]/span/span[2]')[0].text
        product_description_parent = soup.find(class_='a-unordered-list a-vertical a-spacing-mini')
        product_description = product_description_parent.find_all(class_='a-list-item')
        # concatenating the currency symbol and price
        product_price = product_price_symbol + product_price

        # list containing all the description snippets
        descript = []

        # looping through the description elements
        for i in product_description:
            # appending the text of each description element to the descript list
            descript.append(i.text.strip())
        result = ','.join(descript)

        # appending all the data to the products.csv file
        # (product_url[num-1] assumes the files are processed in the same order they were scraped)
        with open('products.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            row = [num, product_url[num-1], product_name, product_price, product_rating, product_rating_count, result, product_asin, product_manufacturer]
            writer.writerow(row)
        return num + 1
    except FileNotFoundError:
        print(f"File '{file_path}' not found. {num}")
        return num
    except Exception as e:
        print(f"An error occurred: {e} {num}")
        return num

# getting all the file names in the products directory
file_names = os.listdir("./products")

# iterating over the saved product pages
for file_name in file_names:
    num = parser_writer(f"./products/{file_name}", num)
--------------------------------------------------------------------------------
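
A note on the detail-bullets parsing: `parser_writer` picks the ASIN and manufacturer by fixed position (`li[4]` and `li[8]`) inside the detail-bullets list, and those positions vary from product to product, which is one reason rows can fall into the `except` branch. A more position-independent alternative is to match on the label text instead of the index. The sketch below is illustrative only and not part of the repository: it assumes each detail bullet contains a label span followed by a value span, the helper name `find_detail_value` is made up, and the example file name assumes page 1, product 1 was scraped.

```python
# Illustrative alternative to the fixed li[4]/li[8] xpaths -- not part of amazon_scraper.py
from bs4 import BeautifulSoup

def find_detail_value(soup, label):
    """Return the value next to a label (e.g. 'ASIN') in the detail bullets, or None."""
    bullets = soup.select('#detailBullets_feature_div li span.a-list-item')
    for item in bullets:
        spans = item.find_all('span')
        # first span is assumed to hold the label, the second the value
        if len(spans) >= 2 and label.lower() in spans[0].get_text().lower():
            return spans[1].get_text().strip()
    return None

# usage sketch against one of the saved pages
with open("./products/store_1_1.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")
print(find_detail_value(soup, "ASIN"))
print(find_detail_value(soup, "Manufacturer"))
```

Matching on the label text keeps the lookup working even when Amazon reorders or omits individual detail bullets, at the cost of one extra pass over the list.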