├── .gitignore
├── README.md
└── scraper.py

/.gitignore:
--------------------------------------------------------------------------------
*.csv
*.xlsx

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# WooCommerce Scraper and Logger

Scrapes WooCommerce shops for products and generates a CSV file of each product's:
* Name
* ID (size, unique ID, etc.)
* Price
* URL

Usage: `python3 scraper.py example.com`

Not guaranteed to work with every shop yet, but it has worked for every shop tested so far.

--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
from requests import get
from datetime import datetime
import csv
import math
import sys
import re

PRODUCTS_PER_PAGE = 16  # WooCommerce's default shop-page size


def buildURL(domain, entryTitle, entryID):
    # Rebuild a WooCommerce-style product slug: collapse every run of
    # non-alphanumeric characters into a single hyphen.
    s = re.sub('[^0-9a-zA-Z]+', '-', entryTitle + "-" + entryID)
    s = s.strip('-')
    return f"https://{domain}/product/{s}"


if __name__ == '__main__':
    catalog = []
    page_number = 1
    page_limit = 1  # fallback if the result count can't be parsed
    if len(sys.argv) == 2:
        domain = sys.argv[1]
    else:
        print("Usage: python3 scraper.py example.com")
        sys.exit(1)

    # Iterate through each page of the shop
    while page_number <= page_limit:
        url = f"https://{domain}/page/{page_number}/?s=&post_type=product"
        response = get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        if page_number == 1:
            # Parse "Showing 1-16 of 42 results" for the total product count;
            # the last number is the total, so e.g. ceil(42/16) = 3 pages.
            print("Finding page limit...")
            limits = soup.find_all('p', class_='woocommerce-result-count')
            limits_text = ' '.join(ele.get_text() for ele in limits)
            limits_nums = [int(i) for i in limits_text.split() if i.isdigit()]
            if limits_nums:
                page_limit = math.ceil(limits_nums[-1] / PRODUCTS_PER_PAGE)
            print(page_limit)

        products = soup.find_all('h2', class_='woocommerce-loop-product__title')
        if not products:  # some themes use a different product-title markup
            products = soup.find_all('p', class_='product-title')
        prices = soup.find_all('span', class_='woocommerce-Price-amount')
        print("Adding to catalog...", page_number, "/", page_limit)
        # Pair titles with prices positionally; products on sale can render
        # extra price spans and shift this pairing.
        for (item, entryPrice) in zip(products, prices):
            # Treat the last word of the title as the product ID
            # (size, unique ID, etc.).
            keywords = item.text.split()
            entryID = keywords.pop()
            item = ' '.join(keywords)
            entryPrice = entryPrice.text
            url = buildURL(domain, item, entryID)
            catalog.append((item, entryID, entryPrice, url))
        page_number += 1

    # Build the CSV file
    print("Building CSV file...")
    with open(f"{domain}_{datetime.now().date()}.csv", 'w', newline='') as csvfile:
        fwriter = csv.writer(csvfile)
        fwriter.writerow(["Product", "Product ID", "Price", "URL"])
        for (entryTitle, entryID, entryPrice, url) in catalog:
            fwriter.writerow([entryTitle, entryID, entryPrice, url])

--------------------------------------------------------------------------------
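As a quick illustration of the slug logic in `buildURL`, here is a minimal usage sketch; the domain and product values are made up, and it assumes the shop uses WooCommerce's default `/product/<slug>` permalink structure:

```python
from scraper import buildURL

# Runs of non-alphanumeric characters collapse into single hyphens
# (hypothetical example values):
print(buildURL("example.com", "Blue T-Shirt (Large)", "SKU123"))
# -> https://example.com/product/Blue-T-Shirt-Large-SKU123
```

Note that WordPress normally lowercases slugs, while this reconstruction keeps the scraped capitalization; shops with different permalink settings are one reason the script is not guaranteed to work everywhere.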