├── .gitignore
├── README.md
└── scraper.py

/.gitignore:
--------------------------------------------------------------------------------
*.csv
*.xlsx

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# WooCommerce Scraper and Logger

Scrapes WooCommerce shops for products and generates a CSV file of each product's:
* Name
* ID (size, unique ID, etc.)
* Price
* URL

Usage: `python3 scraper.py example.com`

Not guaranteed to work with every shop yet, but it has worked for every shop tested so far.

--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
from requests import get
from datetime import datetime
import csv
import math
import sys
import re

PRODUCTS_PER_PAGE = 16  # WooCommerce's default shop-page size


def buildURL(domain, entryTitle, entryID):
    # Rebuild a WooCommerce-style product slug: collapse every run of
    # non-alphanumeric characters into a single hyphen.
    s = re.sub('[^0-9a-zA-Z]+', '-', entryTitle + "-" + entryID)
    s = s.strip('-')
    return f"https://{domain}/product/{s}"


if __name__ == '__main__':
    catalog = []
    page_number = 1
    page_limit = 1  # fallback if the result count can't be parsed
    if len(sys.argv) == 2:
        domain = sys.argv[1]
    else:
        print("Usage: python3 scraper.py example.com")
        sys.exit(1)

    # Iterate through each page of the shop
    while page_number <= page_limit:
        url = f"https://{domain}/page/{page_number}/?s=&post_type=product"
        response = get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        if page_number == 1:
            # Parse "Showing 1-16 of 42 results" for the total product count;
            # the last number is the total, so e.g. ceil(42/16) = 3 pages.
            print("Finding page limit...")
            limits = soup.find_all('p', class_='woocommerce-result-count')
            limits_text = ' '.join(ele.get_text() for ele in limits)
            limits_nums = [int(i) for i in limits_text.split() if i.isdigit()]
            if limits_nums:
                page_limit = math.ceil(limits_nums[-1] / PRODUCTS_PER_PAGE)
            print(page_limit)

        products = soup.find_all('h2', class_='woocommerce-loop-product__title')
        if not products:  # some themes use a different product-title markup
            products = soup.find_all('p', class_='product-title')
        prices = soup.find_all('span', class_='woocommerce-Price-amount')
        print("Adding to catalog...", page_number, "/", page_limit)
        # Pair titles with prices positionally; products on sale can render
        # extra price spans and shift this pairing.
        for (item, entryPrice) in zip(products, prices):
            # Treat the last word of the title as the product ID
            # (size, unique ID, etc.).
            keywords = item.text.split()
            entryID = keywords.pop()
            item = ' '.join(keywords)
            entryPrice = entryPrice.text
            url = buildURL(domain, item, entryID)
            catalog.append((item, entryID, entryPrice, url))
        page_number += 1

    # Build the CSV file
    print("Building CSV file...")
    with open(f"{domain}_{datetime.now().date()}.csv", 'w', newline='') as csvfile:
        fwriter = csv.writer(csvfile)
        fwriter.writerow(["Product", "Product ID", "Price", "URL"])
        for (entryTitle, entryID, entryPrice, url) in catalog:
            fwriter.writerow([entryTitle, entryID, entryPrice, url])

--------------------------------------------------------------------------------
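As a quick illustration of the slug logic in `buildURL`, here is a minimal usage sketch; the domain and product values are made up, and it assumes the shop uses WooCommerce's default `/product/<slug>` permalink structure:

```python
from scraper import buildURL

# Runs of non-alphanumeric characters collapse into single hyphens
# (hypothetical example values):
print(buildURL("example.com", "Blue T-Shirt (Large)", "SKU123"))
# -> https://example.com/product/Blue-T-Shirt-Large-SKU123
```

Note that WordPress normally lowercases slugs, while this reconstruction keeps the scraped capitalization; shops with different permalink settings are one reason the script is not guaranteed to work everywhere.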