├── README.md
├── requirements.txt
└── shopfiy_scraper.py

/README.md:
--------------------------------------------------------------------------------
Buy Me A Coffee

# Shopify Products Scraper

This is a Shopify products scraper. The script retrieves data from the products.json file of a Shopify shop.
Then, for each product, it makes an additional query to the
product page to retrieve data from meta tags.

All scraped information is saved to a CSV file (products.csv).

#### Why is this useful?

Unfortunately, the /products.json endpoint does not contain meta tags. To export all product parameters, it is necessary to query each product directly.

That's why this script was created. To do it automatically!

### 🔥 Installation
```
git clone https://github.com/grabowskiadrian/shopify-products-scraper.git
cd shopify-products-scraper
pip3 install -r requirements.txt
```

### 🚀 Usage

```
python3 shopfiy_scraper.py -t https://www.shopifyshop.com
```

Output:

```
python3 shopfiy_scraper.py -t https://www.shopifyshop.com
[+] Starting script
[+] Checking products page
 ├ Scraping: https://www.shopifyshop.com/products/nami-nude-corn-outlet
 ├ Scraping: https://www.shopifyshop.com/products/sniegowce-damskie-czarne-boom-snow-boots-black-grape
 ├ Scraping: https://www.shopifyshop.com/products/mini-pouch-mokka-croco
 ├ Scraping: https://www.shopifyshop.com/products/saszetka-etui-na-karty-damskie-mini-pouch-black-ink-croco
...
[+] Scraping done
```

The script will generate products.csv with this header:
```
Name,URL,Meta Title,Meta Description,Product Description
```

You can use the `-v` flag to save Product Variants in separate rows.
50 | 51 | ``` 52 | Name,Variant Name,Price,URL,Meta Title,Meta Description,Product Description 53 | ``` 54 | 55 | then script synchronize Variant Name and Price also. 56 | 57 | ### Plans and ideas 58 | - add more columns to output csv (not only SEO parameters) 59 | - add possibility to list/download images of product 60 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.12.2 2 | -------------------------------------------------------------------------------- /shopfiy_scraper.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import urllib.request 4 | import requests 5 | import pandas as pd 6 | import argparse 7 | 8 | from bs4 import BeautifulSoup 9 | 10 | parser = argparse.ArgumentParser(description="Scrap products data from Shopify store") 11 | parser.add_argument('-t', '--target', dest='website_url', type=str, help='URL to Shopify store (https://shopifystore.com)') 12 | parser.add_argument('-v', '--variants', dest='variants', action="store_true", help='Scrap also with variants data') 13 | args = parser.parse_args() 14 | 15 | if not args.website_url: 16 | print("usage: shopfiy_scraper.py [-h] [-t WEBSITE_URL] [-v]") 17 | exit(0) 18 | 19 | base_url = args.website_url 20 | url = base_url + '/products.json' 21 | 22 | with_variants = args.variants 23 | 24 | 25 | def get_page(page): 26 | data = urllib.request.urlopen(url + '?page={}'.format(page)).read() 27 | products = json.loads(data)['products'] 28 | 29 | return products 30 | 31 | 32 | def get_tags_from_product(product): 33 | r = urllib.request.urlopen(product).read() 34 | soup = BeautifulSoup(r, "html.parser") 35 | 36 | title = soup.title.string 37 | description = '' 38 | 39 | meta = soup.find_all('meta') 40 | for tag in meta: 41 | if 'name' in tag.attrs.keys() and 
tag.attrs['name'].strip().lower() == 'description': 42 | description = tag.attrs['content']; 43 | 44 | return [title, description] 45 | 46 | def get_inventory_from_product(product_url): 47 | get_product = requests.get(product_url) 48 | product_json = get_product.json() 49 | product_variants = pd.DataFrame(product_json['product']['variants']) 50 | 51 | return product_variants 52 | 53 | 54 | with open('products.csv', 'w') as f: 55 | page = 1 56 | 57 | print("[+] Starting script") 58 | 59 | # create file header 60 | writer = csv.writer(f) 61 | if with_variants: 62 | writer.writerow([ 63 | 'Name', 'Variant ID', 'Product ID', 'Variant Title', 'Price', 'SKU', 64 | 'Position', 'Inventory Policy', 'Compare At Price', 'Fulfillment Service', 65 | 'Inventory Management', 'Option1', 'Option2', 'Option3', 'Created At', 66 | 'Updated At', 'Taxable', 'Barcode', 'Grams', 'Image ID', 'Weight', 67 | 'Weight Unit', 'Inventory Quantity', 'Old Inventory Quantity', 68 | 'Tax Code', 'Requires Shipping', 'Quantity Rule', 'Price Currency', 69 | 'Compare At Price Currency', 'Quantity Price Breaks', 70 | 'URL', 'Meta Title', 'Meta Description', 'Product Description' 71 | ]) 72 | else: 73 | writer.writerow(['Name', 'URL', 'Meta Title', 'Meta Description', 'Product Description']) 74 | 75 | print("[+] Checking products page") 76 | 77 | products = get_page(page) 78 | while products: 79 | for product in products: 80 | name = product['title'] 81 | product_url = base_url + '/products/' + product['handle'] 82 | category = product['product_type'] 83 | 84 | body_description = BeautifulSoup(product['body_html'], "html.parser") 85 | body_description = body_description.get_text() 86 | 87 | 88 | print(" ├ Scraping: " + product_url) 89 | 90 | title, description = get_tags_from_product(product_url) 91 | 92 | if with_variants: 93 | variants_df = get_inventory_from_product(product_url + '.json') 94 | for _, variant in variants_df.iterrows(): 95 | row = [ 96 | name, variant['id'], variant['product_id'], 
variant['title'], 97 | variant['price'], variant['sku'], variant['position'], 98 | variant['inventory_policy'], variant['compare_at_price'], 99 | variant['fulfillment_service'], variant['inventory_management'], 100 | variant['option1'], variant['option2'], variant['option3'], 101 | variant['created_at'], variant['updated_at'], variant['taxable'], 102 | variant['barcode'], variant['grams'], variant['image_id'], 103 | variant['weight'], variant['weight_unit'], variant['inventory_quantity'], 104 | variant['old_inventory_quantity'], variant['tax_code'], 105 | variant['requires_shipping'], variant['quantity_rule'], 106 | variant['price_currency'], variant['compare_at_price_currency'], 107 | variant['quantity_price_breaks'], 108 | product_url, title, description, body_description 109 | ] 110 | writer.writerow(row) 111 | else: 112 | row = [name, product_url, title, description, body_description] 113 | writer.writerow(row) 114 | page += 1 115 | products = get_page(page) 116 | --------------------------------------------------------------------------------