├── .gitignore
├── README.md
├── requirements.txt
├── setup.cfg
├── setup.py
└── shopify_scraper
    ├── .gitkeep
    ├── __init__.py
    └── scraper.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv
.idea
example.py
parents.csv
children.csv
images.csv

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ShopifyScraper

ShopifyScraper is a Python package that scrapes data from Shopify. Unlike a regular web scraper, which needs to visit every page on a site, this package fetches Shopify's publicly visible `products.json` file, allowing you to scrape an entire store inventory in seconds.

When the commands below are run, ShopifyScraper extracts the store inventory and saves the products and product variants to Pandas dataframes, from which you can access or analyse the data, or write it to a CSV file or database.

### Installation
To install ShopifyScraper, run the following command:

```bash
pip3 install git+https://github.com/practical-data-science/ShopifyScraper.git
```

### Usage

```python
from shopify_scraper import scraper

url = "https://yourshopifydomain.com"

parents = scraper.get_products(url)
parents.to_csv('parents.csv', index=False)
print('Parents: ', len(parents))

children = scraper.get_variants(parents)
children.to_csv('children.csv', index=False)
print('Children: ', len(children))

images = scraper.get_images(parents)
images.to_csv('images.csv', index=False)
print('Images: ', len(images))
```

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
pandas

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.md

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

from os import path
this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='ShopifyScraper',
    packages=['shopify_scraper'],
    version='0.002',
    license='MIT',
    description='ShopifyScraper is a Python package that scrapes Shopify products and variants to Pandas dataframes.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    author='Matt Clarke',
    author_email='matt@practicaldatascience.co.uk',
    url='https://github.com/practical-data-science/ShopifyScraper',
    download_url='https://github.com/practical-data-science/ShopifyScraper/archive/master.zip',
    keywords=['python', 'requests', 'pandas', 'shopify', 'scraping'],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.8',
    ],
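    # Runtime dependencies; kept in sync with requirements.txt.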
    install_requires=['pandas', 'requests']
)

--------------------------------------------------------------------------------
/shopify_scraper/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/practical-data-science/ShopifyScraper/109da60109deabdaa04cbb71670bb8b0083040ec/shopify_scraper/.gitkeep

--------------------------------------------------------------------------------
/shopify_scraper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/practical-data-science/ShopifyScraper/109da60109deabdaa04cbb71670bb8b0083040ec/shopify_scraper/__init__.py

--------------------------------------------------------------------------------
/shopify_scraper/scraper.py:
--------------------------------------------------------------------------------
"""
Shopify scraper
Description: Scrapes products from a Shopify store by parsing products.json and converting it to a pandas DataFrame.
Author: Matt Clarke
"""

import json
import pandas as pd
import requests


def get_json(url, page):
    """
    Get Shopify products.json from a store URL.

    Args:
        url (str): URL of the store.
        page (int): Page number of the products.json.
    Returns:
        products_json (str): Products.json from the store, or None if the request failed.
    """

    try:
        response = requests.get(f'{url}/products.json?limit=250&page={page}', timeout=5)
        response.raise_for_status()
        return response.text

    except requests.exceptions.HTTPError as error_http:
        print("HTTP Error:", error_http)

    except requests.exceptions.ConnectionError as error_connection:
        print("Connection Error:", error_connection)

    except requests.exceptions.Timeout as error_timeout:
        print("Timeout Error:", error_timeout)

    except requests.exceptions.RequestException as error:
        print("Error:", error)


def to_df(products_json):
    """
    Convert products.json to a pandas DataFrame.

    Args:
        products_json (str): Products.json from the store.
    Returns:
        df (pd.DataFrame): Pandas DataFrame of the products.json, or None if parsing failed.
    """

    try:
        products_dict = json.loads(products_json)
        df = pd.DataFrame.from_dict(products_dict['products'])
        return df
    except Exception as e:
        print(e)


def get_products(url):
    """
    Get all products from a store.

    Args:
        url (str): URL of the store.
    Returns:
        df (pd.DataFrame): Pandas DataFrame of all products in the store.
    """

    page = 1
    df = pd.DataFrame()

    while True:
        products_json = get_json(url, page)
        products_page = to_df(products_json)

        # Stop when a page fails to fetch or parse, or returns no products.
        if products_page is None or len(products_page) == 0:
            break

        df = pd.concat([df, products_page], ignore_index=True)
        page += 1

    if not df.empty:
        df['url'] = f"{url}/products/" + df['handle']
    return df


def get_variants(products):
    """Get variants from a list of products.
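
    Each dict in a product's 'variants' list becomes one row in the output,
    joined back to its parent product's title and vendor via 'product_id'.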

    Args:
        products (pd.DataFrame): Pandas dataframe of products from get_products()

    Returns:
        variants (pd.DataFrame): Pandas dataframe of variants
    """

    products['id'] = products['id'].astype(int)

    # Flatten each product's list of variant dicts into one row per variant.
    variant_rows = []
    for row in products.itertuples(index=False):
        for variant in getattr(row, 'variants'):
            variant_rows.append(variant)
    df_variants = pd.DataFrame(variant_rows)

    df_variants['id'] = df_variants['id'].astype(int)
    df_variants['product_id'] = df_variants['product_id'].astype(int)

    # Join each variant back to its parent product's title and vendor.
    df_parent_data = products[['id', 'title', 'vendor']]
    df_parent_data = df_parent_data.rename(columns={'title': 'parent_title', 'id': 'parent_id'})
    df_variants = df_variants.merge(df_parent_data, left_on='product_id', right_on='parent_id')
    return df_variants


def json_list_to_df(df, col):
    """Return a Pandas dataframe based on a column that contains a list of JSON objects.

    Args:
        df (Pandas dataframe): The dataframe to be flattened.
        col (str): The name of the column that contains the JSON objects.

    Returns:
        Pandas dataframe: A new dataframe with the JSON objects expanded into columns.
    """

    rows = []
    for index, row in df[col].items():
        for item in row:
            rows.append(item)
    return pd.DataFrame(rows)


def get_images(df_products):
    """Get images from a list of products.

    Args:
        df_products (pd.DataFrame): Pandas dataframe of products from get_products()

    Returns:
        images (pd.DataFrame): Pandas dataframe of images
    """

    return json_list_to_df(df_products, 'images')

--------------------------------------------------------------------------------
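
The README notes that the scraped dataframes can be written to a CSV file or a database. As a supplementary sketch (not part of the package itself), here is one way the output of `get_products()` and `get_variants()` might be persisted to a local SQLite database using pandas' `to_sql`. The helper `jsonify_nested`, the file name `products.db` and the table names are illustrative assumptions.

```python
# Supplementary sketch: persisting ShopifyScraper output to SQLite.
import json
import sqlite3

import pandas as pd

from shopify_scraper import scraper


def jsonify_nested(df: pd.DataFrame) -> pd.DataFrame:
    """Serialise list/dict cells to JSON strings so SQLite can store them."""
    out = df.copy()
    for col in out.columns:
        if out[col].apply(lambda v: isinstance(v, (list, dict))).any():
            out[col] = out[col].apply(json.dumps)
    return out


url = "https://yourshopifydomain.com"

parents = scraper.get_products(url)       # one row per product
children = scraper.get_variants(parents)  # one row per variant

with sqlite3.connect("products.db") as conn:
    jsonify_nested(parents).to_sql("products", conn, if_exists="replace", index=False)
    jsonify_nested(children).to_sql("variants", conn, if_exists="replace", index=False)
```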