from flask import Flask, render_template, request
import crawler

app = Flask(__name__)


@app.route('/', methods=['POST', 'GET'])
def index():
    """Render the search page and, on POST, the price comparison results.

    GET renders ``index.html`` with no results. POST reads the ``search``
    form field, runs the crawler for it and renders ``index.html`` with the
    ``results`` dictionary returned by ``crawler.getCountdown``.
    """
    if request.method != 'POST':
        return render_template('index.html')

    # .get() instead of ['search'] so a form posted without the field renders
    # an empty result table rather than aborting with a 400.
    searchTerm = request.form.get('search', '').strip()

    # A blank search is never sent to the crawler: there is nothing to look
    # up, so the page is rendered with an empty result set.
    results = crawler.getCountdown(searchTerm) if searchTerm else {}
    return render_template('index.html', results=results)
portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | {% block title %} {% endblock %} 15 | 16 | 17 |
18 | {% block content %} {% endblock %} 19 |
20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 |

{% block title %} Welcome to Grocery Comparer v1{% endblock %}

5 | 6 |
7 | 8 |
9 | 10 | 11 | 12 |
13 | 14 | 15 |
16 | 17 |
18 | 19 |
20 | 21 | {% if request.method == "POST" %} 22 | 23 |
24 |
25 | 26 |
27 |
28 | 29 |
30 |
31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | {% for gtin in results %} 45 | 46 | 47 | 48 | 49 | 56 | 63 | 70 | 71 | {% endfor %} 72 | 73 |
GTINNameSizeCountdownNew WorldPaknSave
{{ gtin }}{{ results[gtin]["name"] }}{{ results[gtin]["size"] }} 50 | {% if results[gtin]["countdownPrice"] %} 51 | ${{ results[gtin]["countdownPrice"] }} 52 | {% else %} 53 | - 54 | {% endif %} 55 | 57 | {% if results[gtin]["newWorldPrice"] %} 58 | ${{ results[gtin]["newWorldPrice"] }} 59 | {% else %} 60 | - 61 | {% endif %} 62 | 64 | {% if results[gtin]["paknSavePrice"] %} 65 | ${{ results[gtin]["paknSavePrice"] }} 66 | {% else %} 67 | - 68 | {% endif %} 69 |
74 |
75 |
76 | 77 | {% endif %} 78 | 79 | {% endblock %} 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
import json
from urllib.parse import quote


# Base URL of each FoodStuffs-owned store site, plus the value sent as the
# STORE_ID cookie with every request to that site.
foodStuffsURL = {
    'NEWWORLD': {
        'URL': 'https://www.ishopnewworld.co.nz',
        'ID': '60928d93-06fa-4d8f-92a6-8c359e7e846d',
    },
    'PAKNSAVE': {
        'URL': 'https://www.paknsaveonline.co.nz',
        'ID': '3404c253-577f-45ca-b301-c98312e46efb',
    },
}

# CSS classes of the <div> whose ``data-options`` attribute holds the product JSON.
_CARD_FOOTER_CLASSES = ['js-product-card-footer', 'fs-product-card__footer-container']

# CSS classes of the <a> wrapping the product size, tried in this order.
_CARD_SIZE_CLASSES = ('fs-product-card__row-details', 'fs-product-card__details')


def escapeSpaces(str):
    """Return the given string with every space replaced by ``%20``.

    Only spaces are escaped; other characters are left untouched. The
    parameter shadows the builtin ``str`` but keeps its name so existing
    callers are unaffected.
    """
    return str.replace(' ', '%20')


def _fetchFoodStuffsCard(url, cookies):
    """Request a FoodStuffs search page and parse its first product card.

    Returns ``(soup, options)`` where ``soup`` is the parsed page and
    ``options`` is the dictionary decoded from the card's ``data-options``
    attribute, or ``None`` when the page holds no product card.
    """
    # Imported here rather than at module level so the network-free paths of
    # this module stay usable without the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, cookies=cookies)
    soup = BeautifulSoup(response.content, 'html.parser')

    card = soup.find('div', {'class': _CARD_FOOTER_CLASSES})
    rawOptions = card.get('data-options') if card is not None else None
    options = json.loads(rawOptions) if rawOptions else None
    return soup, options


def _scrapeCardSize(soup):
    """Return the lower-cased size text of the first product card, or None."""
    for cssClass in _CARD_SIZE_CLASSES:
        anchor = soup.find('a', {'class': [cssClass]})
        if anchor is not None:
            sizeTag = anchor.find('p')
            return sizeTag.text.strip().lower() if sizeTag is not None else None
    return None


def getFoodStuffs(gtin, item, brandName):
    """Look up the price of a Countdown product on a FoodStuffs store site.

    Args:
        gtin: Barcode of the product; used as the first search query.
        item: Product dictionary from the Countdown search response. The
            keys ``brand``, ``name`` and ``size['volumeSize']`` are read.
        brandName: Key into ``foodStuffsURL`` (``'NEWWORLD'`` or ``'PAKNSAVE'``).

    Returns:
        A two-element list ``[price, message]``. ``price`` is ``None`` when
        no matching product was found. ``message`` is ``None`` unless the
        match came from the New World name-based fallback search, in which
        case it describes the product that was matched.
    """
    brandWords = item['brand'].lower().split()
    if 'countdown' in brandWords:  # Countdown own-brand items are not looked up
        return [None, None]

    site = foodStuffsURL[brandName]
    cookies = {'STORE_ID': site['ID']}  # Necessary for request to be accepted

    # First attempt: search by barcode.
    _, card = _fetchFoodStuffsCard(site['URL'] + '/Search?q=' + quote(str(gtin), safe=''), cookies)
    if card:
        return [card['ProductDetails']['PricePerItem'], None]

    # Second attempt (New World only): search by brand, size and name. The
    # match is only accepted when the sizes agree, so an item without a size
    # cannot be verified and is reported as not found.
    wantedSize = item['size']['volumeSize']
    if brandName != 'NEWWORLD' or not wantedSize:
        return [None, None]

    # quote() escapes spaces as %20 and also characters such as '&' or '#'
    # that would otherwise cut the query short. Limit to 1 result.
    query = ' '.join((item['brand'], wantedSize, item['name']))
    url = site['URL'] + '/Search?q=' + quote(query, safe='') + '&ps=1&pg=1'
    soup, card = _fetchFoodStuffsCard(url, cookies)
    if not card:
        return [None, None]

    # The top result must mention at least one word of the brand name ...
    productName = card['productName']
    if not any(word in productName.lower() for word in brandWords):
        return [None, None]

    # ... and have the same size as the Countdown product.
    foundSize = _scrapeCardSize(soup)
    if foundSize is None or foundSize != wantedSize.lower():
        return [None, None]

    price = card['ProductDetails']['PricePerItem']
    return [price, 'Showing result for: "%s %s"' % (productName, foundSize)]


def _describePrice(price, message):
    """Format one store's price (and optional note) for the console log."""
    shown = '$' + str(price) if price else 'NOT FOUND'
    return '%s %s' % (shown, message if message else '')


def getCountdown(searchTerm):
    """Search Countdown for products and compare their prices across stores.

    Args:
        searchTerm: Free-text search entered by the user.

    Returns:
        A dictionary keyed by product barcode. Each value is a dictionary
        with the keys ``name``, ``size``, ``countdownPrice``,
        ``newWorldPrice`` and ``paknSavePrice``. The dictionary is empty
        when the search term is blank or nothing was found.
    """
    if not searchTerm or not searchTerm.strip():
        print('No search was entered.')
        return {}

    # See _fetchFoodStuffsCard for why this import is local.
    import requests

    url = ('https://shop.countdown.co.nz/api/v1/products?target=search&search='
           + quote(searchTerm.strip(), safe=''))
    headers = {'X-Requested-With': 'OnlineShopping.WebApp'}  # Necessary for request to be accepted

    response = requests.get(url, headers=headers)
    items = json.loads(response.text)['products']['items']

    if not items:
        print('No results could be found for that search.')
        return {}

    results = {}
    for item in items:
        if item['type'] != 'Product':  # Skip promotions and other non-product entries
            continue

        gtin, name, size = item['barcode'], item['name'], item['size']['volumeSize']
        countdownPrice = item['price']['originalPrice']

        # One lookup per store; each lookup performs up to two HTTP requests.
        newWorldPrice, newWorldMsg = getFoodStuffs(gtin, item, 'NEWWORLD')
        paknSavePrice, paknSaveMsg = getFoodStuffs(gtin, item, 'PAKNSAVE')

        print('%s | %s %s' % (gtin, name, '(' + size + ')' if size else ''))
        print('Countdown: %s' % ('$' + str(countdownPrice)))
        print('New World: ' + _describePrice(newWorldPrice, newWorldMsg))
        print('PaknSave: ' + _describePrice(paknSavePrice, paknSaveMsg))
        print('---')

        results[gtin] = {
            "name": name,
            "size": size,
            "countdownPrice": countdownPrice,
            "newWorldPrice": newWorldPrice,
            "paknSavePrice": paknSavePrice,
        }

    return results