├── static
│   └── css
│       └── style.css
├── templates
│   ├── base.html
│   └── index.html
├── .gitignore
├── LICENSE
├── README.md
├── app.py
└── crawler.py
/static/css/style.css:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GroceryCompare
2 | Scraper for comparing grocery prices across NZ supermarkets — built with Flask! Currently comparing Countdown, New World and PaknSave.
3 |
4 |
5 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, request
2 | import crawler
3 |
4 | app = Flask(__name__)
5 |
6 | @app.route('/', methods = ['POST', 'GET'])
7 | def index():
8 | results = {}
9 |
10 | if request.method == 'POST':
11 | searchTerm = request.form['search']
12 | results = crawler.getCountdown(searchTerm)
13 | return render_template('index.html', results=results)
14 | else:
15 | return render_template('index.html')
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Jess
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/templates/base.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | {% block title %} {% endblock %}
15 |
16 |
17 |
18 | {% block content %} {% endblock %}
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 | {% block content %}
4 | {% block title %} Welcome to Grocery Comparer v1{% endblock %}
5 |
6 |
7 |
8 |
9 |
10 | Please enter the product you would like to search for:
11 |
12 |
16 |
17 |
18 |
19 |
20 |
21 | {% if request.method == "POST" %}
22 |
23 |
24 |
25 | Results:
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 | GTIN
35 | Name
36 | Size
37 | Countdown
38 | New World
39 | PaknSave
40 |
41 |
42 |
43 |
44 | {% for gtin in results %}
45 |
46 | {{ gtin }}
47 | {{ results[gtin]["name"] }}
48 | {{ results[gtin]["size"] }}
49 |
50 | {% if results[gtin]["countdownPrice"] %}
51 | ${{ results[gtin]["countdownPrice"] }}
52 | {% else %}
53 | -
54 | {% endif %}
55 |
56 |
57 | {% if results[gtin]["newWorldPrice"] %}
58 | ${{ results[gtin]["newWorldPrice"] }}
59 | {% else %}
60 | -
61 | {% endif %}
62 |
63 |
64 | {% if results[gtin]["paknSavePrice"] %}
65 | ${{ results[gtin]["paknSavePrice"] }}
66 | {% else %}
67 | -
68 | {% endif %}
69 |
70 |
71 | {% endfor %}
72 |
73 |
74 |
75 |
76 |
77 | {% endif %}
78 |
79 | {% endblock %}
80 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # Other
124 | .DS_Store
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
136 |
--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | from bs4 import BeautifulSoup
4 |
5 |
6 | foodStuffsURL = {'NEWWORLD': {'URL': 'https://www.ishopnewworld.co.nz', 'ID': '60928d93-06fa-4d8f-92a6-8c359e7e846d'}, 'PAKNSAVE': {'URL': 'https://www.paknsaveonline.co.nz', 'ID': '3404c253-577f-45ca-b301-c98312e46efb'}}
7 |
8 |
9 | def escapeSpaces(str):
10 | return(str.replace(' ', '%20'))
11 |
12 |
13 | def getFoodStuffs(gtin, item, brandName):
14 | if ('countdown' not in item['brand'].lower().split()): # Ensure item is not a countdown brand item
15 | url = foodStuffsURL[brandName]['URL'] + '/Search?q=' + str(gtin)
16 | cookies = {'STORE_ID': foodStuffsURL[brandName]['ID']} # Necessary for request to be accepted
17 |
18 | # Request html for new world search result page
19 | response = requests.get(url, cookies=cookies)
20 |
21 | # Soupify and scrape html
22 | soup = BeautifulSoup(response.content, 'html.parser')
23 | data = soup.find('div', {'class': ['js-product-card-footer', 'fs-product-card__footer-container']})
24 |
25 | if (data): # Check if results were found for search
26 | # Extract json and convert to python dictionary
27 | data = json.loads(data.get('data-options'))
28 |
29 | # Get the price
30 | price = data['ProductDetails']['PricePerItem']
31 |
32 | return([price, None])
33 | elif (brandName == "NEWWORLD"): # If no results could be found and the brand is new world
34 | name, brand, size = item['name'], item['brand'], item['size']['volumeSize']
35 |
36 | # Create new url using the brand, size and name of the item - limit to 1 result
37 | url = foodStuffsURL[brandName]['URL'] + '/Search?q=' + escapeSpaces(brand + ' ' + size + ' ' + name) + '&ps=1&pg=1'
38 |
39 | # Send the new request
40 | response = requests.get(url, cookies=cookies)
41 |
42 | # Soupify and scrape html
43 | soup = BeautifulSoup(response.content, 'html.parser')
44 | data = soup.find('div', {'class': ['js-product-card-footer', 'fs-product-card__footer-container']})
45 |
46 | if (data):
47 | # Extract json and convert to python dictionary
48 | data = json.loads(data.get('data-options'))
49 |
50 | # Split words in brand name into list
51 | brandSubStrs = item['brand'].lower().split()
52 |
53 | # Check if top matching product contains words from the brand name
54 | for word in brandSubStrs:
55 | if (word in (data['productName'].lower())):
56 | # Scrape found product's size and check if it equals countdown product size
57 | size = soup.find('a', {'class': ['fs-product-card__row-details']})
58 | if (size is None): # If the class 'fs-product-card__row-details' couldn't be found
59 | size = soup.find('a', {'class': ['fs-product-card__details']})
60 | size.find('p').text.lower()
61 |
62 | if (size == item['size']['volumeSize'].lower()):
63 | # Get the price
64 | price = data['ProductDetails']['PricePerItem']
65 |
66 | return([price, 'Showing result for: "%s %s"' % (data['productName'], size)])
67 |
68 | # Return null if no valid products could be found
69 | return([None, None])
70 |
71 |
72 | def getCountdown(searchTerm):
73 | url = 'https://shop.countdown.co.nz/api/v1/products?target=search&search=' + searchTerm
74 | headers = {'X-Requested-With': 'OnlineShopping.WebApp'} # Necessary for request to be accepted
75 |
76 | if (searchTerm): # Ensure search isn't an empty string
77 | # Send get request to countdown's api
78 | response = requests.get(url, headers=headers)
79 |
80 | # Convert the json into a python dictionary
81 | data = json.loads(response.text)['products']['items']
82 |
83 | # Declare results dictionary
84 | results = {}
85 |
86 | if (data): # Check if results were found for search
87 | for item in data:
88 | if (item['type'] == 'Product'): # Ensure item is not a promotion or non product type
89 | # Retrieve general information about product
90 | gtin, name, size = item['barcode'], item['name'], item['size']['volumeSize']
91 |
92 | # Retrieve countdown prices
93 | countdownPrice = item['price']['originalPrice']
94 |
95 | # Retrieve new world price and warning messages
96 | newWorldPrice, newWorldMsg = [(getFoodStuffs(gtin, item, 'NEWWORLD'))[i] for i in (0, 1)]
97 |
98 | # Retrieve paknsave price and warning messages
99 | paknSavePrice, paknSaveMsg = [(getFoodStuffs(gtin, item, 'PAKNSAVE'))[i] for i in (0, 1)]
100 |
101 | print('%s | %s %s' % (gtin, name, '(' + size + ')' if (size) else ''))
102 | print('Countdown: %s' % ('$' + str(countdownPrice)))
103 | print('New World: %s %s' % ('$' + str(newWorldPrice) if (newWorldPrice) else 'NOT FOUND', newWorldMsg if (newWorldMsg) else ''))
104 | print('PaknSave: %s %s' % ('$' + str(paknSavePrice) if (paknSavePrice) else 'NOT FOUND', paknSaveMsg if (paknSaveMsg) else ''))
105 |
106 | print('---')
107 |
108 | # Create product and add it to dictionary
109 | product = {"name": name, "size": size, "countdownPrice": countdownPrice, "newWorldPrice": newWorldPrice, "paknSavePrice": paknSavePrice}
110 | results[gtin] = product
111 |
112 | else: # Output if no products were found in db
113 | print('No results could be found for that search.')
114 |
115 | else: # Output if search was an empty string
116 | print('No search was entered.')
117 |
118 | # Return the search results
119 | return(results)
120 |
121 |
122 | # getCountdown(input('Search for: '))
--------------------------------------------------------------------------------