├── .flake8
├── .gitattributes
├── .github
│   └── workflows
│       ├── codeql-analysis.yml
│       └── python-app.yml
├── .gitignore
├── LICENSE
├── README.md
├── main.py
├── pyproject.toml
├── pytest.ini
├── requirements.txt
├── scraper
│   ├── __init__.py
│   ├── add_product.py
│   ├── arguments.py
│   ├── clean_data.py
│   ├── constants.py
│   ├── database
│   │   ├── __init__.py
│   │   ├── db.py
│   │   ├── functions.py
│   │   └── models.py
│   ├── delete_data.py
│   ├── domains.py
│   ├── exceptions.py
│   ├── filemanager.py
│   ├── format.py
│   ├── format_to_new.py
│   ├── logfile.log
│   ├── logging.ini
│   ├── models
│   │   ├── __init__.py
│   │   └── product.py
│   ├── print_products.py
│   ├── products.csv
│   ├── records.json
│   ├── reset_data.py
│   ├── scrape.py
│   ├── search_data.py
│   ├── settings.ini
│   └── visualize.py
└── tests
    ├── __init__.py
    ├── test_add_product.py
    ├── test_domains.py
    ├── test_objects.json
    ├── test_visualize.py
    └── test_website_handlers.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-complexity = 10
3 | max-line-length = 127
4 | exclude = .git,__pycache__
5 | per-file-ignores = __init__.py:F401
6 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | name: "CodeQL"
7 |
8 | on:
9 | push:
10 | branches: [master]
11 | pull_request:
12 | # The branches below must be a subset of the branches above
13 | branches: [master]
14 | schedule:
15 | - cron: '0 11 * * 6'
16 |
17 | jobs:
18 | analyze:
19 | name: Analyze
20 | runs-on: ubuntu-latest
21 |
22 | strategy:
23 | fail-fast: false
24 | matrix:
25 | # Override automatic language detection by changing the below list
26 | # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python']
27 | language: ['python']
28 | # Learn more...
29 | # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection
30 |
31 | steps:
32 | - name: Checkout repository
33 | uses: actions/checkout@v2
34 | with:
35 | # We must fetch at least the immediate parents so that if this is
36 | # a pull request then we can checkout the head.
37 | fetch-depth: 2
38 |
39 | # Initializes the CodeQL tools for scanning.
40 | - name: Initialize CodeQL
41 | uses: github/codeql-action/init@v2
42 | with:
43 | languages: ${{ matrix.language }}
44 | # If you wish to specify custom queries, you can do so here or in a config file.
45 | # By default, queries listed here will override any specified in a config file.
46 | # Prefix the list here with "+" to use these queries and those in the config file.
47 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
48 |
49 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
50 | # If this step fails, then you should remove it and run the build manually (see below)
51 | - name: Autobuild
52 | uses: github/codeql-action/autobuild@v2
53 |
54 | # ℹ️ Command-line programs to run using the OS shell.
55 | # 📚 https://git.io/JvXDl
56 |
57 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
58 | # and modify them (or add more) to build your code if your project
59 | # uses a compiled language
60 |
61 | #- run: |
62 | # make bootstrap
63 | # make release
64 |
65 | - name: Perform CodeQL Analysis
66 | uses: github/codeql-action/analyze@v2
67 |
--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
1 | name: Python application
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 | branches: [ master ]
8 |
9 | jobs:
10 | build:
11 |
12 | runs-on: ubuntu-latest
13 | strategy:
14 | matrix:
15 | python-version: ["3.10"]
16 |
17 | steps:
18 | - uses: actions/checkout@v3
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install flake8 pytest
27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
28 | - name: Lint with flake8
29 | run: |
30 | # stop the build if there are Python syntax errors or undefined names
31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
33 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --per-file-ignores=__init__.py:F401,tests/*:E501 --statistics
34 | # - name: Test with pytest
35 | # run: |
36 | # pytest -k "not Amazon"
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | __pycache__/
3 | .pytest_cache/
4 |
5 | test_new_features.py
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Crinibus
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Table of contents
2 | - [Intro](#intro)
3 | - [Contributing](#contributing)
4 | - [Installation](#installation)
5 | - [Add products](#add-products)
6 | - [Websites to scrape from](#websites-to-scrape-from)
7 | - [Scrape products](#scrape-products)
8 | - [Delete data](#delete-data)
9 | - [User settings](#user-settings)
10 | - [Clean up data](#clean-up-data)
11 | - [View the latest datapoint of product(s)](#view-the-latest-datapoint-of-products)
12 | - [View all products](#view-all-products)
13 | - [Visualize data](#visualize-data)
14 | - [Command examples](#command-examples)
15 |
16 |
17 |
18 |
19 | ## Intro
20 | With this program you can easily scrape and track prices on products at multiple [websites](#websites-to-scrape-from).
21 | This program can also visualize the price over time of the products being tracked. That can be helpful if you want to buy a product in the future and want to know if a discount might be around the corner.
22 |
23 | **Requires** `python 3.10+`
24 |
25 |
26 |
27 |
28 | ## Contributing
29 | Feel free to fork the project and create a pull request with new features or refactoring of the code. Also feel free to make issues with problems or suggestions to new features.
30 |
31 |
32 |
33 |
34 | UPDATE TO HOW DATA IS STORED IN V1.1
35 |
36 |
37 | In version v1.1, I have changed how data is stored in ```records.json```: ```dates``` under each product have been changed to ```datapoints``` and are now a list containing dictionaries with ```date``` and ```price``` keys.
38 | If you want to update your data to be compatible with version v1.1, then open an interactive python session where this repository is located and run the following commands:
39 | ```
40 | >>> from scraper.format_to_new import Format
41 | >>> Format.format_old_records_to_new()
42 | ```
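After the migration, each product's ```datapoints``` entry looks along these lines (illustrative values, not copied from a real records.json):
```
"datapoints": [
    {"date": "2022-09-12", "price": 999.0}
]
```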
43 |
44 |
45 |
46 |
47 |
48 | UPDATE TO PRODUCTS.CSV IN V2.3.0
49 |
50 |
51 | In version v2.3.0, I have added the column ```short_url``` to ```products.csv```. If you have added products before v2.3.0, then run the following commands in an interactive python session to add the new column:
52 | ```
53 | >>> from scraper.format_to_new import Format
54 | >>> Format.add_short_urls_to_products_csv()
55 | ```
56 |
57 |
58 |
59 |
60 | UPDATE TO HOW DATA IS STORED IN V3.0.0
61 |
62 |
63 | In version v3.0.0, I have changed where data is stored from a json file to a SQLite database. If you have data from before v3.0.0, then run the following commands in an interactive python session to add the data from records.json to the database (**OBS: Pandas is required**):
64 | ```
65 | >>> from scraper.format_to_new import Format
66 | >>> Format.from_json_to_db()
67 | ```
68 |
69 |
70 |
71 | **NOTE:** This will replace the content in the database with what is in records.json. That means if you have products and/or datapoints in the database but not in records.json, they will be deleted.
72 |
73 |
74 |
75 |
76 | OBS: If you don't have Pandas installed, run this command:
77 | ```
78 | pip3 install pandas
79 | ```
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 | ## Installation
88 | **Requires** `python 3.10+`
89 |
90 | Clone this repository and move into the repository:
91 | ```
92 | git clone https://github.com/Crinibus/scraper.git
93 | ```
94 | ```
95 | cd scraper
96 | ```
97 |
98 | Then make sure you have the required modules by running this in the terminal:
99 | ```
100 | pip3 install -r requirements.txt
101 | ```
102 |
103 |
104 |
105 |
106 | ## Add products
107 | To add a single product, use the following command, where you replace ```<category>``` and ```<url>``` with your category and url:
108 | ```
109 | python3 main.py -a -c <category> -u <url>
110 | ```
111 |
112 | e.g.
113 | ```
114 | python3 main.py -a -c vr -u https://www.komplett.dk/product/1168594/gaming/spiludstyr/vr/vr-briller/oculus-quest-2-vr-briller
115 | ```
116 |
117 | This adds the category (if new) and the product to the records.json file, and adds a line at the end of the products.csv file so the script can scrape the price of the new product.
118 |
119 |
120 |
121 | To add multiple products at once, just specify another category and url with ```-c <category>``` and ```-u <url>```. E.g. with the following command I add two products:
122 | ```
123 | python3 main.py -a -c <category> -u <url> -c <category2> -u <url2>
124 | ```
125 | This is equivalent to the above:
126 | ```
127 | python3 main.py -a -c <category> <category2> -u <url> <url2>
128 | ```
129 |
130 | **OBS**: The url must have a schema like: ```https://``` or ```http://```.
131 | **OBS**: If an error occurs when adding a product, it might be because the url has a ```&``` in it. When this happens, just put quotation marks around the url. This should solve the problem. If it doesn't, then submit an issue.
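For example, with an illustrative category and url:
```
python3 main.py -a -c gpu -u "https://www.example.com/product/1234?variant=5678&color=black"
```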
132 |
133 |
134 |
135 |
136 | ### Websites to scrape from
137 | This scraper can (so far) scrape prices on products from:
138 | - [Amazon](https://www.amazon.com/)*
139 | - [eBay.com](https://www.ebay.com/)
140 | - [Komplett.dk](https://www.komplett.dk/)
141 | - [Proshop.dk](https://www.proshop.dk/)
142 | - [Computersalg.dk](https://www.computersalg.dk/)
143 | - [Elgiganten.dk](https://www.elgiganten.dk/) & [Elgiganten.se](https://www.elgiganten.se/)
144 | - [AvXperten.dk](https://www.avxperten.dk/)
145 | - [Av-Cables.dk](https://www.av-cables.dk/)
146 | - [Power.dk](https://www.power.dk/)
147 | - [Expert.dk](https://www.expert.dk/)
148 | - [MM-Vision.dk](https://www.mm-vision.dk/)
149 | - [Coolshop.dk](https://www.coolshop.dk/)
150 | - [Sharkgaming.dk](https://www.sharkgaming.dk/)
151 | - [Newegg.com](https://www.newegg.com/) & [Newegg.ca](https://www.newegg.ca/)
152 | - [HifiKlubben.dk](https://www.hifiklubben.dk/)
153 | - [Shein.com](https://www.shein.com/)
154 |
155 | ****OBS these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)
156 | The listed Amazon domains are from my quick testing with one or two products from each domain.
157 | If you find that some other Amazon domains work or some of the listed don't, please create an issue.***
158 |
159 |
160 |
161 |
162 | ## Scrape products
163 | To scrape prices of products run this in the terminal:
164 | ```
165 | python3 main.py -s
166 | ```
167 | To scrape with threads run the same command but with the ```--threads``` argument:
168 | ```
169 | python3 main.py -s --threads
170 | ```
171 |
172 |
173 |
174 | ## Activating and deactivating products
175 |
176 | When you add a new product, it is activated to be scraped. If you no longer wish to scrape a product, you can deactivate it with the following command:
177 | ```
178 | python3 main.py --deactivate --id <id>
179 | ```
180 |
181 | You can activate a product again with the following command:
182 | ```
183 | python3 main.py --activate --id <id>
184 | ```
185 |
186 |
187 |
188 | ## Delete data
189 |
190 | If you want to start from scratch with no data in the records.json and products.csv files, then just run the following command:
191 | ```
192 | python3 main.py --delete --all
193 | ```
194 |
195 | You can also just delete some products or some categories:
196 | ```
197 | python3 main.py --delete --id <id>
198 | ```
199 | ```
200 | python3 main.py --delete --name <name>
201 | ```
202 | ```
203 | python3 main.py --delete --category <category>
204 | ```
205 |
206 |
207 | Then just add products like described [here](#add-products).
208 |
209 |
210 |
211 | If you just want to delete all datapoints for every product, then run this command:
212 | ```
213 | python3 main.py --reset --all
214 | ```
215 |
216 |
217 | You can also just delete datapoints for some products:
218 | ```
219 | python3 main.py --reset --id <id>
220 | ```
221 | ```
222 | python3 main.py --reset --name <name>
223 | ```
224 | ```
225 | python3 main.py --reset --category <category>
226 | ```
227 |
228 |
229 |
230 |
231 | ## User settings
232 | User settings can be added and changed in the file settings.ini.
233 |
234 | #### ChangeName
235 | Under the category ```ChangeName``` you can change how the script changes product names, so similar products will be placed under the same product in the records.json file.
236 |
237 | When adding a new setting under the category ```ChangeName``` in settings.ini, there must be a line with ```key<n>``` and a line with ```value<n>```, where ```<n>``` is the "link" between keywords and valuewords. E.g. ```value3``` is the value to ```key3```.
238 |
239 | In ```key<n>``` you set the keywords (separated by a comma) that the product name must contain for it to be changed to what ```value<n>``` is equal to. For example, if the user settings are the following:
240 |
241 | ```
242 | [ChangeName]
243 | key1 = asus,3080,rog,strix,oc
244 | value1 = asus geforce rtx 3080 rog strix oc
245 | ```
246 |
247 | The script checks if a product name has all of the words in ```key1```; if it does, the name gets changed to what ```value1``` is.
248 |
249 | #### Scraping
250 | You can change the delay between each url request by changing the field ```request_delay``` in the file scraper/settings.ini under the ```Scraping``` section.
251 |
252 | The default is 0 seconds, but to avoid the websites you scrape from thinking you are DDoS attacking them, or temporarily restricting you from scraping, set request_delay in settings.ini to a higher number of seconds, e.g. 5 seconds.
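A minimal sketch of that section in settings.ini (the delay value is just an example):
```
[Scraping]
request_delay = 5
```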
253 |
254 |
255 |
256 |
257 | ## Clean up data
258 | If you want to clean up your data, meaning you want to remove unnecessary datapoints (datapoints that have the same price as the datapoint before and after it), then run the following command:
259 | ```
260 | python3 main.py --clean-data
261 | ```
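As an illustration (prices of consecutive datapoints for one product):
```
999, 999, 999, 899  ->  999, 999, 899
```
The middle 999 is removed because the datapoints before and after it have the same price; the first and last datapoints of a product are never removed.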
262 |
263 |
264 |
265 | ## Search products and categories
266 | You can search for product names and categories you have in your records.json by using the argument ```--search <keyword> [<keyword> ...]```. The search is like a keyword search, so e.g. if you enter ```--search logitech```, all product names and categories that contain the word "logitech" are found.
267 |
268 | You can search with multiple keywords, just separate them with a space: ```--search logitech corsair```. Here all the product names and categories that contain the words "logitech" or "corsair" are found.
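E.g. to run the multi-keyword search from above as a full command:
```
python3 main.py --search logitech corsair
```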
269 |
270 |
271 |
272 |
273 | ## View the latest datapoint of product(s)
274 | If you want to view the latest datapoint of a product, you can use the argument ```--latest-datapoint``` with ```--id``` and/or ```--name```.
275 |
276 | Example:
277 | ```
278 | python3 main.py --name "logitech z533" --latest-datapoint
279 | ```
280 |
281 | The above command will show the latest datapoint from every website the specified product, in this case "logitech z533", has been scraped from, and will output something like this:
282 |
283 | ```
284 | LOGITECH Z533
285 | > Komplett - 849816
286 | - DKK 999.0
287 | - 2022-09-12
288 | > Proshop - 2511000
289 | - DKK 669.0
290 | - 2022-09-12
291 | > Avxperten - 25630
292 | - DKK 699.0
293 | - 2022-09-12
294 | ```
295 |
296 |
297 |
298 |
299 | ## View all products
300 | To view all the products you have scraped, you can use the argument ```--list-products```.
301 |
302 | Example:
303 | ```
304 | python3 main.py --list-products
305 | ```
306 |
307 | This will list all the products in the following format:
308 |
309 | ```
310 | CATEGORY
311 | > PRODUCT NAME
312 | - WEBSITE NAME - PRODUCT ID
313 | - ✓ WEBSITE NAME - PRODUCT ID
314 | ```
315 |
316 | The check mark (✓) shows that the product is activated.
317 |
318 |
319 |
320 |
321 | ## Visualize data
322 | To visualize your data, just run main.py with the ```-v``` or ```--visualize``` argument and then specify which products you want to be visualized. These are your options for how you want to visualize your products:
323 |
324 | - ```--all``` to visualize all your products
325 | - ```-c <category> [<category> ...]``` or ```--category <category> [<category> ...]``` to visualize all products in one or more categories
326 | - ```--id <id> [<id> ...]``` to visualize one or more products with the specified id(s)
327 | - ```-n <name> [<name> ...]``` or ```--name <name> [<name> ...]``` to visualize one or more products with the specified name(s)
328 | - ```--compare``` to compare two or more products with the specified id(s), name(s) and/or category(s) or all products on one graph. Use with ```--id```, ```--name```, ```--category``` and/or ```--all```
329 |
330 | ### Example graph
331 | 
332 |
333 | ### Command examples
334 | **Show graphs for all products**
335 |
336 | To show graphs for all products, run the following command:
337 | ```
338 | python3 main.py -v --all
339 | ```
340 |
341 |
342 |
343 | **Show graph(s) for specific products**
344 |
345 | To show a graph for only one product, run the following command where ```<id>``` is the id of the product you want a graph for:
346 | ```
347 | python3 main.py -v --id <id>
348 | ```
349 |
350 | For multiple products, just add another id, like so:
351 | ```
352 | python3 main.py -v --id <id> <id2>
353 | ```
354 |
355 |
356 |
357 | **Show graphs for products in one or more categories**
358 |
359 | To show graphs for all products in one category, run the following command where ```<category>``` is the category you want graphs for:
360 | ```
361 | python3 main.py -v -c <category>
362 | ```
363 |
364 | For multiple categories, just add another category, like so:
365 | ```
366 | python3 main.py -v -c <category> <category2>
367 | ```
368 |
369 |
370 |
371 | **Show graphs for products with a specific name**
372 |
373 | To show graphs for product(s) with a specific name, run the following command where ```<name>``` is the name of the product(s) you want graphs for:
374 | ```
375 | python3 main.py -v --name <name>
376 | ```
377 |
378 | For multiple products with different names, just add another name, like so:
379 | ```
380 | python3 main.py -v --name <name> <name2>
381 | ```
382 |
383 | If the name of a product has multiple words in it, then just add quotation marks around the name.
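For example, using a product name from the example output earlier in this README:
```
python3 main.py -v --name "logitech z533"
```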
384 |
385 |
386 |
387 | **Only show graph for products that are up to date**
388 |
389 | To only show graphs for the products that are up to date, use the flag ```--up-to-date``` or ```-utd```, like so:
390 | ```
391 | python3 main.py -v --all -utd
392 | ```
393 | The use of the flag ```-utd``` is only implemented when visualizing all products like the example above or when visualizing all products in a category:
394 | ```
395 | python3 main.py -v -c <category> -utd
396 | ```
397 |
398 |
399 |
400 | **Compare two products**
401 |
402 | To compare two products on one graph, use the flag ```--compare``` with flag ```--id```, ```--name```, ```--category``` and/or ```--all```, like so:
403 | ```
404 | python3 main.py -v --compare --id <id>
405 | ```
406 | ```
407 | python3 main.py -v --compare --name <name>
408 | ```
409 | ```
410 | python3 main.py -v --compare --category <category>
411 | ```
412 | ```
413 | python3 main.py -v --compare --id <id> --name <name> --category <category>
414 | ```
415 | ```
416 | python3 main.py -v --compare --all
417 | ```
418 |
419 | ***OBS** when using ```--name``` or ```--category```, multiple products can be visualized*
420 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import logging.config
3 | import logging
4 | import time
5 | import alive_progress
6 | import scraper
7 |
8 | alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)
9 |
10 |
11 | def main() -> None:
12 | args = scraper.argparse_setup()
13 |
14 | if args.clean_data:
15 | scraper.clean_datapoints()
16 |
17 | if args.visualize:
18 | scraper.visualize_data(args.all, args.category, args.id, args.name, args.up_to_date, args.compare)
19 |
20 | if args.reset:
21 | scraper.reset(args.category, args.name, args.id, args.all)
22 |
23 | if args.add:
24 | scraper.add_products(args.category, args.url)
25 |
26 | if args.activate:
27 | scraper.update_products_is_active_with_product_codes(args.id, True)
28 |
29 | if args.deactivate:
30 | scraper.update_products_is_active_with_product_codes(args.id, False)
31 |
32 | if args.search:
33 | scraper.search(args.search)
34 |
35 | if args.scrape:
36 | if args.threads:
37 | scrape_with_threads()
38 | else:
39 | scrape()
40 |
41 | if args.latest_datapoint:
42 | scraper.print_latest_datapoints(args.name, args.id, args.category)
43 |
44 | if args.list_products:
45 | if any([args.name, args.id, args.category]):
46 | scraper.list_products_with_filters(args.name, args.id, args.category)
47 | else:
48 | scraper.print_all_products()
49 |
50 | if args.delete:
51 | scraper.delete(args.category, args.name, args.id, args.all)
52 |
53 |
54 | def scrape() -> None:
55 | print("Scraping...")
56 |
57 | request_delay = scraper.Config.get_request_delay()
58 | active_products = scraper.db.get_all_products(select_only_active=True)
59 |
60 | products = scraper.Format.db_products_to_scrapers(active_products)
61 |
62 | with alive_progress.alive_bar(len(products), title="Scraping") as bar:
63 | # Scrape and save scraped data for each product (sequentially)
64 | for product in products:
65 | bar.text = f"-> {product.url}"
66 | time.sleep(request_delay)
67 | product.scrape_info()
68 | scraper.add_product.add_new_datapoint_with_scraper(product)
69 | bar()
70 |
71 |
72 | def scrape_with_threads() -> None:
73 | print("Scraping with threads...")
74 |
75 | request_delay = scraper.Config.get_request_delay()
76 |
77 | grouped_db_products = scraper.db.get_all_products_grouped_by_domains(select_only_active=True)
78 | grouped_products: list[list[scraper.Scraper]] = []
79 |
80 | for db_products in grouped_db_products:
81 | products = scraper.Format.db_products_to_scrapers(db_products)
82 | grouped_products.append(products)
83 |
84 | grouped_scraper_threads: list[list[threading.Thread]] = []
85 |
86 | # Create scraper threads and group by domain
87 | for products in grouped_products:
88 | scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
89 | grouped_scraper_threads.append(scraper_threads)
90 |
91 | products_flatten = [product for products in grouped_products for product in products]
92 |
93 | with alive_progress.alive_bar(len(products_flatten), title="Scraping with threads") as progress_bar:
94 | # Create master threads to manage scraper threads sequentially for each domain
95 | master_threads = [
96 | threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay, progress_bar])
97 | for scraper_threads in grouped_scraper_threads
98 | ]
99 |
100 | # Start all master threads
101 | for master_thread in master_threads:
102 | master_thread.start()
103 |
104 | # Wait for all master threads to finish
105 | for master_thread in master_threads:
106 | master_thread.join()
107 |
108 | # Save scraped data for each product (sequentially)
109 | for product in products_flatten:
110 | scraper.add_product.add_new_datapoint_with_scraper(product)
111 |
112 |
113 | if __name__ == "__main__":
114 | scraper.db.create_db_and_tables()
115 | logging.config.fileConfig(
116 | fname=scraper.Filemanager.logging_ini_path,
117 | defaults={"logfilename": scraper.Filemanager.logfile_path},
118 | )
119 |
120 | main()
121 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 127
3 |
4 | [tool.ruff]
5 | line-length = 127
6 |
7 | [tool.ruff.per-file-ignores]
8 | "__init__.py" = ["E402"]
9 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths =
3 | tests
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.24.0
2 | beautifulsoup4>=4.9.1
3 | plotly>=4.12.0
4 | pytest>=7.1.2
5 | pytest-mock>=3.8.2
6 | alive-progress>=2.4.1
7 | flake8>=6.0.0
8 | sqlmodel>=0.0.8
9 |
--------------------------------------------------------------------------------
/scraper/__init__.py:
--------------------------------------------------------------------------------
1 | from .scrape import Scraper, start_threads_sequentially
2 | from .arguments import argparse_setup
3 | from .add_product import add_products, update_products_is_active_with_product_codes
4 | from .filemanager import Filemanager, Config
5 | from .visualize import visualize_data
6 | from .clean_data import clean_datapoints
7 | from .delete_data import delete
8 | from .reset_data import reset
9 | from .search_data import search
10 | from .print_products import print_latest_datapoints, print_all_products, list_products_with_filters
11 | from .format import Format
12 | import scraper.database as db
13 |
14 |
15 | __author__ = "Crinibus"
16 |
--------------------------------------------------------------------------------
/scraper/add_product.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from datetime import datetime
3 |
4 | import scraper.database as db
5 | from scraper.exceptions import WebsiteNotSupported, URLMissingSchema
6 | from scraper.format import Format
7 | from scraper.scrape import Scraper
8 | from scraper.domains import get_website_name, SUPPORTED_DOMAINS
9 | from scraper.constants import URL_SCHEMES
10 |
11 |
12 | def add_products(categories: list[str], urls: list[str]) -> None:
13 | for category, url in zip(categories, urls):
14 | try:
15 | add_product(category, url)
16 | except (WebsiteNotSupported, URLMissingSchema) as err:
17 | logging.getLogger(__name__).error(err)
18 | print(err)
19 |
20 |
21 | def add_product(category: str, url: str) -> None:
22 | logger = logging.getLogger(__name__)
23 |
24 | website_name = get_website_name(url, keep_subdomain=False)
25 |
26 | if website_name not in SUPPORTED_DOMAINS.keys():
27 | raise WebsiteNotSupported(website_name)
28 |
29 | if is_missing_url_schema(url):
30 | raise URLMissingSchema(url)
31 |
32 | print(f"Adding product with category '{category}' and url '{url}'")
33 | logger.info(f"Adding product with category '{category}' and url '{url}'")
34 |
35 | new_product = Scraper(category, url)
36 | new_product_info = new_product.scrape_info()
37 |
38 | if not new_product_info.valid:
39 | print("Product info is not valid - see logs for more info")
40 | return
41 |
42 | product_in_db = db.get_product_by_product_code(new_product_info.id)
43 |
44 | if product_in_db is None:
45 | add_new_product_to_db(new_product)
46 | add_new_datapoint_with_scraper(new_product)
47 | return
48 |
49 | logger.info("Product with the same product code already exists in database")
50 |
51 | if product_in_db.is_active:
52 | print("Product with the same product code already exists in database and is active")
53 | return
54 |
55 | user_input = input(
56 | "A product with the same product id already exist in the database but is not active, "
57 | "do you want to activate it? (y/n) > "
58 | )
59 |
60 | if user_input.lower() in ("y", "yes"):
61 | print("Activating product...")
62 | set_existing_product_is_active(product_in_db, True)
63 | logger.info("Product has been activated")
64 | else:
65 | print("Product has not been activated")
66 | logger.info("Product not activated")
67 |
68 |
69 | def add_new_product_to_db(product: Scraper) -> None:
70 | product_to_db = Format.scraper_to_db_product(product, True)
71 | db.add(product_to_db)
72 |
73 |
74 | def add_new_datapoint_to_db(product_code: str, price: float, currency: str, date: str | None = None):
75 | """Parameter 'date' defaults to the date of today in the format: YYYY-MM-DD"""
76 | if date is None:
77 | date = datetime.today().strftime("%Y-%m-%d")
78 |
79 | new_datapoint = db.DataPoint(
80 | product_code=product_code,
81 | date=date,
82 | price=price,
83 | currency=currency,
84 | )
85 |
86 | db.add(new_datapoint)
87 |
88 |
89 | def add_new_datapoint_with_scraper(product: Scraper, date: str | None = None) -> None:
90 | if not product.product_info or not product.product_info.valid:
91 | print(f"Product info is not valid - category: '{product.category}' - url: {product.url}")
92 | return
93 |
94 | product_code = product.product_info.id
95 | price = product.product_info.price
96 | currency = product.product_info.currency
97 |
98 | add_new_datapoint_to_db(product_code, price, currency, date)
99 |
100 |
101 | def update_products_is_active_with_product_codes(product_codes: list[str], is_active: bool) -> None:
102 | action = "Activating" if is_active else "Deactivating"
103 |
104 | for product_code in product_codes:
105 | print(f"{action} {product_code}")
106 | product = db.get_product_by_product_code(product_code)
107 | set_existing_product_is_active(product, is_active)
108 |
109 |
110 | def set_existing_product_is_active(product: db.Product, is_active: bool) -> None:
111 | product.is_active = is_active
112 | db.add(product)
113 |
114 |
115 | def is_missing_url_schema(url: str) -> bool:
116 | return not any(schema in url for schema in URL_SCHEMES)
117 |
--------------------------------------------------------------------------------
/scraper/arguments.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | def argparse_setup() -> argparse.Namespace:
5 | """Setup and return argparse."""
6 | parser = argparse.ArgumentParser(description="")
7 |
8 | parser.add_argument(
9 | "-s",
10 | "--scrape",
11 | help="scrape product info",
12 | action="store_true",
13 | )
14 |
15 | parser.add_argument("--threads", help="use threads when scraping product info", action="store_true")
16 |
17 | parser.add_argument(
18 | "-a",
19 | "--add",
20 | help="Add a new product",
21 | action="store_true",
22 | )
23 |
24 | parser.add_argument(
25 | "-c",
26 | "--category",
27 | help="specify category(s)",
28 | type=str,
29 | nargs="*",
30 | action="extend",
31 | default=[],
32 | )
33 |
34 | parser.add_argument("-u", "--url", help="the url to the product", type=str, nargs="*", action="extend")
35 |
36 | parser.add_argument("--activate", help="activate a product to be scraped", action="store_true")
37 |
38 | parser.add_argument("--deactivate", help="deactivate a product to not be scraped", action="store_true")
39 |
40 | parser.add_argument(
41 | "-v",
42 | "--visualize",
43 | help="visualize your product data",
44 | action="store_true",
45 | dest="visualize",
46 | )
47 |
48 | parser.add_argument(
49 | "--all",
50 | help="specify all products",
51 | action="store_true",
52 | dest="all",
53 | )
54 |
55 | parser.add_argument(
56 | "--id",
57 | help="specify id(s) of product(s)",
58 | type=str,
59 | nargs="*",
60 | action="extend",
61 | dest="id",
62 | default=[],
63 | )
64 |
65 | parser.add_argument(
66 | "-n",
67 | "--name",
68 | help="specify names(s) of product(s)",
69 | type=str,
70 | nargs="*",
71 | action="extend",
72 | dest="name",
73 | default=[],
74 | )
75 |
76 | parser.add_argument(
77 | "-utd",
78 | "--up-to-date",
79 | help="show only graph for a product if the latest product price is today",
80 | action="store_true",
81 | dest="up_to_date",
82 | )
83 |
84 | parser.add_argument(
85 | "--search",
86 | help="search for product names with the specified name(s)",
87 | type=str,
88 | nargs="*",
89 | action="extend",
90 | dest="search",
91 | metavar="SEARCH_TERM",
92 | )
93 |
94 | parser.add_argument(
95 | "--compare",
96 | help="compare two or more products",
97 | action="store_true",
98 | dest="compare",
99 | )
100 |
101 | parser.add_argument(
102 | "--reset",
103 | help="delete data for each product in records.json, such as prices of each recorded day",
104 | action="store_true",
105 | )
106 |
107 | parser.add_argument(
108 | "--clean-data",
109 | help="clean data so unnecessary product datapoints is removed from records",
110 | action="store_true",
111 | dest="clean_data",
112 | )
113 |
114 | parser.add_argument(
115 | "--latest-datapoint",
116 | help="get the latest datapoint of specified product(s)",
117 | dest="latest_datapoint",
118 | action="store_true",
119 | )
120 |
121 | parser.add_argument(
122 | "--list-products",
123 | help="lists the names, websites and ids of all products",
124 | dest="list_products",
125 | action="store_true",
126 | )
127 |
128 | parser.add_argument(
129 | "--delete",
130 | help="delete all or specific products or categories",
131 | dest="delete",
132 | action="store_true",
133 | )
134 |
135 | args = validate_arguments(parser)
136 |
137 | return args
138 |
139 |
140 | def validate_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace:
141 | """Validate arguments"""
142 | args = parser.parse_args()
143 |
144 | if args.add and args.visualize:
145 | parser.error("Cannot use --add and --visualize at the same time")
146 |
147 | if args.activate and args.deactivate:
148 | parser.error("Cannot use --activate and --deactivate at the same time")
149 |
150 | if (args.activate or args.deactivate) and not args.id:
151 | parser.error("When using --activate or --deactivate, then --id is required")
152 |
153 | if args.delete:
154 | if args.all and any([args.category, args.name, args.id]):
155 | parser.error("When using --delete and --all, then using --category, --name or --id does nothing")
156 |
157 | if args.add:
158 | if not args.category or not args.url:
159 | parser.error("When using --add, then --category and --url is required")
160 | if len(args.category) > len(args.url):
161 | parser.error("Specified more categories than urls")
162 | if len(args.category) < len(args.url):
163 | parser.error("Specified more urls than categories")
164 |
165 | if args.visualize:
166 | if not any([args.all, args.category, args.id, args.name, args.compare]):
167 | parser.error(
168 | "When using --visualize, then one of the following is required: --all, --category, --id, --name, --compare"
169 | )
170 | if args.compare and not any([args.id, args.name, args.category, args.all]):
171 | parser.error(
172 | "When using --visualize and --compare, then one of the following is required: --id, --name, --category, --all"
173 | )
174 |
175 | if args.latest_datapoint:
176 | if not any([args.name, args.id, args.category]):
177 | parser.error("When using --latest-datapoint, then --name, --id or --category is required")
178 |
179 | return args
180 |
--------------------------------------------------------------------------------
/scraper/clean_data.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import scraper.database as db
4 |
5 |
6 | def clean_datapoints() -> None:
7 | print("Cleaning data...")
8 | logging.getLogger(__name__).info("Cleaning database datapoints")
9 |
10 | all_products = db.get_all_products()
11 | datapoints_to_delete = []
12 |
13 | for product in all_products:
14 | datapoints = db.get_datapoints_by_product_codes([product.product_code])
15 |
16 |         datapoints.sort(key=lambda datapoint: datapoint.date)
17 |
18 | for index, datapoint in enumerate(datapoints):
19 | if index in (0, len(datapoints) - 1):
20 | continue
21 |
22 | previous_datapoint = datapoints[index - 1]
23 | next_datapoint = datapoints[index + 1]
24 |
25 | if datapoint.price == previous_datapoint.price and datapoint.price == next_datapoint.price:
26 | datapoints_to_delete.append(datapoint)
27 |
28 | db.delete_all(datapoints_to_delete)
29 |
30 | print("Done cleaning data")
31 |
--------------------------------------------------------------------------------
/scraper/constants.py:
--------------------------------------------------------------------------------
1 | REQUEST_HEADER = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:112.0) Gecko/20100101 Firefox/112.0"}
2 |
3 | REQUEST_COOKIES = {"cookies_are": "working"}
4 |
5 | WEBSITE_COLORS = {
6 | "komplett": "orange",
7 | "proshop": "red",
8 | "computersalg": "blue",
9 | "elgiganten": "green",
10 | "avxperten": "aqua",
11 | "av-cables": "aquamarine",
12 | "amazon": "black",
13 | "ebay": "crimson",
14 | "power": "salmon",
15 | "expert": "olivedrab",
16 | "mm-vision": "mediumspringgreen",
17 | "coolshop": "mediumblue",
18 | "sharkgaming": "midnightblue",
19 | "newegg": "#f7c20a",
20 | "hifiklubben": "#231f20",
21 | "shein": "#ffed24",
22 | }
23 |
24 | URL_SCHEMES = ("http://", "https://")
25 |
26 | CHECK_MARK = "\u2713"
27 |
--------------------------------------------------------------------------------
/scraper/database/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import Product, DataPoint
2 | from .db import create_db_and_tables, engine
3 |
4 | from .functions import (
5 | delete_all,
6 | add,
7 | add_all,
8 | get_all_products,
9 | get_all_datapoints,
10 | get_product_by_product_code,
11 | get_products_by_product_codes,
12 | get_products_by_categories,
13 | get_products_by_names,
14 | get_products_by_names_fuzzy,
15 | get_datapoints_by_categories,
16 | get_datapoints_by_names,
17 | get_datapoints_by_product_codes,
18 | get_all_products_with_datapoints,
19 | get_product_infos_from_products,
20 | get_all_unique_categories,
21 | get_all_unique_domains,
22 | get_products_by_domains,
23 | get_all_products_grouped_by_domains,
24 | group_products_by_domains,
25 | group_products_by_names,
26 | )
27 |
--------------------------------------------------------------------------------
/scraper/database/db.py:
--------------------------------------------------------------------------------
1 | from sqlmodel import SQLModel, create_engine
2 | from pathlib import Path
3 | from scraper.filemanager import Filemanager
4 | from .models import Product, DataPoint # noqa: F401
5 |
6 | sqlite_url = f"sqlite:///{Filemanager.database_path}"
7 |
8 | Path(Filemanager.database_folder).mkdir(exist_ok=True)
9 |
10 | engine = create_engine(sqlite_url, echo=False)
11 |
12 |
13 | def create_db_and_tables():
14 | SQLModel.metadata.create_all(engine)
15 |
--------------------------------------------------------------------------------
/scraper/database/functions.py:
--------------------------------------------------------------------------------
1 | from sqlmodel import Session, select, col
2 |
3 | from scraper.models.product import DataPointInfo, ProductInfo
4 | from .db import engine
5 | from .models import Product, DataPoint
6 |
7 |
8 | def delete_all(elements: list[Product | DataPoint]) -> None:
9 | with Session(engine) as session:
10 | for element in elements:
11 | session.delete(element)
12 | session.commit()
13 |
14 |
15 | def add(element: Product | DataPoint) -> None:
16 | with Session(engine) as session:
17 | session.add(element)
18 | session.commit()
19 |
20 |
21 | def add_all(elements: list[Product | DataPoint]) -> None:
22 | with Session(engine) as session:
23 | session.add_all(elements)
24 | session.commit()
25 |
26 |
27 | def get_all_products(select_only_active: bool = False) -> list[Product]:
28 | with Session(engine) as session:
29 | query = select(Product)
30 |
31 | if select_only_active:
32 | query = query.where(Product.is_active)
33 |
34 | return session.exec(query).all()
35 |
36 |
37 | def get_all_datapoints() -> list[DataPoint]:
38 | with Session(engine) as session:
39 | return session.exec(select(DataPoint)).all()
40 |
41 |
42 | def get_all_unique_categories() -> list[str]:
43 | with Session(engine) as session:
44 | return session.exec(select(Product.category).distinct()).all()
45 |
46 |
47 | def get_all_unique_domains() -> list[str]:
48 | with Session(engine) as session:
49 | return session.exec(select(Product.domain).distinct()).all()
50 |
51 |
52 | def get_product_by_product_code(product_code: str) -> Product | None:
53 | with Session(engine) as session:
54 | return session.exec(select(Product).where(Product.product_code == product_code)).first()
55 |
56 |
57 | def get_products_by_product_codes(product_codes: list[str]) -> list[Product]:
58 | with Session(engine) as session:
59 | return session.exec(select(Product).where(col(Product.product_code).in_(product_codes))).all()
60 |
61 |
62 | def get_products_by_categories(categories: list[str]) -> list[Product]:
63 | with Session(engine) as session:
64 | return session.exec(select(Product).where(col(Product.category).in_(categories))).all()
65 |
66 |
67 | def get_products_by_names(names: list[str]) -> list[Product]:
68 | with Session(engine) as session:
69 | return session.exec(select(Product).where(col(Product.name).in_(names))).all()
70 |
71 |
72 | def get_products_by_names_fuzzy(names: list[str]) -> list[Product]:
73 | with Session(engine) as session:
74 | matched_products = []
75 |
76 | for name in names:
77 | fuzzy_name = f"%{name}%"
78 | products = session.exec(select(Product).where(col(Product.name).like(fuzzy_name))).all()
79 | matched_products.extend(products)
80 |
81 | return matched_products
82 |
83 |
84 | def get_products_by_domains(domains: list[str], select_only_active: bool = False) -> list[Product]:
85 | with Session(engine) as session:
86 | query = select(Product).where(col(Product.domain).in_(domains))
87 |
88 | if select_only_active:
89 | query = query.where(Product.is_active)
90 |
91 | return session.exec(query).all()
92 |
93 |
94 | def get_datapoints_by_categories(categories: list[str]) -> list[DataPoint]:
95 | with Session(engine) as session:
96 | products = session.exec(select(Product).where(col(Product.category).in_(categories))).all()
97 | product_codes = [product.product_code for product in products]
98 | datapoints = session.exec(select(DataPoint).where(col(DataPoint.product_code).in_(product_codes))).all()
99 | return datapoints
100 |
101 |
102 | def get_datapoints_by_names(names: list[str]) -> list[DataPoint]:
103 | with Session(engine) as session:
104 | products = session.exec(select(Product).where(col(Product.name).in_(names))).all()
105 | product_codes = [product.product_code for product in products]
106 | datapoints = session.exec(select(DataPoint).where(col(DataPoint.product_code).in_(product_codes))).all()
107 | return datapoints
108 |
109 |
110 | def get_datapoints_by_product_codes(product_codes: list[str]) -> list[DataPoint]:
111 | with Session(engine) as session:
112 | products = session.exec(select(Product).where(col(Product.product_code).in_(product_codes))).all()
113 | found_product_codes = [product.product_code for product in products]
114 | datapoints = session.exec(select(DataPoint).where(col(DataPoint.product_code).in_(found_product_codes))).all()
115 | return datapoints
116 |
117 |
118 | def get_datapoints_by_product(product: Product) -> list[DataPoint]:
119 | with Session(engine) as session:
120 | datapoints = session.exec(
121 | select(DataPoint).where(DataPoint.product_code == product.product_code).order_by(DataPoint.date)
122 | ).all()
123 | return datapoints
124 |
125 |
126 | def get_all_products_with_datapoints(select_only_active: bool = False) -> list[ProductInfo]:
127 | products = get_all_products(select_only_active=select_only_active)
128 | return get_product_infos_from_products(products)
129 |
130 |
131 | def get_product_infos_from_products(products: list[Product]) -> list[ProductInfo]:
132 | product_infos: list[ProductInfo] = []
133 |
134 | for product in products:
135 | datapoints = get_datapoints_by_product(product)
136 |
137 | datapoint_infos = [DataPointInfo(date=datapoint.date, price=datapoint.price) for datapoint in datapoints]
138 |
139 | product_info = ProductInfo(
140 | id=product.product_code,
141 | product_name=product.name,
142 | category=product.category,
143 | currency=datapoints[0].currency if datapoints else "",
144 | datapoints=datapoint_infos,
145 | url=product.url,
146 | website=product.domain,
147 | )
148 |
149 | product_infos.append(product_info)
150 |
151 | return product_infos
152 |
153 |
154 | def get_all_products_grouped_by_domains(select_only_active: bool = False) -> list[list[Product]]:
155 | all_products = get_all_products(select_only_active=select_only_active)
156 | return group_products_by_domains(all_products)
157 |
158 |
159 | def group_products_by_domains(products: list[Product]) -> list[list[Product]]:
160 | grouped_products = []
161 |
162 | unique_domains = set([product.domain for product in products])
163 |
164 | for domain in unique_domains:
165 | products_with_domain = list(filter(lambda product: product.domain == domain, products))
166 |
167 | if not products_with_domain:
168 | continue
169 |
170 | grouped_products.append(products_with_domain)
171 |
172 | return grouped_products
173 |
174 |
175 | def group_products_by_names(products: list[Product]) -> list[list[Product]]:
176 | grouped_products = []
177 |
178 | unique_names = set([product.name for product in products])
179 |
180 | for name in unique_names:
181 | products_with_name = list(filter(lambda product: product.name == name, products))
182 |
183 | if not products_with_name:
184 | continue
185 |
186 | grouped_products.append(products_with_name)
187 |
188 | return grouped_products
189 |
--------------------------------------------------------------------------------
/scraper/database/models.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from sqlmodel import Field, SQLModel
3 |
4 |
5 | class Product(SQLModel, table=True):
6 | __tablename__: str = "products"
7 |
8 | id: int = Field(default=None, primary_key=True)
9 | product_code: str
10 | name: str
11 | category: str
12 | domain: str
13 | url: str
14 | short_url: str
15 | is_active: bool
16 | created_at: datetime = Field(default_factory=datetime.utcnow, nullable=False)
17 |
18 |
19 | class DataPoint(SQLModel, table=True):
20 | __tablename__: str = "datapoints"
21 |
22 | id: int = Field(default=None, primary_key=True)
23 | product_code: str
24 | date: str
25 | price: float
26 | currency: str
27 | created_at: datetime = Field(default_factory=datetime.utcnow, nullable=False)
28 |
--------------------------------------------------------------------------------
/scraper/delete_data.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import scraper.database as db
3 |
4 |
5 | def delete(categories: list[str], names: list[str], product_codes: list[str], all: bool) -> None:
6 | print("Deleting...")
7 | logging.getLogger(__name__).info(f"Deleting products and datapoint for {categories=}, {names=}, {product_codes=}, {all=}")
8 |
9 | if all:
10 | delete_all()
11 | return
12 |
13 | if categories:
14 | delete_products_by_categories(categories)
15 |
16 | if names:
17 | delete_products_by_names(names)
18 |
19 | if product_codes:
20 | delete_products_by_product_codes(product_codes)
21 |
22 |
23 | def delete_all() -> None:
24 | print("Deleting all products and datapoints...")
25 | logging.getLogger(__name__).info("Deleting all products and datapoints")
26 |
27 | all_products = db.get_all_products()
28 | all_datapoints = db.get_all_datapoints()
29 |
30 | db.delete_all(all_products)
31 | db.delete_all(all_datapoints)
32 |
33 |
34 | def delete_products_by_categories(categories: list[str]) -> None:
35 | products = db.get_products_by_categories(categories)
36 | log_product_codes_with_message(products, "Deleting products with categories")
37 | db.delete_all(products)
38 |
39 |
40 | def delete_products_by_names(names: list[str]) -> None:
41 | products = db.get_products_by_names(names)
42 | log_product_codes_with_message(products, "Deleting products by names")
43 | db.delete_all(products)
44 |
45 |
46 | def delete_products_by_product_codes(product_codes: list[str]) -> None:
47 | products = db.get_products_by_product_codes(product_codes)
48 | log_product_codes_with_message(products, "Deleting products with product codes")
49 | db.delete_all(products)
50 |
51 |
52 | def log_product_codes_with_message(products: list[db.Product], log_message: str) -> None:
53 | logger = logging.getLogger(__name__)
54 | product_codes = [product.product_code for product in products]
55 |
56 | if product_codes:
57 | product_codes_string = ", ".join(product_codes)
58 | print(f"Deleting product codes: {product_codes_string}")
59 | else:
60 | print("No product found to delete")
61 |
62 | logger.info(f"{log_message} - {product_codes=}")
63 |
--------------------------------------------------------------------------------
/scraper/domains.py:
--------------------------------------------------------------------------------
1 | import re
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import json
5 | import logging
6 | from abc import ABC, abstractmethod
7 |
8 | from scraper.models import Info
9 | from scraper.constants import REQUEST_HEADER, REQUEST_COOKIES
10 | from scraper.filemanager import Config
11 | from scraper.exceptions import WebsiteVersionNotSupported
12 |
13 |
14 | def request_url(url: str) -> requests.Response:
15 | request_timeout = Config.get_request_timeout()
16 |
17 | try:
18 | response = requests.get(url, headers=REQUEST_HEADER, cookies=REQUEST_COOKIES, timeout=request_timeout)
19 | return response
20 | except requests.RequestException:
21 | logging.getLogger(__name__).exception(f"Module requests exception with url: {url}")
22 |
23 |
24 | class BaseWebsiteHandler(ABC):
25 | def __init__(self, url: str) -> None:
26 | self.url = url
27 | self.website_name = get_website_name(url)
28 | self.info: Info = None
29 | self.request_data = None
30 |
31 | def get_product_info(self) -> Info:
32 | try:
33 | self._request_product_data()
34 | self._get_common_data()
35 | raw_name = self._get_product_name()
36 | name = Config.get_user_product_name(raw_name)
37 | price = self._get_product_price()
38 | currency = self._get_product_currency()
39 | id = self._get_product_id()
40 | self.info = Info(name, price, currency, id)
41 | return self.info
42 | except (AttributeError, ValueError, TypeError):
43 | logging.getLogger(__name__).exception(f"Could not get all the data needed from url: {self.url}")
44 | return Info(None, None, None, None, valid=False)
45 | except WebsiteVersionNotSupported as ex:
46 | logging.getLogger(__name__).error(ex)
47 | return Info(None, None, None, None, valid=False)
48 |
49 | def _request_product_data(self) -> None:
50 | # option for each specific class to change how the request data is being handled
51 | response = request_url(self.url)
52 | self.request_data = BeautifulSoup(response.text, "html.parser")
53 |
54 | def _get_common_data(self) -> None:
55 | # if the same data needs to be accessed from more than one of the abstract methods,
56 |         # then you can use this method to store the data as an instance variable,
57 | # so that the other methods can access the data
58 | pass
59 |
60 | @abstractmethod
61 | def _get_product_name(self) -> str:
62 | pass
63 |
64 | @abstractmethod
65 | def _get_product_price(self) -> float:
66 | pass
67 |
68 | @abstractmethod
69 | def _get_product_currency(self) -> str:
70 | pass
71 |
72 | @abstractmethod
73 | def _get_product_id(self) -> str:
74 | pass
75 |
76 | @abstractmethod
77 | def get_short_url(self) -> str:
78 | pass
79 |
80 |
81 | class KomplettHandler(BaseWebsiteHandler):
82 | def _get_product_name(self) -> str:
83 | return self.request_data.find("div", class_="product-main-info__info").h1.span.text
84 |
85 | def _get_product_price(self) -> float:
86 | return float(self.request_data.find("span", class_="product-price-now").text.strip(",-").replace(".", ""))
87 |
88 | def _get_product_currency(self) -> str:
89 | script_tag = self.request_data.find("script", type="application/ld+json").contents[0]
90 | currency = json.loads(script_tag).get("offers").get("priceCurrency")
91 | return currency
92 |
93 | def _get_product_id(self) -> str:
94 | return self.url.split("/")[4]
95 |
96 | def get_short_url(self) -> str:
97 | id = self._get_product_id()
98 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
99 | return f"{website}/product/{id}"
100 |
101 |
102 | class ProshopHandler(BaseWebsiteHandler):
103 | def _get_common_data(self) -> None:
104 | soup_script_tag = self.request_data.find("script", type="application/ld+json").contents[0]
105 | self.script_json = json.loads(soup_script_tag)
106 |
107 | def _get_product_name(self) -> str:
108 | return self.script_json["name"]
109 |
110 | def _get_product_price(self) -> float:
111 | try:
112 | # find normal price
113 | price = float(
114 | self.request_data.find("span", class_="site-currency-attention")
115 | .text.replace(".", "")
116 | .replace(",", ".")
117 | .strip(" kr")
118 | )
119 | except AttributeError:
120 | try:
121 | # find discount price
122 | price = float(
123 | self.request_data.find("div", class_="site-currency-attention site-currency-campaign")
124 | .text.replace(".", "")
125 | .replace(",", ".")
126 | .strip(" kr")
127 | )
128 | except AttributeError:
129 | # if campaign is sold out (udsolgt)
130 | price = float(
131 | self.request_data.find("div", class_="site-currency-attention")
132 | .text.replace(".", "")
133 | .replace(",", ".")
134 | .strip(" kr")
135 | )
136 | return price
137 |
138 | def _get_product_currency(self) -> str:
139 | currency = self.script_json.get("offers").get("priceCurrency")
140 | return currency
141 |
142 | def _get_product_id(self) -> str:
143 | return self.url.split("/")[-1]
144 |
145 | def get_short_url(self) -> str:
146 | id = self._get_product_id()
147 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
148 | return f"{website}/{id}"
149 |
150 |
151 | class ComputerSalgHandler(BaseWebsiteHandler):
152 | def _get_product_name(self) -> str:
153 | return self.request_data.find("meta", {"name": "title"})["content"]
154 |
155 | def _get_product_price(self) -> float:
156 | return float(self.request_data.find("span", itemprop="price").text.strip().replace(".", "").replace(",", "."))
157 |
158 | def _get_product_currency(self) -> str:
159 | return self.request_data.find("span", itemprop="priceCurrency").get("content")
160 |
161 | def _get_product_id(self) -> str:
162 | return self.url.split("/")[4]
163 |
164 | def get_short_url(self) -> str:
165 | id = self._get_product_id()
166 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
167 | return f"{website}/i/{id}"
168 |
169 |
170 | class ElgigantenHandler(BaseWebsiteHandler):
171 | def _get_common_data(self) -> None:
172 | self.elgiganten_api_data = self._get_json_api_data()
173 |
174 | def _get_product_name(self) -> str:
175 | return self.request_data.find("h1", class_="product-title").text
176 |
177 | def _get_product_price(self) -> float:
178 | return float(self.elgiganten_api_data["data"]["product"]["currentPricing"]["price"]["value"])
179 |
180 | def _get_product_currency(self) -> str:
181 | return self.elgiganten_api_data["data"]["product"]["currentPricing"]["price"]["currency"]
182 |
183 | def _get_product_id(self) -> str:
184 | return self.url.split("/")[-1]
185 |
186 | def _get_json_api_data(self) -> dict:
187 | id_number = self._get_product_id()
188 |
189 | # API link to get price and currency
190 | if "elgiganten.dk" in self.url:
191 | api_link = f"https://www.elgiganten.dk/cxorchestrator/dk/api?getProductWithDynamicDetails&appMode=b2c&user=anonymous&operationName=getProductWithDynamicDetails&variables=%7B%22articleNumber%22%3A%22{id_number}%22%2C%22withCustomerSpecificPrices%22%3Afalse%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22229bbb14ee6f93449967eb326f5bfb87619a37e7ee6c4555b94496313c139ee1%22%7D%7D" # noqa E501
192 | elif "elgiganten.se" in self.url:
193 | api_link = f"https://www.elgiganten.se/cxorchestrator/se/api?getProductWithDynamicDetails&appMode=b2c&user=anonymous&operationName=getProductWithDynamicDetails&variables=%7B%22articleNumber%22%3A%22{id_number}%22%2C%22withCustomerSpecificPrices%22%3Afalse%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22229bbb14ee6f93449967eb326f5bfb87619a37e7ee6c4555b94496313c139ee1%22%7D%7D" # noqa E501
194 | else:
195 | raise WebsiteVersionNotSupported(get_website_name(self.url, keep_tld=True))
196 |
197 | response = request_url(api_link)
198 | return response.json()
199 |
200 | def get_short_url(self) -> str:
201 | id = self._get_product_id()
202 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
203 | return f"{website}/product/{id}"
204 |
205 |
206 | class AvXpertenHandler(BaseWebsiteHandler):
207 | def _get_common_data(self) -> None:
208 | soup_script_tag = self.request_data.find("script", type="application/ld+json").contents[0]
209 | self.script_json = json.loads(soup_script_tag)
210 |
211 | def _get_product_name(self) -> str:
212 | return self.request_data.find("div", class_="content-head").h1.text.strip()
213 |
214 | def _get_product_price(self) -> float:
215 | return float(self.request_data.find("div", class_="price").text.replace("\xa0DKK", "").replace(" DKK", ""))
216 |
217 | def _get_product_currency(self) -> str:
218 | return self.script_json.get("offers").get("priceCurrency")
219 |
220 | def _get_product_id(self) -> str:
221 | return self.script_json.get("sku")
222 |
223 | def get_short_url(self) -> str:
224 | return self.url
225 |
226 |
227 | class AvCablesHandler(BaseWebsiteHandler):
228 | def _get_product_name(self) -> str:
229 | return self.request_data.find("h1", class_="title").text
230 |
231 | def _get_product_price(self) -> float:
232 | return float(
233 | self.request_data.find("div", class_="regular-price")
234 | .text.strip()
235 | .replace("Pris: ", "")
236 | .replace("Tilbudspris: ", "")
237 | .split(",")[0]
238 | )
239 |
240 | def _get_product_currency(self) -> str:
241 | return self.request_data.find("meta", property="og:price:currency").get("content")
242 |
243 | def _get_product_id(self) -> str:
244 | script_tag = self.request_data.find("script", type="application/ld+json").contents[0]
245 | id = json.loads(script_tag).get("sku")
246 | return str(id)
247 |
248 | def get_short_url(self) -> str:
249 | return self.url
250 |
251 |
252 | class AmazonHandler(BaseWebsiteHandler):
253 | def _get_product_name(self) -> str:
254 | return self.request_data.find("span", id="productTitle").text.strip()
255 |
256 | def _get_product_price(self) -> float:
257 | raw_price = self.request_data.find("span", class_="a-price").span.text.replace(",", "").replace(" ", "")
258 | return float(get_number_string(raw_price))
259 |
260 | def _get_product_currency(self) -> str:
261 | regex_pattern = "%22currencyCode%22%3A%22(.{3})%22"
262 |
263 | regex_result = re.search(regex_pattern, str(self.request_data))
264 |
265 | if regex_result:
266 | return regex_result.group(1)
267 | return "N/F"
268 |
269 | def _get_product_id(self) -> str:
270 | try:
271 | return self.request_data.find("input", id="ASIN").get("value")
272 | except (AttributeError, ValueError, TypeError):
273 | asin_json = json.loads(self.request_data.find("span", id="cr-state-object").get("data-state"))
274 | return asin_json["asin"]
275 |
276 | def get_short_url(self) -> str:
277 | return self.url
278 |
279 |
280 | class EbayHandler(BaseWebsiteHandler):
281 | def _get_common_data(self) -> None:
282 | self.soup_url = self.request_data.find("meta", property="og:url").get("content")
283 |
284 | def _get_product_name(self) -> str:
285 | try:
286 | return self.request_data.find("h1", class_="x-item-title__mainTitle").text.strip()
287 | except (AttributeError, ValueError, TypeError):
288 | return self.request_data.find("meta", property="og:title").get("content").replace(" | eBay", "")
289 |
290 | def _get_product_price(self) -> float:
291 | if self.soup_url.split("/")[3] == "itm":
292 | price = float(self.request_data.find("div", class_="x-price-primary").text.replace("US $", ""))
293 | else:
294 | price = float(
295 | self.request_data.find("div", class_="x-price-primary")
296 | .text.replace("DKK ", "")
297 | .replace("$", "")
298 | .replace(",", "")
299 | )
300 |
301 | return price
302 |
303 | def _get_product_currency(self) -> str:
304 | if self.soup_url.split("/")[3] == "itm":
305 | currency = self.request_data.find("span", itemprop="priceCurrency").get("content")
306 | else:
307 | script_tag = self.request_data.find("script", type="application/ld+json").contents[0]
308 | currency = (
309 | json.loads(script_tag)
310 | .get("mainEntity")
311 | .get("offers")
312 | .get("itemOffered")[0]
313 | .get("offers")[0]
314 | .get("priceCurrency")
315 | )
316 |
317 | return currency
318 |
319 | def _get_product_id(self) -> str:
320 | return self.url.split("/")[4].split("?")[0]
321 |
322 | def get_short_url(self) -> str:
323 | id = self._get_product_id()
324 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
325 |
326 | if self.url.split("/")[3] == "itm":
327 | return f"{website}/itm/{id}"
328 | else:
329 | return f"{website}/p/{id}"
330 |
331 |
332 | class PowerHandler(BaseWebsiteHandler):
333 | def _get_common_data(self) -> None:
334 | id = self._get_product_id()
335 | self.api_json = request_url(f"https://www.power.dk/api/v2/products?ids={id}").json()
336 |
337 | def _get_product_name(self) -> str:
338 | return self.api_json[0].get("title")
339 |
340 | def _get_product_price(self) -> float:
341 | return float(self.api_json[0].get("price"))
342 |
343 | def _get_product_currency(self) -> str:
344 | return "DKK"
345 |
346 | def _get_product_id(self) -> str:
347 | return self.url.split("/")[-2].strip("p-")
348 |
349 | def get_short_url(self) -> str:
350 | id = self._get_product_id()
351 | url_id = self.url.split("/")[3]
352 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
353 | return f"{website}/{url_id}/p-{id}"
354 |
355 |
356 | class ExpertHandler(BaseWebsiteHandler):
357 | def _get_common_data(self) -> None:
358 | id = self._get_product_id()
359 | self.api_json = request_url(f"https://www.expert.dk/api/v2/products?ids={id}").json()
360 |
361 | def _get_product_name(self) -> str:
362 | return self.api_json[0].get("title")
363 |
364 | def _get_product_price(self) -> float:
365 | return float(self.api_json[0].get("price"))
366 |
367 | def _get_product_currency(self) -> str:
368 | return "DKK"
369 |
370 | def _get_product_id(self) -> str:
371 | return self.url.split("/")[-2].strip("p-")
372 |
373 | def get_short_url(self) -> str:
374 | id = self._get_product_id()
375 | url_id = self.url.split("/")[3]
376 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
377 | return f"{website}/{url_id}/p-{id}"
378 |
379 |
380 | class MMVisionHandler(BaseWebsiteHandler):
381 | def _get_common_data(self) -> None:
382 | soup_script_tag = self.request_data.find_all("script", type="application/ld+json")[1].contents[0]
383 | self.script_json = json.loads(soup_script_tag)
384 |
385 | def _get_product_name(self) -> str:
386 | return self.request_data.find("h1", itemprop="name").text.strip()
387 |
388 | def _get_product_price(self) -> float:
389 | return float(
390 | self.request_data.find("h3", class_="product-price text-right")
391 | .text.strip("fra ")
392 | .strip()
393 | .strip(",-")
394 | .replace(".", "")
395 | )
396 |
397 | def _get_product_currency(self) -> str:
398 | return self.script_json.get("offers").get("priceCurrency")
399 |
400 | def _get_product_id(self) -> str:
401 | return self.script_json.get("productID")
402 |
403 | def get_short_url(self) -> str:
404 | return self.url
405 |
406 |
407 | class CoolshopHandler(BaseWebsiteHandler):
408 | def _get_product_name(self) -> str:
409 | return self.request_data.find("div", class_="thing-header").h1.text.strip().replace("\n", " ")
410 |
411 | def _get_product_price(self) -> float:
412 | return float(self.request_data.find("meta", property="product:price:amount")["content"].split(".")[0])
413 |
414 | def _get_product_currency(self) -> str:
415 | return self.request_data.find("meta", property="product:price:currency").get("content")
416 |
417 | def _get_product_id(self) -> str:
418 | return self.request_data.find_all("div", id="attributeSku")[1].text.strip()
419 |
420 | def get_short_url(self) -> str:
421 | url_id = self.url.split("/")[-2]
422 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
423 | return f"{website}/produkt/{url_id}/"
424 |
425 |
426 | class SharkGamingHandler(BaseWebsiteHandler):
427 | def _get_product_name(self) -> str:
428 | return self.request_data.find("h1", class_="page-title").span.text
429 |
430 | def _get_product_price(self) -> float:
431 | return float(self.request_data.find("meta", property="product:price:amount").get("content"))
432 |
433 | def _get_product_currency(self) -> str:
434 | return self.request_data.find("meta", property="product:price:currency").get("content")
435 |
436 | def _get_product_id(self) -> str:
437 | return json.loads(self.request_data.find_all("script", type="application/ld+json")[3].text).get("productID")
438 |
439 | def get_short_url(self) -> str:
440 | return self.url
441 |
442 |
443 | class NeweggHandler(BaseWebsiteHandler):
444 | def _get_common_data(self) -> None:
445 | script_data_raw = self.request_data.find_all("script", type="application/ld+json")[2].text
446 | self.script_json = json.loads(script_data_raw)
447 |
448 | def _get_product_name(self) -> str:
449 | return self.script_json.get("name")
450 |
451 | def _get_product_price(self) -> float:
452 | return float(self.script_json.get("offers").get("price"))
453 |
454 | def _get_product_currency(self) -> str:
455 | return self.script_json.get("offers").get("priceCurrency")
456 |
457 | def _get_product_id(self) -> str:
458 | return self.url.split("/")[5].split("?")[0]
459 |
460 | def get_short_url(self) -> str:
461 | id = self._get_product_id()
462 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
463 | return f"{website}/p/{id}"
464 |
465 |
466 | class HifiKlubbenHandler(BaseWebsiteHandler):
467 | def _get_common_data(self) -> None:
468 |         script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text
469 | self.product_data = json.loads(script_data_raw)["offers"]
470 |
471 | def _get_product_name(self) -> str:
472 | brand_name = self.request_data.find("span", class_="product-page__brand-name").text
473 | model_name = self.request_data.find("span", class_="product-page__model-name").text
474 | return f"{brand_name} {model_name}"
475 |
476 | def _get_product_price(self) -> float:
477 | return float(self.product_data.get("price"))
478 |
479 | def _get_product_currency(self) -> str:
480 | return self.product_data.get("priceCurrency")
481 |
482 | def _get_product_id(self) -> str:
483 | return self.url.split("/")[4]
484 |
485 | def get_short_url(self) -> str:
486 | id = self._get_product_id()
487 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True)
488 | return f"{website}/{id}"
489 |
490 |
491 | class SheinHandler(BaseWebsiteHandler):
492 | def _get_common_data(self) -> None:
493 | script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text
494 | self.script_json = json.loads(script_data_raw)
495 |
496 | def _get_product_name(self) -> str:
497 | return self.script_json.get("name")
498 |
499 | def _get_product_price(self) -> float:
500 | return float(self.script_json.get("offers").get("price"))
501 |
502 | def _get_product_currency(self) -> str:
503 | return self.script_json.get("offers").get("priceCurrency")
504 |
505 | def _get_product_id(self) -> str:
506 | return self.script_json.get("sku")
507 |
508 | def get_short_url(self) -> str:
509 | return self.url
510 |
511 |
512 | def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str:
513 | stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://")
514 |
515 | if not keep_www and keep_http:
516 | stripped_url = stripped_url.replace("www.", "", 1)
517 | elif not keep_www:
518 | stripped_url = stripped_url.removeprefix("www.")
519 |
520 | domain = "/".join(stripped_url.split("/")[0:3]) if keep_http else stripped_url.split("/")[0]
521 |
522 | # Remove the TLD/DNS name (such as ".com") if keep_tld is false
523 | website_name_list = domain.split(".") if keep_tld else domain.split(".")[:-1]
524 |
525 | # Remove subdomain if keep_subdomain is false
526 | if not keep_subdomain and len(website_name_list) > 1:
527 | subdomain_and_domain = get_website_name(domain, keep_subdomain=True)
528 | subdomains = subdomain_and_domain.split(".")[:-1]
529 |
530 | website_name_list_copy = website_name_list.copy()
531 | # remove subdomains
532 | website_name_list = [elem for elem in website_name_list_copy if elem not in subdomains]
533 |
534 | website_name = ".".join(website_name_list)
535 | return website_name
536 |
537 |
538 | def get_website_handler(url: str) -> BaseWebsiteHandler:
539 | website_name = get_website_name(url, keep_subdomain=False).lower()
540 |
541 | website_handler = SUPPORTED_DOMAINS.get(website_name, None)
542 |
543 | if not website_handler:
544 | logging.getLogger(__name__).error(f"Can't find a website handler - website: '{website_name}' possibly not supported")
545 | return None
546 |
547 | return website_handler(url)
548 |
549 |
550 | def get_number_string(value: str) -> str:
551 |     """Return a string with only digits, commas (,) and periods (.)"""
552 | text_pattern = re.compile(r"[^\d.,]+")
553 | result = text_pattern.sub("", value)
554 | return result
555 |
556 |
557 | SUPPORTED_DOMAINS: dict[str, type[BaseWebsiteHandler]] = {
558 | "komplett": KomplettHandler,
559 | "proshop": ProshopHandler,
560 | "computersalg": ComputerSalgHandler,
561 | "elgiganten": ElgigantenHandler,
562 | "avxperten": AvXpertenHandler,
563 | "av-cables": AvCablesHandler,
564 | "amazon": AmazonHandler,
565 | "ebay": EbayHandler,
566 | "power": PowerHandler,
567 | "expert": ExpertHandler,
568 | "mm-vision": MMVisionHandler,
569 | "coolshop": CoolshopHandler,
570 | "sharkgaming": SharkGamingHandler,
571 | "newegg": NeweggHandler,
572 | "hifiklubben": HifiKlubbenHandler,
573 | "shein": SheinHandler,
574 | }
575 |
--------------------------------------------------------------------------------
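For reference, a minimal sketch of how support for another shop could be added to scraper/domains.py, following the handler pattern above. The site name "exampleshop", its markup and the selectors are hypothetical; a real handler must use the actual site's selectors and be registered in SUPPORTED_DOMAINS under the name returned by get_website_name(url, keep_subdomain=False).

# Hypothetical handler sketch for scraper/domains.py (relies on the module's existing
# json import and BaseWebsiteHandler base class; "exampleshop" and its selectors are made up)
class ExampleShopHandler(BaseWebsiteHandler):
    def _get_common_data(self) -> None:
        # cache the ld+json blob once so the other methods can reuse it
        script_tag = self.request_data.find("script", type="application/ld+json").contents[0]
        self.script_json = json.loads(script_tag)

    def _get_product_name(self) -> str:
        return self.script_json.get("name")

    def _get_product_price(self) -> float:
        return float(self.script_json.get("offers").get("price"))

    def _get_product_currency(self) -> str:
        return self.script_json.get("offers").get("priceCurrency")

    def _get_product_id(self) -> str:
        return self.script_json.get("sku")

    def get_short_url(self) -> str:
        return self.url

# ...plus an extra entry in SUPPORTED_DOMAINS:
#     "exampleshop": ExampleShopHandler,
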
/scraper/exceptions.py:
--------------------------------------------------------------------------------
1 | from scraper.constants import URL_SCHEMES
2 |
3 |
4 | class WebsiteNotSupported(Exception):
5 | def __init__(self, website_name: str, *args: object) -> None:
6 | super().__init__(*args)
7 | self.website_name = website_name
8 |
9 | def __str__(self) -> str:
10 | return f"Website '{self.website_name}' is currently not supported"
11 |
12 |
13 | class WebsiteVersionNotSupported(Exception):
14 | def __init__(self, website_name: str, *args: object) -> None:
15 | super().__init__(*args)
16 | self.website_name = website_name
17 |
18 | def __str__(self) -> str:
19 | return f"Website version '{self.website_name}' is currently not supported"
20 |
21 |
22 | class URLMissingSchema(Exception):
23 | def __init__(self, url, *args: object) -> None:
24 | super().__init__(*args)
25 | self.url = url
26 |
27 | def __str__(self) -> str:
28 |         return f"Missing schema in url '{self.url}'. Consider prefixing the url with one of the following schemes: {', '.join(URL_SCHEMES)}"
29 |
--------------------------------------------------------------------------------
/scraper/filemanager.py:
--------------------------------------------------------------------------------
1 | from typing import Iterator
2 | import pathlib
3 | import configparser
4 |
5 |
6 | class Filemanager:
7 | # root path of this repository
8 | root_path = pathlib.Path(__file__).parent.parent.absolute()
9 | products_json_path = f"{root_path}/scraper/records.json"
10 | products_csv_path = f"{root_path}/scraper/products.csv"
11 | settings_ini_path = f"{root_path}/scraper/settings.ini"
12 | logging_ini_path = f"{root_path}/scraper/logging.ini"
13 | logfile_path = f"{root_path}/scraper/logfile.log"
14 | database_folder = f"{root_path}/scraper/data"
15 | database_path = f"{database_folder}/database.db"
16 |
17 |
18 | class Config:
19 | @staticmethod
20 | def read(filename: str) -> configparser.ConfigParser:
21 | config = configparser.ConfigParser()
22 | config.read(filename, encoding="utf8")
23 | return config
24 |
25 | @staticmethod
26 | def write(filename: str, config: configparser.ConfigParser) -> None:
27 | with open(filename, "w") as default_file:
28 | config.write(default_file)
29 |
30 | @staticmethod
31 | def get_section_by_name(section_name: str) -> configparser.SectionProxy:
32 |         """Get a section from the settings.ini file"""
33 | config = Config.read(Filemanager.settings_ini_path)
34 | return config[section_name]
35 |
36 | @staticmethod
37 | def get_key_values(elements: list) -> Iterator[str]:
38 | for elem in elements:
39 | if "key" in elem:
40 | yield elem
41 |
42 | @staticmethod
43 | def get_request_delay() -> int:
44 | config = Config.read(Filemanager.settings_ini_path)
45 | return int(config["Scraping"]["request_delay"])
46 |
47 | @staticmethod
48 | def get_request_timeout() -> float | None:
49 |         """Get the request timeout - return a float if the setting is a number, else None"""
50 | config = Config.read(Filemanager.settings_ini_path)
51 | timeout = config["Scraping"]["request_timeout"]
52 | try:
53 | return float(timeout)
54 | except ValueError:
55 | return None
56 |
57 | @staticmethod
58 | def get_user_product_name(product_name: str) -> str:
59 | product_name_lowercase = product_name.lower()
60 | user_product_names = Config.get_section_by_name("ChangeName")
61 |
62 | for key in Config.get_key_values(user_product_names):
63 | key_list = user_product_names[key].split(",")
64 | value_key = f'value{key.strip("key")}'
65 | if all(elem.lower() in product_name_lowercase for elem in key_list):
66 | return user_product_names[value_key]
67 |
68 | return product_name
69 |
--------------------------------------------------------------------------------
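To illustrate how the [ChangeName] matching in Config.get_user_product_name works, a small sketch using the key1/value1 pair shipped in scraper/settings.ini; the scraped title below is made up.

from scraper.filemanager import Config

scraped_name = "ASUS GeForce RTX 3080 ROG STRIX OC LHR 10GB"  # hypothetical scraped title
print(Config.get_user_product_name(scraped_name))
# -> "asus geforce rtx 3080 rog strix oc", because all of key1's keywords
#    (asus, 3080, rog, strix, oc) appear in the lowercased title
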
/scraper/format.py:
--------------------------------------------------------------------------------
1 | import scraper.database as db
2 | from scraper.models.product import ProductInfo
3 | from scraper.scrape import Scraper
4 | from scraper.domains import get_website_name
5 |
6 |
7 | class Format:
8 | def db_products_to_scrapers(products: list[db.Product]) -> list[Scraper]:
9 | scrapers = []
10 | for product in products:
11 | scraper = Format.db_product_to_scraper(product)
12 | scrapers.append(scraper)
13 | return scrapers
14 |
15 | @staticmethod
16 | def db_product_to_scraper(product: db.Product) -> Scraper:
17 | return Scraper(category=product.category, url=product.short_url)
18 |
19 | @staticmethod
20 | def scraper_to_db_product(product: Scraper, is_active: bool) -> db.Product:
21 | return db.Product(
22 | product_code=product.product_info.id,
23 | name=product.product_info.name,
24 | category=product.category,
25 | domain=product.website_handler.website_name,
26 | url=product.url,
27 | short_url=product.website_handler.get_short_url(),
28 | is_active=is_active,
29 | )
30 |
31 | @staticmethod
32 | def db_products_to_product_infos(products: list[db.Product]) -> list[ProductInfo]:
33 | product_infos = []
34 | for product in products:
35 | product_info = Format.db_product_to_product_info(product)
36 | product_infos.append(product_info)
37 | return product_infos
38 |
39 | @staticmethod
40 | def db_product_to_product_info(product: db.Product) -> ProductInfo:
41 | return ProductInfo(
42 | product_name=product.name,
43 | category=product.category,
44 | url=product.short_url,
45 | id=product.product_code,
46 | currency=None,
47 | website=get_website_name(product.short_url, keep_subdomain=False),
48 | datapoints=None,
49 | )
50 |
--------------------------------------------------------------------------------
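As a quick illustration of the conversions above, a sketch (all field values are made up) of turning a stored db.Product back into a Scraper, mirroring how Format.db_products_to_scrapers is presumably used when re-scraping saved products.

import scraper.database as db
from scraper.format import Format

stored = db.Product(
    product_code="1205149",
    name="HTC VIVE Flow",
    category="vr",
    domain="komplett",
    url="https://www.komplett.dk/product/1205149",
    short_url="https://www.komplett.dk/product/1205149",
    is_active=True,
)

scraper = Format.db_product_to_scraper(stored)
print(scraper.category, scraper.url)  # vr https://www.komplett.dk/product/1205149
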
/scraper/format_to_new.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Iterable
3 | from sqlmodel import Session, select
4 | from dataclasses import dataclass
5 | import pandas as pd
6 | import json
7 |
8 | from scraper.filemanager import Config, Filemanager
9 | from scraper.domains import get_website_handler
10 | from scraper.models.product import DataPointInfo, MasterProduct, ProductInfo
11 | from scraper.database.models import Product, DataPoint
12 | from scraper.database.db import engine, create_db_and_tables
13 |
14 |
15 | @dataclass
16 | class ProductCSV:
17 | url: str
18 | short_url: str
19 | category: str
20 |
21 |
22 | class FilemanagerLegacy:
23 | @staticmethod
24 | def read_json(filename: str) -> dict:
25 | with open(filename, "r", encoding="utf8") as file:
26 | data = json.load(file)
27 | return data
28 |
29 | @staticmethod
30 | def get_record_data() -> dict:
31 | data = FilemanagerLegacy.read_json(Filemanager.products_json_path)
32 | return data
33 |
34 | @staticmethod
35 | def save_record_data(data: dict) -> None:
36 | FilemanagerLegacy.write_json(Filemanager.products_json_path, data)
37 |
38 | @staticmethod
39 | def get_products_data() -> pd.DataFrame:
40 | df = pd.read_csv(Filemanager.products_csv_path, sep=",", header=0)
41 | return df
42 |
43 | @staticmethod
44 | def save_products_data(data_df: pd.DataFrame) -> None:
45 | data_df.to_csv(Filemanager.products_csv_path, sep=",", header=True, index=False)
46 |
47 |
48 | class Format:
49 | @staticmethod
50 | def format_old_records_to_new() -> None:
51 |         """Format records data from the pre-v1.1 format to the new records data format introduced in v1.1"""
52 | records_data = FilemanagerLegacy.get_record_data()
53 |
54 | for category_info in records_data.values():
55 | for product_info in category_info.values():
56 | for website_info in product_info.values():
57 | website_info["info"].update({"currency": "TBD"})
58 | website_info.update({"datapoints": []})
59 |
60 | for date_name, date_info in website_info["dates"].items():
61 | website_info["datapoints"].append({"date": date_name, "price": float(date_info["price"])})
62 |
63 | website_info.pop("dates")
64 |
65 | FilemanagerLegacy.save_record_data(records_data)
66 |
67 | @staticmethod
68 | def add_short_urls_to_products_csv() -> None:
69 |         """Format products.csv to have a short_url column - introduced in v2.3.0"""
70 | request_delay = Config.get_request_delay()
71 |
72 | products_df = FilemanagerLegacy.get_products_data()
73 |
74 | short_urls = []
75 | for _, row in products_df.iterrows():
76 | time.sleep(request_delay)
77 | website_handler = get_website_handler(row["url"])
78 | short_url = website_handler.get_short_url()
79 |
80 |             # scrape only if the short url can't be created without scraping the product page
81 | if short_url is None:
82 | website_handler.get_product_info()
83 | short_url = website_handler.get_short_url()
84 | short_urls.append(short_url)
85 |
86 | products_df = products_df.drop("short_url", axis=1)
87 | products_df.insert(2, "short_url", short_urls, True)
88 |
89 | FilemanagerLegacy.save_products_data(products_df)
90 |
91 | @staticmethod
92 | def from_json_to_db() -> None:
93 |         """Take the data in records.json and insert it into the database - introduced in v3.0.0
94 |         - NOTE: all products in the database will be deleted before inserting the data from records.json"""
95 |
96 | create_db_and_tables()
97 | records = FilemanagerLegacy.get_record_data()
98 | products_df = FilemanagerLegacy.get_products_data()
99 |
100 | products_from_csv = [
101 | ProductCSV(category=category, url=url, short_url=short_url)
102 | for category, url, short_url in zip(products_df["category"], products_df["url"], products_df["short_url"])
103 | ]
104 |
105 | master_products = get_master_products(records)
106 | products_from_json = get_products_from_master_products(master_products)
107 |
108 | products_to_db: list[Product] = []
109 | for product_json in products_from_json:
110 | product_to_db = Product(
111 | name=product_json.product_name,
112 | product_code=product_json.id,
113 | domain=product_json.website,
114 | url="",
115 | short_url=product_json.url,
116 | category=product_json.category,
117 | is_active=False,
118 | )
119 |
120 | for product_csv in products_from_csv:
121 | if product_csv.short_url == product_json.url:
122 | product_to_db.url = product_csv.url
123 | product_to_db.is_active = True
124 |
125 | products_to_db.append(product_to_db)
126 |
127 | datapoints_to_db: list[DataPoint] = []
128 | for product in products_from_json:
129 | for datapoint in product.datapoints:
130 | datapoint_to_db = DataPoint(
131 | product_code=product.id, date=datapoint.date, price=datapoint.price, currency=product.currency
132 | )
133 | datapoints_to_db.append(datapoint_to_db)
134 |
135 | with Session(engine) as session:
136 | products_in_db = session.exec(select(Product)).all()
137 | for product_in_db in products_in_db:
138 | session.delete(product_in_db)
139 |
140 | datapoints_in_db = session.exec(select(DataPoint)).all()
141 | for datapoint_in_db in datapoints_in_db:
142 | session.delete(datapoint_in_db)
143 |
144 | session.add_all(products_to_db)
145 | session.add_all(datapoints_to_db)
146 |
147 | session.commit()
148 |
149 | with Session(engine) as session:
150 | products_in_db = session.exec(select(Product)).all()
151 | datapoints_in_db = session.exec(select(DataPoint)).all()
152 | print(f"Inserted products to db: {len(products_in_db)}")
153 | print(f"Inserted datapoints to db: {len(datapoints_in_db)}")
154 |
155 |
156 | def get_master_products(records_data: dict) -> tuple[MasterProduct]:
157 | master_products: list[MasterProduct] = []
158 |
159 | for category_name, category_info in records_data.items():
160 | for product_name, product_info in category_info.items():
161 | master_product = MasterProduct(product_name, category_name)
162 | for website_name, website_info in product_info.items():
163 | id = website_info["info"]["id"]
164 | url = website_info["info"]["url"]
165 | currency = website_info["info"]["currency"]
166 | datapoints = [DataPointInfo(datapoint["date"], datapoint["price"]) for datapoint in website_info["datapoints"]]
167 | product = ProductInfo(product_name, category_name, url, id, currency, website_name, datapoints)
168 | master_product.products.append(product)
169 | master_products.append(master_product)
170 |
171 | return tuple(master_products)
172 |
173 |
174 | def get_products_from_master_products(master_products: Iterable[MasterProduct]) -> list[ProductInfo]:
175 | return [product for master_product in master_products for product in master_product.products]
176 |
--------------------------------------------------------------------------------
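For reference, a sketch of the records.json layout that get_master_products and from_json_to_db expect (the post-v1.1 format produced by format_old_records_to_new); the category, product, website and values below are made up.

records_sketch = {
    "gpu": {                                  # category
        "asus geforce rtx 3080": {            # product name
            "komplett": {                     # website name
                "info": {"id": "1111111", "url": "https://www.komplett.dk/product/1111111", "currency": "DKK"},
                "datapoints": [{"date": "2023-01-01", "price": 6999.0}],
            },
        },
    },
}
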
/scraper/logfile.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crinibus/scraper/3c37db625d4b47cdb547952e098d3a3cb494ab6f/scraper/logfile.log
--------------------------------------------------------------------------------
/scraper/logging.ini:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root,scraper,scraper.scrape
3 |
4 | [handlers]
5 | keys=fileHandler
6 |
7 | [formatters]
8 | keys=fileFormatter
9 |
10 | [logger_root]
11 | level=DEBUG
12 | handlers=fileHandler
13 |
14 | [logger_scraper]
15 | level=INFO
16 | handlers=fileHandler
17 | qualname=scraper
18 | propagate=0
19 |
20 | [logger_scraper.scrape]
21 | level=INFO
22 | handlers=fileHandler
23 | qualname=scraper.scrape
24 | propagate=0
25 |
26 | [handler_fileHandler]
27 | class=FileHandler
28 | level=DEBUG
29 | formatter=fileFormatter
30 | args=(r"%(logfilename)s", "a", "utf8")
31 |
32 | [formatter_fileFormatter]
33 | format=%(asctime)s : %(levelname)s : %(name)s : %(message)s
34 |
--------------------------------------------------------------------------------
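The FileHandler above gets its file path through the %(logfilename)s placeholder, so the ini has to be loaded with a matching default. A minimal sketch of how that could be done; the project presumably does this at startup (e.g. in main.py, not shown here).

import logging.config

from scraper.filemanager import Filemanager

logging.config.fileConfig(
    Filemanager.logging_ini_path,
    defaults={"logfilename": Filemanager.logfile_path},
    disable_existing_loggers=False,
)
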
/scraper/models/__init__.py:
--------------------------------------------------------------------------------
1 | from scraper.models.product import DataPointInfo, ProductInfo, Info, MasterProduct
2 |
--------------------------------------------------------------------------------
/scraper/models/product.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from datetime import datetime
3 | import re
4 |
5 |
6 | @dataclass
7 | class Info:
8 | """Scraped info about product"""
9 |
10 | name: str
11 | price: float
12 | currency: str
13 | id: str
14 | valid: bool = True
15 |
16 |
17 | @dataclass
18 | class DataPointInfo:
19 | date: str
20 | price: float
21 |
22 |
23 | @dataclass
24 | class ProductInfo:
25 | product_name: str
26 | category: str
27 | url: str
28 | id: str
29 | currency: str
30 | website: str
31 | datapoints: list[DataPointInfo]
32 |
33 | def get_all_dates(self) -> list[str]:
34 | return [datapoint.date for datapoint in self.datapoints]
35 |
36 | def get_all_prices(self) -> list[float]:
37 | return [datapoint.price for datapoint in self.datapoints]
38 |
39 | @property
40 | def is_up_to_date(self) -> bool:
41 | if not self.datapoints:
42 | return False
43 |
44 | latest_date = datetime.strptime(self.datapoints[-1].date, "%Y-%m-%d")
45 | date_diff = datetime.today() - latest_date
46 | return date_diff.days <= 1
47 |
48 | def to_string_format(self, format: str) -> str:
49 | """Return a string representing the product, controlled by an explicit format string.
50 |
51 |         >>> p = ProductInfo("ASUS RTX 4090", "GPU", "https://www.example.com/", "123", "USD", "example", [datapoints])
52 | >>> p.to_string_format("Name: %name, Category: %category, URL: %url, ID: %id, Website: %website")
53 | 'Name: ASUS RTX 4090, Category: GPU, URL: https://www.example.com/, ID: 123, Website: example'
54 | """
55 | # inspiration from https://docs.python.org/3/library/re.html#writing-a-tokenizer
56 | token_specification = [
57 | ("NAME", r"(%name)"),
58 | ("CATEGORY", r"(%category)"),
59 | ("URL", r"(%url)"),
60 | ("ID", r"(%id)"),
61 | ("CURRENCY", r"(%currency)"),
62 | ("WEBSITE", r"(%website)"),
63 | ]
64 | format_to = {
65 | "NAME": self.product_name,
66 | "CATEGORY": self.category,
67 | "URL": self.url,
68 | "ID": self.id,
69 | "CURRENCY": self.currency,
70 | "WEBSITE": self.website,
71 | }
72 |
73 | tok_regex = "|".join("(?P<%s>%s)" % pair for pair in token_specification)
74 | new_string = format
75 |
76 | for mo in re.finditer(tok_regex, format):
77 | kind = mo.lastgroup
78 | value = mo.group()
79 |
80 | new_string = new_string.replace(value, format_to[kind], 1)
81 |
82 | return new_string
83 |
84 |
85 | @dataclass
86 | class MasterProduct:
87 | product_name: str
88 | category: str
89 | products: list[ProductInfo] = field(default_factory=list)
90 |
--------------------------------------------------------------------------------
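A small sketch (made-up values) of the dataclasses above, mainly to show how to_string_format substitutes the %-tokens and how the convenience methods behave.

from scraper.models.product import DataPointInfo, ProductInfo

datapoints = [DataPointInfo(date="2023-01-01", price=999.0), DataPointInfo(date="2023-01-02", price=949.0)]
product = ProductInfo("Sony WH-1000XM4", "headphones", "https://www.example.com/", "123", "DKK", "example", datapoints)

print(product.get_all_prices())                                   # [999.0, 949.0]
print(product.to_string_format("%website - %name (%currency)"))   # example - Sony WH-1000XM4 (DKK)
print(product.is_up_to_date)                                      # False unless the last date is within a day of today
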
/scraper/print_products.py:
--------------------------------------------------------------------------------
1 | from scraper.constants import CHECK_MARK
2 | import scraper.database as db
3 | from scraper.database.models import Product
4 | from scraper.models.product import ProductInfo
5 |
6 |
7 | def print_latest_datapoints(names: list[str], product_codes: list[str], categories: list[str]) -> None:
8 | if names:
9 | print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----")
10 | products = db.get_products_by_names(names)
11 | print_latest_datapoints_for_products(products)
12 |
13 | if product_codes:
14 | print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----")
15 | products = db.get_products_by_product_codes(product_codes)
16 | print_latest_datapoints_for_products(products)
17 |
18 | if categories:
19 | print("\n----- SHOWING LATEST DATAPOINT FOR CATEGORY(s) -----")
20 | products = db.get_products_by_categories(categories)
21 | print_latest_datapoints_for_products(products)
22 |
23 |
24 | def print_latest_datapoints_for_products(products: list[db.Product]):
25 | if not products:
26 | print("Found no products")
27 | return
28 |
29 | grouped_products = db.group_products_by_names(products)
30 |
31 | for products in grouped_products:
32 | product_infos = db.get_product_infos_from_products(products)
33 | print(product_infos[0].product_name)
34 |
35 | for product_info in product_infos:
36 | print_latest_datapoint(product_info)
37 | print()
38 |
39 |
40 | def print_latest_datapoint(product_info: ProductInfo) -> None:
41 | if not product_info.datapoints:
42 | print(f"> No datapoints for {product_info.id}")
43 | return
44 |
45 | id = product_info.id
46 | website_name = product_info.website.capitalize()
47 | currency = product_info.currency
48 | latest_datapoint = product_info.datapoints[-1]
49 | date = latest_datapoint.date
50 | price = latest_datapoint.price
51 | print(f"> {website_name} - {id}\n - {currency} {price}\n - {date}")
52 |
53 |
54 | def print_all_products() -> None:
55 | print("\n----- SHOWING ALL PRODUCTS -----")
56 | categories = db.get_all_unique_categories()
57 |
58 | if not categories:
59 | print("No products")
60 | return
61 |
62 | for category in categories:
63 | print(category)
64 |
65 | products = db.get_products_by_categories([category])
66 |
67 | grouped_products = db.group_products_by_names(products)
68 |
69 | list_grouped_products(grouped_products)
70 |
71 |
72 | def list_products_with_filters(names: list[str] | None, product_codes: list[str] | None, categories: list[str] | None) -> None:
73 | print("\n----- LISTING PRODUCTS -----")
74 | products_by_filters: list[Product] = []
75 |
76 | if names:
77 | products_with_names = db.get_products_by_names(names)
78 | products_by_filters.extend(products_with_names)
79 |
80 | if product_codes:
81 | products_with_product_codes = db.get_products_by_product_codes(product_codes)
82 | products_by_filters.extend(products_with_product_codes)
83 |
84 | if categories:
85 | products_with_categories = db.get_products_by_categories(categories)
86 | products_by_filters.extend(products_with_categories)
87 |
88 | if not products_by_filters:
89 | print("Found no products with filters")
90 | return
91 |
92 | categories = set([product.category for product in products_by_filters])
93 | sorted_categories = sorted(categories)
94 |
95 | for category in sorted_categories:
96 | print(category)
97 |
98 | products_with_category = [product for product in products_by_filters if product.category == category]
99 |
100 | grouped_products = db.group_products_by_names(products_with_category)
101 |
102 | list_grouped_products(grouped_products)
103 |
104 |
105 | def list_grouped_products(grouped_products: list[list[Product]]) -> None:
106 | for products in grouped_products:
107 | print(f" > {products[0].name}")
108 | for product in products:
109 | is_active_marker = f"{CHECK_MARK} " if product.is_active else ""
110 | print(f" - {is_active_marker}{product.domain.upper()} - {product.product_code}")
111 | print()
112 |
--------------------------------------------------------------------------------
/scraper/products.csv:
--------------------------------------------------------------------------------
1 | category,url,short_url
2 |
--------------------------------------------------------------------------------
/scraper/records.json:
--------------------------------------------------------------------------------
1 | {}
--------------------------------------------------------------------------------
/scraper/reset_data.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import scraper.database as db
4 |
5 |
6 | def reset(categories: list[str], names: list[str], product_codes: list[str], all: bool) -> None:
7 | print("Resetting datapoints...")
8 | logging.getLogger(__name__).info(f"Resetting datapoints for {categories=}, {names=}, {product_codes=}, {all=}")
9 |
10 | if all:
11 | delete_all_datapoints()
12 | return
13 |
14 | if categories:
15 | delete_datapoints_for_products_by_categories(categories)
16 |
17 | if names:
18 | delete_datapoints_for_products_by_names(names)
19 |
20 | if product_codes:
21 | delete_datapoints_for_products_by_product_codes(product_codes)
22 |
23 |
24 | def delete_all_datapoints():
25 | datapoints = db.get_all_datapoints()
26 | db.delete_all(datapoints)
27 |
28 |
29 | def delete_datapoints_for_products_by_categories(categories: list[str]):
30 | datapoints = db.get_datapoints_by_categories(categories)
31 | db.delete_all(datapoints)
32 |
33 |
34 | def delete_datapoints_for_products_by_names(names: list[str]):
35 | datapoints = db.get_datapoints_by_names(names)
36 | db.delete_all(datapoints)
37 |
38 |
39 | def delete_datapoints_for_products_by_product_codes(product_codes: list[str]):
40 | datapoints = db.get_datapoints_by_product_codes(product_codes)
41 | db.delete_all(datapoints)
42 |
--------------------------------------------------------------------------------
/scraper/scrape.py:
--------------------------------------------------------------------------------
1 | import time
2 | import threading
3 | import logging
4 |
5 | from scraper.models import Info
6 | from scraper.domains import get_website_handler
7 |
8 |
9 | class Scraper:
10 | def __init__(self, category: str, url: str) -> None:
11 | self.category = category
12 | self.url = url
13 | self.website_handler = get_website_handler(url)
14 |         self.product_info: Info | None = None
15 |
16 | def scrape_info(self) -> Info:
17 | logging.getLogger(__name__).debug(f"Scraping: {self.category} - {self.url}")
18 | self.product_info = self.website_handler.get_product_info()
19 | return self.product_info
20 |
21 |
22 | def start_threads_sequentially(threads: list[threading.Thread], request_delay: int, progress_bar=None) -> None:
23 | for thread in threads:
24 | thread.start()
25 | thread.join()
26 | time.sleep(request_delay)
27 |
28 | if progress_bar:
29 | progress_bar()
30 |
--------------------------------------------------------------------------------
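A minimal sketch of scraping one product with the Scraper class above. The URL is the Komplett link from tests/test_objects.json, the category label is arbitrary, and scrape_info() performs a real HTTP request, so it only works while that product page exists.

from scraper.scrape import Scraper

scraper = Scraper(
    category="vr",
    url="https://www.komplett.dk/product/1205149/gaming/spiludstyr/vr/vr-briller/htc-vive-flow-sortreflekterende",
)
info = scraper.scrape_info()
print(info.name, info.price, info.currency, info.id)
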
/scraper/search_data.py:
--------------------------------------------------------------------------------
1 | import scraper.database as db
2 |
3 |
4 | def search(search_terms: list[str]) -> None:
5 | print("Searching...")
6 |
7 | product_name_search_results = search_product_names(search_terms)
8 | categories_search_results = search_categories(search_terms)
9 |
10 | if product_name_search_results:
11 | print("\n--- Results from product name search ---")
12 | for result in product_name_search_results:
13 | print(f"> {result}\n")
14 | else:
15 | print("\nNo results for product name search")
16 |
17 | if categories_search_results:
18 | print("\n--- Results from category search ---")
19 | for result in categories_search_results:
20 | print(f"> {result}")
21 | else:
22 | print("\nNo results for categories search")
23 |
24 |
25 | def search_product_names(search_terms: list[str]) -> list[str]:
26 | products_strings = []
27 | products = db.get_products_by_names_fuzzy(search_terms)
28 |
29 | if not products:
30 | return []
31 |
32 | grouped_products = db.group_products_by_names(products)
33 |
34 | for products in grouped_products:
35 | matched_domains = []
36 | for product in products:
37 | match_string = f" - {product.domain.capitalize()} - {product.product_code}"
38 | matched_domains.append(match_string)
39 | matched_domains_string = "\n".join(matched_domains)
40 | products_strings.append(f"{products[0].name}\n{matched_domains_string}")
41 |
42 | return products_strings
43 |
44 |
45 | def search_categories(search_terms: list[str]) -> list[str]:
46 | all_results = []
47 | all_categories = db.get_all_unique_categories()
48 |
49 | for search_term in search_terms:
50 | results = [category for category in all_categories if search_term.lower() in category.lower()]
51 | all_results.extend(results)
52 |
53 | return all_results
54 |
--------------------------------------------------------------------------------
/scraper/settings.ini:
--------------------------------------------------------------------------------
1 | [ChangeName]
2 | ; Add your own keywords separated by a comma (,) and what the product name should be renamed to (the value words) if the product name contains **all** the keywords. See the example below:
3 | key1 = asus,3080,rog,strix,oc
4 | value1 = asus geforce rtx 3080 rog strix oc
5 |
6 | [Scraping]
7 | ; request_delay in seconds
8 | request_delay = 0
9 | ; request_timeout in seconds or None for indefinitely
10 | request_timeout = 25
11 |
--------------------------------------------------------------------------------
/scraper/visualize.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, Iterator
2 | import plotly.graph_objs as go
3 | from datetime import datetime
4 |
5 | import scraper.database as db
6 | from scraper.models import DataPointInfo, ProductInfo, MasterProduct
7 | from scraper.constants import WEBSITE_COLORS
8 |
9 |
10 | def visualize_data(
11 | show_all: bool, categories: list[str], ids: list[str], names: list[str], only_up_to_date: bool, compare: bool
12 | ) -> None:
13 | print("Visualizing...")
14 |
15 |     # Convert all strings to lowercase
16 | categories = [category.lower() for category in categories]
17 | ids = [id.lower() for id in ids]
18 | names = [name.lower() for name in names]
19 |
20 | master_products = get_master_products()
21 |
22 | if not master_products:
23 | print("No products saved")
24 | return
25 |
26 | if compare:
27 | compare_products(master_products, ids, names, categories, only_up_to_date, show_all)
28 | return
29 |
30 | if show_all:
31 | show_master_products(master_products, only_up_to_date)
32 |
33 | if categories:
34 | for master_product in get_master_products_with_categories(master_products, categories, only_up_to_date):
35 | product_name = master_product.product_name
36 | category = master_product.category
37 | status_of_master_product = get_status_of_master_product(master_product)
38 | title = f"Price(s) of {product_name} - {category} - {status_of_master_product}"
39 | show_products(master_product.products, title)
40 | else:
41 | print("No products found with category/categories")
42 |
43 | if ids:
44 | for product in get_products_with_ids(master_products, ids, only_up_to_date):
45 | status_of_product = get_status_of_product(product)
46 | product_name = product.product_name
47 | title = f"Price(s) of {product_name} - {status_of_product}"
48 | show_product(product, title)
49 | else:
50 | print("No products found with id(s)")
51 |
52 | if names:
53 | for master_product in get_master_products_with_names(master_products, names, only_up_to_date):
54 | product_name = master_product.product_name
55 | status_of_master_product = get_status_of_master_product(master_product)
56 | title = f"Price(s) of {product_name} - {status_of_master_product}"
57 | show_products(master_product.products, title)
58 | else:
59 | print("No products found with name(s)")
60 |
61 |
62 | def compare_products(
63 | master_products: tuple[MasterProduct],
64 | ids: list[str],
65 | names: list[str],
66 | categories: list[str],
67 | only_up_to_date: bool,
68 | show_all: bool,
69 | ) -> None:
70 | master_products_with_names = get_master_products_with_names(master_products, names, only_up_to_date)
71 | products_with_names = get_products_from_master_products(master_products_with_names)
72 |
73 | products_with_ids = list(get_products_with_ids(master_products, ids, only_up_to_date))
74 |
75 | master_products_with_categories = get_master_products_with_categories(master_products, categories, only_up_to_date)
76 | products_with_categories = get_products_from_master_products(master_products_with_categories)
77 |
78 | products_to_compare = [*products_with_ids, *products_with_names, *products_with_categories]
79 |
80 | if show_all:
81 | products_to_compare = get_products_from_master_products(master_products)
82 |
83 | if not products_to_compare:
84 | print("No products found to compare")
85 | return
86 |
87 | product_ids = [product.id for product in products_to_compare]
88 | product_ids_string = ", ".join(product_ids)
89 | title_ = product_ids_string[:100] + " ..." if len(product_ids_string) > 100 else product_ids_string
90 | show_products(products_to_compare, f"Comparing products with ids: {title_}")
91 |
92 |
93 | def show_master_products(master_products: tuple[MasterProduct], only_up_to_date: bool) -> None:
94 | if not master_products:
95 | print("No products found")
96 | return
97 |
98 | for master_product in master_products:
99 | if only_up_to_date and not is_master_product_up_to_date(master_product):
100 | continue
101 |
102 | status_of_master_product = get_status_of_master_product(master_product)
103 | show_products(master_product.products, f"Price(s) of {master_product.product_name} - {status_of_master_product}")
104 |
105 |
106 | def show_product(product: ProductInfo, title: str) -> None:
107 | show_products([product], title)
108 |
109 |
110 | def show_products(products: list[ProductInfo], title: str) -> None:
111 | fig = go.Figure()
112 | for product in products:
113 | add_scatter_plot(
114 | fig,
115 | product,
116 | name_format="%website - %name - %id",
117 | )
118 |
119 | num_products = len(products)
120 |
121 | config_figure(fig, title, num_products)
122 | fig.show(config={"scrollZoom": True})
123 |
124 |
125 | def get_master_products() -> tuple[MasterProduct]:
126 | master_products: list[MasterProduct] = []
127 |
128 | all_products = db.get_all_products_with_datapoints()
129 |
130 | unique_product_names = set([product.product_name for product in all_products])
131 |
132 | for unique_product_name in unique_product_names:
133 | products_from_db = db.get_products_by_names([unique_product_name])
134 | products = db.get_product_infos_from_products(products_from_db)
135 |
136 | category = products[0].category
137 | master_product = MasterProduct(unique_product_name, category, products)
138 | master_products.append(master_product)
139 |
140 | return tuple(master_products)
141 |
142 |
143 | def get_products_with_ids(
144 | master_products: tuple[MasterProduct], ids: list[str], only_up_to_date: bool
145 | ) -> Iterator[ProductInfo]:
146 | for master_product in master_products:
147 | for product in master_product.products:
148 | if only_up_to_date and not product.is_up_to_date:
149 | continue
150 |
151 | if product.id.lower() not in ids:
152 | continue
153 |
154 | yield product
155 |
156 |
157 | def get_master_products_with_categories(
158 | master_products: tuple[MasterProduct], categories: list[str], only_up_to_date: bool
159 | ) -> Iterator[MasterProduct]:
160 | for master_product in master_products:
161 | if master_product.category.lower() not in categories:
162 | continue
163 |
164 | if only_up_to_date and not is_master_product_up_to_date(master_product):
165 | continue
166 |
167 | yield master_product
168 |
169 |
170 | def get_master_products_with_names(
171 | master_products: tuple[MasterProduct], names: list[str], only_up_to_date: bool
172 | ) -> Iterator[MasterProduct]:
173 | for master_product in master_products:
174 | if master_product.product_name.lower() not in names:
175 | continue
176 |
177 | if only_up_to_date and not is_master_product_up_to_date(master_product):
178 | continue
179 |
180 | yield master_product
181 |
182 |
183 | def get_products_from_master_products(master_products: Iterable[MasterProduct]) -> list[ProductInfo]:
184 | return [product for master_product in master_products for product in master_product.products]
185 |
186 |
187 | def get_yvalue_for_configure_figure(num_products: int, min_value: int, max_value: int, max_num: int):
188 | value = ((num_products / max_num) * (max_value - min_value)) + min_value
189 |
190 | if value > max_value:
191 | value = max_value
192 | elif value < min_value:
193 | value = min_value
194 |
195 | return value
196 |
197 |
198 | def config_figure(figure: go.Figure, figure_title: str, num_products: int) -> None:
199 | figure.update_traces(mode="markers+lines", hovertemplate=None)
200 |
201 | y_value = get_yvalue_for_configure_figure(num_products, 0.1, 0.25, 30)
202 |
203 | figure.update_layout(
204 | title=dict(text=figure_title),
205 | xaxis_title="Date",
206 | yaxis_title="Price",
207 | hovermode="closest",
208 | separators=".,",
209 | legend=dict(orientation="h", y=-y_value, x=0, yref="paper", xref="paper", yanchor="top", xanchor="left"),
210 | hoverlabel_namelength=-1,
211 | )
212 |
213 |
214 | def add_scatter_plot(
215 | figure: go.Figure,
216 | product: ProductInfo,
217 | color: str = None,
218 | hover_text: str = None,
219 | name_format: str = None,
220 | ) -> None:
221 | scatter_name = product.to_string_format(name_format) if name_format else f"{product.website.capitalize()} - {product.id}"
222 | scatter_color = color if color else WEBSITE_COLORS[product.website]
223 | scatter_hover_text = hover_text if hover_text else "Price: %{y:.0f}" + f" {product.currency}"
224 |
225 | figure.add_trace(
226 | go.Scatter(
227 | name=scatter_name,
228 | x=product.get_all_dates(),
229 | y=product.get_all_prices(),
230 | line={"color": scatter_color, "width": 2},
231 | hovertemplate=scatter_hover_text,
232 | )
233 | )
234 |
235 |
236 | def is_datapoints_up_to_date(datapoints: list[DataPointInfo]) -> bool:
237 |     """Check if today and the last date in datapoints are at most 1 day apart"""
238 | if len(datapoints) == 0:
239 | return False
240 |
241 | return is_date_up_to_date(datapoints[-1].date)
242 |
243 |
244 | def is_date_up_to_date(date: str) -> bool:
245 |     """Check if today and the given date are at most 1 day apart"""
246 | latest_date = datetime.strptime(date, "%Y-%m-%d")
247 | date_diff = datetime.today() - latest_date
248 |
249 | return date_diff.days <= 1
250 |
251 |
252 | def is_master_product_up_to_date(master_product: MasterProduct) -> bool:
253 | return any((product.is_up_to_date for product in master_product.products))
254 |
255 |
256 | def get_status_of_master_product(master_product: MasterProduct) -> str:
257 | if is_master_product_up_to_date(master_product):
258 | return get_status_of_product_by_bool(True)
259 |
260 | return get_status_of_product_by_bool(False)
261 |
262 |
263 | def get_status_of_product(product: ProductInfo) -> str:
264 | return get_status_of_product_by_bool(product.is_up_to_date)
265 |
266 |
267 | def get_status_of_product_by_bool(up_to_date: bool) -> str:
268 | return "UP TO DATE" if up_to_date else "OUTDATED"
269 |
--------------------------------------------------------------------------------
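A quick illustration of the legend placement helper above: config_figure pushes the horizontal legend further below the plot as more products are drawn, clamped between min_value and max_value. The numbers below are the values config_figure actually passes in.

from scraper.visualize import get_yvalue_for_configure_figure

print(get_yvalue_for_configure_figure(10, 0.1, 0.25, 30))   # (10 / 30) * (0.25 - 0.1) + 0.1 ≈ 0.15
print(get_yvalue_for_configure_figure(60, 0.1, 0.25, 30))   # 0.4 before clamping, so capped at max_value = 0.25
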
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crinibus/scraper/3c37db625d4b47cdb547952e098d3a3cb494ab6f/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_add_product.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from contextlib import nullcontext as does_not_raise
3 |
4 | from scraper.add_product import add_product
5 | from scraper.exceptions import WebsiteNotSupported
6 | from scraper.models import Info
7 |
8 | test_domains = [
9 | ("https://www.amazon.com/", does_not_raise()),
10 | ("https://www.ebay.com/itm/", does_not_raise()),
11 | ("https://www.ebay.com/p/", does_not_raise()),
12 | ("https://www.komplett.dk/", does_not_raise()),
13 | ("https://www.proshop.dk/", does_not_raise()),
14 | ("https://www.computersalg.dk/", does_not_raise()),
15 | ("https://www.elgiganten.dk/", does_not_raise()),
16 | ("https://www.avxperten.dk/", does_not_raise()),
17 | ("https://www.av-cables.dk/", does_not_raise()),
18 | ("https://www.power.dk/", does_not_raise()),
19 | ("https://www.expert.dk/", does_not_raise()),
20 | ("https://www.mm-vision.dk/", does_not_raise()),
21 | ("https://www.coolshop.dk/", does_not_raise()),
22 | ("https://sharkgaming.dk/", does_not_raise()),
23 | ("https://www.newegg.com/", does_not_raise()),
24 | ("https://www.hifiklubben.dk/", does_not_raise()),
25 | ("https://us.shein.com/", does_not_raise()),
26 | ("https://www.notsupported.com/", pytest.raises(WebsiteNotSupported)),
27 | ]
28 |
29 |
30 | # Tests to make sure that supported websites can be added to be scraped
31 | @pytest.mark.parametrize("url,expectation", test_domains)
32 | def test_add_product(url, expectation, mocker) -> None:
33 | mock_info = Info(name="", price=1, currency="", id="")
34 | mocker.patch("scraper.Scraper.scrape_info", return_value=mock_info)
35 | mocker.patch("scraper.database.get_product_by_product_code", return_value=None)
36 | mocker.patch("scraper.add_product.add_new_product_to_db", return_value=None)
37 | mocker.patch("scraper.add_product.add_new_datapoint_with_scraper", return_value=None)
38 |
39 | with expectation:
40 | add_product("test", url)
41 |
--------------------------------------------------------------------------------
/tests/test_domains.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | import pytest
3 |
4 | from scraper.domains import get_website_name, get_number_string
5 |
6 |
7 | @dataclass
8 | class UrlSetting:
9 | keep_tld: bool = False
10 | keep_http: bool = False
11 | keep_www: bool = False
12 | keep_subdomain: bool = True
13 |
14 |
15 | test_websites = [
16 | ("https://www.amazon.com/", UrlSetting(), "amazon"),
17 | ("https://www.komplett.dk/", UrlSetting(), "komplett"),
18 | ("https://www.av-cables.dk/", UrlSetting(), "av-cables"),
19 | ("https://nowww.com/", UrlSetting(), "nowww"),
20 | ("https://no-ending-slash.com", UrlSetting(), "no-ending-slash"),
21 | ("https://www.test.testing.com/", UrlSetting(), "test.testing"),
22 | ("https://www.test.hello.com/hello/world", UrlSetting(), "test.hello"),
23 | ("https://sub.main.com", UrlSetting(keep_subdomain=False), "main"),
24 | ("https://www.sub.main.com", UrlSetting(keep_subdomain=False), "main"),
25 | ("https://main.com", UrlSetting(keep_subdomain=False), "main"),
26 | ("https://main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"),
27 | ("https://www.main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"),
28 | ("https://www.main.com/", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"),
29 | ("https://www.sub.main.com/", UrlSetting(keep_http=True), "https://sub.main"),
30 | ("https://www.sub.main.com/", UrlSetting(keep_http=True, keep_www=True), "https://www.sub.main"),
31 | ("https://www.sub.main.com/", UrlSetting(keep_http=True, keep_www=True, keep_subdomain=False), "https://www.main"),
32 | ]
33 |
34 |
35 | @pytest.mark.parametrize("url,setting,expected", test_websites)
36 | def test_get_website_name(url: str, setting: UrlSetting, expected: str) -> None:
37 | result = get_website_name(
38 | url,
39 | keep_tld=setting.keep_tld,
40 | keep_http=setting.keep_http,
41 | keep_www=setting.keep_www,
42 | keep_subdomain=setting.keep_subdomain,
43 | )
44 | assert result == expected
45 |
46 |
47 | test_price_values = [
48 | ("USD 12.40", "12.40"),
49 | ("$234.00", "234.00"),
50 | ("£345.37", "345.37"),
51 | ("486,89 kr", "486,89"),
52 | ("$345.37", "345.37"),
53 | ("£1345.37", "1345.37"),
54 | ("1345,37 DKK", "1345,37"),
55 | ("1345.37 DKK", "1345.37"),
56 | ("USD 1345.37", "1345.37"),
57 | ("USD 10345.37", "10345.37"),
58 | ]
59 |
60 |
61 | @pytest.mark.parametrize("value,expected", test_price_values)
62 | def test_get_number_string(value: str, expected: str) -> None:
63 | result = get_number_string(value)
64 |
65 | assert result == expected
66 |
--------------------------------------------------------------------------------
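
For reference, a couple of the cases above expressed as direct calls. This is only a sketch: it assumes the keyword defaults of get_website_name match the UrlSetting defaults used in the table (keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True); the expected values are copied from the parametrized tables, not independently verified.

    from scraper.domains import get_website_name, get_number_string

    # Mirrors the table entries above: scheme, www and trailing slash are stripped by default
    assert get_website_name("https://www.av-cables.dk/") == "av-cables"
    assert get_website_name("https://www.sub.main.com/", keep_http=True) == "https://sub.main"

    # get_number_string keeps only the numeric part of a price string
    assert get_number_string("486,89 kr") == "486,89"
    assert get_number_string("USD 12.40") == "12.40"
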
/tests/test_objects.json:
--------------------------------------------------------------------------------
1 | {
2 | "test_website_handlers": {
3 | "komplett": {
4 | "link": "https://www.komplett.dk/product/1205149/gaming/spiludstyr/vr/vr-briller/htc-vive-flow-sortreflekterende",
5 | "expected_title": "HTC VIVE Flow (sort/reflekterende)",
6 | "expected_id": "1205149",
7 | "expected_currency": "DKK"
8 | },
9 | "proshop": {
10 | "link": "https://www.proshop.dk/Hovedtelefonerheadset/Sony-WH-1000XM4/2883832",
11 | "expected_title": "Sony WH-1000XM4",
12 | "expected_id": "2883832",
13 | "expected_currency": "DKK"
14 | },
15 | "computersalg": {
16 | "link": "https://www.computersalg.dk/i/6647865/sony-wh-1000xm4-hovedtelefoner-med-mik-fuld-st%c3%b8rrelse-bluetooth-tr%c3%a5dl%c3%b8s-kabling-nfc-aktiv-st%c3%b8jfjerning-3-5-mm-jackstik-sort",
17 | "expected_title": "Sony WH-1000XM4 - Hovedtelefoner med mik. - fuld størrelse - Bluetooth - trådløs, kabling - NFC - aktiv støjfjerning - 3,5 mm jackstik - sort",
18 | "expected_id": "6647865",
19 | "expected_currency": "DKK"
20 | },
21 | "elgiganten": {
22 | "link": "https://www.elgiganten.dk/product/gaming/spillekonsol-tilbehor/playstation/playstation-spillekonsol/playstation-5-2022/533978",
23 | "expected_title": "PlayStation 5 (2022)",
24 | "expected_id": "533978",
25 | "expected_currency": "DKK"
26 | },
27 | "avxperten": {
28 | "link": "https://www.avxperten.dk/noise-cancelling-head-set/sony-wh-1000xm4-bluetooth-hovedtelefoner-anc-sort.asp",
29 | "expected_title": "Sony WH-1000XM4 Bluetooth hovedtelefoner (m/ANC) Sort",
30 | "expected_id": "33590",
31 | "expected_currency": "DKK"
32 | },
33 | "av-cables": {
34 | "link": "https://www.av-cables.dk/bluetooth-hoeretelefoner/sony-wh-1000xm4-over-ear-bluetooth-headset-sort.html",
35 | "expected_title": "Sony WH-1000XM4 Over-Ear Bluetooth Headset - Sort",
36 | "expected_id": "833015",
37 | "expected_currency": "DKK"
38 | },
39 | "amazon": {
40 | "link": "https://www.amazon.de/-/en/Google-Pixel-Pro-Smartphone-Obsidian/dp/B0DG9DD9VN",
41 | "expected_title": "Google Pixel 9 Pro (512GB, Obsi, EU / UK) + Pixel 9/9 Pro Case, Obsidian",
42 | "expected_id": "B0DG9DD9VN",
43 | "expected_currency": "EUR"
44 | },
45 | "ebay_with_itm": {
46 | "link": "https://www.ebay.com/itm/265771092654",
47 | "expected_title": "BRAND NEW Sony PS5 Playstation 5 Blu-Ray Disc Edition Console -Fast Delivery",
48 | "expected_id": "265771092654",
49 | "expected_currency": "USD"
50 | },
51 | "ebay_with_p": {
52 | "link": "https://www.ebay.com/p/17005345300?iid=391613649077",
53 | "expected_title": "O Hui Age Recovery Eye Cream 1ml X 40pcs (40ml) Baby Collagen OHUI",
54 | "expected_id": "17005345300",
55 | "expected_currency": "USD"
56 | },
57 | "expert": {
58 | "link": "https://www.expert.dk/hoejtalere-og-lyd/hovedtelefoner/traadloese-hovedtelefoner/sony-wh-1000xm4-traadloese-stoejdaempende-hovedtelefoner-sort/p-1106907/",
59 | "expected_title": "SONY WH-1000XM4 TRÅDLØSE STØJDÆMPENDE HOVEDTELEFONER, SORT",
60 | "expected_id": "1106907",
61 | "expected_currency": "DKK"
62 | },
63 | "power": {
64 | "link": "https://www.power.dk/tv-og-lyd/hovedtelefoner/traadloese-hovedtelefoner/sony-wh-1000xm4-traadloese-stoejdaempende-hovedtelefoner-blaa/p-1185731/",
65 | "expected_title": "SONY WH-1000XM4 TRÅDLØSE STØJDÆMPENDE HOVEDTELEFONER, BLÅ",
66 | "expected_id": "1185731",
67 | "expected_currency": "DKK"
68 | },
69 | "mm-vision": {
70 | "link": "https://www.mm-vision.dk/demo-asus-rog-flow-x16-gv601",
71 | "expected_title": "DEMO Asus ROG Flow X16 (GV601)",
72 | "expected_id": "6987145",
73 | "expected_currency": "DKK"
74 | },
75 | "coolshop": {
76 | "link": "https://www.coolshop.dk/produkt/pokemon-brilliant-diamond/238G6U/",
77 | "expected_title": "Pokemon Brilliant Diamond - Nintendo Switch",
78 | "expected_id": "1177871",
79 | "expected_currency": "DKK"
80 | },
81 | "sharkgaming": {
82 | "link": "https://sharkgaming.dk/asus-gladius-ii-origin-gaming-mouse",
83 | "expected_title": "ASUS Gladius II Origin gaming mouse",
84 | "expected_id": "90MP00U1-B0UA00",
85 | "expected_currency": "DKK"
86 | },
87 | "newegg": {
88 | "link": "https://www.newegg.com/sony-wh1000xm4b-bluetooth-headset-black/p/0G6-001C-00614?Description=sony%20xm4&cm_re=sony_xm4-_-0G6-001C-00614-_-Product&quicklink=true",
89 | "expected_title": "Sony WH-1000XM4 Wireless Industry Leading Noise Canceling Overhead Headphones with Mic for Phone-Call and Alexa Voice Control, Silver",
90 | "expected_id": "0G6-001C-00614",
91 | "expected_currency": "USD"
92 | },
93 | "hifiklubben": {
94 | "link": "https://www.hifiklubben.dk/sennheiser-momentum-4-wireless-hoeretelefoner/senmomentum4bk/",
95 | "expected_title": "SENNHEISER MOMENTUM 4 WIRELESS",
96 | "expected_id": "senmomentum4bk",
97 | "expected_currency": "DKK"
98 | },
99 | "shein": {
100 | "link": "https://euqs.shein.com/Men-s-Letter-Print-Slim-Fit-Short-Sleeve-T-Shirt-p-28492178.html",
101 | "expected_title": "Men's Letter Print Slim Fit Short Sleeve T-Shirt",
102 | "expected_id": "sm2311284334246374",
103 | "expected_currency": "EUR"
104 | }
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/tests/test_visualize.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from scraper.domains import SUPPORTED_DOMAINS
4 | from scraper.constants import WEBSITE_COLORS
5 |
6 |
7 | @pytest.mark.parametrize("domain", SUPPORTED_DOMAINS.keys())
8 | def test_get_website_color_for_supported_domain(domain: str) -> None:
9 | color = WEBSITE_COLORS.get(domain, None)
10 | assert color is not None
11 |
--------------------------------------------------------------------------------
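
The test above guards a simple invariant: every key in SUPPORTED_DOMAINS must also appear in WEBSITE_COLORS so the visualizer has a color to plot with. An equivalent one-shot formulation of the same check, as a sketch:

    from scraper.domains import SUPPORTED_DOMAINS
    from scraper.constants import WEBSITE_COLORS


    def test_all_supported_domains_have_a_color() -> None:
        missing = [domain for domain in SUPPORTED_DOMAINS if domain not in WEBSITE_COLORS]
        assert not missing, f"Supported domains without a color: {missing}"
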
/tests/test_website_handlers.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | import json
3 | from scraper.domains import (
4 | AmazonHandler,
5 | AvCablesHandler,
6 | AvXpertenHandler,
7 | BaseWebsiteHandler,
8 | ComputerSalgHandler,
9 | CoolshopHandler,
10 | EbayHandler,
11 | ElgigantenHandler,
12 | ExpertHandler,
13 | KomplettHandler,
14 | MMVisionHandler,
15 | NeweggHandler,
16 | PowerHandler,
17 | ProshopHandler,
18 | SharkGamingHandler,
19 | HifiKlubbenHandler,
20 | SheinHandler,
21 | )
22 | from scraper.models import Info
23 |
24 |
25 | def read_json(filename: str) -> dict:
26 | with open(filename, "r", encoding="utf8") as file:
27 | data = json.load(file)
28 | return data
29 |
30 |
31 | test_objects_json = read_json("./tests/test_objects.json")
32 |
33 | test_website_handlers_json: dict[str, dict[str, str]] = test_objects_json["test_website_handlers"]
34 |
35 | komplett_test = test_website_handlers_json["komplett"]
36 | proshop_test = test_website_handlers_json["proshop"]
37 | computersalg_test = test_website_handlers_json["computersalg"]
38 | elgiganten_test = test_website_handlers_json["elgiganten"]
39 | avxperten_test = test_website_handlers_json["avxperten"]
40 | avcables_test = test_website_handlers_json["av-cables"]
41 | amazon_test = test_website_handlers_json["amazon"]
42 | ebay_with_itm_test = test_website_handlers_json["ebay_with_itm"]
43 | ebay_with_p_test = test_website_handlers_json["ebay_with_p"]
44 | expert_test = test_website_handlers_json["expert"]
45 | power_test = test_website_handlers_json["power"]
46 | mmvision_test = test_website_handlers_json["mm-vision"]
47 | coolshop_test = test_website_handlers_json["coolshop"]
48 | sharkgaming_test = test_website_handlers_json["sharkgaming"]
49 | newegg_test = test_website_handlers_json["newegg"]
50 | hifiklubben_test = test_website_handlers_json["hifiklubben"]
51 | shein_test = test_website_handlers_json["shein"]
52 |
53 |
54 | class BaseTestWebsiteHandler(ABC):
55 | test_handler: BaseWebsiteHandler
56 |
57 | def setup_method(self) -> None:
58 |         if not self.test_handler.request_data:  # fetch the live product page only once per handler class
59 | self.test_handler._request_product_data()
60 | self.test_handler._get_common_data()
61 |
62 | @abstractmethod
63 | def test_get_product_info(self) -> None:
64 | pass
65 |
66 | @abstractmethod
67 | def test_get_name(self) -> None:
68 | pass
69 |
70 | @abstractmethod
71 | def test_get_price(self) -> None:
72 | pass
73 |
74 | @abstractmethod
75 | def test_get_currency(self) -> None:
76 | pass
77 |
78 | @abstractmethod
79 | def test_get_id(self) -> None:
80 | pass
81 |
82 |
83 | class TestKomplettHandler(BaseTestWebsiteHandler):
84 | test_handler = KomplettHandler(komplett_test["link"])
85 |
86 | def test_get_product_info(self, mocker) -> None:
87 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
88 | actual = self.test_handler.get_product_info()
89 | assert isinstance(actual, Info)
90 | assert actual.valid
91 |
92 | def test_get_name(self) -> None:
93 | actual = self.test_handler._get_product_name().lower()
94 | expected = komplett_test["expected_title"].lower()
95 | assert isinstance(actual, str)
96 | assert actual == expected
97 |
98 | def test_get_price(self) -> None:
99 | price = self.test_handler._get_product_price()
100 | assert isinstance(price, float)
101 |
102 | def test_get_currency(self) -> None:
103 | currency = self.test_handler._get_product_currency()
104 | assert isinstance(currency, str)
105 | assert currency == komplett_test["expected_currency"]
106 |
107 | def test_get_id(self) -> None:
108 | id = self.test_handler._get_product_id()
109 | assert isinstance(id, str)
110 | assert id == komplett_test["expected_id"]
111 |
112 |
113 | class TestProshopHandler(BaseTestWebsiteHandler):
114 | test_handler = ProshopHandler(proshop_test["link"])
115 |
116 | def test_get_product_info(self, mocker) -> None:
117 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
118 | actual = self.test_handler.get_product_info()
119 | assert isinstance(actual, Info)
120 | assert actual.valid
121 |
122 | def test_get_name(self) -> None:
123 | actual = self.test_handler._get_product_name().lower()
124 | expected = proshop_test["expected_title"].lower()
125 | assert isinstance(actual, str)
126 | assert actual == expected
127 |
128 | def test_get_price(self) -> None:
129 | price = self.test_handler._get_product_price()
130 | assert isinstance(price, float)
131 |
132 | def test_get_currency(self) -> None:
133 | currency = self.test_handler._get_product_currency()
134 | assert isinstance(currency, str)
135 | assert currency == proshop_test["expected_currency"]
136 |
137 | def test_get_id(self) -> None:
138 | id = self.test_handler._get_product_id()
139 | assert isinstance(id, str)
140 | assert id == proshop_test["expected_id"]
141 |
142 |
143 | class TestComputersalgHandler(BaseTestWebsiteHandler):
144 | test_handler = ComputerSalgHandler(computersalg_test["link"])
145 |
146 | def test_get_product_info(self, mocker) -> None:
147 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
148 | actual = self.test_handler.get_product_info()
149 | assert isinstance(actual, Info)
150 | assert actual.valid
151 |
152 | def test_get_name(self) -> None:
153 | actual = self.test_handler._get_product_name().lower()
154 | expected = computersalg_test["expected_title"].lower()
155 | assert isinstance(actual, str)
156 | assert actual == expected
157 |
158 | def test_get_price(self) -> None:
159 | price = self.test_handler._get_product_price()
160 | assert isinstance(price, float)
161 |
162 | def test_get_currency(self) -> None:
163 | currency = self.test_handler._get_product_currency()
164 | assert isinstance(currency, str)
165 | assert currency == computersalg_test["expected_currency"]
166 |
167 | def test_get_id(self) -> None:
168 | id = self.test_handler._get_product_id()
169 | assert isinstance(id, str)
170 | assert id == computersalg_test["expected_id"]
171 |
172 |
173 | class TestElgigantenHandler(BaseTestWebsiteHandler):
174 | test_handler = ElgigantenHandler(elgiganten_test["link"])
175 |
176 | def test_get_product_info(self, mocker) -> None:
177 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
178 | actual = self.test_handler.get_product_info()
179 | assert isinstance(actual, Info)
180 | assert actual.valid
181 |
182 | def test_get_name(self) -> None:
183 | actual = self.test_handler._get_product_name().lower()
184 | expected = elgiganten_test["expected_title"].lower()
185 | assert isinstance(actual, str)
186 | assert actual == expected
187 |
188 | def test_get_price(self) -> None:
189 | price = self.test_handler._get_product_price()
190 | assert isinstance(price, float)
191 |
192 | def test_get_currency(self) -> None:
193 | currency = self.test_handler._get_product_currency()
194 | assert isinstance(currency, str)
195 | assert currency == elgiganten_test["expected_currency"]
196 |
197 | def test_get_id(self) -> None:
198 | id = self.test_handler._get_product_id()
199 | assert isinstance(id, str)
200 | assert id == elgiganten_test["expected_id"]
201 |
202 |
203 | class TestAvXpertenHandler(BaseTestWebsiteHandler):
204 | test_handler = AvXpertenHandler(avxperten_test["link"])
205 |
206 | def test_get_product_info(self, mocker) -> None:
207 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
208 | actual = self.test_handler.get_product_info()
209 | assert isinstance(actual, Info)
210 | assert actual.valid
211 |
212 | def test_get_name(self) -> None:
213 | actual = self.test_handler._get_product_name().lower()
214 | expected = avxperten_test["expected_title"].lower()
215 | assert isinstance(actual, str)
216 | assert actual == expected
217 |
218 | def test_get_price(self) -> None:
219 | price = self.test_handler._get_product_price()
220 | assert isinstance(price, float)
221 |
222 | def test_get_currency(self) -> None:
223 | currency = self.test_handler._get_product_currency()
224 | assert isinstance(currency, str)
225 | assert currency == avxperten_test["expected_currency"]
226 |
227 | def test_get_id(self) -> None:
228 | id = self.test_handler._get_product_id()
229 | assert isinstance(id, str)
230 | assert id == avxperten_test["expected_id"]
231 |
232 |
233 | class TestAvCablesHandler(BaseTestWebsiteHandler):
234 | test_handler = AvCablesHandler(avcables_test["link"])
235 |
236 | def test_get_product_info(self, mocker) -> None:
237 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
238 | actual = self.test_handler.get_product_info()
239 | assert isinstance(actual, Info)
240 | assert actual.valid
241 |
242 | def test_get_name(self) -> None:
243 | actual = self.test_handler._get_product_name().lower()
244 | expected = avcables_test["expected_title"].lower()
245 | assert isinstance(actual, str)
246 | assert actual == expected
247 |
248 | def test_get_price(self) -> None:
249 | price = self.test_handler._get_product_price()
250 | assert isinstance(price, float)
251 |
252 | def test_get_currency(self) -> None:
253 | currency = self.test_handler._get_product_currency()
254 | assert isinstance(currency, str)
255 | assert currency == avcables_test["expected_currency"]
256 |
257 | def test_get_id(self) -> None:
258 | id = self.test_handler._get_product_id()
259 | assert isinstance(id, str)
260 | assert id == avcables_test["expected_id"]
261 |
262 |
263 | class TestAmazonHandler(BaseTestWebsiteHandler):
264 | test_handler = AmazonHandler(amazon_test["link"])
265 |
266 | def test_get_product_info(self, mocker) -> None:
267 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
268 | actual = self.test_handler.get_product_info()
269 | assert isinstance(actual, Info)
270 | assert actual.valid
271 |
272 | def test_get_name(self) -> None:
273 | actual = self.test_handler._get_product_name().lower()
274 | expected = amazon_test["expected_title"].lower()
275 | assert isinstance(actual, str)
276 | assert actual == expected
277 |
278 | def test_get_price(self) -> None:
279 | price = self.test_handler._get_product_price()
280 | assert isinstance(price, float)
281 |
282 | def test_get_currency(self) -> None:
283 | currency = self.test_handler._get_product_currency()
284 | assert isinstance(currency, str)
285 | assert currency == amazon_test["expected_currency"]
286 |
287 | def test_get_id(self) -> None:
288 | id = self.test_handler._get_product_id()
289 | assert isinstance(id, str)
290 | assert id == amazon_test["expected_id"]
291 |
292 |
293 | # NOTE: There are two eBay URL formats - this one is for URLs that start with 'ebay.com/itm/'
294 | class TestEbayHandler_with_itm(BaseTestWebsiteHandler):
295 | test_handler = EbayHandler(ebay_with_itm_test["link"])
296 |
297 | def test_get_product_info(self, mocker) -> None:
298 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
299 | actual = self.test_handler.get_product_info()
300 | assert isinstance(actual, Info)
301 | assert actual.valid
302 |
303 | def test_get_name(self) -> None:
304 | actual = self.test_handler._get_product_name().lower()
305 | expected = ebay_with_itm_test["expected_title"].lower()
306 | assert isinstance(actual, str)
307 | assert actual == expected
308 |
309 | def test_get_price(self) -> None:
310 | price = self.test_handler._get_product_price()
311 | assert isinstance(price, float)
312 |
313 | def test_get_currency(self) -> None:
314 | currency = self.test_handler._get_product_currency()
315 | assert isinstance(currency, str)
316 | assert len(currency) == 3
317 | assert currency == ebay_with_itm_test["expected_currency"]
318 |
319 | def test_get_id(self) -> None:
320 | id = self.test_handler._get_product_id()
321 | assert isinstance(id, str)
322 | assert id == ebay_with_itm_test["expected_id"]
323 |
324 |
325 | # NOTE: There are two eBay URL formats - this one is for URLs that start with 'ebay.com/p/'
326 | class TestEbayHandler_with_p(BaseTestWebsiteHandler):
327 | test_handler = EbayHandler(ebay_with_p_test["link"])
328 |
329 | def test_get_product_info(self, mocker) -> None:
330 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
331 | actual = self.test_handler.get_product_info()
332 | assert isinstance(actual, Info)
333 | assert actual.valid
334 |
335 | def test_get_name(self) -> None:
336 | actual = self.test_handler._get_product_name().lower()
337 | expected = ebay_with_p_test["expected_title"].lower()
338 | assert isinstance(actual, str)
339 | assert actual == expected
340 |
341 | def test_get_price(self) -> None:
342 | price = self.test_handler._get_product_price()
343 | assert isinstance(price, float)
344 |
345 | def test_get_currency(self) -> None:
346 | currency = self.test_handler._get_product_currency()
347 | assert isinstance(currency, str)
348 | assert len(currency) == 3
349 | # assert currency == ebay_with_p_test["expected_currency"]
350 |
351 | def test_get_id(self) -> None:
352 | id = self.test_handler._get_product_id()
353 | assert isinstance(id, str)
354 | assert id == ebay_with_p_test["expected_id"]
355 |
356 |
357 | class TestPowerHandler(BaseTestWebsiteHandler):
358 | test_handler = PowerHandler(power_test["link"])
359 |
360 | def test_get_product_info(self, mocker) -> None:
361 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
362 | actual = self.test_handler.get_product_info()
363 | assert isinstance(actual, Info)
364 | assert actual.valid
365 |
366 | def test_get_name(self) -> None:
367 | actual = self.test_handler._get_product_name().lower()
368 | expected = power_test["expected_title"].lower()
369 | assert isinstance(actual, str)
370 | assert actual == expected
371 |
372 | def test_get_price(self) -> None:
373 | price = self.test_handler._get_product_price()
374 | assert isinstance(price, float)
375 |
376 | def test_get_currency(self) -> None:
377 | currency = self.test_handler._get_product_currency()
378 | assert isinstance(currency, str)
379 | assert currency == power_test["expected_currency"]
380 |
381 | def test_get_id(self) -> None:
382 | id = self.test_handler._get_product_id()
383 | assert isinstance(id, str)
384 | assert id == power_test["expected_id"]
385 |
386 |
387 | class TestExpertHandler(BaseTestWebsiteHandler):
388 | test_handler = ExpertHandler(expert_test["link"])
389 |
390 | def test_get_product_info(self, mocker) -> None:
391 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
392 | actual = self.test_handler.get_product_info()
393 | assert isinstance(actual, Info)
394 | assert actual.valid
395 |
396 | def test_get_name(self) -> None:
397 | actual = self.test_handler._get_product_name().lower()
398 | expected = expert_test["expected_title"].lower()
399 | assert isinstance(actual, str)
400 | assert actual == expected
401 |
402 | def test_get_price(self) -> None:
403 | price = self.test_handler._get_product_price()
404 | assert isinstance(price, float)
405 |
406 | def test_get_currency(self) -> None:
407 | currency = self.test_handler._get_product_currency()
408 | assert isinstance(currency, str)
409 | assert currency == expert_test["expected_currency"]
410 |
411 | def test_get_id(self) -> None:
412 | id = self.test_handler._get_product_id()
413 | assert isinstance(id, str)
414 | assert id == expert_test["expected_id"]
415 |
416 |
417 | class TestMMVisionHandler(BaseTestWebsiteHandler):
418 | test_handler = MMVisionHandler(mmvision_test["link"])
419 |
420 | def test_get_product_info(self, mocker) -> None:
421 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
422 | actual = self.test_handler.get_product_info()
423 | assert isinstance(actual, Info)
424 | assert actual.valid
425 |
426 | def test_get_name(self) -> None:
427 | actual = self.test_handler._get_product_name().lower()
428 | expected = mmvision_test["expected_title"].lower()
429 | assert isinstance(actual, str)
430 | assert actual == expected
431 |
432 | def test_get_price(self) -> None:
433 | price = self.test_handler._get_product_price()
434 | assert isinstance(price, float)
435 |
436 | def test_get_currency(self) -> None:
437 | currency = self.test_handler._get_product_currency()
438 | assert isinstance(currency, str)
439 | assert currency == mmvision_test["expected_currency"]
440 |
441 | def test_get_id(self) -> None:
442 | id = self.test_handler._get_product_id()
443 | assert isinstance(id, str)
444 | assert id == mmvision_test["expected_id"]
445 |
446 |
447 | class TestCoolshopHandler(BaseTestWebsiteHandler):
448 | test_handler = CoolshopHandler(coolshop_test["link"])
449 |
450 | def test_get_product_info(self, mocker) -> None:
451 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
452 | actual = self.test_handler.get_product_info()
453 | assert isinstance(actual, Info)
454 | assert actual.valid
455 |
456 | def test_get_name(self) -> None:
457 | actual = self.test_handler._get_product_name().lower()
458 | expected = coolshop_test["expected_title"].lower()
459 | assert isinstance(actual, str)
460 | assert actual == expected
461 |
462 | def test_get_price(self) -> None:
463 | price = self.test_handler._get_product_price()
464 | assert isinstance(price, float)
465 |
466 | def test_get_currency(self) -> None:
467 | currency = self.test_handler._get_product_currency()
468 | assert isinstance(currency, str)
469 | assert currency == coolshop_test["expected_currency"]
470 |
471 | def test_get_id(self) -> None:
472 | id = self.test_handler._get_product_id()
473 | assert isinstance(id, str)
474 | assert id == coolshop_test["expected_id"]
475 |
476 |
477 | class TestSharkGamingHandler(BaseTestWebsiteHandler):
478 | test_handler = SharkGamingHandler(sharkgaming_test["link"])
479 |
480 | def test_get_product_info(self, mocker) -> None:
481 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
482 | actual = self.test_handler.get_product_info()
483 | assert isinstance(actual, Info)
484 | assert actual.valid
485 |
486 | def test_get_name(self) -> None:
487 | actual = self.test_handler._get_product_name().lower()
488 | expected = sharkgaming_test["expected_title"].lower()
489 | assert isinstance(actual, str)
490 | assert actual == expected
491 |
492 | def test_get_price(self) -> None:
493 | price = self.test_handler._get_product_price()
494 | assert isinstance(price, float)
495 |
496 | def test_get_currency(self) -> None:
497 | currency = self.test_handler._get_product_currency()
498 | assert isinstance(currency, str)
499 | assert currency == sharkgaming_test["expected_currency"]
500 |
501 | def test_get_id(self) -> None:
502 | id = self.test_handler._get_product_id()
503 | assert isinstance(id, str)
504 | assert id == sharkgaming_test["expected_id"]
505 |
506 |
507 | class TestNeweggHandler(BaseTestWebsiteHandler):
508 | test_handler = NeweggHandler(newegg_test["link"])
509 |
510 | def test_get_product_info(self, mocker) -> None:
511 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
512 | actual = self.test_handler.get_product_info()
513 | assert isinstance(actual, Info)
514 | assert actual.valid
515 |
516 | def test_get_name(self) -> None:
517 | actual = self.test_handler._get_product_name().lower()
518 | expected = newegg_test["expected_title"].lower()
519 | assert isinstance(actual, str)
520 | assert actual == expected
521 |
522 | def test_get_price(self) -> None:
523 | price = self.test_handler._get_product_price()
524 | assert isinstance(price, float)
525 |
526 | def test_get_currency(self) -> None:
527 | currency = self.test_handler._get_product_currency()
528 | assert isinstance(currency, str)
529 | assert currency == newegg_test["expected_currency"]
530 |
531 | def test_get_id(self) -> None:
532 | id = self.test_handler._get_product_id()
533 | assert isinstance(id, str)
534 | assert id == newegg_test["expected_id"]
535 |
536 |
537 | class TestHifiKlubbenHandler(BaseTestWebsiteHandler):
538 | test_handler = HifiKlubbenHandler(hifiklubben_test["link"])
539 |
540 | def test_get_product_info(self, mocker) -> None:
541 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
542 | actual = self.test_handler.get_product_info()
543 | assert isinstance(actual, Info)
544 | assert actual.valid
545 |
546 | def test_get_name(self) -> None:
547 | actual = self.test_handler._get_product_name().lower()
548 | expected = hifiklubben_test["expected_title"].lower()
549 | assert isinstance(actual, str)
550 | assert actual == expected
551 |
552 | def test_get_price(self) -> None:
553 | price = self.test_handler._get_product_price()
554 | assert isinstance(price, float)
555 |
556 | def test_get_currency(self) -> None:
557 | currency = self.test_handler._get_product_currency()
558 | assert isinstance(currency, str)
559 | assert currency == hifiklubben_test["expected_currency"]
560 |
561 | def test_get_id(self) -> None:
562 | id = self.test_handler._get_product_id()
563 | assert isinstance(id, str)
564 | assert id == hifiklubben_test["expected_id"]
565 |
566 |
567 | class TestSheinHandler(BaseTestWebsiteHandler):
568 | test_handler = SheinHandler(shein_test["link"])
569 |
570 | def test_get_product_info(self, mocker) -> None:
571 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
572 | actual = self.test_handler.get_product_info()
573 | assert isinstance(actual, Info)
574 | assert actual.valid
575 |
576 | def test_get_name(self) -> None:
577 | actual = self.test_handler._get_product_name().lower()
578 | expected = shein_test["expected_title"].lower()
579 | assert isinstance(actual, str)
580 | assert actual == expected
581 |
582 | def test_get_price(self) -> None:
583 | price = self.test_handler._get_product_price()
584 | assert isinstance(price, float)
585 |
586 | def test_get_currency(self) -> None:
587 | currency = self.test_handler._get_product_currency()
588 | assert isinstance(currency, str)
589 | assert currency == shein_test["expected_currency"]
590 |
591 | def test_get_id(self) -> None:
592 | id = self.test_handler._get_product_id()
593 | assert isinstance(id, str)
594 | assert id == shein_test["expected_id"]
595 |
--------------------------------------------------------------------------------
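
The handler test classes above all follow the same shape: set test_handler on the class, let the shared setup_method fetch the live page once, and assert against the expected values from test_objects.json. A hedged sketch of what coverage for an additional shop would look like; ExampleShopHandler and the "exampleshop" fixture entry are hypothetical and not part of the repository:

    exampleshop_test = test_website_handlers_json["exampleshop"]  # hypothetical fixture entry


    class TestExampleShopHandler(BaseTestWebsiteHandler):
        test_handler = ExampleShopHandler(exampleshop_test["link"])  # hypothetical handler class

        def test_get_product_info(self, mocker) -> None:
            mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
            actual = self.test_handler.get_product_info()
            assert isinstance(actual, Info)
            assert actual.valid

        def test_get_name(self) -> None:
            assert self.test_handler._get_product_name().lower() == exampleshop_test["expected_title"].lower()

        def test_get_price(self) -> None:
            assert isinstance(self.test_handler._get_product_price(), float)

        def test_get_currency(self) -> None:
            assert self.test_handler._get_product_currency() == exampleshop_test["expected_currency"]

        def test_get_id(self) -> None:
            assert self.test_handler._get_product_id() == exampleshop_test["expected_id"]
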