├── .flake8 ├── .gitattributes ├── .github └── workflows │ ├── codeql-analysis.yml │ └── python-app.yml ├── .gitignore ├── LICENSE ├── README.md ├── main.py ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── scraper ├── __init__.py ├── add_product.py ├── arguments.py ├── clean_data.py ├── constants.py ├── database │ ├── __init__.py │ ├── db.py │ ├── functions.py │ └── models.py ├── delete_data.py ├── domains.py ├── exceptions.py ├── filemanager.py ├── format.py ├── format_to_new.py ├── logfile.log ├── logging.ini ├── models │ ├── __init__.py │ └── product.py ├── print_products.py ├── products.csv ├── records.json ├── reset_data.py ├── scrape.py ├── search_data.py ├── settings.ini └── visualize.py └── tests ├── __init__.py ├── test_add_product.py ├── test_domains.py ├── test_objects.json ├── test_visualize.py └── test_website_handlers.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-complexity = 10 3 | max-line-length = 127 4 | exclude = .git,__pycache__ 5 | per-file-ignores = __init__.py:F401 6 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | name: "CodeQL" 7 | 8 | on: 9 | push: 10 | branches: [master] 11 | pull_request: 12 | # The branches below must be a subset of the branches above 13 | branches: [master] 14 | schedule: 15 | - cron: '0 11 * * 6' 16 | 17 | jobs: 18 | analyze: 19 | name: Analyze 20 | runs-on: ubuntu-latest 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | # Override automatic language detection by changing the below list 26 | # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] 27 | language: ['python'] 28 | # Learn more... 29 | # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection 30 | 31 | steps: 32 | - name: Checkout repository 33 | uses: actions/checkout@v2 34 | with: 35 | # We must fetch at least the immediate parents so that if this is 36 | # a pull request then we can checkout the head. 37 | fetch-depth: 2 38 | 39 | # Initializes the CodeQL tools for scanning. 40 | - name: Initialize CodeQL 41 | uses: github/codeql-action/init@v2 42 | with: 43 | languages: ${{ matrix.language }} 44 | # If you wish to specify custom queries, you can do so here or in a config file. 45 | # By default, queries listed here will override any specified in a config file. 46 | # Prefix the list here with "+" to use these queries and those in the config file. 47 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 48 | 49 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
50 | # If this step fails, then you should remove it and run the build manually (see below) 51 | - name: Autobuild 52 | uses: github/codeql-action/autobuild@v2 53 | 54 | # ℹ️ Command-line programs to run using the OS shell. 55 | # 📚 https://git.io/JvXDl 56 | 57 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 58 | # and modify them (or add more) to build your code if your project 59 | # uses a compiled language 60 | 61 | #- run: | 62 | # make bootstrap 63 | # make release 64 | 65 | - name: Perform CodeQL Analysis 66 | uses: github/codeql-action/analyze@v2 67 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | name: Python application 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.10"] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install flake8 pytest 27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --per-file-ignores=__init__.py:F401,tests/*:E501 --statistics 34 | # - name: Test with pytest 35 | # run: | 36 | # pytest -k "not Amazon" 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | __pycache__/ 3 | .pytest_cache/ 4 | 5 | test_new_features.py 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Crinibus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | - [Intro](#intro) 3 | - [Contributing](#contributing) 4 | - [Installation](#installation) 5 | - [Add products](#add-products) 6 | - [Websites to scrape from](#websites-to-scrape-from) 7 | - [Scrape products](#scrape-products) 8 | - [Delete data](#delete-data) 9 | - [User settings](#user-settings) 10 | - [Clean up data](#clean-up-data) 11 | - [View the latest datapoint of product(s)](#view-the-latest-datapoint-of-products) 12 | - [View all products](#view-all-products) 13 | - [Visualize data](#visualize-data) 14 | - [Command examples](#command-examples) 15 | 16 |
17 | 18 | 19 | ## Intro 20 | With this program you can easily scrape and track prices on products at multiple [websites](#websites-to-scrape-from).
21 | This program can also visualize the price over time of the products being tracked. That can be helpful if you want to buy a product in the future and want to know if a discount might be around the corner. 22 | 23 | **Requires** `python 3.10+` 24 | 25 |
26 | 27 | 28 | ## Contributing 29 | Feel free to fork the project and create a pull request with new features or refactoring of the code. Also feel free to open issues with problems or suggestions for new features. 30 | 31 |
32 | 33 | 34 |

**UPDATE TO HOW DATA IS STORED IN V1.1**

35 |

36 | 37 | In version v1.1, I have changed how data is stored in ```records.json```: ```dates``` under each product has been changed to ```datapoints``` and is now a list containing dictionaries with ```date``` and ```price``` keys.
38 | If you want to update your data to be compatible with version v1.1, then open an interactive python session where this repository is located and run the following commands: 39 | ``` 40 | >>> from scraper.format_to_new import Format 41 | >>> Format.format_old_records_to_new() 42 | ``` 43 | 44 |
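To illustrate the new format, the ```datapoints``` list under a product in records.json looks something like this (the dates and prices below are just an example):
```
"datapoints": [
    {"date": "2022-09-12", "price": 999.0},
    {"date": "2022-09-13", "price": 949.0}
]
```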

45 |
46 | 47 | 48 |

**UPDATE TO PRODUCTS.CSV IN V2.3.0**

49 |

50 | 51 | In version v2.3.0, I have added the column ```short_url``` to ```products.csv```. If you have added products before v2.3.0, then run the following commands in an interactive Python session to add the new column: 52 | ``` 53 | >>> from scraper.format_to_new import Format 54 | >>> Format.add_short_urls_to_products_csv() 55 | ``` 56 | 57 |

58 |
59 | 60 |

**UPDATE TO HOW DATA IS STORED IN V3.0.0**

61 |

62 | 63 | In version v3.0.0, I have changed where data is stored from a json file to a SQLite database. If you have data from before v3.0.0, then run the following commands in an interactive python session to add the data from records.json to the database (**OBS: Pandas is required**): 64 | ``` 65 | >>> from scraper.format_to_new import Format 66 | >>> Format.from_json_to_db() 67 | ``` 68 | 69 |
70 | 71 | **NOTE:** This will replace the content in the database with what is in records.json. That means if you have products and/or datapoints in the database but not in records.json, they will be deleted. 72 | 73 | 74 |
75 | 76 | OBS: If you don't have Pandas installed, run this command: 77 | ``` 78 | pip3 install pandas 79 | ``` 80 | 81 |

82 |
83 | 84 |
85 | 86 | 87 | ## Installation 88 | **Requires** `python 3.10+` 89 | 90 | Clone this repository and move into it: 91 | ``` 92 | git clone https://github.com/Crinibus/scraper.git 93 | ``` 94 | ``` 95 | cd scraper 96 | ``` 97 | 98 | Then make sure you have the required modules by running this in the terminal: 99 | ``` 100 | pip3 install -r requirements.txt 101 | ``` 102 | 103 |
104 | 105 | 106 | ## Add products 107 | To add a single product, use the following command, where you replace ```<category>``` and ```<url>``` with your category and url: 108 | ``` 109 | python3 main.py -a -c <category> -u <url> 110 | ``` 111 | 112 | e.g. 113 | ``` 114 | python3 main.py -a -c vr -u https://www.komplett.dk/product/1168594/gaming/spiludstyr/vr/vr-briller/oculus-quest-2-vr-briller 115 | ``` 116 | 117 | This adds the category (if new) and the product to the records.json file, and adds a line at the end of the products.csv file so the script can scrape the price of the new product. 118 | 119 |
120 | 121 | To add multiple products at once, just specify another category and url with ```-c <category>``` and ```-u <url>```. E.g. with the following command I add two products: 122 | ``` 123 | python3 main.py -a -c <category1> -u <url1> -c <category2> -u <url2> 124 | ``` 125 | This is equivalent to the above: 126 | ``` 127 | python3 main.py -a -c <category1> <category2> -u <url1> <url2> 128 | ``` 129 | 130 | **OBS**: The url must have a schema like: ```https://``` or ```http://```.
131 | **OBS**: If an error occurs when adding a product, it might be because the url has a ```&``` in it. When this happens, just put quotation marks around the url. This should solve the problem. If it doesn't, then submit an issue.
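For example, a url containing a ```&``` can be added like this (the url below is just an illustration):
```
python3 main.py -a -c gpu -u "https://www.example.com/product/1234?variant=5678&color=black"
```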
132 | 133 |
134 | 135 | 136 | ### Websites to scrape from 137 | This scraper can (so far) scrape prices on products from: 138 | - [Amazon](https://www.amazon.com/)* 139 | - [eBay.com](https://www.ebay.com/) 140 | - [Komplett.dk](https://www.komplett.dk/) 141 | - [Proshop.dk](https://www.proshop.dk/) 142 | - [Computersalg.dk](https://www.computersalg.dk/) 143 | - [Elgiganten.dk](https://www.elgiganten.dk/) & [Elgiganten.se](https://www.elgiganten.se/) 144 | - [AvXperten.dk](https://www.avxperten.dk/) 145 | - [Av-Cables.dk](https://www.av-cables.dk/) 146 | - [Power.dk](https://www.power.dk/) 147 | - [Expert.dk](https://www.expert.dk/) 148 | - [MM-Vision.dk](https://www.mm-vision.dk/) 149 | - [Coolshop.dk](https://www.coolshop.dk/) 150 | - [Sharkgaming.dk](https://www.sharkgaming.dk/) 151 | - [Newegg.com](https://www.newegg.com/) & [Newegg.ca](https://www.newegg.ca/) 152 | - [HifiKlubben.dk](https://www.hifiklubben.dk/) 153 | - [Shein.com](https://www.shein.com/) 154 | 155 | ****OBS these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)
156 | The listed Amazon domains are from my quick testing with one or two products from each domain.
157 | If you find that some other Amazon domains work or some of the listed ones don't, please create an issue.*** 158 | 159 |
160 | 161 | 162 | ## Scrape products 163 | To scrape prices of products run this in the terminal: 164 | ``` 165 | python3 main.py -s 166 | ``` 167 | To scrape with threads run the same command but with the ```--threads``` argument: 168 | ``` 169 | python3 main.py -s --threads 170 | ``` 171 | 172 |
173 | 174 | ## Activating and deactivating products 175 | 176 | When you add a new product, the product is activated to be scraped. If you don't want to scrape a product anymore, you can deactivate it with the following command: 177 | ``` 178 | python3 main.py --deactivate --id <id> 179 | ``` 180 | 181 | You can activate a product again with the following command: 182 | ``` 183 | python3 main.py --activate --id <id> 184 | ``` 185 | 186 |
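For example, to deactivate the product with id 849816 (one of the ids used in the example output further down in this README):
```
python3 main.py --deactivate --id 849816
```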
187 | 188 | ## Delete data 189 | 190 | If you want to start from scratch with no data in the records.json and products.csv files, then just run the following command: 191 | ``` 192 | python3 main.py --delete --all 193 | ``` 194 | 195 | You can also just delete some products or some categories: 196 | ``` 197 | python3 main.py --delete --id <id> 198 | ``` 199 | ``` 200 | python3 main.py --delete --name <name> 201 | ``` 202 | ``` 203 | python3 main.py --delete --category <category> 204 | ``` 205 | 206 | 207 | Then just add products like described [here](#add-products). 208 | 209 |
210 | 211 | If you just want to delete all datapoints for every product, then run this command: 212 | ``` 213 | python3 main.py --reset --all 214 | ``` 215 | 216 | 217 | You can also just delete datapoints for some products: 218 | ``` 219 | python3 main.py --reset --id <id> 220 | ``` 221 | ``` 222 | python3 main.py --reset --name <name> 223 | ``` 224 | ``` 225 | python3 main.py --reset --category <category> 226 | ``` 227 | 228 |
229 | 230 | 231 | ## User settings 232 | User settings can be added and changed in the file settings.ini. 233 | 234 | #### ChangeName 235 | Under the category ```ChangeName``` you can change how the script changes product names, so similar products will be placed under the same product in the records.json file. 236 | 237 | When adding a new setting under the category ```ChangeName``` in settings.ini, there must be a line with ```key<n>``` and a line with ```value<n>```, where ```<n>``` is the "link" between keywords and valuewords. E.g. ```value3``` is the value to ```key3```. 238 | 239 | In ```key<n>``` you set the keywords (separated by a comma) that the product name must have for it to be changed to what ```value<n>``` is equal to. For example, if the user settings are the following: 240 | 241 | ``` 242 | [ChangeName] 243 | key1 = asus,3080,rog,strix,oc 244 | value1 = asus geforce rtx 3080 rog strix oc 245 | ``` 246 | 247 | If a product name has all of the words in ```key1```, it gets changed to what ```value1``` is. 248 | 249 | #### Scraping 250 | You can change the delay between each url request by changing the field ```request_delay``` in the file scraper/settings.ini under the ```Scraping``` section. 251 | 252 | Default is 0 seconds, but to avoid the websites you scrape from thinking you are DDOS attacking them, or to avoid being temporarily restricted from scraping their websites, set request_delay in settings.ini to a higher number of seconds, e.g. 5 seconds. 253 | 254 |
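For example, a ```Scraping``` section in settings.ini that waits 5 seconds between each request could look like this:
```
[Scraping]
request_delay = 5
```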
255 | 256 | 257 | ## Clean up data 258 | If you want to clean up your data, meaning you want to remove unnecessary datapoints (datapoints that have the same price as the datapoint before and after it), then run the following command: 259 | ``` 260 | python3 main.py --clean-data 261 | ``` 262 |
263 | 264 | 265 | ## Search products and categories 266 | You can search for product names and categories you have in your records.json by using the argument ```--search <term> [<term> ...]```. The search is like a keyword search, so e.g. if you enter ```--search logitech``` all product names and categories that contain the word "logitech" are found. 267 | 268 | You can search with multiple keywords, just separate them with a space: ```--search logitech corsair```. Here all the product names and categories that contain the words "logitech" or "corsair" are found. 269 | 270 |
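For example:
```
python3 main.py --search logitech corsair
```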
271 | 272 | 273 | ## View the latest datapoint of product(s) 274 | If you want to view the latest datapoint of a product, you can use the argument ```--latest-datapoint``` with ```--id``` and/or ```--name```. 275 | 276 | Example: 277 | ``` 278 | python3 main.py --name "logitech z533" --latest-datapoint 279 | ``` 280 | 281 | The above command will show the latest datapoint for all the websites the specified product, in this case "logitech z533", has been scraped from and will show something like this: 282 | 283 | ``` 284 | LOGITECH Z533 285 | > Komplett - 849816 286 | - DKK 999.0 287 | - 2022-09-12 288 | > Proshop - 2511000 289 | - DKK 669.0 290 | - 2022-09-12 291 | > Avxperten - 25630 292 | - DKK 699.0 293 | - 2022-09-12 294 | ``` 295 | 296 |
297 | 298 | 299 | ## View all products 300 | To view all the products you have scraped, you can use the argument ```--list-products```. 301 | 302 | Example: 303 | ``` 304 | python3 main.py --list-products 305 | ``` 306 | 307 | This will list all the products in the following format: 308 | 309 | ``` 310 | CATEGORY 311 | > PRODUCT NAME 312 | - WEBSITE NAME - PRODUCT ID 313 | - ✓ WEBSITE NAME - PRODUCT ID 314 | ``` 315 | 316 | The check mark (✓) shows that the product is activated. 317 | 318 |
319 | 320 | 321 | ## Visualize data 322 | To visualize your data, just run main.py with the ```-v``` or ```--visualize``` argument and then specify which products you want to be visualized. These are your options for how you want to visualize your products: 323 | 324 | - ```--all``` to visualize all your products 325 | - ```-c [<category> [<category> ...]]``` or ```--category [<category> [<category> ...]]``` to visualize all products in one or more categories 326 | - ```--id [<id> [<id> ...]]``` to visualize one or more products with the specified id(s) 327 | - ```-n [<name> [<name> ...]]``` or ```--name [<name> [<name> ...]]``` to visualize one or more products with the specified name(s) 328 | - ```--compare``` to compare two or more products with the specified id(s), name(s) and/or category(ies) or all products on one graph. Use with ```--id```, ```--name```, ```--category``` and/or ```--all``` 329 | 330 | ### Example graph 331 | ![](https://user-images.githubusercontent.com/57172157/171033112-908f6420-6c7a-44ef-ba67-8a4a73bbd96e.png) 332 | 333 | ### Command examples 334 | **Show graphs for all products** 335 | 336 | To show graphs for all products, run the following command: 337 | ``` 338 | python3 main.py -v --all 339 | ``` 340 | 341 |
342 | 343 | **Show graph(s) for specific products** 344 | 345 | To show a graph for only one product, run the following command where ```<id>``` is the id of the product you want a graph for: 346 | ``` 347 | python3 main.py -v --id <id> 348 | ``` 349 | 350 | For multiple products, just add another id, like so: 351 | ``` 352 | python3 main.py -v --id <id1> <id2> 353 | ``` 354 | 355 |
356 | 357 | **Show graphs for products in one or more categories** 358 | 359 | To show graphs for all products in one category, run the following command where ```<category>``` is the category you want graphs from: 360 | ``` 361 | python3 main.py -v -c <category> 362 | ``` 363 | 364 | For multiple categories, just add another category, like so: 365 | ``` 366 | python3 main.py -v -c <category1> <category2> 367 | ``` 368 | 369 |
370 | 371 | **Show graphs for products with a specific name** 372 | 373 | To show graphs for product(s) with a specific name, run the following command where ```<name>``` is the name of the product(s) you want graphs for: 374 | ``` 375 | python3 main.py -v --name <name> 376 | ``` 377 | 378 | For multiple products with different names, just add another name, like so: 379 | ``` 380 | python3 main.py -v --name <name1> <name2> 381 | ``` 382 | 383 | If the name of a product has multiple words in it, then just add quotation marks around the name. 384 | 385 |
386 | 387 | **Only show graphs for products that are up to date** 388 | 389 | To only show graphs for the products that are up to date, use the flag ```--up-to-date``` or ```-utd```, like so: 390 | ``` 391 | python3 main.py -v --all -utd 392 | ``` 393 | The use of the flag ```-utd``` is only implemented when visualizing all products like the example above or when visualizing all products in a category: 394 | ``` 395 | python3 main.py -v -c <category> -utd 396 | ``` 397 | 398 |
399 | 400 | **Compare two products** 401 | 402 | To compare two products on one graph, use the flag ```--compare``` with flag ```--id```, ```--name```, ```--category``` and/or ```--all```, like so: 403 | ``` 404 | python3 main.py -v --compare --id 405 | ``` 406 | ``` 407 | python3 main.py -v --compare --name 408 | ``` 409 | ``` 410 | python3 main.py -v --compare --category 411 | ``` 412 | ``` 413 | python3 main.py -v --compare --id --name --category 414 | ``` 415 | ``` 416 | python3 main.py -v --compare --all 417 | ``` 418 | 419 | ***OBS** when using ```--name``` or ```--category``` multiple products can be visualized* 420 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import logging.config 3 | import logging 4 | import time 5 | import alive_progress 6 | import scraper 7 | 8 | alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False) 9 | 10 | 11 | def main() -> None: 12 | args = scraper.argparse_setup() 13 | 14 | if args.clean_data: 15 | scraper.clean_datapoints() 16 | 17 | if args.visualize: 18 | scraper.visualize_data(args.all, args.category, args.id, args.name, args.up_to_date, args.compare) 19 | 20 | if args.reset: 21 | scraper.reset(args.category, args.name, args.id, args.all) 22 | 23 | if args.add: 24 | scraper.add_products(args.category, args.url) 25 | 26 | if args.activate: 27 | scraper.update_products_is_active_with_product_codes(args.id, True) 28 | 29 | if args.deactivate: 30 | scraper.update_products_is_active_with_product_codes(args.id, False) 31 | 32 | if args.search: 33 | scraper.search(args.search) 34 | 35 | if args.scrape: 36 | if args.threads: 37 | scrape_with_threads() 38 | else: 39 | scrape() 40 | 41 | if args.latest_datapoint: 42 | scraper.print_latest_datapoints(args.name, args.id, args.category) 43 | 44 | if args.list_products: 45 | if any([args.name, args.id, args.category]): 46 | scraper.list_products_with_filters(args.name, args.id, args.category) 47 | else: 48 | scraper.print_all_products() 49 | 50 | if args.delete: 51 | scraper.delete(args.category, args.name, args.id, args.all) 52 | 53 | 54 | def scrape() -> None: 55 | print("Scraping...") 56 | 57 | request_delay = scraper.Config.get_request_delay() 58 | active_products = scraper.db.get_all_products(select_only_active=True) 59 | 60 | products = scraper.Format.db_products_to_scrapers(active_products) 61 | 62 | with alive_progress.alive_bar(len(products), title="Scraping") as bar: 63 | # Scrape and save scraped data for each product (sequentially) 64 | for product in products: 65 | bar.text = f"-> {product.url}" 66 | time.sleep(request_delay) 67 | product.scrape_info() 68 | scraper.add_product.add_new_datapoint_with_scraper(product) 69 | bar() 70 | 71 | 72 | def scrape_with_threads() -> None: 73 | print("Scraping with threads...") 74 | 75 | request_delay = scraper.Config.get_request_delay() 76 | 77 | grouped_db_products = scraper.db.get_all_products_grouped_by_domains(select_only_active=True) 78 | grouped_products: list[list[scraper.Scraper]] = [] 79 | 80 | for db_products in grouped_db_products: 81 | products = scraper.Format.db_products_to_scrapers(db_products) 82 | grouped_products.append(products) 83 | 84 | grouped_scraper_threads: list[list[threading.Thread]] = [] 85 | 86 | # Create scraper threads and group by domain 87 | for products in grouped_products: 88 | scraper_threads = 
[threading.Thread(target=product.scrape_info) for product in products] 89 | grouped_scraper_threads.append(scraper_threads) 90 | 91 | products_flatten = [product for products in grouped_products for product in products] 92 | 93 | with alive_progress.alive_bar(len(products_flatten), title="Scraping with threads") as progress_bar: 94 | # Create master threads to manage scraper threads sequentially for each domain 95 | master_threads = [ 96 | threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay, progress_bar]) 97 | for scraper_threads in grouped_scraper_threads 98 | ] 99 | 100 | # Start all master threads 101 | for master_thread in master_threads: 102 | master_thread.start() 103 | 104 | # Wait for all master threads to finish 105 | for master_thread in master_threads: 106 | master_thread.join() 107 | 108 | # Save scraped data for each product (sequentially) 109 | for product in products_flatten: 110 | scraper.add_product.add_new_datapoint_with_scraper(product) 111 | 112 | 113 | if __name__ == "__main__": 114 | scraper.db.create_db_and_tables() 115 | logging.config.fileConfig( 116 | fname=scraper.Filemanager.logging_ini_path, 117 | defaults={"logfilename": scraper.Filemanager.logfile_path}, 118 | ) 119 | 120 | main() 121 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 127 3 | 4 | [tool.ruff] 5 | line-length = 127 6 | 7 | [tool.ruff.per-file-ignores] 8 | "__init__.py" = ["E402"] 9 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = 3 | tests 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.24.0 2 | beautifulsoup4>=4.9.1 3 | plotly>=4.12.0 4 | pytest>=7.1.2 5 | pytest-mock>=3.8.2 6 | alive-progress>=2.4.1 7 | flake8>=6.0.0 8 | sqlmodel>=0.0.8 9 | -------------------------------------------------------------------------------- /scraper/__init__.py: -------------------------------------------------------------------------------- 1 | from .scrape import Scraper, start_threads_sequentially 2 | from .arguments import argparse_setup 3 | from .add_product import add_products, update_products_is_active_with_product_codes 4 | from .filemanager import Filemanager, Config 5 | from .visualize import visualize_data 6 | from .clean_data import clean_datapoints 7 | from .delete_data import delete 8 | from .reset_data import reset 9 | from .search_data import search 10 | from .print_products import print_latest_datapoints, print_all_products, list_products_with_filters 11 | from .format import Format 12 | import scraper.database as db 13 | 14 | 15 | __author__ = "Crinibus" 16 | -------------------------------------------------------------------------------- /scraper/add_product.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | import scraper.database as db 5 | from scraper.exceptions import WebsiteNotSupported, URLMissingSchema 6 | from scraper.format import Format 7 | from scraper.scrape import Scraper 8 | from scraper.domains import get_website_name, SUPPORTED_DOMAINS 9 | from scraper.constants import 
URL_SCHEMES 10 | 11 | 12 | def add_products(categories: list[str], urls: list[str]) -> None: 13 | for category, url in zip(categories, urls): 14 | try: 15 | add_product(category, url) 16 | except (WebsiteNotSupported, URLMissingSchema) as err: 17 | logging.getLogger(__name__).error(err) 18 | print(err) 19 | 20 | 21 | def add_product(category: str, url: str) -> None: 22 | logger = logging.getLogger(__name__) 23 | 24 | website_name = get_website_name(url, keep_subdomain=False) 25 | 26 | if website_name not in SUPPORTED_DOMAINS.keys(): 27 | raise WebsiteNotSupported(website_name) 28 | 29 | if is_missing_url_schema(url): 30 | raise URLMissingSchema(url) 31 | 32 | print(f"Adding product with category '{category}' and url '{url}'") 33 | logger.info(f"Adding product with category '{category}' and url '{url}'") 34 | 35 | new_product = Scraper(category, url) 36 | new_product_info = new_product.scrape_info() 37 | 38 | if not new_product_info.valid: 39 | print("Product info is not valid - see logs for more info") 40 | return 41 | 42 | product_in_db = db.get_product_by_product_code(new_product_info.id) 43 | 44 | if product_in_db is None: 45 | add_new_product_to_db(new_product) 46 | add_new_datapoint_with_scraper(new_product) 47 | return 48 | 49 | logger.info("Product with the same product code already exists in database") 50 | 51 | if product_in_db.is_active: 52 | print("Product with the same product code already exists in database and is active") 53 | return 54 | 55 | user_input = input( 56 | "A product with the same product id already exist in the database but is not active, " 57 | "do you want to activate it? (y/n) > " 58 | ) 59 | 60 | if user_input.lower() in ("y", "yes"): 61 | print("Activating product...") 62 | set_existing_product_is_active(product_in_db, True) 63 | logger.info("Product has been activated") 64 | else: 65 | print("Product has not been activated") 66 | logger.info("Product not activated") 67 | 68 | 69 | def add_new_product_to_db(product: Scraper) -> None: 70 | product_to_db = Format.scraper_to_db_product(product, True) 71 | db.add(product_to_db) 72 | 73 | 74 | def add_new_datapoint_to_db(product_code: str, price: float, currency: str, date: str | None = None): 75 | """Parameter 'date' defaults to the date of today in the format: YYYY-MM-DD""" 76 | if date is None: 77 | date = datetime.today().strftime("%Y-%m-%d") 78 | 79 | new_datapoint = db.DataPoint( 80 | product_code=product_code, 81 | date=date, 82 | price=price, 83 | currency=currency, 84 | ) 85 | 86 | db.add(new_datapoint) 87 | 88 | 89 | def add_new_datapoint_with_scraper(product: Scraper, date: str | None = None) -> None: 90 | if not product.product_info or not product.product_info.valid: 91 | print(f"Product info is not valid - category: '{product.category}' - url: {product.url}") 92 | return 93 | 94 | product_code = product.product_info.id 95 | price = product.product_info.price 96 | currency = product.product_info.currency 97 | 98 | add_new_datapoint_to_db(product_code, price, currency, date) 99 | 100 | 101 | def update_products_is_active_with_product_codes(product_codes: list[str], is_active: bool) -> None: 102 | action = "Activating" if is_active else "Deactivating" 103 | 104 | for product_code in product_codes: 105 | print(f"{action} {product_code}") 106 | product = db.get_product_by_product_code(product_code) 107 | set_existing_product_is_active(product, is_active) 108 | 109 | 110 | def set_existing_product_is_active(product: db.Product, is_active: bool) -> None: 111 | product.is_active = is_active 112 | 
db.add(product) 113 | 114 | 115 | def is_missing_url_schema(url: str) -> bool: 116 | return not any(schema in url for schema in URL_SCHEMES) 117 | -------------------------------------------------------------------------------- /scraper/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def argparse_setup() -> argparse.Namespace: 5 | """Setup and return argparse.""" 6 | parser = argparse.ArgumentParser(description="") 7 | 8 | parser.add_argument( 9 | "-s", 10 | "--scrape", 11 | help="scrape product info", 12 | action="store_true", 13 | ) 14 | 15 | parser.add_argument("--threads", help="use threads when scraping product info", action="store_true") 16 | 17 | parser.add_argument( 18 | "-a", 19 | "--add", 20 | help="Add a new product", 21 | action="store_true", 22 | ) 23 | 24 | parser.add_argument( 25 | "-c", 26 | "--category", 27 | help="specify category(s)", 28 | type=str, 29 | nargs="*", 30 | action="extend", 31 | default=[], 32 | ) 33 | 34 | parser.add_argument("-u", "--url", help="the url to the product", type=str, nargs="*", action="extend") 35 | 36 | parser.add_argument("--activate", help="activate a product to be scraped", action="store_true") 37 | 38 | parser.add_argument("--deactivate", help="deactivate a product to not be scraped", action="store_true") 39 | 40 | parser.add_argument( 41 | "-v", 42 | "--visualize", 43 | help="visualize your product data", 44 | action="store_true", 45 | dest="visualize", 46 | ) 47 | 48 | parser.add_argument( 49 | "--all", 50 | help="specify all products", 51 | action="store_true", 52 | dest="all", 53 | ) 54 | 55 | parser.add_argument( 56 | "--id", 57 | help="specify id(s) of product(s)", 58 | type=str, 59 | nargs="*", 60 | action="extend", 61 | dest="id", 62 | default=[], 63 | ) 64 | 65 | parser.add_argument( 66 | "-n", 67 | "--name", 68 | help="specify names(s) of product(s)", 69 | type=str, 70 | nargs="*", 71 | action="extend", 72 | dest="name", 73 | default=[], 74 | ) 75 | 76 | parser.add_argument( 77 | "-utd", 78 | "--up-to-date", 79 | help="show only graph for a product if the latest product price is today", 80 | action="store_true", 81 | dest="up_to_date", 82 | ) 83 | 84 | parser.add_argument( 85 | "--search", 86 | help="search for product names with the specified name(s)", 87 | type=str, 88 | nargs="*", 89 | action="extend", 90 | dest="search", 91 | metavar="SEARCH_TERM", 92 | ) 93 | 94 | parser.add_argument( 95 | "--compare", 96 | help="compare two or more products", 97 | action="store_true", 98 | dest="compare", 99 | ) 100 | 101 | parser.add_argument( 102 | "--reset", 103 | help="delete data for each product in records.json, such as prices of each recorded day", 104 | action="store_true", 105 | ) 106 | 107 | parser.add_argument( 108 | "--clean-data", 109 | help="clean data so unnecessary product datapoints is removed from records", 110 | action="store_true", 111 | dest="clean_data", 112 | ) 113 | 114 | parser.add_argument( 115 | "--latest-datapoint", 116 | help="get the latest datapoint of specified product(s)", 117 | dest="latest_datapoint", 118 | action="store_true", 119 | ) 120 | 121 | parser.add_argument( 122 | "--list-products", 123 | help="lists the names, websites and ids of all products", 124 | dest="list_products", 125 | action="store_true", 126 | ) 127 | 128 | parser.add_argument( 129 | "--delete", 130 | help="delete all or specific products or categories", 131 | dest="delete", 132 | action="store_true", 133 | ) 134 | 135 | args = validate_arguments(parser) 
136 | 137 | return args 138 | 139 | 140 | def validate_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace: 141 | """Validate arguments""" 142 | args = parser.parse_args() 143 | 144 | if args.add and args.visualize: 145 | parser.error("Cannot use --add and --visualize at the same time") 146 | 147 | if args.activate and args.deactivate: 148 | parser.error("Cannot use --activate and --deactivate at the same time") 149 | 150 | if (args.activate or args.deactivate) and not args.id: 151 | parser.error("When using --activate or --deactivate, then --id is required") 152 | 153 | if args.delete: 154 | if args.all and any([args.category, args.name, args.id]): 155 | parser.error("When using --delete and --all, then using --category, --name or --id does nothing") 156 | 157 | if args.add: 158 | if not args.category or not args.url: 159 | parser.error("When using --add, then --category and --url is required") 160 | if len(args.category) > len(args.url): 161 | parser.error("Specified more categories than urls") 162 | if len(args.category) < len(args.url): 163 | parser.error("Specified more urls than categories") 164 | 165 | if args.visualize: 166 | if not any([args.all, args.category, args.id, args.name, args.compare]): 167 | parser.error( 168 | "When using --visualize, then one of the following is required: --all, --category, --id, --name, --compare" 169 | ) 170 | if args.compare and not any([args.id, args.name, args.category, args.all]): 171 | parser.error( 172 | "When using --visualize and --compare, then one of the following is required: --id, --name, --category, --all" 173 | ) 174 | 175 | if args.latest_datapoint: 176 | if not any([args.name, args.id, args.category]): 177 | parser.error("When using --latest-datapoint, then --name, --id or --category is required") 178 | 179 | return args 180 | -------------------------------------------------------------------------------- /scraper/clean_data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import scraper.database as db 4 | 5 | 6 | def clean_datapoints() -> None: 7 | print("Cleaning data...") 8 | logging.getLogger(__name__).info("Cleaning database datapoints") 9 | 10 | all_products = db.get_all_products() 11 | datapoints_to_delete = [] 12 | 13 | for product in all_products: 14 | datapoints = db.get_datapoints_by_product_codes([product.product_code]) 15 | 16 | datapoints.sort(key=lambda product: product.date) 17 | 18 | for index, datapoint in enumerate(datapoints): 19 | if index in (0, len(datapoints) - 1): 20 | continue 21 | 22 | previous_datapoint = datapoints[index - 1] 23 | next_datapoint = datapoints[index + 1] 24 | 25 | if datapoint.price == previous_datapoint.price and datapoint.price == next_datapoint.price: 26 | datapoints_to_delete.append(datapoint) 27 | 28 | db.delete_all(datapoints_to_delete) 29 | 30 | print("Done cleaning data") 31 | -------------------------------------------------------------------------------- /scraper/constants.py: -------------------------------------------------------------------------------- 1 | REQUEST_HEADER = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:112.0) Gecko/20100101 Firefox/112.0"} 2 | 3 | REQUEST_COOKIES = {"cookies_are": "working"} 4 | 5 | WEBSITE_COLORS = { 6 | "komplett": "orange", 7 | "proshop": "red", 8 | "computersalg": "blue", 9 | "elgiganten": "green", 10 | "avxperten": "aqua", 11 | "av-cables": "aquamarine", 12 | "amazon": "black", 13 | "ebay": "crimson", 14 | "power": "salmon", 15 | "expert": 
"olivedrab", 16 | "mm-vision": "mediumspringgreen", 17 | "coolshop": "mediumblue", 18 | "sharkgaming": "midnightblue", 19 | "newegg": "#f7c20a", 20 | "hifiklubben": "#231f20", 21 | "shein": "#ffed24", 22 | } 23 | 24 | URL_SCHEMES = ("http://", "https://") 25 | 26 | CHECK_MARK = "\u2713" 27 | -------------------------------------------------------------------------------- /scraper/database/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import Product, DataPoint 2 | from .db import create_db_and_tables, engine 3 | 4 | from .functions import ( 5 | delete_all, 6 | add, 7 | add_all, 8 | get_all_products, 9 | get_all_datapoints, 10 | get_product_by_product_code, 11 | get_products_by_product_codes, 12 | get_products_by_categories, 13 | get_products_by_names, 14 | get_products_by_names_fuzzy, 15 | get_datapoints_by_categories, 16 | get_datapoints_by_names, 17 | get_datapoints_by_product_codes, 18 | get_all_products_with_datapoints, 19 | get_product_infos_from_products, 20 | get_all_unique_categories, 21 | get_all_unique_domains, 22 | get_products_by_domains, 23 | get_all_products_grouped_by_domains, 24 | group_products_by_domains, 25 | group_products_by_names, 26 | ) 27 | -------------------------------------------------------------------------------- /scraper/database/db.py: -------------------------------------------------------------------------------- 1 | from sqlmodel import SQLModel, create_engine 2 | from pathlib import Path 3 | from scraper.filemanager import Filemanager 4 | from .models import Product, DataPoint # noqa: F401 5 | 6 | sqlite_url = f"sqlite:///{Filemanager.database_path}" 7 | 8 | Path(Filemanager.database_folder).mkdir(exist_ok=True) 9 | 10 | engine = create_engine(sqlite_url, echo=False) 11 | 12 | 13 | def create_db_and_tables(): 14 | SQLModel.metadata.create_all(engine) 15 | -------------------------------------------------------------------------------- /scraper/database/functions.py: -------------------------------------------------------------------------------- 1 | from sqlmodel import Session, select, col 2 | 3 | from scraper.models.product import DataPointInfo, ProductInfo 4 | from .db import engine 5 | from .models import Product, DataPoint 6 | 7 | 8 | def delete_all(elements: list[Product | DataPoint]) -> None: 9 | with Session(engine) as session: 10 | for element in elements: 11 | session.delete(element) 12 | session.commit() 13 | 14 | 15 | def add(element: Product | DataPoint) -> None: 16 | with Session(engine) as session: 17 | session.add(element) 18 | session.commit() 19 | 20 | 21 | def add_all(elements: list[Product | DataPoint]) -> None: 22 | with Session(engine) as session: 23 | session.add_all(elements) 24 | session.commit() 25 | 26 | 27 | def get_all_products(select_only_active: bool = False) -> list[Product]: 28 | with Session(engine) as session: 29 | query = select(Product) 30 | 31 | if select_only_active: 32 | query = query.where(Product.is_active) 33 | 34 | return session.exec(query).all() 35 | 36 | 37 | def get_all_datapoints() -> list[DataPoint]: 38 | with Session(engine) as session: 39 | return session.exec(select(DataPoint)).all() 40 | 41 | 42 | def get_all_unique_categories() -> list[str]: 43 | with Session(engine) as session: 44 | return session.exec(select(Product.category).distinct()).all() 45 | 46 | 47 | def get_all_unique_domains() -> list[str]: 48 | with Session(engine) as session: 49 | return session.exec(select(Product.domain).distinct()).all() 50 | 51 | 52 | def 
get_product_by_product_code(product_code: str) -> Product | None: 53 | with Session(engine) as session: 54 | return session.exec(select(Product).where(Product.product_code == product_code)).first() 55 | 56 | 57 | def get_products_by_product_codes(product_codes: list[str]) -> list[Product]: 58 | with Session(engine) as session: 59 | return session.exec(select(Product).where(col(Product.product_code).in_(product_codes))).all() 60 | 61 | 62 | def get_products_by_categories(categories: list[str]) -> list[Product]: 63 | with Session(engine) as session: 64 | return session.exec(select(Product).where(col(Product.category).in_(categories))).all() 65 | 66 | 67 | def get_products_by_names(names: list[str]) -> list[Product]: 68 | with Session(engine) as session: 69 | return session.exec(select(Product).where(col(Product.name).in_(names))).all() 70 | 71 | 72 | def get_products_by_names_fuzzy(names: list[str]) -> list[Product]: 73 | with Session(engine) as session: 74 | matched_products = [] 75 | 76 | for name in names: 77 | fuzzy_name = f"%{name}%" 78 | products = session.exec(select(Product).where(col(Product.name).like(fuzzy_name))).all() 79 | matched_products.extend(products) 80 | 81 | return matched_products 82 | 83 | 84 | def get_products_by_domains(domains: list[str], select_only_active: bool = False) -> list[Product]: 85 | with Session(engine) as session: 86 | query = select(Product).where(col(Product.domain).in_(domains)) 87 | 88 | if select_only_active: 89 | query = query.where(Product.is_active) 90 | 91 | return session.exec(query).all() 92 | 93 | 94 | def get_datapoints_by_categories(categories: list[str]) -> list[DataPoint]: 95 | with Session(engine) as session: 96 | products = session.exec(select(Product).where(col(Product.category).in_(categories))).all() 97 | product_codes = [product.product_code for product in products] 98 | datapoints = session.exec(select(DataPoint).where(col(DataPoint.product_code).in_(product_codes))).all() 99 | return datapoints 100 | 101 | 102 | def get_datapoints_by_names(names: list[str]) -> list[DataPoint]: 103 | with Session(engine) as session: 104 | products = session.exec(select(Product).where(col(Product.name).in_(names))).all() 105 | product_codes = [product.product_code for product in products] 106 | datapoints = session.exec(select(DataPoint).where(col(DataPoint.product_code).in_(product_codes))).all() 107 | return datapoints 108 | 109 | 110 | def get_datapoints_by_product_codes(product_codes: list[str]) -> list[DataPoint]: 111 | with Session(engine) as session: 112 | products = session.exec(select(Product).where(col(Product.product_code).in_(product_codes))).all() 113 | found_product_codes = [product.product_code for product in products] 114 | datapoints = session.exec(select(DataPoint).where(col(DataPoint.product_code).in_(found_product_codes))).all() 115 | return datapoints 116 | 117 | 118 | def get_datapoints_by_product(product: Product) -> list[DataPoint]: 119 | with Session(engine) as session: 120 | datapoints = session.exec( 121 | select(DataPoint).where(DataPoint.product_code == product.product_code).order_by(DataPoint.date) 122 | ).all() 123 | return datapoints 124 | 125 | 126 | def get_all_products_with_datapoints(select_only_active: bool = False) -> list[ProductInfo]: 127 | products = get_all_products(select_only_active=select_only_active) 128 | return get_product_infos_from_products(products) 129 | 130 | 131 | def get_product_infos_from_products(products: list[Product]) -> list[ProductInfo]: 132 | product_infos: list[ProductInfo] = [] 133 | 
134 | for product in products: 135 | datapoints = get_datapoints_by_product(product) 136 | 137 | datapoint_infos = [DataPointInfo(date=datapoint.date, price=datapoint.price) for datapoint in datapoints] 138 | 139 | product_info = ProductInfo( 140 | id=product.product_code, 141 | product_name=product.name, 142 | category=product.category, 143 | currency=datapoints[0].currency if datapoints else "", 144 | datapoints=datapoint_infos, 145 | url=product.url, 146 | website=product.domain, 147 | ) 148 | 149 | product_infos.append(product_info) 150 | 151 | return product_infos 152 | 153 | 154 | def get_all_products_grouped_by_domains(select_only_active: bool = False) -> list[list[Product]]: 155 | all_products = get_all_products(select_only_active=select_only_active) 156 | return group_products_by_domains(all_products) 157 | 158 | 159 | def group_products_by_domains(products: list[Product]) -> list[list[Product]]: 160 | grouped_products = [] 161 | 162 | unique_domains = set([product.domain for product in products]) 163 | 164 | for domain in unique_domains: 165 | products_with_domain = list(filter(lambda product: product.domain == domain, products)) 166 | 167 | if not products_with_domain: 168 | continue 169 | 170 | grouped_products.append(products_with_domain) 171 | 172 | return grouped_products 173 | 174 | 175 | def group_products_by_names(products: list[Product]) -> list[list[Product]]: 176 | grouped_products = [] 177 | 178 | unique_names = set([product.name for product in products]) 179 | 180 | for name in unique_names: 181 | products_with_name = list(filter(lambda product: product.name == name, products)) 182 | 183 | if not products_with_name: 184 | continue 185 | 186 | grouped_products.append(products_with_name) 187 | 188 | return grouped_products 189 | -------------------------------------------------------------------------------- /scraper/database/models.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from sqlmodel import Field, SQLModel 3 | 4 | 5 | class Product(SQLModel, table=True): 6 | __tablename__: str = "products" 7 | 8 | id: int = Field(default=None, primary_key=True) 9 | product_code: str 10 | name: str 11 | category: str 12 | domain: str 13 | url: str 14 | short_url: str 15 | is_active: bool 16 | created_at: datetime = Field(default_factory=datetime.utcnow, nullable=False) 17 | 18 | 19 | class DataPoint(SQLModel, table=True): 20 | __tablename__: str = "datapoints" 21 | 22 | id: int = Field(default=None, primary_key=True) 23 | product_code: str 24 | date: str 25 | price: float 26 | currency: str 27 | created_at: datetime = Field(default_factory=datetime.utcnow, nullable=False) 28 | -------------------------------------------------------------------------------- /scraper/delete_data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import scraper.database as db 3 | 4 | 5 | def delete(categories: list[str], names: list[str], product_codes: list[str], all: bool) -> None: 6 | print("Deleting...") 7 | logging.getLogger(__name__).info(f"Deleting products and datapoint for {categories=}, {names=}, {product_codes=}, {all=}") 8 | 9 | if all: 10 | delete_all() 11 | return 12 | 13 | if categories: 14 | delete_products_by_categories(categories) 15 | 16 | if names: 17 | delete_products_by_names(names) 18 | 19 | if product_codes: 20 | delete_products_by_product_codes(product_codes) 21 | 22 | 23 | def delete_all() -> None: 24 | print("Deleting all products and 
datapoints...") 25 | logging.getLogger(__name__).info("Deleting all products and datapoints") 26 | 27 | all_products = db.get_all_products() 28 | all_datapoints = db.get_all_datapoints() 29 | 30 | db.delete_all(all_products) 31 | db.delete_all(all_datapoints) 32 | 33 | 34 | def delete_products_by_categories(categories: list[str]) -> None: 35 | products = db.get_products_by_categories(categories) 36 | log_product_codes_with_message(products, "Deleting products with categories") 37 | db.delete_all(products) 38 | 39 | 40 | def delete_products_by_names(names: list[str]) -> None: 41 | products = db.get_products_by_names(names) 42 | log_product_codes_with_message(products, "Deleting products by names") 43 | db.delete_all(products) 44 | 45 | 46 | def delete_products_by_product_codes(product_codes: list[str]) -> None: 47 | products = db.get_products_by_product_codes(product_codes) 48 | log_product_codes_with_message(products, "Deleting products with product codes") 49 | db.delete_all(products) 50 | 51 | 52 | def log_product_codes_with_message(products: list[db.Product], log_message: str) -> None: 53 | logger = logging.getLogger(__name__) 54 | product_codes = [product.product_code for product in products] 55 | 56 | if product_codes: 57 | product_codes_string = ", ".join(product_codes) 58 | print(f"Deleting product codes: {product_codes_string}") 59 | else: 60 | print("No product found to delete") 61 | 62 | logger.info(f"{log_message} - {product_codes=}") 63 | -------------------------------------------------------------------------------- /scraper/domains.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import json 5 | import logging 6 | from abc import ABC, abstractmethod 7 | 8 | from scraper.models import Info 9 | from scraper.constants import REQUEST_HEADER, REQUEST_COOKIES 10 | from scraper.filemanager import Config 11 | from scraper.exceptions import WebsiteVersionNotSupported 12 | 13 | 14 | def request_url(url: str) -> requests.Response: 15 | request_timeout = Config.get_request_timeout() 16 | 17 | try: 18 | response = requests.get(url, headers=REQUEST_HEADER, cookies=REQUEST_COOKIES, timeout=request_timeout) 19 | return response 20 | except requests.RequestException: 21 | logging.getLogger(__name__).exception(f"Module requests exception with url: {url}") 22 | 23 | 24 | class BaseWebsiteHandler(ABC): 25 | def __init__(self, url: str) -> None: 26 | self.url = url 27 | self.website_name = get_website_name(url) 28 | self.info: Info = None 29 | self.request_data = None 30 | 31 | def get_product_info(self) -> Info: 32 | try: 33 | self._request_product_data() 34 | self._get_common_data() 35 | raw_name = self._get_product_name() 36 | name = Config.get_user_product_name(raw_name) 37 | price = self._get_product_price() 38 | currency = self._get_product_currency() 39 | id = self._get_product_id() 40 | self.info = Info(name, price, currency, id) 41 | return self.info 42 | except (AttributeError, ValueError, TypeError): 43 | logging.getLogger(__name__).exception(f"Could not get all the data needed from url: {self.url}") 44 | return Info(None, None, None, None, valid=False) 45 | except WebsiteVersionNotSupported as ex: 46 | logging.getLogger(__name__).error(ex) 47 | return Info(None, None, None, None, valid=False) 48 | 49 | def _request_product_data(self) -> None: 50 | # option for each specific class to change how the request data is being handled 51 | response = request_url(self.url) 52 | 
self.request_data = BeautifulSoup(response.text, "html.parser") 53 | 54 | def _get_common_data(self) -> None: 55 | # if the same data needs to be accessed from more than one of the abstract methods, 56 | # then you can use this method to store the data as a instance variable, 57 | # so that the other methods can access the data 58 | pass 59 | 60 | @abstractmethod 61 | def _get_product_name(self) -> str: 62 | pass 63 | 64 | @abstractmethod 65 | def _get_product_price(self) -> float: 66 | pass 67 | 68 | @abstractmethod 69 | def _get_product_currency(self) -> str: 70 | pass 71 | 72 | @abstractmethod 73 | def _get_product_id(self) -> str: 74 | pass 75 | 76 | @abstractmethod 77 | def get_short_url(self) -> str: 78 | pass 79 | 80 | 81 | class KomplettHandler(BaseWebsiteHandler): 82 | def _get_product_name(self) -> str: 83 | return self.request_data.find("div", class_="product-main-info__info").h1.span.text 84 | 85 | def _get_product_price(self) -> float: 86 | return float(self.request_data.find("span", class_="product-price-now").text.strip(",-").replace(".", "")) 87 | 88 | def _get_product_currency(self) -> str: 89 | script_tag = self.request_data.find("script", type="application/ld+json").contents[0] 90 | currency = json.loads(script_tag).get("offers").get("priceCurrency") 91 | return currency 92 | 93 | def _get_product_id(self) -> str: 94 | return self.url.split("/")[4] 95 | 96 | def get_short_url(self) -> str: 97 | id = self._get_product_id() 98 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 99 | return f"{website}/product/{id}" 100 | 101 | 102 | class ProshopHandler(BaseWebsiteHandler): 103 | def _get_common_data(self) -> None: 104 | soup_script_tag = self.request_data.find("script", type="application/ld+json").contents[0] 105 | self.script_json = json.loads(soup_script_tag) 106 | 107 | def _get_product_name(self) -> str: 108 | return self.script_json["name"] 109 | 110 | def _get_product_price(self) -> float: 111 | try: 112 | # find normal price 113 | price = float( 114 | self.request_data.find("span", class_="site-currency-attention") 115 | .text.replace(".", "") 116 | .replace(",", ".") 117 | .strip(" kr") 118 | ) 119 | except AttributeError: 120 | try: 121 | # find discount price 122 | price = float( 123 | self.request_data.find("div", class_="site-currency-attention site-currency-campaign") 124 | .text.replace(".", "") 125 | .replace(",", ".") 126 | .strip(" kr") 127 | ) 128 | except AttributeError: 129 | # if campaign is sold out (udsolgt) 130 | price = float( 131 | self.request_data.find("div", class_="site-currency-attention") 132 | .text.replace(".", "") 133 | .replace(",", ".") 134 | .strip(" kr") 135 | ) 136 | return price 137 | 138 | def _get_product_currency(self) -> str: 139 | currency = self.script_json.get("offers").get("priceCurrency") 140 | return currency 141 | 142 | def _get_product_id(self) -> str: 143 | return self.url.split("/")[-1] 144 | 145 | def get_short_url(self) -> str: 146 | id = self._get_product_id() 147 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 148 | return f"{website}/{id}" 149 | 150 | 151 | class ComputerSalgHandler(BaseWebsiteHandler): 152 | def _get_product_name(self) -> str: 153 | return self.request_data.find("meta", {"name": "title"})["content"] 154 | 155 | def _get_product_price(self) -> float: 156 | return float(self.request_data.find("span", itemprop="price").text.strip().replace(".", "").replace(",", ".")) 157 | 158 | def _get_product_currency(self) -> str: 159 | 
return self.request_data.find("span", itemprop="priceCurrency").get("content") 160 | 161 | def _get_product_id(self) -> str: 162 | return self.url.split("/")[4] 163 | 164 | def get_short_url(self) -> str: 165 | id = self._get_product_id() 166 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 167 | return f"{website}/i/{id}" 168 | 169 | 170 | class ElgigantenHandler(BaseWebsiteHandler): 171 | def _get_common_data(self) -> None: 172 | self.elgiganten_api_data = self._get_json_api_data() 173 | 174 | def _get_product_name(self) -> str: 175 | return self.request_data.find("h1", class_="product-title").text 176 | 177 | def _get_product_price(self) -> float: 178 | return float(self.elgiganten_api_data["data"]["product"]["currentPricing"]["price"]["value"]) 179 | 180 | def _get_product_currency(self) -> str: 181 | return self.elgiganten_api_data["data"]["product"]["currentPricing"]["price"]["currency"] 182 | 183 | def _get_product_id(self) -> str: 184 | return self.url.split("/")[-1] 185 | 186 | def _get_json_api_data(self) -> dict: 187 | id_number = self._get_product_id() 188 | 189 | # API link to get price and currency 190 | if "elgiganten.dk" in self.url: 191 | api_link = f"https://www.elgiganten.dk/cxorchestrator/dk/api?getProductWithDynamicDetails&appMode=b2c&user=anonymous&operationName=getProductWithDynamicDetails&variables=%7B%22articleNumber%22%3A%22{id_number}%22%2C%22withCustomerSpecificPrices%22%3Afalse%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22229bbb14ee6f93449967eb326f5bfb87619a37e7ee6c4555b94496313c139ee1%22%7D%7D" # noqa E501 192 | elif "elgiganten.se" in self.url: 193 | api_link = f"https://www.elgiganten.se/cxorchestrator/se/api?getProductWithDynamicDetails&appMode=b2c&user=anonymous&operationName=getProductWithDynamicDetails&variables=%7B%22articleNumber%22%3A%22{id_number}%22%2C%22withCustomerSpecificPrices%22%3Afalse%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22229bbb14ee6f93449967eb326f5bfb87619a37e7ee6c4555b94496313c139ee1%22%7D%7D" # noqa E501 194 | else: 195 | raise WebsiteVersionNotSupported(get_website_name(self.url, keep_tld=True)) 196 | 197 | response = request_url(api_link) 198 | return response.json() 199 | 200 | def get_short_url(self) -> str: 201 | id = self._get_product_id() 202 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 203 | return f"{website}/product/{id}" 204 | 205 | 206 | class AvXpertenHandler(BaseWebsiteHandler): 207 | def _get_common_data(self) -> None: 208 | soup_script_tag = self.request_data.find("script", type="application/ld+json").contents[0] 209 | self.script_json = json.loads(soup_script_tag) 210 | 211 | def _get_product_name(self) -> str: 212 | return self.request_data.find("div", class_="content-head").h1.text.strip() 213 | 214 | def _get_product_price(self) -> float: 215 | return float(self.request_data.find("div", class_="price").text.replace("\xa0DKK", "").replace(" DKK", "")) 216 | 217 | def _get_product_currency(self) -> str: 218 | return self.script_json.get("offers").get("priceCurrency") 219 | 220 | def _get_product_id(self) -> str: 221 | return self.script_json.get("sku") 222 | 223 | def get_short_url(self) -> str: 224 | return self.url 225 | 226 | 227 | class AvCablesHandler(BaseWebsiteHandler): 228 | def _get_product_name(self) -> str: 229 | return self.request_data.find("h1", class_="title").text 230 | 231 | def _get_product_price(self) -> float: 232 | return float( 233 | 
self.request_data.find("div", class_="regular-price") 234 | .text.strip() 235 | .replace("Pris: ", "") 236 | .replace("Tilbudspris: ", "") 237 | .split(",")[0] 238 | ) 239 | 240 | def _get_product_currency(self) -> str: 241 | return self.request_data.find("meta", property="og:price:currency").get("content") 242 | 243 | def _get_product_id(self) -> str: 244 | script_tag = self.request_data.find("script", type="application/ld+json").contents[0] 245 | id = json.loads(script_tag).get("sku") 246 | return str(id) 247 | 248 | def get_short_url(self) -> str: 249 | return self.url 250 | 251 | 252 | class AmazonHandler(BaseWebsiteHandler): 253 | def _get_product_name(self) -> str: 254 | return self.request_data.find("span", id="productTitle").text.strip() 255 | 256 | def _get_product_price(self) -> float: 257 | raw_price = self.request_data.find("span", class_="a-price").span.text.replace(",", "").replace(" ", "") 258 | return float(get_number_string(raw_price)) 259 | 260 | def _get_product_currency(self) -> str: 261 | regex_pattern = "%22currencyCode%22%3A%22(.{3})%22" 262 | 263 | regex_result = re.search(regex_pattern, str(self.request_data)) 264 | 265 | if regex_result: 266 | return regex_result.group(1) 267 | return "N/F" 268 | 269 | def _get_product_id(self) -> str: 270 | try: 271 | return self.request_data.find("input", id="ASIN").get("value") 272 | except (AttributeError, ValueError, TypeError): 273 | asin_json = json.loads(self.request_data.find("span", id="cr-state-object").get("data-state")) 274 | return asin_json["asin"] 275 | 276 | def get_short_url(self) -> str: 277 | return self.url 278 | 279 | 280 | class EbayHandler(BaseWebsiteHandler): 281 | def _get_common_data(self) -> None: 282 | self.soup_url = self.request_data.find("meta", property="og:url").get("content") 283 | 284 | def _get_product_name(self) -> str: 285 | try: 286 | return self.request_data.find("h1", class_="x-item-title__mainTitle").text.strip() 287 | except (AttributeError, ValueError, TypeError): 288 | return self.request_data.find("meta", property="og:title").get("content").replace(" | eBay", "") 289 | 290 | def _get_product_price(self) -> float: 291 | if self.soup_url.split("/")[3] == "itm": 292 | price = float(self.request_data.find("div", class_="x-price-primary").text.replace("US $", "")) 293 | else: 294 | price = float( 295 | self.request_data.find("div", class_="x-price-primary") 296 | .text.replace("DKK ", "") 297 | .replace("$", "") 298 | .replace(",", "") 299 | ) 300 | 301 | return price 302 | 303 | def _get_product_currency(self) -> str: 304 | if self.soup_url.split("/")[3] == "itm": 305 | currency = self.request_data.find("span", itemprop="priceCurrency").get("content") 306 | else: 307 | script_tag = self.request_data.find("script", type="application/ld+json").contents[0] 308 | currency = ( 309 | json.loads(script_tag) 310 | .get("mainEntity") 311 | .get("offers") 312 | .get("itemOffered")[0] 313 | .get("offers")[0] 314 | .get("priceCurrency") 315 | ) 316 | 317 | return currency 318 | 319 | def _get_product_id(self) -> str: 320 | return self.url.split("/")[4].split("?")[0] 321 | 322 | def get_short_url(self) -> str: 323 | id = self._get_product_id() 324 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 325 | 326 | if self.url.split("/")[3] == "itm": 327 | return f"{website}/itm/{id}" 328 | else: 329 | return f"{website}/p/{id}" 330 | 331 | 332 | class PowerHandler(BaseWebsiteHandler): 333 | def _get_common_data(self) -> None: 334 | id = self._get_product_id() 335 | 
self.api_json = request_url(f"https://www.power.dk/api/v2/products?ids={id}").json() 336 | 337 | def _get_product_name(self) -> str: 338 | return self.api_json[0].get("title") 339 | 340 | def _get_product_price(self) -> float: 341 | return float(self.api_json[0].get("price")) 342 | 343 | def _get_product_currency(self) -> str: 344 | return "DKK" 345 | 346 | def _get_product_id(self) -> str: 347 | return self.url.split("/")[-2].strip("p-") 348 | 349 | def get_short_url(self) -> str: 350 | id = self._get_product_id() 351 | url_id = self.url.split("/")[3] 352 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 353 | return f"{website}/{url_id}/p-{id}" 354 | 355 | 356 | class ExpertHandler(BaseWebsiteHandler): 357 | def _get_common_data(self) -> None: 358 | id = self._get_product_id() 359 | self.api_json = request_url(f"https://www.expert.dk/api/v2/products?ids={id}").json() 360 | 361 | def _get_product_name(self) -> str: 362 | return self.api_json[0].get("title") 363 | 364 | def _get_product_price(self) -> float: 365 | return float(self.api_json[0].get("price")) 366 | 367 | def _get_product_currency(self) -> str: 368 | return "DKK" 369 | 370 | def _get_product_id(self) -> str: 371 | return self.url.split("/")[-2].strip("p-") 372 | 373 | def get_short_url(self) -> str: 374 | id = self._get_product_id() 375 | url_id = self.url.split("/")[3] 376 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 377 | return f"{website}/{url_id}/p-{id}" 378 | 379 | 380 | class MMVisionHandler(BaseWebsiteHandler): 381 | def _get_common_data(self) -> None: 382 | soup_script_tag = self.request_data.find_all("script", type="application/ld+json")[1].contents[0] 383 | self.script_json = json.loads(soup_script_tag) 384 | 385 | def _get_product_name(self) -> str: 386 | return self.request_data.find("h1", itemprop="name").text.strip() 387 | 388 | def _get_product_price(self) -> float: 389 | return float( 390 | self.request_data.find("h3", class_="product-price text-right") 391 | .text.strip("fra ") 392 | .strip() 393 | .strip(",-") 394 | .replace(".", "") 395 | ) 396 | 397 | def _get_product_currency(self) -> str: 398 | return self.script_json.get("offers").get("priceCurrency") 399 | 400 | def _get_product_id(self) -> str: 401 | return self.script_json.get("productID") 402 | 403 | def get_short_url(self) -> str: 404 | return self.url 405 | 406 | 407 | class CoolshopHandler(BaseWebsiteHandler): 408 | def _get_product_name(self) -> str: 409 | return self.request_data.find("div", class_="thing-header").h1.text.strip().replace("\n", " ") 410 | 411 | def _get_product_price(self) -> float: 412 | return float(self.request_data.find("meta", property="product:price:amount")["content"].split(".")[0]) 413 | 414 | def _get_product_currency(self) -> str: 415 | return self.request_data.find("meta", property="product:price:currency").get("content") 416 | 417 | def _get_product_id(self) -> str: 418 | return self.request_data.find_all("div", id="attributeSku")[1].text.strip() 419 | 420 | def get_short_url(self) -> str: 421 | url_id = self.url.split("/")[-2] 422 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 423 | return f"{website}/produkt/{url_id}/" 424 | 425 | 426 | class SharkGamingHandler(BaseWebsiteHandler): 427 | def _get_product_name(self) -> str: 428 | return self.request_data.find("h1", class_="page-title").span.text 429 | 430 | def _get_product_price(self) -> float: 431 | return float(self.request_data.find("meta", 
property="product:price:amount").get("content")) 432 | 433 | def _get_product_currency(self) -> str: 434 | return self.request_data.find("meta", property="product:price:currency").get("content") 435 | 436 | def _get_product_id(self) -> str: 437 | return json.loads(self.request_data.find_all("script", type="application/ld+json")[3].text).get("productID") 438 | 439 | def get_short_url(self) -> str: 440 | return self.url 441 | 442 | 443 | class NeweggHandler(BaseWebsiteHandler): 444 | def _get_common_data(self) -> None: 445 | script_data_raw = self.request_data.find_all("script", type="application/ld+json")[2].text 446 | self.script_json = json.loads(script_data_raw) 447 | 448 | def _get_product_name(self) -> str: 449 | return self.script_json.get("name") 450 | 451 | def _get_product_price(self) -> float: 452 | return float(self.script_json.get("offers").get("price")) 453 | 454 | def _get_product_currency(self) -> str: 455 | return self.script_json.get("offers").get("priceCurrency") 456 | 457 | def _get_product_id(self) -> str: 458 | return self.url.split("/")[5].split("?")[0] 459 | 460 | def get_short_url(self) -> str: 461 | id = self._get_product_id() 462 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 463 | return f"{website}/p/{id}" 464 | 465 | 466 | class HifiKlubbenHandler(BaseWebsiteHandler): 467 | def _get_common_data(self) -> None: 468 | script_data_raw = self.request_data.findAll("script", type="application/ld+json")[1].text 469 | self.product_data = json.loads(script_data_raw)["offers"] 470 | 471 | def _get_product_name(self) -> str: 472 | brand_name = self.request_data.find("span", class_="product-page__brand-name").text 473 | model_name = self.request_data.find("span", class_="product-page__model-name").text 474 | return f"{brand_name} {model_name}" 475 | 476 | def _get_product_price(self) -> float: 477 | return float(self.product_data.get("price")) 478 | 479 | def _get_product_currency(self) -> str: 480 | return self.product_data.get("priceCurrency") 481 | 482 | def _get_product_id(self) -> str: 483 | return self.url.split("/")[4] 484 | 485 | def get_short_url(self) -> str: 486 | id = self._get_product_id() 487 | website = get_website_name(self.url, keep_tld=True, keep_http=True, keep_www=True) 488 | return f"{website}/{id}" 489 | 490 | 491 | class SheinHandler(BaseWebsiteHandler): 492 | def _get_common_data(self) -> None: 493 | script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text 494 | self.script_json = json.loads(script_data_raw) 495 | 496 | def _get_product_name(self) -> str: 497 | return self.script_json.get("name") 498 | 499 | def _get_product_price(self) -> float: 500 | return float(self.script_json.get("offers").get("price")) 501 | 502 | def _get_product_currency(self) -> str: 503 | return self.script_json.get("offers").get("priceCurrency") 504 | 505 | def _get_product_id(self) -> str: 506 | return self.script_json.get("sku") 507 | 508 | def get_short_url(self) -> str: 509 | return self.url 510 | 511 | 512 | def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str: 513 | stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://") 514 | 515 | if not keep_www and keep_http: 516 | stripped_url = stripped_url.replace("www.", "", 1) 517 | elif not keep_www: 518 | stripped_url = stripped_url.removeprefix("www.") 519 | 520 | domain = "/".join(stripped_url.split("/")[0:3]) if keep_http else stripped_url.split("/")[0] 521 | 
522 | # Remove the TLD/DNS name (such as ".com") if keep_tld is false 523 | website_name_list = domain.split(".") if keep_tld else domain.split(".")[:-1] 524 | 525 | # Remove subdomain if keep_subdomain is false 526 | if not keep_subdomain and len(website_name_list) > 1: 527 | subdomain_and_domain = get_website_name(domain, keep_subdomain=True) 528 | subdomains = subdomain_and_domain.split(".")[:-1] 529 | 530 | website_name_list_copy = website_name_list.copy() 531 | # remove subdomains 532 | website_name_list = [elem for elem in website_name_list_copy if elem not in subdomains] 533 | 534 | website_name = ".".join(website_name_list) 535 | return website_name 536 | 537 | 538 | def get_website_handler(url: str) -> BaseWebsiteHandler: 539 | website_name = get_website_name(url, keep_subdomain=False).lower() 540 | 541 | website_handler = SUPPORTED_DOMAINS.get(website_name, None) 542 | 543 | if not website_handler: 544 | logging.getLogger(__name__).error(f"Can't find a website handler - website: '{website_name}' possibly not supported") 545 | return None 546 | 547 | return website_handler(url) 548 | 549 | 550 | def get_number_string(value: str) -> str: 551 | """Return string with only digits, commas (,) and periods (.)""" 552 | text_pattern = re.compile(r"[^\d.,]+") 553 | result = text_pattern.sub("", value) 554 | return result 555 | 556 | 557 | SUPPORTED_DOMAINS: dict[str, BaseWebsiteHandler] = { 558 | "komplett": KomplettHandler, 559 | "proshop": ProshopHandler, 560 | "computersalg": ComputerSalgHandler, 561 | "elgiganten": ElgigantenHandler, 562 | "avxperten": AvXpertenHandler, 563 | "av-cables": AvCablesHandler, 564 | "amazon": AmazonHandler, 565 | "ebay": EbayHandler, 566 | "power": PowerHandler, 567 | "expert": ExpertHandler, 568 | "mm-vision": MMVisionHandler, 569 | "coolshop": CoolshopHandler, 570 | "sharkgaming": SharkGamingHandler, 571 | "newegg": NeweggHandler, 572 | "hifiklubben": HifiKlubbenHandler, 573 | "shein": SheinHandler, 574 | } 575 | -------------------------------------------------------------------------------- /scraper/exceptions.py: -------------------------------------------------------------------------------- 1 | from scraper.constants import URL_SCHEMES 2 | 3 | 4 | class WebsiteNotSupported(Exception): 5 | def __init__(self, website_name: str, *args: object) -> None: 6 | super().__init__(*args) 7 | self.website_name = website_name 8 | 9 | def __str__(self) -> str: 10 | return f"Website '{self.website_name}' is currently not supported" 11 | 12 | 13 | class WebsiteVersionNotSupported(Exception): 14 | def __init__(self, website_name: str, *args: object) -> None: 15 | super().__init__(*args) 16 | self.website_name = website_name 17 | 18 | def __str__(self) -> str: 19 | return f"Website version '{self.website_name}' is currently not supported" 20 | 21 | 22 | class URLMissingSchema(Exception): 23 | def __init__(self, url, *args: object) -> None: 24 | super().__init__(*args) 25 | self.url = url 26 | 27 | def __str__(self) -> str: 28 | return f"Missing schema in url '{self.url}'. 
Consider prefixing the url with one of following schemes: {', '.join(URL_SCHEMES)}" 29 | -------------------------------------------------------------------------------- /scraper/filemanager.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator 2 | import pathlib 3 | import configparser 4 | 5 | 6 | class Filemanager: 7 | # root path of this repository 8 | root_path = pathlib.Path(__file__).parent.parent.absolute() 9 | products_json_path = f"{root_path}/scraper/records.json" 10 | products_csv_path = f"{root_path}/scraper/products.csv" 11 | settings_ini_path = f"{root_path}/scraper/settings.ini" 12 | logging_ini_path = f"{root_path}/scraper/logging.ini" 13 | logfile_path = f"{root_path}/scraper/logfile.log" 14 | database_folder = f"{root_path}/scraper/data" 15 | database_path = f"{database_folder}/database.db" 16 | 17 | 18 | class Config: 19 | @staticmethod 20 | def read(filename: str) -> configparser.ConfigParser: 21 | config = configparser.ConfigParser() 22 | config.read(filename, encoding="utf8") 23 | return config 24 | 25 | @staticmethod 26 | def write(filename: str, config: configparser.ConfigParser) -> None: 27 | with open(filename, "w") as default_file: 28 | config.write(default_file) 29 | 30 | @staticmethod 31 | def get_section_by_name(section_name: str) -> configparser.SectionProxy: 32 | """Get a section from settings.ini file""" 33 | config = Config.read(Filemanager.settings_ini_path) 34 | return config[section_name] 35 | 36 | @staticmethod 37 | def get_key_values(elements: list) -> Iterator[str]: 38 | for elem in elements: 39 | if "key" in elem: 40 | yield elem 41 | 42 | @staticmethod 43 | def get_request_delay() -> int: 44 | config = Config.read(Filemanager.settings_ini_path) 45 | return int(config["Scraping"]["request_delay"]) 46 | 47 | @staticmethod 48 | def get_request_timeout() -> float | None: 49 | """Get request timeout - if number return float else return None""" 50 | config = Config.read(Filemanager.settings_ini_path) 51 | timeout = config["Scraping"]["request_timeout"] 52 | try: 53 | return float(timeout) 54 | except ValueError: 55 | return None 56 | 57 | @staticmethod 58 | def get_user_product_name(product_name: str) -> str: 59 | product_name_lowercase = product_name.lower() 60 | user_product_names = Config.get_section_by_name("ChangeName") 61 | 62 | for key in Config.get_key_values(user_product_names): 63 | key_list = user_product_names[key].split(",") 64 | value_key = f'value{key.strip("key")}' 65 | if all(elem.lower() in product_name_lowercase for elem in key_list): 66 | return user_product_names[value_key] 67 | 68 | return product_name 69 | -------------------------------------------------------------------------------- /scraper/format.py: -------------------------------------------------------------------------------- 1 | import scraper.database as db 2 | from scraper.models.product import ProductInfo 3 | from scraper.scrape import Scraper 4 | from scraper.domains import get_website_name 5 | 6 | 7 | class Format: 8 | def db_products_to_scrapers(products: list[db.Product]) -> list[Scraper]: 9 | scrapers = [] 10 | for product in products: 11 | scraper = Format.db_product_to_scraper(product) 12 | scrapers.append(scraper) 13 | return scrapers 14 | 15 | @staticmethod 16 | def db_product_to_scraper(product: db.Product) -> Scraper: 17 | return Scraper(category=product.category, url=product.short_url) 18 | 19 | @staticmethod 20 | def scraper_to_db_product(product: Scraper, is_active: bool) -> db.Product: 21 | 
return db.Product( 22 | product_code=product.product_info.id, 23 | name=product.product_info.name, 24 | category=product.category, 25 | domain=product.website_handler.website_name, 26 | url=product.url, 27 | short_url=product.website_handler.get_short_url(), 28 | is_active=is_active, 29 | ) 30 | 31 | @staticmethod 32 | def db_products_to_product_infos(products: list[db.Product]) -> list[ProductInfo]: 33 | product_infos = [] 34 | for product in products: 35 | product_info = Format.db_product_to_product_info(product) 36 | product_infos.append(product_info) 37 | return product_infos 38 | 39 | @staticmethod 40 | def db_product_to_product_info(product: db.Product) -> ProductInfo: 41 | return ProductInfo( 42 | product_name=product.name, 43 | category=product.category, 44 | url=product.short_url, 45 | id=product.product_code, 46 | currency=None, 47 | website=get_website_name(product.short_url, keep_subdomain=False), 48 | datapoints=None, 49 | ) 50 | -------------------------------------------------------------------------------- /scraper/format_to_new.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Iterable 3 | from sqlmodel import Session, select 4 | from dataclasses import dataclass 5 | import pandas as pd 6 | import json 7 | 8 | from scraper.filemanager import Config, Filemanager 9 | from scraper.domains import get_website_handler 10 | from scraper.models.product import DataPointInfo, MasterProduct, ProductInfo 11 | from scraper.database.models import Product, DataPoint 12 | from scraper.database.db import engine, create_db_and_tables 13 | 14 | 15 | @dataclass 16 | class ProductCSV: 17 | url: str 18 | short_url: str 19 | category: str 20 | 21 | 22 | class FilemanagerLegacy: 23 | @staticmethod 24 | def read_json(filename: str) -> dict: 25 | with open(filename, "r", encoding="utf8") as file: 26 | data = json.load(file) 27 | return data 28 | 29 | @staticmethod 30 | def get_record_data() -> dict: 31 | data = FilemanagerLegacy.read_json(Filemanager.products_json_path) 32 | return data 33 | 34 | @staticmethod 35 | def save_record_data(data: dict) -> None: 36 | FilemanagerLegacy.write_json(Filemanager.products_json_path, data) 37 | 38 | @staticmethod 39 | def get_products_data() -> pd.DataFrame: 40 | df = pd.read_csv(Filemanager.products_csv_path, sep=",", header=0) 41 | return df 42 | 43 | @staticmethod 44 | def save_products_data(data_df: pd.DataFrame) -> None: 45 | data_df.to_csv(Filemanager.products_csv_path, sep=",", header=True, index=False) 46 | 47 | 48 | class Format: 49 | @staticmethod 50 | def format_old_records_to_new() -> None: 51 | """Format records data from pre v1.1 to new records data format in v1.1""" 52 | records_data = FilemanagerLegacy.get_record_data() 53 | 54 | for category_info in records_data.values(): 55 | for product_info in category_info.values(): 56 | for website_info in product_info.values(): 57 | website_info["info"].update({"currency": "TBD"}) 58 | website_info.update({"datapoints": []}) 59 | 60 | for date_name, date_info in website_info["dates"].items(): 61 | website_info["datapoints"].append({"date": date_name, "price": float(date_info["price"])}) 62 | 63 | website_info.pop("dates") 64 | 65 | FilemanagerLegacy.save_record_data(records_data) 66 | 67 | @staticmethod 68 | def add_short_urls_to_products_csv() -> None: 69 | """Format products.csv to have short_url column - introduced in v2.3.0""" 70 | request_delay = Config.get_request_delay() 71 | 72 | products_df = 
FilemanagerLegacy.get_products_data() 73 | 74 | short_urls = [] 75 | for _, row in products_df.iterrows(): 76 | time.sleep(request_delay) 77 | website_handler = get_website_handler(row["url"]) 78 | short_url = website_handler.get_short_url() 79 | 80 | # scrape only if short_url can't be created without 81 | if short_url is None: 82 | website_handler.get_product_info() 83 | short_url = website_handler.get_short_url() 84 | short_urls.append(short_url) 85 | 86 | products_df = products_df.drop("short_url", axis=1) 87 | products_df.insert(2, "short_url", short_urls, True) 88 | 89 | FilemanagerLegacy.save_products_data(products_df) 90 | 91 | @staticmethod 92 | def from_json_to_db() -> None: 93 | """Take the data in records.json and insert it in database - introduced in v3.0.0 94 | - NOTE all products in database will be deleted before inserting data from records.json""" 95 | 96 | create_db_and_tables() 97 | records = FilemanagerLegacy.get_record_data() 98 | products_df = FilemanagerLegacy.get_products_data() 99 | 100 | products_from_csv = [ 101 | ProductCSV(category=category, url=url, short_url=short_url) 102 | for category, url, short_url in zip(products_df["category"], products_df["url"], products_df["short_url"]) 103 | ] 104 | 105 | master_products = get_master_products(records) 106 | products_from_json = get_products_from_master_products(master_products) 107 | 108 | products_to_db: list[Product] = [] 109 | for product_json in products_from_json: 110 | product_to_db = Product( 111 | name=product_json.product_name, 112 | product_code=product_json.id, 113 | domain=product_json.website, 114 | url="", 115 | short_url=product_json.url, 116 | category=product_json.category, 117 | is_active=False, 118 | ) 119 | 120 | for product_csv in products_from_csv: 121 | if product_csv.short_url == product_json.url: 122 | product_to_db.url = product_csv.url 123 | product_to_db.is_active = True 124 | 125 | products_to_db.append(product_to_db) 126 | 127 | datapoints_to_db: list[DataPoint] = [] 128 | for product in products_from_json: 129 | for datapoint in product.datapoints: 130 | datapoint_to_db = DataPoint( 131 | product_code=product.id, date=datapoint.date, price=datapoint.price, currency=product.currency 132 | ) 133 | datapoints_to_db.append(datapoint_to_db) 134 | 135 | with Session(engine) as session: 136 | products_in_db = session.exec(select(Product)).all() 137 | for product_in_db in products_in_db: 138 | session.delete(product_in_db) 139 | 140 | datapoints_in_db = session.exec(select(DataPoint)).all() 141 | for datapoint_in_db in datapoints_in_db: 142 | session.delete(datapoint_in_db) 143 | 144 | session.add_all(products_to_db) 145 | session.add_all(datapoints_to_db) 146 | 147 | session.commit() 148 | 149 | with Session(engine) as session: 150 | products_in_db = session.exec(select(Product)).all() 151 | datapoints_in_db = session.exec(select(DataPoint)).all() 152 | print(f"Inserted products to db: {len(products_in_db)}") 153 | print(f"Inserted datapoints to db: {len(datapoints_in_db)}") 154 | 155 | 156 | def get_master_products(records_data: dict) -> tuple[MasterProduct]: 157 | master_products: list[MasterProduct] = [] 158 | 159 | for category_name, category_info in records_data.items(): 160 | for product_name, product_info in category_info.items(): 161 | master_product = MasterProduct(product_name, category_name) 162 | for website_name, website_info in product_info.items(): 163 | id = website_info["info"]["id"] 164 | url = website_info["info"]["url"] 165 | currency = website_info["info"]["currency"] 
166 | datapoints = [DataPointInfo(datapoint["date"], datapoint["price"]) for datapoint in website_info["datapoints"]] 167 | product = ProductInfo(product_name, category_name, url, id, currency, website_name, datapoints) 168 | master_product.products.append(product) 169 | master_products.append(master_product) 170 | 171 | return tuple(master_products) 172 | 173 | 174 | def get_products_from_master_products(master_products: Iterable[MasterProduct]) -> list[ProductInfo]: 175 | return [product for master_product in master_products for product in master_product.products] 176 | -------------------------------------------------------------------------------- /scraper/logfile.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Crinibus/scraper/3c37db625d4b47cdb547952e098d3a3cb494ab6f/scraper/logfile.log -------------------------------------------------------------------------------- /scraper/logging.ini: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,scraper,scraper.scrape 3 | 4 | [handlers] 5 | keys=fileHandler 6 | 7 | [formatters] 8 | keys=fileFormatter 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=fileHandler 13 | 14 | [logger_scraper] 15 | level=INFO 16 | handlers=fileHandler 17 | qualname=scraper 18 | propagate=0 19 | 20 | [logger_scraper.scrape] 21 | level=INFO 22 | handlers=fileHandler 23 | qualname=scraper.scrape 24 | propagate=0 25 | 26 | [handler_fileHandler] 27 | class=FileHandler 28 | level=DEBUG 29 | formatter=fileFormatter 30 | args=(r"%(logfilename)s", "a", "utf8") 31 | 32 | [formatter_fileFormatter] 33 | format=%(asctime)s : %(levelname)s : %(name)s : %(message)s 34 | -------------------------------------------------------------------------------- /scraper/models/__init__.py: -------------------------------------------------------------------------------- 1 | from scraper.models.product import DataPointInfo, ProductInfo, Info, MasterProduct 2 | -------------------------------------------------------------------------------- /scraper/models/product.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from datetime import datetime 3 | import re 4 | 5 | 6 | @dataclass 7 | class Info: 8 | """Scraped info about product""" 9 | 10 | name: str 11 | price: float 12 | currency: str 13 | id: str 14 | valid: bool = True 15 | 16 | 17 | @dataclass 18 | class DataPointInfo: 19 | date: str 20 | price: float 21 | 22 | 23 | @dataclass 24 | class ProductInfo: 25 | product_name: str 26 | category: str 27 | url: str 28 | id: str 29 | currency: str 30 | website: str 31 | datapoints: list[DataPointInfo] 32 | 33 | def get_all_dates(self) -> list[str]: 34 | return [datapoint.date for datapoint in self.datapoints] 35 | 36 | def get_all_prices(self) -> list[float]: 37 | return [datapoint.price for datapoint in self.datapoints] 38 | 39 | @property 40 | def is_up_to_date(self) -> bool: 41 | if not self.datapoints: 42 | return False 43 | 44 | latest_date = datetime.strptime(self.datapoints[-1].date, "%Y-%m-%d") 45 | date_diff = datetime.today() - latest_date 46 | return date_diff.days <= 1 47 | 48 | def to_string_format(self, format: str) -> str: 49 | """Return a string representing the product, controlled by an explicit format string. 
50 | 51 | >>> p = ProductInfo("ASUS RTX 4090", "GPU", "https://www.example.com/", "123", "USD", "example", [datapoints]) 52 | >>> p.to_string_format("Name: %name, Category: %category, URL: %url, ID: %id, Website: %website") 53 | 'Name: ASUS RTX 4090, Category: GPU, URL: https://www.example.com/, ID: 123, Website: example' 54 | """ 55 | # inspiration from https://docs.python.org/3/library/re.html#writing-a-tokenizer 56 | token_specification = [ 57 | ("NAME", r"(%name)"), 58 | ("CATEGORY", r"(%category)"), 59 | ("URL", r"(%url)"), 60 | ("ID", r"(%id)"), 61 | ("CURRENCY", r"(%currency)"), 62 | ("WEBSITE", r"(%website)"), 63 | ] 64 | format_to = { 65 | "NAME": self.product_name, 66 | "CATEGORY": self.category, 67 | "URL": self.url, 68 | "ID": self.id, 69 | "CURRENCY": self.currency, 70 | "WEBSITE": self.website, 71 | } 72 | 73 | tok_regex = "|".join("(?P<%s>%s)" % pair for pair in token_specification) 74 | new_string = format 75 | 76 | for mo in re.finditer(tok_regex, format): 77 | kind = mo.lastgroup 78 | value = mo.group() 79 | 80 | new_string = new_string.replace(value, format_to[kind], 1) 81 | 82 | return new_string 83 | 84 | 85 | @dataclass 86 | class MasterProduct: 87 | product_name: str 88 | category: str 89 | products: list[ProductInfo] = field(default_factory=list) 90 | -------------------------------------------------------------------------------- /scraper/print_products.py: -------------------------------------------------------------------------------- 1 | from scraper.constants import CHECK_MARK 2 | import scraper.database as db 3 | from scraper.database.models import Product 4 | from scraper.models.product import ProductInfo 5 | 6 | 7 | def print_latest_datapoints(names: list[str], product_codes: list[str], categories: list[str]) -> None: 8 | if names: 9 | print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----") 10 | products = db.get_products_by_names(names) 11 | print_latest_datapoints_for_products(products) 12 | 13 | if product_codes: 14 | print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----") 15 | products = db.get_products_by_product_codes(product_codes) 16 | print_latest_datapoints_for_products(products) 17 | 18 | if categories: 19 | print("\n----- SHOWING LATEST DATAPOINT FOR CATEGORY(s) -----") 20 | products = db.get_products_by_categories(categories) 21 | print_latest_datapoints_for_products(products) 22 | 23 | 24 | def print_latest_datapoints_for_products(products: list[db.Product]): 25 | if not products: 26 | print("Found no products") 27 | return 28 | 29 | grouped_products = db.group_products_by_names(products) 30 | 31 | for products in grouped_products: 32 | product_infos = db.get_product_infos_from_products(products) 33 | print(product_infos[0].product_name) 34 | 35 | for product_info in product_infos: 36 | print_latest_datapoint(product_info) 37 | print() 38 | 39 | 40 | def print_latest_datapoint(product_info: ProductInfo) -> None: 41 | if not product_info.datapoints: 42 | print(f"> No datapoints for {product_info.id}") 43 | return 44 | 45 | id = product_info.id 46 | website_name = product_info.website.capitalize() 47 | currency = product_info.currency 48 | latest_datapoint = product_info.datapoints[-1] 49 | date = latest_datapoint.date 50 | price = latest_datapoint.price 51 | print(f"> {website_name} - {id}\n - {currency} {price}\n - {date}") 52 | 53 | 54 | def print_all_products() -> None: 55 | print("\n----- SHOWING ALL PRODUCTS -----") 56 | categories = db.get_all_unique_categories() 57 | 58 | if not categories: 59 | print("No products") 60 | return 61
| 62 | for category in categories: 63 | print(category) 64 | 65 | products = db.get_products_by_categories([category]) 66 | 67 | grouped_products = db.group_products_by_names(products) 68 | 69 | list_grouped_products(grouped_products) 70 | 71 | 72 | def list_products_with_filters(names: list[str] | None, product_codes: list[str] | None, categories: list[str] | None) -> None: 73 | print("\n----- LISTING PRODUCTS -----") 74 | products_by_filters: list[Product] = [] 75 | 76 | if names: 77 | products_with_names = db.get_products_by_names(names) 78 | products_by_filters.extend(products_with_names) 79 | 80 | if product_codes: 81 | products_with_product_codes = db.get_products_by_product_codes(product_codes) 82 | products_by_filters.extend(products_with_product_codes) 83 | 84 | if categories: 85 | products_with_categories = db.get_products_by_categories(categories) 86 | products_by_filters.extend(products_with_categories) 87 | 88 | if not products_by_filters: 89 | print("Found no products with filters") 90 | return 91 | 92 | categories = set([product.category for product in products_by_filters]) 93 | sorted_categories = sorted(categories) 94 | 95 | for category in sorted_categories: 96 | print(category) 97 | 98 | products_with_category = [product for product in products_by_filters if product.category == category] 99 | 100 | grouped_products = db.group_products_by_names(products_with_category) 101 | 102 | list_grouped_products(grouped_products) 103 | 104 | 105 | def list_grouped_products(grouped_products: list[list[Product]]) -> None: 106 | for products in grouped_products: 107 | print(f" > {products[0].name}") 108 | for product in products: 109 | is_active_marker = f"{CHECK_MARK} " if product.is_active else "" 110 | print(f" - {is_active_marker}{product.domain.upper()} - {product.product_code}") 111 | print() 112 | -------------------------------------------------------------------------------- /scraper/products.csv: -------------------------------------------------------------------------------- 1 | category,url,short_url 2 | -------------------------------------------------------------------------------- /scraper/records.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /scraper/reset_data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import scraper.database as db 4 | 5 | 6 | def reset(categories: list[str], names: list[str], product_codes: list[str], all: bool) -> None: 7 | print("Resetting datapoints...") 8 | logging.getLogger(__name__).info(f"Resetting datapoints for {categories=}, {names=}, {product_codes=}, {all=}") 9 | 10 | if all: 11 | delete_all_datapoints() 12 | return 13 | 14 | if categories: 15 | delete_datapoints_for_products_by_categories(categories) 16 | 17 | if names: 18 | delete_datapoints_for_products_by_names(names) 19 | 20 | if product_codes: 21 | delete_datapoints_for_products_by_product_codes(product_codes) 22 | 23 | 24 | def delete_all_datapoints(): 25 | datapoints = db.get_all_datapoints() 26 | db.delete_all(datapoints) 27 | 28 | 29 | def delete_datapoints_for_products_by_categories(categories: list[str]): 30 | datapoints = db.get_datapoints_by_categories(categories) 31 | db.delete_all(datapoints) 32 | 33 | 34 | def delete_datapoints_for_products_by_names(names: list[str]): 35 | datapoints = db.get_datapoints_by_names(names) 36 | db.delete_all(datapoints) 37 | 38 | 39 | def 
delete_datapoints_for_products_by_product_codes(product_codes: list[str]): 40 | datapoints = db.get_datapoints_by_product_codes(product_codes) 41 | db.delete_all(datapoints) 42 | -------------------------------------------------------------------------------- /scraper/scrape.py: -------------------------------------------------------------------------------- 1 | import time 2 | import threading 3 | import logging 4 | 5 | from scraper.models import Info 6 | from scraper.domains import get_website_handler 7 | 8 | 9 | class Scraper: 10 | def __init__(self, category: str, url: str) -> None: 11 | self.category = category 12 | self.url = url 13 | self.website_handler = get_website_handler(url) 14 | self.product_info: Info = None 15 | 16 | def scrape_info(self) -> Info: 17 | logging.getLogger(__name__).debug(f"Scraping: {self.category} - {self.url}") 18 | self.product_info = self.website_handler.get_product_info() 19 | return self.product_info 20 | 21 | 22 | def start_threads_sequentially(threads: list[threading.Thread], request_delay: int, progress_bar=None) -> None: 23 | for thread in threads: 24 | thread.start() 25 | thread.join() 26 | time.sleep(request_delay) 27 | 28 | if progress_bar: 29 | progress_bar() 30 | -------------------------------------------------------------------------------- /scraper/search_data.py: -------------------------------------------------------------------------------- 1 | import scraper.database as db 2 | 3 | 4 | def search(search_terms: list[str]) -> None: 5 | print("Searching...") 6 | 7 | product_name_search_results = search_product_names(search_terms) 8 | categories_search_results = search_categories(search_terms) 9 | 10 | if product_name_search_results: 11 | print("\n--- Results from product name search ---") 12 | for result in product_name_search_results: 13 | print(f"> {result}\n") 14 | else: 15 | print("\nNo results for product name search") 16 | 17 | if categories_search_results: 18 | print("\n--- Results from category search ---") 19 | for result in categories_search_results: 20 | print(f"> {result}") 21 | else: 22 | print("\nNo results for categories search") 23 | 24 | 25 | def search_product_names(search_terms: list[str]) -> list[str]: 26 | products_strings = [] 27 | products = db.get_products_by_names_fuzzy(search_terms) 28 | 29 | if not products: 30 | return [] 31 | 32 | grouped_products = db.group_products_by_names(products) 33 | 34 | for products in grouped_products: 35 | matched_domains = [] 36 | for product in products: 37 | match_string = f" - {product.domain.capitalize()} - {product.product_code}" 38 | matched_domains.append(match_string) 39 | matched_domains_string = "\n".join(matched_domains) 40 | products_strings.append(f"{products[0].name}\n{matched_domains_string}") 41 | 42 | return products_strings 43 | 44 | 45 | def search_categories(search_terms: list[str]) -> list[str]: 46 | all_results = [] 47 | all_categories = db.get_all_unique_categories() 48 | 49 | for search_term in search_terms: 50 | results = [category for category in all_categories if search_term.lower() in category.lower()] 51 | all_results.extend(results) 52 | 53 | return all_results 54 | -------------------------------------------------------------------------------- /scraper/settings.ini: -------------------------------------------------------------------------------- 1 | [ChangeName] 2 | ; Add your own keywords separated with a comma (,) and what the product name should be renamed to (the matching value entry) if the product name has **all** the keywords.
See example below: 3 | key1 = asus,3080,rog,strix,oc 4 | value1 = asus geforce rtx 3080 rog strix oc 5 | 6 | [Scraping] 7 | ; request_delay in seconds 8 | request_delay = 0 9 | ; request_timeout in seconds or None for indefinitely 10 | request_timeout = 25 11 | -------------------------------------------------------------------------------- /scraper/visualize.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Iterator 2 | import plotly.graph_objs as go 3 | from datetime import datetime 4 | 5 | import scraper.database as db 6 | from scraper.models import DataPointInfo, ProductInfo, MasterProduct 7 | from scraper.constants import WEBSITE_COLORS 8 | 9 | 10 | def visualize_data( 11 | show_all: bool, categories: list[str], ids: list[str], names: list[str], only_up_to_date: bool, compare: bool 12 | ) -> None: 13 | print("Visualizing...") 14 | 15 | # Convert all string to lowercase 16 | categories = [category.lower() for category in categories] 17 | ids = [id.lower() for id in ids] 18 | names = [name.lower() for name in names] 19 | 20 | master_products = get_master_products() 21 | 22 | if not master_products: 23 | print("No products saved") 24 | return 25 | 26 | if compare: 27 | compare_products(master_products, ids, names, categories, only_up_to_date, show_all) 28 | return 29 | 30 | if show_all: 31 | show_master_products(master_products, only_up_to_date) 32 | 33 | if categories: 34 | for master_product in get_master_products_with_categories(master_products, categories, only_up_to_date): 35 | product_name = master_product.product_name 36 | category = master_product.category 37 | status_of_master_product = get_status_of_master_product(master_product) 38 | title = f"Price(s) of {product_name} - {category} - {status_of_master_product}" 39 | show_products(master_product.products, title) 40 | else: 41 | print("No products found with category/categories") 42 | 43 | if ids: 44 | for product in get_products_with_ids(master_products, ids, only_up_to_date): 45 | status_of_product = get_status_of_product(product) 46 | product_name = product.product_name 47 | title = f"Price(s) of {product_name} - {status_of_product}" 48 | show_product(product, title) 49 | else: 50 | print("No products found with id(s)") 51 | 52 | if names: 53 | for master_product in get_master_products_with_names(master_products, names, only_up_to_date): 54 | product_name = master_product.product_name 55 | status_of_master_product = get_status_of_master_product(master_product) 56 | title = f"Price(s) of {product_name} - {status_of_master_product}" 57 | show_products(master_product.products, title) 58 | else: 59 | print("No products found with name(s)") 60 | 61 | 62 | def compare_products( 63 | master_products: tuple[MasterProduct], 64 | ids: list[str], 65 | names: list[str], 66 | categories: list[str], 67 | only_up_to_date: bool, 68 | show_all: bool, 69 | ) -> None: 70 | master_products_with_names = get_master_products_with_names(master_products, names, only_up_to_date) 71 | products_with_names = get_products_from_master_products(master_products_with_names) 72 | 73 | products_with_ids = list(get_products_with_ids(master_products, ids, only_up_to_date)) 74 | 75 | master_products_with_categories = get_master_products_with_categories(master_products, categories, only_up_to_date) 76 | products_with_categories = get_products_from_master_products(master_products_with_categories) 77 | 78 | products_to_compare = [*products_with_ids, *products_with_names, *products_with_categories] 
79 | 80 | if show_all: 81 | products_to_compare = get_products_from_master_products(master_products) 82 | 83 | if not products_to_compare: 84 | print("No products found to compare") 85 | return 86 | 87 | product_ids = [product.id for product in products_to_compare] 88 | product_ids_string = ", ".join(product_ids) 89 | title_ = product_ids_string[:100] + " ..." if len(product_ids_string) > 100 else product_ids_string 90 | show_products(products_to_compare, f"Comparing products with ids: {title_}") 91 | 92 | 93 | def show_master_products(master_products: tuple[MasterProduct], only_up_to_date: bool) -> None: 94 | if not master_products: 95 | print("No products found") 96 | return 97 | 98 | for master_product in master_products: 99 | if only_up_to_date and not is_master_product_up_to_date(master_product): 100 | continue 101 | 102 | status_of_master_product = get_status_of_master_product(master_product) 103 | show_products(master_product.products, f"Price(s) of {master_product.product_name} - {status_of_master_product}") 104 | 105 | 106 | def show_product(product: ProductInfo, title: str) -> None: 107 | show_products([product], title) 108 | 109 | 110 | def show_products(products: list[ProductInfo], title: str) -> None: 111 | fig = go.Figure() 112 | for product in products: 113 | add_scatter_plot( 114 | fig, 115 | product, 116 | name_format="%website - %name - %id", 117 | ) 118 | 119 | num_products = len(products) 120 | 121 | config_figure(fig, title, num_products) 122 | fig.show(config={"scrollZoom": True}) 123 | 124 | 125 | def get_master_products() -> tuple[MasterProduct]: 126 | master_products: list[MasterProduct] = [] 127 | 128 | all_products = db.get_all_products_with_datapoints() 129 | 130 | unique_product_names = set([product.product_name for product in all_products]) 131 | 132 | for unique_product_name in unique_product_names: 133 | products_from_db = db.get_products_by_names([unique_product_name]) 134 | products = db.get_product_infos_from_products(products_from_db) 135 | 136 | category = products[0].category 137 | master_product = MasterProduct(unique_product_name, category, products) 138 | master_products.append(master_product) 139 | 140 | return tuple(master_products) 141 | 142 | 143 | def get_products_with_ids( 144 | master_products: tuple[MasterProduct], ids: list[str], only_up_to_date: bool 145 | ) -> Iterator[ProductInfo]: 146 | for master_product in master_products: 147 | for product in master_product.products: 148 | if only_up_to_date and not product.is_up_to_date: 149 | continue 150 | 151 | if product.id.lower() not in ids: 152 | continue 153 | 154 | yield product 155 | 156 | 157 | def get_master_products_with_categories( 158 | master_products: tuple[MasterProduct], categories: list[str], only_up_to_date: bool 159 | ) -> Iterator[MasterProduct]: 160 | for master_product in master_products: 161 | if master_product.category.lower() not in categories: 162 | continue 163 | 164 | if only_up_to_date and not is_master_product_up_to_date(master_product): 165 | continue 166 | 167 | yield master_product 168 | 169 | 170 | def get_master_products_with_names( 171 | master_products: tuple[MasterProduct], names: list[str], only_up_to_date: bool 172 | ) -> Iterator[MasterProduct]: 173 | for master_product in master_products: 174 | if master_product.product_name.lower() not in names: 175 | continue 176 | 177 | if only_up_to_date and not is_master_product_up_to_date(master_product): 178 | continue 179 | 180 | yield master_product 181 | 182 | 183 | def 
get_products_from_master_products(master_products: Iterable[MasterProduct]) -> list[ProductInfo]: 184 | return [product for master_product in master_products for product in master_product.products] 185 | 186 | 187 | def get_yvalue_for_configure_figure(num_products: int, min_value: int, max_value: int, max_num: int): 188 | value = ((num_products / max_num) * (max_value - min_value)) + min_value 189 | 190 | if value > max_value: 191 | value = max_value 192 | elif value < min_value: 193 | value = min_value 194 | 195 | return value 196 | 197 | 198 | def config_figure(figure: go.Figure, figure_title: str, num_products: int) -> None: 199 | figure.update_traces(mode="markers+lines", hovertemplate=None) 200 | 201 | y_value = get_yvalue_for_configure_figure(num_products, 0.1, 0.25, 30) 202 | 203 | figure.update_layout( 204 | title=dict(text=figure_title), 205 | xaxis_title="Date", 206 | yaxis_title="Price", 207 | hovermode="closest", 208 | separators=".,", 209 | legend=dict(orientation="h", y=-y_value, x=0, yref="paper", xref="paper", yanchor="top", xanchor="left"), 210 | hoverlabel_namelength=-1, 211 | ) 212 | 213 | 214 | def add_scatter_plot( 215 | figure: go.Figure, 216 | product: ProductInfo, 217 | color: str = None, 218 | hover_text: str = None, 219 | name_format: str = None, 220 | ) -> None: 221 | scatter_name = product.to_string_format(name_format) if name_format else f"{product.website.capitalize()} - {product.id}" 222 | scatter_color = color if color else WEBSITE_COLORS[product.website] 223 | scatter_hover_text = hover_text if hover_text else "Price: %{y:.0f}" + f" {product.currency}" 224 | 225 | figure.add_trace( 226 | go.Scatter( 227 | name=scatter_name, 228 | x=product.get_all_dates(), 229 | y=product.get_all_prices(), 230 | line={"color": scatter_color, "width": 2}, 231 | hovertemplate=scatter_hover_text, 232 | ) 233 | ) 234 | 235 | 236 | def is_datapoints_up_to_date(datapoints: list[DataPointInfo]) -> bool: 237 | """check if today and the last date in datapoints is at most 1 day apart""" 238 | if len(datapoints) == 0: 239 | return False 240 | 241 | return is_date_up_to_date(datapoints[-1].date) 242 | 243 | 244 | def is_date_up_to_date(date: str) -> bool: 245 | """check if today and date is at most 1 day apart""" 246 | latest_date = datetime.strptime(date, "%Y-%m-%d") 247 | date_diff = datetime.today() - latest_date 248 | 249 | return date_diff.days <= 1 250 | 251 | 252 | def is_master_product_up_to_date(master_product: MasterProduct) -> bool: 253 | return any((product.is_up_to_date for product in master_product.products)) 254 | 255 | 256 | def get_status_of_master_product(master_product: MasterProduct) -> str: 257 | if is_master_product_up_to_date(master_product): 258 | return get_status_of_product_by_bool(True) 259 | 260 | return get_status_of_product_by_bool(False) 261 | 262 | 263 | def get_status_of_product(product: ProductInfo) -> str: 264 | return get_status_of_product_by_bool(product.is_up_to_date) 265 | 266 | 267 | def get_status_of_product_by_bool(up_to_date: bool) -> str: 268 | return "UP TO DATE" if up_to_date else "OUTDATED" 269 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Crinibus/scraper/3c37db625d4b47cdb547952e098d3a3cb494ab6f/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_add_product.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from contextlib import nullcontext as does_not_raise 3 | 4 | from scraper.add_product import add_product 5 | from scraper.exceptions import WebsiteNotSupported 6 | from scraper.models import Info 7 | 8 | test_domains = [ 9 | ("https://www.amazon.com/", does_not_raise()), 10 | ("https://www.ebay.com/itm/", does_not_raise()), 11 | ("https://www.ebay.com/p/", does_not_raise()), 12 | ("https://www.komplett.dk/", does_not_raise()), 13 | ("https://www.proshop.dk/", does_not_raise()), 14 | ("https://www.computersalg.dk/", does_not_raise()), 15 | ("https://www.elgiganten.dk/", does_not_raise()), 16 | ("https://www.avxperten.dk/", does_not_raise()), 17 | ("https://www.av-cables.dk/", does_not_raise()), 18 | ("https://www.power.dk/", does_not_raise()), 19 | ("https://www.expert.dk/", does_not_raise()), 20 | ("https://www.mm-vision.dk/", does_not_raise()), 21 | ("https://www.coolshop.dk/", does_not_raise()), 22 | ("https://sharkgaming.dk/", does_not_raise()), 23 | ("https://www.newegg.com/", does_not_raise()), 24 | ("https://www.hifiklubben.dk/", does_not_raise()), 25 | ("https://us.shein.com/", does_not_raise()), 26 | ("https://www.notsupported.com/", pytest.raises(WebsiteNotSupported)), 27 | ] 28 | 29 | 30 | # Tests to make sure the websites that are supported can be added to be scraped 31 | @pytest.mark.parametrize("url,expectation", test_domains) 32 | def test_add_product(url, expectation, mocker) -> None: 33 | mock_info = Info(name="", price=1, currency="", id="") 34 | mocker.patch("scraper.Scraper.scrape_info", return_value=mock_info) 35 | mocker.patch("scraper.database.get_product_by_product_code", return_value=None) 36 | mocker.patch("scraper.add_product.add_new_product_to_db", return_value=None) 37 | mocker.patch("scraper.add_product.add_new_datapoint_with_scraper", return_value=None) 38 | 39 | with expectation: 40 | add_product("test", url) 41 | -------------------------------------------------------------------------------- /tests/test_domains.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import pytest 3 | 4 | from scraper.domains import get_website_name, get_number_string 5 | 6 | 7 | @dataclass 8 | class UrlSetting: 9 | keep_tld: bool = False 10 | keep_http: bool = False 11 | keep_www: bool = False 12 | keep_subdomain: bool = True 13 | 14 | 15 | test_websites = [ 16 | ("https://www.amazon.com/", UrlSetting(), "amazon"), 17 | ("https://www.komplett.dk/", UrlSetting(), "komplett"), 18 | ("https://www.av-cables.dk/", UrlSetting(), "av-cables"), 19 | ("https://nowww.com/", UrlSetting(), "nowww"), 20 | ("https://no-ending-slash.com", UrlSetting(), "no-ending-slash"), 21 | ("https://www.test.testing.com/", UrlSetting(), "test.testing"), 22 | ("https://www.test.hello.com/hello/world", UrlSetting(), "test.hello"), 23 | ("https://sub.main.com", UrlSetting(keep_subdomain=False), "main"), 24 | ("https://www.sub.main.com", UrlSetting(keep_subdomain=False), "main"), 25 | ("https://main.com", UrlSetting(keep_subdomain=False), "main"), 26 | ("https://main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), 27 | ("https://www.main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), 28 | ("https://www.main.com/", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), 29 | ("https://www.sub.main.com/", UrlSetting(keep_http=True), "https://sub.main"), 30 | 
("https://www.sub.main.com/", UrlSetting(keep_http=True, keep_www=True), "https://www.sub.main"), 31 | ("https://www.sub.main.com/", UrlSetting(keep_http=True, keep_www=True, keep_subdomain=False), "https://www.main"), 32 | ] 33 | 34 | 35 | @pytest.mark.parametrize("url,setting,expected", test_websites) 36 | def test_get_website_name(url: str, setting: UrlSetting, expected: str) -> None: 37 | result = get_website_name( 38 | url, 39 | keep_tld=setting.keep_tld, 40 | keep_http=setting.keep_http, 41 | keep_www=setting.keep_www, 42 | keep_subdomain=setting.keep_subdomain, 43 | ) 44 | assert result == expected 45 | 46 | 47 | test_price_values = [ 48 | ("USD 12.40", "12.40"), 49 | ("$234.00", "234.00"), 50 | ("£345.37", "345.37"), 51 | ("486,89 kr", "486,89"), 52 | ("$345.37", "345.37"), 53 | ("£1345.37", "1345.37"), 54 | ("1345,37 DKK", "1345,37"), 55 | ("1345.37 DKK", "1345.37"), 56 | ("USD 1345.37", "1345.37"), 57 | ("USD 10345.37", "10345.37"), 58 | ] 59 | 60 | 61 | @pytest.mark.parametrize("value,expected", test_price_values) 62 | def test_get_number_string(value: str, expected: str) -> None: 63 | result = get_number_string(value) 64 | 65 | assert result == expected 66 | -------------------------------------------------------------------------------- /tests/test_objects.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_website_handlers": { 3 | "komplett": { 4 | "link": "https://www.komplett.dk/product/1205149/gaming/spiludstyr/vr/vr-briller/htc-vive-flow-sortreflekterende", 5 | "expected_title": "HTC VIVE Flow (sort/reflekterende)", 6 | "expected_id": "1205149", 7 | "expected_currency": "DKK" 8 | }, 9 | "proshop": { 10 | "link": "https://www.proshop.dk/Hovedtelefonerheadset/Sony-WH-1000XM4/2883832", 11 | "expected_title": "Sony WH-1000XM4", 12 | "expected_id": "2883832", 13 | "expected_currency": "DKK" 14 | }, 15 | "computersalg": { 16 | "link": "https://www.computersalg.dk/i/6647865/sony-wh-1000xm4-hovedtelefoner-med-mik-fuld-st%c3%b8rrelse-bluetooth-tr%c3%a5dl%c3%b8s-kabling-nfc-aktiv-st%c3%b8jfjerning-3-5-mm-jackstik-sort", 17 | "expected_title": "Sony WH-1000XM4 - Hovedtelefoner med mik. 
- fuld størrelse - Bluetooth - trådløs, kabling - NFC - aktiv støjfjerning - 3,5 mm jackstik - sort", 18 | "expected_id": "6647865", 19 | "expected_currency": "DKK" 20 | }, 21 | "elgiganten": { 22 | "link": "https://www.elgiganten.dk/product/gaming/spillekonsol-tilbehor/playstation/playstation-spillekonsol/playstation-5-2022/533978", 23 | "expected_title": "PlayStation 5 (2022)", 24 | "expected_id": "533978", 25 | "expected_currency": "DKK" 26 | }, 27 | "avxperten": { 28 | "link": "https://www.avxperten.dk/noise-cancelling-head-set/sony-wh-1000xm4-bluetooth-hovedtelefoner-anc-sort.asp", 29 | "expected_title": "Sony WH-1000XM4 Bluetooth hovedtelefoner (m/ANC) Sort", 30 | "expected_id": "33590", 31 | "expected_currency": "DKK" 32 | }, 33 | "av-cables": { 34 | "link": "https://www.av-cables.dk/bluetooth-hoeretelefoner/sony-wh-1000xm4-over-ear-bluetooth-headset-sort.html", 35 | "expected_title": "Sony WH-1000XM4 Over-Ear Bluetooth Headset - Sort", 36 | "expected_id": "833015", 37 | "expected_currency": "DKK" 38 | }, 39 | "amazon": { 40 | "link": "https://www.amazon.de/-/en/Google-Pixel-Pro-Smartphone-Obsidian/dp/B0DG9DD9VN", 41 | "expected_title": "Google Pixel 9 Pro (512GB, Obsi, EU / UK) + Pixel 9/9 Pro Case, Obsidian", 42 | "expected_id": "B0DG9DD9VN", 43 | "expected_currency": "EUR" 44 | }, 45 | "ebay_with_itm": { 46 | "link": "https://www.ebay.com/itm/265771092654", 47 | "expected_title": "BRAND NEW Sony PS5 Playstation 5 Blu-Ray Disc Edition Console -Fast Delivery", 48 | "expected_id": "265771092654", 49 | "expected_currency": "USD" 50 | }, 51 | "ebay_with_p": { 52 | "link": "https://www.ebay.com/p/17005345300?iid=391613649077", 53 | "expected_title": "O Hui Age Recovery Eye Cream 1ml X 40pcs (40ml) Baby Collagen OHUI", 54 | "expected_id": "17005345300", 55 | "expected_currency": "USD" 56 | }, 57 | "expert": { 58 | "link": "https://www.expert.dk/hoejtalere-og-lyd/hovedtelefoner/traadloese-hovedtelefoner/sony-wh-1000xm4-traadloese-stoejdaempende-hovedtelefoner-sort/p-1106907/", 59 | "expected_title": "SONY WH-1000XM4 TRÅDLØSE STØJDÆMPENDE HOVEDTELEFONER, SORT", 60 | "expected_id": "1106907", 61 | "expected_currency": "DKK" 62 | }, 63 | "power": { 64 | "link": "https://www.power.dk/tv-og-lyd/hovedtelefoner/traadloese-hovedtelefoner/sony-wh-1000xm4-traadloese-stoejdaempende-hovedtelefoner-blaa/p-1185731/", 65 | "expected_title": "SONY WH-1000XM4 TRÅDLØSE STØJDÆMPENDE HOVEDTELEFONER, BLÅ", 66 | "expected_id": "1185731", 67 | "expected_currency": "DKK" 68 | }, 69 | "mm-vision": { 70 | "link": "https://www.mm-vision.dk/demo-asus-rog-flow-x16-gv601", 71 | "expected_title": "DEMO Asus ROG Flow X16 (GV601)", 72 | "expected_id": "6987145", 73 | "expected_currency": "DKK" 74 | }, 75 | "coolshop": { 76 | "link": "https://www.coolshop.dk/produkt/pokemon-brilliant-diamond/238G6U/", 77 | "expected_title": "Pokemon Brilliant Diamond - Nintendo Switch", 78 | "expected_id": "1177871", 79 | "expected_currency": "DKK" 80 | }, 81 | "sharkgaming": { 82 | "link": "https://sharkgaming.dk/asus-gladius-ii-origin-gaming-mouse", 83 | "expected_title": "ASUS Gladius II Origin gaming mouse", 84 | "expected_id": "90MP00U1-B0UA00", 85 | "expected_currency": "DKK" 86 | }, 87 | "newegg": { 88 | "link": "https://www.newegg.com/sony-wh1000xm4b-bluetooth-headset-black/p/0G6-001C-00614?Description=sony%20xm4&cm_re=sony_xm4-_-0G6-001C-00614-_-Product&quicklink=true", 89 | "expected_title": "Sony WH-1000XM4 Wireless Industry Leading Noise Canceling Overhead Headphones with Mic for Phone-Call and Alexa Voice Control, Silver", 
90 | "expected_id": "0G6-001C-00614", 91 | "expected_currency": "USD" 92 | }, 93 | "hifiklubben": { 94 | "link": "https://www.hifiklubben.dk/sennheiser-momentum-4-wireless-hoeretelefoner/senmomentum4bk/", 95 | "expected_title": "SENNHEISER MOMENTUM 4 WIRELESS", 96 | "expected_id": "senmomentum4bk", 97 | "expected_currency": "DKK" 98 | }, 99 | "shein": { 100 | "link": "https://euqs.shein.com/Men-s-Letter-Print-Slim-Fit-Short-Sleeve-T-Shirt-p-28492178.html", 101 | "expected_title": "Men's Letter Print Slim Fit Short Sleeve T-Shirt", 102 | "expected_id": "sm2311284334246374", 103 | "expected_currency": "EUR" 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /tests/test_visualize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from scraper.domains import SUPPORTED_DOMAINS 4 | from scraper.constants import WEBSITE_COLORS 5 | 6 | 7 | @pytest.mark.parametrize("domain", SUPPORTED_DOMAINS.keys()) 8 | def test_get_website_color_for_supported_domain(domain: str) -> None: 9 | color = WEBSITE_COLORS.get(domain, None) 10 | assert color is not None 11 | -------------------------------------------------------------------------------- /tests/test_website_handlers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import json 3 | from scraper.domains import ( 4 | AmazonHandler, 5 | AvCablesHandler, 6 | AvXpertenHandler, 7 | BaseWebsiteHandler, 8 | ComputerSalgHandler, 9 | CoolshopHandler, 10 | EbayHandler, 11 | ElgigantenHandler, 12 | ExpertHandler, 13 | KomplettHandler, 14 | MMVisionHandler, 15 | NeweggHandler, 16 | PowerHandler, 17 | ProshopHandler, 18 | SharkGamingHandler, 19 | HifiKlubbenHandler, 20 | SheinHandler, 21 | ) 22 | from scraper.models import Info 23 | 24 | 25 | def read_json(filename: str) -> dict: 26 | with open(filename, "r", encoding="utf8") as file: 27 | data = json.load(file) 28 | return data 29 | 30 | 31 | test_objects_json = read_json("./tests/test_objects.json") 32 | 33 | test_website_handlers_json: dict[str, dict[str, str]] = test_objects_json["test_website_handlers"] 34 | 35 | komplett_test = test_website_handlers_json["komplett"] 36 | proshop_test = test_website_handlers_json["proshop"] 37 | computersalg_test = test_website_handlers_json["computersalg"] 38 | elgiganten_test = test_website_handlers_json["elgiganten"] 39 | avxperten_test = test_website_handlers_json["avxperten"] 40 | avcables_test = test_website_handlers_json["av-cables"] 41 | amazon_test = test_website_handlers_json["amazon"] 42 | ebay_with_itm_test = test_website_handlers_json["ebay_with_itm"] 43 | ebay_with_p_test = test_website_handlers_json["ebay_with_p"] 44 | expert_test = test_website_handlers_json["expert"] 45 | power_test = test_website_handlers_json["power"] 46 | mmvision_test = test_website_handlers_json["mm-vision"] 47 | coolshop_test = test_website_handlers_json["coolshop"] 48 | sharkgaming_test = test_website_handlers_json["sharkgaming"] 49 | newegg_test = test_website_handlers_json["newegg"] 50 | hifiklubben_test = test_website_handlers_json["hifiklubben"] 51 | shein_test = test_website_handlers_json["shein"] 52 | 53 | 54 | class BaseTestWebsiteHandler(ABC): 55 | test_handler: BaseWebsiteHandler 56 | 57 | def setup_method(self) -> None: 58 | if not self.test_handler.request_data: 59 | self.test_handler._request_product_data() 60 | self.test_handler._get_common_data() 61 | 62 | @abstractmethod 63 | def 
test_get_product_info(self) -> None: 64 | pass 65 | 66 | @abstractmethod 67 | def test_get_name(self) -> None: 68 | pass 69 | 70 | @abstractmethod 71 | def test_get_price(self) -> None: 72 | pass 73 | 74 | @abstractmethod 75 | def test_get_currency(self) -> None: 76 | pass 77 | 78 | @abstractmethod 79 | def test_get_id(self) -> None: 80 | pass 81 | 82 | 83 | class TestKomplettHandler(BaseTestWebsiteHandler): 84 | test_handler = KomplettHandler(komplett_test["link"]) 85 | 86 | def test_get_product_info(self, mocker) -> None: 87 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 88 | actual = self.test_handler.get_product_info() 89 | assert isinstance(actual, Info) 90 | assert actual.valid 91 | 92 | def test_get_name(self) -> None: 93 | actual = self.test_handler._get_product_name().lower() 94 | expected = komplett_test["expected_title"].lower() 95 | assert isinstance(actual, str) 96 | assert actual == expected 97 | 98 | def test_get_price(self) -> None: 99 | price = self.test_handler._get_product_price() 100 | assert isinstance(price, float) 101 | 102 | def test_get_currency(self) -> None: 103 | currency = self.test_handler._get_product_currency() 104 | assert isinstance(currency, str) 105 | assert currency == komplett_test["expected_currency"] 106 | 107 | def test_get_id(self) -> None: 108 | id = self.test_handler._get_product_id() 109 | assert isinstance(id, str) 110 | assert id == komplett_test["expected_id"] 111 | 112 | 113 | class TestProshopHandler(BaseTestWebsiteHandler): 114 | test_handler = ProshopHandler(proshop_test["link"]) 115 | 116 | def test_get_product_info(self, mocker) -> None: 117 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 118 | actual = self.test_handler.get_product_info() 119 | assert isinstance(actual, Info) 120 | assert actual.valid 121 | 122 | def test_get_name(self) -> None: 123 | actual = self.test_handler._get_product_name().lower() 124 | expected = proshop_test["expected_title"].lower() 125 | assert isinstance(actual, str) 126 | assert actual == expected 127 | 128 | def test_get_price(self) -> None: 129 | price = self.test_handler._get_product_price() 130 | assert isinstance(price, float) 131 | 132 | def test_get_currency(self) -> None: 133 | currency = self.test_handler._get_product_currency() 134 | assert isinstance(currency, str) 135 | assert currency == proshop_test["expected_currency"] 136 | 137 | def test_get_id(self) -> None: 138 | id = self.test_handler._get_product_id() 139 | assert isinstance(id, str) 140 | assert id == proshop_test["expected_id"] 141 | 142 | 143 | class TestComputersalgHandler(BaseTestWebsiteHandler): 144 | test_handler = ComputerSalgHandler(computersalg_test["link"]) 145 | 146 | def test_get_product_info(self, mocker) -> None: 147 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 148 | actual = self.test_handler.get_product_info() 149 | assert isinstance(actual, Info) 150 | assert actual.valid 151 | 152 | def test_get_name(self) -> None: 153 | actual = self.test_handler._get_product_name().lower() 154 | expected = computersalg_test["expected_title"].lower() 155 | assert isinstance(actual, str) 156 | assert actual == expected 157 | 158 | def test_get_price(self) -> None: 159 | price = self.test_handler._get_product_price() 160 | assert isinstance(price, float) 161 | 162 | def test_get_currency(self) -> None: 163 | currency 
= self.test_handler._get_product_currency() 164 | assert isinstance(currency, str) 165 | assert currency == computersalg_test["expected_currency"] 166 | 167 | def test_get_id(self) -> None: 168 | id = self.test_handler._get_product_id() 169 | assert isinstance(id, str) 170 | assert id == computersalg_test["expected_id"] 171 | 172 | 173 | class TestElgigantenHandler(BaseTestWebsiteHandler): 174 | test_handler = ElgigantenHandler(elgiganten_test["link"]) 175 | 176 | def test_get_product_info(self, mocker) -> None: 177 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 178 | actual = self.test_handler.get_product_info() 179 | assert isinstance(actual, Info) 180 | assert actual.valid 181 | 182 | def test_get_name(self) -> None: 183 | actual = self.test_handler._get_product_name().lower() 184 | expected = elgiganten_test["expected_title"].lower() 185 | assert isinstance(actual, str) 186 | assert actual == expected 187 | 188 | def test_get_price(self) -> None: 189 | price = self.test_handler._get_product_price() 190 | assert isinstance(price, float) 191 | 192 | def test_get_currency(self) -> None: 193 | currency = self.test_handler._get_product_currency() 194 | assert isinstance(currency, str) 195 | assert currency == elgiganten_test["expected_currency"] 196 | 197 | def test_get_id(self) -> None: 198 | id = self.test_handler._get_product_id() 199 | assert isinstance(id, str) 200 | assert id == elgiganten_test["expected_id"] 201 | 202 | 203 | class TestAvXpertenHandler(BaseTestWebsiteHandler): 204 | test_handler = AvXpertenHandler(avxperten_test["link"]) 205 | 206 | def test_get_product_info(self, mocker) -> None: 207 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 208 | actual = self.test_handler.get_product_info() 209 | assert isinstance(actual, Info) 210 | assert actual.valid 211 | 212 | def test_get_name(self) -> None: 213 | actual = self.test_handler._get_product_name().lower() 214 | expected = avxperten_test["expected_title"].lower() 215 | assert isinstance(actual, str) 216 | assert actual == expected 217 | 218 | def test_get_price(self) -> None: 219 | price = self.test_handler._get_product_price() 220 | assert isinstance(price, float) 221 | 222 | def test_get_currency(self) -> None: 223 | currency = self.test_handler._get_product_currency() 224 | assert isinstance(currency, str) 225 | assert currency == avxperten_test["expected_currency"] 226 | 227 | def test_get_id(self) -> None: 228 | id = self.test_handler._get_product_id() 229 | assert isinstance(id, str) 230 | assert id == avxperten_test["expected_id"] 231 | 232 | 233 | class TestAvCablesHandler(BaseTestWebsiteHandler): 234 | test_handler = AvCablesHandler(avcables_test["link"]) 235 | 236 | def test_get_product_info(self, mocker) -> None: 237 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 238 | actual = self.test_handler.get_product_info() 239 | assert isinstance(actual, Info) 240 | assert actual.valid 241 | 242 | def test_get_name(self) -> None: 243 | actual = self.test_handler._get_product_name().lower() 244 | expected = avcables_test["expected_title"].lower() 245 | assert isinstance(actual, str) 246 | assert actual == expected 247 | 248 | def test_get_price(self) -> None: 249 | price = self.test_handler._get_product_price() 250 | assert isinstance(price, float) 251 | 252 | def test_get_currency(self) -> None: 253 | 
currency = self.test_handler._get_product_currency() 254 | assert isinstance(currency, str) 255 | assert currency == avcables_test["expected_currency"] 256 | 257 | def test_get_id(self) -> None: 258 | id = self.test_handler._get_product_id() 259 | assert isinstance(id, str) 260 | assert id == avcables_test["expected_id"] 261 | 262 | 263 | class TestAmazonHandler(BaseTestWebsiteHandler): 264 | test_handler = AmazonHandler(amazon_test["link"]) 265 | 266 | def test_get_product_info(self, mocker) -> None: 267 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 268 | actual = self.test_handler.get_product_info() 269 | assert isinstance(actual, Info) 270 | assert actual.valid 271 | 272 | def test_get_name(self) -> None: 273 | actual = self.test_handler._get_product_name().lower() 274 | expected = amazon_test["expected_title"].lower() 275 | assert isinstance(actual, str) 276 | assert actual == expected 277 | 278 | def test_get_price(self) -> None: 279 | price = self.test_handler._get_product_price() 280 | assert isinstance(price, float) 281 | 282 | def test_get_currency(self) -> None: 283 | currency = self.test_handler._get_product_currency() 284 | assert isinstance(currency, str) 285 | assert currency == amazon_test["expected_currency"] 286 | 287 | def test_get_id(self) -> None: 288 | id = self.test_handler._get_product_id() 289 | assert isinstance(id, str) 290 | assert id == amazon_test["expected_id"] 291 | 292 | 293 | # NOTE: There are two eBay URL formats - this class covers URLs that start with 'ebay.com/itm/' 294 | class TestEbayHandler_with_itm(BaseTestWebsiteHandler): 295 | test_handler = EbayHandler(ebay_with_itm_test["link"]) 296 | 297 | def test_get_product_info(self, mocker) -> None: 298 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 299 | actual = self.test_handler.get_product_info() 300 | assert isinstance(actual, Info) 301 | assert actual.valid 302 | 303 | def test_get_name(self) -> None: 304 | actual = self.test_handler._get_product_name().lower() 305 | expected = ebay_with_itm_test["expected_title"].lower() 306 | assert isinstance(actual, str) 307 | assert actual == expected 308 | 309 | def test_get_price(self) -> None: 310 | price = self.test_handler._get_product_price() 311 | assert isinstance(price, float) 312 | 313 | def test_get_currency(self) -> None: 314 | currency = self.test_handler._get_product_currency() 315 | assert isinstance(currency, str) 316 | assert len(currency) == 3 317 | assert currency == ebay_with_itm_test["expected_currency"] 318 | 319 | def test_get_id(self) -> None: 320 | id = self.test_handler._get_product_id() 321 | assert isinstance(id, str) 322 | assert id == ebay_with_itm_test["expected_id"] 323 | 324 | 325 | # NOTE: There are two eBay URL formats - this class covers URLs that start with 'ebay.com/p/' 326 | class TestEbayHandler_with_p(BaseTestWebsiteHandler): 327 | test_handler = EbayHandler(ebay_with_p_test["link"]) 328 | 329 | def test_get_product_info(self, mocker) -> None: 330 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 331 | actual = self.test_handler.get_product_info() 332 | assert isinstance(actual, Info) 333 | assert actual.valid 334 | 335 | def test_get_name(self) -> None: 336 | actual = self.test_handler._get_product_name().lower() 337 | expected = ebay_with_p_test["expected_title"].lower() 338 | assert isinstance(actual, str) 339 | assert actual ==
expected 340 | 341 | def test_get_price(self) -> None: 342 | price = self.test_handler._get_product_price() 343 | assert isinstance(price, float) 344 | 345 | def test_get_currency(self) -> None: 346 | currency = self.test_handler._get_product_currency() 347 | assert isinstance(currency, str) 348 | assert len(currency) == 3 349 | # assert currency == ebay_with_p_test["expected_currency"] 350 | 351 | def test_get_id(self) -> None: 352 | id = self.test_handler._get_product_id() 353 | assert isinstance(id, str) 354 | assert id == ebay_with_p_test["expected_id"] 355 | 356 | 357 | class TestPowerHandler(BaseTestWebsiteHandler): 358 | test_handler = PowerHandler(power_test["link"]) 359 | 360 | def test_get_product_info(self, mocker) -> None: 361 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 362 | actual = self.test_handler.get_product_info() 363 | assert isinstance(actual, Info) 364 | assert actual.valid 365 | 366 | def test_get_name(self) -> None: 367 | actual = self.test_handler._get_product_name().lower() 368 | expected = power_test["expected_title"].lower() 369 | assert isinstance(actual, str) 370 | assert actual == expected 371 | 372 | def test_get_price(self) -> None: 373 | price = self.test_handler._get_product_price() 374 | assert isinstance(price, float) 375 | 376 | def test_get_currency(self) -> None: 377 | currency = self.test_handler._get_product_currency() 378 | assert isinstance(currency, str) 379 | assert currency == power_test["expected_currency"] 380 | 381 | def test_get_id(self) -> None: 382 | id = self.test_handler._get_product_id() 383 | assert isinstance(id, str) 384 | assert id == power_test["expected_id"] 385 | 386 | 387 | class TestExpertHandler(BaseTestWebsiteHandler): 388 | test_handler = ExpertHandler(expert_test["link"]) 389 | 390 | def test_get_product_info(self, mocker) -> None: 391 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 392 | actual = self.test_handler.get_product_info() 393 | assert isinstance(actual, Info) 394 | assert actual.valid 395 | 396 | def test_get_name(self) -> None: 397 | actual = self.test_handler._get_product_name().lower() 398 | expected = expert_test["expected_title"].lower() 399 | assert isinstance(actual, str) 400 | assert actual == expected 401 | 402 | def test_get_price(self) -> None: 403 | price = self.test_handler._get_product_price() 404 | assert isinstance(price, float) 405 | 406 | def test_get_currency(self) -> None: 407 | currency = self.test_handler._get_product_currency() 408 | assert isinstance(currency, str) 409 | assert currency == expert_test["expected_currency"] 410 | 411 | def test_get_id(self) -> None: 412 | id = self.test_handler._get_product_id() 413 | assert isinstance(id, str) 414 | assert id == expert_test["expected_id"] 415 | 416 | 417 | class TestMMVisionHandler(BaseTestWebsiteHandler): 418 | test_handler = MMVisionHandler(mmvision_test["link"]) 419 | 420 | def test_get_product_info(self, mocker) -> None: 421 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 422 | actual = self.test_handler.get_product_info() 423 | assert isinstance(actual, Info) 424 | assert actual.valid 425 | 426 | def test_get_name(self) -> None: 427 | actual = self.test_handler._get_product_name().lower() 428 | expected = mmvision_test["expected_title"].lower() 429 | assert isinstance(actual, str) 430 | assert actual == expected 431 
| 432 | def test_get_price(self) -> None: 433 | price = self.test_handler._get_product_price() 434 | assert isinstance(price, float) 435 | 436 | def test_get_currency(self) -> None: 437 | currency = self.test_handler._get_product_currency() 438 | assert isinstance(currency, str) 439 | assert currency == mmvision_test["expected_currency"] 440 | 441 | def test_get_id(self) -> None: 442 | id = self.test_handler._get_product_id() 443 | assert isinstance(id, str) 444 | assert id == mmvision_test["expected_id"] 445 | 446 | 447 | class TestCoolshopHandler(BaseTestWebsiteHandler): 448 | test_handler = CoolshopHandler(coolshop_test["link"]) 449 | 450 | def test_get_product_info(self, mocker) -> None: 451 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 452 | actual = self.test_handler.get_product_info() 453 | assert isinstance(actual, Info) 454 | assert actual.valid 455 | 456 | def test_get_name(self) -> None: 457 | actual = self.test_handler._get_product_name().lower() 458 | expected = coolshop_test["expected_title"].lower() 459 | assert isinstance(actual, str) 460 | assert actual == expected 461 | 462 | def test_get_price(self) -> None: 463 | price = self.test_handler._get_product_price() 464 | assert isinstance(price, float) 465 | 466 | def test_get_currency(self) -> None: 467 | currency = self.test_handler._get_product_currency() 468 | assert isinstance(currency, str) 469 | assert currency == coolshop_test["expected_currency"] 470 | 471 | def test_get_id(self) -> None: 472 | id = self.test_handler._get_product_id() 473 | assert isinstance(id, str) 474 | assert id == coolshop_test["expected_id"] 475 | 476 | 477 | class TestSharkGamingHandler(BaseTestWebsiteHandler): 478 | test_handler = SharkGamingHandler(sharkgaming_test["link"]) 479 | 480 | def test_get_product_info(self, mocker) -> None: 481 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 482 | actual = self.test_handler.get_product_info() 483 | assert isinstance(actual, Info) 484 | assert actual.valid 485 | 486 | def test_get_name(self) -> None: 487 | actual = self.test_handler._get_product_name().lower() 488 | expected = sharkgaming_test["expected_title"].lower() 489 | assert isinstance(actual, str) 490 | assert actual == expected 491 | 492 | def test_get_price(self) -> None: 493 | price = self.test_handler._get_product_price() 494 | assert isinstance(price, float) 495 | 496 | def test_get_currency(self) -> None: 497 | currency = self.test_handler._get_product_currency() 498 | assert isinstance(currency, str) 499 | assert currency == sharkgaming_test["expected_currency"] 500 | 501 | def test_get_id(self) -> None: 502 | id = self.test_handler._get_product_id() 503 | assert isinstance(id, str) 504 | assert id == sharkgaming_test["expected_id"] 505 | 506 | 507 | class TestNeweggHandler(BaseTestWebsiteHandler): 508 | test_handler = NeweggHandler(newegg_test["link"]) 509 | 510 | def test_get_product_info(self, mocker) -> None: 511 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 512 | actual = self.test_handler.get_product_info() 513 | assert isinstance(actual, Info) 514 | assert actual.valid 515 | 516 | def test_get_name(self) -> None: 517 | actual = self.test_handler._get_product_name().lower() 518 | expected = newegg_test["expected_title"].lower() 519 | assert isinstance(actual, str) 520 | assert actual == expected 521 | 522 | def 
test_get_price(self) -> None: 523 | price = self.test_handler._get_product_price() 524 | assert isinstance(price, float) 525 | 526 | def test_get_currency(self) -> None: 527 | currency = self.test_handler._get_product_currency() 528 | assert isinstance(currency, str) 529 | assert currency == newegg_test["expected_currency"] 530 | 531 | def test_get_id(self) -> None: 532 | id = self.test_handler._get_product_id() 533 | assert isinstance(id, str) 534 | assert id == newegg_test["expected_id"] 535 | 536 | 537 | class TestHifiKlubbenHandler(BaseTestWebsiteHandler): 538 | test_handler = HifiKlubbenHandler(hifiklubben_test["link"]) 539 | 540 | def test_get_product_info(self, mocker) -> None: 541 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 542 | actual = self.test_handler.get_product_info() 543 | assert isinstance(actual, Info) 544 | assert actual.valid 545 | 546 | def test_get_name(self) -> None: 547 | actual = self.test_handler._get_product_name().lower() 548 | expected = hifiklubben_test["expected_title"].lower() 549 | assert isinstance(actual, str) 550 | assert actual == expected 551 | 552 | def test_get_price(self) -> None: 553 | price = self.test_handler._get_product_price() 554 | assert isinstance(price, float) 555 | 556 | def test_get_currency(self) -> None: 557 | currency = self.test_handler._get_product_currency() 558 | assert isinstance(currency, str) 559 | assert currency == hifiklubben_test["expected_currency"] 560 | 561 | def test_get_id(self) -> None: 562 | id = self.test_handler._get_product_id() 563 | assert isinstance(id, str) 564 | assert id == hifiklubben_test["expected_id"] 565 | 566 | 567 | class TestSheinHandler(BaseTestWebsiteHandler): 568 | test_handler = SheinHandler(shein_test["link"]) 569 | 570 | def test_get_product_info(self, mocker) -> None: 571 | mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) 572 | actual = self.test_handler.get_product_info() 573 | assert isinstance(actual, Info) 574 | assert actual.valid 575 | 576 | def test_get_name(self) -> None: 577 | actual = self.test_handler._get_product_name().lower() 578 | expected = shein_test["expected_title"].lower() 579 | assert isinstance(actual, str) 580 | assert actual == expected 581 | 582 | def test_get_price(self) -> None: 583 | price = self.test_handler._get_product_price() 584 | assert isinstance(price, float) 585 | 586 | def test_get_currency(self) -> None: 587 | currency = self.test_handler._get_product_currency() 588 | assert isinstance(currency, str) 589 | assert currency == shein_test["expected_currency"] 590 | 591 | def test_get_id(self) -> None: 592 | id = self.test_handler._get_product_id() 593 | assert isinstance(id, str) 594 | assert id == shein_test["expected_id"] 595 | --------------------------------------------------------------------------------