├── .gitignore ├── LICENSE ├── README.md ├── commands.md ├── compose.yaml ├── django-celery-redis.code-workspace ├── requirements.txt └── src ├── .sample.env ├── cfehome ├── __init__.py ├── asgi.py ├── celery.py ├── settings.py ├── urls.py └── wsgi.py ├── helpers ├── __init__.py ├── amazon.py └── brightdata.py ├── manage.py ├── movies ├── __init__.py ├── admin.py ├── apps.py ├── migrations │ └── __init__.py ├── models.py ├── tasks.py ├── tests.py └── views.py ├── nbs ├── 1 - Hello World with Selenium.ipynb ├── 2 - Proxy Scraping with Bright Data and Selenium.ipynb ├── 3 - Helper-based Proxy Scraping with Bright Data and Selenium.ipynb ├── 4 - Parse HTML Data with BeautifulSoup.ipynb ├── 5 - Amazon Captcha and Prepare Parser Helper Functions.ipynb ├── 6 - Tracking Scraped Data with Django Models.ipynb ├── 7 - Trigger Scrape Task.ipynb └── setup.py └── products ├── __init__.py ├── admin.py ├── apps.py ├── migrations ├── 0001_initial.py ├── 0002_alter_productscrapeevent_asin.py ├── 0003_product__trigger_scrape_product_trigger_scrape_and_more.py ├── 0004_remove_product__trigger_scrape_and_more.py ├── 0005_alter_product_active.py └── __init__.py ├── models.py ├── tasks.py ├── tests.py └── views.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | .DS_Store 162 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Coding For Entrepreneurs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Scraping on a Schedule with Django & Celery 2 | Learn how to schedule regular web scraping, save the data, and more with Django & Celery. 3 | 4 | Topics: 5 | 6 | - Django 7 | - Celery 8 | - Selenium 9 | - Scraped Data to Database via Django 10 | - Reliable Web Scraping with Selenium + Bright Data 11 | 12 | References: 13 | - [Celery + Redis + Django configuration guide](https://www.codingforentrepreneurs.com/blog/celery-redis-django/) 14 | - Django + Celery Redis [blank project code](https://github.com/codingforentrepreneurs/Django-Celery-Redis) 15 | - Django + Jupyter Setup Module [short + code](https://www.codingforentrepreneurs.com/shorts/django-setup-for-use-in-jupyter-notebooks/) 16 | 17 | Requirements: 18 | - Django experience such as Try Django (on [YouTube](https://www.youtube.com/playlist?list=PLEsfXFp6DpzRMby_cSoWTFw8zaMdTEXgL) or on [CFE](https://www.codingforentrepreneurs.com/topics/try-django/)) or [Your First Django Project](https://www.codingforentrepreneurs.com/courses/your-first-django-project/). 19 | - Redis Instance 20 | - Setup Redis on Windows [blog post](https://www.codingforentrepreneurs.com/blog/redis-on-windows/) 21 | - Setup Redis on MacOS or Linux [blog post](https://www.codingforentrepreneurs.com/blog/install-redis-mac-and-linux) 22 | - Setup Redis on Remote Virtual Machine [blog post](https://www.codingforentrepreneurs.com/blog/remote-redis-servers-for-development/) 23 | - How I use Redis for new projects [short + code](https://www.codingforentrepreneurs.com/shorts/how-i-use-redis-for-new-projects-with-docker-compose/) 24 | - A Bright Data Account [$25 credit for new accounts](https://brdta.com/justin) 25 | 26 | ## Getting Started 27 | 28 | ```bash 29 | git clone https://github.com/codingforentrepreneurs/Django-Celery-Redis 30 | mv Django-Celery-Redis scrape-scheduler 31 | cd scrape-scheduler 32 | ``` 33 | 34 | `macos/linux` 35 | ``` 36 | python3 -m venv venv 37 | source venv/bin/activate 38 | ``` 39 | 40 | `windows` 41 | ``` 42 | c:\Python311\python.exe -m venv venv 43 | .\venv\Scripts\activate 44 | ``` 45 | 46 | Install requirements 47 | ```bash 48 | python -m pip install pip --upgrade 49 | python -m pip install -r requirements.txt 50 | ``` 51 | 52 | Run a local redis instance via Docker Compose 53 | ```bash 54 | docker compose -f compose.yaml up -d 55 | ``` 56 | This will give us `redis://localhost:6170` 57 | 58 | Create `.env` in `src/.env` with: 59 | 60 | ```bash 61 | CELERY_BROKER_REDIS_URL="redis://localhost:6170" 62 | DEBUG=True 63 | ``` 64 | 65 | Navigate into your Django root: 66 | 67 | ```bash 68 | cd src/ 69 | ls 70 | ``` 71 | You should see at least `cfehome/` and `manage.py`. 72 | 73 | Run your project in 2 terminals: 74 | - `python manage.py runserver` 75 | - `celery -A cfehome worker --beat` 76 | 77 | Let's go! 78 | -------------------------------------------------------------------------------- /commands.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ``` 4 | docker compose up 5 | ``` 6 | 7 | ``` 8 | celery -A cfehome worker --beat -l info 9 | ``` 10 | 11 | ``` 12 | python manage.py shell 13 | ``` 14 | 15 | ``` 16 | python manage.py runserver 17 | ``` 18 | 19 | ``` 20 | jupyter notebook 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /compose.yaml: -------------------------------------------------------------------------------- 1 | # read more on https://www.codingforentrepreneurs.com/shorts/how-i-use-redis-for-new-projects-with-docker-compose/ 2 | version: '3.9' 3 | services: 4 | redis: 5 | image: redis 6 | restart: always 7 | ports: 8 | - 6170:6379 9 | volumes: 10 | - data:/data 11 | entrypoint: redis-server --appendonly yes 12 | volumes: 13 | data: 14 | 15 | networks: 16 | default: 17 | name: scrape_scheduler_network 18 | -------------------------------------------------------------------------------- /django-celery-redis.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 5 | } 6 | ], 7 | "settings": {} 8 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Celery 2 | Django 3 | django-celery-beat 4 | django-celery-results 5 | python-decouple 6 | redis 7 | jupyter 8 | selenium 9 | beautifulsoup4 -------------------------------------------------------------------------------- /src/.sample.env: -------------------------------------------------------------------------------- 1 | # celery via Docker compose 2 | CELERY_BROKER_REDIS_URL="redis://localhost:6170" 3 | DEBUG=True 4 | DJANGO_SECRET_KEY="django-insecure-g4x&pvo@a^5s&e51s$+tuk_aaf)rdcu19v_f@d*iqp0opzoy#4" 5 | SBR_WEBDRIVER="https://{user}:{pw}@{host}:{port}" -------------------------------------------------------------------------------- /src/cfehome/__init__.py: -------------------------------------------------------------------------------- 1 | from .celery import app as celery_app 2 | 3 | __all__ = ["celery_app"] 4 | -------------------------------------------------------------------------------- /src/cfehome/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for cfehome project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cfehome.settings") 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /src/cfehome/celery.py: -------------------------------------------------------------------------------- 1 | # path/to/your/proj/src/cfehome/celery.py 2 | import os 3 | 4 | from celery import Celery 5 | from celery.schedules import crontab 6 | 7 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cfehome.settings") 8 | 9 | app = Celery("cfehome") 10 | 11 | # Using a string here means the worker don't have to serialize 12 | # the configuration object to child processes. 13 | # - namespace='CELERY' means all celery-related configuration keys 14 | # should have a `CELERY_` prefix. 15 | app.config_from_object("django.conf:settings", namespace="CELERY") 16 | 17 | # Load task modules from all registered Django app configs. 18 | app.autodiscover_tasks() 19 | 20 | # We used CELERY_BROKER_URL in settings.py instead of: 21 | # app.conf.broker_url = '' 22 | 23 | # We used CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP in settings.py instead of: 24 | # app.conf.broker_connection_retry_on_startup = True 25 | 26 | # We used CELERY_BEAT_SCHEDULER in settings.py instead of: 27 | # app.conf.beat_scheduler = ''django_celery_beat.schedulers.DatabaseScheduler' 28 | 29 | 30 | # Below is for illustration purposes. We configured our project 31 | # So we can perform all kinds of scheduling in the Django admin 32 | # under Periodic Tasks. 33 | # app.conf.beat_schedule = { 34 | # "multiply-task-crontab": { 35 | # "task": "multiply_two_numbers", 36 | # "schedule": crontab(hour=7, minute=30, day_of_week=1), 37 | # "args": (16, 16), 38 | # }, 39 | # "multiply-every-5-seconds": { 40 | # "task": "multiply_two_numbers", 41 | # "schedule": 5.0, 42 | # "args": (16, 16), 43 | # }, 44 | # "add-every-30-seconds": { 45 | # "task": "movies.tasks.add", 46 | # "schedule": 30.0, 47 | # "args": (16, 16), 48 | # }, 49 | # } 50 | -------------------------------------------------------------------------------- /src/cfehome/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for cfehome project. 3 | 4 | Generated by 'django-admin startproject' using Django 5.0.2. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.0/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/5.0/ref/settings/ 11 | """ 12 | 13 | from pathlib import Path 14 | 15 | from decouple import config 16 | 17 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 18 | BASE_DIR = Path(__file__).resolve().parent.parent 19 | 20 | 21 | # Quick-start development settings - unsuitable for production 22 | # See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/ 23 | 24 | # SECURITY WARNING: keep the secret key used in production secret! 25 | SECRET_KEY = config("DJANGO_SECRET_KEY", default=None) 26 | 27 | # SECURITY WARNING: don't run with debug turned on in production! 28 | DEBUG = config("DEBUG", cast=bool, default=False) 29 | 30 | ALLOWED_HOSTS = [] 31 | 32 | 33 | # Application definition 34 | 35 | INSTALLED_APPS = [ 36 | "django.contrib.admin", 37 | "django.contrib.auth", 38 | "django.contrib.contenttypes", 39 | "django.contrib.sessions", 40 | "django.contrib.messages", 41 | "django.contrib.staticfiles", 42 | "django_celery_beat", 43 | "django_celery_results", 44 | "movies", 45 | "products", 46 | ] 47 | 48 | MIDDLEWARE = [ 49 | "django.middleware.security.SecurityMiddleware", 50 | "django.contrib.sessions.middleware.SessionMiddleware", 51 | "django.middleware.common.CommonMiddleware", 52 | "django.middleware.csrf.CsrfViewMiddleware", 53 | "django.contrib.auth.middleware.AuthenticationMiddleware", 54 | "django.contrib.messages.middleware.MessageMiddleware", 55 | "django.middleware.clickjacking.XFrameOptionsMiddleware", 56 | ] 57 | 58 | ROOT_URLCONF = "cfehome.urls" 59 | 60 | TEMPLATES = [ 61 | { 62 | "BACKEND": "django.template.backends.django.DjangoTemplates", 63 | "DIRS": [], 64 | "APP_DIRS": True, 65 | "OPTIONS": { 66 | "context_processors": [ 67 | "django.template.context_processors.debug", 68 | "django.template.context_processors.request", 69 | "django.contrib.auth.context_processors.auth", 70 | "django.contrib.messages.context_processors.messages", 71 | ], 72 | }, 73 | }, 74 | ] 75 | 76 | WSGI_APPLICATION = "cfehome.wsgi.application" 77 | 78 | 79 | # Database 80 | # https://docs.djangoproject.com/en/5.0/ref/settings/#databases 81 | 82 | DATABASES = { 83 | "default": { 84 | "ENGINE": "django.db.backends.sqlite3", 85 | "NAME": BASE_DIR / "db.sqlite3", 86 | } 87 | } 88 | 89 | 90 | # Password validation 91 | # https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators 92 | 93 | AUTH_PASSWORD_VALIDATORS = [ 94 | { 95 | "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", 96 | }, 97 | { 98 | "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", 99 | }, 100 | { 101 | "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", 102 | }, 103 | { 104 | "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", 105 | }, 106 | ] 107 | 108 | 109 | # Internationalization 110 | # https://docs.djangoproject.com/en/5.0/topics/i18n/ 111 | 112 | LANGUAGE_CODE = "en-us" 113 | 114 | TIME_ZONE = "UTC" 115 | 116 | USE_I18N = True 117 | 118 | USE_TZ = True 119 | 120 | 121 | # Static files (CSS, JavaScript, Images) 122 | # https://docs.djangoproject.com/en/5.0/howto/static-files/ 123 | 124 | STATIC_URL = "static/" 125 | 126 | # Default primary key field type 127 | # https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field 128 | 129 | DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" 130 | 131 | 132 | # save Celery task results in Django's database 133 | CELERY_RESULT_BACKEND = "django-db" 134 | 135 | # broker_connection_retry_on_startup 136 | CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP = True 137 | 138 | # This configures Redis as the datastore between Django + Celery 139 | CELERY_BROKER_URL = config("CELERY_BROKER_REDIS_URL", default="redis://localhost:6379") 140 | # if you out to use os.environ the config is: 141 | # CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_REDIS_URL', 'redis://localhost:6379') 142 | 143 | # this allows you to schedule items in the Django admin. 144 | CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers.DatabaseScheduler" 145 | -------------------------------------------------------------------------------- /src/cfehome/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | URL configuration for cfehome project. 3 | 4 | The `urlpatterns` list routes URLs to views. For more information please see: 5 | https://docs.djangoproject.com/en/5.0/topics/http/urls/ 6 | Examples: 7 | Function views 8 | 1. Add an import: from my_app import views 9 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 10 | Class-based views 11 | 1. Add an import: from other_app.views import Home 12 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 13 | Including another URLconf 14 | 1. Import the include() function: from django.urls import include, path 15 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 16 | """ 17 | from django.contrib import admin 18 | from django.urls import path 19 | 20 | urlpatterns = [ 21 | path("admin/", admin.site.urls), 22 | ] 23 | -------------------------------------------------------------------------------- /src/cfehome/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for cfehome project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cfehome.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /src/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from .amazon import extract_amazon_product_data 2 | from .brightdata import scrape 3 | 4 | __all__ = ['extract_amazon_product_data', 'scrape'] -------------------------------------------------------------------------------- /src/helpers/amazon.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | def find_product_table_data(html): 4 | soup = BeautifulSoup(html, "html.parser") 5 | product_data = soup.find('div', id='prodDetails') 6 | if product_data is None: 7 | return [] 8 | table = product_data.find('table') 9 | columns = [f"{x.text}".strip() for x in table.find_all('th')] 10 | table_data=[] 11 | for i, row in enumerate(table.find_all('tr')): # [1:] to skip the header row 12 | # Get all cells in the row 13 | cells = row.find_all('td') 14 | # Create a dictionary for the current row, mapping header to cell data 15 | row_data = {columns[i]: f'{cell.text}'.strip() for cell in cells} 16 | # Add the dictionary to your list 17 | table_data.append(row_data) 18 | return table_data 19 | 20 | 21 | def find_product_rating(html): 22 | soup = BeautifulSoup(html, "html.parser") 23 | average_rating = soup.find(id='averageCustomerReviews').find_all("span", class_='a-size-base')[0].text.strip() 24 | average_rating = "".join([x for x in f"{average_rating}".strip() if x.isdigit() or x == '.']) 25 | average_rating = float(average_rating) 26 | rating_data = soup.find(id='acrCustomerReviewText').text 27 | rating_count = int(''.join([x for x in rating_data if x.isdigit()])) 28 | rating_count 29 | return { 30 | 'average': average_rating, 31 | 'count': rating_count, 32 | } 33 | 34 | def extract_amazon_product_data(html): 35 | soup = BeautifulSoup(html, "html.parser") 36 | productTitle = soup.find('span', id='productTitle') 37 | productTitleText = f"{productTitle.text}".strip() 38 | productPrice = soup.find_all('span', class_='a-price-whole')[0] 39 | productPrice = f"{productPrice.text}".strip() 40 | productPriceText = "".join([x for x in productPrice if x.isdigit() or x == '.']) 41 | productPriceNum = float(productPriceText) 42 | try: 43 | productDescription = soup.find('div', id='productDescription').text 44 | except: 45 | productDescription = '' 46 | featureBullets = soup.find('div', id='feature-bullets').text 47 | asin = '' 48 | metadata_items = find_product_table_data(html) 49 | for data in metadata_items: 50 | if data.get("ASIN") is None: 51 | continue 52 | else: 53 | asin = data.get("ASIN") 54 | break 55 | return { 56 | 'asin': asin, 57 | 'title': productTitleText, 58 | 'price_raw': productPrice, 59 | 'price_text': productPriceText, 60 | 'price': productPriceNum, 61 | 'metadata': metadata_items, 62 | 'description': productDescription, 63 | 'feature_bullets': featureBullets, 64 | 'rating': find_product_rating(html) 65 | } -------------------------------------------------------------------------------- /src/helpers/brightdata.py: -------------------------------------------------------------------------------- 1 | from selenium.webdriver import Remote, ChromeOptions 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection 4 | from decouple import config 5 | from urllib.parse import urljoin, urlparse 6 | 7 | 8 | SBR_WEBDRIVER = config('SBR_WEBDRIVER', default=None) 9 | 10 | 11 | def scrape(url=None, body_only=True, solve_captcha=False, wait_seconds=0): 12 | print('Connecting to Scraping Browser...') 13 | sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome') 14 | html = "" 15 | url = urljoin(url, urlparse(url).path) 16 | with Remote(sbr_connection, options=ChromeOptions()) as driver: 17 | print(f'Connected! Navigating to {url}') 18 | driver.get(url) 19 | if wait_seconds > 0: 20 | driver.implicitly_wait(wait_seconds) 21 | if solve_captcha: 22 | solve_res = driver.execute('executeCdpCommand', { 23 | 'cmd': 'Captcha.waitForSolve', 24 | 'params': {'detectTimeout': 10000}, 25 | }) 26 | print('Captcha solve status:', solve_res['value']['status']) 27 | print('Navigated! Scraping page content...') 28 | html = driver.page_source 29 | if body_only: 30 | body = driver.find_element(By.TAG_NAME, "body") 31 | html = body.get_attribute('innerHTML') 32 | return html -------------------------------------------------------------------------------- /src/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | """Run administrative tasks.""" 9 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cfehome.settings") 10 | try: 11 | from django.core.management import execute_from_command_line 12 | except ImportError as exc: 13 | raise ImportError( 14 | "Couldn't import Django. Are you sure it's installed and " 15 | "available on your PYTHONPATH environment variable? Did you " 16 | "forget to activate a virtual environment?" 17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /src/movies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Web-Scraping-with-Django-Celery/d795c4fd1854c1c25afe443b3bac6d7874a5d2b1/src/movies/__init__.py -------------------------------------------------------------------------------- /src/movies/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /src/movies/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class MoviesConfig(AppConfig): 5 | default_auto_field = "django.db.models.BigAutoField" 6 | name = "movies" 7 | -------------------------------------------------------------------------------- /src/movies/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Web-Scraping-with-Django-Celery/d795c4fd1854c1c25afe443b3bac6d7874a5d2b1/src/movies/migrations/__init__.py -------------------------------------------------------------------------------- /src/movies/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | # Create your models here. 4 | -------------------------------------------------------------------------------- /src/movies/tasks.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from celery import shared_task 4 | 5 | 6 | @shared_task 7 | def add(x, y): 8 | # Celery recognizes this as the `movies.tasks.add` task 9 | # the name is purposefully omitted here. 10 | return x + y 11 | 12 | 13 | @shared_task(name="multiply_two_numbers") 14 | def mul(x, y): 15 | # Celery recognizes this as the `multiple_two_numbers` task 16 | total = x * (y * random.randint(3, 100)) 17 | return total 18 | 19 | 20 | @shared_task(name="sum_list_numbers") 21 | def xsum(numbers): 22 | # Celery recognizes this as the `sum_list_numbers` task 23 | return sum(numbers) 24 | -------------------------------------------------------------------------------- /src/movies/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /src/movies/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 4 | -------------------------------------------------------------------------------- /src/nbs/4 - Parse HTML Data with BeautifulSoup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 63, 6 | "id": "439ab9a8-51cf-4591-836a-0cbbac9ff3d6", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "setup.init_django(project_name='cfehome')" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 64, 17 | "id": "5e6b875f-38aa-43d1-8bf6-2f51deb88170", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "url = \"https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 65, 27 | "id": "251aa699-5c55-4d25-a617-1d43047cfe5d", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "url = \"https://www.amazon.com/Microsoft-RRT-00001-Xbox-Enhanced-Protection-X/dp/B0CMFGD9C4/ref=sr_1_3?crid=O4D5MCNRPHZG&dib=eyJ2IjoiMSJ9.BesESVeGKfCLfmoE_Vj2bHkyIqHkjt6vWY83koTZ_cz2I8dLY4__kVTkLkSzkH-tXhAxpP2gQNyM5eVeTcHhvxw10QXkh4Y_4mErHWUhqbSt9lrSfOO_-FmaAKtnyeF8iomBtkjnfyUknCfgCkCLPUr4uTyEnz5WtXbXQI-WN6fXgPPlvAdJXHHZEW4PuI3no5Em_YuyrBMW5_lEcEiUTpPXfdpztJ9ofmYYPF6l8WI.dg-z2b_4ANwk_S5W2m6JFCSScCDS2b3idQV1r-yQOWQ&dib_tag=se&keywords=xbox+series+x&qid=1709681924&sprefix=xbox%2Caps%2C264&sr=8-3\"" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 66, 37 | "id": "f5db35da-6859-42ef-b5fe-81c55bc89bfa", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "\n", 42 | "import helpers" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 67, 48 | "id": "fadd2521-984e-41af-b5ce-0206c9dbcaca", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Connecting to Scraping Browser...\n", 56 | "Connected! Navigating to https://www.amazon.com/Microsoft-RRT-00001-Xbox-Enhanced-Protection-X/dp/B0CMFGD9C4/ref=sr_1_3?crid=O4D5MCNRPHZG&dib=eyJ2IjoiMSJ9.BesESVeGKfCLfmoE_Vj2bHkyIqHkjt6vWY83koTZ_cz2I8dLY4__kVTkLkSzkH-tXhAxpP2gQNyM5eVeTcHhvxw10QXkh4Y_4mErHWUhqbSt9lrSfOO_-FmaAKtnyeF8iomBtkjnfyUknCfgCkCLPUr4uTyEnz5WtXbXQI-WN6fXgPPlvAdJXHHZEW4PuI3no5Em_YuyrBMW5_lEcEiUTpPXfdpztJ9ofmYYPF6l8WI.dg-z2b_4ANwk_S5W2m6JFCSScCDS2b3idQV1r-yQOWQ&dib_tag=se&keywords=xbox+series+x&qid=1709681924&sprefix=xbox%2Caps%2C264&sr=8-3\n", 57 | "Navigated! Scraping page content...\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "html = helpers.scrape(url=url)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 68, 68 | "id": "346d4e92-b6ec-4101-848b-8b77af70b2dd", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from bs4 import BeautifulSoup" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 69, 78 | "id": "f9461659-a112-40f5-947f-a95c5002fec1", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "soup = BeautifulSoup(html)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "0d436797-3317-4a4b-ad0c-61d5355e4bcf", 88 | "metadata": {}, 89 | "source": [ 90 | "```html\n", 91 | " \n", 92 | " PlayStation®5 Digital Edition (slim) \n", 93 | "\n", 94 | "\n", 95 | "
\n", 96 | "

\n", 97 | " PlayStation®5 Digital Edition (slim) \n", 98 | "

\n", 99 | "
\n", 100 | "
\n", 101 | "```" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 70, 107 | "id": "b20623ab-099e-402f-adbb-51d46e2abfff", 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "'Microsoft RRT-00001 Xbox Series X 1TB SSD Bundle with 2 YR CPS Enhanced Protection Pack'" 114 | ] 115 | }, 116 | "execution_count": 70, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "productTitle = soup.find('span', id='productTitle')\n", 123 | "productTitleText = f\"{productTitle.text}\".strip()\n", 124 | "productTitleText" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "a38928b6-ee4a-4bf1-a706-94249f103712", 130 | "metadata": {}, 131 | "source": [ 132 | "```html\n", 133 | "449.\n", 134 | "```" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "40ba7cf5-74ab-4f40-a661-eb2098263435", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 71, 148 | "id": "4839ac54-5962-42c3-af3a-203eca77420e", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "481.0" 155 | ] 156 | }, 157 | "execution_count": 71, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "productPrice = soup.find_all('span', class_='a-price-whole')[0]\n", 164 | "productPrice\n", 165 | "productPriceText = \"\".join([x for x in f\"{productPrice.text}\".strip() if x.isdigit() or x == '.'])\n", 166 | "productPriceNum = float(productPriceText)\n", 167 | "productPriceNum" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 72, 173 | "id": "40c32d52-edeb-4924-87aa-b08efe73d838", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "['ASIN',\n", 180 | " 'Customer Reviews',\n", 181 | " 'Best Sellers Rank',\n", 182 | " 'Product Dimensions',\n", 183 | " 'Item model number',\n", 184 | " 'Item Weight',\n", 185 | " 'Manufacturer',\n", 186 | " 'Date First Available']" 187 | ] 188 | }, 189 | "execution_count": 72, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "product_data = soup.find('div', id='prodDetails')\n", 196 | "table = product_data.find('table')\n", 197 | "columns = [f\"{x.text}\".strip() for x in table.find_all('th')]\n", 198 | "columns" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 73, 204 | "id": "f6426180-b88d-4be1-8455-c90b73e8bbc1", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "table_data=[]\n", 209 | "for i, row in enumerate(table.find_all('tr')): # [1:] to skip the header row\n", 210 | " # Get all cells in the row\n", 211 | " cells = row.find_all('td')\n", 212 | " # Create a dictionary for the current row, mapping header to cell data\n", 213 | " row_data = {columns[i]: f'{cell.text}'.strip() for cell in cells}\n", 214 | " # Add the dictionary to your list\n", 215 | " table_data.append(row_data)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 74, 221 | "id": "5d3c27ef-4e11-45ea-9b1c-5f6cf6bd8ef0", 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "[{'ASIN': 'B0CMFGD9C4'},\n", 228 | " {'Customer Reviews': '4.5 4.5 out of 5 stars \\n 48 ratings \\n\\n\\n 4.5 out of 5 stars'},\n", 229 | " {'Best Sellers Rank': '#1,892 in Video Games (See Top 100 in Video Games) #40 in Xbox Accessories #101 in Xbox Series X & S Accessories'},\n", 230 | " {'Product Dimensions': '5.9 x 5.9 x 11.9 inches; 9.8 Pounds'},\n", 231 | " {'Item model number': 'E99MSRRT00001'},\n", 232 | " {'Item Weight': '9.8 pounds'},\n", 233 | " {'Manufacturer': 'Microsoft'},\n", 234 | " {'Date First Available': 'November 3, 2023'}]" 235 | ] 236 | }, 237 | "execution_count": 74, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "table_data" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 75, 249 | "id": "48c9c6b6-d419-4d6a-97b2-b0e1ee37f726", 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "['B0CMFGD9C4']" 256 | ] 257 | }, 258 | "execution_count": 75, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "elements_with_attribute = soup.find_all(lambda tag: tag.has_attr('data-csa-c-asin'))\n", 265 | "asins = [x.attrs.get('data-csa-c-asin') for x in elements_with_attribute if x]\n", 266 | "asins = list(set([x for x in asins if x != \"\"]))\n", 267 | "asins" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 76, 273 | "id": "04209cfe-d944-4f9a-ac75-9246f241a2c8", 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "\" \\n The all-new XBOX SERIES X THE FASTEST, MOST POWERFUL XBOX EVER POWER YOUR DREAMS OPTIMIZED FOR SERIES X|S Games built with the Xbox Series X|S development kit showcase significantly reduced load times and stunning visuals at up to 120FPS. GET IT ONCE With Smart Delivery, you can buy a supported game once and always have the best available version for whatever console you play on. GAME ON From future adventures, to current obsessions, to classic titles, thousands of favorites across four generations of Xbox look and play best on Xbox Series X. 12 TFLOPS OF POWER The 12 teraflops of processing power housed in the system on a chip (SOC) work with AMD’s Zen 2 and RDNA 2 architectures to result in worlds that demand a closer look. LOOKS BETTER. PLAYS BETTER. Equipped with AMD’s Zen 2 and RDNA 2 architectures, DirectX ray tracing delivers true-to-life lighting, shadows, and accurate reflections to create dynamic, living worlds. LISTEN, YOU'RE BEING SURROUNDED. 3D Spatial Sound is the next evolution in audio technology, using advanced algorithms to create immersive lifelike worlds that put you at the center of your experience. TRUE 4K GAMING The Xbox Series X delivers sensationally smooth frame rates of up to 120FPS with the visual pop of HDR. Immerse yourself with sharper characters, brighter worlds, and impossible details with true-to-life 4K. From original classics like Halo: Combat Evolved to future favorites like Halo Infinite, every title looks and plays best on the Xbox Series X. \"" 280 | ] 281 | }, 282 | "execution_count": 76, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "productDescription = soup.find('div', id='productDescription').text\n", 289 | "productDescription" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 77, 295 | "id": "175c12e0-cbb8-4725-982a-09f19ef679fc", 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "' \\n About this item Microsoft Xbox Series X 1TB SSD True 4K gaming | Up to 120 frames per second 8K High Dynamic Range | Xbox Velocity Architecture IN THE BOX: Microsoft Xbox Series X 1TB SSD - Carbon Black Xbox wireless controller - Ultra high speed HDMI cable - Power cable - Quick start guide INCLUDED IN THE BUNDLE: 2 Year Premium Extended Service Protection Plan | Deco Gear Microfiber Cleaning Cloth 2 Year Premium Extended Service Protection Plan \\n'" 302 | ] 303 | }, 304 | "execution_count": 77, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "featureBullets = soup.find('div', id='feature-bullets').text\n", 311 | "featureBullets" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 78, 317 | "id": "7af4a3ff-a358-45ac-a29b-47e980975ac0", 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "4.5" 324 | ] 325 | }, 326 | "execution_count": 78, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "average_rating = soup.find(id='averageCustomerReviews').find_all(\"span\", class_='a-size-base')[0].text.strip()\n", 333 | "average_rating = \"\".join([x for x in f\"{average_rating}\".strip() if x.isdigit() or x == '.'])\n", 334 | "average_rating = float(average_rating)\n", 335 | "average_rating" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 79, 341 | "id": "0e17ef25-c190-48de-a9d6-0879f7bb053c", 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "48" 348 | ] 349 | }, 350 | "execution_count": 79, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "rating_data = soup.find(id='acrCustomerReviewText').text\n", 357 | "rating_count = int(''.join([x for x in rating_data if x.isdigit()]))\n", 358 | "rating_count" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "id": "708ac4c2-ea29-4100-8d51-8dc7c0a96fcd", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "id": "10e4f2d5-4e87-47c4-a3be-793537b8255f", 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [] 376 | } 377 | ], 378 | "metadata": { 379 | "kernelspec": { 380 | "display_name": "Python 3 (ipykernel)", 381 | "language": "python", 382 | "name": "python3" 383 | }, 384 | "language_info": { 385 | "codemirror_mode": { 386 | "name": "ipython", 387 | "version": 3 388 | }, 389 | "file_extension": ".py", 390 | "mimetype": "text/x-python", 391 | "name": "python", 392 | "nbconvert_exporter": "python", 393 | "pygments_lexer": "ipython3", 394 | "version": "3.11.4" 395 | } 396 | }, 397 | "nbformat": 4, 398 | "nbformat_minor": 5 399 | } 400 | -------------------------------------------------------------------------------- /src/nbs/5 - Amazon Captcha and Prepare Parser Helper Functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "488ae7f3-dd1f-46ea-8782-02853b4f0036", 6 | "metadata": {}, 7 | "source": [ 8 | "Note: This Notebook was named _5 - Save Extracted Data to Django Model_ in the video" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "439ab9a8-51cf-4591-836a-0cbbac9ff3d6", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import setup\n", 19 | "setup.init_django(project_name='cfehome')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "f5db35da-6859-42ef-b5fe-81c55bc89bfa", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import helpers" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "id": "346d4e92-b6ec-4101-848b-8b77af70b2dd", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from bs4 import BeautifulSoup" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "id": "5e6b875f-38aa-43d1-8bf6-2f51deb88170", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "url = \"https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3\"\n", 50 | "# url = \"https://www.amazon.com/Microsoft-RRT-00001-Xbox-Enhanced-Protection-X/dp/B0CMFGD9C4/ref=sr_1_3?crid=O4D5MCNRPHZG&dib=eyJ2IjoiMSJ9.BesESVeGKfCLfmoE_Vj2bHkyIqHkjt6vWY83koTZ_cz2I8dLY4__kVTkLkSzkH-tXhAxpP2gQNyM5eVeTcHhvxw10QXkh4Y_4mErHWUhqbSt9lrSfOO_-FmaAKtnyeF8iomBtkjnfyUknCfgCkCLPUr4uTyEnz5WtXbXQI-WN6fXgPPlvAdJXHHZEW4PuI3no5Em_YuyrBMW5_lEcEiUTpPXfdpztJ9ofmYYPF6l8WI.dg-z2b_4ANwk_S5W2m6JFCSScCDS2b3idQV1r-yQOWQ&dib_tag=se&keywords=xbox+series+x&qid=1709681924&sprefix=xbox%2Caps%2C264&sr=8-3\"" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "id": "251aa699-5c55-4d25-a617-1d43047cfe5d", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "'https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3'" 63 | ] 64 | }, 65 | "execution_count": 5, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "url" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "id": "f1d2c8c6-8c78-4863-a022-1839a66eb8c8", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "Connecting to Scraping Browser...\n", 85 | "Connected! Navigating to https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3\n", 86 | "Navigated! Scraping page content...\n" 87 | ] 88 | }, 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "/Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py:38: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", 94 | "\n", 95 | "The code that caused this warning is on line 38 of the file /Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py. To get rid of this warning, pass the additional argument 'features=\"html.parser\"' to the BeautifulSoup constructor.\n", 96 | "\n", 97 | " soup = BeautifulSoup(html)\n", 98 | "/Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py:6: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", 99 | "\n", 100 | "The code that caused this warning is on line 6 of the file /Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py. To get rid of this warning, pass the additional argument 'features=\"html.parser\"' to the BeautifulSoup constructor.\n", 101 | "\n", 102 | " soup = BeautifulSoup(html)\n", 103 | "/Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py:25: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", 104 | "\n", 105 | "The code that caused this warning is on line 25 of the file /Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py. To get rid of this warning, pass the additional argument 'features=\"html.parser\"' to the BeautifulSoup constructor.\n", 106 | "\n", 107 | " soup = BeautifulSoup(html)\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "html = helpers.scrape(url=url, solve_captcha=False)\n", 113 | "data = helpers.extract_amazon_product_data(html)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 7, 119 | "id": "32dbe860-6a5a-405c-a71c-2dc1a218e5c7", 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "{'title': 'PlayStation®5 Digital Edition (slim)',\n", 126 | " 'price_raw': '449.',\n", 127 | " 'price_text': '449.',\n", 128 | " 'price': 449.0,\n", 129 | " 'metadata': [{'ASIN': 'B0CL5KNB9M'},\n", 130 | " {'Release date': 'November 24, 2023'},\n", 131 | " {'Customer Reviews': '4.7 4.7 out of 5 stars \\n 3,538 ratings \\n\\n\\n 4.7 out of 5 stars'},\n", 132 | " {'Best Sellers Rank': '#216 in Video Games (See Top 100 in Video Games) #2 in PlayStation 5 Consoles'},\n", 133 | " {'Product Dimensions': '17 x 15 x 6 inches; 8.9 Pounds'},\n", 134 | " {'Type of item': 'Video Game'},\n", 135 | " {'Item model number': 'CFI-2000'},\n", 136 | " {'Item Weight': '8.9 pounds'},\n", 137 | " {'Manufacturer': 'Sony'},\n", 138 | " {'Country of Origin': 'China'},\n", 139 | " {'Batteries': '1 Lithium Ion batteries required. (included)'},\n", 140 | " {'Date First Available': 'November 24, 2023'}],\n", 141 | " 'description': ' \\n Play Like Never Before. The PS5 Digital Edition unleashes new gaming possibilities that you never anticipated. Experience lightning fast loading with an ultra-high speed SSD, deeper immersion with support for haptic feedback, adaptive triggers, and 3D Audio*,and an all-new generation of incredible PlayStation® games. PS5 Digital Edition is an all-digital version of the PS5 console with no disc drive. Sign into your account for PlayStation Network and go to PlayStation Store to buy and download games (Account for PlayStation Network required). Lightning Speed - Harness the power of a custom CPU, GPU, and SSD with Integrated I/O that rewrite the rules of what a PlayStation console can do. Stunning Games - Marvel at incredible graphics and experience new PS5 features. Play a back catalog of supported PS4 games. Breathtaking Immersion - Discover a deeper gaming experience with support for haptic feedback, adaptive triggers, and 3D Audio technology. *3D audio via built-in TV speakers or analog/USB stereo headphones. Set up and latest system software update required. ',\n", 142 | " 'feature_bullets': ' \\n About this item Model Number CFI-2000 Includes DualSense Wireless Controller, 1TB SSD, 2 Horizontal Stand Feet, HDMI Cable, AC power cord, USB cable, printed materials, ASTRO’s PLAYROOM (Pre-installed game) Vertical Stand sold seperately \\n',\n", 143 | " 'rating': {'average': 4.7, 'count': 3538}}" 144 | ] 145 | }, 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "# data" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "id": "258e49fa-3e55-4dd1-bfd0-ad510d4ebd14", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "def find_product_table_data(html):\n", 163 | " soup = BeautifulSoup(html)\n", 164 | " product_data = soup.find('div', id='prodDetails')\n", 165 | " if product_data is None:\n", 166 | " return []\n", 167 | " table = product_data.find('table')\n", 168 | " columns = [f\"{x.text}\".strip() for x in table.find_all('th')]\n", 169 | " table_data=[]\n", 170 | " for i, row in enumerate(table.find_all('tr')): # [1:] to skip the header row\n", 171 | " # Get all cells in the row\n", 172 | " cells = row.find_all('td')\n", 173 | " # Create a dictionary for the current row, mapping header to cell data\n", 174 | " row_data = {columns[i]: f'{cell.text}'.strip() for cell in cells}\n", 175 | " # Add the dictionary to your list\n", 176 | " table_data.append(row_data)\n", 177 | " return table_data\n", 178 | "\n", 179 | "def find_product_rating(html):\n", 180 | " soup = BeautifulSoup(html)\n", 181 | " average_rating = soup.find(id='averageCustomerReviews').find_all(\"span\", class_='a-size-base')[0].text.strip()\n", 182 | " average_rating = \"\".join([x for x in f\"{average_rating}\".strip() if x.isdigit() or x == '.'])\n", 183 | " average_rating = float(average_rating)\n", 184 | " rating_data = soup.find(id='acrCustomerReviewText').text\n", 185 | " rating_count = int(''.join([x for x in rating_data if x.isdigit()]))\n", 186 | " rating_count\n", 187 | " return {\n", 188 | " 'average': average_rating,\n", 189 | " 'count': rating_count,\n", 190 | " }\n", 191 | "\n", 192 | "def extract_amazon_product_data(html):\n", 193 | " soup = BeautifulSoup(html)\n", 194 | " productTitle = soup.find('span', id='productTitle')\n", 195 | " productTitleText = f\"{productTitle.text}\".strip()\n", 196 | " productPrice = soup.find_all('span', class_='a-price-whole')[0]\n", 197 | " productPrice = f\"{productPrice.text}\".strip()\n", 198 | " productPriceText = \"\".join([x for x in productPrice if x.isdigit() or x == '.'])\n", 199 | " productPriceNum = float(productPriceText)\n", 200 | " try:\n", 201 | " productDescription = soup.find('div', id='productDescription').text\n", 202 | " except:\n", 203 | " productDescription = ''\n", 204 | " featureBullets = soup.find('div', id='feature-bullets').text\n", 205 | " return {\n", 206 | " 'title': productTitleText,\n", 207 | " 'price_raw': productPrice,\n", 208 | " 'price_text': productPriceText,\n", 209 | " 'price': productPriceNum,\n", 210 | " 'metadata': find_product_table_data(html),\n", 211 | " 'description': productDescription,\n", 212 | " 'feature_bullets': featureBullets,\n", 213 | " 'rating': find_product_rating(html)\n", 214 | " }" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "id": "10e4f2d5-4e87-47c4-a3be-793537b8255f", 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "html = helpers.scrape(url=url, solve_captcha=False)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "7e2bd341-d7a6-4c71-a691-8cfb51769fcf", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# with open('output.html', 'w+') as f:\n", 235 | "# f.write(html)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "71daf2fb-8fc3-4aeb-8912-e65df9105a4e", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "data = extract_amazon_product_data(html)\n", 246 | "data" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "a11ec32a-5afd-437b-a64e-6d54b4659ff2", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [] 256 | } 257 | ], 258 | "metadata": { 259 | "kernelspec": { 260 | "display_name": "Python 3 (ipykernel)", 261 | "language": "python", 262 | "name": "python3" 263 | }, 264 | "language_info": { 265 | "codemirror_mode": { 266 | "name": "ipython", 267 | "version": 3 268 | }, 269 | "file_extension": ".py", 270 | "mimetype": "text/x-python", 271 | "name": "python", 272 | "nbconvert_exporter": "python", 273 | "pygments_lexer": "ipython3", 274 | "version": "3.11.4" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 5 279 | } 280 | -------------------------------------------------------------------------------- /src/nbs/6 - Tracking Scraped Data with Django Models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "a18d1078-d317-44cd-a6f9-01641a439cd8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "setup.init_django(project_name='cfehome')" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 14, 17 | "id": "d5ab0966-17a0-4f8d-8b67-1344a19ab0a1", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import helpers\n", 22 | "from products.models import ProductScrapeEvent, Product" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "id": "93cd08b1-8d07-4f1a-9cad-69d2f8e3d35b", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "url = \"https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3\"" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "id": "b553ebdd-b86e-4475-8506-2723ef108ce4", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Connecting to Scraping Browser...\n", 46 | "Connected! Navigating to https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3\n", 47 | "Navigated! Scraping page content...\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "html = helpers.scrape(url=url, solve_captcha=False)\n", 53 | "data = helpers.extract_amazon_product_data(html)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "id": "d4d892e4-cb4e-400f-9e81-1caecf6e46b8", 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "{'asin': 'B0CL5KNB9M',\n", 66 | " 'title': 'PlayStation®5 Digital Edition (slim)',\n", 67 | " 'price_raw': '449.',\n", 68 | " 'price_text': '449.',\n", 69 | " 'price': 449.0,\n", 70 | " 'metadata': [{'ASIN': 'B0CL5KNB9M'},\n", 71 | " {'Release date': 'November 24, 2023'},\n", 72 | " {'Customer Reviews': '4.7 4.7 out of 5 stars \\n 3,538 ratings \\n\\n\\n 4.7 out of 5 stars'},\n", 73 | " {'Best Sellers Rank': '#216 in Video Games (See Top 100 in Video Games) #2 in PlayStation 5 Consoles'},\n", 74 | " {'Product Dimensions': '17 x 15 x 6 inches; 8.9 Pounds'},\n", 75 | " {'Type of item': 'Video Game'},\n", 76 | " {'Item model number': 'CFI-2000'},\n", 77 | " {'Item Weight': '8.9 pounds'},\n", 78 | " {'Manufacturer': 'Sony'},\n", 79 | " {'Country of Origin': 'China'},\n", 80 | " {'Batteries': '1 Lithium Ion batteries required. (included)'},\n", 81 | " {'Date First Available': 'November 24, 2023'}],\n", 82 | " 'description': ' \\n Play Like Never Before. The PS5 Digital Edition unleashes new gaming possibilities that you never anticipated. Experience lightning fast loading with an ultra-high speed SSD, deeper immersion with support for haptic feedback, adaptive triggers, and 3D Audio*,and an all-new generation of incredible PlayStation® games. PS5 Digital Edition is an all-digital version of the PS5 console with no disc drive. Sign into your account for PlayStation Network and go to PlayStation Store to buy and download games (Account for PlayStation Network required). Lightning Speed - Harness the power of a custom CPU, GPU, and SSD with Integrated I/O that rewrite the rules of what a PlayStation console can do. Stunning Games - Marvel at incredible graphics and experience new PS5 features. Play a back catalog of supported PS4 games. Breathtaking Immersion - Discover a deeper gaming experience with support for haptic feedback, adaptive triggers, and 3D Audio technology. *3D audio via built-in TV speakers or analog/USB stereo headphones. Set up and latest system software update required. ',\n", 83 | " 'feature_bullets': ' \\n About this item Model Number CFI-2000 Includes DualSense Wireless Controller, 1TB SSD, 2 Horizontal Stand Feet, HDMI Cable, AC power cord, USB cable, printed materials, ASTRO’s PLAYROOM (Pre-installed game) Vertical Stand sold seperately \\n',\n", 84 | " 'rating': {'average': 4.7, 'count': 3538}}" 85 | ] 86 | }, 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "data" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 12, 99 | "id": "7cf48a33-91d7-4afb-8e7f-f32c65a6ef95", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "" 106 | ] 107 | }, 108 | "execution_count": 12, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "ProductScrapeEvent.objects.create_scrape_event(data, url=url)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 13, 120 | "id": "68591fc9-1be3-4219-8fc9-d3f3e98fb99c", 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | ", , , ]>" 127 | ] 128 | }, 129 | "execution_count": 13, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "qs = ProductScrapeEvent.objects.all()\n", 136 | "qs" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 16, 142 | "id": "3dbffe27-bf93-4a72-9a3b-60efe3d3c075", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "]>" 149 | ] 150 | }, 151 | "execution_count": 16, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "product_qs = Product.objects.all()\n", 158 | "product_qs" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "c4379de2-ed36-4a62-9a2b-1a7a3404773b", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3 (ipykernel)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.11.4" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 5 191 | } 192 | -------------------------------------------------------------------------------- /src/nbs/7 - Trigger Scrape Task.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "id": "a18d1078-d317-44cd-a6f9-01641a439cd8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "setup.init_django(project_name='cfehome')" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 27, 17 | "id": "d5ab0966-17a0-4f8d-8b67-1344a19ab0a1", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import helpers\n", 22 | "from products.tasks import scrape_product_url_task, scrape_products_task" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 28, 28 | "id": "93cd08b1-8d07-4f1a-9cad-69d2f8e3d35b", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "url = \"https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3\"" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 31, 38 | "id": "c4379de2-ed36-4a62-9a2b-1a7a3404773b", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "" 45 | ] 46 | }, 47 | "execution_count": 31, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "scrape_product_url_task.delay(url)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 32, 59 | "id": "eaf49535-9717-46c1-88b7-f0a189e97009", 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "" 66 | ] 67 | }, 68 | "execution_count": 32, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "scrape_products_task.delay()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "5a8ce7a8-3d85-4e18-9b6c-22b605d050ae", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3 (ipykernel)", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.11.4" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 5 107 | } 108 | -------------------------------------------------------------------------------- /src/nbs/setup.py: -------------------------------------------------------------------------------- 1 | # https://www.codingforentrepreneurs.com/shorts/django-setup-for-use-in-jupyter-notebooks/ 2 | import os, sys 3 | 4 | PWD = os.getenv("PWD") 5 | DJANGO_PROJECT = os.environ.get("DJANGO_PROJECT") or "cfehome" 6 | DJANGO_ROOT_DIR = os.environ.get("DJANGO_ROOT_DIR") or "src" 7 | if not PWD.endswith(f"/{DJANGO_ROOT_DIR}"): 8 | # src is the django-root 9 | PWD = os.path.join(PWD, DJANGO_ROOT_DIR) 10 | 11 | 12 | PROJ_MISSING_MSG = """Set an enviroment variable:\n 13 | `DJANGO_PROJECT=your_project_name`\n 14 | or call:\n 15 | `init_django(project_name=your_project_name)` 16 | """ 17 | 18 | 19 | def init_django(project_name=None): 20 | os.chdir(PWD) 21 | dj_project_name = project_name or DJANGO_PROJECT 22 | if dj_project_name == None: 23 | raise Exception(PROJ_MISSING_MSG) 24 | sys.path.insert(0, os.getenv('PWD')) 25 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', f'{project_name}.settings') 26 | os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true" 27 | import django 28 | django.setup() -------------------------------------------------------------------------------- /src/products/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Web-Scraping-with-Django-Celery/d795c4fd1854c1c25afe443b3bac6d7874a5d2b1/src/products/__init__.py -------------------------------------------------------------------------------- /src/products/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | 5 | from .models import Product, ProductScrapeEvent 6 | 7 | 8 | admin.site.register(Product) 9 | 10 | admin.site.register(ProductScrapeEvent) -------------------------------------------------------------------------------- /src/products/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class ProductsConfig(AppConfig): 5 | default_auto_field = "django.db.models.BigAutoField" 6 | name = "products" 7 | -------------------------------------------------------------------------------- /src/products/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.3 on 2024-03-18 18:53 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | initial = True 9 | 10 | dependencies = [] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name="Product", 15 | fields=[ 16 | ( 17 | "id", 18 | models.BigAutoField( 19 | auto_created=True, 20 | primary_key=True, 21 | serialize=False, 22 | verbose_name="ID", 23 | ), 24 | ), 25 | ("asin", models.CharField(db_index=True, max_length=120, unique=True)), 26 | ("title", models.CharField(blank=True, max_length=220, null=True)), 27 | ( 28 | "current_price", 29 | models.FloatField(blank=True, default=0.0, null=True), 30 | ), 31 | ("timestamp", models.DateTimeField(auto_now_add=True)), 32 | ("updated", models.DateTimeField(auto_now=True)), 33 | ("metadata", models.JSONField(blank=True, null=True)), 34 | ], 35 | ), 36 | migrations.CreateModel( 37 | name="ProductScrapeEvent", 38 | fields=[ 39 | ( 40 | "id", 41 | models.BigAutoField( 42 | auto_created=True, 43 | primary_key=True, 44 | serialize=False, 45 | verbose_name="ID", 46 | ), 47 | ), 48 | ("url", models.URLField(blank=True, null=True)), 49 | ("data", models.JSONField(blank=True, null=True)), 50 | ("asin", models.CharField(db_index=True, max_length=120, unique=True)), 51 | ( 52 | "product", 53 | models.ForeignKey( 54 | on_delete=django.db.models.deletion.CASCADE, 55 | related_name="scrape_events", 56 | to="products.product", 57 | ), 58 | ), 59 | ], 60 | ), 61 | ] 62 | -------------------------------------------------------------------------------- /src/products/migrations/0002_alter_productscrapeevent_asin.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.3 on 2024-03-18 18:55 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("products", "0001_initial"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AlterField( 13 | model_name="productscrapeevent", 14 | name="asin", 15 | field=models.CharField(blank=True, max_length=120, null=True), 16 | ), 17 | ] 18 | -------------------------------------------------------------------------------- /src/products/migrations/0003_product__trigger_scrape_product_trigger_scrape_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.3 on 2024-03-18 19:07 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("products", "0002_alter_productscrapeevent_asin"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AddField( 13 | model_name="product", 14 | name="_trigger_scrape", 15 | field=models.BooleanField(default=False), 16 | ), 17 | migrations.AddField( 18 | model_name="product", 19 | name="trigger_scrape", 20 | field=models.BooleanField(default=False), 21 | ), 22 | migrations.AddField( 23 | model_name="product", 24 | name="url", 25 | field=models.URLField(blank=True, null=True), 26 | ), 27 | ] 28 | -------------------------------------------------------------------------------- /src/products/migrations/0004_remove_product__trigger_scrape_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.3 on 2024-03-18 19:45 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("products", "0003_product__trigger_scrape_product_trigger_scrape_and_more"), 9 | ] 10 | 11 | operations = [ 12 | migrations.RemoveField( 13 | model_name="product", 14 | name="_trigger_scrape", 15 | ), 16 | migrations.RemoveField( 17 | model_name="product", 18 | name="trigger_scrape", 19 | ), 20 | migrations.AddField( 21 | model_name="product", 22 | name="active", 23 | field=models.BooleanField(default=False), 24 | ), 25 | ] 26 | -------------------------------------------------------------------------------- /src/products/migrations/0005_alter_product_active.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.3 on 2024-03-18 19:45 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("products", "0004_remove_product__trigger_scrape_and_more"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AlterField( 13 | model_name="product", 14 | name="active", 15 | field=models.BooleanField(default=True, help_text="Scrape daily?"), 16 | ), 17 | ] 18 | -------------------------------------------------------------------------------- /src/products/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Web-Scraping-with-Django-Celery/d795c4fd1854c1c25afe443b3bac6d7874a5d2b1/src/products/migrations/__init__.py -------------------------------------------------------------------------------- /src/products/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | # Create your models here. 4 | 5 | from .tasks import scrape_product_url_task 6 | 7 | class Product(models.Model): 8 | asin = models.CharField(max_length=120, unique=True, db_index=True) 9 | url = models.URLField(blank=True, null=True) 10 | title = models.CharField(max_length=220, blank=True, null=True) 11 | current_price = models.FloatField(blank=True, null=True, default=0.00) 12 | timestamp = models.DateTimeField(auto_now_add=True) 13 | updated = models.DateTimeField(auto_now=True) 14 | metadata = models.JSONField(null=True, blank=True) 15 | active = models.BooleanField(default=True, help_text="Scrape daily?") 16 | 17 | 18 | 19 | class ProductScrapeEventManager(models.Manager): 20 | def create_scrape_event(self, data, url=None): 21 | asin = data.get('asin') or None 22 | if asin is None: 23 | return None 24 | product, _ = Product.objects.update_or_create( 25 | asin=asin, 26 | defaults={ 27 | "url": url, 28 | "title": data.get('title') or "", 29 | "current_price": data.get('price') or 0.00, 30 | "metadata": data, 31 | } 32 | ) 33 | event = self.create( 34 | product=product, 35 | url=url, 36 | asin=asin, 37 | data=data, 38 | ) 39 | return event 40 | 41 | 42 | class ProductScrapeEvent(models.Model): 43 | product = models.ForeignKey(Product, on_delete=models.CASCADE, related_name='scrape_events') 44 | url = models.URLField(blank=True, null=True) 45 | data = models.JSONField(null=True, blank=True) 46 | asin = models.CharField(max_length=120, null=True, blank=True) 47 | 48 | objects = ProductScrapeEventManager() -------------------------------------------------------------------------------- /src/products/tasks.py: -------------------------------------------------------------------------------- 1 | from django.apps import apps 2 | from celery import shared_task 3 | import helpers 4 | 5 | 6 | @shared_task 7 | def scrape_product_url_task(url): 8 | if url is None: 9 | return 10 | elif url == "": 11 | return 12 | ProductScrapeEvent = apps.get_model('products', 'ProductScrapeEvent') 13 | # open the url 14 | html = helpers.scrape(url=url, solve_captcha=False) 15 | # scrape the url 16 | data = helpers.extract_amazon_product_data(html) 17 | # save the scraped data 18 | ProductScrapeEvent.objects.create_scrape_event(data, url=url) 19 | return 20 | 21 | 22 | @shared_task 23 | def scrape_products_task(): 24 | Product = apps.get_model('products', 'Product') 25 | qs = Product.objects.filter(active=True) 26 | for obj in qs: 27 | url = obj.url 28 | scrape_product_url_task.delay(url) -------------------------------------------------------------------------------- /src/products/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /src/products/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 4 | --------------------------------------------------------------------------------