├── .gitignore
├── LICENSE
├── README.md
├── commands.md
├── compose.yaml
├── django-celery-redis.code-workspace
├── requirements.txt
└── src
├── .sample.env
├── cfehome
├── __init__.py
├── asgi.py
├── celery.py
├── settings.py
├── urls.py
└── wsgi.py
├── helpers
├── __init__.py
├── amazon.py
└── brightdata.py
├── manage.py
├── movies
├── __init__.py
├── admin.py
├── apps.py
├── migrations
│ └── __init__.py
├── models.py
├── tasks.py
├── tests.py
└── views.py
├── nbs
├── 1 - Hello World with Selenium.ipynb
├── 2 - Proxy Scraping with Bright Data and Selenium.ipynb
├── 3 - Helper-based Proxy Scraping with Bright Data and Selenium.ipynb
├── 4 - Parse HTML Data with BeautifulSoup.ipynb
├── 5 - Amazon Captcha and Prepare Parser Helper Functions.ipynb
├── 6 - Tracking Scraped Data with Django Models.ipynb
├── 7 - Trigger Scrape Task.ipynb
└── setup.py
└── products
├── __init__.py
├── admin.py
├── apps.py
├── migrations
├── 0001_initial.py
├── 0002_alter_productscrapeevent_asin.py
├── 0003_product__trigger_scrape_product_trigger_scrape_and_more.py
├── 0004_remove_product__trigger_scrape_and_more.py
├── 0005_alter_product_active.py
└── __init__.py
├── models.py
├── tasks.py
├── tests.py
└── views.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | .DS_Store
162 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Coding For Entrepreneurs
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping on a Schedule with Django & Celery
2 | Learn how to schedule regular web scraping, save the data, and more with Django & Celery.
3 |
4 | Topics:
5 |
6 | - Django
7 | - Celery
8 | - Selenium
9 | - Scraped Data to Database via Django
10 | - Reliable Web Scraping with Selenium + Bright Data
11 |
12 | References:
13 | - [Celery + Redis + Django configuration guide](https://www.codingforentrepreneurs.com/blog/celery-redis-django/)
14 | - Django + Celery Redis [blank project code](https://github.com/codingforentrepreneurs/Django-Celery-Redis)
15 | - Django + Jupyter Setup Module [short + code](https://www.codingforentrepreneurs.com/shorts/django-setup-for-use-in-jupyter-notebooks/)
16 |
17 | Requirements:
18 | - Django experience such as Try Django (on [YouTube](https://www.youtube.com/playlist?list=PLEsfXFp6DpzRMby_cSoWTFw8zaMdTEXgL) or on [CFE](https://www.codingforentrepreneurs.com/topics/try-django/)) or [Your First Django Project](https://www.codingforentrepreneurs.com/courses/your-first-django-project/).
19 | - Redis Instance
20 | - Setup Redis on Windows [blog post](https://www.codingforentrepreneurs.com/blog/redis-on-windows/)
21 | - Setup Redis on MacOS or Linux [blog post](https://www.codingforentrepreneurs.com/blog/install-redis-mac-and-linux)
22 | - Setup Redis on Remote Virtual Machine [blog post](https://www.codingforentrepreneurs.com/blog/remote-redis-servers-for-development/)
23 | - How I use Redis for new projects [short + code](https://www.codingforentrepreneurs.com/shorts/how-i-use-redis-for-new-projects-with-docker-compose/)
24 | - A Bright Data Account [$25 credit for new accounts](https://brdta.com/justin)
25 |
26 | ## Getting Started
27 |
28 | ```bash
29 | git clone https://github.com/codingforentrepreneurs/Django-Celery-Redis
30 | mv Django-Celery-Redis scrape-scheduler
31 | cd scrape-scheduler
32 | ```
33 |
34 | `macos/linux`
35 | ```
36 | python3 -m venv venv
37 | source venv/bin/activate
38 | ```
39 |
40 | `windows`
41 | ```
42 | c:\Python311\python.exe -m venv venv
43 | .\venv\Scripts\activate
44 | ```
45 |
46 | Install requirements
47 | ```bash
48 | python -m pip install pip --upgrade
49 | python -m pip install -r requirements.txt
50 | ```
51 |
52 | Run a local redis instance via Docker Compose
53 | ```bash
54 | docker compose -f compose.yaml up -d
55 | ```
56 | This will give us `redis://localhost:6170`
57 |
58 | Create `.env` in `src/.env` with:
59 |
60 | ```bash
61 | CELERY_BROKER_REDIS_URL="redis://localhost:6170"
62 | DEBUG=True
63 | ```
64 |
65 | Navigate into your Django root:
66 |
67 | ```bash
68 | cd src/
69 | ls
70 | ```
71 | You should see at least `cfehome/` and `manage.py`.
72 |
73 | Run your project in 2 terminals:
74 | - `python manage.py runserver`
75 | - `celery -A cfehome worker --beat`
76 |
77 | Let's go!
78 |
--------------------------------------------------------------------------------
/commands.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ```
4 | docker compose up
5 | ```
6 |
7 | ```
8 | celery -A cfehome worker --beat -l info
9 | ```
10 |
11 | ```
12 | python manage.py shell
13 | ```
14 |
15 | ```
16 | python manage.py runserver
17 | ```
18 |
19 | ```
20 | jupyter notebook
21 | ```
22 |
23 |
--------------------------------------------------------------------------------
/compose.yaml:
--------------------------------------------------------------------------------
1 | # read more on https://www.codingforentrepreneurs.com/shorts/how-i-use-redis-for-new-projects-with-docker-compose/
2 | version: '3.9'
3 | services:
4 | redis:
5 | image: redis
6 | restart: always
7 | ports:
8 | - 6170:6379
9 | volumes:
10 | - data:/data
11 | entrypoint: redis-server --appendonly yes
12 | volumes:
13 | data:
14 |
15 | networks:
16 | default:
17 | name: scrape_scheduler_network
18 |
--------------------------------------------------------------------------------
/django-celery-redis.code-workspace:
--------------------------------------------------------------------------------
1 | {
2 | "folders": [
3 | {
4 | "path": "."
5 | }
6 | ],
7 | "settings": {}
8 | }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Celery
2 | Django
3 | django-celery-beat
4 | django-celery-results
5 | python-decouple
6 | redis
7 | jupyter
8 | selenium
9 | beautifulsoup4
--------------------------------------------------------------------------------
/src/.sample.env:
--------------------------------------------------------------------------------
1 | # celery via Docker compose
2 | CELERY_BROKER_REDIS_URL="redis://localhost:6170"
3 | DEBUG=True
4 | DJANGO_SECRET_KEY="django-insecure-g4x&pvo@a^5s&e51s$+tuk_aaf)rdcu19v_f@d*iqp0opzoy#4"
5 | SBR_WEBDRIVER="https://{user}:{pw}@{host}:{port}"
--------------------------------------------------------------------------------
/src/cfehome/__init__.py:
--------------------------------------------------------------------------------
1 | from .celery import app as celery_app
2 |
3 | __all__ = ["celery_app"]
4 |
--------------------------------------------------------------------------------
/src/cfehome/asgi.py:
--------------------------------------------------------------------------------
1 | """
2 | ASGI config for cfehome project.
3 |
4 | It exposes the ASGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.asgi import get_asgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cfehome.settings")
15 |
16 | application = get_asgi_application()
17 |
--------------------------------------------------------------------------------
/src/cfehome/celery.py:
--------------------------------------------------------------------------------
1 | # path/to/your/proj/src/cfehome/celery.py
2 | import os
3 |
4 | from celery import Celery
5 | from celery.schedules import crontab
6 |
7 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cfehome.settings")
8 |
9 | app = Celery("cfehome")
10 |
11 | # Using a string here means the worker doesn't have to serialize
12 | # the configuration object to child processes.
13 | # - namespace='CELERY' means all celery-related configuration keys
14 | # should have a `CELERY_` prefix.
15 | app.config_from_object("django.conf:settings", namespace="CELERY")
16 |
17 | # Load task modules from all registered Django app configs.
18 | app.autodiscover_tasks()
19 |
20 | # We used CELERY_BROKER_URL in settings.py instead of:
21 | # app.conf.broker_url = ''
22 |
23 | # We used CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP in settings.py instead of:
24 | # app.conf.broker_connection_retry_on_startup = True
25 |
26 | # We used CELERY_BEAT_SCHEDULER in settings.py instead of:
27 | # app.conf.beat_scheduler = 'django_celery_beat.schedulers.DatabaseScheduler'
28 |
29 |
30 | # Below is for illustration purposes. We configured our project
31 | # So we can perform all kinds of scheduling in the Django admin
32 | # under Periodic Tasks.
33 | # app.conf.beat_schedule = {
34 | # "multiply-task-crontab": {
35 | # "task": "multiply_two_numbers",
36 | # "schedule": crontab(hour=7, minute=30, day_of_week=1),
37 | # "args": (16, 16),
38 | # },
39 | # "multiply-every-5-seconds": {
40 | # "task": "multiply_two_numbers",
41 | # "schedule": 5.0,
42 | # "args": (16, 16),
43 | # },
44 | # "add-every-30-seconds": {
45 | # "task": "movies.tasks.add",
46 | # "schedule": 30.0,
47 | # "args": (16, 16),
48 | # },
49 | # }
50 |
--------------------------------------------------------------------------------
/src/cfehome/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for cfehome project.
3 |
4 | Generated by 'django-admin startproject' using Django 5.0.2.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/5.0/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/5.0/ref/settings/
11 | """
12 |
13 | from pathlib import Path
14 |
15 | from decouple import config
16 |
17 | # Build paths inside the project like this: BASE_DIR / 'subdir'.
18 | BASE_DIR = Path(__file__).resolve().parent.parent
19 |
20 |
21 | # Quick-start development settings - unsuitable for production
22 | # See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/
23 |
24 | # SECURITY WARNING: keep the secret key used in production secret!
25 | SECRET_KEY = config("DJANGO_SECRET_KEY", default=None)
26 |
27 | # SECURITY WARNING: don't run with debug turned on in production!
28 | DEBUG = config("DEBUG", cast=bool, default=False)
29 |
30 | ALLOWED_HOSTS = []
31 |
32 |
33 | # Application definition
34 |
35 | INSTALLED_APPS = [
36 | "django.contrib.admin",
37 | "django.contrib.auth",
38 | "django.contrib.contenttypes",
39 | "django.contrib.sessions",
40 | "django.contrib.messages",
41 | "django.contrib.staticfiles",
42 | "django_celery_beat",
43 | "django_celery_results",
44 | "movies",
45 | "products",
46 | ]
47 |
48 | MIDDLEWARE = [
49 | "django.middleware.security.SecurityMiddleware",
50 | "django.contrib.sessions.middleware.SessionMiddleware",
51 | "django.middleware.common.CommonMiddleware",
52 | "django.middleware.csrf.CsrfViewMiddleware",
53 | "django.contrib.auth.middleware.AuthenticationMiddleware",
54 | "django.contrib.messages.middleware.MessageMiddleware",
55 | "django.middleware.clickjacking.XFrameOptionsMiddleware",
56 | ]
57 |
58 | ROOT_URLCONF = "cfehome.urls"
59 |
60 | TEMPLATES = [
61 | {
62 | "BACKEND": "django.template.backends.django.DjangoTemplates",
63 | "DIRS": [],
64 | "APP_DIRS": True,
65 | "OPTIONS": {
66 | "context_processors": [
67 | "django.template.context_processors.debug",
68 | "django.template.context_processors.request",
69 | "django.contrib.auth.context_processors.auth",
70 | "django.contrib.messages.context_processors.messages",
71 | ],
72 | },
73 | },
74 | ]
75 |
76 | WSGI_APPLICATION = "cfehome.wsgi.application"
77 |
78 |
79 | # Database
80 | # https://docs.djangoproject.com/en/5.0/ref/settings/#databases
81 |
82 | DATABASES = {
83 | "default": {
84 | "ENGINE": "django.db.backends.sqlite3",
85 | "NAME": BASE_DIR / "db.sqlite3",
86 | }
87 | }
88 |
89 |
90 | # Password validation
91 | # https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators
92 |
93 | AUTH_PASSWORD_VALIDATORS = [
94 | {
95 | "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
96 | },
97 | {
98 | "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
99 | },
100 | {
101 | "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
102 | },
103 | {
104 | "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
105 | },
106 | ]
107 |
108 |
109 | # Internationalization
110 | # https://docs.djangoproject.com/en/5.0/topics/i18n/
111 |
112 | LANGUAGE_CODE = "en-us"
113 |
114 | TIME_ZONE = "UTC"
115 |
116 | USE_I18N = True
117 |
118 | USE_TZ = True
119 |
120 |
121 | # Static files (CSS, JavaScript, Images)
122 | # https://docs.djangoproject.com/en/5.0/howto/static-files/
123 |
124 | STATIC_URL = "static/"
125 |
126 | # Default primary key field type
127 | # https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field
128 |
129 | DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
130 |
131 |
132 | # save Celery task results in Django's database
133 | CELERY_RESULT_BACKEND = "django-db"
134 |
135 | # broker_connection_retry_on_startup
136 | CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP = True
137 |
138 | # This configures Redis as the datastore between Django + Celery
139 | CELERY_BROKER_URL = config("CELERY_BROKER_REDIS_URL", default="redis://localhost:6379")
140 | # if you want to use os.environ instead, the config is:
141 | # CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_REDIS_URL', 'redis://localhost:6379')
142 |
143 | # this allows you to schedule items in the Django admin.
144 | CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers.DatabaseScheduler"
145 |
--------------------------------------------------------------------------------
/src/cfehome/urls.py:
--------------------------------------------------------------------------------
1 | """
2 | URL configuration for cfehome project.
3 |
4 | The `urlpatterns` list routes URLs to views. For more information please see:
5 | https://docs.djangoproject.com/en/5.0/topics/http/urls/
6 | Examples:
7 | Function views
8 | 1. Add an import: from my_app import views
9 | 2. Add a URL to urlpatterns: path('', views.home, name='home')
10 | Class-based views
11 | 1. Add an import: from other_app.views import Home
12 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
13 | Including another URLconf
14 | 1. Import the include() function: from django.urls import include, path
15 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
16 | """
17 | from django.contrib import admin
18 | from django.urls import path
19 |
20 | urlpatterns = [
21 | path("admin/", admin.site.urls),
22 | ]
23 |
--------------------------------------------------------------------------------
/src/cfehome/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for cfehome project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cfehome.settings")
15 |
16 | application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/src/helpers/__init__.py:
--------------------------------------------------------------------------------
1 | from .amazon import extract_amazon_product_data
2 | from .brightdata import scrape
3 |
4 | __all__ = ['extract_amazon_product_data', 'scrape']
--------------------------------------------------------------------------------
/src/helpers/amazon.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 |
def find_product_table_data(html):
    """Extract the rows of the product-details table as a list of dicts.

    Looks up the ``div`` with id ``prodDetails`` and reads the first
    ``table`` inside it. Every ``tr`` contributes one dict that maps the
    text of that row's own ``th`` to the text of its ``td``.

    Args:
        html: HTML markup of a product page.

    Returns:
        A list with one dict per table row, in document order. The list is
        empty when the details section or its table is missing. A row that
        has no header cell or no data cells contributes an empty dict.
    """
    soup = BeautifulSoup(html, "html.parser")
    product_data = soup.find('div', id='prodDetails')
    if product_data is None:
        return []
    table = product_data.find('table')
    if table is None:
        # The details section exists but holds no table to read.
        return []
    table_data = []
    for row in table.find_all('tr'):
        # Pair each row with its *own* header cell. Indexing a table-wide
        # list of headers by row number goes out of step (or out of range)
        # as soon as a single row has no <th>.
        header = row.find('th')
        cells = row.find_all('td')
        if header is None or not cells:
            table_data.append({})
            continue
        key = header.text.strip()
        # One value per key: with several data cells the last one is kept.
        table_data.append({key: cells[-1].text.strip()})
    return table_data
19 |
20 |
def find_product_rating(html):
    """Read the average rating and the number of ratings from a page.

    The average is taken from the first ``span.a-size-base`` inside the
    element with id ``averageCustomerReviews``; the count is taken from the
    element with id ``acrCustomerReviewText``. In both cases only digits
    (and, for the average, dots) of the element text are kept before
    converting.

    Args:
        html: HTML markup of a product page.

    Returns:
        A dict with the keys ``'average'`` (float) and ``'count'`` (int).
        A value is ``None`` when its element is missing from the page or
        its text holds nothing that can be converted to a number.
    """
    soup = BeautifulSoup(html, "html.parser")

    average_rating = None
    reviews = soup.find(id='averageCustomerReviews')
    if reviews is not None:
        spans = reviews.find_all("span", class_='a-size-base')
        if spans:
            average_text = "".join(
                ch for ch in spans[0].text.strip() if ch.isdigit() or ch == '.'
            )
            try:
                average_rating = float(average_text)
            except ValueError:
                # Text without a usable number (e.g. empty after filtering).
                average_rating = None

    rating_count = None
    count_element = soup.find(id='acrCustomerReviewText')
    if count_element is not None:
        digits = ''.join(ch for ch in count_element.text if ch.isdigit())
        if digits:
            rating_count = int(digits)

    return {
        'average': average_rating,
        'count': rating_count,
    }
33 |
def extract_amazon_product_data(html):
    """Parse the HTML of an Amazon product page into a dict of fields.

    Title and price are required; the description and the feature bullets
    are optional and fall back to an empty string when their element is
    not on the page. The ASIN is taken from the first product-details row
    that carries an ``"ASIN"`` key, or is ``''`` when no row does.

    Args:
        html: HTML markup of the product page.

    Returns:
        A dict with the keys ``asin``, ``title``, ``price_raw``,
        ``price_text``, ``price``, ``metadata``, ``description``,
        ``feature_bullets`` and ``rating``.

    Raises:
        AttributeError: If the page has no ``span`` with id
            ``productTitle``.
        IndexError: If the page has no ``span`` with class
            ``a-price-whole``.
        ValueError: If the price text cannot be converted to a float.
    """
    soup = BeautifulSoup(html, "html.parser")

    title_element = soup.find('span', id='productTitle')
    title_text = title_element.text.strip()

    price_raw = soup.find_all('span', class_='a-price-whole')[0].text.strip()
    # Keep digits and dots only, so thousands separators are dropped.
    price_text = "".join(ch for ch in price_raw if ch.isdigit() or ch == '.')
    price = float(price_text)

    # Optional sections: a missing element yields '' instead of an error.
    description_element = soup.find('div', id='productDescription')
    description = '' if description_element is None else description_element.text
    bullets_element = soup.find('div', id='feature-bullets')
    feature_bullets = '' if bullets_element is None else bullets_element.text

    metadata_items = find_product_table_data(html)
    asin = next(
        (row["ASIN"] for row in metadata_items if row.get("ASIN") is not None),
        '',
    )

    return {
        'asin': asin,
        'title': title_text,
        'price_raw': price_raw,
        'price_text': price_text,
        'price': price,
        'metadata': metadata_items,
        'description': description,
        'feature_bullets': feature_bullets,
        'rating': find_product_rating(html),
    }
--------------------------------------------------------------------------------
/src/helpers/brightdata.py:
--------------------------------------------------------------------------------
1 | from selenium.webdriver import Remote, ChromeOptions
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection
4 | from decouple import config
5 | from urllib.parse import urljoin, urlparse
6 |
7 |
8 | SBR_WEBDRIVER = config('SBR_WEBDRIVER', default=None)
9 |
10 |
def scrape(url=None, body_only=True, solve_captcha=False, wait_seconds=0):
    """Load ``url`` in the remote Scraping Browser and return its HTML.

    A remote Chrome session is opened against the endpoint stored in
    ``SBR_WEBDRIVER`` and closed again when the function returns. Before
    navigating, the URL is reduced to scheme, host and path, which drops
    the query string and fragment (a URL with an empty path is used
    unchanged).

    Args:
        url: Absolute URL of the page to load. Required despite the
            ``None`` default.
        body_only: When true, return the ``innerHTML`` of ``<body>``
            instead of the full page source.
        solve_captcha: When true, run the ``Captcha.waitForSolve`` CDP
            command after navigating and print the reported status.
        wait_seconds: When greater than zero, passed to the driver's
            ``implicitly_wait``.

    Returns:
        The page HTML as a string.

    Raises:
        ValueError: If ``url`` is empty or ``None``.
        RuntimeError: If ``SBR_WEBDRIVER`` is not configured.
    """
    # Validate before connecting so that a bad call never opens a remote
    # browser session just to fail on navigation.
    if not url:
        raise ValueError("scrape() requires a non-empty url")
    if not SBR_WEBDRIVER:
        raise RuntimeError("SBR_WEBDRIVER is not configured")

    print('Connecting to Scraping Browser...')
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome')
    html = ""
    url = urljoin(url, urlparse(url).path)
    with Remote(sbr_connection, options=ChromeOptions()) as driver:
        print(f'Connected! Navigating to {url}')
        driver.get(url)
        if wait_seconds > 0:
            # NOTE(review): implicitly_wait sets the element-lookup timeout
            # of the session; it does not pause here. Confirm this is the
            # intended meaning of ``wait_seconds``.
            driver.implicitly_wait(wait_seconds)
        if solve_captcha:
            solve_res = driver.execute('executeCdpCommand', {
                'cmd': 'Captcha.waitForSolve',
                'params': {'detectTimeout': 10000},
            })
            print('Captcha solve status:', solve_res['value']['status'])
        print('Navigated! Scraping page content...')
        html = driver.page_source
        if body_only:
            body = driver.find_element(By.TAG_NAME, "body")
            html = body.get_attribute('innerHTML')
    return html
--------------------------------------------------------------------------------
/src/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Django's command-line utility for administrative tasks."""
3 | import os
4 | import sys
5 |
6 |
def main():
    """Select the project settings module and hand ``sys.argv`` to Django."""
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cfehome.settings")
    try:
        from django.core.management import execute_from_command_line
    except ImportError as import_error:
        # Re-raise with a hint about the usual causes of a missing Django.
        message = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(message) from import_error
    else:
        execute_from_command_line(sys.argv)
19 |
20 |
21 | if __name__ == "__main__":
22 | main()
23 |
--------------------------------------------------------------------------------
/src/movies/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Web-Scraping-with-Django-Celery/d795c4fd1854c1c25afe443b3bac6d7874a5d2b1/src/movies/__init__.py
--------------------------------------------------------------------------------
/src/movies/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | # Register your models here.
4 |
--------------------------------------------------------------------------------
/src/movies/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 |
3 |
class MoviesConfig(AppConfig):
    """Application configuration for the ``movies`` app."""

    # Dotted name of the app this configuration belongs to.
    name = "movies"
    # Primary-key field type for models that do not declare one.
    default_auto_field = "django.db.models.BigAutoField"
7 |
--------------------------------------------------------------------------------
/src/movies/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Web-Scraping-with-Django-Celery/d795c4fd1854c1c25afe443b3bac6d7874a5d2b1/src/movies/migrations/__init__.py
--------------------------------------------------------------------------------
/src/movies/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 |
3 | # Create your models here.
4 |
--------------------------------------------------------------------------------
/src/movies/tasks.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | from celery import shared_task
4 |
5 |
@shared_task
def add(x, y):
    """Return ``x + y``.

    The task name is purposefully omitted from the decorator, so Celery
    recognizes this task as ``movies.tasks.add``.
    """
    result = x + y
    return result
11 |
12 |
@shared_task(name="multiply_two_numbers")
def mul(x, y):
    """Return ``x`` times ``y`` times a random integer between 3 and 100.

    Celery recognizes this as the ``multiply_two_numbers`` task because the
    name is set explicitly in the decorator.
    """
    factor = random.randint(3, 100)
    return x * (y * factor)
18 |
19 |
@shared_task(name="sum_list_numbers")
def xsum(numbers):
    """Return the sum of ``numbers``.

    Celery recognizes this as the ``sum_list_numbers`` task.
    """
    total = sum(numbers)
    return total
24 |
--------------------------------------------------------------------------------
/src/movies/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/src/movies/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 |
3 | # Create your views here.
4 |
--------------------------------------------------------------------------------
/src/nbs/4 - Parse HTML Data with BeautifulSoup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 63,
6 | "id": "439ab9a8-51cf-4591-836a-0cbbac9ff3d6",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import setup\n",
11 | "setup.init_django(project_name='cfehome')"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 64,
17 | "id": "5e6b875f-38aa-43d1-8bf6-2f51deb88170",
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "url = \"https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3\""
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 65,
27 | "id": "251aa699-5c55-4d25-a617-1d43047cfe5d",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "url = \"https://www.amazon.com/Microsoft-RRT-00001-Xbox-Enhanced-Protection-X/dp/B0CMFGD9C4/ref=sr_1_3?crid=O4D5MCNRPHZG&dib=eyJ2IjoiMSJ9.BesESVeGKfCLfmoE_Vj2bHkyIqHkjt6vWY83koTZ_cz2I8dLY4__kVTkLkSzkH-tXhAxpP2gQNyM5eVeTcHhvxw10QXkh4Y_4mErHWUhqbSt9lrSfOO_-FmaAKtnyeF8iomBtkjnfyUknCfgCkCLPUr4uTyEnz5WtXbXQI-WN6fXgPPlvAdJXHHZEW4PuI3no5Em_YuyrBMW5_lEcEiUTpPXfdpztJ9ofmYYPF6l8WI.dg-z2b_4ANwk_S5W2m6JFCSScCDS2b3idQV1r-yQOWQ&dib_tag=se&keywords=xbox+series+x&qid=1709681924&sprefix=xbox%2Caps%2C264&sr=8-3\""
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 66,
37 | "id": "f5db35da-6859-42ef-b5fe-81c55bc89bfa",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "\n",
42 | "import helpers"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 67,
48 | "id": "fadd2521-984e-41af-b5ce-0206c9dbcaca",
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "Connecting to Scraping Browser...\n",
56 | "Connected! Navigating to https://www.amazon.com/Microsoft-RRT-00001-Xbox-Enhanced-Protection-X/dp/B0CMFGD9C4/ref=sr_1_3?crid=O4D5MCNRPHZG&dib=eyJ2IjoiMSJ9.BesESVeGKfCLfmoE_Vj2bHkyIqHkjt6vWY83koTZ_cz2I8dLY4__kVTkLkSzkH-tXhAxpP2gQNyM5eVeTcHhvxw10QXkh4Y_4mErHWUhqbSt9lrSfOO_-FmaAKtnyeF8iomBtkjnfyUknCfgCkCLPUr4uTyEnz5WtXbXQI-WN6fXgPPlvAdJXHHZEW4PuI3no5Em_YuyrBMW5_lEcEiUTpPXfdpztJ9ofmYYPF6l8WI.dg-z2b_4ANwk_S5W2m6JFCSScCDS2b3idQV1r-yQOWQ&dib_tag=se&keywords=xbox+series+x&qid=1709681924&sprefix=xbox%2Caps%2C264&sr=8-3\n",
57 | "Navigated! Scraping page content...\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "html = helpers.scrape(url=url)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 68,
68 | "id": "346d4e92-b6ec-4101-848b-8b77af70b2dd",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from bs4 import BeautifulSoup"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 69,
78 | "id": "f9461659-a112-40f5-947f-a95c5002fec1",
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "soup = BeautifulSoup(html)"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "id": "0d436797-3317-4a4b-ad0c-61d5355e4bcf",
88 | "metadata": {},
89 | "source": [
90 | "```html\n",
91 | " \n",
92 | " PlayStation®5 Digital Edition (slim) \n",
93 | "\n",
94 | "\n",
95 | "
\n",
96 | "
\n",
97 | " PlayStation®5 Digital Edition (slim) \n",
98 | "
\n",
99 | "
\n",
100 | "
\n",
101 | "```"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 70,
107 | "id": "b20623ab-099e-402f-adbb-51d46e2abfff",
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "data": {
112 | "text/plain": [
113 | "'Microsoft RRT-00001 Xbox Series X 1TB SSD Bundle with 2 YR CPS Enhanced Protection Pack'"
114 | ]
115 | },
116 | "execution_count": 70,
117 | "metadata": {},
118 | "output_type": "execute_result"
119 | }
120 | ],
121 | "source": [
122 | "productTitle = soup.find('span', id='productTitle')\n",
123 | "productTitleText = f\"{productTitle.text}\".strip()\n",
124 | "productTitleText"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "id": "a38928b6-ee4a-4bf1-a706-94249f103712",
130 | "metadata": {},
131 | "source": [
132 | "```html\n",
133 | "<span class=\"a-price-whole\">449.</span>\n",
134 | "```"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "40ba7cf5-74ab-4f40-a661-eb2098263435",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": []
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 71,
148 | "id": "4839ac54-5962-42c3-af3a-203eca77420e",
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "data": {
153 | "text/plain": [
154 | "481.0"
155 | ]
156 | },
157 | "execution_count": 71,
158 | "metadata": {},
159 | "output_type": "execute_result"
160 | }
161 | ],
162 | "source": [
163 | "productPrice = soup.find_all('span', class_='a-price-whole')[0]\n",
164 | "productPrice\n",
165 | "productPriceText = \"\".join([x for x in f\"{productPrice.text}\".strip() if x.isdigit() or x == '.'])\n",
166 | "productPriceNum = float(productPriceText)\n",
167 | "productPriceNum"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 72,
173 | "id": "40c32d52-edeb-4924-87aa-b08efe73d838",
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "data": {
178 | "text/plain": [
179 | "['ASIN',\n",
180 | " 'Customer Reviews',\n",
181 | " 'Best Sellers Rank',\n",
182 | " 'Product Dimensions',\n",
183 | " 'Item model number',\n",
184 | " 'Item Weight',\n",
185 | " 'Manufacturer',\n",
186 | " 'Date First Available']"
187 | ]
188 | },
189 | "execution_count": 72,
190 | "metadata": {},
191 | "output_type": "execute_result"
192 | }
193 | ],
194 | "source": [
195 | "product_data = soup.find('div', id='prodDetails')\n",
196 | "table = product_data.find('table')\n",
197 | "columns = [f\"{x.text}\".strip() for x in table.find_all('th')]\n",
198 | "columns"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 73,
204 | "id": "f6426180-b88d-4be1-8455-c90b73e8bbc1",
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "table_data=[]\n",
209 | "for i, row in enumerate(table.find_all('tr')): # every row is visited (no [1:] slice); row i is keyed by columns[i]\n",
210 | " # Get all cells in the row\n",
211 | " cells = row.find_all('td')\n",
212 | " # Create a dictionary for the current row, mapping header to cell data\n",
213 | " row_data = {columns[i]: f'{cell.text}'.strip() for cell in cells}\n",
214 | " # Add the dictionary to your list\n",
215 | " table_data.append(row_data)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 74,
221 | "id": "5d3c27ef-4e11-45ea-9b1c-5f6cf6bd8ef0",
222 | "metadata": {},
223 | "outputs": [
224 | {
225 | "data": {
226 | "text/plain": [
227 | "[{'ASIN': 'B0CMFGD9C4'},\n",
228 | " {'Customer Reviews': '4.5 4.5 out of 5 stars \\n 48 ratings \\n\\n\\n 4.5 out of 5 stars'},\n",
229 | " {'Best Sellers Rank': '#1,892 in Video Games (See Top 100 in Video Games) #40 in Xbox Accessories #101 in Xbox Series X & S Accessories'},\n",
230 | " {'Product Dimensions': '5.9 x 5.9 x 11.9 inches; 9.8 Pounds'},\n",
231 | " {'Item model number': 'E99MSRRT00001'},\n",
232 | " {'Item Weight': '9.8 pounds'},\n",
233 | " {'Manufacturer': 'Microsoft'},\n",
234 | " {'Date First Available': 'November 3, 2023'}]"
235 | ]
236 | },
237 | "execution_count": 74,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | }
241 | ],
242 | "source": [
243 | "table_data"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 75,
249 | "id": "48c9c6b6-d419-4d6a-97b2-b0e1ee37f726",
250 | "metadata": {},
251 | "outputs": [
252 | {
253 | "data": {
254 | "text/plain": [
255 | "['B0CMFGD9C4']"
256 | ]
257 | },
258 | "execution_count": 75,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": [
264 | "elements_with_attribute = soup.find_all(lambda tag: tag.has_attr('data-csa-c-asin'))\n",
265 | "asins = [x.attrs.get('data-csa-c-asin') for x in elements_with_attribute if x]\n",
266 | "asins = list(set([x for x in asins if x != \"\"]))\n",
267 | "asins"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 76,
273 | "id": "04209cfe-d944-4f9a-ac75-9246f241a2c8",
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "\" \\n The all-new XBOX SERIES X THE FASTEST, MOST POWERFUL XBOX EVER POWER YOUR DREAMS OPTIMIZED FOR SERIES X|S Games built with the Xbox Series X|S development kit showcase significantly reduced load times and stunning visuals at up to 120FPS. GET IT ONCE With Smart Delivery, you can buy a supported game once and always have the best available version for whatever console you play on. GAME ON From future adventures, to current obsessions, to classic titles, thousands of favorites across four generations of Xbox look and play best on Xbox Series X. 12 TFLOPS OF POWER The 12 teraflops of processing power housed in the system on a chip (SOC) work with AMD’s Zen 2 and RDNA 2 architectures to result in worlds that demand a closer look. LOOKS BETTER. PLAYS BETTER. Equipped with AMD’s Zen 2 and RDNA 2 architectures, DirectX ray tracing delivers true-to-life lighting, shadows, and accurate reflections to create dynamic, living worlds. LISTEN, YOU'RE BEING SURROUNDED. 3D Spatial Sound is the next evolution in audio technology, using advanced algorithms to create immersive lifelike worlds that put you at the center of your experience. TRUE 4K GAMING The Xbox Series X delivers sensationally smooth frame rates of up to 120FPS with the visual pop of HDR. Immerse yourself with sharper characters, brighter worlds, and impossible details with true-to-life 4K. From original classics like Halo: Combat Evolved to future favorites like Halo Infinite, every title looks and plays best on the Xbox Series X. \""
280 | ]
281 | },
282 | "execution_count": 76,
283 | "metadata": {},
284 | "output_type": "execute_result"
285 | }
286 | ],
287 | "source": [
288 | "productDescription = soup.find('div', id='productDescription').text\n",
289 | "productDescription"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 77,
295 | "id": "175c12e0-cbb8-4725-982a-09f19ef679fc",
296 | "metadata": {},
297 | "outputs": [
298 | {
299 | "data": {
300 | "text/plain": [
301 | "' \\n About this item Microsoft Xbox Series X 1TB SSD True 4K gaming | Up to 120 frames per second 8K High Dynamic Range | Xbox Velocity Architecture IN THE BOX: Microsoft Xbox Series X 1TB SSD - Carbon Black Xbox wireless controller - Ultra high speed HDMI cable - Power cable - Quick start guide INCLUDED IN THE BUNDLE: 2 Year Premium Extended Service Protection Plan | Deco Gear Microfiber Cleaning Cloth 2 Year Premium Extended Service Protection Plan \\n'"
302 | ]
303 | },
304 | "execution_count": 77,
305 | "metadata": {},
306 | "output_type": "execute_result"
307 | }
308 | ],
309 | "source": [
310 | "featureBullets = soup.find('div', id='feature-bullets').text\n",
311 | "featureBullets"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 78,
317 | "id": "7af4a3ff-a358-45ac-a29b-47e980975ac0",
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/plain": [
323 | "4.5"
324 | ]
325 | },
326 | "execution_count": 78,
327 | "metadata": {},
328 | "output_type": "execute_result"
329 | }
330 | ],
331 | "source": [
332 | "average_rating = soup.find(id='averageCustomerReviews').find_all(\"span\", class_='a-size-base')[0].text.strip()\n",
333 | "average_rating = \"\".join([x for x in f\"{average_rating}\".strip() if x.isdigit() or x == '.'])\n",
334 | "average_rating = float(average_rating)\n",
335 | "average_rating"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 79,
341 | "id": "0e17ef25-c190-48de-a9d6-0879f7bb053c",
342 | "metadata": {},
343 | "outputs": [
344 | {
345 | "data": {
346 | "text/plain": [
347 | "48"
348 | ]
349 | },
350 | "execution_count": 79,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "rating_data = soup.find(id='acrCustomerReviewText').text\n",
357 | "rating_count = int(''.join([x for x in rating_data if x.isdigit()]))\n",
358 | "rating_count"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "id": "708ac4c2-ea29-4100-8d51-8dc7c0a96fcd",
365 | "metadata": {},
366 | "outputs": [],
367 | "source": []
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "id": "10e4f2d5-4e87-47c4-a3be-793537b8255f",
373 | "metadata": {},
374 | "outputs": [],
375 | "source": []
376 | }
377 | ],
378 | "metadata": {
379 | "kernelspec": {
380 | "display_name": "Python 3 (ipykernel)",
381 | "language": "python",
382 | "name": "python3"
383 | },
384 | "language_info": {
385 | "codemirror_mode": {
386 | "name": "ipython",
387 | "version": 3
388 | },
389 | "file_extension": ".py",
390 | "mimetype": "text/x-python",
391 | "name": "python",
392 | "nbconvert_exporter": "python",
393 | "pygments_lexer": "ipython3",
394 | "version": "3.11.4"
395 | }
396 | },
397 | "nbformat": 4,
398 | "nbformat_minor": 5
399 | }
400 |
--------------------------------------------------------------------------------
/src/nbs/5 - Amazon Captcha and Prepare Parser Helper Functions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "488ae7f3-dd1f-46ea-8782-02853b4f0036",
6 | "metadata": {},
7 | "source": [
8 | "Note: This Notebook was named _5 - Save Extracted Data to Django Model_ in the video"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "439ab9a8-51cf-4591-836a-0cbbac9ff3d6",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import setup\n",
19 | "setup.init_django(project_name='cfehome')"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "id": "f5db35da-6859-42ef-b5fe-81c55bc89bfa",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import helpers"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "id": "346d4e92-b6ec-4101-848b-8b77af70b2dd",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "from bs4 import BeautifulSoup"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 4,
45 | "id": "5e6b875f-38aa-43d1-8bf6-2f51deb88170",
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "url = \"https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3\"\n",
50 | "# url = \"https://www.amazon.com/Microsoft-RRT-00001-Xbox-Enhanced-Protection-X/dp/B0CMFGD9C4/ref=sr_1_3?crid=O4D5MCNRPHZG&dib=eyJ2IjoiMSJ9.BesESVeGKfCLfmoE_Vj2bHkyIqHkjt6vWY83koTZ_cz2I8dLY4__kVTkLkSzkH-tXhAxpP2gQNyM5eVeTcHhvxw10QXkh4Y_4mErHWUhqbSt9lrSfOO_-FmaAKtnyeF8iomBtkjnfyUknCfgCkCLPUr4uTyEnz5WtXbXQI-WN6fXgPPlvAdJXHHZEW4PuI3no5Em_YuyrBMW5_lEcEiUTpPXfdpztJ9ofmYYPF6l8WI.dg-z2b_4ANwk_S5W2m6JFCSScCDS2b3idQV1r-yQOWQ&dib_tag=se&keywords=xbox+series+x&qid=1709681924&sprefix=xbox%2Caps%2C264&sr=8-3\""
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 5,
56 | "id": "251aa699-5c55-4d25-a617-1d43047cfe5d",
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "'https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3'"
63 | ]
64 | },
65 | "execution_count": 5,
66 | "metadata": {},
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
71 | "url"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 6,
77 | "id": "f1d2c8c6-8c78-4863-a022-1839a66eb8c8",
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "Connecting to Scraping Browser...\n",
85 | "Connected! Navigating to https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3\n",
86 | "Navigated! Scraping page content...\n"
87 | ]
88 | },
89 | {
90 | "name": "stderr",
91 | "output_type": "stream",
92 | "text": [
93 | "/Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py:38: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
94 | "\n",
95 | "The code that caused this warning is on line 38 of the file /Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py. To get rid of this warning, pass the additional argument 'features=\"html.parser\"' to the BeautifulSoup constructor.\n",
96 | "\n",
97 | " soup = BeautifulSoup(html)\n",
98 | "/Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py:6: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
99 | "\n",
100 | "The code that caused this warning is on line 6 of the file /Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py. To get rid of this warning, pass the additional argument 'features=\"html.parser\"' to the BeautifulSoup constructor.\n",
101 | "\n",
102 | " soup = BeautifulSoup(html)\n",
103 | "/Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py:25: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
104 | "\n",
105 | "The code that caused this warning is on line 25 of the file /Users/cfe/Dev/scrape-scheduler/src/helpers/amazon.py. To get rid of this warning, pass the additional argument 'features=\"html.parser\"' to the BeautifulSoup constructor.\n",
106 | "\n",
107 | " soup = BeautifulSoup(html)\n"
108 | ]
109 | }
110 | ],
111 | "source": [
112 | "html = helpers.scrape(url=url, solve_captcha=False)\n",
113 | "data = helpers.extract_amazon_product_data(html)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 7,
119 | "id": "32dbe860-6a5a-405c-a71c-2dc1a218e5c7",
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "{'title': 'PlayStation®5 Digital Edition (slim)',\n",
126 | " 'price_raw': '449.',\n",
127 | " 'price_text': '449.',\n",
128 | " 'price': 449.0,\n",
129 | " 'metadata': [{'ASIN': 'B0CL5KNB9M'},\n",
130 | " {'Release date': 'November 24, 2023'},\n",
131 | " {'Customer Reviews': '4.7 4.7 out of 5 stars \\n 3,538 ratings \\n\\n\\n 4.7 out of 5 stars'},\n",
132 | " {'Best Sellers Rank': '#216 in Video Games (See Top 100 in Video Games) #2 in PlayStation 5 Consoles'},\n",
133 | " {'Product Dimensions': '17 x 15 x 6 inches; 8.9 Pounds'},\n",
134 | " {'Type of item': 'Video Game'},\n",
135 | " {'Item model number': 'CFI-2000'},\n",
136 | " {'Item Weight': '8.9 pounds'},\n",
137 | " {'Manufacturer': 'Sony'},\n",
138 | " {'Country of Origin': 'China'},\n",
139 | " {'Batteries': '1 Lithium Ion batteries required. (included)'},\n",
140 | " {'Date First Available': 'November 24, 2023'}],\n",
141 | " 'description': ' \\n Play Like Never Before. The PS5 Digital Edition unleashes new gaming possibilities that you never anticipated. Experience lightning fast loading with an ultra-high speed SSD, deeper immersion with support for haptic feedback, adaptive triggers, and 3D Audio*,and an all-new generation of incredible PlayStation® games. PS5 Digital Edition is an all-digital version of the PS5 console with no disc drive. Sign into your account for PlayStation Network and go to PlayStation Store to buy and download games (Account for PlayStation Network required). Lightning Speed - Harness the power of a custom CPU, GPU, and SSD with Integrated I/O that rewrite the rules of what a PlayStation console can do. Stunning Games - Marvel at incredible graphics and experience new PS5 features. Play a back catalog of supported PS4 games. Breathtaking Immersion - Discover a deeper gaming experience with support for haptic feedback, adaptive triggers, and 3D Audio technology. *3D audio via built-in TV speakers or analog/USB stereo headphones. Set up and latest system software update required. ',\n",
142 | " 'feature_bullets': ' \\n About this item Model Number CFI-2000 Includes DualSense Wireless Controller, 1TB SSD, 2 Horizontal Stand Feet, HDMI Cable, AC power cord, USB cable, printed materials, ASTRO’s PLAYROOM (Pre-installed game) Vertical Stand sold seperately \\n',\n",
143 | " 'rating': {'average': 4.7, 'count': 3538}}"
144 | ]
145 | },
146 | "execution_count": 7,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "# data"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "id": "258e49fa-3e55-4dd1-bfd0-ad510d4ebd14",
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "def find_product_table_data(html):\n",
163 | " soup = BeautifulSoup(html)\n",
164 | " product_data = soup.find('div', id='prodDetails')\n",
165 | " if product_data is None:\n",
166 | " return []\n",
167 | " table = product_data.find('table')\n",
168 | " columns = [f\"{x.text}\".strip() for x in table.find_all('th')]\n",
169 | " table_data=[]\n",
170 | " for i, row in enumerate(table.find_all('tr')): # every row is visited (no [1:] slice); row i is keyed by columns[i]\n",
171 | " # Get all cells in the row\n",
172 | " cells = row.find_all('td')\n",
173 | " # Create a dictionary for the current row, mapping header to cell data\n",
174 | " row_data = {columns[i]: f'{cell.text}'.strip() for cell in cells}\n",
175 | " # Add the dictionary to your list\n",
176 | " table_data.append(row_data)\n",
177 | " return table_data\n",
178 | "\n",
179 | "def find_product_rating(html):\n",
180 | " soup = BeautifulSoup(html)\n",
181 | " average_rating = soup.find(id='averageCustomerReviews').find_all(\"span\", class_='a-size-base')[0].text.strip()\n",
182 | " average_rating = \"\".join([x for x in f\"{average_rating}\".strip() if x.isdigit() or x == '.'])\n",
183 | " average_rating = float(average_rating)\n",
184 | " rating_data = soup.find(id='acrCustomerReviewText').text\n",
185 | " rating_count = int(''.join([x for x in rating_data if x.isdigit()]))\n",
186 | " rating_count\n",
187 | " return {\n",
188 | " 'average': average_rating,\n",
189 | " 'count': rating_count,\n",
190 | " }\n",
191 | "\n",
192 | "def extract_amazon_product_data(html):\n",
193 | " soup = BeautifulSoup(html)\n",
194 | " productTitle = soup.find('span', id='productTitle')\n",
195 | " productTitleText = f\"{productTitle.text}\".strip()\n",
196 | " productPrice = soup.find_all('span', class_='a-price-whole')[0]\n",
197 | " productPrice = f\"{productPrice.text}\".strip()\n",
198 | " productPriceText = \"\".join([x for x in productPrice if x.isdigit() or x == '.'])\n",
199 | " productPriceNum = float(productPriceText)\n",
200 | " try:\n",
201 | " productDescription = soup.find('div', id='productDescription').text\n",
202 | " except:\n",
203 | " productDescription = ''\n",
204 | " featureBullets = soup.find('div', id='feature-bullets').text\n",
205 | " return {\n",
206 | " 'title': productTitleText,\n",
207 | " 'price_raw': productPrice,\n",
208 | " 'price_text': productPriceText,\n",
209 | " 'price': productPriceNum,\n",
210 | " 'metadata': find_product_table_data(html),\n",
211 | " 'description': productDescription,\n",
212 | " 'feature_bullets': featureBullets,\n",
213 | " 'rating': find_product_rating(html)\n",
214 | " }"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "id": "10e4f2d5-4e87-47c4-a3be-793537b8255f",
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "html = helpers.scrape(url=url, solve_captcha=False)"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "id": "7e2bd341-d7a6-4c71-a691-8cfb51769fcf",
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "# with open('output.html', 'w+') as f:\n",
235 | "# f.write(html)"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "id": "71daf2fb-8fc3-4aeb-8912-e65df9105a4e",
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "data = extract_amazon_product_data(html)\n",
246 | "data"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "id": "a11ec32a-5afd-437b-a64e-6d54b4659ff2",
253 | "metadata": {},
254 | "outputs": [],
255 | "source": []
256 | }
257 | ],
258 | "metadata": {
259 | "kernelspec": {
260 | "display_name": "Python 3 (ipykernel)",
261 | "language": "python",
262 | "name": "python3"
263 | },
264 | "language_info": {
265 | "codemirror_mode": {
266 | "name": "ipython",
267 | "version": 3
268 | },
269 | "file_extension": ".py",
270 | "mimetype": "text/x-python",
271 | "name": "python",
272 | "nbconvert_exporter": "python",
273 | "pygments_lexer": "ipython3",
274 | "version": "3.11.4"
275 | }
276 | },
277 | "nbformat": 4,
278 | "nbformat_minor": 5
279 | }
280 |
--------------------------------------------------------------------------------
/src/nbs/6 - Tracking Scraped Data with Django Models.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "a18d1078-d317-44cd-a6f9-01641a439cd8",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import setup\n",
11 | "setup.init_django(project_name='cfehome')"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 14,
17 | "id": "d5ab0966-17a0-4f8d-8b67-1344a19ab0a1",
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "import helpers\n",
22 | "from products.models import ProductScrapeEvent, Product"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "id": "93cd08b1-8d07-4f1a-9cad-69d2f8e3d35b",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "url = \"https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3\""
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 4,
38 | "id": "b553ebdd-b86e-4475-8506-2723ef108ce4",
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "Connecting to Scraping Browser...\n",
46 | "Connected! Navigating to https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3\n",
47 | "Navigated! Scraping page content...\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "html = helpers.scrape(url=url, solve_captcha=False)\n",
53 | "data = helpers.extract_amazon_product_data(html)"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 5,
59 | "id": "d4d892e4-cb4e-400f-9e81-1caecf6e46b8",
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "text/plain": [
65 | "{'asin': 'B0CL5KNB9M',\n",
66 | " 'title': 'PlayStation®5 Digital Edition (slim)',\n",
67 | " 'price_raw': '449.',\n",
68 | " 'price_text': '449.',\n",
69 | " 'price': 449.0,\n",
70 | " 'metadata': [{'ASIN': 'B0CL5KNB9M'},\n",
71 | " {'Release date': 'November 24, 2023'},\n",
72 | " {'Customer Reviews': '4.7 4.7 out of 5 stars \\n 3,538 ratings \\n\\n\\n 4.7 out of 5 stars'},\n",
73 | " {'Best Sellers Rank': '#216 in Video Games (See Top 100 in Video Games) #2 in PlayStation 5 Consoles'},\n",
74 | " {'Product Dimensions': '17 x 15 x 6 inches; 8.9 Pounds'},\n",
75 | " {'Type of item': 'Video Game'},\n",
76 | " {'Item model number': 'CFI-2000'},\n",
77 | " {'Item Weight': '8.9 pounds'},\n",
78 | " {'Manufacturer': 'Sony'},\n",
79 | " {'Country of Origin': 'China'},\n",
80 | " {'Batteries': '1 Lithium Ion batteries required. (included)'},\n",
81 | " {'Date First Available': 'November 24, 2023'}],\n",
82 | " 'description': ' \\n Play Like Never Before. The PS5 Digital Edition unleashes new gaming possibilities that you never anticipated. Experience lightning fast loading with an ultra-high speed SSD, deeper immersion with support for haptic feedback, adaptive triggers, and 3D Audio*,and an all-new generation of incredible PlayStation® games. PS5 Digital Edition is an all-digital version of the PS5 console with no disc drive. Sign into your account for PlayStation Network and go to PlayStation Store to buy and download games (Account for PlayStation Network required). Lightning Speed - Harness the power of a custom CPU, GPU, and SSD with Integrated I/O that rewrite the rules of what a PlayStation console can do. Stunning Games - Marvel at incredible graphics and experience new PS5 features. Play a back catalog of supported PS4 games. Breathtaking Immersion - Discover a deeper gaming experience with support for haptic feedback, adaptive triggers, and 3D Audio technology. *3D audio via built-in TV speakers or analog/USB stereo headphones. Set up and latest system software update required. ',\n",
83 | " 'feature_bullets': ' \\n About this item Model Number CFI-2000 Includes DualSense Wireless Controller, 1TB SSD, 2 Horizontal Stand Feet, HDMI Cable, AC power cord, USB cable, printed materials, ASTRO’s PLAYROOM (Pre-installed game) Vertical Stand sold seperately \\n',\n",
84 | " 'rating': {'average': 4.7, 'count': 3538}}"
85 | ]
86 | },
87 | "execution_count": 5,
88 | "metadata": {},
89 | "output_type": "execute_result"
90 | }
91 | ],
92 | "source": [
93 | "data"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 12,
99 | "id": "7cf48a33-91d7-4afb-8e7f-f32c65a6ef95",
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "data": {
104 | "text/plain": [
105 | ""
106 | ]
107 | },
108 | "execution_count": 12,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "ProductScrapeEvent.objects.create_scrape_event(data, url=url)"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 13,
120 | "id": "68591fc9-1be3-4219-8fc9-d3f3e98fb99c",
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "data": {
125 | "text/plain": [
126 | ", , , ]>"
127 | ]
128 | },
129 | "execution_count": 13,
130 | "metadata": {},
131 | "output_type": "execute_result"
132 | }
133 | ],
134 | "source": [
135 | "qs = ProductScrapeEvent.objects.all()\n",
136 | "qs"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 16,
142 | "id": "3dbffe27-bf93-4a72-9a3b-60efe3d3c075",
143 | "metadata": {},
144 | "outputs": [
145 | {
146 | "data": {
147 | "text/plain": [
148 | "]>"
149 | ]
150 | },
151 | "execution_count": 16,
152 | "metadata": {},
153 | "output_type": "execute_result"
154 | }
155 | ],
156 | "source": [
157 | "product_qs = Product.objects.all()\n",
158 | "product_qs"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "id": "c4379de2-ed36-4a62-9a2b-1a7a3404773b",
165 | "metadata": {},
166 | "outputs": [],
167 | "source": []
168 | }
169 | ],
170 | "metadata": {
171 | "kernelspec": {
172 | "display_name": "Python 3 (ipykernel)",
173 | "language": "python",
174 | "name": "python3"
175 | },
176 | "language_info": {
177 | "codemirror_mode": {
178 | "name": "ipython",
179 | "version": 3
180 | },
181 | "file_extension": ".py",
182 | "mimetype": "text/x-python",
183 | "name": "python",
184 | "nbconvert_exporter": "python",
185 | "pygments_lexer": "ipython3",
186 | "version": "3.11.4"
187 | }
188 | },
189 | "nbformat": 4,
190 | "nbformat_minor": 5
191 | }
192 |
--------------------------------------------------------------------------------
/src/nbs/7 - Trigger Scrape Task.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 26,
6 | "id": "a18d1078-d317-44cd-a6f9-01641a439cd8",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import setup\n",
11 | "setup.init_django(project_name='cfehome')"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 27,
17 | "id": "d5ab0966-17a0-4f8d-8b67-1344a19ab0a1",
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "import helpers\n",
22 | "from products.tasks import scrape_product_url_task, scrape_products_task"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 28,
28 | "id": "93cd08b1-8d07-4f1a-9cad-69d2f8e3d35b",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "url = \"https://www.amazon.com/PlayStation%C2%AE5-Digital-slim-PlayStation-5/dp/B0CL5KNB9M/ref=sr_1_3?crid=B7GWFZXZ2OED&dib=eyJ2IjoiMSJ9.kgp6If9Ie8zGHwBo-0htBLyQbbKjs5VuqpcJV5opH4mRqQT9y1GDUhgEGC4Ze5c7iOpklu5U3l_vF3hGTmGfZf8jvVY7cSGvtmhRbSth2-wUchP4cPB4bCopxZnBPpqLbX4wU-JZkepl_i4fGdXQJXUMLc256FqdCdlbjr6ZMyFHhWIJq2G38fcfQx3z9RS1e48jNXaYXv1rWtJ3Y30-OZP-ckGz15zF5vR6k6z6G6c.HDrf64xu0Nz7DYLZvUdGglWizRZpAXBxsxbsgsW26Tc&dib_tag=se&keywords=ps5&qid=1709675943&sprefix=ps5%2Caps%2C190&sr=8-3\""
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 31,
38 | "id": "c4379de2-ed36-4a62-9a2b-1a7a3404773b",
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "data": {
43 | "text/plain": [
44 | ""
45 | ]
46 | },
47 | "execution_count": 31,
48 | "metadata": {},
49 | "output_type": "execute_result"
50 | }
51 | ],
52 | "source": [
53 | "scrape_product_url_task.delay(url)"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 32,
59 | "id": "eaf49535-9717-46c1-88b7-f0a189e97009",
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "text/plain": [
65 | ""
66 | ]
67 | },
68 | "execution_count": 32,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "scrape_products_task.delay()"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "id": "5a8ce7a8-3d85-4e18-9b6c-22b605d050ae",
81 | "metadata": {},
82 | "outputs": [],
83 | "source": []
84 | }
85 | ],
86 | "metadata": {
87 | "kernelspec": {
88 | "display_name": "Python 3 (ipykernel)",
89 | "language": "python",
90 | "name": "python3"
91 | },
92 | "language_info": {
93 | "codemirror_mode": {
94 | "name": "ipython",
95 | "version": 3
96 | },
97 | "file_extension": ".py",
98 | "mimetype": "text/x-python",
99 | "name": "python",
100 | "nbconvert_exporter": "python",
101 | "pygments_lexer": "ipython3",
102 | "version": "3.11.4"
103 | }
104 | },
105 | "nbformat": 4,
106 | "nbformat_minor": 5
107 | }
108 |
--------------------------------------------------------------------------------
/src/nbs/setup.py:
--------------------------------------------------------------------------------
1 | # https://www.codingforentrepreneurs.com/shorts/django-setup-for-use-in-jupyter-notebooks/
2 | import os, sys
3 |
# Directory the notebook server (or shell) was started from. ``PWD`` is set by
# the shell and may be missing from the environment (e.g. on Windows), so fall
# back to the process working directory instead of failing on ``None`` below.
PWD = os.getenv("PWD") or os.getcwd()
# Name of the Django project package (the one that holds ``settings.py``).
DJANGO_PROJECT = os.environ.get("DJANGO_PROJECT") or "cfehome"
# Directory, relative to the repository root, that holds the Django project.
DJANGO_ROOT_DIR = os.environ.get("DJANGO_ROOT_DIR") or "src"
if not PWD.endswith(f"/{DJANGO_ROOT_DIR}"):
    # src is the django-root
    PWD = os.path.join(PWD, DJANGO_ROOT_DIR)


# Raised by ``init_django`` when no project name can be resolved.
PROJ_MISSING_MSG = """Set an enviroment variable:\n
`DJANGO_PROJECT=your_project_name`\n
or call:\n
`init_django(project_name=your_project_name)`
"""
17 |
18 |
def init_django(project_name=None):
    """Configure and boot Django so the ORM can be used from a notebook.

    Args:
        project_name: Name of the Django project package (the one that holds
            ``settings.py``). Falls back to the module-level
            ``DJANGO_PROJECT`` when omitted.

    Raises:
        Exception: If no project name can be resolved.
    """
    dj_project_name = project_name or DJANGO_PROJECT
    # Validate before touching process state so a failure has no side effects.
    if not dj_project_name:
        raise Exception(PROJ_MISSING_MSG)
    os.chdir(PWD)
    # Put the computed Django root (not the raw $PWD) first on the import path
    # so the project package is importable wherever the notebook was started.
    sys.path.insert(0, PWD)
    # Use the resolved name: formatting ``project_name`` directly would yield
    # "None.settings" when the caller relies on the DJANGO_PROJECT fallback.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', f'{dj_project_name}.settings')
    # Jupyter runs cells inside an event loop; this lets the synchronous ORM be
    # called from there.
    os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
    import django
    django.setup()
--------------------------------------------------------------------------------
/src/products/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Web-Scraping-with-Django-Celery/d795c4fd1854c1c25afe443b3bac6d7874a5d2b1/src/products/__init__.py
--------------------------------------------------------------------------------
/src/products/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | # Register your models here.
4 |
5 | from .models import Product, ProductScrapeEvent
6 |
7 |
# Expose both product models in the Django admin with the default ModelAdmin.
admin.site.register([Product, ProductScrapeEvent])
--------------------------------------------------------------------------------
/src/products/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 |
3 |
class ProductsConfig(AppConfig):
    """Application configuration for the ``products`` app."""

    name = "products"
    # Field class used for implicitly created primary keys in this app.
    default_auto_field = "django.db.models.BigAutoField"
7 |
--------------------------------------------------------------------------------
/src/products/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.3 on 2024-03-18 18:53
2 |
3 | import django.db.models.deletion
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    """Initial schema for the products app.

    Creates the ``Product`` and ``ProductScrapeEvent`` models.
    """

    initial = True

    dependencies = []

    operations = [
        # Product: one row per ASIN (the ASIN column is unique and indexed).
        migrations.CreateModel(
            name="Product",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("asin", models.CharField(db_index=True, max_length=120, unique=True)),
                ("title", models.CharField(blank=True, max_length=220, null=True)),
                (
                    "current_price",
                    models.FloatField(blank=True, default=0.0, null=True),
                ),
                # Set once on insert / refreshed on every save, respectively.
                ("timestamp", models.DateTimeField(auto_now_add=True)),
                ("updated", models.DateTimeField(auto_now=True)),
                ("metadata", models.JSONField(blank=True, null=True)),
            ],
        ),
        # ProductScrapeEvent: rows attached to a Product; deleted with it.
        migrations.CreateModel(
            name="ProductScrapeEvent",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("url", models.URLField(blank=True, null=True)),
                ("data", models.JSONField(blank=True, null=True)),
                # Declared unique here; migration 0002 alters this field.
                ("asin", models.CharField(db_index=True, max_length=120, unique=True)),
                (
                    "product",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="scrape_events",
                        to="products.product",
                    ),
                ),
            ],
        ),
    ]
62 |
--------------------------------------------------------------------------------
/src/products/migrations/0002_alter_productscrapeevent_asin.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.3 on 2024-03-18 18:55
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Relax ``ProductScrapeEvent.asin``.

    The field becomes optional (``null``/``blank``) and is no longer declared
    unique or indexed, unlike its definition in ``0001_initial``.
    """

    dependencies = [
        ("products", "0001_initial"),
    ]

    operations = [
        migrations.AlterField(
            model_name="productscrapeevent",
            name="asin",
            field=models.CharField(blank=True, max_length=120, null=True),
        ),
    ]
18 |
--------------------------------------------------------------------------------
/src/products/migrations/0003_product__trigger_scrape_product_trigger_scrape_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.3 on 2024-03-18 19:07
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Add ``_trigger_scrape``, ``trigger_scrape`` and ``url`` to ``Product``."""

    dependencies = [
        ("products", "0002_alter_productscrapeevent_asin"),
    ]

    operations = [
        # Two boolean flags, both defaulting to False.
        migrations.AddField(
            model_name="product",
            name="_trigger_scrape",
            field=models.BooleanField(default=False),
        ),
        migrations.AddField(
            model_name="product",
            name="trigger_scrape",
            field=models.BooleanField(default=False),
        ),
        # Optional URL stored on the product itself.
        migrations.AddField(
            model_name="product",
            name="url",
            field=models.URLField(blank=True, null=True),
        ),
    ]
28 |
--------------------------------------------------------------------------------
/src/products/migrations/0004_remove_product__trigger_scrape_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.3 on 2024-03-18 19:45
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Drop the two trigger flags from ``Product`` and add ``active``."""

    dependencies = [
        ("products", "0003_product__trigger_scrape_product_trigger_scrape_and_more"),
    ]

    operations = [
        # Remove both flags that were added in migration 0003.
        migrations.RemoveField(
            model_name="product",
            name="_trigger_scrape",
        ),
        migrations.RemoveField(
            model_name="product",
            name="trigger_scrape",
        ),
        # New boolean flag; its default is False at this point in the history.
        migrations.AddField(
            model_name="product",
            name="active",
            field=models.BooleanField(default=False),
        ),
    ]
26 |
--------------------------------------------------------------------------------
/src/products/migrations/0005_alter_product_active.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.3 on 2024-03-18 19:45
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Change ``Product.active`` to default to True and add its help text."""

    dependencies = [
        ("products", "0004_remove_product__trigger_scrape_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="product",
            name="active",
            field=models.BooleanField(default=True, help_text="Scrape daily?"),
        ),
    ]
18 |
--------------------------------------------------------------------------------
/src/products/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Web-Scraping-with-Django-Celery/d795c4fd1854c1c25afe443b3bac6d7874a5d2b1/src/products/migrations/__init__.py
--------------------------------------------------------------------------------
/src/products/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 |
3 | # Create your models here.
4 |
5 | from .tasks import scrape_product_url_task
6 |
class Product(models.Model):
    """A tracked product, identified by its unique ``asin``."""

    # Unique identifier for the product; used as the lookup key when scrape
    # results are saved (presumably an Amazon ASIN -- confirm against helpers).
    asin = models.CharField(max_length=120, unique=True, db_index=True)
    # Page to scrape for this product; optional.
    url = models.URLField(blank=True, null=True)
    title = models.CharField(max_length=220, blank=True, null=True)
    # Overwritten from the scraped ``price`` value each time an event is saved.
    current_price = models.FloatField(blank=True, null=True, default=0.00)
    # Creation time (set once) and last-save time (refreshed on every save).
    timestamp = models.DateTimeField(auto_now_add=True)
    updated = models.DateTimeField(auto_now=True)
    # The full scraped data dict from the most recent scrape event.
    metadata = models.JSONField(null=True, blank=True)
    # Only products with active=True are picked up by scrape_products_task.
    active = models.BooleanField(default=True, help_text="Scrape daily?")
16 |
17 |
18 |
class ProductScrapeEventManager(models.Manager):
    """Manager that records scrape results and keeps ``Product`` in sync."""

    def create_scrape_event(self, data, url=None):
        """Store one scrape result and upsert the product it belongs to.

        Args:
            data: Mapping of scraped values; the ``asin``, ``title`` and
                ``price`` keys are read, and the whole mapping is stored.
            url: Page the data was scraped from, if known.

        Returns:
            The newly created event, or ``None`` when ``data`` carries no
            usable ``asin``.
        """
        asin = data.get("asin")
        if not asin:
            # Without an ASIN there is no product to attach the event to.
            return None

        product_defaults = {
            "url": url,
            "title": data.get("title") or "",
            "current_price": data.get("price") or 0.00,
            "metadata": data,
        }
        product, _created = Product.objects.update_or_create(
            asin=asin,
            defaults=product_defaults,
        )
        return self.create(product=product, url=url, asin=asin, data=data)
40 |
41 |
class ProductScrapeEvent(models.Model):
    """One recorded scrape of a product page."""

    # Owning product; events are deleted together with it and are reachable
    # from the product as ``product.scrape_events``.
    product = models.ForeignKey(Product, on_delete=models.CASCADE, related_name='scrape_events')
    # Page the data was scraped from; optional.
    url = models.URLField(blank=True, null=True)
    # Scraped data as passed to ``create_scrape_event``.
    data = models.JSONField(null=True, blank=True)
    # ASIN copied onto the event itself; optional and not unique.
    asin = models.CharField(max_length=120, null=True, blank=True)

    # Custom manager providing ``create_scrape_event``.
    objects = ProductScrapeEventManager()
--------------------------------------------------------------------------------
/src/products/tasks.py:
--------------------------------------------------------------------------------
1 | from django.apps import apps
2 | from celery import shared_task
3 | import helpers
4 |
5 |
@shared_task
def scrape_product_url_task(url):
    """Scrape a single product page and record the result as a scrape event.

    Does nothing when ``url`` is ``None`` or an empty string.
    """
    if url is None or url == "":
        return
    # Look the model up through the app registry at call time; products.models
    # imports this module, so a top-level import here would be circular.
    event_model = apps.get_model('products', 'ProductScrapeEvent')
    # Open the url and fetch its HTML.
    page_html = helpers.scrape(url=url, solve_captcha=False)
    # Pull the product data out of the HTML.
    product_data = helpers.extract_amazon_product_data(page_html)
    # Save the scraped data.
    event_model.objects.create_scrape_event(product_data, url=url)
20 |
21 |
@shared_task
def scrape_products_task():
    """Queue one ``scrape_product_url_task`` per active product."""
    product_model = apps.get_model('products', 'Product')
    for product in product_model.objects.filter(active=True):
        # Each product is scraped in its own task rather than inline here.
        scrape_product_url_task.delay(product.url)
--------------------------------------------------------------------------------
/src/products/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/src/products/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 |
3 | # Create your views here.
4 |
--------------------------------------------------------------------------------