├── requirements.txt
├── images
│   └── thumbnail.jpg
├── README.md
├── LICENSE
├── .gitignore
└── walmart_scraper.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
3 | python-dotenv
--------------------------------------------------------------------------------
/images/thumbnail.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeithGalli/advanced-scraping/HEAD/images/thumbnail.jpg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Advanced Web Scraping Tutorial
2 |
3 |
4 |
5 | Code corresponding to my [recent video](https://youtu.be/DcI_AZqfZVc) on web scraping with Python and BeautifulSoup.
6 |
7 | ## About/Navigation
8 | At certain points in the video, I reference accessing the code at different stages of completeness.
9 |
10 | Here are the files that you may be looking for:
11 | - [Initial Run - 24:50 in video](https://github.com/KeithGalli/advanced-scraping/commit/5afbdd59e4982c1a6ce7ad1fe9cb5047eaf8ac25)
12 | - [Improvements - 27:19 in video](https://github.com/KeithGalli/advanced-scraping/blob/original/search_scraper.py)
13 | - [Final Code w/ Proxies](./search_scraper.py)
14 |
15 | Make sure to add a **.env** file with your `BRD_USERNAME` and `BRD_PASSWORD` when you start using the Bright Data proxies! (A minimal example layout is sketched at the end of this document.)
16 |
17 | Shout out to Bright Data for sponsoring this video! Get started using [this link](https://brdta.com/keithgalli).
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Keith Galli
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/walmart_scraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 | import queue
5 | import time
6 | import os
7 | from dotenv import load_dotenv
8 | from requests.exceptions import ProxyError, HTTPError
9 |
10 | load_dotenv()
11 |
12 | BASE_URL = "https://www.walmart.com"
13 | OUTPUT_FILE = "product_info.jsonl"
14 |
15 | # Fake browser-like headers
16 | BASE_HEADERS = {
17 |     "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
18 |     "accept": "application/json",
19 |     "accept-language": "en-US",
20 |     "accept-encoding": "gzip, deflate, br, zstd",
21 | }
22 |
23 | host = 'brd.superproxy.io'
24 | port = 22225
25 | username = os.environ['BRD_USERNAME']
26 | password = os.environ['BRD_PASSWORD']
27 |
28 | proxy_url = f'http://{username}:{password}@{host}:{port}'
29 |
30 | proxies = {
31 |     'http': proxy_url,
32 |     'https': proxy_url
33 | }
34 |
35 | # List of search queries
36 | search_queries = ["computers", "laptops", "desktops", "monitors", "printers", "hard+drives", "usb", "cords", "cameras",
37 |                   "mouse", "keyboard", "microphones", "speakers", "radio", "tablets", "android", "apple", "watch", "smart+watch",
38 |                   "fridge", "airconditioning", "wifi", "router", "modem", "desk", "xbox", "playstation", "nintendo"]
39 |
40 | # Initialize a queue for product URLs and a set for seen URLs
41 | product_queue = queue.Queue()
42 | seen_urls = set()
43 |
44 | def get_product_links_from_search_page(query, page_number):
45 |     search_url = f"https://www.walmart.com/search?q={query}&page={page_number}"
46 |     max_retries = 5
47 |     backoff_factor = 3
48 |     for attempt in range(max_retries):
49 |         try:
50 |             response = requests.get(search_url, headers=BASE_HEADERS, proxies=proxies)
51 |             response.raise_for_status()
52 |             soup = BeautifulSoup(response.text, 'html.parser')
53 |             product_links = []
54 |
55 |             found = False
56 |             for a_tag in soup.find_all('a', href=True):
57 |                 if '/ip/' in a_tag['href']:
58 |                     found = True
59 |                     if "https" in a_tag['href']:
60 |                         full_url = a_tag['href']
61 |                     else:
62 |                         full_url = BASE_URL + a_tag['href']
63 |
64 |                     if full_url not in seen_urls:
65 |                         product_links.append(full_url)
66 |
67 |             if not found:
68 |                 print("\n\n\nSOUP WHEN NOT FOUND", soup)
69 |
70 |             return product_links
71 |
72 |         except ProxyError as e:
73 |             wait_time = backoff_factor ** attempt
74 |             print(f"Proxy error: {e}. Retrying in {wait_time} seconds...")
75 |             time.sleep(wait_time)
76 |         except HTTPError as e:
77 |             if e.response.status_code == 412:
78 |                 print(f"Precondition Failed (412): {e}. Skipping URL.")
79 |                 break
80 |             wait_time = backoff_factor ** attempt
81 |             print(f"HTTP error: {e}. Retrying in {wait_time} seconds...")
82 |             time.sleep(wait_time)
83 |         except Exception as e:
84 |             print(f"Failed to get product links for query: {query} on page: {page_number}. Error: {e}")
Error: {e}") 85 | break 86 | 87 | print(f"Skipping query after {max_retries} retries: {query} on page: {page_number}") 88 | return [] 89 | 90 | def extract_product_info(product_url): 91 | print("Processing URL", product_url) 92 | max_retries = 5 93 | backoff_factor = 3 94 | for attempt in range(max_retries): 95 | try: 96 | response = requests.get(product_url, headers=BASE_HEADERS, proxies=proxies) 97 | response.raise_for_status() 98 | soup = BeautifulSoup(response.text, 'html.parser') 99 | script_tag = soup.find('script', id='__NEXT_DATA__') 100 | 101 | if script_tag is None: 102 | return None 103 | 104 | data = json.loads(script_tag.string) 105 | initial_data = data["props"]["pageProps"]["initialData"]["data"] 106 | product_data = initial_data["product"] 107 | reviews_data = initial_data.get("reviews", {}) 108 | 109 | product_info = { 110 | "price": product_data["priceInfo"]["currentPrice"]["price"], 111 | "review_count": reviews_data.get("totalReviewCount", 0), 112 | "item_id": product_data["usItemId"], 113 | "avg_rating": reviews_data.get("averageOverallRating", 0), 114 | "product_name": product_data["name"], 115 | "brand": product_data.get("brand", ""), 116 | "availability": product_data["availabilityStatus"], 117 | "image_url": product_data["imageInfo"]["thumbnailUrl"], 118 | "short_description": product_data.get("shortDescription", "") 119 | } 120 | 121 | return product_info 122 | 123 | except ProxyError as e: 124 | wait_time = backoff_factor ** attempt 125 | print(f"Proxy error: {e}. Retrying in {wait_time} seconds...") 126 | time.sleep(wait_time) 127 | except HTTPError as e: 128 | if e.response.status_code == 412: 129 | print(f"Precondition Failed (412): {e}. Skipping URL.") 130 | break 131 | wait_time = backoff_factor ** attempt 132 | print(f"HTTP error: {e}. Retrying in {wait_time} seconds...") 133 | time.sleep(wait_time) 134 | except Exception as e: 135 | print(f"Failed to process URL: {product_url}. Error: {e}") 136 | break 137 | 138 | print(f"Skipping URL after {max_retries} retries: {product_url}") 139 | return None 140 | 141 | def main(): 142 | with open(OUTPUT_FILE, 'w') as file: 143 | while search_queries: 144 | current_query = search_queries.pop(0) 145 | print("\n\nCURRENT QUERY", current_query, "\n\n") 146 | page_number = 1 147 | 148 | while True: 149 | product_links = get_product_links_from_search_page(current_query, page_number) 150 | if not product_links or page_number > 99: 151 | break 152 | 153 | for link in product_links: 154 | if link not in seen_urls: 155 | product_queue.put(link) 156 | seen_urls.add(link) 157 | 158 | while not product_queue.empty(): 159 | product_url = product_queue.get() 160 | product_info = extract_product_info(product_url) 161 | if product_info: 162 | file.write(json.dumps(product_info) + "\n") 163 | 164 | page_number += 1 165 | print(page_number) 166 | 167 | if __name__ == "__main__": 168 | main() --------------------------------------------------------------------------------