├── requirements.txt
├── images
│   └── thumbnail.jpg
├── README.md
├── LICENSE
├── .gitignore
└── walmart_scraper.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
3 | python-dotenv
4 |
--------------------------------------------------------------------------------
/images/thumbnail.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeithGalli/advanced-scraping/HEAD/images/thumbnail.jpg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Advanced Web Scraping Tutorial
2 |
3 |
4 |
5 | Code corresponding to my [recent video](https://youtu.be/DcI_AZqfZVc) on web scraping with Python and BeautifulSoup.
6 |
7 | ## About/Navigation
8 | At certain points in the video, I reference the code at different stages of completion.
9 |
10 | Here are the files that you may be looking for:
11 | - [Initial Run - 24:50 in video](https://github.com/KeithGalli/advanced-scraping/commit/5afbdd59e4982c1a6ce7ad1fe9cb5047eaf8ac25)
12 | - [Improvements - 27:19 in video](https://github.com/KeithGalli/advanced-scraping/blob/original/search_scraper.py)
13 | - [Final Code w/ Proxies](./search_scraper.py)
14 |
15 | Make sure to add a **.env** file with your Bright Data credentials (`BRD_USERNAME` and `BRD_PASSWORD`) when you start using the proxies! A sample is shown below.
16 |
17 | Shout out to Bright Data for sponsoring this video, get started using [this link](https://brdta.com/keithgalli)!
18 |
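A minimal `.env` sample, using the variable names that `walmart_scraper.py` loads via `python-dotenv` (the values are placeholders for your own Bright Data credentials):

```env
BRD_USERNAME=your-brightdata-username
BRD_PASSWORD=your-brightdata-password
```

With that file in place, running `python walmart_scraper.py` writes one JSON object per product to `product_info.jsonl`.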
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Keith Galli
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/walmart_scraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 | import queue
5 | import time
6 | import os
7 | from dotenv import load_dotenv
8 | from requests.exceptions import ProxyError, HTTPError
9 |
10 | load_dotenv()
11 |
12 | BASE_URL = "https://www.walmart.com"
13 | OUTPUT_FILE = "product_info.jsonl"
14 |
15 | # Fake browser-like headers
16 | BASE_HEADERS = {
17 | "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
18 | "accept": "application/json",
19 | "accept-language": "en-US",
20 | "accept-encoding": "gzip, deflate, br, zstd",
21 | }
22 |
23 | host = 'brd.superproxy.io'
24 | port = 22225
25 | username = os.environ['BRD_USERNAME']
26 | password = os.environ['BRD_PASSWORD']
27 |
28 | proxy_url = f'http://{username}:{password}@{host}:{port}'
29 |
30 | proxies = {
31 | 'http': proxy_url,
32 | 'https': proxy_url
33 | }
34 |
35 | # List of search queries ('+' stands in for a space in the search URL)
36 | search_queries = ["computers", "laptops", "desktops", "monitors", "printers", "hard+drives", "usb", "cords", "cameras",
37 | "mouse", "keyboard", "microphones", "speakers", "radio", "tablets", "android", "apple", "watch", "smart+watch",
38 | "fridge", "airconditioning", "wifi", "router", "modem", "desk", "xbox", "playstation", "nintendo"]
39 |
40 | # Initialize a queue for product URLs and a set for seen URLs
41 | product_queue = queue.Queue()
42 | seen_urls = set()
43 |
44 | def get_product_links_from_search_page(query, page_number):
45 | search_url = f"https://www.walmart.com/search?q={query}&page={page_number}"
46 | max_retries = 5
47 | backoff_factor = 3
48 | for attempt in range(max_retries):
49 | try:
50 |             response = requests.get(search_url, headers=BASE_HEADERS, proxies=proxies, timeout=60)  # 60s timeout (arbitrary) so a stalled proxy request doesn't hang forever
51 | response.raise_for_status()
52 | soup = BeautifulSoup(response.text, 'html.parser')
53 | product_links = []
54 |
55 | found = False
56 | for a_tag in soup.find_all('a', href=True):
57 | if '/ip/' in a_tag['href']:
58 | found = True
59 | if "https" in a_tag['href']:
60 | full_url = a_tag['href']
61 | else:
62 | full_url = BASE_URL + a_tag['href']
63 |
64 | if full_url not in seen_urls:
65 | product_links.append(full_url)
66 |
67 | if not found:
68 |                 print("\n\n\nSOUP WHEN NOT FOUND", soup)  # debug: dump the HTML when no product links appear (often a block or CAPTCHA page)
69 |
70 | return product_links
71 |
72 | except ProxyError as e:
73 | wait_time = backoff_factor ** attempt
74 | print(f"Proxy error: {e}. Retrying in {wait_time} seconds...")
75 | time.sleep(wait_time)
76 | except HTTPError as e:
77 | if e.response.status_code == 412:
78 | print(f"Precondition Failed (412): {e}. Skipping URL.")
79 | break
80 | wait_time = backoff_factor ** attempt
81 | print(f"HTTP error: {e}. Retrying in {wait_time} seconds...")
82 | time.sleep(wait_time)
83 | except Exception as e:
84 | print(f"Failed to get product links for query: {query} on page: {page_number}. Error: {e}")
85 | break
86 |
87 | print(f"Skipping query after {max_retries} retries: {query} on page: {page_number}")
88 | return []
89 |
90 | def extract_product_info(product_url):
91 | print("Processing URL", product_url)
92 | max_retries = 5
93 | backoff_factor = 3
94 | for attempt in range(max_retries):
95 | try:
96 |             response = requests.get(product_url, headers=BASE_HEADERS, proxies=proxies, timeout=60)  # 60s timeout (arbitrary) so a stalled proxy request doesn't hang forever
97 | response.raise_for_status()
98 | soup = BeautifulSoup(response.text, 'html.parser')
99 |             script_tag = soup.find('script', id='__NEXT_DATA__')  # Next.js embeds the page's data as JSON in this script tag
100 |
101 | if script_tag is None:
102 | return None
103 |
104 | data = json.loads(script_tag.string)
105 | initial_data = data["props"]["pageProps"]["initialData"]["data"]
106 | product_data = initial_data["product"]
107 | reviews_data = initial_data.get("reviews", {})
108 |
109 | product_info = {
110 | "price": product_data["priceInfo"]["currentPrice"]["price"],
111 | "review_count": reviews_data.get("totalReviewCount", 0),
112 | "item_id": product_data["usItemId"],
113 | "avg_rating": reviews_data.get("averageOverallRating", 0),
114 | "product_name": product_data["name"],
115 | "brand": product_data.get("brand", ""),
116 | "availability": product_data["availabilityStatus"],
117 | "image_url": product_data["imageInfo"]["thumbnailUrl"],
118 | "short_description": product_data.get("shortDescription", "")
119 | }
120 |
121 | return product_info
122 |
123 | except ProxyError as e:
124 | wait_time = backoff_factor ** attempt
125 | print(f"Proxy error: {e}. Retrying in {wait_time} seconds...")
126 | time.sleep(wait_time)
127 | except HTTPError as e:
128 | if e.response.status_code == 412:
129 | print(f"Precondition Failed (412): {e}. Skipping URL.")
130 | break
131 | wait_time = backoff_factor ** attempt
132 | print(f"HTTP error: {e}. Retrying in {wait_time} seconds...")
133 | time.sleep(wait_time)
134 | except Exception as e:
135 | print(f"Failed to process URL: {product_url}. Error: {e}")
136 | break
137 |
138 | print(f"Skipping URL after {max_retries} retries: {product_url}")
139 | return None
140 |
141 | def main():
142 | with open(OUTPUT_FILE, 'w') as file:
143 | while search_queries:
144 | current_query = search_queries.pop(0)
145 | print("\n\nCURRENT QUERY", current_query, "\n\n")
146 | page_number = 1
147 |
148 | while True:
149 | product_links = get_product_links_from_search_page(current_query, page_number)
150 |                 if not product_links or page_number > 99:  # stop at an empty results page or after 100 pages
151 | break
152 |
153 | for link in product_links:
154 | if link not in seen_urls:
155 | product_queue.put(link)
156 | seen_urls.add(link)
157 |
158 | while not product_queue.empty():
159 | product_url = product_queue.get()
160 | product_info = extract_product_info(product_url)
161 | if product_info:
162 | file.write(json.dumps(product_info) + "\n")
163 |
164 | page_number += 1
165 | print(page_number)
166 |
167 | if __name__ == "__main__":
168 | main()
--------------------------------------------------------------------------------
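
As a closing aside, here is a small sketch (not part of the original repo) showing one way to read the `product_info.jsonl` output back into Python; the field names match the `product_info` dictionary built in `extract_product_info`:

```python
import json

# Read the JSON-lines file produced by walmart_scraper.py: one product per line.
products = []
with open("product_info.jsonl") as f:
    for line in f:
        line = line.strip()
        if line:
            products.append(json.loads(line))

print(f"Loaded {len(products)} products")

# Example: find the lowest-priced item (keys come from extract_product_info).
if products:
    cheapest = min(products, key=lambda p: p["price"])
    print(cheapest["product_name"], cheapest["price"])
```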