├── Marketplace Scraper - Comments Only.ipynb
├── Marketplace Scraper_Tutorial_Updated.ipynb
└── README.md


/Marketplace Scraper - Comments Only.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "c2524fb2",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Import necessary libraries"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "7f5d4715",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Set up Splinter\n",
 21 |     "\n",
 22 |     "# Set up base URL\n",
 23 |     "\n",
 24 |     "# Set up search parameters\n",
 25 |     "\n",
 26 |     "# Set up full URL"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "id": "3ee819c2",
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "# Visit the website"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "id": "46948ef0",
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "# Scroll down to load more results\n",
 47 |     "\n",
 48 |     "# Define the number of times to scroll the page\n",
 49 |     "\n",
 50 |     "# Define the delay (in seconds) between each scroll\n",
 51 |     "\n",
 52 |     "# Loop to perform scrolling"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "id": "652468fb",
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "# Parse the HTML\n",
 63 |     "\n",
 64 |     "\n",
 65 |     "# Create a BeautifulSoup object from the scraped HTML\n",
 66 |     "\n",
 67 |     "# Check if HTML was scraped correctly\n"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "id": "24a16d2d",
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "# End the automated browsing session\n"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 2,
 83 |    "id": "51cde2e5",
 84 |    "metadata": {},
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "# Extract all the necessary info and insert into lists"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "id": "9411a71c",
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "# Create a regular expression pattern to match city and state entries like \"City, State\"\n",
 98 |     "\n",
 99 |     "\n",
100 |     "# Initialize an empty list to store adjusted mileage entries\n",
101 |     "\n",
102 |     "\n",
103 |     "# Iterate through the original mileage entries\n",
104 |     "\n",
105 |     "    # Append the current mileage entry to the adjusted list\n",
106 |     "\n",
107 |     "    \n",
108 |     "    # Check if the current mileage entry matches the pattern and there are at least two entries in the adjusted list\n",
109 |     "\n",
110 |     "        # If the conditions are met, insert \"0K km\" in between the two consecutive city and state entries\n",
111 |     "        "
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": 3,
117 |    "id": "29e0033d",
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "# Extract the mileage list (separate mileage from location and keep numeric values only)\n",
122 |     "# Define regular expressions to extract numeric mileage values in \"K km\" and \"K miles\" format\n",
123 |     "\n",
124 |     "# Initialize an empty list to store cleaned mileage values\n",
125 |     "\n",
126 |     "\n",
127 |     "# Iterate through the adjusted mileage entries\n",
128 |     "\n",
129 |     "    # Try to find a match for the \"K km\" format\n",
130 |     "    \n",
131 |     "    \n",
132 |     "    # Try to find a match for the \"K miles\" format\n",
133 |     "    \n",
134 |     "    \n",
135 |     "    # Check if either of the formats is found\n",
136 |     "    \n",
137 |     "        # If \"K km\" format is found, convert it to meters and append to the cleaned list\n",
138 |     "        \n",
139 |     "        # If \"K miles\" format is found, convert it to meters and append to the cleaned list\n",
140 |     "        "
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "id": "6e5837ce",
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# Add all values to a list of dictionaries"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "id": "5c1112af",
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "# Add the prefix to the URLs"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "id": "58651bef",
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "# Filter the DataFrame to include rows where the 'Model' column matches the specified model, regardless of case."
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": null,
176 |    "id": "a3cf69d5",
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "# Group the data by \"Year\" and calculate the mean price for each year\n",
181 |     "\n",
182 |     "# Create a wider figure\n",
183 |     "\n",
184 |     "# Create the column chart\n",
185 |     "\n",
186 |     "# Add values on top of the bars with no digits after the decimal point\n",
187 |     "\n",
188 |     "# Convert to an integer to remove digits after the decimal point\n",
189 |     "\n",
190 |     "# Set the labels and title\n",
191 |     "\n",
192 |     "\n",
193 |     "# Use adaptive scale for the year axis\n",
194 |     "\n",
195 |     "\n",
196 |     "# Show the chart\n"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": null,
202 |    "id": "f9966f84",
203 |    "metadata": {},
204 |    "outputs": [],
205 |    "source": [
206 |     "# Find the URL of the cheapest Honda Civic directly from the filtered DataFrame\n",
207 |     "# Print the URL"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "id": "b76ba927",
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": [
217 |     "# Export to CSV"
218 |    ]
219 |   }
220 |  ],
221 |  "metadata": {
222 |   "kernelspec": {
223 |    "display_name": "Python 3 (ipykernel)",
224 |    "language": "python",
225 |    "name": "python3"
226 |   },
227 |   "language_info": {
228 |    "codemirror_mode": {
229 |     "name": "ipython",
230 |     "version": 3
231 |    },
232 |    "file_extension": ".py",
233 |    "mimetype": "text/x-python",
234 |    "name": "python",
235 |    "nbconvert_exporter": "python",
236 |    "pygments_lexer": "ipython3",
237 |    "version": "3.9.13"
238 |   }
239 |  },
240 |  "nbformat": 4,
241 |  "nbformat_minor": 5
242 | }
243 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # facebook-marketplace-webscraping
2 | Facebook Marketplace Scraper Using Python, BeautifulSoup, Splinter, Pandas...
3 | 
4 | <h3>
5 |   🔥  Avoid getting BLOCKED or RESTRICTED with <a href="https://shorturl.at/eCOdT" target="_blank">quality proxies</a>.
6 | </h3>
7 | 
8 | 


--------------------------------------------------------------------------------