├── Marketplace Scraper - Comments Only.ipynb ├── Marketplace Scraper_Tutorial_Updated.ipynb └── README.md /Marketplace Scraper - Comments Only.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "c2524fb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Import necessary libraries" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "7f5d4715", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Set up Splinter\n", 21 | "\n", 22 | "# Set up base URL\n", 23 | "\n", 24 | "# Set up search parameters\n", 25 | "\n", 26 | "# Set up full URL" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "3ee819c2", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Visit the website" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "46948ef0", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Scroll down to load more results\n", 47 | "\n", 48 | "# Define the number of times to scroll the page\n", 49 | "\n", 50 | "# Define the delay (in seconds) between each scroll\n", 51 | "\n", 52 | "# Loop to perform scrolling" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "652468fb", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Parse the HTML\n", 63 | "\n", 64 | "\n", 65 | "# Create a BeautifulSoup object from the scraped HTML\n", 66 | "\n", 67 | "# Check if HTML was scraped correctly\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "24a16d2d", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# End the automated browsing session\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 2, 83 | "id": "51cde2e5", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": 
[ 87 | "# Extract all the necessary info and insert into lists" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "id": "9411a71c", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# Create a regular expression pattern to match city and state entries like \"City, State\"\n", 98 | "\n", 99 | "\n", 100 | "# Initialize an empty list to store adjusted mileage entries\n", 101 | "\n", 102 | "\n", 103 | "# Iterate through the original mileage entries\n", 104 | "\n", 105 | " # Append the current mileage entry to the adjusted list\n", 106 | "\n", 107 | " \n", 108 | " # Check if the current mileage entry matches the pattern and there are at least two entries in the adjusted list\n", 109 | "\n", 110 | " # If the conditions are met, insert \"0K km\" in between the two consecutive city and state entries\n", 111 | " " 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 3, 117 | "id": "29e0033d", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "# Extracted mileage list (separate from location and extract numeric values only)\n", 122 | "# Define regular expressions to extract numeric mileage values in \"K km\" and \"K miles\" format\n", 123 | "\n", 124 | "# Initialize an empty list to store cleaned mileage values\n", 125 | "\n", 126 | "\n", 127 | "# Iterate through the adjusted mileage entries\n", 128 | "\n", 129 | " # Try to find a match for the \"K km\" format\n", 130 | " \n", 131 | " \n", 132 | " # Try to find a match for the \"K miles\" format\n", 133 | " \n", 134 | " \n", 135 | " # Check if either of the formats is found\n", 136 | " \n", 137 | " # If \"K km\" format is found, convert it to meters and append to the cleaned list\n", 138 | " \n", 139 | " # If \"K miles\" format is found, convert it to meters and append to the cleaned list\n", 140 | " " 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "6e5837ce", 147 | "metadata": {}, 148 | 
"outputs": [], 149 | "source": [ 150 | "# Add all values to a list of dictionaries" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "5c1112af", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "# Add the prefix to the URLs" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "58651bef", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# Filter the DataFrame to include rows where the 'Model' column matches the specified model, regardless of case." 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "a3cf69d5", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# Group the data by \"Year\" and calculate the mean price for each year\n", 181 | "\n", 182 | "# Create a wider figure\n", 183 | "\n", 184 | "# Create the column chart\n", 185 | "\n", 186 | "# Add values on top of the bars with no digits after the period\n", 187 | "\n", 188 | "# Convert to an integer to remove digits after the period\n", 189 | "\n", 190 | "# Set the labels and title\n", 191 | "\n", 192 | "\n", 193 | "# Use adaptive scale for the year axis\n", 194 | "\n", 195 | "\n", 196 | "# Show the chart\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "f9966f84", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# Find the URL of the cheapest Honda Civic directly from the filtered DataFrame\n", 207 | "# Print the URL" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "b76ba927", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# Export to CSV" 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3 (ipykernel)", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | 
"version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.9.13" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 5 242 | } 243 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # facebook-marketplace-webscraping 2 | Facebook Marketplace Scraper Using Python, BeautifulSoup, Splinter, Pandas... 3 | 4 |

5 | 🔥 Avoid getting BLOCKED or RESTRICTED with quality proxies. 6 |

7 | 8 | --------------------------------------------------------------------------------