├── data ├── netflix_titles_dirty_01.csv.gz ├── netflix_titles_dirty_02.csv.gz ├── netflix_titles_dirty_03.csv.gz ├── netflix_titles_dirty_04.csv.gz ├── netflix_titles_dirty_05.csv.gz ├── netflix_titles_dirty_06.csv.gz └── netflix_titles_dirty_07.csv.gz ├── assets ├── SparkLiveTraining-shellcommands.png ├── Live Training Slidedeck - Cleaning Data with Pyspark.pdf └── datacamp.svg ├── notebooks ├── python_live_session_template_spark.ipynb ├── python_live_session_template.ipynb └── Cleaning_Data_with_PySpark.ipynb ├── README.md └── Q&A-20200617.md /data/netflix_titles_dirty_01.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/data-cleaning-with-pyspark-live-training/master/data/netflix_titles_dirty_01.csv.gz -------------------------------------------------------------------------------- /data/netflix_titles_dirty_02.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/data-cleaning-with-pyspark-live-training/master/data/netflix_titles_dirty_02.csv.gz -------------------------------------------------------------------------------- /data/netflix_titles_dirty_03.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/data-cleaning-with-pyspark-live-training/master/data/netflix_titles_dirty_03.csv.gz -------------------------------------------------------------------------------- /data/netflix_titles_dirty_04.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/data-cleaning-with-pyspark-live-training/master/data/netflix_titles_dirty_04.csv.gz -------------------------------------------------------------------------------- /data/netflix_titles_dirty_05.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/data-cleaning-with-pyspark-live-training/master/data/netflix_titles_dirty_05.csv.gz -------------------------------------------------------------------------------- /data/netflix_titles_dirty_06.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/data-cleaning-with-pyspark-live-training/master/data/netflix_titles_dirty_06.csv.gz -------------------------------------------------------------------------------- /data/netflix_titles_dirty_07.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/data-cleaning-with-pyspark-live-training/master/data/netflix_titles_dirty_07.csv.gz -------------------------------------------------------------------------------- /assets/SparkLiveTraining-shellcommands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/data-cleaning-with-pyspark-live-training/master/assets/SparkLiveTraining-shellcommands.png -------------------------------------------------------------------------------- /assets/Live Training Slidedeck - Cleaning Data with Pyspark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/data-cleaning-with-pyspark-live-training/master/assets/Live Training Slidedeck - Cleaning Data with Pyspark.pdf -------------------------------------------------------------------------------- /notebooks/python_live_session_template_spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "6Ijg5wUCTQYG" 8 | }, 9 | "source": [ 10 | "
\n",
11 | "\n",
12 | "
\n",
36 | "\n",
37 | "
| \n", 148 | " | listing_id | \n", 149 | "name | \n", 150 | "host_id | \n", 151 | "host_name | \n", 152 | "neighbourhood_full | \n", 153 | "coordinates | \n", 154 | "room_type | \n", 155 | "price | \n", 156 | "number_of_reviews | \n", 157 | "last_review | \n", 158 | "reviews_per_month | \n", 159 | "availability_365 | \n", 160 | "rating | \n", 161 | "number_of_stays | \n", 162 | "5_stars | \n", 163 | "listing_added | \n", 164 | "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", 169 | "13740704 | \n", 170 | "Cozy,budget friendly, cable inc, private entra... | \n", 171 | "20583125 | \n", 172 | "Michel | \n", 173 | "Brooklyn, Flatlands | \n", 174 | "(40.63222, -73.93398) | \n", 175 | "Private room | \n", 176 | "45$ | \n", 177 | "10 | \n", 178 | "2018-12-12 | \n", 179 | "0.70 | \n", 180 | "85 | \n", 181 | "4.100954 | \n", 182 | "12.0 | \n", 183 | "0.609432 | \n", 184 | "2018-06-08 | \n", 185 | "
| 1 | \n", 188 | "22005115 | \n", 189 | "Two floor apartment near Central Park | \n", 190 | "82746113 | \n", 191 | "Cecilia | \n", 192 | "Manhattan, Upper West Side | \n", 193 | "(40.78761, -73.96862) | \n", 194 | "Entire home/apt | \n", 195 | "135$ | \n", 196 | "1 | \n", 197 | "2019-06-30 | \n", 198 | "1.00 | \n", 199 | "145 | \n", 200 | "3.367600 | \n", 201 | "1.2 | \n", 202 | "0.746135 | \n", 203 | "2018-12-25 | \n", 204 | "
| 2 | \n", 207 | "21667615 | \n", 208 | "Beautiful 1BR in Brooklyn Heights | \n", 209 | "78251 | \n", 210 | "Leslie | \n", 211 | "Brooklyn, Brooklyn Heights | \n", 212 | "(40.7007, -73.99517) | \n", 213 | "Entire home/apt | \n", 214 | "150$ | \n", 215 | "0 | \n", 216 | "NaN | \n", 217 | "NaN | \n", 218 | "65 | \n", 219 | "NaN | \n", 220 | "NaN | \n", 221 | "NaN | \n", 222 | "2018-08-15 | \n", 223 | "
| 3 | \n", 226 | "6425850 | \n", 227 | "Spacious, charming studio | \n", 228 | "32715865 | \n", 229 | "Yelena | \n", 230 | "Manhattan, Upper West Side | \n", 231 | "(40.79169, -73.97498) | \n", 232 | "Entire home/apt | \n", 233 | "86$ | \n", 234 | "5 | \n", 235 | "2017-09-23 | \n", 236 | "0.13 | \n", 237 | "0 | \n", 238 | "4.763203 | \n", 239 | "6.0 | \n", 240 | "0.769947 | \n", 241 | "2017-03-20 | \n", 242 | "
| 4 | \n", 245 | "22986519 | \n", 246 | "Bedroom on the lively Lower East Side | \n", 247 | "154262349 | \n", 248 | "Brooke | \n", 249 | "Manhattan, Lower East Side | \n", 250 | "(40.71884, -73.98354) | \n", 251 | "Private room | \n", 252 | "160$ | \n", 253 | "23 | \n", 254 | "2019-06-12 | \n", 255 | "2.29 | \n", 256 | "102 | \n", 257 | "3.822591 | \n", 258 | "27.6 | \n", 259 | "0.649383 | \n", 260 | "2020-10-23 | \n", 261 | "
\n",
36 | "\n",
37 | "