├── clean-data └── 2018-fCC-New-Coders-Survey-Data.csv ├── raw-data └── 2018-new-coder-survey.csv ├── readme.md └── scripts └── clean-data-2018.R /readme.md: -------------------------------------------------------------------------------- 1 | # The 2018 New Coder Survey 2 | 3 | This is the open dataset from freeCodeCamp's 2018 survey of more than 30,000 developers. 4 | 5 | ## The data 6 | 7 | You can [read more about last year's 2017 survey here](https://medium.freecodecamp.com/take-the-2017-new-coder-survey-and-help-us-build-a-massive-public-dataset-8c808cbee7eb), along with [some survey statistics](https://medium.freecodecamp.com/we-asked-20-000-people-who-they-are-and-how-theyre-learning-to-code-fff5d668969). 8 | 9 | You can [discuss these datasets in freeCodeCamp's Data Science Gitter chatroom](https://gitter.im/FreeCodeCamp/DataScience). 10 | 11 | As of December 19, 2018, only the raw CSV dump has been uploaded (exported directly from Typeform). Here's the full survey dataset: [2018-new-coder-survey.csv](https://github.com/freeCodeCamp/2018-new-coder-survey/blob/master/raw-data/2018-new-coder-survey.csv). 12 | 13 | The cleaned up and combined data set will eventually be available in the `clean-data/` directory. Soon you will be able to check out the scripts we used to clean the data set in the `scripts/` directory. 14 | 15 | ## License 16 | 17 | This 2018 New Coder Survey is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ 18 | -------------------------------------------------------------------------------- /scripts/clean-data-2018.R: -------------------------------------------------------------------------------- 1 | # title: Clean freeCodeCamp's 2018 New Coder Survey 2 | # description: This script cleans specifically freeCodeCamp's 2018 New Coder 3 | # Survey. 
4 | # author: Eric Leung (@erictleung) 5 | # date: 2018-09-03 6 | # last_updated: 2019-01-28 7 | 8 | 9 | # Overview ------------------------------------------------------------- 10 | # - Load packages Load necessary packages for cleaning 11 | # - Sub-Process Functions Sub-components for processing data 12 | # - Main Process Functions Big, main components of cleaning 13 | # - Main Function Run entire script to clean data 14 | 15 | 16 | # Load Packages -------------------------------------------------------- 17 | 18 | # Used packages in tidyverse 19 | # - dplyr 20 | # - tidyr 21 | # - stringr 22 | library(tidyverse) 23 | library(here) 24 | library(tools) 25 | 26 | 27 | # Sub-Process Functions ----------------------------------- 28 | # Description: 29 | # These functions perform larger grouped data transformations 30 | 31 | # Title: 32 | # Change Characters to One 33 | # Description: 34 | # Lots of columns need to be changed from characters to just 1, indicating that 35 | # the respondent checked this option. Any non-NA input is changed to the 36 | # character "1"; NA input is left as NA. 37 | # 38 | # Note: vchar_to_one is meant to be used on vectors. 
# Input:
#   Vector of characters, can have NA
# Output:
#   Unnamed character vector of the same length, with just "1"'s and NAs
# Usage:
#   > test_vec <- c(sample(letters, 10), rep(NA, 3))
#   > vchar_to_one(test_vec)
#   #  [1] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" NA  NA  NA

# Scalar version: expects a single value. Returns "1" when the value is not
# NA, otherwise returns the value unchanged.
char_to_one <- function(x) {
  if (!is.na(x)) {
    "1"
  } else {
    x
  }
}

# Vector version: natively vectorised instead of `Vectorize(char_to_one)`.
# The `Vectorize()` wrapper looped element by element, named the result after
# the input values and returned `list()` for zero-length input. This version
# always returns a plain character vector of the same length as `x`.
vchar_to_one <- function(x) {
  out <- rep(NA_character_, length(x))
  out[!is.na(x)] <- "1"
  out
}


# Main Process Functions ----------------------------------
# Description:
#   These functions encompass the bulk work of the cleaning and transformation

# Title:
#   Rename Survey Variables
# Description:
#   Replaces the full survey question text used as column names with short
#   names. Every column listed below must exist in `dat`. The repeated
#   "Other" columns are expected to already carry the suffixes `_1` ... `_7`
#   (presumably added by the CSV reader's name repair -- confirm against the
#   raw file).
# Input:
#   Data frame of survey responses with the original question column names
# Output:
#   The same data frame with the listed columns renamed
# Usage:
#   > renamed_data <- rename_data_vars(dat)
rename_data_vars <- function(dat) {
  dat %>%
    rename(
      is_software_dev = "Are you already working as a software developer?",
      is_first_dev_job = "Is this your first software development job?",
      months_job_search = "Before you got this job, how many months did you spend looking for a job?",
      job_pref = "Would you prefer to...",

      # Job interests
      job_intr_fllstck = "Full-Stack Web Developer",
      job_intr_backend = "Back-End Web Developer",
      job_intr_frntend = "Front-End Web Developer",
      job_intr_mobile = "Mobile Developer",
      job_intr_devops = "DevOps / SysAdmin",
      job_intr_datasci = "Data Scientist",
      job_intr_teacher = "Teacher / Trainer / Developer Evangelist",
      job_intr_qa_engn = "Quality Assurance Engineer",
      job_intr_ux_engn = "User Experience Designer",
      # NOTE(review): "projm" reads like project manager, but the question
      # text is "Product Manager". Name kept for output compatibility.
      job_intr_projm = "Product Manager",
      job_intr_gamedev = "Game Developer",
      job_intr_infosec = "Information Security",
      job_intr_dataengn = "Data Engineer",
      job_intr_other = "Other",

      when_appl_job = "When do you plan to start applying for developer jobs?",
      expected_earn = "About how much money do you expect to earn per year at your first developer job (in US Dollars)?",

      job_lctn_pref = "Would you prefer to work...",
      job_relocate = "Are you willing to relocate for a job?",

      # Reasons to code
      reasons_to_code = "What is your biggest reason for learning to code?",
      reasons_to_code_other = "Other_1", # Very inspiring column to read

      # Learning resources
      rsrc_fcc = "freeCodeCamp",
      rsrc_mdn = "Mozilla Developer Network (MDN)",
      rsrc_so = "Stack Overflow",
      rsrc_edx = "EdX",
      rsrc_coursera = "Coursera",
      rsrc_khan_acdm = "Khan Academy",
      rsrc_pluralsght = "Pluralsight",
      rsrc_codeacdm = "Codecademy",
      rsrc_udacity = "Udacity",
      rsrc_udemy = "Udemy",
      rsrc_code_wars = "Code Wars",
      rsrc_treehouse = "Treehouse",
      rsrc_hackerrank = "HackerRank",
      rsrc_frntendmstr = "Front End Masters",
      rsrc_lynda = "Lynda.com",
      rsrc_egghead = "Egghead.io",
      rsrc_css_tricks = "CSS Tricks",
      rsrc_other = "Other_2",

      # Coding events attended
      codeevnt_fcc = "freeCodeCamp study groups",
      codeevnt_hackthn = "hackathons",
      codeevnt_confs = "conferences",
      codeevnt_workshps = "workshops",
      codeevnt_startupwknd = "Startup Weekend",
      codeevnt_nodeschl = "NodeSchool",
      codeevnt_womenwc = "Women Who Code",
      codeevnt_girldevit = "Girl Develop It",
      codeevnt_coderdojo = "CoderDojo",
      codeevnt_meetup = "Meetup.com events",
      codeevnt_railsbrdg = "RailsBridge",
      codeevnt_gamejam = "Game Jam",
      codeevnt_railsgrls = "Rails Girls",
      codeevnt_djangogrls = "Django Girls",
      codeevnt_wkndbtcmp = "weekend bootcamps",
      codeevnt_other = "Other_3",

      # Podcasts listened to
      podcast_fcc = "The freeCodeCamp Podcast",
      podcast_codenewbie = "Code Newbie",
      podcast_changelog = "The Changelog",
      podcast_sedaily = "Software Engineering Daily",
      podcast_js_jabber = "JavaScript Jabber",
      podcast_syntaxfm = "Syntax.fm",
      podcast_ltcwm = "Learn To Code With Me",
      podcast_fullstckrd = "Full Stack Radio",
      podcast_frnthppyhr = "Front End Happy Hour",
      podcast_codingblcks = "Coding Blocks",
      podcast_shoptalk = "Shop Talk Show",
      podcast_devtea = "Developer Tea",
      podcast_progthrwdwn = "Programming Throwdown",
      podcast_geekspeak = "Geek Speak",
      podcast_hanselmnts = "Hanselminutes",
      podcast_talkpythonme = "Talk Python To Me",
      podcast_rubyrogues = "Ruby Rogues",
      podcast_codepenrd = "CodePen Radio",
      podcast_seradio = "Software Engineering Radio",
      podcast_other = "Other_4",

      # YouTube channels
      yt_mit_ocw = "MIT Open Courseware",
      yt_fcc = "freeCodeCamp's YouTube channel",
      yt_computerphile = "Computerphile",
      yt_devtips = "DevTips",
      yt_cs_dojo = "CS Dojo",
      yt_engn_truth = "Engineered Truth",
      yt_learncodeacdm = "LearnCode.Academy",
      yt_lvluptuts = "LevelUpTuts",
      yt_funfunfunct = "Fun Fun Function",
      yt_codingtuts360 = "Coding Tutorials 360",
      yt_codingtrain = "Coding Train",
      yt_derekbanas = "Derek Banas",
      yt_simplilearn = "Simplilearn",
      yt_simpleprog = "Simple Programmer (Bulldog Mindset)",
      yt_mozillahacks = "Mozilla Hacks",
      yt_googledevs = "Google Developers",
      yt_other = "Other_5",

      # Learning information
      hours_learning = "About how many hours do you spend learning each week?",
      months_programming = "About how many months have you been programming for?",

      # Bootcamps
      bootcamp_attend = "Have you attended a full-time coding bootcamp?",
      bootcamp_name = "Which one?",
      bootcamp_finished = "Have you finished yet?",
      bootcamp_have_loan = "Did you take out a loan to pay for the bootcamp?",
      bootcamp_recommend = "Based on your experience, would you recommend this bootcamp to your friends?",

      money_for_learning = "Aside from university tuition, about how much money have you spent on learning to code so far (in US dollars)?",
      age = "How old are you?",

      # Individual's gender
      gender = "What's your gender?",
      gender_other = "Other_6",

      # Demographics
      country_citizen = "Which country are you a citizen of?",
      country_live = "Which country do you currently live in?",
      live_city_population = "About how many people live in your city?",
      is_ethnic_minority = "Are you an ethnic minority in your country?",
      lang_at_home = "Which language do you you speak at home with your family?",

      # Education
      school_degree = "What's the highest degree or level of school you have completed?",
      school_major = "What was the main subject you studied in university?",

      # Personal and family information
      marital_status = "What's your marital status?",
      has_finance_depends = "Do you financially support any dependents?",
      has_children = "Do you have children?",
      num_children = "How many children do you have?",
      do_finance_support = "Do you financially support any elderly relatives or relatives with disabilities?",
      # NOTE(review): named like an amount, but the question is yes/no.
      # Name kept for output compatibility.
      debt_amt = "Do you have any debt?",
      home_mrtg_has = "Do you have a home mortgage?",
      home_mrtg_owe = "About how much do you owe on your home mortgage (in US Dollars)?",
      student_debt_has = "Do you have student loan debt?",
      student_debt_amt = "About how much do you owe in student loans (in US Dollars)?",

      # Employment status information
      curr_emplymnt = "Regarding employment status, are you currently...",
      curr_emplymnt_other = "Other_7",
      curr_field = "Which field do you work in?",
      last_yr_income = "About how much money did you make last year (in US dollars)?",
      # NOTE(review): "communite_time" looks like a typo for "commute_time".
      # Name kept for output compatibility.
      communite_time = "About how many minutes does it take you to get to work each day?",
      # NOTE(review): the question asks about being UNDER-employed, not
      # self-employed. Name kept for output compatibility.
      is_self_employed = "Do you consider yourself under-employed?",
      has_served_military = "Have you served in your country's military before?",
      is_recv_disab_bnft = "Do you receive disability benefits from your government?",
      has_high_spd_ntnet = "Do you have high speed internet at your home?",

      # Miscellaneous
      time_start = "Start Date (UTC)",
      time_end = "Submit Date (UTC)",
      network_id = "Network ID"
    )
}

# Title-case the non-missing entries of a character vector, leaving NA as NA.
# The NA guard is needed because `toTitleCase()` is applied to the whole
# vector and only its results for non-NA positions are kept.
title_case_keep_na <- function(x) {
  if_else(!is.na(x), toTitleCase(x), x)
}


# Main Function -------------------------------------------

# Title:
#   Main Cleaning Function
# Description:
#   This is the main cleaning and transformation function. It reads the raw
#   survey export from `raw-data/`, renames the columns, recodes the checkbox
#   columns to "1"/NA, filters out implausible or inconsistent responses and
#   writes the result to a new file in the `clean-data/` directory.
# Output:
#   Whatever `write_csv()` returns; called for its side effect of writing
#   `clean-data/2018-fCC-New-Coders-Survey-Data.csv`.
# Usage:
#   > main()
main <- function() {
  # Read in data. The file name has to match the file in `raw-data/` exactly;
  # the previous name ("2018-New-Coders-Survey.csv") does not exist there.
  data_path <- here("raw-data", "2018-new-coder-survey.csv")
  dat <- data_path %>%
    read_csv() %>%
    rename(ID = "#")

  # Rename variables with easier names
  renamed_data <- rename_data_vars(dat)

  # Change variables (jobs,rsrc,codeevnt,podcast,yt) to "1"/NA flags. The
  # free-text "*_other" column of each group is left untouched here.
  bool_changed_data <- renamed_data %>%
    # Change job interest
    mutate_at(vars(starts_with("job_intr_"), -job_intr_other),
              vchar_to_one) %>%

    # Change resources
    mutate_at(vars(starts_with("rsrc_"), -rsrc_other),
              vchar_to_one) %>%

    # Change coding events
    mutate_at(vars(starts_with("codeevnt_"), -codeevnt_other),
              vchar_to_one) %>%

    # Change podcasts
    mutate_at(vars(starts_with("podcast_"), -podcast_other),
              vchar_to_one) %>%

    # Change YouTube channels
    mutate_at(vars(starts_with("yt_"), -yt_other),
              vchar_to_one)

  # Remove outliers for age, but keep NA values
  # Oldest living is 116, so filtering on that age
  # https://en.wikipedia.org/wiki/List_of_the_oldest_living_people
  age_outlier_removed <- bool_changed_data %>%
    filter(age < 116 | is.na(age))

  # Remove questionable months learning by cross-checking age
  # Here, convert age to months and take the difference between age
  # and months programming. Here, remove entries that claim you've
  # programmed more than you've been alive.
  # Counts:
  # - 5 years = 60
  # - 10 years = 120
  # - 50 years = 600
  # Rows where either value is missing cannot be cross-checked and are kept,
  # consistent with the other filters. A bare `prog_age_diff > 0` would
  # silently drop them because `filter()` discards rows whose condition is NA.
  age_checked_learning <- age_outlier_removed %>%
    mutate(prog_age_diff = age * 12 - months_programming) %>%
    filter(prog_age_diff > 0 | is.na(prog_age_diff)) %>%
    select(-prog_age_diff)

  # Remove excess money for learning
  # Keep NA values as well
  remove_excess_money_spent <- age_checked_learning %>%
    filter(money_for_learning < 250000 |
             is.na(money_for_learning))

  # Remove high number of children, keeping NA values
  rm_high_num_kids <- remove_excess_money_spent %>%
    filter(num_children < 20 | is.na(num_children))

  # Clean the free-text "other" columns by title casing terms
  cleaned_other_cols <- rm_high_num_kids %>%
    mutate_at(vars(job_intr_other,
                   rsrc_other,
                   codeevnt_other,
                   podcast_other,
                   yt_other),
              title_case_keep_na)

  # Remove inconsistency in children
  # Keep all rational combinations of children
  # - Doesn't have children and has 0 or didn't answer
  # - Has children and has 1 or more children or didn't answer
  # - Didn't answer either question
  # Variables:
  # - num_children
  # - has_children
  rm_inconsistent_kids <- cleaned_other_cols %>%
    filter(
      (has_children == 0 & num_children %in% c(0, NA)) |
        (has_children == 1 & (num_children > 0 | is.na(num_children))) |
        (is.na(has_children) & is.na(num_children))
    )

  # Remove high last year's income if greater than 1 million
  rm_high_income <- rm_inconsistent_kids %>%
    filter(last_yr_income < 1000000 | is.na(last_yr_income))

  # Remove too quick of responses (100 seconds or less).
  # The units are requested explicitly: plain `time_end - time_start` lets R
  # pick secs/mins/hours/days from the data, which would make both the column
  # name and the threshold unreliable.
  rm_quick_responses <- rm_high_income %>%
    mutate(time_total_sec = difftime(time_end, time_start,
                                     units = "secs")) %>%
    filter(time_total_sec > 100)

  # Rename to final
  final <- rm_quick_responses

  # Write the cleaned data. The output path is passed positionally so the
  # call works both with older readr (`path =`) and newer readr (`file =`).
  out_path <- here("clean-data", "2018-fCC-New-Coders-Survey-Data.csv")
  write_csv(final, out_path)
}