% File: read_data.m
function data = read_data(filename)
% READ_DATA  Load the hackathon training CSV into a numeric matrix.
%
%   data = READ_DATA() reads 'TrainingData.csv' from the current folder.
%   data = READ_DATA(filename) reads the given CSV file instead.
%
%   The first line of the file is treated as a header and discarded.
%   Every following non-blank line must contain at least 95
%   comma-separated fields:
%     * column 5 holds a double-quoted weekday name and is stored as its
%       1-based position in the list Saturday, Sunday, ..., Friday;
%     * a field equal to NA is stored as the sentinel -1000000;
%     * every other field is parsed as a number.
%
%   Returns a (rows read)-by-95 double matrix.
%
%   Raises an error if the file cannot be opened, a row has fewer than 95
%   fields, the weekday is not recognised, or a field is not numeric.

if nargin < 1 || isempty(filename)
    filename = 'TrainingData.csv';
end

num_cols      = 95;
day_col       = 5;
na_value      = -1000000;
expected_rows = 37821;   % size of the competition file; used only to preallocate
days = {'"Saturday"','"Sunday"','"Monday"','"Tuesday"','"Wednesday"','"Thursday"','"Friday"'};

fprintf('Reading data ...\n');
fid = fopen(filename, 'r');
if fid == -1
    error('read_data:cannotOpen', 'Could not open "%s" for reading.', filename);
end
% Close the file on every exit path, including errors raised below.
close_file = onCleanup(@() fclose(fid));

fgetl(fid);   % skip the header line

data    = zeros(expected_rows, num_cols);
row_cnt = 0;
line_no = 1;  % physical line number, for error messages

while ~feof(fid)
    line = fgetl(fid);
    line_no = line_no + 1;
    if ~ischar(line)
        break;                       % fgetl returns -1 at end of file
    end
    if isempty(strtrim(line))
        continue;                    % tolerate blank / trailing empty lines
    end

    % strtrim also removes a stray carriage return from CRLF files.
    C = strtrim(regexp(line, ',', 'split'));
    if numel(C) < num_cols
        error('read_data:shortRow', ...
              'Line %d of "%s" has %d fields; expected at least %d.', ...
              line_no, filename, numel(C), num_cols);
    end
    C = C(1:num_cols);

    day_idx = find(strcmp(days, C{day_col}), 1);
    if isempty(day_idx)
        error('read_data:badWeekday', ...
              'Line %d of "%s": unrecognised weekday %s.', ...
              line_no, filename, C{day_col});
    end

    % str2double never evaluates its input (unlike str2num) and returns
    % NaN for anything that is not a number.
    values = str2double(C);
    is_na  = strcmp(C, 'NA');
    values(is_na)   = na_value;
    values(day_col) = day_idx;

    bad = find(isnan(values) & ~strcmpi(C, 'nan'), 1);
    if ~isempty(bad)
        error('read_data:badNumber', ...
              'Line %d of "%s": field %d ("%s") is not numeric.', ...
              line_no, filename, bad, C{bad});
    end

    row_cnt = row_cnt + 1;
    data(row_cnt, :) = values;
end

% Drop unused preallocated rows so a shorter file does not yield fake
% all-zero observations.
data = data(1:row_cnt, :);

fprintf('Read in %d rows\n', row_cnt);
end
% File: make_predictions.m
function make_predictions()
% MAKE_PREDICTIONS  Train one bagged-tree regressor per (offset, target)
% pair and write the test predictions to predictions.csv.
%
%   Usage (from the repository README):
%     1. Download TrainingData.csv from
%        https://www.kaggle.com/c/dsg-hackathon/data into this folder.
%     2. Run make_predictions from the MATLAB command prompt.
%     3. Copy the contents of predictions.csv into the appropriate sheet of
%        SubmissionConversion.xlsx.
%     4. Save the submission worksheet as a new CSV file.
%   Results may differ slightly between runs because the random number
%   generator is not seeded.
%
%   Output layout: predictions.csv has 10 rows per chunk (one per entry of
%   prediction_offsets, in order) for chunk ids 1..210, and 39 columns (one
%   per target). Rows for which no model prediction exists are filled with
%   the per-column median of the rows that were predicted.

prediction_offsets = [1 2 3 4 5 10 17 24 48 72];
n_offsets = numel(prediction_offsets);
n_chunks  = 210;
n_targets = 39;
n_trees   = 12;
min_leaf  = 200;

data = read_data();

n_rows           = n_chunks * n_offsets;
test_predictions = zeros(n_rows, n_targets);
% Tracks which rows actually received a model prediction, so placeholder
% zeros never leak into the fill-in median below.
has_prediction   = false(n_rows, 1);

options = statset();

%%% Uncomment the lines below to train models in parallel
% matlabpool open 4
% options = statset('UseParallel','always');

for p = 1:n_offsets
    prediction_offset = prediction_offsets(p);
    [fea_train, train_targets, fea_test, test_chunk_id] = features(data, prediction_offset);

    % Row of the output that holds offset p for each test chunk.
    out_rows = (test_chunk_id(:) - 1) * n_offsets + p;

    tic
    for i = 1:size(train_targets, 2)
        fprintf('offset %d of %d, target %d of %d\n', ...
                p, n_offsets, i, size(train_targets, 2));

        % Negative targets are the NA sentinel written by read_data.
        valid = train_targets(:, i) >= 0;
        tm = TreeBagger(n_trees, fea_train(valid, :), train_targets(valid, i), ...
                        'method', 'regression', 'minleaf', min_leaf, 'options', options);
        pred = predict(tm, fea_test);
        test_predictions(out_rows, i) = pred(:);
    end
    has_prediction(out_rows) = true;
    toc
end

% Fill rows without a prediction using medians computed once, from
% predicted rows only.
missing = ~has_prediction;
if any(missing) && any(has_prediction)
    fill_values = median(test_predictions(has_prediction, :), 1);
    test_predictions(missing, :) = repmat(fill_values, nnz(missing), 1);
end

dlmwrite('predictions.csv', test_predictions);
end


% File: features.m
function [fea_train, train_targets, fea_test, test_chunk_id] = features(data, prediction_offset)
% FEATURES  Build sliding-window training and test feature matrices.
%
%   data              numeric matrix as returned by read_data (95 columns;
%                     column 2 is compared between rows to decide whether
%                     they belong to the same chunk).
%   prediction_offset number of rows between the end of a feature window
%                     and the row whose targets are predicted.
%
%   Each feature row is [data(first,4:6), window(:)'], where window is the
%   time_back-by-89 slice data(first:first+time_back-1, 7:95).
%
%   fea_train     one row per window whose first row and target row share
%                 the same chunk id.
%   train_targets columns 57:95 of the target row
%                 (first + time_back + prediction_offset - 1).
%   fea_test      one row per chunk, built from that chunk's last time_back
%                 rows.
%   test_chunk_id row vector with the chunk id of each row of fea_test.
%
%   Chunks with fewer than time_back rows are skipped for the test set: a
%   window over them would reach into the previous chunk.

time_back    = 8;
chunk_col    = 2;
meta_cols    = 4:6;
feature_cols = 7:95;
target_cols  = 95-39+1:95;

n_rows = size(data, 1);
n_meta = numel(meta_cols);
n_fea  = n_meta + numel(feature_cols) * time_back;

% ---- training windows ------------------------------------------------
span      = time_back + prediction_offset - 1;   % first row -> target row
n_windows = max(n_rows - span, 0);
starts    = find(data(1:n_windows, chunk_col) == data(1+span:n_rows, chunk_col));

fea_train     = zeros(numel(starts), n_fea);
train_targets = zeros(numel(starts), numel(target_cols));
for k = 1:numel(starts)
    first  = starts(k);
    window = data(first:first+time_back-1, feature_cols);
    fea_train(k, 1:n_meta)     = data(first, meta_cols);
    fea_train(k, n_meta+1:end) = window(:)';
    train_targets(k, :)        = data(first + span, target_cols);
end

% ---- test windows ----------------------------------------------------
% Scan every row for chunk ends, independently of prediction_offset, so
% the set of test chunks is the same for every offset.
chunk_last  = [find(diff(data(:, chunk_col)) ~= 0); n_rows];
chunk_first = chunk_last - time_back + 1;

in_range = chunk_first >= 1;
keep     = false(size(chunk_last));
keep(in_range) = data(chunk_first(in_range), chunk_col) == data(chunk_last(in_range), chunk_col);

chunk_last  = chunk_last(keep);
chunk_first = chunk_first(keep);

fea_test      = zeros(numel(chunk_last), n_fea);
test_chunk_id = data(chunk_last, chunk_col)';
for k = 1:numel(chunk_last)
    first  = chunk_first(k);
    window = data(first:first+time_back-1, feature_cols);
    fea_test(k, 1:n_meta)     = data(first, meta_cols);
    fea_test(k, n_meta+1:end) = window(:)';
end
end