├── .gitignore
├── SubmissionConversion.xlsx
├── read_data.m
├── README.md
├── make_predictions.m
└── features.m


/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 | *.m~
3 | 


--------------------------------------------------------------------------------
/SubmissionConversion.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benhamner/Air-Quality-Prediction-Hackathon-Winning-Model/HEAD/SubmissionConversion.xlsx


--------------------------------------------------------------------------------
/read_data.m:
--------------------------------------------------------------------------------
 1 | function data = read_data()
 2 | 
 3 | fprintf('Reading data ...\n');
 4 | fid = fopen('TrainingData.csv');
 5 | fgetl(fid);
 6 | 
 7 | data = zeros(37821,95);
 8 | days = {'"Saturday"','"Sunday"','"Monday"','"Tuesday"','"Wednesday"','"Thursday"','"Friday"'};
 9 | row_cnt = 0;
10 | 
11 | while ~feof(fid)
12 |     row_cnt = row_cnt + 1;
13 |     line = fgetl(fid);
14 |     C = strread(line,'%s','delimiter',',');
15 |     for i=1:95
16 |         if i==5
17 |             data(row_cnt,5) = find(strcmp(days,C{5}));
18 |         else
19 |             if strcmp(C{i},'NA')
20 |                 data(row_cnt,i) = -1000000;
21 |             else
22 |                 data(row_cnt,i) = str2num(C{i});
23 |             end
24 |         end
25 |     end
26 | end
27 | 
28 | fprintf('Read in %d rows\n', row_cnt);
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Winning Code for the EMC Data Science Global Hackathon (Air Quality Prediction)
 2 | -------------------------------------------------------------------------------
 3 | 
 4 | **Competition page:** https://www.kaggle.com/c/dsg-hackathon
 5 | 
 6 | **Blog post on methodology:** http://blog.kaggle.com/2012/05/01/chucking-everything-into-a-random-forest-ben-hamner-on-winning-the-air-quality-prediction-hackathon/
 7 | 
 8 | To train the models and recreate the winning submission (results may differ slightly, because the random number generator was not seeded with a fixed value):
 9 | 
10 | 1. Download TrainingData.csv from https://www.kaggle.com/c/dsg-hackathon/data and put it in this folder
11 | 2. Run make_predictions.m from the Matlab command prompt
12 | 3. Copy the resulting predictions from predictions.csv to the appropriate spreadsheet in SubmissionConversion.xlsx
13 | 4. Save the submission worksheet as a new CSV file


--------------------------------------------------------------------------------
/make_predictions.m:
--------------------------------------------------------------------------------
 1 | function make_predictions()
 2 | 
 3 | prediction_offsets = [1 2 3 4 5 10 17 24 48 72];
 4 | 
 5 | data = read_data();
 6 | 
 7 | test_predictions = zeros(2100,39);
 8 | 
 9 | options = statset()
10 | 
11 | %%% Uncomment the lines below to train models in parallel
12 | % matlabpool open 4
13 | % options = statset('UseParallel','always');
14 | 
15 | for p=1:10
16 |     prediction_offset = prediction_offsets(p);
17 |     [fea_train, train_targets, fea_test, test_chunk_id] = features(data, prediction_offset);
18 |     tic
19 |     for i=1:size(train_targets,2)
20 |         [p,i]
21 |         locs = find(train_targets(:,i)>=0);
22 |         tm = TreeBagger(12,fea_train(locs,:),train_targets(locs,i),'method','regression','minleaf',200,'options',options);
23 |         pred = predict(tm,fea_test);
24 |         for j=1:length(test_chunk_id)
25 |             test_predictions(test_chunk_id(j)*10-10+p,i) = pred(j);
26 |         end
27 |     end
28 |     toc
29 | end
30 | 
31 | for i=1:210
32 |     if isempty(find(i==test_chunk_id))
33 |         for j=1:39
34 |             test_predictions( (i-1)*10+1:i*10,j) = median(test_predictions(:,j));
35 |         end
36 |     end
37 | end
38 | 
39 | dlmwrite('predictions.csv',test_predictions);
40 | 


--------------------------------------------------------------------------------
/features.m:
--------------------------------------------------------------------------------
 1 | function [fea_train, train_targets, fea_test, test_chunk_id] = features(data, prediction_offset)
 2 | 
 3 | time_back = 8;
 4 | 
 5 | fea_train = zeros(40000, 3 + 89*time_back);
 6 | fea_test = zeros(500, 3 + 89*time_back);
 7 | 
 8 | train_targets = zeros(40000, 39);
 9 | 
10 | test_chunk_id = [];
11 | 
12 | fea_cnt = 0;
13 | test_cnt = 0;
14 | for i=1:size(data,1)-time_back-prediction_offset+1
15 |     if data(i,2)==data(i+time_back+prediction_offset-1,2)
16 |         fea_cnt = fea_cnt + 1;
17 |         fea_train(fea_cnt,1:3) = data(i, 4:6);
18 |         this_fea = data(i:i+time_back-1,7:95);
19 |         fea_train(fea_cnt,4:end) = this_fea(:)';
20 |         
21 |         train_targets(fea_cnt, :) = data(i+time_back+prediction_offset-1, 95-39+1:95);
22 |     end
23 |     
24 |     if data(i,2) ~= data(i+1,2)
25 |         test_cnt = test_cnt + 1;
26 |         i_back = i - time_back + 1;
27 |         fea_test(test_cnt,1:3) = data(i_back, 4:6);
28 | 
29 |         this_fea = data(i_back:i_back+time_back-1,7:95);
30 |         fea_test(test_cnt,4:end) = this_fea(:)';
31 |         test_chunk_id(end+1) = data(i_back,2);
32 |     end
33 | end
34 | 
35 | test_cnt = test_cnt + 1;
36 | i_back = size(data,1) - time_back + 1;
37 | fea_test(test_cnt,1:3) = data(i_back, 4:6);
38 | 
39 | this_fea = data(i_back:i_back+time_back-1,7:95);
40 | fea_test(test_cnt,4:end) = this_fea(:)'; 
41 | test_chunk_id(end+1) = data(i_back,2);
42 | 
43 | train_targets = train_targets(1:fea_cnt,:);
44 | fea_train = fea_train(1:fea_cnt,:);
45 | fea_test = fea_test(1:test_cnt, :);
46 | 


--------------------------------------------------------------------------------