├── ASIS_Soil_SVM.R ├── GBM_talk_Austin_R_Users_20140724.R ├── README.md ├── bci_challenge ├── README.md ├── createMatFiles_fromR.R ├── primary_model.R └── use_tf_kernel.m ├── loan_default_loss_basics.R ├── modern_logistic_regression ├── README.md ├── fast_solution.py ├── fast_solution_v2.py └── fast_solution_v3.py ├── rain ├── README.md ├── prepareTrain.R └── rain_functions.R ├── rossmann-store-sales └── ACM-data-science-camp.R ├── see_click_predict_fix_basis.R └── will_it_rain └── histogram_starter_model.R /ASIS_Soil_SVM.R: -------------------------------------------------------------------------------- 1 | ## this code borrows the main structure from the forum post regarding beating the benchmark 2 | ## additions to it are mainly: using two layers, updating cost parameters, capping low/high values 3 | ## trimming the range for some targets, and using the [incorrectly calculated] derivatives. 4 | 5 | library(e1071) 6 | ## this is not the true derivative, but it took a while before I caught it, and didn't fix it 7 | ## it is the one-column difference 8 | deriv<-function(a){b<-c(0,a[1:length(a)-1]); a[1]<-0; return(a-b)} 9 | 10 | train <- read.csv("./training.csv",header=TRUE,stringsAsFactors=FALSE) 11 | test <- read.csv("./sorted_test.csv",header=TRUE,stringsAsFactors=FALSE) 12 | submission <- test[,1] 13 | header <- test[,1] 14 | train[,1]<-NULL 15 | test[,1]<-NULL 16 | labels <- train[,c("Ca","P","pH","SOC","Sand")] 17 | 18 | ## running these in loops is not good R programming, but doing it once and exporting to .Rdata, it was simple enough to move on 19 | dTrain<-train[,1:3578]*0 20 | dTest<-test[,1:3578]*0 21 | for(i in 1:nrow(dTrain)){dTrain[i,]<-deriv(t(train[i,1:3578])); if(i%%30==1){print(i)}} 22 | for(i in 1:nrow(dTest)){dTest[i,]<-deriv(t(test[i,1:3578])); if(i%%30==1){print(i)}} 23 | d2Train<-train[,1:3578]*0 24 | d2Test<-test[,1:3578]*0 25 | for(i in 1:nrow(d2Train)){d2Train[i,]<-deriv(t(dTrain[i,1:3578])); if(i%%30==1){print(i)}} 26 | for(i in 1:nrow(d2Test)){d2Test[i,]<-deriv(t(dTest[i,1:3578])); if(i%%30==1){print(i)}} 27 | 28 | ## 10-fold cross-validation, but applied in continuous chunks to help keep grid sections together 29 | ## intro model applied just to infrared columns in the last ~35% 30 | cuts<-rep(0,11) 31 | for(i in 1:10){cuts[i+1]<-round(nrow(train)*i*0.1,0)} 32 | getCaHoldout<-function(col){ 33 | for(fold in 1:10){ 34 | idx<-rep(TRUE,nrow(train)) 35 | idx[(cuts[fold]+1):cuts[fold+1]]<-FALSE 36 | trainLocal<-cbind(train[idx,2200:3578],dTrain[idx,2200:3578]) 37 | testLocal<-cbind(train[idx==FALSE,2200:3578],dTrain[idx==FALSE,2200:3578]) 38 | #train$CaEstimate<-round(labels$Ca[idx],1) 39 | #test$CaEstimate<-round(labels$Ca[idx==FALSE],1) 40 | 41 | svmCa <- svm(trainLocal,labels[idx,col],cost=10000,scale=FALSE) 42 | if(fold==1){p <- predict(svmCa,newdata=testLocal); h <- predict(svmCa,newdata=cbind(test[,2200:3578],dTest[,2200:3578]))} 43 | if(fold>1){p<-c(p,predict(svmCa,newdata=testLocal)); h<-cbind(h,predict(svmCa,newdata=cbind(test[,2200:3578],dTest[,2200:3578])))} 44 | } 45 | return(c(p,rowMeans(h))) 46 | } 47 | reduce1<-getCaHoldout(1) 48 | reduce2<-getCaHoldout(2) 49 | reduce3<-getCaHoldout(3) 50 | reduce4<-getCaHoldout(4) 51 | reduce5<-getCaHoldout(5) 52 | 53 | ##create separate sets for each subproblem 54 | ## Ca: derivatives appear not to help; having more data appears to help, despite significant part of spectrum 55 | ## P: use as much as possible, lenght+derivatives; likely should use holdout predictions from the other elements as well 56 | ## pH: 
tough to improve; do not add derivatives 57 | ## SOC: no derivatives; local points help vs. all 58 | ## Sand: derivatives help; local points help 59 | 60 | #make column usage specific to each target 61 | #align with ordering of labels/targets: labels <- train[,c("Ca","P","pH","SOC","Sand")] 62 | starts<-c(1,1,2200,2200,2300) 63 | ends<-rep(3578,5) 64 | derivStarts<-c(2,2200,2,2,2300) 65 | derivEnds<-c(3,3578,3,3,3578) 66 | #deriv2Starts<-c(2,2,2,2,2900) 67 | #deriv2Ends<-c(3,3,3,3,3250) 68 | 69 | ##add back in the initial predictions from all five models, to all five models; e.g. Ca can see P, pH, SOC, and Sand predictions 70 | i<-1 71 | svmTrains<-list(cbind(train[,starts[i]:ends[i]],dTrain[,derivStarts[i]:derivEnds[i]],reduce1[1:nrow(train)] 72 | ,reduce2[1:nrow(train)],reduce3[1:nrow(train)],reduce4[1:nrow(train)],reduce5[1:nrow(train)])) 73 | svmTests<-list(cbind(test[,starts[i]:ends[i]],dTest[,derivStarts[i]:derivEnds[i]],reduce1[(nrow(train)+1):length(reduce1)] 74 | ,reduce2[(nrow(train)+1):length(reduce2)],reduce3[(nrow(train)+1):length(reduce3)] 75 | ,reduce4[(nrow(train)+1):length(reduce4)],reduce5[(nrow(train)+1):length(reduce5)] 76 | )) 77 | colnames(svmTrains[[i]])[(ncol(svmTrains[[i]])-4):ncol(svmTrains[[i]])]<-c("CaModel","PModel","pHModel","SOCModel","SandModel") 78 | colnames(svmTests[[i]])[(ncol(svmTests[[i]])-4):ncol(svmTests[[i]])]<-c("CaModel","PModel","pHModel","SOCModel","SandModel") 79 | for(i in 2:5){ 80 | svmTrains<-c(svmTrains,list(cbind(train[,starts[i]:ends[i]],dTrain[,derivStarts[i]:derivEnds[i]],reduce1[1:nrow(train)] 81 | ,reduce2[1:nrow(train)],reduce3[1:nrow(train)],reduce4[1:nrow(train)],reduce5[1:nrow(train)]))) 82 | svmTests<-c(svmTests,list(cbind(test[,starts[i]:ends[i]],dTest[,derivStarts[i]:derivEnds[i]],reduce1[(nrow(train)+1):length(reduce1)] 83 | ,reduce2[(nrow(train)+1):length(reduce2)],reduce3[(nrow(train)+1):length(reduce3)] 84 | ,reduce4[(nrow(train)+1):length(reduce4)],reduce5[(nrow(train)+1):length(reduce5)] 85 | ))) 86 | colnames(svmTrains[[i]])[(ncol(svmTrains[[i]])-4):ncol(svmTrains[[i]])]<-c("CaModel","PModel","pHModel","SOCModel","SandModel") 87 | colnames(svmTests[[i]])[(ncol(svmTests[[i]])-4):ncol(svmTests[[i]])]<-c("CaModel","PModel","pHModel","SOCModel","SandModel") 88 | } 89 | 90 | ##last-second add of Depth as binary column 91 | isTopTrain<-as.data.frame(ifelse(train$Depth=="Topsoil",1,0)) 92 | isTopTest<-as.data.frame(ifelse(test$Depth=="Topsoil",1,0)) 93 | colnames(isTopTrain)<-"isTopsoil" 94 | colnames(isTopTest)<-"isTopsoil" 95 | ## separate costs per SVM; these were tuned using cross-validation and roughly grid search 96 | ## adjusting gamma was also suggested by the tuning, but it was incorrectly applied when I used it and I scaled back to just cost 97 | costs<-c(15000,15000,15000,10000,20000) 98 | 99 | AllSvms <- lapply(1:ncol(labels),function(i){svm(cbind(svmTrains[[i]],isTopTrain),labels[,i],cost=costs[i],scale=FALSE)}) 100 | predictions1 <- sapply(1:ncol(labels),function(i){predict(AllSvms[[i]],newdata=cbind(svmTests[[i]],isTopTest))}) 101 | 102 | predictions<-as.data.frame(unlist(predictions1)) 103 | colnames(predictions) <- c("Ca","P","pH","SOC","Sand") 104 | 105 | ## truncate final predictions to be within range of training data; this helped considerably in early modeling 106 | predictions[,1]<-pmax(pmin(predictions[,1],max(labels$Ca)),min(labels$Ca)) 107 | predictions[,2]<-pmax(pmin(predictions[,2],1),min(labels$P)) 108 | predictions[,3]<-pmax(pmin(predictions[,3],max(labels$pH)),min(labels$pH)) 109 | 
predictions[,4]<-pmax(pmin(predictions[,4],max(labels$SOC)),min(labels$SOC)) 110 | predictions[,5]<-pmax(pmin(predictions[,5],max(labels$Sand)),min(labels$Sand)) 111 | 112 | submission <- cbind(PIDN=header,predictions) 113 | write.csv(submission,"submission_20141021a.csv",row.names=FALSE,quote=FALSE) 114 | -------------------------------------------------------------------------------- /GBM_talk_Austin_R_Users_20140724.R: -------------------------------------------------------------------------------- 1 | library(Metrics) ##load evaluation package 2 | setwd("C:/Users/Mark_Landry/Documents/K/dozer/") 3 | ##Done in advance to speed up loading of data set 4 | train<-read.csv("Train.csv") 5 | train$saleTransform<-strptime(train$saledate,"%m/%d/%Y %H:%M") 6 | train<-train[order(train$saleTransform),] 7 | save(train,file="rTrain.Rdata") 8 | 9 | 10 | load("rTrain.Rdata") 11 | xTrain<-train[(nrow(train)-149999):(nrow(train)-50000),5:ncol(train)] 12 | xTest<-train[(nrow(train)-49999):nrow(train),5:ncol(train)] 13 | yTrain<-train[(nrow(train)-149999):(nrow(train)-50000),2] 14 | yTest<-train[(nrow(train)-49999):nrow(train),2] 15 | 16 | dim(xTrain); dim(xTest) 17 | sapply(xTrain,function(x) length(levels(x))) 18 | ## check levels; gbm is robust, but still has a limit of 1024 per factor; for initial model, remove 19 | ## after iterating through model, would want to go back and compress these factors to investigate 20 | ## their usefulness (or other information analysis) 21 | xTrain$saledate<-NULL; xTest$saledate<-NULL 22 | xTrain$fiModelDesc<-NULL; xTest$fiModelDesc<-NULL 23 | xTrain$fiBaseModel<-NULL; xTest$fiBaseModel<-NULL 24 | xTrain$saleTransform<-NULL; xTest$saleTransform<-NULL 25 | 26 | library(gbm) 27 | ## Set up parameters to pass in; there are many more hyper-parameters available, but these are the most common to control 28 | GBM_NTREES = 400 29 | ## 400 trees in the model; can scale back later for predictions, if desired or overfitting is suspected 30 | GBM_SHRINKAGE = 0.05 31 | ## shrinkage is a regularization parameter dictating how fast/aggressive the algorithm moves across the loss gradient 32 | ## 0.05 is somewhat aggressive; default is 0.001, values below 0.1 tend to produce good results 33 | ## decreasing shrinkage generally improves results, but requires more trees, so the two should be adjusted in tandem 34 | GBM_DEPTH = 4 35 | ## depth 4 means each tree will evaluate four decisions; 36 | ## will always yield [3*depth + 1] nodes and [2*depth + 1] terminal nodes (depth 4 = 9) 37 | ## because each decision yields 3 nodes, but each decision will come from a prior node 38 | GBM_MINOBS = 30 39 | ## regularization parameter to dictate how many observations must be present to yield a terminal node 40 | ## higher number means more conservative fit; 30 is fairly high, but good for exploratory fits; default is 10 41 | 42 | ## Fit model 43 | g<-gbm.fit(x=xTrain,y=yTrain,distribution = "gaussian",n.trees = GBM_NTREES,shrinkage = GBM_SHRINKAGE, 44 | interaction.depth = GBM_DEPTH,n.minobsinnode = GBM_MINOBS) 45 | ## gbm fit; provide all remaining independent variables in xTrain; provide targets as yTrain; 46 | ## gaussian distribution will optimize squared loss; 47 | 48 | ## get predictions; first on train set, then on unseen test data 49 | tP1 <- predict.gbm(object = g,newdata = xTrain,GBM_NTREES) 50 | hP1 <- predict.gbm(object = g,newdata = xTest,GBM_NTREES) 51 | 52 | ## compare model performance to default (overall mean) 53 | rmse(yTrain,tP1) ## 9452.742 on data used for training 54 | 
rmse(yTest,hP1) ## 9740.559, ~3% worse on unseen data; the model does not appear overfit
55 | rmse(yTest,mean(yTrain)) ## 24481.08 for the overall mean; the GBM cuts that error (relative to a perfect model) by roughly 60%
56 | 
57 | ## look at variables
58 | summary(g) ## summary plots and prints the relative influence of each variable across the entire GBM model (all trees)
59 | 
60 | ## test dominant variable mean
61 | library(sqldf)
62 | trainProdClass<-as.data.frame(cbind(as.character(xTrain$fiProductClassDesc),yTrain))
63 | testProdClass<-as.data.frame(cbind(as.character(xTest$fiProductClassDesc),yTest))
64 | colnames(trainProdClass)<-c("fiProductClassDesc","y"); colnames(testProdClass)<-c("fiProductClassDesc","y")
65 | ProdClassMeans<-sqldf("SELECT fiProductClassDesc,avg(y) avg, COUNT(*) n FROM trainProdClass GROUP BY fiProductClassDesc")
66 | ProdClassPredictions<-sqldf("SELECT case when n > 30 then avg ELSE 31348.63 end avg
67 | FROM ProdClassMeans P LEFT JOIN testProdClass t ON t.fiProductClassDesc = P.fiProductClassDesc")
68 | rmse(yTest,ProdClassPredictions$avg) ## 29082.64: a peculiar result for the fiProductClassDesc means, which had seemed fairly stable and useful
69 | ## suggests the primary factor alone is not helpful; the full tree ensemble is needed
70 | 
71 | 
72 | ## Investigate actual GBM model
73 | pretty.gbm.tree(g,1) ## show underlying model for the first decision tree
74 | summary(xTrain[,10]) ## the underlying model showed variable 9 as the first split point (0-indexed 9 = 10th column)
75 | g$initF ## view what is effectively the "y intercept"
76 | mean(yTrain) ## equivalence shows the gaussian y intercept is the mean
77 | t(g$c.splits[1][[1]]) ## show whether each factor level should go left or right
78 | plot(g,10) ## plot fiProductClassDesc, the variable with the highest rel.inf
79 | plot(g,3) ## plot YearMade, continuous variable with 2nd highest rel.inf
80 | interact.gbm(g,xTrain,c(10,3))
81 | ## compute Friedman's H statistic to show interaction strength; it compares the joint partial dependence of the two variables with the sum of their individual effects
82 | interact.gbm(g,xTrain,c(10,3))
83 | ## example of an uninteresting interaction
84 | 
85 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Public Kaggle Code and Info
2 | ======
3 | 
4 | I will try to post code when I wind up close enough to the top that my models might be slightly interesting.
5 | 
6 | Unfortunately, most of my development happens in fragments, often split between SQL and R, so I'll do my best to post the key functions along with enough driver code to get the point across.
7 | 
--------------------------------------------------------------------------------
/bci_challenge/README.md:
--------------------------------------------------------------------------------
1 | Upcoming notes regarding the Kaggle BCI Challenge.
2 | -------------------------------------------------------------------------------- /bci_challenge/createMatFiles_fromR.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(R.matlab) 3 | personList<-c("S02","S06","S07","S11","S12","S13","S14","S16","S17","S18","S20","S21","S22","S23","S24","S26") 4 | sessionList<-c("01","02","03","04","05") 5 | for(i in 1:length(personList)){ 6 | for(j in 1:length(sessionList)){ 7 | fileName<-paste0("train/Data_",personList[i],"_Sess",sessionList[j]) 8 | x<-as.matrix(fread(paste0(fileName,".csv"))) 9 | writeMat(paste0(fileName,".mat"),x=x) 10 | } 11 | } 12 | 13 | library(data.table) 14 | library(R.matlab) 15 | personList<-c("S01","S03","S04","S05","S08","S09","S10","S15","S19","S25") 16 | sessionList<-c("01","02","03","04","05") 17 | for(i in 1:length(personList)){ 18 | for(j in 1:length(sessionList)){ 19 | fileName<-paste0("train/Data_",personList[i],"_Sess",sessionList[j]) 20 | x<-as.matrix(fread(paste0(fileName,".csv"))) 21 | writeMat(paste0(fileName,".mat"),x=x) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /bci_challenge/primary_model.R: -------------------------------------------------------------------------------- 1 | ################################################################### 2 | ## Function Definitions 3 | ## these are very hard-coded to work with this data set 4 | ## simply here to save typing, not much for modularity/flexibility 5 | ################################################################### 6 | 7 | getOneSec<-function(person,session){ 8 | ##fread a given person ID and session ID (character IDs, not integers) 9 | ##find all feedback events, get 1.000 seconds that follow 10 | ##retain the Cz and EOG channels 11 | ##returns data.frame with session, person, feedback id, Cz and EOG channel data 12 | library(data.table) 13 | fileName<-paste0("train/Data_",person,"_Sess",session,".csv") 14 | a<-fread(fileName) 15 | a[,rowId:=seq(1:nrow(a))] 16 | feedBacks<-a[FeedBackEvent==1,rowId] 17 | feedBackNum<-1 18 | returnSet<-a[feedBacks[feedBackNum]:(feedBacks[feedBackNum]+200),list(Cz,EOG,feedBackNum)] 19 | for(feedBackNum in 2:length(feedBacks)){returnSet<-rbind(returnSet,a[feedBacks[feedBackNum]:(feedBacks[feedBackNum]+200),list(Cz,EOG,feedBackNum)])} 20 | returnSet[,Person:=person] 21 | returnSet[,Session:=session] 22 | returnSet[,IdFeedBack:=paste0(Person,"_Sess",Session,"_FB",ifelse(feedBackNum<10,"00",ifelse(feedBackNum<100,"0","")),feedBackNum)] 23 | return(as.data.frame(returnSet)) 24 | } 25 | 26 | getMovingAvgs<-function(vec){ 27 | ## originally picked specific vectors, but after comparing; always used p15 28 | globalMin<-min(vec) 29 | globalMax<-max(vec) 30 | scaled<-(globalMax-vec)/(globalMax-globalMin) 31 | p5<-rep(0,186) 32 | p15<-rep(0,186) 33 | for(i in 1:186){ 34 | p5[i]<-mean(scaled[(i+10):(i+14)]) 35 | p15[i]<-mean(scaled[i:(i+14)]) 36 | } 37 | return(as.data.frame(cbind(p5,p15))) 38 | } 39 | 40 | fitGbm<-function(x,y,outType="auc",trees=100,shrink=0.05,no=10,depth=4,cvInt=1){ 41 | ##runs a gbm and passes back either the score, the fit object, or a vector of predictions 42 | ## also performs very specifically leave-one-out CV 43 | library(gbm) 44 | library(Metrics) 45 | if(cvInt>=0){ 46 | idx<-rep(FALSE,nrow(x)) 47 | idx[(1+(cvInt-1)*340):(cvInt*340)]<-TRUE 48 | x1<-x[idx==FALSE,] 49 | y1<-y[idx==FALSE] 50 | x2<-x[idx,] 51 | y2<-y[idx] 52 | } 53 | if(cvInt<0){x1<-x; x2<-x; y1<-y; y2<-y} 54 | 
fit<-gbm.fit(x=x1,y=y1,distribution="adaboost",n.trees=trees,shrinkage=shrink,n.minobsinnode=no,interaction.depth=depth) 55 | p<-predict(fit,newdata=x2,n.trees=trees,type="response") 56 | if(outType=="auc"){print(summary(fit,plotit=FALSE)); return(auc(y2,p))} 57 | if(outType=="predictions"){return(p)} 58 | if(outType=="model"){return(fit)} 59 | } 60 | 61 | prepareFeatures<-function(personSession,depth=60,vec=1){ 62 | ## given session data, calculate some hand-crafted features that seemed 63 | ## interesting when looking at plots 64 | ## stated AUCs are against the training set, in its entirety, I believe; so probably higher than what would translate to new data 65 | for(i in 1:depth){ 66 | b<-getMovingAvgs(personSession[(1+((i-1)*201)):(i*201),vec]) 67 | tmp<-cbind( 68 | sum(b[1:92,2])-sum(b[93:184,2]) ##auc 0.625 69 | ,findMin(b[,2],20,70) ##auc 0.604 70 | ,findDepthToMinDrop(b[,2],20,70) ##auc 0.5809985 71 | ,findDepthAfterMinDrop(b[,2],20,70) ##auc 0.5875286 72 | ,findDepthAfterMinDrop(b[,2],90,170) ##auc 0.5737571 73 | ,b[nrow(b),2]-b[1,2] ##auc 0.5654925 74 | ,sd(b[,2]) ##auc 0.55485 (1-auc) 75 | ,getVolatility(b[,2],110,160) ##auc 0.5299781 76 | ,sd(b[20:70,2])/getVolatility(b[,2],20,70) ##auc 0.5608 77 | ,personSession[(1+((i-1)*201)),3] 78 | ,personSession[(1+((i-1)*201)),5] 79 | ,t(b[1:184,2]) 80 | ,min(personSession[(1+((i-1)*201)),vec]) 81 | ,max(personSession[(1+((i-1)*201)),vec]) 82 | ,max(personSession[(1+((i-1)*201)),vec])-min(personSession[(1+((i-1)*201)),vec]) 83 | ) 84 | if(i==1){x<-tmp} 85 | if(i>1){x<-rbind(x,tmp)} 86 | } 87 | return(x) 88 | } 89 | 90 | findMin<-function(vec,cutMin=1,cutMax=10000){ 91 | vec<-vec[cutMin:(pmin(length(vec),cutMax))] 92 | id<-seq(1:length(vec)) 93 | return(id[vec==min(vec)]) 94 | } 95 | 96 | findDepthToMinDrop<-function(vec,cutMin=1,cutMax=10000){ 97 | vec<-vec[cutMin:(pmin(length(vec),cutMax))] 98 | id<-seq(1:length(vec)) 99 | end<-id[vec==min(vec)] 100 | return(max(vec[pmax(1,end-30):end])-vec[end]) 101 | } 102 | 103 | findDepthAfterMinDrop<-function(vec,cutMin=1,cutMax=10000){ 104 | vec<-vec[cutMin:(pmin(length(vec),cutMax))] 105 | id<-seq(1:length(vec)) 106 | end<-id[vec==min(vec)] 107 | return(max(vec[pmin(length(vec),end+30):end])-vec[end]) 108 | } 109 | 110 | getVolatility<-function(vec,cutMin=1,cutMax=10000){ 111 | vec<-vec[cutMin:(pmin(length(vec),cutMax))] 112 | return(sd(vec[2:length(vec)]-vec[1:(length(vec)-1)])) 113 | } 114 | 115 | 116 | getFeedBacks<-function(person,session,cap=100){ 117 | ##fread a given person ID and session ID (character IDs, not integers) 118 | ##return offsets of the FeedBackEvents 119 | library(data.table) 120 | fileName<-paste0("train/Data_",person,"_Sess",session,".csv") 121 | a<-fread(fileName) 122 | a[,rowId:=seq(1:nrow(a))] 123 | feedBacks<-a[FeedBackEvent==1,rowId] 124 | return(feedBacks[1:pmin(cap,length(feedBacks))]) 125 | } 126 | 127 | createMagic<-function(v,cutoff){ 128 | ## take in a vector of feedback differences 129 | ## produce a vector where negative means likely wrong; positive is likely right 130 | ## and the values are scaled by the implied accuracy rate from the feedback timings 131 | magic<-ifelse(v>=70,0,ifelse(v>cutoff,-1,1)) 132 | net<-sum(magic) 133 | scalar<-ifelse(net>75,4, 134 | ifelse(net>56,3, 135 | ifelse(net>12,2,1))) 136 | magic<-magic*scalar 137 | return(magic) 138 | } 139 | 140 | 141 | 142 | ############################### 143 | ## Create Train Data Frame 144 | ############################### 145 | 146 | ##get main features: 11 hand-crafted and higher-level features 
(subject/session) plus ~180 15-period averaged min/max scaled Cz signals 147 | ##currently set to use vector 1 (Cz); later used 2 (EOG), but minimal improvement 148 | personList<-c("S02","S06","S07","S11","S12","S13","S14","S16","S17","S18","S20","S21","S22","S23","S24","S26") 149 | sessionList<-c("01","02","03","04","05") 150 | depthList<-c(60,60,60,60,100) 151 | for(i in 1:length(personList)){ 152 | for(j in 1:length(sessionList)){ 153 | if(i*j==1){xAll<-prepareFeatures(getOneSec(personList[i],sessionList[j]),depthList[j],1)} 154 | if(i*j>1){xAll<-rbind(xAll,prepareFeatures(getOneSec(personList[i],sessionList[j]),depthList[j],1))} 155 | print(dim(xAll)) 156 | } 157 | } 158 | 159 | 160 | ## use coefficients of PLS model associating EOG with features from Matlab time-frequency kernel 161 | for(i in 1:length(personList)){ 162 | for(j in 1:length(sessionList)){ 163 | fileBase<-paste0("pls/Data_",personList[i],"_Sess",sessionList[j],"_pls.csv") 164 | if(i*j==1){plsAll<-read.csv(fileBase,header=FALSE)} 165 | if(i*j>1){plsAll<-rbind(plsAll,read.csv(fileBase,header=FALSE))} 166 | } 167 | } 168 | 169 | ## get timings between subjects 170 | for(i in 1:length(personList)){ 171 | for(j in 1:length(sessionList)){ 172 | if(i*j==1){xFB<-getFeedBacks(personList[i],sessionList[j],depthList[j])} 173 | if(i*j>1){xFB<-c(xFB,getFeedBacks(personList[i],sessionList[j],depthList[j]))} 174 | } 175 | print(i) 176 | } 177 | xFBDiff<-pmax(0,c(0,xFB[2:length(xFB)]-xFB[1:(length(xFB)-1)])) 178 | xFBDiff[xFBDiff==0]<-4500 179 | xFBDiff<-round(xFBDiff/50,0) 180 | 181 | trainAll<-cbind(xAll,plsAll) 182 | trainAll$xFBDiff<-xFBDiff 183 | 184 | ##count up the zones for spelling 185 | #spellCheck<-c(rep(65,340),rep(93,340),rep(88,340),rep(62,340),rep(60,340),rep(50,340),rep(79,340),rep(63,340),rep(56,340),rep(70,340), 186 | # rep(62,340),rep(88,340),rep(94,340),rep(73,340),rep(68,340),rep(73,340)) 187 | trainAll$spellCheck<-rep(0,nrow(trainAll)) #spellCheck 188 | delay<-rep(0,nrow(trainAll)) 189 | ## attempt to encode not the straight numbers from spellCheck, but a great/good/average/bad speller type encoding 190 | ## the intent is to scale to unseen data (test has some really bad ones) 191 | for(i in 1:15){delay[(241+(i-1)*340):(i*340)]<-trainAll$spellCheck[(241+(i-1)*340):(i*340)]*10000+round(delay[(242+(i-1)*340):(1+i*340)]/4,0)} 192 | delay[5341:5439]<-trainAll$spellCheck[5341:5439]*10000+round(delay[5342:5440]/4,0) ##for i = 16 193 | delay<-as.factor(as.character(delay)) 194 | 195 | x2<-as.data.frame(unlist(sapply(trainAll,function(x) as.numeric(as.character(x))))) 196 | x2$delay<-delay 197 | 198 | trainSess5<-c(0,1600) 199 | for(i in 1:16){trainSess5[(1+(i-1)*100):(i*100)]<-xFBDiff[(241+(i-1)*340):(i*340)]} 200 | for(i in 1:length(trainSess5)){if(i%%5==1) trainSess5[i]<-trainSess5[i]-20} 201 | 202 | xMagic<-rep(0,nrow(x2)) 203 | xSingleMagic<-rep(0,nrow(x2)) 204 | adjFbDiff<-x2$xFBDiff 205 | 206 | trainCuts<-c( 37, 40,40, 42,42,44,44, 45,46,47, 47,47,48,51,52, 52) 207 | for(i in 1:16){ 208 | m1<-createMagic(trainSess5[(1+(i-1)*100):(i*100)],trainCuts[i]) 209 | singleMagic<-pmax(5,round(sum(pmax(0,m1)/max(m1))/10,0)) 210 | xMagic[(241+(i-1)*340):(i*340)]<-m1 211 | xSingleMagic[(1+(i-1)*340):(i*340)]<-singleMagic 212 | adjFbDiff[(241+(i-1)*340):(i*340)]<-0 213 | } 214 | 215 | x2$xMagic<-as.factor(xMagic) 216 | x2$xSingleMagic<-xSingleMagic 217 | x2$xxAdjFBDiff<-adjFbDiff 218 | 219 | ## fix alignment problem 220 | x2$xMagic[1:5439]<-x2$xMagic[2:5440] 221 | 222 | 223 | ## add EOG/Cz differences 224 | maxLoop<-nrow(xAll) 
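## (hedged sketch, not part of the original run: the same per-row min/max scaling and
##  EOG-Cz differences built by the two loops below can be vectorized; this assumes xAll
##  holds the EOG-based features and x2 the Cz-based features in columns 12:190, as above,
##  and the helper/object names here are illustrative only)
# rowScale <- function(m) t(apply(m, 1, function(v) (v - min(v)) / (max(v) - min(v))))
# eogS <- rowScale(as.matrix(xAll[, 12:190]))
# czS  <- rowScale(as.matrix(x2[, 12:190]))
# scalars <- 1 - apply(eogS - czS, 1, max) / 100   ## per-row damping factor, as in the loop
# diff2Alt <- czS - eogS * scalars                 ## should match diff2 built below
# straightDiffsAlt <- eogS - czS                   ## should match straightDiffs built below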
225 | for(i in 1:maxLoop){ 226 | a<-as.numeric(xAll[i,12:190]) ##eog 227 | b<-as.numeric(x2[i,12:190]) ##cz 228 | a<-(a-min(a))/(max(a)-min(a)) ##eog 229 | b<-(b-min(b))/(max(b)-min(b)) ##cz 230 | diffs<-a-b ##eog-cz 231 | scalar<-1-(max(diffs)/100) 232 | if(i==1){diff2<-t(as.data.frame(b-(a*scalar)))} 233 | if(i>1){diff2<-rbind(diff2,t(as.data.frame(b-(a*scalar))))} 234 | } 235 | 236 | maxLoop<-nrow(xAll) 237 | for(i in 1:maxLoop){ 238 | a<-as.numeric(xAll[i,12:190]) ##eog 239 | b<-as.numeric(x2[i,12:190]) ##cz 240 | a<-(a-min(a))/(max(a)-min(a)) ##eog 241 | b<-(b-min(b))/(max(b)-min(b)) ##cz 242 | straightDiff<-a-b 243 | if(i==1){straightDiffs<-t(as.data.frame(straightDiff))} 244 | if(i>1){straightDiffs<-rbind(straightDiffs,t(as.data.frame(straightDiff)))} 245 | } 246 | 247 | 248 | ## add new features 249 | x4<-cbind(x2,diff2) 250 | 251 | diffDf<-as.data.frame(straightDiffs); colnames(diffDf)<-paste0("diff",colnames(diffDf)); rownames(diffDf)<-NULL 252 | x6<-cbind(x4,diffDf) 253 | 254 | 255 | ############################### 256 | ## Create Test Data Frame 257 | ############################### 258 | 259 | testPersonList<-c("S01","S03","S04","S05","S08","S09","S10","S15","S19","S25") 260 | sessionList<-c("01","02","03","04","05") 261 | depthList<-c(60,60,60,60,100) 262 | for(i in 1:length(testPersonList)){ 263 | for(j in 1:length(sessionList)){ 264 | if(i*j==1){xAllTest<-prepareFeatures(getOneSec(testPersonList[i],sessionList[j]),depthList[j])} 265 | if(i*j>1){xAllTest<-rbind(xAllTest,prepareFeatures(getOneSec(testPersonList[i],sessionList[j]),depthList[j]))} 266 | print(dim(xAllTest)) 267 | } 268 | } 269 | 270 | 271 | for(i in 1:length(testPersonList)){ 272 | for(j in 1:length(sessionList)){ 273 | fileBase<-paste0("pls/Data_",testPersonList[i],"_Sess",sessionList[j],"_pls.csv") 274 | if(i*j==1){plsAllTest<-read.csv(fileBase,header=FALSE)} 275 | if(i*j>1){plsAllTest<-rbind(plsAllTest,read.csv(fileBase,header=FALSE))} 276 | } 277 | } 278 | 279 | 280 | for(i in 1:length(testPersonList)){ 281 | for(j in 1:length(sessionList)){ 282 | if(i*j==1){xFBTest<-getFeedBacks(testPersonList[i],sessionList[j],depthList[j])} 283 | if(i*j>1){xFBTest<-c(xFBTest,getFeedBacks(testPersonList[i],sessionList[j],depthList[j]))} 284 | } 285 | print(i) 286 | } 287 | xTestFBDiff<-pmax(0,c(0,xFBTest[2:length(xFBTest)]-xFBTest[1:(length(xFBTest)-1)])) 288 | xTestFBDiff[xTestFBDiff==0]<-4500 289 | xTestFBDiff<-round(xTestFBDiff/50,0) 290 | 291 | testAll<-cbind(xAllTest,plsAllTest) 292 | testAll$xTestFBDiff<-xTestFBDiff 293 | 294 | ## isolate all session5's for analysis 295 | testSess5<-c(0,1000) 296 | for(i in 1:10){testSess5[(1+(i-1)*100):(i*100)]<-xTestFBDiff[(241+(i-1)*340):(i*340)]} 297 | for(i in 1:length(testSess5)){if(i%%5==1) testSess5[i]<-testSess5[i]-20} 298 | 299 | testAll$spellCheck<-0 300 | x3<-as.data.frame(unlist(sapply(testAll,function(x) as.numeric(as.character(x))))) 301 | x3$delay<-x2$delay[1:nrow(x3)] 302 | colnames(x3)<-colnames(x2)[1:ncol(x3)] 303 | 304 | testCuts<-c( 36, 38,39,39, 40,41,42, 44, 47, 51 ) 305 | xMagicTest<-rep(0,nrow(x3)) 306 | xSingleMagicTest<-rep(0,nrow(x3)) 307 | adjFbDiffTest<-x3$xTestFBDiff 308 | 309 | for(i in 1:10){ 310 | m1<-createMagic(testSess5[(1+(i-1)*100):(i*100)],testCuts[i]) 311 | singleMagic<-pmax(5,round(sum(pmax(0,m1)/max(m1))/10,0)) 312 | xMagicTest[(241+(i-1)*340):(i*340)]<-m1 313 | xSingleMagicTest[(1+(i-1)*340):(i*340)]<-singleMagic 314 | adjFbDiffTest[(241+(i-1)*340):(i*340)]<-0 315 | } 316 | 317 | x3$xMagic<-as.factor(xMagicTest) 318 | 
x3$xSingleMagic<-xSingleMagicTest 319 | x3$xxAdjFBDiff<-adjFbDiffTest 320 | 321 | ## fix alignment problem 322 | x3$xMagic[1:3399]<-x3$xMagic[2:3400] 323 | 324 | maxLoop<-nrow(x3) 325 | for(i in 1:maxLoop){ 326 | a<-as.numeric(x3[i,12:190]) ##eog 327 | b<-as.numeric(x3[i,12:190]) ##cz 328 | a<-(a-min(a))/(max(a)-min(a)) ##eog 329 | b<-(b-min(b))/(max(b)-min(b)) ##cz 330 | diffs<-a-b ##eog-cz 331 | scalar<-1-(max(diffs)/100) 332 | if(i==1){diff2Test<-t(as.data.frame(b-(a*scalar)))} 333 | if(i>1){diff2Test<-rbind(diff2Test,t(as.data.frame(b-(a*scalar))))} 334 | } 335 | 336 | maxLoop<-nrow(x3) 337 | for(i in 1:maxLoop){ 338 | a<-as.numeric(x3[i,12:190]) ##eog 339 | b<-as.numeric(x3[i,12:190]) ##cz 340 | a<-(a-min(a))/(max(a)-min(a)) ##eog 341 | b<-(b-min(b))/(max(b)-min(b)) ##cz 342 | straightDiff<-a-b 343 | if(i==1){straightDiffsTest<-t(as.data.frame(straightDiff))} 344 | if(i>1){straightDiffsTest<-rbind(straightDiffsTest,t(as.data.frame(straightDiff)))} 345 | } 346 | 347 | x5<-cbind(x3,diff2Test) 348 | diffDfTest<-as.data.frame(straightDiffsTest); colnames(diffDfTest)<-paste0("diff",colnames(diffDfTest)); rownames(diffDfTest)<-NULL 349 | x7<-cbind(x5,diffDfTest) 350 | 351 | 352 | ############################################# 353 | ## Test Training by running 354 | ## leave-one-out (LOO) validation 355 | ## but waiting to check AUC against 356 | ## the overall set of points, as 357 | ## that will be different than averaging 358 | ## each subject's AUC 359 | ## Most testing was done on smaller GBMs (100/0.1 vs 1000/0.01) 360 | ############################################# 361 | 362 | l<-read.csv("TrainLabels.csv") 363 | y<-l[,2] 364 | 365 | cols<-c(1:11,77:88,132,125:128,64:66,120,165,199:205,402:404) 366 | 367 | s1<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,1) 368 | s2<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,2) 369 | s3<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,3) 370 | s4<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,4) 371 | s5<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,5) 372 | s6<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,6) 373 | s7<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,7) 374 | s8<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,8) 375 | s9<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,9) 376 | s10<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,10) 377 | s11<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,11) 378 | s12<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,12) 379 | s13<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,13) 380 | s14<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,14) 381 | s15<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,15) 382 | s16<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,16) 383 | 384 | sAll1k<-c(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16) 385 | library(Metrics) 386 | auc(y,sAll1k) 387 | 388 | 389 | ################################## 390 | ## Generate submission 391 | ################################## 392 | fullFit<-fitGbm(x6[,cols],y,"model",1000,0.01,10,10,-1) 393 | summary(fullFit,plotit=FALSE)[1:30,] 394 | pFinal<-predict(fullFit,newdata=x7[,cols],n.trees=1000,type="response") 395 | plot(pFinal) 396 | z<-read.csv("SampleSubmission.csv") 397 | z[,2]<-pFinal 398 | 399 | ################################ 400 | ## Post Processing 401 | ## 402 | ## The encoding for subject accuracy 403 | ## is a conservative one, to ensure 404 | ## it would scale to new subjects. 405 | ## This model alone was good for 3rd 406 | ## place. 
But it jumped ~0.04 in AUC 407 | ## to bump up and down all points for 408 | ## a single subject, based on their 409 | ## rate of abnormal lenghts. 410 | 411 | z[(1+0*340):(1*340),2]<-z[(1+0*340):(1*340),2]-0 412 | z[(1+1*340):(2*340),2]<-z[(1+1*340):(2*340),2]-0.05 413 | z[(1+2*340):(3*340),2]<-z[(1+2*340):(3*340),2]-0 414 | z[(1+3*340):(4*340),2]<-z[(1+3*340):(4*340),2]-0.06 415 | z[(1+4*340):(5*340),2]<-z[(1+4*340):(5*340),2]-0 416 | z[(1+5*340):(6*340),2]<-z[(1+5*340):(6*340),2]+0.01 417 | z[(1+6*340):(7*340),2]<-z[(1+6*340):(7*340),2]+0.02 418 | z[(1+7*340):(8*340),2]<-z[(1+7*340):(8*340),2]+0.02 419 | z[(1+8*340):(9*340),2]<-z[(1+8*340):(9*340),2]-0 420 | z[(1+9*340):(10*340),2]<-z[(1+9*340):(10*340),2]-0.275 421 | write.csv(z,"bci_submission.csv",row.names=FALSE,quote=FALSE) 422 | 423 | 424 | ## Table of rate of normal lengths vs GBM mean 425 | ## 1 0.67 0.64530 426 | ## 2 0.41 0.64200 427 | ## 3 0.83 0.69560 428 | ## 4 0.31 0.65300 429 | ## 5 0.68 0.71900 430 | ## 6 0.79 0.66480 431 | ## 7 0.92 0.93950 432 | ## 8 0.91 0.94390 433 | ## 9 0.64 0.69590 434 | ## 10 0.54 0.65420 435 | -------------------------------------------------------------------------------- /bci_challenge/use_tf_kernel.m: -------------------------------------------------------------------------------- 1 | % The purpose of this script is as follows: 2 | % 1) Used for EEG Kaggle Competition 3 | % 2) Data is then processed into "feature" sets using the B-J 4 | % time-frequency kernel. 5 | % 3) Once the "feature" sets have been extracted, the "features" are 6 | % processed using a time-series model or decomposed into principal components. 7 | % 8 | % File Version History: 9 | % 01/27/2015 Original version written by Robert Chong 10 | % 02/11/2015 Starting to work with the team 11 | % 02/20/2015 adapting slightly to take in R's as.matrix .mat files 12 | 13 | % House keeping 14 | clear all; clc; close all 15 | 16 | 17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 18 | % Set paths that contain m-files 19 | %addpath(genpath('/Users/rchong/Documents/MATLAB/')) % Path adds the time-frequency toolboxes 20 | %addpath(genpath('/Users/rchong/Downloads/BCI_Challenge/')) % Path adds the current script and data 21 | addpath(genpath('/home/mark/Documents/bci')) 22 | %addpath(genpath('/home/mark/Documents/bci/tfdn')) 23 | %addpath(genpath('/home/mark/Documents/bci/NaN')) 24 | %addpath(genpath('/home/mark/Documents/bci/statistics')) 25 | 26 | %% loop through each person and session; to speed this up, I broke this list into four pieces and 27 | %% had separate sessions working on each piece; if you do that you need to adjust the personList and k length 28 | personList = ["S01","S02","S03","S04","S05","S06","S07","S08","S09","S10","S11","S12","S13","S14","S15","S16","S17","S18","S19","S20","S21","S22","S23","S24","S25","S26"]; 29 | sessionList= ["01","02","03","04","05"]; 30 | 31 | for k = 1:26 32 | for j = 1:5 33 | fileBase = strcat('train/Data_',personList((1+(k-1)*3):k*3),'_Sess',sessionList((1+(j-1)*2):j*2)); 34 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 35 | % Load dataset & Assign Variable names 36 | load(strcat(fileBase,'.mat')); 37 | %tsFile = csvread('/Users/rchong/Downloads/BCI_Challenge/train/Data_S02_Sess01.csv') 38 | 39 | % Find all the index where FeedBackEvent equals 1 40 | idx = find(x(1:length(x),59)); 41 | 42 | for i = 1:length(idx) 43 | idxPos = idx(i); 44 | data = x(idxPos:(idxPos+260),30); 45 | 
%30 is the column for Cz; eventually loop through the columns of choice, if time 46 | 47 | fSample = 200; 48 | windowSize = 1000; 49 | alpha = 0.27; 50 | 51 | %tfr_AF3 = rgk(data,alpha); 52 | tfr_AF3 = bintfd(data,windowSize,'Hamming','Analytic',0,'PosOnly','NoMex'); 53 | tfrRGK = tfr_AF3; 54 | 55 | % Time 56 | TAtimeMargRGK = t_margin(tfrRGK); % tfdn-toolbox -- not divided [m x n] 57 | TAtimeMargRGK = TAtimeMargRGK'; 58 | 59 | % Marginals 60 | % Frequency 61 | TAfreqMargRGK = f_margin(tfrRGK); % tfdn-toolbox -- not divided [m x n] 62 | 63 | % Run PLS Modeling here 64 | y = zscore(x(idxPos:idxPos+260,58)); 65 | X = zscore(tfrRGK'); 66 | [XL,yl,XS,YS,beta,PCTVAR] = plsregress(X,y,140); 67 | 68 | % save the top 200 PLS components; continually append in the loop for later writing to CSV 69 | if(i==1) 70 | outVec = rot90(beta(1:200)); 71 | else 72 | outVec = vertcat(outVec,rot90(beta(1:200))); 73 | endif 74 | end 75 | % write out the matrix of all PLS components for use later (in R, or anywhere) 76 | csvwrite(strcat(fileBase,'_pls.csv'),outVec); 77 | strcat('Done with: ',fileBase) 78 | end 79 | end 80 | -------------------------------------------------------------------------------- /loan_default_loss_basics.R: -------------------------------------------------------------------------------- 1 | ## Posting the basics of our loss model. 2 | ## R objects were often saved to speed the process up, so currently the pieces are being put in, 3 | ## without any cohesion to get the content in one place. 4 | 5 | gbmCv<-function(xTrain,yTrain,n=10,trees=500,shrink=0.05,depth=6,minobs=30,dist="laplace"){ 6 | require(gbm,Metrics) 7 | t<-seq(1:nrow(xTrain)) 8 | for(i in (1:n)){ 9 | xH<-xTrain[t%%n==(i-1),] 10 | xT<-xTrain[t%%n!=(i-1),] 11 | yH<-yTrain[t%%n==(i-1)] 12 | yT<-yTrain[t%%n!=(i-1)] 13 | GBM_model <- gbm.fit(x=xT,y=yT,distribution=dist,n.trees=trees,shrinkage=shrink,interaction.depth=depth,n.minobsinnode=minobs,verbose=FALSE) 14 | pT<-predict.gbm(object=GBM_model,newdata=xT,trees) 15 | pH<-predict.gbm(object=GBM_model,newdata=xH,trees) 16 | if(i==1){cvDf<-as.data.frame(cbind(i,mae(yT,pT),mae(yH,pH)))} 17 | else{cvDf<-rbind(cvDf,as.data.frame(cbind(i,mae(yT,pT),mae(yH,pH))))} 18 | } 19 | return(cvDf) 20 | } 21 | 22 | gbmDualCv<-function(xTrain,yTrain,n=10,trees=500,shrink=0.05,depth=6,minobs=30){ 23 | require(gbm,Metrics) 24 | t<-seq(1:nrow(final_x)) 25 | for(i in (1:n)){ 26 | xH<-final_x[t%%n==(i-1),] 27 | xT<-final_x[t%%n!=(i-1),] 28 | yH<-y[t%%n==(i-1)] 29 | yT<-y[t%%n!=(i-1)] 30 | GBM_laplace <- gbm.fit(x=xT,y=((yT)^0.5),distribution="laplace",n.trees=trees,shrinkage=0.05,interaction.depth=depth,n.minobsinnode=30,verbose=FALSE) 31 | GBM_gauss <- gbm.fit(x=xT,y=(yT^0.5),distribution="gaussian",n.trees=trees,shrinkage=0.05,interaction.depth=depth,n.minobsinnode=30,verbose=FALSE) 32 | pT<-predict.gbm(object=GBM_laplace,newdata=xT,trees) 33 | pH<-predict.gbm(object=GBM_laplace,newdata=xH,trees) 34 | pTg<-predict.gbm(object=GBM_gauss,newdata=xT,trees) 35 | pHg<-predict.gbm(object=GBM_gauss,newdata=xH,trees) 36 | 37 | if(i==1){cvDf<-as.data.frame(cbind(i, 38 | mae(yT,pmax(0,pT^2)), 39 | mae(yH,pmax(0,pH^2)), 40 | mae(yT,pmax(0,pTg^2)), 41 | mae(yH,pmax(0,pHg^2)), 42 | mae(yH,pmax(0,(pH^2+pHg^2)/2))))} 43 | else{cvDf<-rbind(cvDf,as.data.frame(cbind(i, 44 | mae(yT,pmax(0,pT^2)), 45 | mae(yH,pmax(0,pH^2)), 46 | mae(yT,pmax(0,pTg^2)), 47 | mae(yH,pmax(0,pHg^2)), 48 | mae(yH,pmax(0,(pH^2+pHg^2)/2))))} 49 | } 50 | return(cvDf) 51 | } 52 | 53 | ##Find initial features for GBM using those most highly 
correlated with loss
54 | ## computing the full correlation matrix is overkill just to find the variables correlated
55 | ## with the target, but much time was spent looking through the rest of the matrix,
56 | ## even though it ultimately went unused.
57 | topCorrelatedFeatures<-function(train,threshold=0.25){
58 |   require(Hmisc)
59 |   rc<-rcorr(as.matrix(train))
60 |   rcdf<-as.data.frame(rc$r)
61 |   r2<-as.data.frame(t(rcdf[1:2,]))
62 |   r3<-r2[abs(r2$loss)>threshold,]
63 |   attach(r3)
64 |   out<-r3[order(-loss),]
65 |   detach(r3)
66 |   return(out)
67 | }
68 | 
69 | ## Impute values for algorithms that require it, using caret's imputation method
70 | require(caret); require(Metrics)
71 | pp<-preProcess(final_x,method="medianImpute") ##very fast
72 | x2<-predict(pp,final_x)
73 | x3<-predict(pp,test[,2:ncol(test)])
74 | 
75 | 
76 | ## Main driver section
77 | dirs<-read.csv("project_locations.csv")
78 | setwd(as.character(dirs[1,2]))
79 | rm(dirs)
80 | 
81 | ## to do: replace this with code to create the data set
82 | load("final_x.Rdata")
83 | load("final_y.Rdata")
84 | load("test_final_with_id.Rdata") ##test
85 | 
86 | ## example CV test
87 | df<-gbmDualCv(final_x,y,10,1000,0.05,10,30); sapply(df,mean)
88 | 
89 | ## run final data (note: gbmFull is not defined in this excerpt)
90 | fp<-gbmFull(final_x,y,test,10,700,0.05,7,30,"laplace")
91 | write.table(fp,"gbm_20140227b.csv",append = FALSE, sep=',', row.names=FALSE, quote=TRUE, col.names=TRUE)
92 | 
--------------------------------------------------------------------------------
/modern_logistic_regression/README.md:
--------------------------------------------------------------------------------
1 | Modern Logistic Regression
2 | ======
3 | 
4 | This is a small collection of files that support the talk I gave to the Austin Machine Learning Meetup on 1/19/2015.
5 | 
6 | These scripts are taken directly from Kaggle forum posts (permitted by the amusing license) and provide a fantastic entry point for robust classification tasks.
7 | 
8 | * First version: easiest to start adapting
9 |   * link: http://www.kaggle.com/c/criteo-display-ad-challenge/forums/t/10322/beat-the-benchmark-with-less-then-200mb-of-memory
10 |   * techniques: online learning, logistic regression, hash trick, adaptive learning rate
11 | * Second version: same plus multiclass output handling
12 |   * link: http://www.kaggle.com/c/tradeshift-text-classification/forums/t/10537/beat-the-benchmark-with-less-than-400mb-of-memory
13 |   * techniques: online learning, logistic regression, hash trick, adaptive learning rate, multiclass output
14 | * Third version: FTRL-proximal
15 |   * link: http://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
16 |   * techniques: online learning, logistic regression, hash trick, FTRL-proximal (stochastic gradient descent, L1 & L2 regularization)
17 | 
--------------------------------------------------------------------------------
/modern_logistic_regression/fast_solution.py:
--------------------------------------------------------------------------------
1 | '''
2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
3 | Version 2, December 2004
4 | 
5 | Copyright (C) 2004 Sam Hocevar
6 | 
7 | Everyone is permitted to copy and distribute verbatim or modified
8 | copies of this license document, and changing it is allowed as long
9 | as the name is changed.
10 | 
11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
13 | 
14 | 0. You just DO WHAT THE FUCK YOU WANT TO.
15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from csv import DictReader 20 | from math import exp, log, sqrt 21 | 22 | 23 | # parameters ################################################################# 24 | 25 | train = 'train.csv' # path to training file 26 | test = 'test.csv' # path to testing file 27 | 28 | D = 2 ** 20 # number of weights use for learning 29 | alpha = .1 # learning rate for sgd optimization 30 | 31 | 32 | # function definitions ####################################################### 33 | 34 | # A. Bounded logloss 35 | # INPUT: 36 | # p: our prediction 37 | # y: real answer 38 | # OUTPUT 39 | # logarithmic loss of p given y 40 | def logloss(p, y): 41 | p = max(min(p, 1. - 10e-12), 10e-12) 42 | return -log(p) if y == 1. else -log(1. - p) 43 | 44 | 45 | # B. Apply hash trick of the original csv row 46 | # for simplicity, we treat both integer and categorical features as categorical 47 | # INPUT: 48 | # csv_row: a csv dictionary, ex: {'Lable': '1', 'I1': '357', 'I2': '', ...} 49 | # D: the max index that we can hash to 50 | # OUTPUT: 51 | # x: a list of indices that its value is 1 52 | def get_x(csv_row, D): 53 | x = [0] # 0 is the index of the bias term 54 | for key, value in csv_row.items(): 55 | index = int(value + key[1:], 16) % D # weakest hash ever ;) 56 | x.append(index) 57 | return x # x contains indices of features that have a value of 1 58 | 59 | 60 | # C. Get probability estimation on x 61 | # INPUT: 62 | # x: features 63 | # w: weights 64 | # OUTPUT: 65 | # probability of p(y = 1 | x; w) 66 | def get_p(x, w): 67 | wTx = 0. 68 | for i in x: # do wTx 69 | wTx += w[i] * 1. # w[i] * x[i], but if i in x we got x[i] = 1. 70 | return 1. / (1. + exp(-max(min(wTx, 20.), -20.))) # bounded sigmoid 71 | 72 | 73 | # D. Update given model 74 | # INPUT: 75 | # w: weights 76 | # n: a counter that counts the number of times we encounter a feature 77 | # this is used for adaptive learning rate 78 | # x: feature 79 | # p: prediction of our model 80 | # y: answer 81 | # OUTPUT: 82 | # w: updated model 83 | # n: updated count 84 | def update_w(w, n, x, p, y): 85 | for i in x: 86 | # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic 87 | # (p - y) * x[i] is the current gradient 88 | # note that in our case, if i in x then x[i] = 1 89 | w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.) 90 | n[i] += 1. 91 | 92 | return w, n 93 | 94 | 95 | # training and testing ####################################################### 96 | 97 | # initialize our model 98 | w = [0.] * D # weights 99 | n = [0.] * D # number of times we've encountered a feature 100 | 101 | # start training a logistic regression model using on pass sgd 102 | loss = 0. 103 | for t, row in enumerate(DictReader(open(train))): 104 | y = 1. if row['Label'] == '1' else 0. 
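    # (online learning: each row is processed exactly once, in stream order --
    #  hash the features, score with the current weights, then update them;
    #  only the w and n arrays are ever held in memory)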
105 | 106 | del row['Label'] # can't let the model peek the answer 107 | del row['Id'] # we don't need the Id 108 | 109 | # main training procedure 110 | # step 1, get the hashed features 111 | x = get_x(row, D) 112 | 113 | # step 2, get prediction 114 | p = get_p(x, w) 115 | 116 | # for progress validation, useless for learning our model 117 | loss += logloss(p, y) 118 | if t % 1000000 == 0 and t > 1: 119 | print('%s\tencountered: %d\tcurrent logloss: %f' % ( 120 | datetime.now(), t, loss/t)) 121 | 122 | # step 3, update model with answer 123 | w, n = update_w(w, n, x, p, y) 124 | 125 | # testing (build kaggle's submission file) 126 | with open('submission1234.csv', 'w') as submission: 127 | submission.write('Id,Predicted\n') 128 | for t, row in enumerate(DictReader(open(test))): 129 | Id = row['Id'] 130 | del row['Id'] 131 | x = get_x(row, D) 132 | p = get_p(x, w) 133 | submission.write('%s,%f\n' % (Id, p)) 134 | -------------------------------------------------------------------------------- /modern_logistic_regression/fast_solution_v2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from math import log, exp, sqrt 20 | 21 | 22 | # TL; DR 23 | # the main learning process start at line 122 24 | 25 | 26 | # parameters ################################################################# 27 | 28 | train = 'train.csv' # path to training file 29 | label = 'trainLabels.csv' # path to label file of training data 30 | test = 'test.csv' # path to testing file 31 | 32 | D = 2 ** 18 # number of weights use for each model, we have 32 of them 33 | alpha = .1 # learning rate for sgd optimization 34 | 35 | 36 | # function, generator definitions ############################################ 37 | 38 | # A. x, y generator 39 | # INPUT: 40 | # path: path to train.csv or test.csv 41 | # label_path: (optional) path to trainLabels.csv 42 | # YIELDS: 43 | # ID: id of the instance (can also acts as instance count) 44 | # x: a list of indices that its value is 1 45 | # y: (if label_path is present) label value of y1 to y33 46 | def data(path, label_path=None): 47 | for t, line in enumerate(open(path)): 48 | # initialize our generator 49 | if t == 0: 50 | # create a static x, 51 | # so we don't have to construct a new x for every instance 52 | x = [0] * 146 53 | if label_path: 54 | label = open(label_path) 55 | label.readline() # we don't need the headers 56 | continue 57 | # parse x 58 | for m, feat in enumerate(line.rstrip().split(',')): 59 | if m == 0: 60 | ID = int(feat) 61 | else: 62 | # one-hot encode everything with hash trick 63 | # categorical: one-hotted 64 | # boolean: ONE-HOTTED 65 | # numerical: ONE-HOTTED! 
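                # (hashing str(m) + '_' + feat gives each (column, raw value) pair
                #  its own index in [0, D); distinct pairs can collide within the
                #  2**18 buckets -- the usual memory/accuracy trade-off of the hash trick)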
66 | # note, the build in hash(), although fast is not stable, 67 | # i.e., same value won't always have the same hash 68 | # on different machines 69 | x[m] = abs(hash(str(m) + '_' + feat)) % D 70 | # parse y, if provided 71 | if label_path: 72 | # use float() to prevent future type casting, [1:] to ignore id 73 | y = [float(y) for y in label.readline().split(',')[1:]] 74 | yield (ID, x, y) if label_path else (ID, x) 75 | 76 | 77 | # B. Bounded logloss 78 | # INPUT: 79 | # p: our prediction 80 | # y: real answer 81 | # OUTPUT 82 | # bounded logarithmic loss of p given y 83 | def logloss(p, y): 84 | p = max(min(p, 1. - 10e-15), 10e-15) 85 | return -log(p) if y == 1. else -log(1. - p) 86 | 87 | 88 | # C. Get probability estimation on x 89 | # INPUT: 90 | # x: features 91 | # w: weights 92 | # OUTPUT: 93 | # probability of p(y = 1 | x; w) 94 | def predict(x, w): 95 | wTx = 0. 96 | for i in x: # do wTx 97 | wTx += w[i] * 1. # w[i] * x[i], but if i in x we got x[i] = 1. 98 | return 1. / (1. + exp(-max(min(wTx, 20.), -20.))) # bounded sigmoid 99 | 100 | 101 | # D. Update given model 102 | # INPUT: 103 | # alpha: learning rate 104 | # w: weights 105 | # n: sum of previous absolute gradients for a given feature 106 | # this is used for adaptive learning rate 107 | # x: feature, a list of indices 108 | # p: prediction of our model 109 | # y: answer 110 | # MODIFIES: 111 | # w: weights 112 | # n: sum of past absolute gradients 113 | def update(alpha, w, n, x, p, y): 114 | for i in x: 115 | # alpha / sqrt(n) is the adaptive learning rate 116 | # (p - y) * x[i] is the current gradient 117 | # note that in our case, if i in x then x[i] = 1. 118 | n[i] += abs(p - y) 119 | w[i] -= (p - y) * 1. * alpha / sqrt(n[i]) 120 | 121 | 122 | # training and testing ####################################################### 123 | start = datetime.now() 124 | 125 | # a list for range(0, 33) - 13, no need to learn y14 since it is always 0 126 | K = [k for k in range(33) if k != 13] 127 | 128 | # initialize our model, all 32 of them, again ignoring y14 129 | w = [[0.] * D if k != 13 else None for k in range(33)] 130 | n = [[0.] * D if k != 13 else None for k in range(33)] 131 | 132 | loss = 0. 133 | loss_y14 = log(1. 
- 10**-15) 134 | 135 | for ID, x, y in data(train, label): 136 | 137 | # get predictions and train on all labels 138 | for k in K: 139 | p = predict(x, w[k]) 140 | update(alpha, w[k], n[k], x, p, y[k]) 141 | loss += logloss(p, y[k]) # for progressive validation 142 | loss += loss_y14 # the loss of y14, logloss is never zero 143 | 144 | # print out progress, so that we know everything is working 145 | if ID % 100000 == 0: 146 | print('%s\tencountered: %d\tcurrent logloss: %f' % ( 147 | datetime.now(), ID, (loss/33.)/ID)) 148 | 149 | with open('./submission1234.csv', 'w') as outfile: 150 | outfile.write('id_label,pred\n') 151 | for ID, x in data(test): 152 | for k in K: 153 | p = predict(x, w[k]) 154 | outfile.write('%s_y%d,%s\n' % (ID, k+1, str(p))) 155 | if k == 12: 156 | outfile.write('%s_y14,0.0\n' % ID) 157 | 158 | print('Done, elapsed time: %s' % str(datetime.now() - start)) 159 | -------------------------------------------------------------------------------- /modern_logistic_regression/fast_solution_v3.py: -------------------------------------------------------------------------------- 1 | ''' 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from csv import DictReader 20 | from math import exp, log, sqrt 21 | 22 | 23 | # TL; DR, the main training process starts on line: 250, 24 | # you may want to start reading the code from there 25 | 26 | 27 | ############################################################################## 28 | # parameters ################################################################# 29 | ############################################################################## 30 | 31 | # A, paths 32 | train = 'train_rev2' # path to training file 33 | test = 'test_rev2' # path to testing file 34 | submission = 'submission1234.csv' # path of to be outputted submission file 35 | 36 | # B, model 37 | alpha = .1 # learning rate 38 | beta = 1. # smoothing parameter for adaptive learning rate 39 | L1 = 1. # L1 regularization, larger value means more regularized 40 | L2 = 1. 
# L2 regularization, larger value means more regularized 41 | 42 | # C, feature/hash trick 43 | D = 2 ** 20 # number of weights to use 44 | interaction = False # whether to enable poly2 feature interactions 45 | 46 | # D, training/validation 47 | epoch = 1 # learn training data for N passes 48 | holdafter = 9 # data after date N (exclusive) are used as validation 49 | holdout = None # use every N training instance for holdout validation 50 | 51 | 52 | ############################################################################## 53 | # class, function, generator definitions ##################################### 54 | ############################################################################## 55 | 56 | class ftrl_proximal(object): 57 | ''' Our main algorithm: Follow the regularized leader - proximal 58 | 59 | In short, 60 | this is an adaptive-learning-rate sparse logistic-regression with 61 | efficient L1-L2-regularization 62 | 63 | Reference: 64 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 65 | ''' 66 | 67 | def __init__(self, alpha, beta, L1, L2, D, interaction): 68 | # parameters 69 | self.alpha = alpha 70 | self.beta = beta 71 | self.L1 = L1 72 | self.L2 = L2 73 | 74 | # feature related parameters 75 | self.D = D 76 | self.interaction = interaction 77 | 78 | # model 79 | # n: squared sum of past gradients 80 | # z: weights 81 | # w: lazy weights 82 | self.n = [0.] * D 83 | self.z = [0.] * D 84 | self.w = {} 85 | 86 | def _indices(self, x): 87 | ''' A helper generator that yields the indices in x 88 | 89 | The purpose of this generator is to make the following 90 | code a bit cleaner when doing feature interaction. 91 | ''' 92 | 93 | # first yield index of the bias term 94 | yield 0 95 | 96 | # then yield the normal indices 97 | for index in x: 98 | yield index 99 | 100 | # now yield interactions (if applicable) 101 | if self.interaction: 102 | D = self.D 103 | L = len(x) 104 | 105 | x = sorted(x) 106 | for i in xrange(L): 107 | for j in xrange(i+1, L): 108 | # one-hot encode interactions with hash trick 109 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 110 | 111 | def predict(self, x): 112 | ''' Get probability estimation on x 113 | 114 | INPUT: 115 | x: features 116 | 117 | OUTPUT: 118 | probability of p(y = 1 | x; w) 119 | ''' 120 | 121 | # parameters 122 | alpha = self.alpha 123 | beta = self.beta 124 | L1 = self.L1 125 | L2 = self.L2 126 | 127 | # model 128 | n = self.n 129 | z = self.z 130 | w = {} 131 | 132 | # wTx is the inner product of w and x 133 | wTx = 0. 134 | for i in self._indices(x): 135 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 136 | 137 | # build w on the fly using z and n, hence the name - lazy weights 138 | # we are doing this at prediction instead of update time is because 139 | # this allows us for not storing the complete w 140 | if sign * z[i] <= L1: 141 | # w[i] vanishes due to L1 regularization 142 | w[i] = 0. 143 | else: 144 | # apply prediction time L1, L2 regularization to z and get w 145 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 146 | 147 | wTx += w[i] 148 | 149 | # cache the current w for update stage 150 | self.w = w 151 | 152 | # bounded sigmoid function, this is the probability estimation 153 | return 1. / (1. 
+ exp(-max(min(wTx, 35.), -35.))) 154 | 155 | def update(self, x, p, y): 156 | ''' Update model using x, p, y 157 | 158 | INPUT: 159 | x: feature, a list of indices 160 | p: click probability prediction of our model 161 | y: answer 162 | 163 | MODIFIES: 164 | self.n: increase by squared gradient 165 | self.z: weights 166 | ''' 167 | 168 | # parameter 169 | alpha = self.alpha 170 | 171 | # model 172 | n = self.n 173 | z = self.z 174 | w = self.w 175 | 176 | # gradient under logloss 177 | g = p - y 178 | 179 | # update z and n 180 | for i in self._indices(x): 181 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 182 | z[i] += g - sigma * w[i] 183 | n[i] += g * g 184 | 185 | 186 | def logloss(p, y): 187 | ''' FUNCTION: Bounded logloss 188 | 189 | INPUT: 190 | p: our prediction 191 | y: real answer 192 | 193 | OUTPUT: 194 | logarithmic loss of p given y 195 | ''' 196 | 197 | p = max(min(p, 1. - 10e-15), 10e-15) 198 | return -log(p) if y == 1. else -log(1. - p) 199 | 200 | 201 | def data(path, D): 202 | ''' GENERATOR: Apply hash-trick to the original csv row 203 | and for simplicity, we one-hot-encode everything 204 | 205 | INPUT: 206 | path: path to training or testing file 207 | D: the max index that we can hash to 208 | 209 | YIELDS: 210 | ID: id of the instance, mainly useless 211 | x: a list of hashed and one-hot-encoded 'indices' 212 | we only need the index since all values are either 0 or 1 213 | y: y = 1 if we have a click, else we have y = 0 214 | ''' 215 | 216 | for t, row in enumerate(DictReader(open(path))): 217 | # process id 218 | ID = row['id'] 219 | del row['id'] 220 | 221 | # process clicks 222 | y = 0. 223 | if 'click' in row: 224 | if row['click'] == '1': 225 | y = 1. 226 | del row['click'] 227 | 228 | # extract date 229 | date = int(row['hour'][4:6]) 230 | 231 | # turn hour really into hour, it was originally YYMMDDHH 232 | row['hour'] = row['hour'][6:] 233 | 234 | # build x 235 | x = [] 236 | for key in row: 237 | value = row[key] 238 | 239 | # one-hot encode everything with hash trick 240 | index = abs(hash(key + '_' + value)) % D 241 | x.append(index) 242 | 243 | yield t, date, ID, x, y 244 | 245 | 246 | ############################################################################## 247 | # start training ############################################################# 248 | ############################################################################## 249 | 250 | start = datetime.now() 251 | 252 | # initialize ourselves a learner 253 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 254 | 255 | # start training 256 | for e in xrange(epoch): 257 | loss = 0. 
258 | count = 0 259 | 260 | for t, date, ID, x, y in data(train, D): # data is a generator 261 | # t: just a instance counter 262 | # date: you know what this is 263 | # ID: id provided in original data 264 | # x: features 265 | # y: label (click) 266 | 267 | # step 1, get prediction from learner 268 | p = learner.predict(x) 269 | 270 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 271 | # step 2-1, calculate validation loss 272 | # we do not train with the validation data so that our 273 | # validation loss is an accurate estimation 274 | # 275 | # holdafter: train instances from day 1 to day N 276 | # validate with instances from day N + 1 and after 277 | # 278 | # holdout: validate with every N instance, train with others 279 | loss += logloss(p, y) 280 | count += 1 281 | else: 282 | # step 2-2, update learner with label (click) information 283 | learner.update(x, p, y) 284 | 285 | print('Epoch %d finished, validation logloss: %f, elapsed time: %s' % ( 286 | e, loss/count, str(datetime.now() - start))) 287 | 288 | 289 | ############################################################################## 290 | # start testing, and build Kaggle's submission file ########################## 291 | ############################################################################## 292 | 293 | with open(submission, 'w') as outfile: 294 | outfile.write('id,click\n') 295 | for t, date, ID, x, y in data(test, D): 296 | p = learner.predict(x) 297 | outfile.write('%s,%s\n' % (ID, str(p))) 298 | -------------------------------------------------------------------------------- /rain/README.md: -------------------------------------------------------------------------------- 1 | Code for Kaggle Will it Rain competition 2 | -------------------------------------------------------------------------------- /rain/prepareTrain.R: -------------------------------------------------------------------------------- 1 | ############################## 2 | ### Large script for moving through aggregations by applying a similar methodology to each 3 | ### Appears as a large script because that's how it really was worked on; each pattern was 4 | ### repeated on new features 5 | ### 6 | ### Makes a lot of assumptions about having data in place. Working to streamline this all 7 | ### to look like a production process. But getting "something" out now. 
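### (sketch of the assumed inputs) the train.* list-columns referenced below are
### expected to exist already, one element per hourly record, parsed from the raw
### space-delimited text columns using the same pattern as the commented-out lines
### further down, e.g.:
###   train.RadarQualityIndex<-lapply(as.character(train[,RadarQualityIndex]),
###                                   function(x) as.numeric(strsplit(x," ")[[1]]))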
8 | ############################## 9 | 10 | source("rain_functions.R") 11 | 12 | q<-lapply(train.RadarQualityIndex, function(x) as.numeric(removeInvalids(x))) 13 | 14 | HybridScan<-lapply(train.HybridScan, function(x) as.numeric(removeInvalids(x))) 15 | wtHybridScan<-mapply(function(x,y) getAggregates(x,"weightedMean",y),x=HybridScan,y=q) 16 | wtHybridScan[is.na(wtHybridScan)]<-NA 17 | meanHybridScan<-mapply(function(x,y) getAggregates(x,"mean",y),x=HybridScan,y=q) 18 | meanHybridScan[is.na(meanHybridScan)]<-NA 19 | wtSquaredHybridScan<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=HybridScan,y=q) 20 | wtSquaredHybridScan[is.na(wtSquaredHybridScan)]<-NA 21 | rm(HybridScan) 22 | 23 | 24 | minDistanceToRadar<-unlist(lapply(train.DistanceToRadar,min)) 25 | 26 | Reflectivity<-lapply(train.Reflectivity, function(x) as.numeric(removeInvalids(x))) 27 | wtReflectivity<-mapply(function(x,y) getAggregates(x,"weightedMean",y),x=Reflectivity,y=q) 28 | wtReflectivity[is.na(wtReflectivity)]<-NA 29 | meanReflectivity<-mapply(function(x,y) getAggregates(x,"mean",y),x=Reflectivity,y=q) 30 | meanReflectivity[is.na(meanReflectivity)]<-NA 31 | rm(Reflectivity); gc() 32 | 33 | ReflectivityQC<-lapply(train.ReflectivityQC, function(x) as.numeric(removeInvalids(x))) 34 | wtReflectivityQC<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=ReflectivityQC,y=q) 35 | wtReflectivityQC[is.na(wtReflectivityQC)]<-NA 36 | meanReflectivityQC<-mapply(function(x,y) getAggregates(x,"mean",y),x=ReflectivityQC,y=q) 37 | meanReflectivityQC[is.na(meanReflectivityQC)]<-NA 38 | rm(ReflectivityQC); gc() 39 | 40 | Zdr<-lapply(train.Zdr, function(x) as.numeric(removeInvalids(x))) 41 | wtZdr<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=Zdr,y=q) 42 | wtZdr[is.na(wtZdr)]<-NA 43 | meanZdr<-mapply(function(x,y) getAggregates(x,"mean",y),x=Zdr,y=q) 44 | meanZdr[is.na(meanZdr)]<-NA 45 | rm(Zdr); gc() 46 | 47 | LogWaterVolume<-lapply(train.LogWaterVolume, function(x) as.numeric(removeInvalids(x))) 48 | wtLogWaterVolume<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=LogWaterVolume,y=q) 49 | wtLogWaterVolume[is.na(wtLogWaterVolume)]<-NA 50 | meanLogWaterVolume<-mapply(function(x,y) getAggregates(x,"mean",y),x=LogWaterVolume,y=q) 51 | meanLogWaterVolume[is.na(meanLogWaterVolume)]<-NA 52 | rm(LogWaterVolume); gc() 53 | 54 | Composite<-lapply(train.Composite, function(x) as.numeric(removeInvalids(x))) 55 | wtComposite<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=Composite,y=q) 56 | wtComposite[is.na(wtComposite)]<-NA 57 | meanComposite<-mapply(function(x,y) getAggregates(x,"mean",y),x=Composite,y=q) 58 | meanComposite[is.na(meanComposite)]<-NA 59 | rm(Composite); gc() 60 | 61 | RR1<-lapply(train.RR1, function(x) as.numeric(removeInvalids(x))) 62 | wtRR1<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=RR1,y=q) 63 | wtRR1[is.na(wtRR1)]<-NA 64 | meanRR1<-mapply(function(x,y) getAggregates(x,"mean",y),x=RR1,y=q) 65 | meanRR1[is.na(meanRR1)]<-NA 66 | rm(RR1); gc() 67 | 68 | RR2<-lapply(train.RR2, function(x) as.numeric(removeInvalids(x))) 69 | wtRR2<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=RR2,y=q) 70 | wtRR2[is.na(wtRR2)]<-NA 71 | meanRR2<-mapply(function(x,y) getAggregates(x,"mean",y),x=RR2,y=q) 72 | meanRR2[is.na(meanRR2)]<-NA 73 | rm(RR2); gc() 74 | 75 | RR3<-lapply(train.RR3, function(x) as.numeric(removeInvalids(x))) 76 | wtRR3<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=RR3,y=q) 77 | wtRR3[is.na(wtRR3)]<-NA 78 | 
meanRR3<-mapply(function(x,y) getAggregates(x,"mean",y),x=RR3,y=q) 79 | meanRR3[is.na(meanRR3)]<-NA 80 | rm(RR3); gc() 81 | 82 | RhoHV<-lapply(train.RhoHV, function(x) as.numeric(removeInvalids(x))) 83 | wtRhoHV<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=RhoHV,y=q) 84 | wtRhoHV[is.na(wtRhoHV)]<-NA 85 | meanRhoHV<-mapply(function(x,y) getAggregates(x,"mean",y),x=RhoHV,y=q) 86 | meanRhoHV[is.na(meanRhoHV)]<-NA 87 | rm(RhoHV); gc() 88 | 89 | Velocity<-lapply(train.Velocity, function(x) as.numeric(removeInvalids(x))) 90 | wtVelocity<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=Velocity,y=q) 91 | wtVelocity[is.na(wtVelocity)]<-NA 92 | meanVelocity<-mapply(function(x,y) getAggregates(x,"mean",y),x=Velocity,y=q) 93 | meanVelocity[is.na(meanVelocity)]<-NA 94 | rm(Velocity); gc() 95 | 96 | modRain<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) %in% c(1,2),1,0)))) 97 | heavyRain<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x)==3,1,0)))) 98 | rainHail<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x)==4,1,0)))) 99 | bigDrops<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x)==5,1,0)))) 100 | snow<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) %in% c(10,11),1,0)))) 101 | graupel<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) %in% c(13,14),1,0)))) 102 | validReadings<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) >0,1,0)))) 103 | allPrecipitation<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) %in% c(1,2,3,4,5,10,11,12,13,14),1,0)))) 104 | 105 | #train.MassWeightedMean<-lapply(as.character(train[,MassWeightedMean]),function(x) as.numeric(strsplit(x," ")[[1]])); train[,MassWeightedMean:=NULL]; gc() 106 | #train.MassWeightedSD<-lapply(as.character(train[,MassWeightedSD]),function(x) as.numeric(strsplit(x," ")[[1]])); train[,MassWeightedSD:=NULL]; gc() 107 | #train.HydrometeorType<-lapply(as.character(train[,HydrometeorType]),function(x) as.numeric(strsplit(x," ")[[1]])); train[,HydrometeorType:=NULL]; gc() 108 | 109 | maxRadarQualityIndex<-mapply(function(x,y) getAggregates(x,"max",y),x=q,y=q) 110 | maxRadarQualityIndex[is.na(maxRadarQualityIndex)]<-NA 111 | avgRadarQualityIndex<-mapply(function(x,y) getAggregates(x,"mean",y),x=q,y=q) 112 | avgRadarQualityIndex[is.na(avgRadarQualityIndex)]<-NA 113 | maxTimeToEnd<-mapply(function(x,y) getAggregates(x,"max",y),x=train.TimeToEnd,y=q) 114 | maxTimeToEnd[is.na(maxTimeToEnd)]<-NA 115 | minTimeToEnd<-mapply(function(x,y) getAggregates(x,"min",y),x=train.TimeToEnd,y=q) 116 | minTimeToEnd[is.na(minTimeToEnd)]<-NA 117 | diffTimeToEnd<-maxTimeToEnd-minTimeToEnd 118 | diffTimeToEnd[is.na(diffTimeToEnd)]<-NA 119 | 120 | t5<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>49 & x<61,1,0)))) 121 | t4<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>39 & x<51,1,0)))) 122 | t3<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>29 & x<41,1,0)))) 123 | t2<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>19 & x<31,1,0)))) 124 | t1<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>9 & x<21,1,0)))) 125 | t0<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x<11,1,0)))) 126 | time10s<-t5+t4+t3+t2+t1+t0 127 | 128 | load("RdataFiles/train.Reflectivity.Rdata") 129 | load("RdataFiles/train.TimeToEnd.Rdata") 130 | Ref<-lapply(train.Reflectivity, function(x) as.numeric(removeInvalids(x))) 131 | 
r5<-unlist(mapply(function(x,y) max(ifelse(x>49 & x<61,y,0)),x=train.TimeToEnd,y=Ref)) 132 | r4<-unlist(mapply(function(x,y) max(ifelse(x>39 & x<50,y,0)),x=train.TimeToEnd,y=Ref)) 133 | r3<-unlist(mapply(function(x,y) max(ifelse(x>29 & x<40,y,0)),x=train.TimeToEnd,y=Ref)) 134 | r2<-unlist(mapply(function(x,y) max(ifelse(x>19 & x<30,y,0)),x=train.TimeToEnd,y=Ref)) 135 | r1<-unlist(mapply(function(x,y) max(ifelse(x>9 & x<20,y,0)),x=train.TimeToEnd,y=Ref)) 136 | r0<-unlist(mapply(function(x,y) max(ifelse(x<10,y,0)),x=train.TimeToEnd,y=Ref)) 137 | refMax10s<-r5+r4+r3+r2+r1+r0 138 | refMaxSql10s<-r5^2+r4^2+r3^2+r2^2+r1^2+r0^2 139 | save(refMax10s,file="RdataFiles/refMax10s.Rdata") 140 | save(refMaxSql10s,file="RdataFiles/refMaxSql10s.Rdata") 141 | rm(r5,r4,r3,r2,r1,r0,Ref,train.Reflectivity) 142 | 143 | 144 | load("RdataFiles/train.RhoHV.Rdata") 145 | Rho<-train.RhoHV 146 | Rho5<-unlist(mapply(function(x,y) max(ifelse(x>49 & x<61,y,0)),x=train.TimeToEnd,y=Rho)) 147 | Rho4<-unlist(mapply(function(x,y) max(ifelse(x>39 & x<50,y,0)),x=train.TimeToEnd,y=Rho)) 148 | Rho3<-unlist(mapply(function(x,y) max(ifelse(x>29 & x<40,y,0)),x=train.TimeToEnd,y=Rho)) 149 | Rho2<-unlist(mapply(function(x,y) max(ifelse(x>19 & x<30,y,0)),x=train.TimeToEnd,y=Rho)) 150 | Rho1<-unlist(mapply(function(x,y) max(ifelse(x>9 & x<20,y,0)),x=train.TimeToEnd,y=Rho)) 151 | Rho0<-unlist(mapply(function(x,y) max(ifelse(x<10,y,0)),x=train.TimeToEnd,y=Rho)) 152 | RhoMax10s<-Rho5+Rho4+Rho3+Rho2+Rho1+Rho0 153 | RhoMaxSq10s<-Rho5^2+Rho4^2+Rho3^2+Rho2^2+Rho1^2+Rho0^2 154 | save(RhoMax10s,file="RdataFiles/RhoMax10s.Rdata") 155 | save(RhoMaxSq10s,file="RdataFiles/RhoMaxSq10s.Rdata") 156 | rm(Rho5,Rho4,Rho3,Rho2,Rho1,Rho0,Rho,train.RhoHV) 157 | 158 | 159 | load("RdataFiles/train.Reflectivity.Rdata") 160 | load("RdataFiles/train.DistanceToRadar.Rdata") 161 | Sys.time(); vec<-mapply(function(x,y) getBestRadarScalar(cbind(x,y)),x=train.DistanceToRadar,y=train.Reflectivity); Sys.time() ## 14 minutes 162 | Sys.time(); siteRef<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=train.Reflectivity,y=vec); Sys.time() ## 3 minutes 163 | 164 | yBinary<-ifelse(Expected<=0,1,0) 165 | colnames(yBinary)<-"yBinary" 166 | ##for H2O; for R, remove first two columns 167 | x<-as.data.frame(cbind(Expected,yBinary,wtHybridScan,meanHybridScan,wtSquaredHybridScan,minDistanceToRadar,wtReflectivity,meanReflectivity, 168 | wtReflectivityQC,meanReflectivityQC,meanLogWaterVolume,wtLogWaterVolume,meanZdr,wtZdr, 169 | wtComposite,meanComposite,wtRR1,meanRR1,wtRR2,meanRR2,wtRR3,meanRR3,wtRhoHV,meanRhoHV,wtVelocity,meanVelocity, 170 | modRain,heavyRain,rainHail,bigDrops,snow,graupel,validReadings,allPrecipitation, 171 | avgRadarQualityIndex,maxRadarQualityIndex,maxTimeToEnd,minTimeToEnd,diffTimeToEnd,time10s 172 | ,refMax10s,refMaxSql10s,RhoMax10s,RhoMaxSq10s,siteRef 173 | )) 174 | x1<-x[1:(nrow(x)*0.9),] 175 | x2<-x[(1+nrow(x1)):nrow(x),] 176 | 177 | #if testing, this makes things easier: 178 | x0<-x2 179 | -------------------------------------------------------------------------------- /rain/rain_functions.R: -------------------------------------------------------------------------------- 1 | trimSpaces <- function(x) return(gsub("^ *|(?<= ) | *$", "", x, perl=T)) 2 | 3 | removeInvalids<-function(x,xNew="") return(gsub("999",xNew, 4 | gsub("-99901",xNew, 5 | gsub("-99902",xNew, 6 | gsub("-99903",xNew, 7 | gsub("-99900",xNew, 8 | gsub("999.0",xNew, 9 | gsub("-99901.0",xNew, 10 | gsub("-99902.0",xNew, 11 | gsub("-99903.0",xNew, 12 | gsub("-99900.0",xNew, 13 | 
gsub("nan",xNew, 14 | x)))))))))))) 15 | 16 | getAggregates<-function(x,type="weightedMean",wts=NA,exponent=1){ 17 | ## pass in a vector of values and vector of weights 18 | ## get back a single number, according to the specified type 19 | ## type: mean, median, sd, max, min, meanSquared, sdSquared 20 | ## handles NAs, including fields with all NAs (returns NAs) 21 | 22 | if(type=="min") return(min(x^exponent,na.rm=TRUE)) 23 | if(type=="max") return(max(x^exponent,na.rm=TRUE)) 24 | if(type=="median") return(median(x^exponent,na.rm=TRUE)) 25 | if(type=="sd") return(sd(x^exponent,na.rm=TRUE)) 26 | if(type=="mean") return(mean(x^exponent,na.rm=TRUE)) 27 | 28 | if(length(x)==sum(is.na(wts))) return(NA) 29 | wts[is.na(wts)]<-0 30 | x[is.na(x)]<-0 31 | if(type=="weightedMean") return(sum(x^exponent*wts)/sum(wts)) 32 | 33 | return(NA) 34 | } 35 | 36 | getGlmByMM<-function(x1,x2,x3,raw1,raw2,threshold){ 37 | library(glmnet) 38 | library(Metrics) 39 | library(Matrix) 40 | y1<-ifelse(raw1<=threshold,1,0) 41 | y2<-ifelse(raw2<=threshold,1,0) 42 | x1[is.na(x1)] <- 0 43 | x2[is.na(x2)] <- 0 44 | x3[is.na(x3)] <- 0 45 | g<-glmnet(as.matrix(x1),as.factor(y1),family=c("binomial")) 46 | p2<-pmin(1.0,pmax(0.0,predict(g,as.matrix(x2),s=g$lambda[length(g$lambda)],type="response"))) 47 | p3<-pmin(1.0,pmax(0.0,predict(g,as.matrix(x3),s=g$lambda[length(g$lambda)],type="response"))) 48 | print(mse(y2,p2)) 49 | return(p3) 50 | } 51 | 52 | ## pretty bulky this way, but it's very simple to get things up and running. 53 | getGbmByMM<-function(x1,x2,x3,raw1,raw2,threshold,trees=100,depth=5,minObs=30,shrink=0.1){ 54 | ## x1: x values for the full training set 55 | ## x2: x values for the holdout set (for which we have answers) 56 | ## x3: x values for the prediction set (generally that for which we do not have answers) 57 | ## raw1: target for full training: rain, in mm; called raw because this will be converted to binary 58 | ## raw2: target for the holdout set: also rain, in mm 59 | ## threshold: which rain "bucket" we are predicted: 0 = p(rain<=0mm); 1 = p(rain<=1mm); etc. 60 | ## trees, depth, minObs,shrink: passthrough GBM parameters 61 | ## 62 | ## returns: prediction vector, bound between 0.0 and 1.0 63 | ## 64 | ## additional feedback: prints the holdout score, mse using raw2 and predictions given x2 65 | 66 | library(gbm) 67 | library(Metrics) ##can easily remove this dependency by using the simple MSE equation 68 | y1<-ifelse(raw1<=threshold,1,0) 69 | y2<-ifelse(raw2<=threshold,1,0) 70 | 71 | G<-gbm.fit(x1,y1,distribution="bernoulli",n.trees=trees, interaction.depth=depth,n.minobsinnode=minObs,shrinkage=shrink) 72 | p2<-pmin(1.0,pmax(0.0,predict(G,x2,n.trees=trees,type="response"))) 73 | p3<-pmin(1.0,pmax(0.0,predict(G,x3,n.trees=trees,type="response"))) 74 | print(summary(G,plotit=FALSE)) 75 | print(mse(y2,p2)) 76 | return(p3) 77 | } 78 | 79 | getH2OGbmByMM<-function(h2oInst,x1,x2,x3,xCols,threshold,trees=100,depth=5,minObs=20,shrink=0.1,printImportance=0,printGbm=TRUE){ 80 | ## x1: parsed H2O object for the full training set 81 | ## x2: parsed H2O object for the holdout set (for which we have answers) 82 | ## x3: parsed H2O object for the prediction set (generally that for which we do not have answers) 83 | ## threshold: which rain "bucket" we are predicted: 0 = p(rain<=0mm); 1 = p(rain<=1mm); etc. 
84 | ## trees, depth,shrink: passthrough GBM parameters 85 | ## 86 | ## returns: prediction vector, bound between 0.0 and 1.0 87 | ## 88 | ## additional feedback: prints the holdout score, mse using raw2 and predictions given x2 & importance (if desired) 89 | 90 | library(Metrics) ##can easily remove this dependency by using the simple MSE equation 91 | 92 | ## convert the yBinary column into the target, specific to the threshold value passed in 93 | train$yBinary<-ifelse(train$Expected<=threshold,1,0) 94 | hold$yBinary<-ifelse(hold$Expected<=threshold,1,0) 95 | 96 | ## train a gbm model, then use it to predict the passed in holdout (x2) and prediction sets (x3) 97 | my.gbm <- h2o.gbm(x=xCols,y="yBinary",distribution="bernoulli",data=x1,key="removeWhenDone.hex",n.trees=trees,interaction.depth=depth,shrinkage=shrink,n.minobsinnode=minObs,importance=T) 98 | holdVals <- h2o.predict(my.gbm,x2) ## make predictions 99 | returnObj<-h2o.predict(my.gbm,x3) 100 | returnVals<-returnObj[,3] 101 | 102 | ## print mse and importance values if requested 103 | print(mse(hold$yBinary,holdVals[,3])) 104 | if(printImportance<0){print(my.gbm@model$varimp)} 105 | if(printImportance>0){print(my.gbm@model$varimp[printImportance,])} 106 | 107 | ## clean up unneeded cache objects 108 | h2o.rm(h2oInst,"removeWhenDone.hex") 109 | return(returnVals) 110 | } 111 | 112 | getH2ODeepLearningByMM<-function(h2oInst,x1,x2,x3,xCols,threshold, 113 | hidden=c(200,200),epochs=1,activation="RectifierWithDropout", classification=TRUE, ...){ 114 | ## x1: parsed H2O object for the full training set 115 | ## x2: parsed H2O object for the holdout set (for which we have answers) 116 | ## x3: parsed H2O object for the prediction set (generally that for which we do not have answers) 117 | ## threshold: which rain "bucket" we are predicted: 0 = p(rain<=0mm); 1 = p(rain<=1mm); etc. 118 | ## remaining: passthrough to h2o.deeplearning 119 | ## 120 | ## returns: prediction vector, bound between 0.0 and 1.0 121 | ## 122 | ## additional feedback: prints the holdout score, mse using raw2 and predictions given x2 & importance (if desired) 123 | 124 | library(Metrics) ##can easily remove this dependency by using the simple MSE equation 125 | slotNum<-ifelse(classification==TRUE,3,1) 126 | 127 | ## convert the yBinary column into the target, specific to the threshold value passed in 128 | train$yBinary<-ifelse(train$Expected<=threshold,1,0) 129 | hold$yBinary<-ifelse(hold$Expected<=threshold,1,0) 130 | 131 | ## train a gbm model, then use it to predict the passed in holdout (x2) and prediction sets (x3) 132 | my.deepL <- h2o.deeplearning(x=xCols,y="yBinary",data=x1,key="removeWhenDone.hex", 133 | activation=activation,hidden=hidden,epochs=epochs, classification=classification, ...) 
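## the two h2o.predict calls below assume the classification output carries the
## positive-class probability in column 3 (hence slotNum=3 above) and the plain
## regression prediction in column 1 (slotNum=1)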
134 | holdVals <- h2o.predict(my.deepL,x2) ## make predictions 135 | returnObj<-h2o.predict(my.deepL,x3) 136 | returnVals<-returnObj[,slotNum] 137 | 138 | ## print mse and importance values if requested 139 | print(mse(hold$yBinary,holdVals[,slotNum])) 140 | 141 | #if(printImportance<0){print(my.gbm@model$varimp)} 142 | #if(printImportance>0){print(my.gbm@model$varimp[printImportance,])} 143 | 144 | ## clean up unneeded cache objects 145 | h2o.rm(h2oInst,"removeWhenDone.hex") 146 | return(returnVals) 147 | } 148 | 149 | clearh2o<-function(h2oInst){ 150 | h2o.rm(h2oInst, grep(pattern = "GBMPredict", x = h2o.ls(h2oInst)$Key, value = TRUE)) 151 | h2o.rm(h2oInst, grep(pattern = "Last.value", x = h2o.ls(h2oInst)$Key, value = TRUE)) 152 | } 153 | 154 | ### Utility functions to help look at the data 155 | roundTexttoText<-function(x,decimals=1){ 156 | if(as.character(x) %in% c("-",""," ","NaN","nan", "-99901","-99902","-99903","999","-99901.0","-99902.0","-99903.0","-99900","-99900.0")){return("")} 157 | else{return(round(as.numeric(as.character(x)),decimals))} 158 | } 159 | 160 | getDfFromRaw<-function(i,train){ 161 | ### This will show the raw data for a particular hourly reading in an easy to read format. 162 | ### It merges the vectors, discards garbage readings, and formats the output to try and make it concise 163 | ### Currently it requires the vectors to exist, but it would be easy to make this work from the raw data 164 | ### 165 | ### Usage: getDfFromRaw(8,train), which will show you the 13 radar readings for the 8th record 166 | ### it can take a second or two to go through all the parsing 167 | df<-as.data.frame(cbind(rep(train[i,Expected],length(train$TimeToEnd[[i]])), 168 | unlist(lapply(lapply(as.character(train$TimeToEnd),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 169 | unlist(lapply(lapply(as.character(train$DistanceToRadar),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 170 | unlist(lapply(lapply(as.character(train$RadarQualityIndex),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 171 | unlist(lapply(lapply(as.character(train$HydrometeorType),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 172 | unlist(lapply(lapply(as.character(train$HybridScan),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 173 | unlist(lapply(lapply(as.character(train$Reflectivity),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 174 | unlist(lapply(lapply(as.character(train$ReflectivityQC),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 175 | unlist(lapply(lapply(as.character(train$RR1),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 176 | unlist(lapply(lapply(as.character(train$RR2),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 177 | unlist(lapply(lapply(as.character(train$RR3),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 178 | unlist(lapply(lapply(as.character(train$Zdr),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 179 | unlist(lapply(lapply(as.character(train$MassWeightedMean),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 180 | unlist(lapply(lapply(as.character(train$MassWeightedSD),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) 
roundTexttoText(x))), 181 | unlist(lapply(lapply(as.character(train$Composite),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 182 | unlist(lapply(lapply(as.character(train$LogWaterVolume),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 183 | unlist(lapply(lapply(as.character(train$Velocity),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))) 184 | )) 185 | colnames(df)<-c("y","Tm","Dist","Qual","Type","HybSc","Refl","RefQc","RR1","RR2","RR3","Zdr","MWM","MWsd","Cmp","LWV","V") 186 | return(df) 187 | } 188 | 189 | 190 | getDfFromVectors<-function(i){ 191 | ### This will show the raw data for a particular hourly reading. 192 | ### It merges the vectors, discards garbage readings, and formats the output to try and make it concise 193 | ### This version is faster to use if you have the vectors pre-processed in the names provided by our pre-processing code 194 | ### 195 | ### Usage: getDfFromVectors(8), which will show you the 13 radar readings for the 8th record 196 | df<-as.data.frame(cbind(rep(train[i,Expected],length(train.TimeToEnd[[i]])), 197 | train.TimeToEnd[[i]], 198 | unlist(lapply(train.DistanceToRadar[[i]],function(x) roundTexttoText(x))), 199 | unlist(lapply(train.RadarQualityIndex[[i]],function(x) roundTexttoText(x))), 200 | unlist(lapply(train.HydrometeorType[[i]],function(x) roundTexttoText(x))), 201 | unlist(lapply(train.HybridScan[[i]],function(x) roundTexttoText(x))), 202 | unlist(lapply(train.Reflectivity[[i]],function(x) roundTexttoText(x))), 203 | unlist(lapply(train.ReflectivityQC[[i]],function(x) roundTexttoText(x))), 204 | unlist(lapply(train.RR1[[i]],function(x) roundTexttoText(x))), 205 | unlist(lapply(train.RR2[[i]],function(x) roundTexttoText(x))), 206 | unlist(lapply(train.RR3[[i]],function(x) roundTexttoText(x))), 207 | unlist(lapply(train.Zdr[[i]],function(x) roundTexttoText(x))), 208 | unlist(lapply(train.MassWeightedMean[[i]],function(x) roundTexttoText(x))), 209 | unlist(lapply(train.MassWeightedSD[[i]],function(x) roundTexttoText(x))), 210 | unlist(lapply(train.Composite[[i]],function(x) roundTexttoText(x))), 211 | unlist(lapply(train.LogWaterVolume[[i]],function(x) roundTexttoText(x))), 212 | unlist(lapply(train.Velocity[[i]],function(x) roundTexttoText(x))) 213 | )) 214 | colnames(df)<-c("y","Tm","Dist","Qual","Type","HybSc","Refl","RefQc","RR1","RR2","RR3","Zdr","MWM","MWsd","Cmp","LWV","V") 215 | return(df) 216 | } 217 | -------------------------------------------------------------------------------- /rossmann-store-sales/ACM-data-science-camp.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(h2o) 3 | 4 | cat("reading the train and test data (with data.table) \n") 5 | train <- fread("../input/train.csv",stringsAsFactors = T) 6 | store <- fread("../input/store.csv",stringsAsFactors = T) 7 | 8 | dim(train) ## look at size 9 | train ## take a look at some example rows 10 | store ## examples from the store data 11 | 12 | ## Merge (i.e. 
join) the train set with the store data 13 | train <- merge(train,store,by="Store") 14 | train ## glance at merged data 15 | 16 | ## Convert date field into a proper date format, then extract month and year 17 | train[,Date:=as.Date(Date)] 18 | train[,month:=as.integer(format(Date, "%m"))] 19 | train[,year:=as.integer(format(Date, "%y"))] 20 | train ## make sure the calculations came out as expected 21 | 22 | ## Use H2O's random forest 23 | ## Start cluster with all available threads 24 | h2o.init(nthreads=-1,max_mem_size='6G') 25 | ## Load data into cluster from R 26 | trainHex<-as.h2o(train[year <15 | month <6,],destination_frame = "trainHex") 27 | validHex<-as.h2o(train[year == 15 & month >= 6,],destination_frame = "validHex") 28 | dim(trainHex); dim(validHex) 29 | 30 | ## Set up variable to use all features other than those specified here 31 | features<-colnames(train)[!(colnames(train) %in% c("Id","Date","Sales","Customers"))] 32 | ## Train a random forest using all default parameters 33 | rfHex <- h2o.randomForest(x=features, 34 | y="Sales", 35 | training_frame=trainHex, 36 | validation_frame=validHex, 37 | model_id="introRF") 38 | 39 | ## Now let's see how it performed 40 | 41 | ## The specific competition error metric is not defined, so we will define it 42 | rmspe<-function(actuals,predictions){return(mean(((actuals[actuals>0]-predictions[actuals>0])/actuals[actuals>0])^2)^0.5)} 43 | ## Create a data frame with the actuals and H2O predictions 44 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(rfHex,validHex))[,1])) 45 | 46 | ## Score the predictions 47 | rmspe(judge[,1],judge[,2]) 48 | 49 | ## And now take a look at the model summary 50 | summary(rfHex) 51 | 52 | ## How good is that model? 53 | theMean<-mean(trainHex$Sales) 54 | theMedian<-median(as.data.frame(trainHex$Sales)[,1]) 55 | theLogMean<-expm1(mean(log1p(as.data.frame(trainHex$Sales)[,1]))) 56 | rmspe(judge[,1],rep(theMean,nrow(judge))) 57 | rmspe(judge[,1],rep(theMedian,nrow(judge))) 58 | rmspe(judge[,1],rep(theLogMean,nrow(judge))) 59 | 60 | ## Seems like it's great. But is it? 61 | 62 | train[Store==1,.(Sales,Date)][order(Date),plot(Sales)] 63 | 64 | ## A lot of 0s 65 | train[,.(.N,ZeroRate=sum(Sales==0)/.N,AvgSales=mean(Sales),AvgSalesNoZero=mean(ifelse(Sales==0,NA,Sales),na.rm=T))] 66 | 67 | ## Look at example records with 0 Sales 68 | train[Sales==0,] 69 | 70 | ## Summarize by Open status 71 | train[,.(.N,mean(Sales)),Open] ## data table syntax, SQL style: [i,j,by] = [WHERE,SELECT,GROUP BY] 72 | 73 | ## So the model is learning the Open status well, as we saw from the summary showing it as the most important column 74 | ## In practice, this is fine, as it's an easy and obvious way to separate the data. 75 | ## Nobody will be impressed by our ability to separate Open/Closed stores really well. 76 | ## But there are two considerations, still. One is that it is so specific, it might be better to separate the data anyway 77 | ## via a rule-based system. The rationale is that with anything that samples columns, it might try and fit a tree without 78 | ## vital information. And with 17k records at exactly 0 sales, plus the intuitive concept that Closed = 0 sales, there 79 | ## should be no advantage leaving that data in. 80 | ## So let's try removing it. 81 | ## Additionally, we are not scored on results where the actuals are 0. That is for math simplicity on Kaggle's side. 82 | ## In practice, you do not want to be so leniant. 
If your model ignores non-zero predictions when the actual was zero 83 | ## you will have a problem with whatever action is connected to your predictions (e.g. setting production volume) 84 | ## So for both reasons, we will remove the 0 sales and conduct all future modeling without these in the training set 85 | 86 | train <- train[Sales > 0,] ## We are not judged on 0 sales records in test set 87 | 88 | ## Now let's try to retrain the model and see if there is a difference. 89 | trainHex<-as.h2o(train[year <15 | month <6,],destination_frame = "trainHex") 90 | validHex<-as.h2o(train[year == 15 & month >= 6,],destination_frame = "validHex") 91 | dim(trainHex); dim(validHex) 92 | rfHex <- h2o.randomForest(x=features,y="Sales",training_frame=trainHex,validation_frame=validHex,model_id="introRF") 93 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(rfHex,validHex))[,1])) 94 | rmspe(judge[,1],judge[,2]) 95 | 96 | ## Great. The model is better. What next? 97 | summary(rfHex) 98 | 99 | ## Odd that Store is less important than CompetitionDistance. 100 | ## We have chosen to solve a time series problem with decision trees, which should require heavy use 101 | ## of the main two dimensions (store & time) 102 | class(train$Store) 103 | 104 | ## Change integer encoding to a factor: should be no logical reason why an ID number encodes any value in the numbering scheme 105 | train[,Store:=as.factor(as.numeric(Store))] 106 | 107 | ## Now let's try to retrain the model and see if there is a difference. 108 | trainHex<-as.h2o(train[year <15 | month <6,],destination_frame = "trainHex") 109 | validHex<-as.h2o(train[year == 15 & month >= 6,],destination_frame = "validHex") 110 | dim(trainHex); dim(validHex) 111 | rfHex <- h2o.randomForest(x=features,y="Sales",training_frame=trainHex,validation_frame=validHex,model_id="introRF", 112 | nbins_cats=1115) 113 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(rfHex,validHex))[,1])) 114 | rmspe(judge[,1],judge[,2]) 115 | 116 | ## More improvement. What next? 117 | 118 | ## log transformation to not be as sensitive to high sales 119 | ## decent rule of thumb: 120 | ## if the data spans an order of magnitude, consider a log transform 121 | train[,logSales:=log1p(Sales)] 122 | features<-colnames(train)[!(colnames(train) %in% c("Id","Date","Sales","Customers","logSales"))] 123 | 124 | ## Run through experimentation cycle again 125 | trainHex<-as.h2o(train[year <15 | month <6,],destination_frame = "trainHex") 126 | validHex<-as.h2o(train[year == 15 & month >= 6,],destination_frame = "validHex") 127 | dim(trainHex); dim(validHex) 128 | rfHex <- h2o.randomForest(x=features,y="logSales",training_frame=trainHex,validation_frame=validHex,model_id="introRF", 129 | nbins_cats=1115) 130 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(rfHex,validHex))[,1])) 131 | rmspe(judge[,1],expm1(judge[,2])) ## added expm1 term to undo the log transformation for real results 132 | 133 | ## There is a lot more experimentation that can be done with features. 134 | ## But for the moment, we'll try some other modeling choices at this point. 
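## (one sketch of that, not run here) further date features in the same data.table
## style, e.g. day-of-week and day-of-month, then recompute `features` so the new
## columns are picked up:
## train[,dayOfWeek:=as.integer(format(Date, "%u"))]
## train[,dayOfMonth:=as.integer(format(Date, "%d"))]
## features<-colnames(train)[!(colnames(train) %in% c("Id","Date","Sales","Customers","logSales"))]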
135 | ## First, let's try a GBM 136 | gbmHex <- h2o.gbm(x=features,y="logSales",training_frame=trainHex,validation_frame=validHex,model_id="introGBM", 137 | nbins_cats=1115) 138 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(gbmHex,validHex))[,1])) 139 | rmspe(judge[,1],expm1(judge[,2])) ## added expm1 term to undo the log transformation for real results 140 | 141 | ## It ran faster, but is less accurate. Let's add a stochastic element to it 142 | gbmHex <- h2o.gbm(x=features,y="logSales",training_frame=trainHex,validation_frame=validHex,model_id="introGBM", 143 | nbins_cats=1115,sample_rate = 0.7,col_sample_rate = 0.7,max_depth = 10,learn_rate=0.05) 144 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(gbmHex,validHex))[,1])) 145 | rmspe(judge[,1],expm1(judge[,2])) ## added expm1 term to undo the log transformation for real results 146 | 147 | ## Still not matching RF accuracy 148 | gbmHex <- h2o.gbm(x=features,y="logSales",training_frame=trainHex,validation_frame=validHex,model_id="introGBM", 149 | nbins_cats=1115,sample_rate = 0.5,col_sample_rate = 0.5,max_depth = 15,learn_rate=0.05,ntrees = 50) 150 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(gbmHex,validHex))[,1])) 151 | rmspe(judge[,1],expm1(judge[,2])) ## added expm1 term to undo the log transformation for real results 152 | 153 | ## Closer. To really compare the two, we'd likely want to do a grid search here. 154 | 155 | 156 | ## When we like our model, predict on test to see if our out of sample translates reasonably well 157 | ## Note: this wasn't run at the Data Science Camp 158 | test <- fread("../input/test.csv",stringsAsFactors = T) 159 | test <- merge(test,store,by="Store") 160 | test[,Date:=as.Date(Date)] 161 | test[,month:=as.integer(format(Date, "%m"))] 162 | test[,year:=as.integer(format(Date, "%y"))] 163 | test[,Store:=as.factor(as.numeric(Store))] 164 | summary(test) 165 | 166 | testHex<-as.h2o(test) 167 | testPredictions<-expm1(as.data.frame(h2o.predict(rfHex,testHex))[,1]) 168 | summary(testPredictions) 169 | -------------------------------------------------------------------------------- /see_click_predict_fix_basis.R: -------------------------------------------------------------------------------- 1 | train<-read.csv("train.csv") 2 | train<-train[train$num_votes!=327,] ##remove large outlier 3 | train<-train[train$num_views<1500,] ##remove large outlier 4 | train$created_time<-strptime(train$created_time,"%Y-%m-%d %H:%M:%S") 5 | #train<-train[as.Date(train$created_time)>as.Date("2012-10-14"),] 6 | train<-train[as.Date(train$created_time)>as.Date("2013-01-31"),] 7 | 8 | test<-read.csv("test.csv") 9 | test$created_time<-strptime(test$created_time,"%Y-%m-%d %H:%M:%S") 10 | 11 | ######################### 12 | ##### Functions ######### 13 | triRmsle<-function(p1,a1,p2,a2,p3,a3){ 14 | require(Metrics) 15 | a<-rmsle(rbind(p1,p2,p3),rbind(a1,a2,a3)) 16 | return(a) 17 | } 18 | quickScalarRmsle<-function(df,scalars){return(triRmsle(df[,1]*scalars[,1],df[,1],df[,2]*scalars[,1],df[,2],df[,3]*scalars[,1],df[,3]))} 19 | ##usage: quickScalarRmsle(targets,cbind(targets[,1]*0,targets[,2]*0,targets[,3]*0)) ##0.7330985 20 | 21 | getGbmFit<-function(x,y,t){require(gbm); GBM_NTREES = t; GBM_SHRINKAGE = 0.05; GBM_DEPTH = 4; GBM_MINOBS = 30; 22 | return(gbm.fit(x = x,y = y,distribution = "gaussian",n.trees = GBM_NTREES,shrinkage = GBM_SHRINKAGE,interaction.depth = GBM_DEPTH,n.minobsinnode = GBM_MINOBS,verbose = FALSE))} 23 | 
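##getGbmFit: x = feature data.frame, y = target (log-scaled where used below), t = number of trees;
##shrinkage/depth/min-obs are fixed inside; usage below: getGbmFit(train[,9:ncol(train)], log(1+train[,2]), trees)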
24 | getDistributionTable<-function(singleColumnDF,n){ 25 | require(sqldf) 26 | colnames(singleColumnDF)<-"x" 27 | out<-as.data.frame(sqldf(paste("SELECT x FROM singleColumnDF GROUP BY x ORDER BY COUNT(*) DESC LIMIT",n))) 28 | return(out) 29 | } 30 | 31 | binarizeColumn <- function(data,valListDF,colNum,remove=TRUE){ 32 | for(i in 1:nrow(valListDF)){ 33 | newCol<-ifelse(data[,colNum]==valListDF[i,1],1,0) 34 | data<-as.data.frame(cbind(data,newCol)) 35 | colnames(data)[ncol(data)]<-as.character(valListDF[i,1]) 36 | } 37 | if(remove) {data[,colNum]<-NULL} 38 | return(data) 39 | } 40 | 41 | ###### End of functions ####### 42 | ############################### 43 | 44 | 45 | x<-test[,c(1,1,1,1:8)] 46 | colnames(x)<-colnames(train)[c(6,7,8,1:5,9:11)] 47 | x<-as.data.frame(rbind(train[,c(6,7,8,1:5,9:11)],x)) 48 | x<-as.data.frame(cbind(seq(1:nrow(x)),x)) 49 | colnames(x)[1]<-"sortNum" 50 | 51 | a1<-1; a2<-nrow(train); b1<-nrow(train)+1; b2<-nrow(x) 52 | a1;a2;b1;b2 53 | 54 | city<-as.factor(ifelse(x$longitude<(-100),"Oakland",ifelse(x$longitude<(-82),"Chicago",ifelse(x$longitude<(-75),"Richmond","New Haven")))) 55 | descLength<-round(log(nchar(as.character(x$description))+1),0) 56 | descSummary<-round(log(nchar(as.character(x$summary))+1),0) 57 | latlong2_str<-as.factor(paste(round(x$latitude/2,2)*2,round(x$longitude/2,2)*2,sep='_')) 58 | hrOfDay<-as.factor(substr(x$created_time,12,13)) 59 | dayOfWeek <- as.factor(weekdays(as.Date(x$created_time))) 60 | l10sum<-as.factor(tolower(substr(gsub(" ","",gsub("[[:punct:]]","",x$summary)),1,10))) 61 | ##left_10_desc<-as.factor(tolower(substr(gsub(" ","",gsub("[[:punct:]]","",x$description)),1,10))) 62 | x$latitude<-NULL; x$longitude<-NULL 63 | 64 | x<-as.data.frame(cbind(x,city,descLength,descSummary,latlong2_str,hrOfDay,dayOfWeek,l10sum)) 65 | 66 | levels(x$source)<-c(levels(x$source),"n/a"); x$source[is.na(x$source)]<-"n/a" 67 | levels(x$tag_type)<-c(levels(x$tag_type),"n/a"); x$tag_type[is.na(x$tag_type)]<-"n/a" 68 | 69 | ## binarization 70 | distL10Summ<-getDistributionTable(as.data.frame(x$l10sum),50) 71 | x<-binarizeColumn(x,distL10Summ,17) 72 | 73 | distlatlong2_str<-getDistributionTable(as.data.frame(x$latlong2_str),50) 74 | x<-binarizeColumn(x,distlatlong2_str,14) 75 | 76 | distdayOfWeek<-getDistributionTable(as.data.frame(x$dayOfWeek),7) 77 | x<-binarizeColumn(x,distdayOfWeek,15) 78 | 79 | disthrOfDay<-getDistributionTable(as.data.frame(x$hrOfDay),24) 80 | x<-binarizeColumn(x,disthrOfDay,14) 81 | 82 | distdescSummary<-getDistributionTable(as.data.frame(x$descSummary),6) 83 | x<-binarizeColumn(x,distdescSummary,13) 84 | 85 | distdescLength<-getDistributionTable(as.data.frame(x$descLength),6) 86 | x<-binarizeColumn(x,distdescLength,12) 87 | 88 | distcity<-getDistributionTable(as.data.frame(x$city),4) 89 | x<-binarizeColumn(x,distcity,11) 90 | 91 | disttag_type<-getDistributionTable(as.data.frame(x$tag_type),20) 92 | x<-binarizeColumn(x,disttag_type,10) 93 | 94 | distsource<-getDistributionTable(as.data.frame(x$source),9) 95 | x<-binarizeColumn(x,distsource,8) 96 | 97 | a1<-1; a2<-nrow(train); b1<-nrow(train)+1; b2<-nrow(x) 98 | a1;a2;b1;b2 99 | train<-x[x$sortNum<=a2,] 100 | test<-x[x$sortNum>=b1,] 101 | dim(train); dim(test) 102 | 103 | trees<-100 104 | fitVotes<-getGbmFit(train[,9:ncol(train)], log(1+train[,2]), trees) 105 | fitComments<-getGbmFit(train[,9:ncol(train)], log(1+train[,3]), trees) 106 | fitViews<-getGbmFit(train[,9:ncol(train)], log(1+train[,4]), trees) 107 | 108 | cvFullVotes<-predict(object = fitVotes,newdata=train[,9:ncol(train)], type="response",trees)
109 | cvFullComments<-predict(object = fitComments,newdata=train[,9:ncol(train)], type="response",trees) 110 | cvFullViews<-predict(object = fitViews,newdata=train[,9:ncol(train)], type="response",trees) 111 | 112 | testVotesNS<-predict(object = fitVotes,newdata=test[,9:ncol(test)], type="response",trees) 113 | testCommentsNS<-predict(object = fitComments,newdata=test[,9:ncol(test)], type="response",trees) 114 | testViewsNS<-predict(object = fitViews,newdata=test[,9:ncol(test)], type="response",trees) 115 | 116 | quickScalarRmsle(cbind(train[,2],train[,3],train[,4]),cbind(exp(cvFullVotes)-1,exp(cvFullComments)-1,exp(cvFullViews)-1)) 117 | summary(fitViews) 118 | summary(fitVotes) 119 | summary(fitComments) 120 | 121 | submission<-as.data.frame(cbind(test[,5],round(exp(testViewsNS)-1,3),round(exp(testVotesNS)-1,3),round(exp(testCommentsNS)-1,3))) 122 | colnames(submission)<-c("id","num_views","num_votes","num_comments") 123 | submission$num_views<-pmax(submission$num_views,0) 124 | submission$num_votes<-pmax(submission$num_votes,0) 125 | submission$num_comments<-pmax(submission$num_comments,0) 126 | t<-gsub("[[:punct:]]", "", as.character(Sys.time())) 127 | fn<-paste("submission_",gsub(" ","_",t),".csv",sep='') 128 | write.table(submission,fn, append = FALSE, sep=',', row.names=FALSE, quote=FALSE, col.names=TRUE) 129 | -------------------------------------------------------------------------------- /will_it_rain/histogram_starter_model.R: -------------------------------------------------------------------------------- 1 | ##################################################### 2 | ## Starter Script for Kaggle Will it Rain competition 3 | ##################################################### 4 | ## 5 | ## This is a script to show a simple alternative to 6 | ## using a sigmoid distribution. 7 | ## It is essentially a histogram prediction, using 8 | ## coarse measurements of the same RR1 value used 9 | ## in the provided benchmark.
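## The required submission has 70 columns per Id (Predicted0..Predicted69), each
## the cumulative probability that rain <= that many mm, which is why the
## histogram below is converted into a non-decreasing cumulative distribution.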
10 | ## 11 | ## Packages: data.table 12 | ## 13 | ## Performance: 0.00872817 14 | ## Relative performance: 37th out of 189; improves upon 15 | ## provided benchmark (0.01177621), and using all 1's (0.01017651) 16 | ###################################################### 17 | start<-Sys.time() 18 | #setwd("directory where the data files are located, if not getwd()") 19 | 20 | ## read in a fixed number of rows (increase for improved accuracy) 21 | ## you can use read.csv in place of data.table; it's just much slower 22 | library(data.table) 23 | train<-fread("train_2013.csv",nrows=250000,select=c("RR1","Expected")) 24 | 25 | ## parse the readings inside each RR1 value, which are space-delimited, and take the mean of all values 26 | rr1.mean<-unlist(lapply(train[,RR1], function(x) mean(pmax(0,pmin(70,as.numeric(strsplit(x," ")[[1]])))))) 27 | ## in case there are any NAs, replace with 0 28 | rr1.mean[is.na(rr1.mean)]<-0 29 | 30 | ## look at the data 31 | ## notice most observations are 0, and after that most between 0 and 0.5 32 | ## so using typical quantiles or breaks will merely subdivide a really small range of measurements 33 | ## So instead, we break off the most popular buckets, and then divide the remainder of the RR1 distribution 34 | ## So we are asking for the ranges of RR1, but forcing the first and second range starts to be 0 and 0.5 35 | table(round(rr1.mean,1)) 36 | breaks<-c(-Inf,0,quantile(rr1.mean[rr1.mean>0.5],c(0:8)/8)) 37 | names(breaks)<-NULL 38 | ##store the break length to use throughout the rest of the code, in case you want to measure more breaks than 10 39 | n<-length(breaks)-1 40 | 41 | ## create a matrix to store predictions for key points along the RR1 distribution 42 | observations<-matrix(,n,10) 43 | 44 | ## now fill in with empirical values 45 | for(i in 1:n){ 46 | ## for the various cutpoints (breaks) in the RR1 distribution, measure the frequency 47 | ## of the bottom 10 levels of the target value (0mm,1mm,2mm,...9mm) 48 | observations[i,1:10]<-hist( 49 | pmin(70,pmax(0,train[,Expected][rr1.mean>breaks[i]&rr1.mean<=breaks[i+1]])), 50 | plot=FALSE, 51 | breaks=c(0:(n-1),70) 52 | )$density 53 | } 54 | 55 | ## convert histogram to cumulative distribution 56 | ## this ensures we have non-decreasing predictions as well (requirement) 57 | for(i in 2:n){observations[,i]<-observations[,(i-1)]+observations[,i]} 58 | 59 | ## look over the values again 60 | ## looks a little peculiar in that some higher ranges have lower outcomes, but it is empirical 61 | observations 62 | 63 | ## now construct a prediction by using the matrix as a lookup table to all 70 predictions 64 | ## i.e. we'll find the RR1 value for every record, figure out which row in our small prediction table 65 | ## it corresponds to, and use that entire row of the matrix as our prediction "vector" 66 | 67 | test<-fread("test_2014.csv",select=c("Id","RR1")) 68 | 69 | ## parse the readings inside each RR1 value, which are space-delimited 70 | rr1.mean<-unlist(lapply(test[,RR1], function(x) mean(pmax(0,pmin(70,as.numeric(strsplit(x," ")[[1]])))))) 71 | rr1.mean[is.na(rr1.mean)]<-0 72 | 73 | ## seed the predictions with 1, which means it will rain <= each column 100% of the time 74 | predictions<-as.data.frame(cbind(test$Id,matrix(1,nrow=nrow(test),ncol=70))) 75 | colnames(predictions)<-c("Id",paste0("Predicted",(seq(1:70)-1))) 76 | 77 | ## override the 100% values with our table lookup; want to improve this? 
smooth the edge 78 | for(i in 1:n){predictions[rr1.mean>breaks[i]&rr1.mean<=breaks[i+1],2:(n+1)]<-as.data.frame(t(observations[i,1:n]))} 79 | 80 | ## output predictions; outputs as 184MB, but compresses to 4.3MB (lot of 1's) 81 | ##scipen turns off scientific notation (for ID values such as 100000); Kaggle rejects those 82 | options("scipen"=100, "digits"=8) 83 | write.table(predictions,"histogram_benchmark.csv",quote = FALSE, sep = ",",row.names=FALSE) 84 | 85 | ##how long did it take (2.9 minutes on a Windows i5 with 6GB memory and slow HDD) 86 | stop<-Sys.time() 87 | stop-start 88 | --------------------------------------------------------------------------------